mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 22:59:04 +08:00

Nick: formatting done

This commit is contained in:
parent 994e1eb502
commit 498558d358
@@ -1,8 +1,6 @@
 import request from "supertest";
 import { configDotenv } from "dotenv";
-import {
-  ScrapeRequestInput,
-} from "../../controllers/v1/types";
+import { ScrapeRequestInput } from "../../controllers/v1/types";
 import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";

 configDotenv();
@@ -19,8 +17,7 @@ describe("E2E Tests for v1 API Routes", () => {

   describe("GET /is-production", () => {
     it.concurrent("should return the production status", async () => {
-      const response: any =
-        await request(TEST_URL).get("/is-production");
+      const response: any = await request(TEST_URL).get("/is-production");

       console.log(
         "process.env.USE_DB_AUTHENTICATION",
@@ -274,8 +271,7 @@ describe("E2E Tests for v1 API Routes", () => {
         url: "https://www.scrapethissite.com/",
         onlyMainContent: false, // default is true
       };
-      const responseWithoutRemoveTags: any =
-        await request(TEST_URL)
+      const responseWithoutRemoveTags: any = await request(TEST_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -1,8 +1,6 @@
 import request from "supertest";
 import { configDotenv } from "dotenv";
-import {
-  ScrapeRequest,
-} from "../../controllers/v1/types";
+import { ScrapeRequest } from "../../controllers/v1/types";

 configDotenv();
 const FIRECRAWL_API_URL = "http://127.0.0.1:3002";
@@ -12,9 +10,7 @@ describe("E2E Tests for v1 API Routes", () => {
   it.concurrent(
     "should return a successful response for a scrape with 403 page",
     async () => {
-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -39,9 +35,7 @@ describe("E2E Tests for v1 API Routes", () => {
         url: E2E_TEST_SERVER_URL,
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -86,9 +80,7 @@ describe("E2E Tests for v1 API Routes", () => {
         formats: ["html"],
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -121,9 +113,7 @@ describe("E2E Tests for v1 API Routes", () => {
         formats: ["rawHtml"],
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -159,9 +149,7 @@ describe("E2E Tests for v1 API Routes", () => {
         headers: { "e2e-header-test": "firecrawl" },
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -188,9 +176,7 @@ describe("E2E Tests for v1 API Routes", () => {
         includeTags: ["#content-1"],
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -220,9 +206,7 @@ describe("E2E Tests for v1 API Routes", () => {
         excludeTags: ["#content-1"],
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -253,9 +237,7 @@ describe("E2E Tests for v1 API Routes", () => {
         onlyMainContent: false,
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -285,9 +267,7 @@ describe("E2E Tests for v1 API Routes", () => {
         timeout: 500,
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -312,9 +292,7 @@ describe("E2E Tests for v1 API Routes", () => {
         mobile: true,
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -335,9 +313,7 @@ describe("E2E Tests for v1 API Routes", () => {
   it.concurrent(
     "should handle 'parsePDF' parameter correctly",
     async () => {
-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -357,9 +333,7 @@ describe("E2E Tests for v1 API Routes", () => {
          "h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm",
       );

-      const responseNoParsePDF: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const responseNoParsePDF: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -410,9 +384,7 @@ describe("E2E Tests for v1 API Routes", () => {
         timeout: 120000,
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -432,8 +404,9 @@ describe("E2E Tests for v1 API Routes", () => {
         timeout: 120000,
       } as ScrapeRequest;

-      const responseWithSkipTlsVerification: any =
-        await request(FIRECRAWL_API_URL)
+      const responseWithSkipTlsVerification: any = await request(
+        FIRECRAWL_API_URL,
+      )
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -461,9 +434,7 @@ describe("E2E Tests for v1 API Routes", () => {
         removeBase64Images: true,
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -493,9 +464,7 @@ describe("E2E Tests for v1 API Routes", () => {
         ],
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -526,9 +495,7 @@ describe("E2E Tests for v1 API Routes", () => {
         ],
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -569,9 +536,7 @@ describe("E2E Tests for v1 API Routes", () => {
         ],
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -619,9 +584,7 @@ describe("E2E Tests for v1 API Routes", () => {
         ],
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -657,9 +620,7 @@ describe("E2E Tests for v1 API Routes", () => {
         ],
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -692,9 +653,7 @@ describe("E2E Tests for v1 API Routes", () => {
         ],
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
@@ -731,9 +690,7 @@ describe("E2E Tests for v1 API Routes", () => {
         ],
       } as ScrapeRequest;

-      const response: any = await request(
-        FIRECRAWL_API_URL,
-      )
+      const response: any = await request(FIRECRAWL_API_URL)
         .post("/v1/scrape")
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
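Nearly every hunk above is the same mechanical Prettier reflow: a request(...) call head is kept on one line when it fits the print width and broken out otherwise. A minimal sketch of the call shape these tests converge on (the postScrape wrapper is hypothetical; the URL constant and env var are the ones used in the tests):

import request from "supertest";

const FIRECRAWL_API_URL = "http://127.0.0.1:3002";

// After formatting, the call head stays on one line and the chain indents under it.
async function postScrape(body: unknown) {
  return request(FIRECRAWL_API_URL)
    .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(body);
}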
@@ -23,8 +23,7 @@ describe("E2E Tests for v0 API Routes", () => {

   describe("POST /v0/scrape", () => {
     it.concurrent("should require authorization", async () => {
-      const response: any =
-        await request(TEST_URL).post("/v0/scrape");
+      const response: any = await request(TEST_URL).post("/v0/scrape");
       expect(response.statusCode).toBe(401);
     });

@@ -159,8 +158,7 @@ describe("E2E Tests for v0 API Routes", () => {
     it.concurrent(
       "should return a successful response with a valid API key with removeTags option",
       async () => {
-        const responseWithoutRemoveTags: any =
-          await request(TEST_URL)
+        const responseWithoutRemoveTags: any = await request(TEST_URL)
          .post("/v0/scrape")
          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
          .set("Content-Type", "application/json")
@@ -332,8 +330,7 @@ describe("E2E Tests for v0 API Routes", () => {

   describe("POST /v0/crawl", () => {
     it.concurrent("should require authorization", async () => {
-      const response: any =
-        await request(TEST_URL).post("/v0/crawl");
+      const response: any = await request(TEST_URL).post("/v0/crawl");
       expect(response.statusCode).toBe(401);
     });

@@ -461,9 +458,7 @@ describe("E2E Tests for v0 API Routes", () => {
       }

       await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
-      const completedResponse: any = await request(
-        TEST_URL,
-      )
+      const completedResponse: any = await request(TEST_URL)
         .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

@@ -509,9 +504,7 @@ describe("E2E Tests for v0 API Routes", () => {
         await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
       }
     }
-    const completedResponse: any = await request(
-      TEST_URL,
-    )
+    const completedResponse: any = await request(TEST_URL)
       .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

@@ -31,6 +31,8 @@ describe("Scrape tests", () => {
     });

     expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toBe("this is fake data coming from the mocking system!");
+    expect(response.body.data.markdown).toBe(
+      "this is fake data coming from the mocking system!",
+    );
   });
 });
@@ -4,7 +4,9 @@ const fs = require("fs");
 const mocksDirPath = path.join(__dirname, "../../../scraper/scrapeURL/mocks");
 const files = fs.readdirSync(mocksDirPath);

-const contents = files.map(x => JSON.parse(fs.readFileSync(path.join(mocksDirPath, x), "utf8")));
+const contents = files.map((x) =>
+  JSON.parse(fs.readFileSync(path.join(mocksDirPath, x), "utf8")),
+);

 fs.writeFileSync(
   path.join(__dirname, "../mocks/" + process.argv[2] + ".json"),
@@ -105,7 +105,6 @@ export async function getACUC(
       { get: true },
     ));

-
     if (!error) {
       break;
     }
@@ -146,7 +145,7 @@ export async function clearACUC(api_key: string): Promise<void> {
     modes.map(async (mode) => {
       const cacheKey = `acuc_${api_key}_${mode}`;
       await deleteKey(cacheKey);
-    })
+    }),
   );

   // Also clear the base cache key
@@ -232,7 +231,6 @@ export async function supaAuthenticateUser(
       teamId = chunk.team_id;
       priceId = chunk.price_id;

-
       plan = getPlanByPriceId(priceId);
       subscriptionData = {
         team_id: teamId,
@@ -16,7 +16,7 @@ export async function checkFireEngine(req: Request, res: Response) {
     const timeout = setTimeout(() => controller.abort(), 30000);

     const urls = ["https://roastmywebsite.ai", "https://example.com"];
-    let lastError : string | null = null;
+    let lastError: string | null = null;

     for (const url of urls) {
       try {
@@ -62,7 +62,6 @@ export async function checkFireEngine(req: Request, res: Response) {
         success: false,
         error: "Internal server error - all retry attempts failed",
       });
-
   } catch (error) {
     logger.error(error);
     Sentry.captureException(error);
@@ -62,20 +62,23 @@ export async function crawlErrorsController(
   const failedJobIDs: string[] = [];

   for (const [id, status] of jobStatuses) {
-    if (
-      status === "failed"
-    ) {
+    if (status === "failed") {
       failedJobIDs.push(id);
     }
   }

   res.status(200).json({
-    errors: (await getJobs(failedJobIDs)).map(x => ({
+    errors: (await getJobs(failedJobIDs)).map((x) => ({
       id: x.id,
-      timestamp: x.finishedOn !== undefined ? (new Date(x.finishedOn).toISOString()) : undefined,
+      timestamp:
+        x.finishedOn !== undefined
+          ? new Date(x.finishedOn).toISOString()
+          : undefined,
       url: x.data.url,
       error: x.failedReason,
     })),
-    robotsBlocked: await redisConnection.smembers("crawl:" + req.params.jobId + ":robots_blocked"),
+    robotsBlocked: await redisConnection.smembers(
+      "crawl:" + req.params.jobId + ":robots_blocked",
+    ),
   });
 }
@@ -116,7 +116,10 @@ export async function crawlStatusController(
   const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
     sc.cancelled
       ? "cancelled"
-      : validJobStatuses.every((x) => x[1] === "completed") && (sc.crawlerOptions ? await isCrawlKickoffFinished(req.params.jobId) : true)
+      : validJobStatuses.every((x) => x[1] === "completed") &&
+          (sc.crawlerOptions
+            ? await isCrawlKickoffFinished(req.params.jobId)
+            : true)
         ? "completed"
         : "scraping";

@@ -101,7 +101,7 @@ export async function getMapResults({
       },
       true,
       true,
-      30000
+      30000,
     );
     if (sitemap > 0) {
       links = links
@@ -164,7 +164,6 @@ export async function getMapResults({
     const twoDaysAgo = new Date();
     twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);

-
     // If sitemap is not ignored and either we have few URLs (<100) or the data is stale (>2 days old), fetch fresh sitemap
     if (
       !ignoreSitemap &&
@@ -172,9 +171,14 @@ export async function getMapResults({
         new Date(sitemapIndexResult.lastUpdated) < twoDaysAgo)
     ) {
       try {
-        await crawler.tryGetSitemap(urls => {
-          links.push(...urls);
-        }, true, false, 30000);
+        await crawler.tryGetSitemap(
+          (urls) => {
+            links.push(...urls);
+          },
+          true,
+          false,
+          30000,
+        );
       } catch (e) {
         logger.warn("tryGetSitemap threw an error", { error: e });
       }
@@ -253,7 +257,7 @@ export async function getMapResults({
       },
       {
         priority: 10,
-      }
+      },
     );

     return {
@@ -33,7 +33,6 @@ export async function scrapeController(
     basePriority: 10,
   });

-
   await addScrapeJob(
     {
       url: req.body.url,
@@ -97,7 +96,7 @@ export async function scrapeController(
     // Don't bill if we're early returning
     return;
   }
-  if (req.body.extract && req.body.formats.includes("extract") ) {
+  if (req.body.extract && req.body.formats.includes("extract")) {
     creditsToBeBilled = 5;
   }

@@ -125,7 +125,7 @@ export const scrapeOptions = z
       "screenshot",
       "screenshot@fullPage",
       "extract",
-      "json"
+      "json",
     ])
     .array()
     .optional()
@@ -233,7 +233,7 @@ export const extractV1Options = z
   .strict(strictMessage)
   .transform((obj) => ({
     ...obj,
-    allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch
+    allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
   }));

 export type ExtractV1Options = z.infer<typeof extractV1Options>;
@@ -268,11 +268,17 @@ export const scrapeRequestSchema = scrapeOptions
   )
   .transform((obj) => {
     // Handle timeout
-    if ((obj.formats?.includes("extract") || obj.extract || obj.formats?.includes("json") || obj.jsonOptions) && !obj.timeout) {
+    if (
+      (obj.formats?.includes("extract") ||
+        obj.extract ||
+        obj.formats?.includes("json") ||
+        obj.jsonOptions) &&
+      !obj.timeout
+    ) {
       obj = { ...obj, timeout: 60000 };
     }

-    if(obj.formats?.includes("json")) {
+    if (obj.formats?.includes("json")) {
       obj.formats.push("extract");
     }

@@ -284,8 +290,8 @@ export const scrapeRequestSchema = scrapeOptions
         prompt: obj.jsonOptions.prompt,
         systemPrompt: obj.jsonOptions.systemPrompt,
         schema: obj.jsonOptions.schema,
-        mode: "llm"
-      }
+        mode: "llm",
+      },
     };
   }

@@ -602,15 +608,14 @@ export type CrawlStatusResponse =
       data: Document[];
     };

-
 export type CrawlErrorsResponse =
   | ErrorResponse
   | {
       errors: {
-        id: string,
-        timestamp?: string,
-        url: string,
-        error: string,
+        id: string;
+        timestamp?: string;
+        url: string;
+        error: string;
       }[];
       robotsBlocked: string[];
     };
@@ -888,7 +893,6 @@ export type SearchResponse =
       data: Document[];
     };

-
 export type TokenUsage = {
   promptTokens: number;
   completionTokens: number;
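The reflowed transform above encodes two behaviors that are easy to miss inside the wrapping change: any request that touches extraction (the "extract" or "json" formats, or the extract/jsonOptions fields) gets a 60-second default timeout, and asking for "json" implicitly enables "extract". A standalone sketch of that control flow outside zod (the interface and function name here are illustrative, not from the codebase):

interface ScrapeOptsSketch {
  formats?: string[];
  extract?: unknown;
  jsonOptions?: unknown;
  timeout?: number;
}

function applyExtractDefaults(obj: ScrapeOptsSketch): ScrapeOptsSketch {
  const usesExtraction = Boolean(
    obj.formats?.includes("extract") ||
      obj.extract ||
      obj.formats?.includes("json") ||
      obj.jsonOptions,
  );
  if (usesExtraction && !obj.timeout) {
    obj = { ...obj, timeout: 60000 }; // LLM extraction is slow; default to 60s
  }
  if (obj.formats?.includes("json")) {
    obj.formats.push("extract"); // "json" rides on the extract pipeline
  }
  return obj;
}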
@@ -4,7 +4,11 @@ import * as Sentry from "@sentry/node";
 import express, { NextFunction, Request, Response } from "express";
 import bodyParser from "body-parser";
 import cors from "cors";
-import { getExtractQueue, getScrapeQueue, getIndexQueue } from "./services/queue-service";
+import {
+  getExtractQueue,
+  getScrapeQueue,
+  getIndexQueue,
+} from "./services/queue-service";
 import { v0Router } from "./routes/v0";
 import os from "os";
 import { logger } from "./lib/logger";
@@ -3,101 +3,101 @@ import { deduplicateObjectsArray } from "../extract/helpers/deduplicate-objs-arr
 describe("deduplicateObjectsArray", () => {
   it("should deduplicate the array", async () => {
     const objArray = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "James D. Schull",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "James D. Schull",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": null,
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
+              area: "Personal Injury",
+            },
+          ],
         },
         {
-          "name": "James D. Schull",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "James D. Schull",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": null,
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
+              area: "Personal Injury",
+            },
+          ],
         },
         {
-          "name": "James D. Schull",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "James D. Schull",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": null,
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
-        }
-      ]
-    }
+              area: "Personal Injury",
+            },
+          ],
+        },
+      ],
+    };

     const expected = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "James D. Schull",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "James D. Schull",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": null,
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
-        }
-      ]
-    }
+              area: "Personal Injury",
+            },
+          ],
+        },
+      ],
+    };

     const result = await deduplicateObjectsArray(objArray);

     expect(result).toEqual(expected);
-  })
+  });

   it("should not deduplicate if not necessary", async () => {
     const objArray = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "James D. Schull",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "James D. Schull",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": null,
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
+              area: "Personal Injury",
+            },
+          ],
         },
         {
-          "name": "John Doe",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "John Doe",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": null,
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
-        }
-      ]
-    }
+              area: "Personal Injury",
+            },
+          ],
+        },
+      ],
+    };

     const result = await deduplicateObjectsArray(objArray);

     expect(result).toEqual(objArray);
-  })
+  });

   it("should handle an empty array", async () => {
-    const objArray = { "lawyers": [] };
+    const objArray = { lawyers: [] };

-    const expected = { "lawyers": [] };
+    const expected = { lawyers: [] };

     const result = await deduplicateObjectsArray(objArray);

@@ -106,35 +106,35 @@ describe("deduplicateObjectsArray", () => {

   it("should handle objects with different properties", async () => {
     const objArray = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "James D. Schull",
-          "email": "james@example.com",
-          "title": "Personal Injury Attorney"
+          name: "James D. Schull",
+          email: "james@example.com",
+          title: "Personal Injury Attorney",
         },
         {
-          "name": "James D. Schull",
-          "email": "james@example.com",
-          "title": "Personal Injury Attorney",
-          "phone-number": "123-456-7890"
-        }
-      ]
+          name: "James D. Schull",
+          email: "james@example.com",
+          title: "Personal Injury Attorney",
+          "phone-number": "123-456-7890",
+        },
+      ],
     };

     const expected = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "James D. Schull",
-          "email": "james@example.com",
-          "title": "Personal Injury Attorney"
+          name: "James D. Schull",
+          email: "james@example.com",
+          title: "Personal Injury Attorney",
         },
         {
-          "name": "James D. Schull",
-          "email": "james@example.com",
-          "title": "Personal Injury Attorney",
-          "phone-number": "123-456-7890"
-        }
-      ]
+          name: "James D. Schull",
+          email: "james@example.com",
+          title: "Personal Injury Attorney",
+          "phone-number": "123-456-7890",
+        },
+      ],
     };

     const result = await deduplicateObjectsArray(objArray);
@@ -144,33 +144,33 @@ describe("deduplicateObjectsArray", () => {

   it("should handle objects with same properties but different values", async () => {
     const objArray = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "James D. Schull",
-          "email": "james1@example.com",
-          "title": "Personal Injury Attorney"
+          name: "James D. Schull",
+          email: "james1@example.com",
+          title: "Personal Injury Attorney",
         },
         {
-          "name": "James D. Schull",
-          "email": "james2@example.com",
-          "title": "Personal Injury Attorney"
-        }
-      ]
+          name: "James D. Schull",
+          email: "james2@example.com",
+          title: "Personal Injury Attorney",
+        },
+      ],
     };

     const expected = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "James D. Schull",
-          "email": "james1@example.com",
-          "title": "Personal Injury Attorney"
+          name: "James D. Schull",
+          email: "james1@example.com",
+          title: "Personal Injury Attorney",
         },
         {
-          "name": "James D. Schull",
-          "email": "james2@example.com",
-          "title": "Personal Injury Attorney"
-        }
-      ]
+          name: "James D. Schull",
+          email: "james2@example.com",
+          title: "Personal Injury Attorney",
+        },
+      ],
     };

     const result = await deduplicateObjectsArray(objArray);
@@ -180,47 +180,47 @@ describe("deduplicateObjectsArray", () => {

   it("should handle nested identical objects", async () => {
     const objArray = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "James D. Schull",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "James D. Schull",
+          email: null,
+          title: "Personal Injury Attorney",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
+              area: "Personal Injury",
+            },
+          ],
         },
         {
-          "name": "James D. Schull",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "James D. Schull",
+          email: null,
+          title: "Personal Injury Attorney",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
-        }
-      ]
+              area: "Personal Injury",
+            },
+          ],
+        },
+      ],
     };

     const expected = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "James D. Schull",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "James D. Schull",
+          email: null,
+          title: "Personal Injury Attorney",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
-        }
-      ]
+              area: "Personal Injury",
+            },
+          ],
+        },
+      ],
     };

     const result = await deduplicateObjectsArray(objArray);

     expect(result).toEqual(expected);
   });
-})
+});
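Read together, the fixtures above pin down deduplicateObjectsArray's contract: for every array-valued property, drop elements that are deep-equal to an earlier one, keep first-seen order, and leave near-duplicates (any differing field) untouched. A rough sketch of that behavior, inferred from the tests rather than taken from the implementation (the real helper is awaited, so it is presumably async):

// Sketch only: JSON.stringify as a deep-equality fingerprint assumes stable key order.
async function deduplicateObjectsArraySketch(
  obj: Record<string, unknown[]>,
): Promise<Record<string, unknown[]>> {
  const out: Record<string, unknown[]> = {};
  for (const [key, arr] of Object.entries(obj)) {
    const seen = new Set<string>();
    out[key] = arr.filter((item) => {
      const fp = JSON.stringify(item);
      if (seen.has(fp)) return false;
      seen.add(fp);
      return true;
    });
  }
  return out;
}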
@@ -3,292 +3,292 @@ import { mergeNullValObjs } from "../extract/helpers/merge-null-val-objs";
 describe("mergeNullValObjs", () => {
   it("should merge the objects with null values", async () => {
     const objArray = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "Frank Giunta",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "Frank Giunta",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": "214.370.5200",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
+              area: "Personal Injury",
+            },
+          ],
         },
         {
-          "name": "Frank Giunta",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "Frank Giunta",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": "214.370.5200",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
-        }
-      ]
-    }
+              area: "Personal Injury",
+            },
+          ],
+        },
+      ],
+    };

     const expected = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "Frank Giunta",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "Frank Giunta",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": "214.370.5200",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
-        }
-      ]
-    }
+              area: "Personal Injury",
+            },
+          ],
+        },
+      ],
+    };

     const result = mergeNullValObjs(objArray);

     expect(result).toEqual(expected);
-  })
+  });

   it("should handle empty object array", async () => {
     const objArray = {
-      "lawyers": []
-    }
+      lawyers: [],
+    };

     const expected = {
-      "lawyers": []
-    }
+      lawyers: [],
+    };

     const result = mergeNullValObjs(objArray);

     expect(result).toEqual(expected);
-  })
+  });

   it("should handle object array with no null values", async () => {
     const objArray = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "John Doe",
-          "email": "john.doe@example.com",
-          "title": "Attorney",
+          name: "John Doe",
+          email: "john.doe@example.com",
+          title: "Attorney",
           "phone-number": "123.456.7890",
           "practice-areas": [
             {
-              "area": "Corporate Law"
-            }
-          ]
-        }
-      ]
-    }
+              area: "Corporate Law",
+            },
+          ],
+        },
+      ],
+    };

     const expected = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "John Doe",
-          "email": "john.doe@example.com",
-          "title": "Attorney",
+          name: "John Doe",
+          email: "john.doe@example.com",
+          title: "Attorney",
           "phone-number": "123.456.7890",
           "practice-areas": [
             {
-              "area": "Corporate Law"
-            }
-          ]
-        }
-      ]
-    }
+              area: "Corporate Law",
+            },
+          ],
+        },
+      ],
+    };

     const result = mergeNullValObjs(objArray);

     expect(result).toEqual(expected);
-  })
+  });

   it("should merge objects with different null values", async () => {
     const objArray = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "Jane Smith",
-          "email": "null",
-          "title": "Attorney",
-          "description": null,
+          name: "Jane Smith",
+          email: "null",
+          title: "Attorney",
+          description: null,
           "phone-number": "987.654.3210",
           "practice-areas": [
             {
-              "area": "Family Law"
-            }
-          ]
+              area: "Family Law",
+            },
+          ],
         },
         {
-          "name": "Jane Smith",
-          "email": "jane.smith@example.com",
-          "title": null,
-          "description": "Jane Smith is an attorney specializing in Family Law.",
+          name: "Jane Smith",
+          email: "jane.smith@example.com",
+          title: null,
+          description: "Jane Smith is an attorney specializing in Family Law.",
           "phone-number": "987.654.3210",
           "practice-areas": [
             {
-              "area": "Family Law"
-            }
-          ]
-        }
-      ]
-    }
+              area: "Family Law",
+            },
+          ],
+        },
+      ],
+    };

     const expected = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "Jane Smith",
-          "email": "jane.smith@example.com",
-          "title": "Attorney",
-          "description": "Jane Smith is an attorney specializing in Family Law.",
+          name: "Jane Smith",
+          email: "jane.smith@example.com",
+          title: "Attorney",
+          description: "Jane Smith is an attorney specializing in Family Law.",
           "phone-number": "987.654.3210",
           "practice-areas": [
             {
-              "area": "Family Law"
-            }
-          ]
-        }
-      ]
-    }
+              area: "Family Law",
+            },
+          ],
+        },
+      ],
+    };

     const result = mergeNullValObjs(objArray);

     expect(result).toEqual(expected);
-  })
+  });

   it("should merge objects with different null values", async () => {
     const objArray = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "Frank Giunta",
-          "email": "frank.giunta@example.com",
-          "title": "Personal Injury Attorney",
+          name: "Frank Giunta",
+          email: "frank.giunta@example.com",
+          title: "Personal Injury Attorney",
           "phone-number": "214.370.5200",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
+              area: "Personal Injury",
+            },
+          ],
         },
         {
-          "name": "Frank Giunta",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "Frank Giunta",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": "214.370.5200",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
+              area: "Personal Injury",
+            },
+          ],
         },
         {
-          "name": "Dale R. Rose",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "Dale R. Rose",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": "972.562.0266",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
-        }
-      ]
+              area: "Personal Injury",
+            },
+          ],
+        },
+      ],
     };

     const expected = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "Frank Giunta",
-          "email": "frank.giunta@example.com",
-          "title": "Personal Injury Attorney",
+          name: "Frank Giunta",
+          email: "frank.giunta@example.com",
+          title: "Personal Injury Attorney",
           "phone-number": "214.370.5200",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
+              area: "Personal Injury",
+            },
+          ],
         },
         {
-          "name": "Dale R. Rose",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "Dale R. Rose",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": "972.562.0266",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
-        }
-      ]
+              area: "Personal Injury",
+            },
+          ],
+        },
+      ],
     };

     const result = mergeNullValObjs(objArray);

     expect(result).toEqual(expected);
-  })
+  });

   it("should correctly merge and deduplicate objects", async () => {
     const objArray = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "Frank Giunta",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "Frank Giunta",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": "214.370.5200",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
+              area: "Personal Injury",
+            },
+          ],
         },
         {
-          "name": "Frank Giunta",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "Frank Giunta",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": "214.370.5200",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
+              area: "Personal Injury",
+            },
+          ],
         },
         {
-          "name": "Dale R. Rose",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "Dale R. Rose",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": "972.562.0266",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
-        }
-      ]
+              area: "Personal Injury",
+            },
+          ],
+        },
+      ],
     };

     const expected = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "Frank Giunta",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "Frank Giunta",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": "214.370.5200",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
+              area: "Personal Injury",
+            },
+          ],
         },
         {
-          "name": "Dale R. Rose",
-          "email": null,
-          "title": "Personal Injury Attorney",
+          name: "Dale R. Rose",
+          email: null,
+          title: "Personal Injury Attorney",
           "phone-number": "972.562.0266",
           "practice-areas": [
             {
-              "area": "Personal Injury"
-            }
-          ]
-        }
-      ]
+              area: "Personal Injury",
+            },
+          ],
+        },
+      ],
     };

     const result = mergeNullValObjs(objArray);
@@ -298,177 +298,172 @@ describe("mergeNullValObjs", () => {
 
   it("should merge arrays of similar objects", async () => {
     const objArray = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "Allen Cox",
-          "email": null,
-          "title": "Personal Injury Lawyer",
+          name: "Allen Cox",
+          email: null,
+          title: "Personal Injury Lawyer",
           "phone-number": "972.606.9000",
-          "practice-areas": [
-            { "area": "Personal Injury" }
-          ]
+          "practice-areas": [{ area: "Personal Injury" }],
         },
         {
-          "name": "Allen Cox",
-          "email": "allen.cox@example.com",
-          "title": "Personal Injury Lawyer",
+          name: "Allen Cox",
+          email: "allen.cox@example.com",
+          title: "Personal Injury Lawyer",
           "phone-number": null,
           "practice-areas": [
-            { "area": "Automobile accidents" },
-            { "area": "Truck accidents" },
-            { "area": "Amusement park injury" },
-            { "area": "Bus accident" },
-            { "area": "Industrial accidents" },
-            { "area": "Product defects" },
-            { "area": "Food poisoning" },
-            { "area": "Workplace accidents" },
-            { "area": "Wrongful death" },
-            { "area": "Swimming pool accidents" },
-            { "area": "Premises accidents" },
-            { "area": "Aircraft accidents" },
-            { "area": "Animal and dog bites" }
-          ]
-        }
-      ]
-    }
+            { area: "Automobile accidents" },
+            { area: "Truck accidents" },
+            { area: "Amusement park injury" },
+            { area: "Bus accident" },
+            { area: "Industrial accidents" },
+            { area: "Product defects" },
+            { area: "Food poisoning" },
+            { area: "Workplace accidents" },
+            { area: "Wrongful death" },
+            { area: "Swimming pool accidents" },
+            { area: "Premises accidents" },
+            { area: "Aircraft accidents" },
+            { area: "Animal and dog bites" },
+          ],
+        },
+      ],
+    };
 
     const expected = {
-      "lawyers": [
+      lawyers: [
         {
-          "name": "Allen Cox",
-          "email": "allen.cox@example.com",
-          "title": "Personal Injury Lawyer",
+          name: "Allen Cox",
+          email: "allen.cox@example.com",
+          title: "Personal Injury Lawyer",
           "phone-number": "972.606.9000",
           "practice-areas": [
-            { "area": "Personal Injury" },
-            { "area": "Automobile accidents" },
-            { "area": "Truck accidents" },
-            { "area": "Amusement park injury" },
-            { "area": "Bus accident" },
-            { "area": "Industrial accidents" },
-            { "area": "Product defects" },
-            { "area": "Food poisoning" },
-            { "area": "Workplace accidents" },
-            { "area": "Wrongful death" },
-            { "area": "Swimming pool accidents" },
-            { "area": "Premises accidents" },
-            { "area": "Aircraft accidents" },
-            { "area": "Animal and dog bites" }
-          ]
-        }
-      ]
-    }
+            { area: "Personal Injury" },
+            { area: "Automobile accidents" },
+            { area: "Truck accidents" },
+            { area: "Amusement park injury" },
+            { area: "Bus accident" },
+            { area: "Industrial accidents" },
+            { area: "Product defects" },
+            { area: "Food poisoning" },
+            { area: "Workplace accidents" },
+            { area: "Wrongful death" },
+            { area: "Swimming pool accidents" },
+            { area: "Premises accidents" },
+            { area: "Aircraft accidents" },
+            { area: "Animal and dog bites" },
+          ],
+        },
+      ],
+    };
 
     const result = mergeNullValObjs(objArray);
 
     expect(result).toEqual(expected);
-  })
+  });
 
   it("should merge arrays of similar objects with different key names", async () => {
     const objArray = {
-      "attorneys": [
+      attorneys: [
         {
-          "fullName": "Allen Cox",
-          "contactEmail": null,
-          "position": "Personal Injury Lawyer",
-          "contactNumber": "972.606.9000",
-          "specializations": [
-            { "field": "Personal Injury" }
-          ]
+          fullName: "Allen Cox",
+          contactEmail: null,
+          position: "Personal Injury Lawyer",
+          contactNumber: "972.606.9000",
+          specializations: [{ field: "Personal Injury" }],
         },
         {
-          "fullName": "Allen Cox",
-          "contactEmail": "allen.cox@example.com",
-          "position": "Personal Injury Lawyer",
-          "contactNumber": null,
-          "specializations": [
-            { "field": "Automobile accidents" },
-            { "field": "Truck accidents" },
-            { "field": "Amusement park injury" },
-            { "field": "Bus accident" },
-            { "field": "Industrial accidents" },
-            { "field": "Product defects" },
-            { "field": "Food poisoning" },
-            { "field": "Workplace accidents" },
-            { "field": "Wrongful death" },
-            { "field": "Swimming pool accidents" },
-            { "field": "Premises accidents" },
-            { "field": "Aircraft accidents" },
-            { "field": "Animal and dog bites" }
-          ]
-        }
-      ]
-    }
+          fullName: "Allen Cox",
+          contactEmail: "allen.cox@example.com",
+          position: "Personal Injury Lawyer",
+          contactNumber: null,
+          specializations: [
+            { field: "Automobile accidents" },
+            { field: "Truck accidents" },
+            { field: "Amusement park injury" },
+            { field: "Bus accident" },
+            { field: "Industrial accidents" },
+            { field: "Product defects" },
+            { field: "Food poisoning" },
+            { field: "Workplace accidents" },
+            { field: "Wrongful death" },
+            { field: "Swimming pool accidents" },
+            { field: "Premises accidents" },
+            { field: "Aircraft accidents" },
+            { field: "Animal and dog bites" },
+          ],
+        },
+      ],
+    };
 
     const expected = {
-      "attorneys": [
+      attorneys: [
         {
-          "fullName": "Allen Cox",
-          "contactEmail": "allen.cox@example.com",
-          "position": "Personal Injury Lawyer",
-          "contactNumber": "972.606.9000",
-          "specializations": [
-            { "field": "Personal Injury" },
-            { "field": "Automobile accidents" },
-            { "field": "Truck accidents" },
-            { "field": "Amusement park injury" },
-            { "field": "Bus accident" },
-            { "field": "Industrial accidents" },
-            { "field": "Product defects" },
-            { "field": "Food poisoning" },
-            { "field": "Workplace accidents" },
-            { "field": "Wrongful death" },
-            { "field": "Swimming pool accidents" },
-            { "field": "Premises accidents" },
-            { "field": "Aircraft accidents" },
-            { "field": "Animal and dog bites" }
-          ]
-        }
-      ]
-    }
+          fullName: "Allen Cox",
+          contactEmail: "allen.cox@example.com",
+          position: "Personal Injury Lawyer",
+          contactNumber: "972.606.9000",
+          specializations: [
+            { field: "Personal Injury" },
+            { field: "Automobile accidents" },
+            { field: "Truck accidents" },
+            { field: "Amusement park injury" },
+            { field: "Bus accident" },
+            { field: "Industrial accidents" },
+            { field: "Product defects" },
+            { field: "Food poisoning" },
+            { field: "Workplace accidents" },
+            { field: "Wrongful death" },
+            { field: "Swimming pool accidents" },
+            { field: "Premises accidents" },
+            { field: "Aircraft accidents" },
+            { field: "Animal and dog bites" },
+          ],
+        },
+      ],
+    };
 
     const result = mergeNullValObjs(objArray);
 
     expect(result).toEqual(expected);
-  })
+  });
 
-  it ("should deal with not array values", async () => {
+  it("should deal with not array values", async () => {
     const objArray = {
-      "lawyers": {
-        "name": "not an array"
-      },
-      "attorneys": {
-        "name": "not an array"
-      }
-    }
+      lawyers: {
+        name: "not an array",
+      },
+      attorneys: {
+        name: "not an array",
+      },
+    };
 
     const expected = {
-      "lawyers": {
-        "name": "not an array"
-      },
-      "attorneys": {
-        "name": "not an array"
-      }
-    }
+      lawyers: {
+        name: "not an array",
+      },
+      attorneys: {
+        name: "not an array",
+      },
+    };
 
     // @ts-expect-error
     const result = mergeNullValObjs(objArray);
 
     expect(result).toEqual(expected);
-  })
+  });
 
-  it ("should deal with arrays of strings", async () => {
+  it("should deal with arrays of strings", async () => {
     const objArray = {
-      "lawyers": ["res1", "res2", "res3"]
-    }
+      lawyers: ["res1", "res2", "res3"],
+    };
 
     const expected = {
-      "lawyers": ["res1", "res2", "res3"]
-    }
+      lawyers: ["res1", "res2", "res3"],
+    };
 
     const result = mergeNullValObjs(objArray);
 
     expect(result).toEqual(expected);
-  })
-})
+  });
+});
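The merge rule these tests pin down is easiest to see in isolation. Below is a minimal, self-contained TypeScript sketch of the behavior being exercised, not the library's actual implementation: two records describing the same entity are collapsed by filling each null field from the other record's non-null value and concatenating array fields. `fillNulls` and the sample records are illustrative names only.

type Rec = { [key: string]: any };

// Fill each null/undefined field of `base` from `other`; concatenate array fields.
function fillNulls(base: Rec, other: Rec): Rec {
  const merged: Rec = { ...base };
  for (const key of Object.keys(other)) {
    if (merged[key] === null || merged[key] === undefined) {
      merged[key] = other[key];
    } else if (Array.isArray(merged[key]) && Array.isArray(other[key])) {
      merged[key] = [...merged[key], ...other[key]];
    }
  }
  return merged;
}

// Mirrors the "Allen Cox" fixture above: two partial rows collapse into one complete row.
const a = { name: "Allen Cox", email: null, "phone-number": "972.606.9000" };
const b = { name: "Allen Cox", email: "allen.cox@example.com", "phone-number": null };
console.log(fillNulls(a, b));
// => { name: "Allen Cox", email: "allen.cox@example.com", "phone-number": "972.606.9000" }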
[File diff suppressed because it is too large]
@@ -2,7 +2,7 @@ import { spreadSchemas } from "../extract/helpers/spread-schemas";
 
 describe("spreadSchemas", () => {
   it("should spread kyb schema (id: 1)", async () => {
-    const keys = ["owners"]
+    const keys = ["owners"];
     const schema = {
       type: "object",
       properties: {
@@ -21,13 +21,13 @@ describe("spreadSchemas", () => {
             city: { type: "string" },
             state: { type: "string" },
             country: { type: "string" },
-            postal_code: { type: "string" }
+            postal_code: { type: "string" },
           },
         },
         incorporation_date: { type: "string", format: "date" },
         phone: { type: "string" },
-        email: { type: "string", format: "email" }
+        email: { type: "string", format: "email" },
-      }
+      },
       },
       owners: {
         type: "array",
@@ -43,18 +43,21 @@ describe("spreadSchemas", () => {
             city: { type: "string" },
             state: { type: "string" },
             country: { type: "string" },
-            postal_code: { type: "string" }
+            postal_code: { type: "string" },
           },
         },
         phone: { type: "string" },
-        email: { type: "string", format: "email" }
-      }
-    }
-    }
-  }
-}
+        email: { type: "string", format: "email" },
+      },
+    },
+    },
+  },
+};
 
-    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
+    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
+      schema,
+      keys,
+    );
 
     expect(singleAnswerSchema).toEqual({
       type: "object",
@@ -74,16 +77,16 @@ describe("spreadSchemas", () => {
             city: { type: "string" },
             state: { type: "string" },
             country: { type: "string" },
-            postal_code: { type: "string" }
-          }
+            postal_code: { type: "string" },
+          },
         },
         incorporation_date: { type: "string", format: "date" },
         phone: { type: "string" },
-        email: { type: "string", format: "email" }
-      }
+        email: { type: "string", format: "email" },
       },
       },
-  })
+      },
+    });
 
     expect(multiEntitySchema).toEqual({
       type: "object",
@@ -102,20 +105,20 @@ describe("spreadSchemas", () => {
             city: { type: "string" },
             state: { type: "string" },
             country: { type: "string" },
-            postal_code: { type: "string" }
-          }
+            postal_code: { type: "string" },
+          },
         },
         phone: { type: "string" },
-        email: { type: "string", format: "email" }
-      }
-    }
-  }
-}
-    })
-  })
+        email: { type: "string", format: "email" },
+      },
+    },
+  },
+},
+    });
+  });
 
   it("should spread lawyers schema (id: 9)", async () => {
-    const keys = ["lawyers"]
+    const keys = ["lawyers"];
     const schema = {
       type: "object",
       properties: {
@@ -133,22 +136,25 @@ describe("spreadSchemas", () => {
           items: {
             type: "object",
             properties: {
-              area: { type: "string" }
+              area: { type: "string" },
+            },
+          },
+          alias: "practice-areas",
+        },
+      },
           },
         },
-        alias: "practice-areas"
-        }
       },
-    }
-  }
-}
     };
 
-    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
+    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
+      schema,
+      keys,
+    );
 
-    expect(singleAnswerSchema).toEqual({})
+    expect(singleAnswerSchema).toEqual({});
-    expect(multiEntitySchema).toEqual(schema)
+    expect(multiEntitySchema).toEqual(schema);
-  })
+  });
 
   it("shoud spread (id: 26)", async () => {
     const schema = {
@@ -161,19 +167,22 @@ describe("spreadSchemas", () => {
           properties: {
             name: { type: "string" },
             price: { type: "string" },
-            description: { type: "string" }
-          }
-        }
-      }
-    }
-  }
+            description: { type: "string" },
+          },
+        },
+      },
+    },
+  };
 
-    const keys = ["products"]
+    const keys = ["products"];
-    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
+    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
+      schema,
+      keys,
+    );
 
-    expect(singleAnswerSchema).toEqual({})
+    expect(singleAnswerSchema).toEqual({});
-    expect(multiEntitySchema).toEqual(schema)
+    expect(multiEntitySchema).toEqual(schema);
-  })
+  });
 
   it("shoud spread categories and products", async () => {
     const schema = {
@@ -182,8 +191,8 @@ describe("spreadSchemas", () => {
       categories: {
         type: "array",
         items: {
-          type: "string"
-        }
+          type: "string",
+        },
       },
       products: {
         type: "array",
@@ -192,19 +201,22 @@ describe("spreadSchemas", () => {
          properties: {
            name: { type: "string" },
            price: { type: "string" },
-           description: { type: "string" }
-          }
-        }
-      }
-    }
-  }
+           description: { type: "string" },
+          },
+        },
+      },
+    },
+  };
 
-    const keys = ["products", "categories"]
+    const keys = ["products", "categories"];
-    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
+    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
+      schema,
+      keys,
+    );
 
-    expect(singleAnswerSchema).toEqual({})
+    expect(singleAnswerSchema).toEqual({});
-    expect(multiEntitySchema).toEqual(schema)
+    expect(multiEntitySchema).toEqual(schema);
-  })
+  });
 
   it("should spread (id: 29)", async () => {
     const schema = {
@@ -220,50 +232,55 @@ describe("spreadSchemas", () => {
         offers_cmmc: { type: "boolean" },
         has_soc_2_cert: { type: "boolean" },
         offers_office365: { type: "boolean" },
-        offers_endpoint_security: { type: "boolean" }
-      }
-    }
+        offers_endpoint_security: { type: "boolean" },
+      },
+    };
 
-    const keys = []
+    const keys = [];
-    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
+    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
+      schema,
+      keys,
+    );
 
-    expect(singleAnswerSchema).toEqual(schema)
+    expect(singleAnswerSchema).toEqual(schema);
-    expect(multiEntitySchema).toEqual({})
+    expect(multiEntitySchema).toEqual({});
-  })
+  });
 
   it("should spread kyb schema (id: 29)", async () => {
 
     const schema = {
-      "type": "object",
-      "properties": {
-        "lawyers": {
-          "type": "array",
-          "items": {
-            "type": "object",
-            "properties": {
-              "name": { "type": "string" },
-              "email": { "type": ["string", "null"] },
-              "phone-number": { "type": "string" },
+      type: "object",
+      properties: {
+        lawyers: {
+          type: "array",
+          items: {
+            type: "object",
+            properties: {
+              name: { type: "string" },
+              email: { type: ["string", "null"] },
+              "phone-number": { type: "string" },
               "practice-areas": {
-                "type": "array",
-                "items": {
-                  "type": "object",
-                  "properties": {
-                    "area": { "type": "string" }
-                  }
-                }
+                type: "array",
+                items: {
+                  type: "object",
+                  properties: {
+                    area: { type: "string" },
+                  },
+                },
               },
-              "title": { "type": ["string", "null"] }
-            }
-          }
-        }
-      }
-    }
+              title: { type: ["string", "null"] },
+            },
+          },
+        },
+      },
+    };
 
-    const keys = ["lawyers"]
+    const keys = ["lawyers"];
-    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
+    const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
+      schema,
+      keys,
+    );
 
-    expect(singleAnswerSchema).toEqual({})
+    expect(singleAnswerSchema).toEqual({});
-    expect(multiEntitySchema).toEqual(schema)
+    expect(multiEntitySchema).toEqual(schema);
-  })
-})
+  });
+});
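Reading across these cases: properties named in `keys` peel off into `multiEntitySchema`, everything else stays in `singleAnswerSchema`, and a side left with no properties collapses to `{}`. A simplified sketch of that split, assuming top-level properties only (the real helper lives in ../extract/helpers/spread-schemas; `splitSchema` is an illustrative name):

function splitSchema(schema: any, keys: string[]) {
  const single: any = { ...schema, properties: {} };
  const multi: any = { ...schema, properties: {} };
  for (const [name, prop] of Object.entries<any>(schema.properties ?? {})) {
    // Multi-entity keys go one way, single-answer keys the other.
    (keys.includes(name) ? multi : single).properties[name] = prop;
  }
  const isEmpty = (s: any) => Object.keys(s.properties).length === 0;
  return {
    singleAnswerSchema: isEmpty(single) ? {} : single,
    multiEntitySchema: isEmpty(multi) ? {} : multi,
  };
}

Under this sketch, `keys = ["lawyers"]` on a lawyers-only schema yields `{}` and the full schema, and `keys = []` yields the full schema and `{}`, matching the expectations above.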
[File diff suppressed because it is too large]
@@ -42,7 +42,10 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) {
   if (!cacheRedis) return;
 
   if (!entry.html || entry.html.length < 100) {
-    logger.warn("Skipping cache save for short HTML", { key, htmlLength: entry.html?.length });
+    logger.warn("Skipping cache save for short HTML", {
+      key,
+      htmlLength: entry.html?.length,
+    });
     return;
   }
 
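The guard above refuses to cache suspiciously small documents. As a standalone predicate (the 100-character threshold is taken from the hunk; the function name is illustrative):

function isCacheableHtml(html: string | undefined): boolean {
  // Anything under 100 chars is likely an error page or empty shell, not real content.
  return typeof html === "string" && html.length >= 100;
}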
@@ -127,13 +127,15 @@ export async function getDoneJobsOrdered(
 export async function isCrawlFinished(id: string) {
   return (
     (await redisConnection.scard("crawl:" + id + ":jobs_done")) ===
-    (await redisConnection.scard("crawl:" + id + ":jobs"))
-    && (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
+      (await redisConnection.scard("crawl:" + id + ":jobs")) &&
+    (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
   );
 }
 
 export async function isCrawlKickoffFinished(id: string) {
-  return await redisConnection.get("crawl:" + id + ":kickoff:finish") !== null
+  return (
+    (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
+  );
 }
 
 export async function isCrawlFinishedLocked(id: string) {
@@ -141,7 +143,12 @@ export async function isCrawlFinishedLocked(id: string) {
 }
 
 export async function finishCrawlKickoff(id: string) {
-  await redisConnection.set("crawl:" + id + ":kickoff:finish", "yes", "EX", 24 * 60 * 60);
+  await redisConnection.set(
+    "crawl:" + id + ":kickoff:finish",
+    "yes",
+    "EX",
+    24 * 60 * 60,
+  );
 }
 
 export async function finishCrawl(id: string) {
@@ -161,9 +168,10 @@ export async function finishCrawl(id: string) {
       module: "crawl-redis",
       method: "finishCrawl",
       crawlId: id,
-      jobs_done: (await redisConnection.scard("crawl:" + id + ":jobs_done")),
+      jobs_done: await redisConnection.scard("crawl:" + id + ":jobs_done"),
-      jobs: (await redisConnection.scard("crawl:" + id + ":jobs")),
+      jobs: await redisConnection.scard("crawl:" + id + ":jobs"),
-      kickoff_finished: (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
+      kickoff_finished:
+        (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
     });
   }
 }
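The finish check above compares two Redis set cardinalities plus a marker key. A condensed sketch of the same pattern with ioredis (function name illustrative): a crawl counts as done only when every enqueued job id in `crawl:<id>:jobs` is matched in `crawl:<id>:jobs_done` and the kickoff phase has written its finish marker. Without that marker, a crawl could read as finished while kickoff is still enqueuing jobs, since 0 === 0; that appears to be what the `kickoff:finish` condition guards against.

import Redis from "ioredis";

async function crawlIsFinished(redis: Redis, id: string): Promise<boolean> {
  const [jobs, done, kickoff] = await Promise.all([
    redis.scard(`crawl:${id}:jobs`), // all enqueued job ids
    redis.scard(`crawl:${id}:jobs_done`), // finished job ids
    redis.get(`crawl:${id}:kickoff:finish`), // "yes" once kickoff completes
  ]);
  return done === jobs && kickoff !== null;
}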
@@ -1,81 +1,81 @@
 // const id = crypto.randomUUID();
 
 // const sc: StoredCrawl = {
 //   originUrl: request.urls[0].replace("/*",""),
 //   crawlerOptions: toLegacyCrawlerOptions({
 //     maxDepth: 15,
 //     limit: 5000,
 //     includePaths: [],
 //     excludePaths: [],
 //     ignoreSitemap: false,
 //     allowExternalLinks: false,
 //     allowBackwardLinks: true,
 //     allowSubdomains: false,
 //     ignoreRobotsTxt: false,
 //     deduplicateSimilarURLs: false,
 //     ignoreQueryParameters: false
 //   }),
 //   scrapeOptions: {
 //     formats: ["markdown"],
 //     onlyMainContent: true,
 //     waitFor: 0,
 //     mobile: false,
 //     removeBase64Images: true,
 //     fastMode: false,
 //     parsePDF: true,
 //     skipTlsVerification: false,
 //   },
 //   internalOptions: {
 //     disableSmartWaitCache: true,
 //     isBackgroundIndex: true
 //   },
 //   team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
 //   createdAt: Date.now(),
 //   plan: "hobby", // make it a low concurrency
 // };
 
 // // Save the crawl configuration
 // await saveCrawl(id, sc);
 
 // // Then kick off the job
 // await _addScrapeJobToBullMQ({
 //   url: request.urls[0].replace("/*",""),
 //   mode: "kickoff" as const,
 //   team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
 //   plan: "hobby", // make it a low concurrency
 //   crawlerOptions: sc.crawlerOptions,
 //   scrapeOptions: sc.scrapeOptions,
 //   internalOptions: sc.internalOptions,
 //   origin: "index",
 //   crawl_id: id,
 //   webhook: null,
 //   v1: true,
 // }, {}, crypto.randomUUID(), 50);
 
 // we restructure and make all of the arrays we need to fill into objects,
 // adding them to a single object so the llm can fill them one at a time
 // TODO: make this work for more complex schemas where arrays are not first level
 
 // let schemasForLLM: {} = {};
 // for (const key in largeArraysSchema) {
 //   const originalSchema = structuredClone(largeArraysSchema[key].items);
 //   console.log(
 //     "key",
 //     key,
 //     "\noriginalSchema",
 //     JSON.stringify(largeArraysSchema[key], null, 2),
 //   );
 //   let clonedObj = {
 //     type: "object",
 //     properties: {
 //       informationFilled: {
 //         type: "boolean",
 //       },
 //       data: {
 //         type: "object",
 //         properties: originalSchema.properties,
 //       },
 //     },
 //   };
 //   schemasForLLM[key] = clonedObj;
 // }
@@ -59,11 +59,11 @@ export async function updateExtract(
 
   // Limit links in steps to 500
   if (extract.steps) {
-    extract.steps = extract.steps.map(step => {
+    extract.steps = extract.steps.map((step) => {
       if (step.discoveredLinks && step.discoveredLinks.length > 500) {
         return {
           ...step,
-          discoveredLinks: step.discoveredLinks.slice(0, 500)
+          discoveredLinks: step.discoveredLinks.slice(0, 500),
         };
       }
       return step;
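The 500-link cap above, extracted into a standalone helper for clarity (names assumed; `Step` is a minimal stand-in for the real step type):

type Step = { discoveredLinks?: string[] };

// Cap discoveredLinks per step so a stored extract never grows unboundedly.
function capLinks(steps: Step[], max = 500): Step[] {
  return steps.map((step) =>
    step.discoveredLinks && step.discoveredLinks.length > max
      ? { ...step, discoveredLinks: step.discoveredLinks.slice(0, max) }
      : step,
  );
}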
@@ -32,7 +32,11 @@ import { ExtractStep, updateExtract } from "./extract-redis";
 import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array";
 import { mergeNullValObjs } from "./helpers/merge-null-val-objs";
 import { CUSTOM_U_TEAMS, extractConfig } from "./config";
-import { calculateFinalResultCost, estimateCost, estimateTotalCost } from "./usage/llm-cost";
+import {
+  calculateFinalResultCost,
+  estimateCost,
+  estimateTotalCost,
+} from "./usage/llm-cost";
 import { numTokensFromString } from "../LLM-extraction/helpers";
 
 interface ExtractServiceOptions {
@@ -147,7 +151,13 @@ Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`,
     totalTokens: result.usage?.total_tokens ?? 0,
     model: model,
   };
-  return { isMultiEntity, multiEntityKeys, reasoning, keyIndicators, tokenUsage };
+  return {
+    isMultiEntity,
+    multiEntityKeys,
+    reasoning,
+    keyIndicators,
+    tokenUsage,
+  };
 }
 
 type completions = {
@@ -246,7 +256,7 @@ export async function performExtraction(
         "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
       extractId,
       urlTrace: urlTraces,
-      totalUrlsScraped: 0
+      totalUrlsScraped: 0,
     };
   }
 
@@ -277,8 +287,13 @@
   // 1. the first one is a completion that will extract the array of items
   // 2. the second one is multiple completions that will extract the items from the array
   let startAnalyze = Date.now();
-  const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators, tokenUsage: schemaAnalysisTokenUsage } =
-    await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
+  const {
+    isMultiEntity,
+    multiEntityKeys,
+    reasoning,
+    keyIndicators,
+    tokenUsage: schemaAnalysisTokenUsage,
+  } = await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
 
   // Track schema analysis tokens
   tokenUsage.push(schemaAnalysisTokenUsage);
@@ -540,7 +555,7 @@
         "An unexpected error occurred. Please contact help@firecrawl.com for help.",
       extractId,
       urlTrace: urlTraces,
-      totalUrlsScraped
+      totalUrlsScraped,
     };
   }
 }
@@ -592,17 +607,18 @@
       }
     }
 
-    const validResults = results.filter((doc): doc is Document => doc !== null);
+    const validResults = results.filter(
+      (doc): doc is Document => doc !== null,
+    );
     singleAnswerDocs.push(...validResults);
     totalUrlsScraped += validResults.length;
-
   } catch (error) {
     return {
       success: false,
       error: error.message,
       extractId,
       urlTrace: urlTraces,
-      totalUrlsScraped
+      totalUrlsScraped,
     };
   }
 
@@ -614,7 +630,7 @@
         "All provided URLs are invalid. Please check your input and try again.",
       extractId,
       urlTrace: request.urlTrace ? urlTraces : undefined,
-      totalUrlsScraped: 0
+      totalUrlsScraped: 0,
     };
   }
 
@@ -679,12 +695,12 @@
     : singleAnswerResult || multiEntityResult;
 
   // Tokenize final result to get token count
-  let finalResultTokens = 0;
+  // let finalResultTokens = 0;
-  if (finalResult) {
+  // if (finalResult) {
-    const finalResultStr = JSON.stringify(finalResult);
+  //   const finalResultStr = JSON.stringify(finalResult);
-    finalResultTokens = numTokensFromString(finalResultStr, "gpt-4o");
+  //   finalResultTokens = numTokensFromString(finalResultStr, "gpt-4o");
 
-  }
+  // }
   // // Deduplicate and validate final result against schema
   // if (reqSchema && finalResult && finalResult.length <= extractConfig.DEDUPLICATION.MAX_TOKENS) {
   //   const schemaValidation = await generateOpenAICompletions(
@@ -732,12 +748,10 @@
   const llmUsage = estimateTotalCost(tokenUsage);
   let tokensToBill = calculateFinalResultCost(finalResult);
 
-
   if (CUSTOM_U_TEAMS.includes(teamId)) {
     tokensToBill = 1;
   }
 
-
   // Bill team for usage
   billTeam(teamId, subId, tokensToBill, logger, true).catch((error) => {
     logger.error(
@@ -745,7 +759,6 @@
     );
   });
 
-
   // Log job with token usage
   logJob({
     job_id: extractId,
@@ -779,6 +792,6 @@
     warning: undefined, // TODO FIX
     urlTrace: request.urlTrace ? urlTraces : undefined,
     llmUsage,
-    totalUrlsScraped
+    totalUrlsScraped,
   };
 }
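A thread running through these hunks: every LLM call pushes a usage record onto `tokenUsage`, and `estimateTotalCost` later folds the records into a billable figure. A toy illustration of that accumulation, with the record shape inferred from `totalTokens: result.usage?.total_tokens ?? 0` above (any field beyond `totalTokens` and `model` is an assumption, and the numbers are made up):

type TokenUsage = { totalTokens: number; model: string };

const tokenUsage: TokenUsage[] = [];
tokenUsage.push({ totalTokens: 1024, model: "gpt-4o" }); // e.g. schema analysis
tokenUsage.push({ totalTokens: 4096, model: "gpt-4o" }); // e.g. extraction pass

// estimateTotalCost presumably weights these by model pricing; the raw sum is:
const totalTokens = tokenUsage.reduce((sum, u) => sum + u.totalTokens, 0);
console.log(totalTokens); // 5120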
@@ -1,10 +1,12 @@
-export function deduplicateObjectsArray(objArray: { [key: string]: any[] }): { [key: string]: any[] } {
+export function deduplicateObjectsArray(objArray: { [key: string]: any[] }): {
+  [key: string]: any[];
+} {
   const deduplicatedObjArray: { [key: string]: any[] } = {};
 
   for (const key in objArray) {
     if (Array.isArray(objArray[key])) {
       const seen = new Set();
-      deduplicatedObjArray[key] = objArray[key].filter(item => {
+      deduplicatedObjArray[key] = objArray[key].filter((item) => {
         // Create a unique identifier for each item based on its properties
         const identifier = JSON.stringify(item);
 
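The filter above keys each item by its JSON serialization. The same idea, self-contained:

function dedupe<T>(items: T[]): T[] {
  const seen = new Set<string>();
  return items.filter((item) => {
    // Serialization acts as a structural identity for the item.
    const id = JSON.stringify(item);
    if (seen.has(id)) return false;
    seen.add(id);
    return true;
  });
}

console.log(dedupe([{ a: 1 }, { a: 1 }, { a: 2 }])); // => [ { a: 1 }, { a: 2 } ]

One known limit of serialization keys: they are key-order-sensitive, so `{ a: 1, b: 2 }` and `{ b: 2, a: 1 }` would not collapse.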
@@ -1,5 +1,5 @@
-import * as fs from 'fs';
+import * as fs from "fs";
-import * as path from 'path';
+import * as path from "path";
 
 /**
  * Helper function to dump data to a file for debugging/logging purposes
@@ -10,17 +10,19 @@ import * as path from 'path';
 export function dumpToFile<T>(
   filename: string,
   data: T[],
-  formatter?: (item: T, index: number) => string
+  formatter?: (item: T, index: number) => string,
 ) {
   const filePath = path.join(__dirname, filename);
 
   let fileContent: string;
   if (formatter) {
-    fileContent = data.map((item, index) => formatter(item, index)).join('\n');
+    fileContent = data.map((item, index) => formatter(item, index)).join("\n");
   } else {
-    fileContent = data.map((item, index) => `${index + 1}. ${JSON.stringify(item)}`).join('\n');
+    fileContent = data
+      .map((item, index) => `${index + 1}. ${JSON.stringify(item)}`)
+      .join("\n");
   }
 
-  fs.writeFileSync(filePath, fileContent, 'utf8');
+  fs.writeFileSync(filePath, fileContent, "utf8");
   console.log(`Dumped data to ${filename}`);
 }
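Example call for the helper above (the filename and data are illustrative; output lands next to the compiled module because of `__dirname`):

dumpToFile(
  "scrape-debug.log",
  [{ url: "https://example.com", status: 200 }],
  (item, index) => `${index}: ${item.url} -> ${item.status}`,
);

Omitting the formatter falls back to numbered `JSON.stringify` lines, which is handy for quick inspection of arbitrary objects.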
@@ -1,4 +1,4 @@
-import { deduplicateObjectsArray } from './deduplicate-objs-array';
+import { deduplicateObjectsArray } from "./deduplicate-objs-array";
 
 /**
  * Convert "null" strings to actual null values for easier comparison.
@@ -56,7 +56,10 @@ function mergeArrays(arr1: any[], arr2: any[]): any[] {
   const combined = [...arr1, ...arr2];
   return combined.filter((item, index) => {
     const stringified = JSON.stringify(item);
-    return combined.findIndex(other => JSON.stringify(other) === stringified) === index;
+    return (
+      combined.findIndex((other) => JSON.stringify(other) === stringified) ===
+      index
+    );
   });
 }
 
@@ -78,9 +81,9 @@ function mergeObjects(obj1: any, obj2: any): any {
         // If only obj2's value is an array, use it
         result[key] = [...obj2[key]];
       }
-    } else if (typeof obj2[key] === 'object') {
+    } else if (typeof obj2[key] === "object") {
       // If both are objects (but not arrays), merge them
-      if (typeof result[key] === 'object' && !Array.isArray(result[key])) {
+      if (typeof result[key] === "object" && !Array.isArray(result[key])) {
         result[key] = mergeObjects(result[key], obj2[key]);
       } else {
         result[key] = { ...obj2[key] };
@@ -101,13 +104,17 @@ export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: st
 * null-equivalent fields, filling in null fields with the corresponding
 * non-null fields from the other object.
 */
-export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: string]: any[] } {
+export function mergeNullValObjs(objArray: { [key: string]: any[] }): {
+  [key: string]: any[];
+} {
   const result: { [key: string]: any[] } = {};
 
   for (const key in objArray) {
     if (Array.isArray(objArray[key])) {
       // If array contains only primitive values, return as is
-      if (objArray[key].every(item => typeof item !== 'object' || item === null)) {
+      if (
+        objArray[key].every((item) => typeof item !== "object" || item === null)
+      ) {
         result[key] = [...objArray[key]];
         continue;
       }
@@ -134,7 +141,10 @@ export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: st
       // Final deduplication pass
       result[key] = deduplicateObjectsArray({ [key]: mergedItems })[key];
     } else {
-      console.warn(`Expected an array at objArray[${key}], but found:`, objArray[key]);
+      console.warn(
+        `Expected an array at objArray[${key}], but found:`,
+        objArray[key],
+      );
       return objArray;
     }
   }
@@ -1,7 +1,7 @@
 export async function mixSchemaObjects(
   finalSchema: any,
   singleAnswerResult: any,
-  multiEntityResult: any
+  multiEntityResult: any,
 ) {
   const finalResult: any = {};
 
@@ -9,14 +9,20 @@ export async function mixSchemaObjects(
   function mergeResults(schema: any, singleResult: any, multiResult: any) {
     const result: any = {};
     for (const key in schema.properties) {
-      if (schema.properties[key].type === 'object' && schema.properties[key].properties) {
+      if (
+        schema.properties[key].type === "object" &&
+        schema.properties[key].properties
+      ) {
         // If the property is an object, recursively merge its properties
         result[key] = mergeResults(
           schema.properties[key],
           singleResult[key] || {},
-          multiResult[key] || {}
+          multiResult[key] || {},
         );
-      } else if (schema.properties[key].type === 'array' && Array.isArray(multiResult[key])) {
+      } else if (
+        schema.properties[key].type === "array" &&
+        Array.isArray(multiResult[key])
+      ) {
         // If the property is an array, flatten the arrays from multiResult
         result[key] = multiResult[key].flat();
       } else if (singleResult.hasOwnProperty(key)) {
@@ -29,7 +35,10 @@ export async function mixSchemaObjects(
   }
 
   // Merge the properties from the final schema
-  Object.assign(finalResult, mergeResults(finalSchema, singleAnswerResult, multiEntityResult));
+  Object.assign(
+    finalResult,
+    mergeResults(finalSchema, singleAnswerResult, multiEntityResult),
+  );
 
   return finalResult;
 }
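To make the recursion concrete, here are illustrative inputs and the output shape. The nested-array input reflects the `.flat()` call above, which suggests multi-entity results arrive batched per document; that batching is an inference from the code, not something the diff states.

const schema = {
  type: "object",
  properties: {
    company: { type: "string" },
    lawyers: { type: "array", items: { type: "object" } },
  },
};
const single = { company: "Acme Legal" }; // hypothetical single-answer result
const multi = {
  lawyers: [[{ name: "Allen Cox" }], [{ name: "Dale R. Rose" }]], // one batch per document
};
// mixSchemaObjects(schema, single, multi) would then produce:
// { company: "Acme Legal",
//   lawyers: [{ name: "Allen Cox" }, { name: "Dale R. Rose" }] }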
@@ -1,4 +1,7 @@
-export async function spreadSchemas(schema: any, keys: string[]): Promise<{
+export async function spreadSchemas(
+  schema: any,
+  keys: string[],
+): Promise<{
   singleAnswerSchema: any;
   multiEntitySchema: any;
 }> {
@@ -1,8 +1,8 @@
-import isEqual from 'lodash/isEqual';
+import isEqual from "lodash/isEqual";
 
 export function transformArrayToObject(
   originalSchema: any,
-  arrayData: any[]
+  arrayData: any[],
 ): any {
   if (Object.keys(originalSchema).length == 0) {
     return {};
@@ -13,9 +13,9 @@ export function transformArrayToObject(
   // Function to find the array key in a nested schema
   function findArrayKey(schema: any): string | null {
     for (const key in schema.properties) {
-      if (schema.properties[key].type === 'array') {
+      if (schema.properties[key].type === "array") {
         return key;
-      } else if (schema.properties[key].type === 'object') {
+      } else if (schema.properties[key].type === "object") {
         const nestedKey = findArrayKey(schema.properties[key]);
         if (nestedKey) {
           return `${key}.${nestedKey}`;
@@ -31,7 +31,10 @@
       for (const key in item) {
         if (!acc[key]) {
           acc[key] = item[key];
-        } else if (typeof acc[key] === 'object' && typeof item[key] === 'object') {
+        } else if (
+          typeof acc[key] === "object" &&
+          typeof item[key] === "object"
+        ) {
           acc[key] = { ...acc[key], ...item[key] };
         }
       }
@@ -39,13 +42,16 @@
     }, {});
   }
 
-  const arrayKeyParts = arrayKeyPath.split('.');
+  const arrayKeyParts = arrayKeyPath.split(".");
   const arrayKey = arrayKeyParts.pop();
   if (!arrayKey) {
     throw new Error("Array key not found in schema");
   }
 
-  const parentSchema = arrayKeyParts.reduce((schema, key) => schema.properties[key], originalSchema);
+  const parentSchema = arrayKeyParts.reduce(
+    (schema, key) => schema.properties[key],
+    originalSchema,
+  );
   const itemSchema = parentSchema.properties[arrayKey].items;
   if (!itemSchema) {
     throw new Error("Item schema not found for array key");
@@ -53,7 +59,7 @@
 
   // Initialize the array in the transformed result
   let currentLevel = transformedResult;
-  arrayKeyParts.forEach(part => {
+  arrayKeyParts.forEach((part) => {
     if (!currentLevel[part]) {
       currentLevel[part] = {};
     }
@@ -63,20 +69,23 @@
 
   // Helper function to check if an object is already in the array
   function isDuplicateObject(array: any[], obj: any): boolean {
-    return array.some(existingItem => isEqual(existingItem, obj));
+    return array.some((existingItem) => isEqual(existingItem, obj));
   }
 
   // Helper function to validate if an object follows the schema
   function isValidObject(obj: any, schema: any): boolean {
-    return Object.keys(schema.properties).every(key => {
+    return Object.keys(schema.properties).every((key) => {
-      return obj.hasOwnProperty(key) && typeof obj[key] === schema.properties[key].type;
+      return (
+        obj.hasOwnProperty(key) &&
+        typeof obj[key] === schema.properties[key].type
+      );
     });
   }
 
   // Iterate over each item in the arrayData
-  arrayData.forEach(item => {
+  arrayData.forEach((item) => {
     let currentItem = item;
-    arrayKeyParts.forEach(part => {
+    arrayKeyParts.forEach((part) => {
       if (currentItem[part]) {
         currentItem = currentItem[part];
       }
@@ -84,7 +93,11 @@
 
     // Copy non-array properties from the parent object
     for (const key in parentSchema.properties) {
-      if (key !== arrayKey && currentItem.hasOwnProperty(key) && !currentLevel.hasOwnProperty(key)) {
+      if (
+        key !== arrayKey &&
+        currentItem.hasOwnProperty(key) &&
+        !currentLevel.hasOwnProperty(key)
+      ) {
        currentLevel[key] = currentItem[key];
      }
    }
@@ -92,7 +105,11 @@
     // Ensure that the currentItem[arrayKey] is an array before mapping
     if (Array.isArray(currentItem[arrayKey])) {
       currentItem[arrayKey].forEach((subItem: any) => {
-        if (typeof subItem === 'object' && subItem !== null && isValidObject(subItem, itemSchema)) {
+        if (
+          typeof subItem === "object" &&
+          subItem !== null &&
+          isValidObject(subItem, itemSchema)
+        ) {
           // For arrays of objects, add only unique objects
           const transformedItem: any = {};
           let hasValidData = false;
@@ -104,23 +121,35 @@
           }
         }
 
-        if (hasValidData && !isDuplicateObject(currentLevel[arrayKey], transformedItem)) {
+        if (
+          hasValidData &&
+          !isDuplicateObject(currentLevel[arrayKey], transformedItem)
+        ) {
           currentLevel[arrayKey].push(transformedItem);
         }
       }
     });
   } else {
-      console.warn(`Expected an array at ${arrayKey}, but found:`, currentItem[arrayKey]);
+      console.warn(
+        `Expected an array at ${arrayKey}, but found:`,
+        currentItem[arrayKey],
+      );
     }
 
     // Handle merging of array properties
     for (const key in parentSchema.properties) {
-      if (parentSchema.properties[key].type === 'array' && Array.isArray(currentItem[key])) {
+      if (
+        parentSchema.properties[key].type === "array" &&
+        Array.isArray(currentItem[key])
+      ) {
         if (!currentLevel[key]) {
           currentLevel[key] = [];
         }
         currentItem[key].forEach((value: any) => {
-          if (!currentLevel[key].includes(value) && !isDuplicateObject(currentLevel[arrayKey], value)) {
+          if (
+            !currentLevel[key].includes(value) &&
+            !isDuplicateObject(currentLevel[arrayKey], value)
+          ) {
            currentLevel[key].push(value);
          }
        });
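A worked example of the transformation above, using a shape like the lawyers fixtures from the tests (values illustrative): the schema locates the array property, and schema-valid items from every batch are collected into one array, with duplicates dropped via `isEqual`.

const schema = {
  type: "object",
  properties: {
    lawyers: {
      type: "array",
      items: { type: "object", properties: { name: { type: "string" } } },
    },
  },
};
const batches = [
  { lawyers: [{ name: "Allen Cox" }] },
  { lawyers: [{ name: "Allen Cox" }, { name: "Dale R. Rose" }] }, // duplicate collapses
];
// transformArrayToObject(schema, batches)
// => { lawyers: [{ name: "Allen Cox" }, { name: "Dale R. Rose" }] }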
@@ -91,7 +91,8 @@ export async function indexPage({
     url: normalizedUrl,
     originUrl: normalizeUrl(originUrl),
     title: document.metadata.title ?? document.metadata.ogTitle ?? "",
-    description: document.metadata.description ?? document.metadata.ogDescription ?? "",
+    description:
+      document.metadata.description ?? document.metadata.ogDescription ?? "",
     crawlId,
     teamId,
     markdown: trimmedMarkdown,
@@ -126,7 +127,7 @@ export async function indexPage({
 export async function searchSimilarPages(
   query: string,
   originUrl?: string,
-  limit: number = 1000
+  limit: number = 1000,
 ): Promise<any[]> {
   try {
     const index = pinecone.index(INDEX_NAME);
@ -59,7 +59,7 @@ export async function rerankLinks(
|
|||||||
const linksAndScores = await performRanking(
|
const linksAndScores = await performRanking(
|
||||||
mappedLinksRerank,
|
mappedLinksRerank,
|
||||||
mappedLinks.map((l) => l.url),
|
mappedLinks.map((l) => l.url),
|
||||||
searchQuery
|
searchQuery,
|
||||||
);
|
);
|
||||||
|
|
||||||
// First try with high threshold
|
// First try with high threshold
|
||||||
@ -109,7 +109,10 @@ export async function rerankLinks(
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
const rankedLinks = filteredLinks.slice(0, extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE);
|
const rankedLinks = filteredLinks.slice(
|
||||||
|
0,
|
||||||
|
extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE,
|
||||||
|
);
|
||||||
|
|
||||||
// Mark URLs that will be used in completion
|
// Mark URLs that will be used in completion
|
||||||
rankedLinks.forEach((link) => {
|
rankedLinks.forEach((link) => {
|
||||||
@ -120,8 +123,10 @@ export async function rerankLinks(
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Mark URLs that were dropped due to ranking limit
|
// Mark URLs that were dropped due to ranking limit
|
||||||
filteredLinks.slice(extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE).forEach(link => {
|
filteredLinks
|
||||||
const trace = urlTraces.find(t => t.url === link.url);
+      .slice(extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE)
+      .forEach((link) => {
+        const trace = urlTraces.find((t) => t.url === link.url);
         if (trace) {
           trace.warning = "Excluded due to ranking limit";
           trace.usedInCompletion = false;
@@ -155,7 +160,7 @@ function filterAndProcessLinks(
 export type RerankerResult = {
   mapDocument: MapDocument[];
   tokensUsed: number;
-}
+};
 
 export async function rerankLinksWithLLM(
   mappedLinks: MapDocument[],
@@ -184,23 +189,25 @@ export async function rerankLinksWithLLM(
           type: "object",
           properties: {
             url: { type: "string" },
-            relevanceScore: { type: "number" }
+            relevanceScore: { type: "number" },
           },
-          required: ["url", "relevanceScore"]
-        }
-      }
-    },
-    required: ["relevantLinks"]
+          required: ["url", "relevanceScore"],
+        },
+      },
+    },
+    required: ["relevantLinks"],
   };
 
 
   const results = await Promise.all(
     chunks.map(async (chunk, chunkIndex) => {
       // console.log(`Processing chunk ${chunkIndex + 1}/${chunks.length} with ${chunk.length} links`);
 
-      const linksContent = chunk.map(link =>
-        `URL: ${link.url}${link.title ? `\nTitle: ${link.title}` : ''}${link.description ? `\nDescription: ${link.description}` : ''}`
-      ).join("\n\n");
+      const linksContent = chunk
+        .map(
+          (link) =>
+            `URL: ${link.url}${link.title ? `\nTitle: ${link.title}` : ""}${link.description ? `\nDescription: ${link.description}` : ""}`,
+        )
+        .join("\n\n");
 
       for (let retry = 0; retry <= MAX_RETRIES; retry++) {
         try {
@@ -208,21 +215,27 @@ export async function rerankLinksWithLLM(
             setTimeout(() => resolve(null), TIMEOUT_MS);
           });
 
 
           const completionPromise = generateOpenAICompletions(
-            logger.child({ method: "rerankLinksWithLLM", chunk: chunkIndex + 1, retry }),
+            logger.child({
+              method: "rerankLinksWithLLM",
+              chunk: chunkIndex + 1,
+              retry,
+            }),
             {
               mode: "llm",
               systemPrompt: buildRerankerSystemPrompt(),
               prompt: buildRerankerUserPrompt(searchQuery),
-              schema: schema
+              schema: schema,
             },
             linksContent,
             undefined,
-            true
+            true,
           );
 
-          const completion = await Promise.race([completionPromise, timeoutPromise]);
+          const completion = await Promise.race([
+            completionPromise,
+            timeoutPromise,
+          ]);
 
           if (!completion) {
             // console.log(`Chunk ${chunkIndex + 1}: Timeout on attempt ${retry + 1}`);
@@ -237,9 +250,11 @@ export async function rerankLinksWithLLM(
           totalTokensUsed += completion.numTokens || 0;
           // console.log(`Chunk ${chunkIndex + 1}: Found ${completion.extract.relevantLinks.length} relevant links`);
           return completion.extract.relevantLinks;
         } catch (error) {
-          console.warn(`Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`, error);
+          console.warn(
+            `Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`,
+            error,
+          );
           if (retry === MAX_RETRIES) {
             // console.log(`Chunk ${chunkIndex + 1}: Max retries reached, returning empty array`);
             return [];
@@ -247,18 +262,20 @@ export async function rerankLinksWithLLM(
         }
       }
       return [];
-    })
+    }),
   );
 
   // console.log(`Processed ${results.length} chunks`);
 
   // Flatten results and sort by relevance score
-  const flattenedResults = results.flat().sort((a, b) => b.relevanceScore - a.relevanceScore);
+  const flattenedResults = results
+    .flat()
+    .sort((a, b) => b.relevanceScore - a.relevanceScore);
   // console.log(`Total relevant links found: ${flattenedResults.length}`);
 
   // Map back to MapDocument format, keeping only relevant links
   const relevantLinks = flattenedResults
-    .map(result => mappedLinks.find(link => link.url === result.url))
+    .map((result) => mappedLinks.find((link) => link.url === result.url))
     .filter((link): link is MapDocument => link !== undefined);
 
   // console.log(`Returning ${relevantLinks.length} relevant links`);
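Aside: the reranker above races each chunk's LLM completion against a timer that resolves to null (so a timed-out attempt does not throw) and retries up to MAX_RETRIES. A minimal sketch of that pattern, using a hypothetical callLLM stand-in for generateOpenAICompletions:

// Sketch only: race an LLM call against a null-resolving timeout, retrying.
// `callLLM` is a hypothetical stand-in, not the real generateOpenAICompletions.
async function withTimeoutRetries<T>(
  callLLM: () => Promise<T>,
  maxRetries = 2,
  timeoutMs = 30_000,
): Promise<T | null> {
  for (let retry = 0; retry <= maxRetries; retry++) {
    const timeout = new Promise<null>((resolve) =>
      setTimeout(() => resolve(null), timeoutMs),
    );
    try {
      // If the timer wins, result is null and the loop retries.
      const result = await Promise.race([callLLM(), timeout]);
      if (result !== null) return result;
    } catch (error) {
      if (retry === maxRetries) return null;
    }
  }
  return null;
}

Resolving to null rather than rejecting keeps a timeout from surfacing as an exception, which is why the real code checks `if (!completion)` instead of wrapping the race in its own try/catch.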
@@ -184,8 +184,6 @@ export async function processUrl(
   // (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
   // );
 
-
-
   const rerankerResult = await rerankLinksWithLLM(
     mappedLinks,
     rephrasedPrompt,
@@ -12,7 +12,9 @@ const tokenPerCharacter = 4;
 const baseTokenCost = 300;
 
 export function calculateFinalResultCost(data: any): number {
-  return Math.floor((JSON.stringify(data).length / tokenPerCharacter) + baseTokenCost);
+  return Math.floor(
+    JSON.stringify(data).length / tokenPerCharacter + baseTokenCost,
+  );
 }
 
 export function estimateTotalCost(tokenUsage: TokenUsage[]): number {
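Aside: the billing math here is easy to check by hand — the serialized result's length is divided by tokenPerCharacter (4; despite its name, this constant is characters per token) and a flat baseTokenCost (300) is added before flooring:

// Worked example of calculateFinalResultCost's formula, values from the diff.
const tokenPerCharacter = 4;
const baseTokenCost = 300;

// A result that serializes to 10,000 characters:
const chars = 10_000;
const cost = Math.floor(chars / tokenPerCharacter + baseTokenCost);
console.log(cost); // 2800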
File diff suppressed because it is too large
@@ -57,7 +57,7 @@ async function performRanking(
   const linksAndScores = await Promise.all(
     linksWithContext.map((linkWithContext, index) =>
       getEmbedding(linkWithContext)
-        .then(linkEmbedding => {
+        .then((linkEmbedding) => {
           const score = cosineSimilarity(queryEmbedding, linkEmbedding);
           return {
             link: links[index],
@@ -71,8 +71,8 @@ async function performRanking(
           linkWithContext,
           score: 0,
           originalIndex: index,
-        }))
-    )
+        })),
+    ),
   );
 
   // Sort links based on similarity scores while preserving original order for equal scores
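Aside: performRanking scores each link by embedding its URL-plus-context string and taking cosine similarity against the query embedding; the `score: 0` fallback object visible above keeps one failed embedding from sinking the whole Promise.all. A plain cosine-similarity sketch, assuming embeddings are number arrays (the real helper lives elsewhere in the codebase):

// Cosine similarity over number[] embeddings — a sketch, not the real helper.
function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  // 1 for identical directions, 0 for orthogonal vectors.
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}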
@@ -252,20 +252,19 @@ export class WebCrawler {
     };
 
     const timeoutPromise = new Promise((_, reject) => {
-      setTimeout(() => reject(new Error('Sitemap fetch timeout')), timeout);
+      setTimeout(() => reject(new Error("Sitemap fetch timeout")), timeout);
     });
 
     try {
-      let count = await Promise.race([
+      let count = (await Promise.race([
         Promise.all([
-          this.tryFetchSitemapLinks(
-            this.initialUrl,
-            _urlsHandler,
-          ),
-          ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
-        ]).then(results => results.reduce((a,x) => a+x, 0)),
-        timeoutPromise
-      ]) as number;
+          this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler),
+          ...this.robots
+            .getSitemaps()
+            .map((x) => this.tryFetchSitemapLinks(x, _urlsHandler)),
+        ]).then((results) => results.reduce((a, x) => a + x, 0)),
+        timeoutPromise,
+      ])) as number;
 
       if (count > 0) {
         if (
@@ -281,14 +280,14 @@ export class WebCrawler {
 
       return count;
     } catch (error) {
-      if (error.message === 'Sitemap fetch timeout') {
-        this.logger.warn('Sitemap fetch timed out', {
+      if (error.message === "Sitemap fetch timeout") {
+        this.logger.warn("Sitemap fetch timed out", {
           method: "tryGetSitemap",
           timeout,
         });
         return 0;
       }
-      this.logger.error('Error fetching sitemap', {
+      this.logger.error("Error fetching sitemap", {
         method: "tryGetSitemap",
         error,
       });
@@ -328,9 +327,16 @@ export class WebCrawler {
       !this.matchesExcludes(path) &&
       !this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
     ) {
-      (async() => {
-        await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
-        await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
+      (async () => {
+        await redisConnection.sadd(
+          "crawl:" + this.jobId + ":robots_blocked",
+          fullUrl,
+        );
+        await redisConnection.expire(
+          "crawl:" + this.jobId + ":robots_blocked",
+          24 * 60 * 60,
+          "NX",
+        );
       })();
     }
   } else {
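Aside: URLs blocked by robots.txt land in a per-crawl Redis set whose TTL is set once — the "NX" flag (Redis 7+) keeps a later sadd from resetting the 24-hour window. The same record-and-expire pattern in isolation, assuming an ioredis-style client:

import Redis from "ioredis";

const redis = new Redis(); // assumes a local Redis; the real code shares a connection

// Record a blocked URL and give the set a 24h TTL anchored to the first
// blocked URL ("NX" = only set the expiry if the key has none yet).
async function recordRobotsBlocked(jobId: string, url: string) {
  const key = "crawl:" + jobId + ":robots_blocked";
  await redis.sadd(key, url);
  await redis.expire(key, 24 * 60 * 60, "NX");
}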
@@ -1,5 +1,8 @@
 import { logger } from "../../lib/logger";
-import { normalizeUrl, normalizeUrlOnlyHostname } from "../../lib/canonical-url";
+import {
+  normalizeUrl,
+  normalizeUrlOnlyHostname,
+} from "../../lib/canonical-url";
 import { supabase_service } from "../../services/supabase";
 
 /**
@@ -28,13 +31,19 @@ async function querySitemapIndexFunction(url: string) {
       return { urls: [], lastUpdated: new Date(0) };
     }
 
-    const allUrls = [...new Set(data.map((entry) => entry.urls).flat().map(url => normalizeUrl(url)))];
+    const allUrls = [
+      ...new Set(
+        data
+          .map((entry) => entry.urls)
+          .flat()
+          .map((url) => normalizeUrl(url)),
+      ),
+    ];
     return { urls: allUrls, lastUpdated: data[0].updated_at };
 
   } catch (error) {
     logger.error("(sitemap-index) Error querying the index", {
       error,
-      attempt
+      attempt,
     });
 
     if (attempt === 3) {
@@ -46,4 +55,7 @@ async function querySitemapIndexFunction(url: string) {
   return { urls: [], lastUpdated: new Date(0) };
 }
 
-export const querySitemapIndex = withAuth(querySitemapIndexFunction, { urls: [], lastUpdated: new Date(0) });
+export const querySitemapIndex = withAuth(querySitemapIndexFunction, {
+  urls: [],
+  lastUpdated: new Date(0),
+});
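Aside: flattening every cached entry's URL list and passing it through normalizeUrl before the Set means trivially different spellings of one URL collapse to a single entry. A self-contained sketch with a hypothetical normalizer (the real one lives in lib/canonical-url):

// Hypothetical normalizer: drop the fragment and trailing slashes.
function normalizeUrl(raw: string): string {
  const u = new URL(raw); // URL() already lowercases the hostname
  u.hash = "";
  u.pathname = u.pathname.replace(/\/+$/, "") || "/";
  return u.toString();
}

const entries = [
  { urls: ["https://Example.com/a/", "https://example.com/b"] },
  { urls: ["https://example.com/a#top"] },
];

// Flatten, normalize, then dedupe via Set — mirrors the allUrls expression above.
const allUrls = [
  ...new Set(entries.map((e) => e.urls).flat().map((u) => normalizeUrl(u))),
];
console.log(allUrls); // ["https://example.com/a", "https://example.com/b"]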
@@ -30,12 +30,20 @@ export async function getLinksFromSitemap(
         { forceEngine: "fetch" },
       );
 
-      if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
+      if (
+        fetchResponse.success &&
+        fetchResponse.document.metadata.statusCode >= 200 &&
+        fetchResponse.document.metadata.statusCode < 300
+      ) {
         content = fetchResponse.document.rawHtml!;
       } else {
         logger.debug(
           "Failed to scrape sitemap via fetch, falling back to TLSClient...",
-          { error: fetchResponse.success ? fetchResponse.document : fetchResponse.error },
+          {
+            error: fetchResponse.success
+              ? fetchResponse.document
+              : fetchResponse.error,
+          },
         );
 
         const tlsResponse = await scrapeURL(
@@ -45,15 +53,24 @@ export async function getLinksFromSitemap(
           { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
         );
 
-        if (tlsResponse.success && (tlsResponse.document.metadata.statusCode >= 200 && tlsResponse.document.metadata.statusCode < 300)) {
+        if (
+          tlsResponse.success &&
+          tlsResponse.document.metadata.statusCode >= 200 &&
+          tlsResponse.document.metadata.statusCode < 300
+        ) {
           content = tlsResponse.document.rawHtml!;
         } else {
-          logger.error(`Request failed for ${sitemapUrl}, ran out of engines!`, {
+          logger.error(
+            `Request failed for ${sitemapUrl}, ran out of engines!`,
+            {
              method: "getLinksFromSitemap",
              mode,
              sitemapUrl,
-            error: tlsResponse.success ? tlsResponse.document : tlsResponse.error,
-          });
+              error: tlsResponse.success
+                ? tlsResponse.document
+                : tlsResponse.error,
+            },
+          );
           return 0;
         }
       }
@@ -65,14 +82,21 @@ export async function getLinksFromSitemap(
         { forceEngine: "fetch" },
       );
 
-      if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
+      if (
+        fetchResponse.success &&
+        fetchResponse.document.metadata.statusCode >= 200 &&
+        fetchResponse.document.metadata.statusCode < 300
+      ) {
         content = fetchResponse.document.rawHtml!;
       } else {
-        logger.error(`Request failed for ${sitemapUrl}, ran out of engines!`, {
+        logger.error(
+          `Request failed for ${sitemapUrl}, ran out of engines!`,
+          {
            method: "getLinksFromSitemap",
            mode,
            sitemapUrl,
-        });
+          },
+        );
         return 0;
       }
     }
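Aside: the sitemap fetcher tries the cheap fetch engine first and only falls back to the TLS client when the response is missing or non-2xx; both branches funnel into the same content variable. The control flow, reduced to a sketch with hypothetical engine functions standing in for scrapeURL with forceEngine "fetch" and "fire-engine;tlsclient":

type EngineResult =
  | { success: true; statusCode: number; rawHtml: string }
  | { success: false; error: unknown };

// Try each engine in order; only a 2xx with a body counts as success.
async function tryEngines(
  engines: Array<(url: string) => Promise<EngineResult>>,
  url: string,
): Promise<string | null> {
  for (const engine of engines) {
    const res = await engine(url);
    if (res.success && res.statusCode >= 200 && res.statusCode < 300) {
      return res.rawHtml;
    }
    // Anything else falls through to the next engine.
  }
  return null; // ran out of engines
}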
@@ -167,11 +191,18 @@ export const fetchSitemapData = async (
   const fetchResponse = await scrapeURL(
     "sitemap",
     sitemapUrl,
-    scrapeOptions.parse({ formats: ["rawHtml"], timeout: timeout || axiosTimeout }),
+    scrapeOptions.parse({
+      formats: ["rawHtml"],
+      timeout: timeout || axiosTimeout,
+    }),
     { forceEngine: "fetch" },
   );
 
-  if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
+  if (
+    fetchResponse.success &&
+    fetchResponse.document.metadata.statusCode >= 200 &&
+    fetchResponse.document.metadata.statusCode < 300
+  ) {
     const xml = fetchResponse.document.rawHtml!;
     const parsedXml = await parseStringPromise(xml);
 
@@ -17,7 +17,6 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
     throw new EngineError("Cache hit but HTML is too short to be useful");
   }
 
-
   // Set fromCache flag to indicate this document was retrieved from cache
   meta.internalOptions.fromCache = true;
 
@@ -3,7 +3,12 @@ import * as Sentry from "@sentry/node";
 import { z } from "zod";
 
 import { robustFetch } from "../../lib/fetch";
-import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
+import {
+  ActionError,
+  EngineError,
+  SiteError,
+  UnsupportedFileError,
+} from "../../error";
 import { MockState } from "../../lib/mock";
 
 const successSchema = z.object({
@@ -4,7 +4,11 @@ import * as Sentry from "@sentry/node";
 import { robustFetch } from "../../lib/fetch";
 import { MockState } from "../../lib/mock";
 
-export async function fireEngineDelete(logger: Logger, jobId: string, mock: MockState | null) {
+export async function fireEngineDelete(
+  logger: Logger,
+  jobId: string,
+  mock: MockState | null,
+) {
   const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
 
   await Sentry.startSpan(
@@ -143,7 +143,10 @@ async function buildMetaObject(
     logger,
     logs,
     featureFlags: buildFeatureFlags(url, options, internalOptions),
-    mock: options.useMock !== undefined ? await loadMock(options.useMock, _logger) : null,
+    mock:
+      options.useMock !== undefined
+        ? await loadMock(options.useMock, _logger)
+        : null,
   };
 }
 
@@ -34,7 +34,7 @@ export async function robustFetch<
   requestId = crypto.randomUUID(),
   tryCount = 1,
   tryCooldown,
-  mock
+  mock,
 }: RobustFetchParams<Schema>): Promise<Output> {
   const params = {
     url,
@@ -51,8 +51,8 @@ export async function robustFetch<
 
   let response: {
     status: number;
-    headers: Headers,
-    body: string,
+    headers: Headers;
+    body: string;
   };
 
   if (mock === null) {
@@ -123,16 +123,24 @@ export async function robustFetch<
       return null as Output;
     }
 
-    const makeRequestTypeId = (request: typeof mock["requests"][number]["options"]) => {
+    const makeRequestTypeId = (
+      request: (typeof mock)["requests"][number]["options"],
+    ) => {
       let out = request.url + ";" + request.method;
-      if (process.env.FIRE_ENGINE_BETA_URL && url.startsWith(process.env.FIRE_ENGINE_BETA_URL) && request.method === "POST") {
+      if (
+        process.env.FIRE_ENGINE_BETA_URL &&
+        url.startsWith(process.env.FIRE_ENGINE_BETA_URL) &&
+        request.method === "POST"
+      ) {
         out += "f-e;" + request.body?.engine + ";" + request.body?.url;
       }
       return out;
-    }
+    };
 
     const thisId = makeRequestTypeId(params);
-    const matchingMocks = mock.requests.filter(x => makeRequestTypeId(x.options) === thisId).sort((a,b) => a.time - b.time);
+    const matchingMocks = mock.requests
+      .filter((x) => makeRequestTypeId(x.options) === thisId)
+      .sort((a, b) => a.time - b.time);
     const nextI = mock.tracker[thisId] ?? 0;
     mock.tracker[thisId] = nextI + 1;
 
@@ -141,7 +149,7 @@ export async function robustFetch<
     }
 
     response = {
-      ...(matchingMocks[nextI].result),
+      ...matchingMocks[nextI].result,
       headers: new Headers(matchingMocks[nextI].result.headers),
     };
   }
@@ -180,12 +188,15 @@ export async function robustFetch<
   }
 
   if (mock === null) {
-    await saveMock({
+    await saveMock(
+      {
       ...params,
       logger: undefined,
       schema: undefined,
      headers: undefined,
-    }, response);
+      },
+      response,
+    );
   }
 
   let data: Output;
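Aside: robustFetch's replay path derives a "type id" per request (URL plus method, with engine and target URL appended for fire-engine POSTs), collects the recorded requests with that id in time order, and uses mock.tracker as a cursor so the n-th live call replays the n-th recording. The cursor idea in miniature (names here are illustrative, not the real API):

// Illustrative replay cursor: the n-th request of a given "type" gets
// the n-th recorded result for that type.
type Recorded = { time: number; typeId: string; result: unknown };

function makeReplayer(recordings: Recorded[]) {
  const tracker: Record<string, number> = {};
  return (typeId: string): unknown => {
    const matching = recordings
      .filter((r) => r.typeId === typeId)
      .sort((a, b) => a.time - b.time);
    const nextI = tracker[typeId] ?? 0;
    tracker[typeId] = nextI + 1;
    if (nextI >= matching.length) {
      throw new Error("No more recorded responses for " + typeId);
    }
    return matching[nextI].result;
  };
}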
@@ -14,31 +14,41 @@ export async function saveMock(options: unknown, result: unknown) {
   const filePath = path.join(saveMocksDirPath, fileName);
   console.log(filePath);
 
-  await fs.writeFile(filePath, JSON.stringify({
-    time: Date.now(),
-    options,
-    result,
-  }, undefined, 4));
+  await fs.writeFile(
+    filePath,
+    JSON.stringify(
+      {
+        time: Date.now(),
+        options,
+        result,
+      },
+      undefined,
+      4,
+    ),
+  );
 }
 
 export type MockState = {
   requests: {
-    time: number,
+    time: number;
     options: {
-      url: string,
-      method: string,
-      body?: any,
-      ignoreResponse: boolean,
-      ignoreFailure: boolean,
-      tryCount: number,
-      tryCooldown?: number,
-    },
-    result: any,
-  }[],
-  tracker: Record<string, number>,
-}
+      url: string;
+      method: string;
+      body?: any;
+      ignoreResponse: boolean;
+      ignoreFailure: boolean;
+      tryCount: number;
+      tryCooldown?: number;
+    };
+    result: any;
+  }[];
+  tracker: Record<string, number>;
+};
 
-export async function loadMock(name: string, logger: Logger = _logger): Promise<MockState | null> {
+export async function loadMock(
+  name: string,
+  logger: Logger = _logger,
+): Promise<MockState | null> {
   try {
     const mockPath = path.join(loadMocksDirPath, name + ".json");
 
@@ -54,7 +64,12 @@ export async function loadMock(name: string, logger: Logger = _logger): Promise<
       tracker: {},
     };
   } catch (error) {
-    logger.warn("Failed to load mock file!", { name, module: "scrapeURL:mock", method: "loadMock", error });
+    logger.warn("Failed to load mock file!", {
+      name,
+      module: "scrapeURL:mock",
+      method: "loadMock",
+      error,
+    });
    return null;
   }
 }
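Aside: saveMock and loadMock give scrapeURL a simple record/replay harness — live runs write {time, options, result} JSON, and test runs load it into a MockState whose tracker field drives the per-request-type cursor shown earlier. A hand-written MockState matching the type above, with illustrative values only:

import { MockState } from "../../lib/mock"; // path as used elsewhere in this diff

const mock: MockState = {
  requests: [
    {
      time: 1700000000000,
      options: {
        url: "https://example.com/api", // illustrative, not a recorded value
        method: "POST",
        body: { engine: "fetch", url: "https://example.com" },
        ignoreResponse: false,
        ignoreFailure: false,
        tryCount: 1,
      },
      result: { status: 200, headers: {}, body: '{"ok":true}' },
    },
  ],
  tracker: {}, // replay cursor, keyed by request type id
};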
@@ -119,16 +119,16 @@ export const htmlTransform = (
 
   // always return biggest image
   soup("img[srcset]").each((_, el) => {
-    const sizes = el.attribs.srcset.split(",").map(x => {
+    const sizes = el.attribs.srcset.split(",").map((x) => {
       const tok = x.trim().split(" ");
       return {
         url: tok[0],
         size: parseInt((tok[1] ?? "1x").slice(0, -1), 10),
-        isX: (tok[1] ?? "").endsWith("x")
+        isX: (tok[1] ?? "").endsWith("x"),
       };
     });
 
-    if (sizes.every(x => x.isX) && el.attribs.src) {
+    if (sizes.every((x) => x.isX) && el.attribs.src) {
       sizes.push({
         url: el.attribs.src,
         size: 1,
@@ -136,7 +136,7 @@ export const htmlTransform = (
       });
     }
 
-    sizes.sort((a,b) => b.size - a.size);
+    sizes.sort((a, b) => b.size - a.size);
 
     el.attribs.src = sizes[0]?.url;
   });
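Aside: the srcset handling above tokenizes each candidate into a URL plus a width ("100w") or density ("2x") descriptor, treats a bare src as a 1x fallback when every candidate is density-based, and promotes the largest candidate into src. Tracing it on a concrete attribute value:

// Tracing the srcset logic on a sample attribute value.
const srcset = "small.jpg 1x, big.jpg 2x";
const sizes = srcset.split(",").map((x) => {
  const tok = x.trim().split(" ");
  return {
    url: tok[0],
    size: parseInt((tok[1] ?? "1x").slice(0, -1), 10), // "2x" -> 2
    isX: (tok[1] ?? "").endsWith("x"),
  };
});
sizes.sort((a, b) => b.size - a.size);
console.log(sizes[0].url); // "big.jpg" — the biggest image wins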
@@ -41,7 +41,11 @@ export function deriveHTMLFromRawHTML(
     );
   }
 
-  document.html = htmlTransform(document.rawHtml, document.metadata.url ?? document.metadata.sourceURL ?? meta.url, meta.options);
+  document.html = htmlTransform(
+    document.rawHtml,
+    document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
+    meta.options,
+  );
   return document;
 }
 
@@ -1,7 +1,11 @@
 import OpenAI from "openai";
 import { encoding_for_model } from "@dqbd/tiktoken";
 import { TiktokenModel } from "@dqbd/tiktoken";
-import { Document, ExtractOptions, TokenUsage } from "../../../controllers/v1/types";
+import {
+  Document,
+  ExtractOptions,
+  TokenUsage,
+} from "../../../controllers/v1/types";
 import { Logger } from "winston";
 import { EngineResultsTracker, Meta } from "..";
 import { logger } from "../../../lib/logger";
@@ -72,14 +76,20 @@ export async function generateOpenAICompletions(
   markdown?: string,
   previousWarning?: string,
   isExtractEndpoint?: boolean,
-  model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini",
-): Promise<{ extract: any; numTokens: number; warning: string | undefined; totalUsage: TokenUsage, model: string }> {
+  model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ??
+    "gpt-4o-mini",
+): Promise<{
+  extract: any;
+  numTokens: number;
+  warning: string | undefined;
+  totalUsage: TokenUsage;
+  model: string;
+}> {
   let extract: any;
   let warning: string | undefined;
 
   const openai = new OpenAI();
 
 
   if (markdown === undefined) {
     throw new Error("document.markdown is undefined -- this is unexpected");
   }
@@ -208,8 +218,8 @@ export async function generateOpenAICompletions(
     }
   }
 
-  const promptTokens = (jsonCompletion.usage?.prompt_tokens ?? 0);
-  const completionTokens = (jsonCompletion.usage?.completion_tokens ?? 0);
+  const promptTokens = jsonCompletion.usage?.prompt_tokens ?? 0;
+  const completionTokens = jsonCompletion.usage?.completion_tokens ?? 0;
 
   // If the users actually wants the items object, they can specify it as 'required' in the schema
   // otherwise, we just return the items array
@@ -222,7 +232,17 @@ export async function generateOpenAICompletions(
   }
   // num tokens (just user prompt tokenized) | deprecated
   // totalTokens = promptTokens + completionTokens
-  return { extract, warning, numTokens, totalUsage: { promptTokens, completionTokens, totalTokens: promptTokens + completionTokens }, model };
+  return {
+    extract,
+    warning,
+    numTokens,
+    totalUsage: {
+      promptTokens,
+      completionTokens,
+      totalTokens: promptTokens + completionTokens,
+    },
+    model,
+  };
 }
 
 export async function performLLMExtract(
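Aside: each completion's usage counters are folded into a TokenUsage record where totalTokens is just the sum of prompt and completion tokens, so callers can aggregate per-call records. A sketch of that aggregation, with TokenUsage's shape inferred from this diff:

// TokenUsage as this diff implies it; summing usage across several LLM calls.
type TokenUsage = {
  promptTokens: number;
  completionTokens: number;
  totalTokens: number;
};

function sumUsage(calls: TokenUsage[]): TokenUsage {
  return calls.reduce(
    (acc, u) => ({
      promptTokens: acc.promptTokens + u.promptTokens,
      completionTokens: acc.completionTokens + u.completionTokens,
      totalTokens: acc.totalTokens + u.totalTokens,
    }),
    { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
  );
}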
@@ -32,7 +32,7 @@ export async function autoCharge(
   const resource = `auto-recharge:${chunk.team_id}`;
   const cooldownKey = `auto-recharge-cooldown:${chunk.team_id}`;
 
-  if(chunk.team_id === "285bb597-6eaf-4b96-801c-51461fc3c543"){
+  if (chunk.team_id === "285bb597-6eaf-4b96-801c-51461fc3c543") {
     return {
       success: false,
       message: "Auto-recharge failed",
@@ -107,15 +107,15 @@ async function processBatch() {
       // Keep most recent entry and mark others for deletion
       const [mostRecent, ...duplicates] = existingForOrigin;
       if (duplicates.length > 0) {
-        duplicatesToDelete.push(...duplicates.map(d => d.id));
+        duplicatesToDelete.push(...duplicates.map((d) => d.id));
       }
 
       // Merge and deduplicate URLs
       const mergedUrls = [
         ...new Set([
           ...mostRecent.urls,
-          ...op.standardizedUrls.map(url => normalizeUrl(url))
-        ])
+          ...op.standardizedUrls.map((url) => normalizeUrl(url)),
+        ]),
       ];
 
       updates.push({
@@ -127,7 +127,9 @@ async function processBatch() {
       });
     } else {
       // Prepare insert with deduplicated URLs
-      const deduplicatedUrls = [...new Set(op.standardizedUrls.map(url => normalizeUrl(url)))];
+      const deduplicatedUrls = [
+        ...new Set(op.standardizedUrls.map((url) => normalizeUrl(url))),
+      ];
       inserts.push({
         origin_url: op.originUrl,
         urls: deduplicatedUrls,
@@ -140,7 +142,9 @@ async function processBatch() {
 
   // Delete duplicate entries
   if (duplicatesToDelete.length > 0) {
-    logger.info(`🗑️ Deleting ${duplicatesToDelete.length} duplicate crawl maps in batches of 100`);
+    logger.info(
+      `🗑️ Deleting ${duplicatesToDelete.length} duplicate crawl maps in batches of 100`,
+    );
 
     // Delete in batches of 100
     for (let i = 0; i < duplicatesToDelete.length; i += 100) {
@@ -151,11 +155,14 @@ async function processBatch() {
         .in("id", batch);
 
       if (deleteError) {
-        logger.error(`Failed to delete batch ${i/100 + 1} of duplicate crawl maps`, {
-          error: deleteError,
-          batchSize: batch.length,
-          startIndex: i
-        });
+        logger.error(
+          `Failed to delete batch ${i / 100 + 1} of duplicate crawl maps`,
+          {
+            error: deleteError,
+            batchSize: batch.length,
+            startIndex: i,
+          },
+        );
       }
     }
   }
@@ -175,7 +182,7 @@ async function processBatch() {
     if (updateError) {
       logger.error("Failed to update crawl map", {
         error: updateError,
-        origin: update.origin_url
+        origin: update.origin_url,
       });
     }
   }
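Aside: deleting duplicate crawl maps in slices of 100 keeps each Supabase .in() list bounded, and the slice index doubles as a stable batch number (i / 100 + 1) for the error log. The batching loop by itself, with a hypothetical deleteByIds in place of the Supabase client:

// Chunked deletion: bounded id lists, one error log per failed slice.
// `deleteByIds` is a hypothetical stand-in for the Supabase delete call.
async function deleteInBatches(
  ids: string[],
  deleteByIds: (batch: string[]) => Promise<void>,
  batchSize = 100,
) {
  for (let i = 0; i < ids.length; i += batchSize) {
    const batch = ids.slice(i, i + batchSize);
    try {
      await deleteByIds(batch);
    } catch (error) {
      console.error(`Failed to delete batch ${i / batchSize + 1}`, {
        error,
        batchSize: batch.length,
        startIndex: i,
      });
    }
  }
}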
@@ -3,18 +3,27 @@ import "../sentry";
 import * as Sentry from "@sentry/node";
 import { Job, Queue, Worker } from "bullmq";
 import { logger as _logger, logger } from "../../lib/logger";
-import { redisConnection, indexQueueName, getIndexQueue } from "../queue-service";
+import {
+  redisConnection,
+  indexQueueName,
+  getIndexQueue,
+} from "../queue-service";
 import { saveCrawlMap } from "./crawl-maps-index";
 import systemMonitor from "../system-monitor";
 import { v4 as uuidv4 } from "uuid";
 
 const workerLockDuration = Number(process.env.WORKER_LOCK_DURATION) || 60000;
-const workerStalledCheckInterval = Number(process.env.WORKER_STALLED_CHECK_INTERVAL) || 30000;
-const jobLockExtendInterval = Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000;
-const jobLockExtensionTime = Number(process.env.JOB_LOCK_EXTENSION_TIME) || 60000;
+const workerStalledCheckInterval =
+  Number(process.env.WORKER_STALLED_CHECK_INTERVAL) || 30000;
+const jobLockExtendInterval =
+  Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000;
+const jobLockExtensionTime =
+  Number(process.env.JOB_LOCK_EXTENSION_TIME) || 60000;
 
-const cantAcceptConnectionInterval = Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000;
-const connectionMonitorInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10;
+const cantAcceptConnectionInterval =
+  Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000;
+const connectionMonitorInterval =
+  Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10;
 const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
 
 const runningJobs: Set<string> = new Set();
@@ -100,7 +109,9 @@ const workerFun = async (queue: Queue) => {
         });
       }
 
-      await new Promise(resolve => setTimeout(resolve, cantAcceptConnectionInterval));
+      await new Promise((resolve) =>
+        setTimeout(resolve, cantAcceptConnectionInterval),
+      );
       continue;
     } else {
       cantAcceptConnectionCount = 0;
@@ -141,15 +152,17 @@ const workerFun = async (queue: Queue) => {
         runningJobs.delete(job.id);
       }
 
-      await new Promise(resolve => setTimeout(resolve, gotJobInterval));
+      await new Promise((resolve) => setTimeout(resolve, gotJobInterval));
     } else {
-      await new Promise(resolve => setTimeout(resolve, connectionMonitorInterval));
+      await new Promise((resolve) =>
+        setTimeout(resolve, connectionMonitorInterval),
+      );
     }
   }
 
   logger.info("Worker loop ended. Waiting for running jobs to finish...");
   while (runningJobs.size > 0) {
-    await new Promise(resolve => setTimeout(resolve, 500));
+    await new Promise((resolve) => setTimeout(resolve, 500));
   }
   logger.info("All jobs finished. Worker exiting!");
   process.exit(0);
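Aside: two things worth noting in this worker file. First, the Number(process.env.X) || default idiom treats both an unparsable value (NaN) and an explicit 0 as unset, falling back to the default. Second, every pause in the loop re-implements the same promisified setTimeout; a small sleep helper expresses the intent once — a sketch (the codebase itself keeps the inline form):

// Equivalent to the inline `await new Promise((resolve) => setTimeout(...))` calls.
const sleep = (ms: number) =>
  new Promise<void>((resolve) => setTimeout(resolve, ms));

async function backoffExample(cantAcceptConnectionInterval: number) {
  // e.g. backing off when the machine cannot accept another connection:
  await sleep(cantAcceptConnectionInterval);
}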
@@ -93,7 +93,9 @@ const runningJobs: Set<string> = new Set();
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
     (async () => {
-      const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
+      const originUrl = sc.originUrl
+        ? normalizeUrlOnlyHostname(sc.originUrl)
+        : undefined;
       // Get all visited unique URLs from Redis
       const visitedUrls = await redisConnection.smembers(
         "crawl:" + job.data.crawl_id + ":visited_unique",
@@ -113,7 +115,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
         },
         {
           priority: 10,
-        }
+        },
       );
     }
   })();
@@ -319,7 +321,10 @@ const processExtractJobInternal = async (
   await job.moveToCompleted(result, token, false);
   await updateExtract(job.data.extractId, {
     status: "failed",
-    error: result.error ?? "Unknown error, please contact help@firecrawl.com. Extract id: " + job.data.extractId,
+    error:
+      result.error ??
+      "Unknown error, please contact help@firecrawl.com. Extract id: " +
+        job.data.extractId,
   });
 
   return result;
@@ -348,7 +353,14 @@ const processExtractJobInternal = async (
         "Unknown error, please contact help@firecrawl.com. Extract id: " +
           job.data.extractId,
     });
-    return { success: false, error: error.error ?? error ?? "Unknown error, please contact help@firecrawl.com. Extract id: " + job.data.extractId };
+    return {
+      success: false,
+      error:
+        error.error ??
+        error ??
+        "Unknown error, please contact help@firecrawl.com. Extract id: " +
+          job.data.extractId,
+    };
     // throw error;
   } finally {
     clearInterval(extendLockInterval);
@@ -949,13 +961,15 @@ async function processJob(job: Job & { id: string }, token: string) {
     }
 
     if (job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID!) {
-      billTeam(job.data.team_id, undefined, creditsToBeBilled, logger).catch((error) => {
-        logger.error(
-          `Failed to bill team ${job.data.team_id} for ${creditsToBeBilled} credits`,
-          { error },
-        );
-        // Optionally, you could notify an admin or add to a retry queue here
-      });
+      billTeam(job.data.team_id, undefined, creditsToBeBilled, logger).catch(
+        (error) => {
+          logger.error(
+            `Failed to bill team ${job.data.team_id} for ${creditsToBeBilled} credits`,
+            { error },
+          );
+          // Optionally, you could notify an admin or add to a retry queue here
+        },
+      );
     }
   }
 
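Aside: billTeam is deliberately not awaited — billing failures are logged and swallowed so they cannot fail the scrape job itself, and the .catch keeps the rejection from becoming an unhandled-rejection crash. A sketch of that fire-and-forget shape, reusing names from the hunk above:

// Generic fire-and-forget wrapper matching the billTeam call above:
// log the failure, never let it propagate into the job's control flow.
function fireAndForget(task: Promise<unknown>, what: string) {
  task.catch((error) => {
    logger.error(`Failed to ${what}`, { error });
  });
}

// Usage mirroring the hunk:
fireAndForget(
  billTeam(job.data.team_id, undefined, creditsToBeBilled, logger),
  `bill team ${job.data.team_id} for ${creditsToBeBilled} credits`,
);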
@@ -978,7 +992,8 @@ async function processJob(job: Job & { id: string }, token: string) {
     const isEarlyTimeout =
       error instanceof Error && error.message === "timeout";
     const isCancelled =
-      error instanceof Error && error.message === "Parent crawl/batch scrape was cancelled";
+      error instanceof Error &&
+      error.message === "Parent crawl/batch scrape was cancelled";
 
     if (isEarlyTimeout) {
       logger.error(`🐂 Job timed out ${job.id}`);