mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 07:39:00 +08:00
Nick: formatting done
This commit is contained in:
parent
994e1eb502
commit
498558d358
@ -1,8 +1,6 @@
|
||||
import request from "supertest";
|
||||
import { configDotenv } from "dotenv";
|
||||
import {
|
||||
ScrapeRequestInput,
|
||||
} from "../../controllers/v1/types";
|
||||
import { ScrapeRequestInput } from "../../controllers/v1/types";
|
||||
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
|
||||
|
||||
configDotenv();
|
||||
@ -19,8 +17,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
|
||||
describe("GET /is-production", () => {
|
||||
it.concurrent("should return the production status", async () => {
|
||||
const response: any =
|
||||
await request(TEST_URL).get("/is-production");
|
||||
const response: any = await request(TEST_URL).get("/is-production");
|
||||
|
||||
console.log(
|
||||
"process.env.USE_DB_AUTHENTICATION",
|
||||
@ -274,12 +271,11 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
url: "https://www.scrapethissite.com/",
|
||||
onlyMainContent: false, // default is true
|
||||
};
|
||||
const responseWithoutRemoveTags: any =
|
||||
await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
const responseWithoutRemoveTags: any = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
expect(responseWithoutRemoveTags.statusCode).toBe(200);
|
||||
expect(responseWithoutRemoveTags.body).toHaveProperty("data");
|
||||
|
||||
|
@ -1,8 +1,6 @@
|
||||
import request from "supertest";
|
||||
import { configDotenv } from "dotenv";
|
||||
import {
|
||||
ScrapeRequest,
|
||||
} from "../../controllers/v1/types";
|
||||
import { ScrapeRequest } from "../../controllers/v1/types";
|
||||
|
||||
configDotenv();
|
||||
const FIRECRAWL_API_URL = "http://127.0.0.1:3002";
|
||||
@ -12,9 +10,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
it.concurrent(
|
||||
"should return a successful response for a scrape with 403 page",
|
||||
async () => {
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -39,9 +35,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
url: E2E_TEST_SERVER_URL,
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -86,9 +80,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
formats: ["html"],
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -121,9 +113,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
formats: ["rawHtml"],
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -159,9 +149,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
headers: { "e2e-header-test": "firecrawl" },
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -188,9 +176,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
includeTags: ["#content-1"],
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -220,9 +206,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
excludeTags: ["#content-1"],
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -253,9 +237,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
onlyMainContent: false,
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -285,9 +267,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
timeout: 500,
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -312,9 +292,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
mobile: true,
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -335,9 +313,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
it.concurrent(
|
||||
"should handle 'parsePDF' parameter correctly",
|
||||
async () => {
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -357,9 +333,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
"h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm",
|
||||
);
|
||||
|
||||
const responseNoParsePDF: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const responseNoParsePDF: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -410,9 +384,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
timeout: 120000,
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -432,12 +404,13 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
timeout: 120000,
|
||||
} as ScrapeRequest;
|
||||
|
||||
const responseWithSkipTlsVerification: any =
|
||||
await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequestWithSkipTlsVerification);
|
||||
const responseWithSkipTlsVerification: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequestWithSkipTlsVerification);
|
||||
|
||||
console.log("Error1b");
|
||||
// console.log(responseWithSkipTlsVerification.body)
|
||||
@ -461,9 +434,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
removeBase64Images: true,
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -493,9 +464,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
],
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -526,9 +495,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
],
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -569,9 +536,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
],
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -619,9 +584,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
],
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -657,9 +620,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
],
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -692,9 +653,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
],
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
@ -731,9 +690,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
||||
],
|
||||
} as ScrapeRequest;
|
||||
|
||||
const response: any = await request(
|
||||
FIRECRAWL_API_URL,
|
||||
)
|
||||
const response: any = await request(FIRECRAWL_API_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
|
@ -23,8 +23,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
||||
|
||||
describe("POST /v0/scrape", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: any =
|
||||
await request(TEST_URL).post("/v0/scrape");
|
||||
const response: any = await request(TEST_URL).post("/v0/scrape");
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
@ -159,12 +158,11 @@ describe("E2E Tests for v0 API Routes", () => {
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key with removeTags option",
|
||||
async () => {
|
||||
const responseWithoutRemoveTags: any =
|
||||
await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://www.scrapethissite.com/" });
|
||||
const responseWithoutRemoveTags: any = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://www.scrapethissite.com/" });
|
||||
expect(responseWithoutRemoveTags.statusCode).toBe(200);
|
||||
expect(responseWithoutRemoveTags.body).toHaveProperty("data");
|
||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
|
||||
@ -332,8 +330,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
||||
|
||||
describe("POST /v0/crawl", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: any =
|
||||
await request(TEST_URL).post("/v0/crawl");
|
||||
const response: any = await request(TEST_URL).post("/v0/crawl");
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
@ -461,9 +458,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse: any = await request(
|
||||
TEST_URL,
|
||||
)
|
||||
const completedResponse: any = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
@ -509,9 +504,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
const completedResponse: any = await request(
|
||||
TEST_URL,
|
||||
)
|
||||
const completedResponse: any = await request(TEST_URL)
|
||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
|
@ -6,31 +6,33 @@ configDotenv();
|
||||
const TEST_URL = "http://127.0.0.1:3002";
|
||||
|
||||
async function scrape(body: ScrapeRequestInput) {
|
||||
return await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(body);
|
||||
return await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(body);
|
||||
}
|
||||
|
||||
function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body.success).toBe(true);
|
||||
expect(typeof response.body.data).toBe("object");
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body.success).toBe(true);
|
||||
expect(typeof response.body.data).toBe("object");
|
||||
}
|
||||
|
||||
describe("Scrape tests", () => {
|
||||
it("mocking works properly", async () => {
|
||||
// depends on falsified mock mocking-works-properly
|
||||
// this test will fail if mock is bypassed with real data -- firecrawl.dev will never have
|
||||
// that as its actual markdown output
|
||||
it("mocking works properly", async () => {
|
||||
// depends on falsified mock mocking-works-properly
|
||||
// this test will fail if mock is bypassed with real data -- firecrawl.dev will never have
|
||||
// that as its actual markdown output
|
||||
|
||||
const response = await scrape({
|
||||
url: "http://firecrawl.dev",
|
||||
useMock: "mocking-works-properly",
|
||||
});
|
||||
|
||||
expectScrapeToSucceed(response);
|
||||
expect(response.body.data.markdown).toBe("this is fake data coming from the mocking system!");
|
||||
const response = await scrape({
|
||||
url: "http://firecrawl.dev",
|
||||
useMock: "mocking-works-properly",
|
||||
});
|
||||
});
|
||||
|
||||
expectScrapeToSucceed(response);
|
||||
expect(response.body.data.markdown).toBe(
|
||||
"this is fake data coming from the mocking system!",
|
||||
);
|
||||
});
|
||||
});
|
||||
|
@ -4,9 +4,11 @@ const fs = require("fs");
|
||||
const mocksDirPath = path.join(__dirname, "../../../scraper/scrapeURL/mocks");
|
||||
const files = fs.readdirSync(mocksDirPath);
|
||||
|
||||
const contents = files.map(x => JSON.parse(fs.readFileSync(path.join(mocksDirPath, x), "utf8")));
|
||||
const contents = files.map((x) =>
|
||||
JSON.parse(fs.readFileSync(path.join(mocksDirPath, x), "utf8")),
|
||||
);
|
||||
|
||||
fs.writeFileSync(
|
||||
path.join(__dirname, "../mocks/" + process.argv[2] + ".json"),
|
||||
JSON.stringify(contents, undefined, 4),
|
||||
);
|
||||
path.join(__dirname, "../mocks/" + process.argv[2] + ".json"),
|
||||
JSON.stringify(contents, undefined, 4),
|
||||
);
|
||||
|
@ -105,7 +105,6 @@ export async function getACUC(
|
||||
{ get: true },
|
||||
));
|
||||
|
||||
|
||||
if (!error) {
|
||||
break;
|
||||
}
|
||||
@ -146,7 +145,7 @@ export async function clearACUC(api_key: string): Promise<void> {
|
||||
modes.map(async (mode) => {
|
||||
const cacheKey = `acuc_${api_key}_${mode}`;
|
||||
await deleteKey(cacheKey);
|
||||
})
|
||||
}),
|
||||
);
|
||||
|
||||
// Also clear the base cache key
|
||||
@ -232,7 +231,6 @@ export async function supaAuthenticateUser(
|
||||
teamId = chunk.team_id;
|
||||
priceId = chunk.price_id;
|
||||
|
||||
|
||||
plan = getPlanByPriceId(priceId);
|
||||
subscriptionData = {
|
||||
team_id: teamId,
|
||||
|
@ -16,7 +16,7 @@ export async function checkFireEngine(req: Request, res: Response) {
|
||||
const timeout = setTimeout(() => controller.abort(), 30000);
|
||||
|
||||
const urls = ["https://roastmywebsite.ai", "https://example.com"];
|
||||
let lastError : string | null = null;
|
||||
let lastError: string | null = null;
|
||||
|
||||
for (const url of urls) {
|
||||
try {
|
||||
@ -62,7 +62,6 @@ export async function checkFireEngine(req: Request, res: Response) {
|
||||
success: false,
|
||||
error: "Internal server error - all retry attempts failed",
|
||||
});
|
||||
|
||||
} catch (error) {
|
||||
logger.error(error);
|
||||
Sentry.captureException(error);
|
||||
|
@ -227,7 +227,7 @@ export async function crawlController(req: Request, res: Response) {
|
||||
await addScrapeJob(job.data as any, {}, job.opts.jobId);
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
if (sitemap === 0) {
|
||||
await lockURL(id, sc, url);
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
import { Response } from "express";
|
||||
import {
|
||||
CrawlErrorsResponse,
|
||||
CrawlErrorsResponse,
|
||||
CrawlStatusParams,
|
||||
CrawlStatusResponse,
|
||||
ErrorResponse,
|
||||
@ -62,20 +62,23 @@ export async function crawlErrorsController(
|
||||
const failedJobIDs: string[] = [];
|
||||
|
||||
for (const [id, status] of jobStatuses) {
|
||||
if (
|
||||
status === "failed"
|
||||
) {
|
||||
if (status === "failed") {
|
||||
failedJobIDs.push(id);
|
||||
}
|
||||
}
|
||||
|
||||
res.status(200).json({
|
||||
errors: (await getJobs(failedJobIDs)).map(x => ({
|
||||
id: x.id,
|
||||
timestamp: x.finishedOn !== undefined ? (new Date(x.finishedOn).toISOString()) : undefined,
|
||||
url: x.data.url,
|
||||
error: x.failedReason,
|
||||
errors: (await getJobs(failedJobIDs)).map((x) => ({
|
||||
id: x.id,
|
||||
timestamp:
|
||||
x.finishedOn !== undefined
|
||||
? new Date(x.finishedOn).toISOString()
|
||||
: undefined,
|
||||
url: x.data.url,
|
||||
error: x.failedReason,
|
||||
})),
|
||||
robotsBlocked: await redisConnection.smembers("crawl:" + req.params.jobId + ":robots_blocked"),
|
||||
robotsBlocked: await redisConnection.smembers(
|
||||
"crawl:" + req.params.jobId + ":robots_blocked",
|
||||
),
|
||||
});
|
||||
}
|
||||
|
@ -116,7 +116,10 @@ export async function crawlStatusController(
|
||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
|
||||
sc.cancelled
|
||||
? "cancelled"
|
||||
: validJobStatuses.every((x) => x[1] === "completed") && (sc.crawlerOptions ? await isCrawlKickoffFinished(req.params.jobId) : true)
|
||||
: validJobStatuses.every((x) => x[1] === "completed") &&
|
||||
(sc.crawlerOptions
|
||||
? await isCrawlKickoffFinished(req.params.jobId)
|
||||
: true)
|
||||
? "completed"
|
||||
: "scraping";
|
||||
|
||||
|
@ -101,7 +101,7 @@ export async function getMapResults({
|
||||
},
|
||||
true,
|
||||
true,
|
||||
30000
|
||||
30000,
|
||||
);
|
||||
if (sitemap > 0) {
|
||||
links = links
|
||||
@ -164,20 +164,24 @@ export async function getMapResults({
|
||||
const twoDaysAgo = new Date();
|
||||
twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
|
||||
|
||||
|
||||
// If sitemap is not ignored and either we have few URLs (<100) or the data is stale (>2 days old), fetch fresh sitemap
|
||||
if (
|
||||
!ignoreSitemap &&
|
||||
!ignoreSitemap &&
|
||||
(sitemapIndexResult.urls.length < 100 ||
|
||||
new Date(sitemapIndexResult.lastUpdated) < twoDaysAgo)
|
||||
new Date(sitemapIndexResult.lastUpdated) < twoDaysAgo)
|
||||
) {
|
||||
try {
|
||||
await crawler.tryGetSitemap(urls => {
|
||||
links.push(...urls);
|
||||
}, true, false, 30000);
|
||||
await crawler.tryGetSitemap(
|
||||
(urls) => {
|
||||
links.push(...urls);
|
||||
},
|
||||
true,
|
||||
false,
|
||||
30000,
|
||||
);
|
||||
} catch (e) {
|
||||
logger.warn("tryGetSitemap threw an error", { error: e });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!cachedResult) {
|
||||
@ -253,7 +257,7 @@ export async function getMapResults({
|
||||
},
|
||||
{
|
||||
priority: 10,
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
return {
|
||||
|
@ -33,7 +33,6 @@ export async function scrapeController(
|
||||
basePriority: 10,
|
||||
});
|
||||
|
||||
|
||||
await addScrapeJob(
|
||||
{
|
||||
url: req.body.url,
|
||||
@ -97,7 +96,7 @@ export async function scrapeController(
|
||||
// Don't bill if we're early returning
|
||||
return;
|
||||
}
|
||||
if (req.body.extract && req.body.formats.includes("extract") ) {
|
||||
if (req.body.extract && req.body.formats.includes("extract")) {
|
||||
creditsToBeBilled = 5;
|
||||
}
|
||||
|
||||
|
@ -125,7 +125,7 @@ export const scrapeOptions = z
|
||||
"screenshot",
|
||||
"screenshot@fullPage",
|
||||
"extract",
|
||||
"json"
|
||||
"json",
|
||||
])
|
||||
.array()
|
||||
.optional()
|
||||
@ -233,7 +233,7 @@ export const extractV1Options = z
|
||||
.strict(strictMessage)
|
||||
.transform((obj) => ({
|
||||
...obj,
|
||||
allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch
|
||||
allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
|
||||
}));
|
||||
|
||||
export type ExtractV1Options = z.infer<typeof extractV1Options>;
|
||||
@ -268,11 +268,17 @@ export const scrapeRequestSchema = scrapeOptions
|
||||
)
|
||||
.transform((obj) => {
|
||||
// Handle timeout
|
||||
if ((obj.formats?.includes("extract") || obj.extract || obj.formats?.includes("json") || obj.jsonOptions) && !obj.timeout) {
|
||||
if (
|
||||
(obj.formats?.includes("extract") ||
|
||||
obj.extract ||
|
||||
obj.formats?.includes("json") ||
|
||||
obj.jsonOptions) &&
|
||||
!obj.timeout
|
||||
) {
|
||||
obj = { ...obj, timeout: 60000 };
|
||||
}
|
||||
|
||||
if(obj.formats?.includes("json")) {
|
||||
if (obj.formats?.includes("json")) {
|
||||
obj.formats.push("extract");
|
||||
}
|
||||
|
||||
@ -284,8 +290,8 @@ export const scrapeRequestSchema = scrapeOptions
|
||||
prompt: obj.jsonOptions.prompt,
|
||||
systemPrompt: obj.jsonOptions.systemPrompt,
|
||||
schema: obj.jsonOptions.schema,
|
||||
mode: "llm"
|
||||
}
|
||||
mode: "llm",
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
@ -602,15 +608,14 @@ export type CrawlStatusResponse =
|
||||
data: Document[];
|
||||
};
|
||||
|
||||
|
||||
export type CrawlErrorsResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
errors: {
|
||||
id: string,
|
||||
timestamp?: string,
|
||||
url: string,
|
||||
error: string,
|
||||
id: string;
|
||||
timestamp?: string;
|
||||
url: string;
|
||||
error: string;
|
||||
}[];
|
||||
robotsBlocked: string[];
|
||||
};
|
||||
@ -888,7 +893,6 @@ export type SearchResponse =
|
||||
data: Document[];
|
||||
};
|
||||
|
||||
|
||||
export type TokenUsage = {
|
||||
promptTokens: number;
|
||||
completionTokens: number;
|
||||
|
@ -4,7 +4,11 @@ import * as Sentry from "@sentry/node";
|
||||
import express, { NextFunction, Request, Response } from "express";
|
||||
import bodyParser from "body-parser";
|
||||
import cors from "cors";
|
||||
import { getExtractQueue, getScrapeQueue, getIndexQueue } from "./services/queue-service";
|
||||
import {
|
||||
getExtractQueue,
|
||||
getScrapeQueue,
|
||||
getIndexQueue,
|
||||
} from "./services/queue-service";
|
||||
import { v0Router } from "./routes/v0";
|
||||
import os from "os";
|
||||
import { logger } from "./lib/logger";
|
||||
|
@ -3,101 +3,101 @@ import { deduplicateObjectsArray } from "../extract/helpers/deduplicate-objs-arr
|
||||
describe("deduplicateObjectsArray", () => {
|
||||
it("should deduplicate the array", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "James D. Schull",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": null,
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "James D. Schull",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": null,
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "James D. Schull",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": null,
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const expected = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "James D. Schull",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": null,
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const result = await deduplicateObjectsArray(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
});
|
||||
|
||||
it("should not deduplicate if not necessary", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "James D. Schull",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": null,
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "John Doe",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "John Doe",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": null,
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const result = await deduplicateObjectsArray(objArray);
|
||||
|
||||
expect(result).toEqual(objArray);
|
||||
})
|
||||
});
|
||||
|
||||
it("should handle an empty array", async () => {
|
||||
const objArray = { "lawyers": [] };
|
||||
const objArray = { lawyers: [] };
|
||||
|
||||
const expected = { "lawyers": [] };
|
||||
const expected = { lawyers: [] };
|
||||
|
||||
const result = await deduplicateObjectsArray(objArray);
|
||||
|
||||
@ -106,35 +106,35 @@ describe("deduplicateObjectsArray", () => {
|
||||
|
||||
it("should handle objects with different properties", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": "james@example.com",
|
||||
"title": "Personal Injury Attorney"
|
||||
name: "James D. Schull",
|
||||
email: "james@example.com",
|
||||
title: "Personal Injury Attorney",
|
||||
},
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": "james@example.com",
|
||||
"title": "Personal Injury Attorney",
|
||||
"phone-number": "123-456-7890"
|
||||
}
|
||||
]
|
||||
name: "James D. Schull",
|
||||
email: "james@example.com",
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "123-456-7890",
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const expected = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": "james@example.com",
|
||||
"title": "Personal Injury Attorney"
|
||||
name: "James D. Schull",
|
||||
email: "james@example.com",
|
||||
title: "Personal Injury Attorney",
|
||||
},
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": "james@example.com",
|
||||
"title": "Personal Injury Attorney",
|
||||
"phone-number": "123-456-7890"
|
||||
}
|
||||
]
|
||||
name: "James D. Schull",
|
||||
email: "james@example.com",
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "123-456-7890",
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const result = await deduplicateObjectsArray(objArray);
|
||||
@ -144,33 +144,33 @@ describe("deduplicateObjectsArray", () => {
|
||||
|
||||
it("should handle objects with same properties but different values", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": "james1@example.com",
|
||||
"title": "Personal Injury Attorney"
|
||||
name: "James D. Schull",
|
||||
email: "james1@example.com",
|
||||
title: "Personal Injury Attorney",
|
||||
},
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": "james2@example.com",
|
||||
"title": "Personal Injury Attorney"
|
||||
}
|
||||
]
|
||||
name: "James D. Schull",
|
||||
email: "james2@example.com",
|
||||
title: "Personal Injury Attorney",
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const expected = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": "james1@example.com",
|
||||
"title": "Personal Injury Attorney"
|
||||
name: "James D. Schull",
|
||||
email: "james1@example.com",
|
||||
title: "Personal Injury Attorney",
|
||||
},
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": "james2@example.com",
|
||||
"title": "Personal Injury Attorney"
|
||||
}
|
||||
]
|
||||
name: "James D. Schull",
|
||||
email: "james2@example.com",
|
||||
title: "Personal Injury Attorney",
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const result = await deduplicateObjectsArray(objArray);
|
||||
@ -180,47 +180,47 @@ describe("deduplicateObjectsArray", () => {
|
||||
|
||||
it("should handle nested identical objects", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "James D. Schull",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "James D. Schull",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const expected = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "James D. Schull",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "James D. Schull",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const result = await deduplicateObjectsArray(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
});
|
||||
})
|
||||
});
|
||||
|
@ -3,292 +3,292 @@ import { mergeNullValObjs } from "../extract/helpers/merge-null-val-objs";
|
||||
describe("mergeNullValObjs", () => {
|
||||
it("should merge the objects with null values", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "Frank Giunta",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "Frank Giunta",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Frank Giunta",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "Frank Giunta",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const expected = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "Frank Giunta",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "Frank Giunta",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
});
|
||||
|
||||
it("should handle empty object array", async () => {
|
||||
const objArray = {
|
||||
"lawyers": []
|
||||
}
|
||||
lawyers: [],
|
||||
};
|
||||
|
||||
const expected = {
|
||||
"lawyers": []
|
||||
}
|
||||
lawyers: [],
|
||||
};
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
});
|
||||
|
||||
it("should handle object array with no null values", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "John Doe",
|
||||
"email": "john.doe@example.com",
|
||||
"title": "Attorney",
|
||||
name: "John Doe",
|
||||
email: "john.doe@example.com",
|
||||
title: "Attorney",
|
||||
"phone-number": "123.456.7890",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Corporate Law"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const expected = {
|
||||
"lawyers": [
|
||||
{
|
||||
"name": "John Doe",
|
||||
"email": "john.doe@example.com",
|
||||
"title": "Attorney",
|
||||
"phone-number": "123.456.7890",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Corporate Law"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
|
||||
it("should merge objects with different null values", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
{
|
||||
"name": "Jane Smith",
|
||||
"email": "null",
|
||||
"title": "Attorney",
|
||||
"description": null,
|
||||
"phone-number": "987.654.3210",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Family Law"
|
||||
}
|
||||
]
|
||||
area: "Corporate Law",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Jane Smith",
|
||||
"email": "jane.smith@example.com",
|
||||
"title": null,
|
||||
"description": "Jane Smith is an attorney specializing in Family Law.",
|
||||
"phone-number": "987.654.3210",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Family Law"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const expected = {
|
||||
"lawyers": [
|
||||
{
|
||||
"name": "Jane Smith",
|
||||
"email": "jane.smith@example.com",
|
||||
"title": "Attorney",
|
||||
"description": "Jane Smith is an attorney specializing in Family Law.",
|
||||
"phone-number": "987.654.3210",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Family Law"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
|
||||
it("should merge objects with different null values", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
{
|
||||
"name": "Frank Giunta",
|
||||
"email": "frank.giunta@example.com",
|
||||
"title": "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Frank Giunta",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Dale R. Rose",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
"phone-number": "972.562.0266",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
],
|
||||
};
|
||||
|
||||
const expected = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "Frank Giunta",
|
||||
"email": "frank.giunta@example.com",
|
||||
"title": "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
name: "John Doe",
|
||||
email: "john.doe@example.com",
|
||||
title: "Attorney",
|
||||
"phone-number": "123.456.7890",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
area: "Corporate Law",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Dale R. Rose",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
"phone-number": "972.562.0266",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
],
|
||||
};
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
});
|
||||
|
||||
it("should merge objects with different null values", async () => {
|
||||
const objArray = {
|
||||
lawyers: [
|
||||
{
|
||||
name: "Jane Smith",
|
||||
email: "null",
|
||||
title: "Attorney",
|
||||
description: null,
|
||||
"phone-number": "987.654.3210",
|
||||
"practice-areas": [
|
||||
{
|
||||
area: "Family Law",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
name: "Jane Smith",
|
||||
email: "jane.smith@example.com",
|
||||
title: null,
|
||||
description: "Jane Smith is an attorney specializing in Family Law.",
|
||||
"phone-number": "987.654.3210",
|
||||
"practice-areas": [
|
||||
{
|
||||
area: "Family Law",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const expected = {
|
||||
lawyers: [
|
||||
{
|
||||
name: "Jane Smith",
|
||||
email: "jane.smith@example.com",
|
||||
title: "Attorney",
|
||||
description: "Jane Smith is an attorney specializing in Family Law.",
|
||||
"phone-number": "987.654.3210",
|
||||
"practice-areas": [
|
||||
{
|
||||
area: "Family Law",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
});
|
||||
|
||||
it("should merge objects with different null values", async () => {
|
||||
const objArray = {
|
||||
lawyers: [
|
||||
{
|
||||
name: "Frank Giunta",
|
||||
email: "frank.giunta@example.com",
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
name: "Frank Giunta",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
name: "Dale R. Rose",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "972.562.0266",
|
||||
"practice-areas": [
|
||||
{
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const expected = {
|
||||
lawyers: [
|
||||
{
|
||||
name: "Frank Giunta",
|
||||
email: "frank.giunta@example.com",
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
name: "Dale R. Rose",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "972.562.0266",
|
||||
"practice-areas": [
|
||||
{
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
});
|
||||
|
||||
it("should correctly merge and deduplicate objects", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "Frank Giunta",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "Frank Giunta",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Frank Giunta",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "Frank Giunta",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Dale R. Rose",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "Dale R. Rose",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "972.562.0266",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const expected = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "Frank Giunta",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "Frank Giunta",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "214.370.5200",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Dale R. Rose",
|
||||
"email": null,
|
||||
"title": "Personal Injury Attorney",
|
||||
name: "Dale R. Rose",
|
||||
email: null,
|
||||
title: "Personal Injury Attorney",
|
||||
"phone-number": "972.562.0266",
|
||||
"practice-areas": [
|
||||
{
|
||||
"area": "Personal Injury"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
area: "Personal Injury",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
@ -298,177 +298,172 @@ describe("mergeNullValObjs", () => {
|
||||
|
||||
it("should merge arrays of similar objects", async () => {
|
||||
const objArray = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "Allen Cox",
|
||||
"email": null,
|
||||
"title": "Personal Injury Lawyer",
|
||||
name: "Allen Cox",
|
||||
email: null,
|
||||
title: "Personal Injury Lawyer",
|
||||
"phone-number": "972.606.9000",
|
||||
"practice-areas": [
|
||||
{ "area": "Personal Injury" }
|
||||
]
|
||||
"practice-areas": [{ area: "Personal Injury" }],
|
||||
},
|
||||
{
|
||||
"name": "Allen Cox",
|
||||
"email": "allen.cox@example.com",
|
||||
"title": "Personal Injury Lawyer",
|
||||
name: "Allen Cox",
|
||||
email: "allen.cox@example.com",
|
||||
title: "Personal Injury Lawyer",
|
||||
"phone-number": null,
|
||||
"practice-areas": [
|
||||
{ "area": "Automobile accidents" },
|
||||
{ "area": "Truck accidents" },
|
||||
{ "area": "Amusement park injury" },
|
||||
{ "area": "Bus accident" },
|
||||
{ "area": "Industrial accidents" },
|
||||
{ "area": "Product defects" },
|
||||
{ "area": "Food poisoning" },
|
||||
{ "area": "Workplace accidents" },
|
||||
{ "area": "Wrongful death" },
|
||||
{ "area": "Swimming pool accidents" },
|
||||
{ "area": "Premises accidents" },
|
||||
{ "area": "Aircraft accidents" },
|
||||
{ "area": "Animal and dog bites" }
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
{ area: "Automobile accidents" },
|
||||
{ area: "Truck accidents" },
|
||||
{ area: "Amusement park injury" },
|
||||
{ area: "Bus accident" },
|
||||
{ area: "Industrial accidents" },
|
||||
{ area: "Product defects" },
|
||||
{ area: "Food poisoning" },
|
||||
{ area: "Workplace accidents" },
|
||||
{ area: "Wrongful death" },
|
||||
{ area: "Swimming pool accidents" },
|
||||
{ area: "Premises accidents" },
|
||||
{ area: "Aircraft accidents" },
|
||||
{ area: "Animal and dog bites" },
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const expected = {
|
||||
"lawyers": [
|
||||
lawyers: [
|
||||
{
|
||||
"name": "Allen Cox",
|
||||
"email": "allen.cox@example.com",
|
||||
"title": "Personal Injury Lawyer",
|
||||
name: "Allen Cox",
|
||||
email: "allen.cox@example.com",
|
||||
title: "Personal Injury Lawyer",
|
||||
"phone-number": "972.606.9000",
|
||||
"practice-areas": [
|
||||
{ "area": "Personal Injury" },
|
||||
{ "area": "Automobile accidents" },
|
||||
{ "area": "Truck accidents" },
|
||||
{ "area": "Amusement park injury" },
|
||||
{ "area": "Bus accident" },
|
||||
{ "area": "Industrial accidents" },
|
||||
{ "area": "Product defects" },
|
||||
{ "area": "Food poisoning" },
|
||||
{ "area": "Workplace accidents" },
|
||||
{ "area": "Wrongful death" },
|
||||
{ "area": "Swimming pool accidents" },
|
||||
{ "area": "Premises accidents" },
|
||||
{ "area": "Aircraft accidents" },
|
||||
{ "area": "Animal and dog bites" }
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
{ area: "Personal Injury" },
|
||||
{ area: "Automobile accidents" },
|
||||
{ area: "Truck accidents" },
|
||||
{ area: "Amusement park injury" },
|
||||
{ area: "Bus accident" },
|
||||
{ area: "Industrial accidents" },
|
||||
{ area: "Product defects" },
|
||||
{ area: "Food poisoning" },
|
||||
{ area: "Workplace accidents" },
|
||||
{ area: "Wrongful death" },
|
||||
{ area: "Swimming pool accidents" },
|
||||
{ area: "Premises accidents" },
|
||||
{ area: "Aircraft accidents" },
|
||||
{ area: "Animal and dog bites" },
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
});
|
||||
|
||||
it("should merge arrays of similar objects with different key names", async () => {
|
||||
const objArray = {
|
||||
"attorneys": [
|
||||
attorneys: [
|
||||
{
|
||||
"fullName": "Allen Cox",
|
||||
"contactEmail": null,
|
||||
"position": "Personal Injury Lawyer",
|
||||
"contactNumber": "972.606.9000",
|
||||
"specializations": [
|
||||
{ "field": "Personal Injury" }
|
||||
]
|
||||
fullName: "Allen Cox",
|
||||
contactEmail: null,
|
||||
position: "Personal Injury Lawyer",
|
||||
contactNumber: "972.606.9000",
|
||||
specializations: [{ field: "Personal Injury" }],
|
||||
},
|
||||
{
|
||||
"fullName": "Allen Cox",
|
||||
"contactEmail": "allen.cox@example.com",
|
||||
"position": "Personal Injury Lawyer",
|
||||
"contactNumber": null,
|
||||
"specializations": [
|
||||
{ "field": "Automobile accidents" },
|
||||
{ "field": "Truck accidents" },
|
||||
{ "field": "Amusement park injury" },
|
||||
{ "field": "Bus accident" },
|
||||
{ "field": "Industrial accidents" },
|
||||
{ "field": "Product defects" },
|
||||
{ "field": "Food poisoning" },
|
||||
{ "field": "Workplace accidents" },
|
||||
{ "field": "Wrongful death" },
|
||||
{ "field": "Swimming pool accidents" },
|
||||
{ "field": "Premises accidents" },
|
||||
{ "field": "Aircraft accidents" },
|
||||
{ "field": "Animal and dog bites" }
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
fullName: "Allen Cox",
|
||||
contactEmail: "allen.cox@example.com",
|
||||
position: "Personal Injury Lawyer",
|
||||
contactNumber: null,
|
||||
specializations: [
|
||||
{ field: "Automobile accidents" },
|
||||
{ field: "Truck accidents" },
|
||||
{ field: "Amusement park injury" },
|
||||
{ field: "Bus accident" },
|
||||
{ field: "Industrial accidents" },
|
||||
{ field: "Product defects" },
|
||||
{ field: "Food poisoning" },
|
||||
{ field: "Workplace accidents" },
|
||||
{ field: "Wrongful death" },
|
||||
{ field: "Swimming pool accidents" },
|
||||
{ field: "Premises accidents" },
|
||||
{ field: "Aircraft accidents" },
|
||||
{ field: "Animal and dog bites" },
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const expected = {
|
||||
"attorneys": [
|
||||
attorneys: [
|
||||
{
|
||||
"fullName": "Allen Cox",
|
||||
"contactEmail": "allen.cox@example.com",
|
||||
"position": "Personal Injury Lawyer",
|
||||
"contactNumber": "972.606.9000",
|
||||
"specializations": [
|
||||
{ "field": "Personal Injury" },
|
||||
{ "field": "Automobile accidents" },
|
||||
{ "field": "Truck accidents" },
|
||||
{ "field": "Amusement park injury" },
|
||||
{ "field": "Bus accident" },
|
||||
{ "field": "Industrial accidents" },
|
||||
{ "field": "Product defects" },
|
||||
{ "field": "Food poisoning" },
|
||||
{ "field": "Workplace accidents" },
|
||||
{ "field": "Wrongful death" },
|
||||
{ "field": "Swimming pool accidents" },
|
||||
{ "field": "Premises accidents" },
|
||||
{ "field": "Aircraft accidents" },
|
||||
{ "field": "Animal and dog bites" }
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
fullName: "Allen Cox",
|
||||
contactEmail: "allen.cox@example.com",
|
||||
position: "Personal Injury Lawyer",
|
||||
contactNumber: "972.606.9000",
|
||||
specializations: [
|
||||
{ field: "Personal Injury" },
|
||||
{ field: "Automobile accidents" },
|
||||
{ field: "Truck accidents" },
|
||||
{ field: "Amusement park injury" },
|
||||
{ field: "Bus accident" },
|
||||
{ field: "Industrial accidents" },
|
||||
{ field: "Product defects" },
|
||||
{ field: "Food poisoning" },
|
||||
{ field: "Workplace accidents" },
|
||||
{ field: "Wrongful death" },
|
||||
{ field: "Swimming pool accidents" },
|
||||
{ field: "Premises accidents" },
|
||||
{ field: "Aircraft accidents" },
|
||||
{ field: "Animal and dog bites" },
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
});
|
||||
|
||||
it ("should deal with not array values", async () => {
|
||||
it("should deal with not array values", async () => {
|
||||
const objArray = {
|
||||
"lawyers": {
|
||||
"name": "not an array"
|
||||
lawyers: {
|
||||
name: "not an array",
|
||||
},
|
||||
"attorneys": {
|
||||
"name": "not an array"
|
||||
}
|
||||
}
|
||||
attorneys: {
|
||||
name: "not an array",
|
||||
},
|
||||
};
|
||||
|
||||
const expected = {
|
||||
"lawyers": {
|
||||
"name": "not an array"
|
||||
lawyers: {
|
||||
name: "not an array",
|
||||
},
|
||||
"attorneys": {
|
||||
"name": "not an array"
|
||||
}
|
||||
}
|
||||
attorneys: {
|
||||
name: "not an array",
|
||||
},
|
||||
};
|
||||
|
||||
// @ts-expect-error
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
});
|
||||
|
||||
it ("should deal with arrays of strings", async () => {
|
||||
const objArray = {
|
||||
"lawyers": ["res1", "res2", "res3"]
|
||||
}
|
||||
it("should deal with arrays of strings", async () => {
|
||||
const objArray = {
|
||||
lawyers: ["res1", "res2", "res3"],
|
||||
};
|
||||
|
||||
const expected = {
|
||||
"lawyers": ["res1", "res2", "res3"]
|
||||
}
|
||||
const expected = {
|
||||
lawyers: ["res1", "res2", "res3"],
|
||||
};
|
||||
|
||||
const result = mergeNullValObjs(objArray);
|
||||
const result = mergeNullValObjs(objArray);
|
||||
|
||||
expect(result).toEqual(expected);
|
||||
})
|
||||
|
||||
})
|
||||
expect(result).toEqual(expected);
|
||||
});
|
||||
});
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -2,7 +2,7 @@ import { spreadSchemas } from "../extract/helpers/spread-schemas";
|
||||
|
||||
describe("spreadSchemas", () => {
|
||||
it("should spread kyb schema (id: 1)", async () => {
|
||||
const keys = ["owners"]
|
||||
const keys = ["owners"];
|
||||
const schema = {
|
||||
type: "object",
|
||||
properties: {
|
||||
@ -21,13 +21,13 @@ describe("spreadSchemas", () => {
|
||||
city: { type: "string" },
|
||||
state: { type: "string" },
|
||||
country: { type: "string" },
|
||||
postal_code: { type: "string" }
|
||||
postal_code: { type: "string" },
|
||||
},
|
||||
},
|
||||
incorporation_date: { type: "string", format: "date" },
|
||||
phone: { type: "string" },
|
||||
email: { type: "string", format: "email" }
|
||||
}
|
||||
email: { type: "string", format: "email" },
|
||||
},
|
||||
},
|
||||
owners: {
|
||||
type: "array",
|
||||
@ -43,18 +43,21 @@ describe("spreadSchemas", () => {
|
||||
city: { type: "string" },
|
||||
state: { type: "string" },
|
||||
country: { type: "string" },
|
||||
postal_code: { type: "string" }
|
||||
postal_code: { type: "string" },
|
||||
},
|
||||
},
|
||||
phone: { type: "string" },
|
||||
email: { type: "string", format: "email" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
email: { type: "string", format: "email" },
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
|
||||
schema,
|
||||
keys,
|
||||
);
|
||||
|
||||
expect(singleAnswerSchema).toEqual({
|
||||
type: "object",
|
||||
@ -74,16 +77,16 @@ describe("spreadSchemas", () => {
|
||||
city: { type: "string" },
|
||||
state: { type: "string" },
|
||||
country: { type: "string" },
|
||||
postal_code: { type: "string" }
|
||||
}
|
||||
postal_code: { type: "string" },
|
||||
},
|
||||
},
|
||||
incorporation_date: { type: "string", format: "date" },
|
||||
phone: { type: "string" },
|
||||
email: { type: "string", format: "email" }
|
||||
}
|
||||
email: { type: "string", format: "email" },
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
});
|
||||
|
||||
expect(multiEntitySchema).toEqual({
|
||||
type: "object",
|
||||
@ -102,20 +105,20 @@ describe("spreadSchemas", () => {
|
||||
city: { type: "string" },
|
||||
state: { type: "string" },
|
||||
country: { type: "string" },
|
||||
postal_code: { type: "string" }
|
||||
}
|
||||
postal_code: { type: "string" },
|
||||
},
|
||||
},
|
||||
phone: { type: "string" },
|
||||
email: { type: "string", format: "email" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
})
|
||||
email: { type: "string", format: "email" },
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("should spread lawyers schema (id: 9)", async () => {
|
||||
const keys = ["lawyers"]
|
||||
const keys = ["lawyers"];
|
||||
const schema = {
|
||||
type: "object",
|
||||
properties: {
|
||||
@ -133,22 +136,25 @@ describe("spreadSchemas", () => {
|
||||
items: {
|
||||
type: "object",
|
||||
properties: {
|
||||
area: { type: "string" }
|
||||
area: { type: "string" },
|
||||
},
|
||||
},
|
||||
alias: "practice-areas"
|
||||
}
|
||||
alias: "practice-areas",
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
|
||||
schema,
|
||||
keys,
|
||||
);
|
||||
|
||||
expect(singleAnswerSchema).toEqual({})
|
||||
expect(multiEntitySchema).toEqual(schema)
|
||||
})
|
||||
expect(singleAnswerSchema).toEqual({});
|
||||
expect(multiEntitySchema).toEqual(schema);
|
||||
});
|
||||
|
||||
it("shoud spread (id: 26)", async () => {
|
||||
const schema = {
|
||||
@ -161,19 +167,22 @@ describe("spreadSchemas", () => {
|
||||
properties: {
|
||||
name: { type: "string" },
|
||||
price: { type: "string" },
|
||||
description: { type: "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
description: { type: "string" },
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const keys = ["products"]
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
|
||||
const keys = ["products"];
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
|
||||
schema,
|
||||
keys,
|
||||
);
|
||||
|
||||
expect(singleAnswerSchema).toEqual({})
|
||||
expect(multiEntitySchema).toEqual(schema)
|
||||
})
|
||||
expect(singleAnswerSchema).toEqual({});
|
||||
expect(multiEntitySchema).toEqual(schema);
|
||||
});
|
||||
|
||||
it("shoud spread categories and products", async () => {
|
||||
const schema = {
|
||||
@ -182,8 +191,8 @@ describe("spreadSchemas", () => {
|
||||
categories: {
|
||||
type: "array",
|
||||
items: {
|
||||
type: "string"
|
||||
}
|
||||
type: "string",
|
||||
},
|
||||
},
|
||||
products: {
|
||||
type: "array",
|
||||
@ -192,19 +201,22 @@ describe("spreadSchemas", () => {
|
||||
properties: {
|
||||
name: { type: "string" },
|
||||
price: { type: "string" },
|
||||
description: { type: "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
description: { type: "string" },
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const keys = ["products", "categories"]
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
|
||||
const keys = ["products", "categories"];
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
|
||||
schema,
|
||||
keys,
|
||||
);
|
||||
|
||||
expect(singleAnswerSchema).toEqual({})
|
||||
expect(multiEntitySchema).toEqual(schema)
|
||||
})
|
||||
expect(singleAnswerSchema).toEqual({});
|
||||
expect(multiEntitySchema).toEqual(schema);
|
||||
});
|
||||
|
||||
it("should spread (id: 29)", async () => {
|
||||
const schema = {
|
||||
@ -220,50 +232,55 @@ describe("spreadSchemas", () => {
|
||||
offers_cmmc: { type: "boolean" },
|
||||
has_soc_2_cert: { type: "boolean" },
|
||||
offers_office365: { type: "boolean" },
|
||||
offers_endpoint_security: { type: "boolean" }
|
||||
}
|
||||
}
|
||||
offers_endpoint_security: { type: "boolean" },
|
||||
},
|
||||
};
|
||||
|
||||
const keys = []
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
|
||||
const keys = [];
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
|
||||
schema,
|
||||
keys,
|
||||
);
|
||||
|
||||
expect(singleAnswerSchema).toEqual(schema)
|
||||
expect(multiEntitySchema).toEqual({})
|
||||
})
|
||||
expect(singleAnswerSchema).toEqual(schema);
|
||||
expect(multiEntitySchema).toEqual({});
|
||||
});
|
||||
|
||||
it("should spread kyb schema (id: 29)", async () => {
|
||||
|
||||
const schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"lawyers": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": { "type": "string" },
|
||||
"email": { "type": ["string", "null"] },
|
||||
"phone-number": { "type": "string" },
|
||||
type: "object",
|
||||
properties: {
|
||||
lawyers: {
|
||||
type: "array",
|
||||
items: {
|
||||
type: "object",
|
||||
properties: {
|
||||
name: { type: "string" },
|
||||
email: { type: ["string", "null"] },
|
||||
"phone-number": { type: "string" },
|
||||
"practice-areas": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"area": { "type": "string" }
|
||||
}
|
||||
}
|
||||
type: "array",
|
||||
items: {
|
||||
type: "object",
|
||||
properties: {
|
||||
area: { type: "string" },
|
||||
},
|
||||
},
|
||||
},
|
||||
"title": { "type": ["string", "null"] }
|
||||
title: { type: ["string", "null"] },
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const keys = ["lawyers"]
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
|
||||
const keys = ["lawyers"];
|
||||
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
|
||||
schema,
|
||||
keys,
|
||||
);
|
||||
|
||||
expect(singleAnswerSchema).toEqual({})
|
||||
expect(multiEntitySchema).toEqual(schema)
|
||||
})
|
||||
})
|
||||
expect(singleAnswerSchema).toEqual({});
|
||||
expect(multiEntitySchema).toEqual(schema);
|
||||
});
|
||||
});
|
||||
File diff suppressed because it is too large
@@ -42,7 +42,10 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) {
if (!cacheRedis) return;

if (!entry.html || entry.html.length < 100) {
logger.warn("Skipping cache save for short HTML", { key, htmlLength: entry.html?.length });
logger.warn("Skipping cache save for short HTML", {
key,
htmlLength: entry.html?.length,
});
return;
}
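(Aside, not part of the diff: a minimal sketch of how the guard above behaves, with invented values. Any entry whose HTML is shorter than 100 characters is skipped with a warning instead of being written to Redis.)

// Hypothetical call: the tiny body trips the length guard, so nothing is cached.
await saveEntryToCache("cache:key:example", {
  html: "<html></html>", // well under the 100-character minimum
  // ...remaining CacheEntry fields omitted for brevity
} as CacheEntry);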
@@ -127,13 +127,15 @@ export async function getDoneJobsOrdered(
export async function isCrawlFinished(id: string) {
return (
(await redisConnection.scard("crawl:" + id + ":jobs_done")) ===
(await redisConnection.scard("crawl:" + id + ":jobs"))
&& (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
(await redisConnection.scard("crawl:" + id + ":jobs")) &&
(await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
);
}

export async function isCrawlKickoffFinished(id: string) {
return await redisConnection.get("crawl:" + id + ":kickoff:finish") !== null
return (
(await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
);
}

export async function isCrawlFinishedLocked(id: string) {
@@ -141,7 +143,12 @@ export async function isCrawlFinishedLocked(id: string) {
}

export async function finishCrawlKickoff(id: string) {
await redisConnection.set("crawl:" + id + ":kickoff:finish", "yes", "EX", 24 * 60 * 60);
await redisConnection.set(
"crawl:" + id + ":kickoff:finish",
"yes",
"EX",
24 * 60 * 60,
);
}

export async function finishCrawl(id: string) {
@@ -161,9 +168,10 @@ export async function finishCrawl(id: string) {
module: "crawl-redis",
method: "finishCrawl",
crawlId: id,
jobs_done: (await redisConnection.scard("crawl:" + id + ":jobs_done")),
jobs: (await redisConnection.scard("crawl:" + id + ":jobs")),
kickoff_finished: (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
jobs_done: await redisConnection.scard("crawl:" + id + ":jobs_done"),
jobs: await redisConnection.scard("crawl:" + id + ":jobs"),
kickoff_finished:
(await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
});
}
}
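(Aside, not part of the diff: a sketch of how these helpers are meant to compose. The Redis key names match the ones above; the polling wrapper itself is hypothetical.)

// A crawl counts as finished only after finishCrawlKickoff has written the
// ":kickoff:finish" marker and every job in ":jobs" also appears in ":jobs_done".
async function waitForCrawl(id: string, intervalMs = 1000): Promise<void> {
  while (!(await isCrawlFinished(id))) {
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
}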
@@ -1,81 +1,81 @@
// const id = crypto.randomUUID();

// const sc: StoredCrawl = {
// originUrl: request.urls[0].replace("/*",""),
// crawlerOptions: toLegacyCrawlerOptions({
// maxDepth: 15,
// limit: 5000,
// includePaths: [],
// excludePaths: [],
// ignoreSitemap: false,
// allowExternalLinks: false,
// allowBackwardLinks: true,
// allowSubdomains: false,
// ignoreRobotsTxt: false,
// deduplicateSimilarURLs: false,
// ignoreQueryParameters: false
// }),
// scrapeOptions: {
// formats: ["markdown"],
// onlyMainContent: true,
// waitFor: 0,
// mobile: false,
// removeBase64Images: true,
// fastMode: false,
// parsePDF: true,
// skipTlsVerification: false,
// },
// internalOptions: {
// disableSmartWaitCache: true,
// isBackgroundIndex: true
// },
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
// createdAt: Date.now(),
// plan: "hobby", // make it a low concurrency
// };

// // Save the crawl configuration
// await saveCrawl(id, sc);

// // Then kick off the job
// await _addScrapeJobToBullMQ({
// url: request.urls[0].replace("/*",""),
// mode: "kickoff" as const,
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
// plan: "hobby", // make it a low concurrency
// crawlerOptions: sc.crawlerOptions,
// scrapeOptions: sc.scrapeOptions,
// internalOptions: sc.internalOptions,
// origin: "index",
// crawl_id: id,
// webhook: null,
// v1: true,
// }, {}, crypto.randomUUID(), 50);

// we restructure and make all of the arrays we need to fill into objects,
// adding them to a single object so the llm can fill them one at a time
// TODO: make this work for more complex schemas where arrays are not first level

// let schemasForLLM: {} = {};
// for (const key in largeArraysSchema) {
// const originalSchema = structuredClone(largeArraysSchema[key].items);
// console.log(
// "key",
// key,
// "\noriginalSchema",
// JSON.stringify(largeArraysSchema[key], null, 2),
// );
// let clonedObj = {
// type: "object",
// properties: {
// informationFilled: {
// type: "boolean",
// },
// data: {
// type: "object",
// properties: originalSchema.properties,
// },
// },
// };
// schemasForLLM[key] = clonedObj;
// }
@@ -59,11 +59,11 @@ export async function updateExtract(

// Limit links in steps to 500
if (extract.steps) {
extract.steps = extract.steps.map(step => {
extract.steps = extract.steps.map((step) => {
if (step.discoveredLinks && step.discoveredLinks.length > 500) {
return {
...step,
discoveredLinks: step.discoveredLinks.slice(0, 500)
discoveredLinks: step.discoveredLinks.slice(0, 500),
};
}
return step;
@ -32,7 +32,11 @@ import { ExtractStep, updateExtract } from "./extract-redis";
|
||||
import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array";
|
||||
import { mergeNullValObjs } from "./helpers/merge-null-val-objs";
|
||||
import { CUSTOM_U_TEAMS, extractConfig } from "./config";
|
||||
import { calculateFinalResultCost, estimateCost, estimateTotalCost } from "./usage/llm-cost";
|
||||
import {
|
||||
calculateFinalResultCost,
|
||||
estimateCost,
|
||||
estimateTotalCost,
|
||||
} from "./usage/llm-cost";
|
||||
import { numTokensFromString } from "../LLM-extraction/helpers";
|
||||
|
||||
interface ExtractServiceOptions {
|
||||
@ -147,7 +151,13 @@ Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`,
|
||||
totalTokens: result.usage?.total_tokens ?? 0,
|
||||
model: model,
|
||||
};
|
||||
return { isMultiEntity, multiEntityKeys, reasoning, keyIndicators, tokenUsage };
|
||||
return {
|
||||
isMultiEntity,
|
||||
multiEntityKeys,
|
||||
reasoning,
|
||||
keyIndicators,
|
||||
tokenUsage,
|
||||
};
|
||||
}
|
||||
|
||||
type completions = {
|
||||
@ -187,7 +197,7 @@ export async function performExtraction(
|
||||
method: "performExtraction",
|
||||
extractId,
|
||||
});
|
||||
|
||||
|
||||
// Token tracking
|
||||
let tokenUsage: TokenUsage[] = [];
|
||||
|
||||
@ -246,7 +256,7 @@ export async function performExtraction(
|
||||
"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
|
||||
extractId,
|
||||
urlTrace: urlTraces,
|
||||
totalUrlsScraped: 0
|
||||
totalUrlsScraped: 0,
|
||||
};
|
||||
}
|
||||
|
||||
@ -277,8 +287,13 @@ export async function performExtraction(
|
||||
// 1. the first one is a completion that will extract the array of items
|
||||
// 2. the second one is multiple completions that will extract the items from the array
|
||||
let startAnalyze = Date.now();
|
||||
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators, tokenUsage: schemaAnalysisTokenUsage } =
|
||||
await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
|
||||
const {
|
||||
isMultiEntity,
|
||||
multiEntityKeys,
|
||||
reasoning,
|
||||
keyIndicators,
|
||||
tokenUsage: schemaAnalysisTokenUsage,
|
||||
} = await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
|
||||
|
||||
// Track schema analysis tokens
|
||||
tokenUsage.push(schemaAnalysisTokenUsage);
|
||||
@ -540,7 +555,7 @@ export async function performExtraction(
|
||||
"An unexpected error occurred. Please contact help@firecrawl.com for help.",
|
||||
extractId,
|
||||
urlTrace: urlTraces,
|
||||
totalUrlsScraped
|
||||
totalUrlsScraped,
|
||||
};
|
||||
}
|
||||
}
|
||||
@ -592,17 +607,18 @@ export async function performExtraction(
|
||||
}
|
||||
}
|
||||
|
||||
const validResults = results.filter((doc): doc is Document => doc !== null);
|
||||
const validResults = results.filter(
|
||||
(doc): doc is Document => doc !== null,
|
||||
);
|
||||
singleAnswerDocs.push(...validResults);
|
||||
totalUrlsScraped += validResults.length;
|
||||
|
||||
} catch (error) {
|
||||
return {
|
||||
success: false,
|
||||
error: error.message,
|
||||
extractId,
|
||||
urlTrace: urlTraces,
|
||||
totalUrlsScraped
|
||||
totalUrlsScraped,
|
||||
};
|
||||
}
|
||||
|
||||
@ -614,7 +630,7 @@ export async function performExtraction(
|
||||
"All provided URLs are invalid. Please check your input and try again.",
|
||||
extractId,
|
||||
urlTrace: request.urlTrace ? urlTraces : undefined,
|
||||
totalUrlsScraped: 0
|
||||
totalUrlsScraped: 0,
|
||||
};
|
||||
}
|
||||
|
||||
@ -679,12 +695,12 @@ export async function performExtraction(
|
||||
: singleAnswerResult || multiEntityResult;
|
||||
|
||||
// Tokenize final result to get token count
|
||||
let finalResultTokens = 0;
|
||||
if (finalResult) {
|
||||
const finalResultStr = JSON.stringify(finalResult);
|
||||
finalResultTokens = numTokensFromString(finalResultStr, "gpt-4o");
|
||||
// let finalResultTokens = 0;
|
||||
// if (finalResult) {
|
||||
// const finalResultStr = JSON.stringify(finalResult);
|
||||
// finalResultTokens = numTokensFromString(finalResultStr, "gpt-4o");
|
||||
|
||||
}
|
||||
// }
|
||||
// // Deduplicate and validate final result against schema
|
||||
// if (reqSchema && finalResult && finalResult.length <= extractConfig.DEDUPLICATION.MAX_TOKENS) {
|
||||
// const schemaValidation = await generateOpenAICompletions(
|
||||
@ -695,7 +711,7 @@ export async function performExtraction(
|
||||
// 1. Remove any duplicate entries in the data extracted by merging that into a single object according to the provided shcema
|
||||
// 2. Ensure all data matches the provided schema
|
||||
// 3. Keep only the highest quality and most complete entries when duplicates are found.
|
||||
|
||||
|
||||
// Do not change anything else. If data is null keep it null. If the schema is not provided, return the data as is.`,
|
||||
// prompt: `Please validate and merge the duplicate entries in this data according to the schema provided:\n
|
||||
|
||||
@ -732,12 +748,10 @@ export async function performExtraction(
|
||||
const llmUsage = estimateTotalCost(tokenUsage);
|
||||
let tokensToBill = calculateFinalResultCost(finalResult);
|
||||
|
||||
|
||||
if (CUSTOM_U_TEAMS.includes(teamId)) {
|
||||
tokensToBill = 1;
|
||||
}
|
||||
|
||||
|
||||
// Bill team for usage
|
||||
billTeam(teamId, subId, tokensToBill, logger, true).catch((error) => {
|
||||
logger.error(
|
||||
@ -745,7 +759,6 @@ export async function performExtraction(
|
||||
);
|
||||
});
|
||||
|
||||
|
||||
// Log job with token usage
|
||||
logJob({
|
||||
job_id: extractId,
|
||||
@ -779,6 +792,6 @@ export async function performExtraction(
|
||||
warning: undefined, // TODO FIX
|
||||
urlTrace: request.urlTrace ? urlTraces : undefined,
|
||||
llmUsage,
|
||||
totalUrlsScraped
|
||||
totalUrlsScraped,
|
||||
};
|
||||
}
|
||||
|
@@ -1,10 +1,12 @@
export function deduplicateObjectsArray(objArray: { [key: string]: any[] }): { [key: string]: any[] } {
export function deduplicateObjectsArray(objArray: { [key: string]: any[] }): {
[key: string]: any[];
} {
const deduplicatedObjArray: { [key: string]: any[] } = {};

for (const key in objArray) {
if (Array.isArray(objArray[key])) {
const seen = new Set();
deduplicatedObjArray[key] = objArray[key].filter(item => {
deduplicatedObjArray[key] = objArray[key].filter((item) => {
// Create a unique identifier for each item based on its properties
const identifier = JSON.stringify(item);

@@ -24,4 +26,4 @@ export function deduplicateObjectsArray(objArray: { [key: string]: any[] }): { [
}

return deduplicatedObjArray;
}
}
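(Aside, not part of the diff: a usage sketch for deduplicateObjectsArray with invented data. Items are keyed by their JSON.stringify form, so only exact duplicates are dropped.)

const deduped = deduplicateObjectsArray({
  lawyers: [
    { name: "Ada Example", email: null },
    { name: "Ada Example", email: null }, // identical serialization, filtered out
    { name: "Grace Example", email: "grace@example.com" },
  ],
});
// deduped.lawyers.length === 2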
@@ -7,4 +7,4 @@ export async function dereferenceSchema(schema: any): Promise<any> {
console.error("Failed to dereference schema:", error);
throw error;
}
}
}
@ -1,5 +1,5 @@
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as fs from "fs";
|
||||
import * as path from "path";
|
||||
|
||||
/**
|
||||
* Helper function to dump data to a file for debugging/logging purposes
|
||||
@ -10,17 +10,19 @@ import * as path from 'path';
|
||||
export function dumpToFile<T>(
|
||||
filename: string,
|
||||
data: T[],
|
||||
formatter?: (item: T, index: number) => string
|
||||
formatter?: (item: T, index: number) => string,
|
||||
) {
|
||||
const filePath = path.join(__dirname, filename);
|
||||
|
||||
|
||||
let fileContent: string;
|
||||
if (formatter) {
|
||||
fileContent = data.map((item, index) => formatter(item, index)).join('\n');
|
||||
fileContent = data.map((item, index) => formatter(item, index)).join("\n");
|
||||
} else {
|
||||
fileContent = data.map((item, index) => `${index + 1}. ${JSON.stringify(item)}`).join('\n');
|
||||
fileContent = data
|
||||
.map((item, index) => `${index + 1}. ${JSON.stringify(item)}`)
|
||||
.join("\n");
|
||||
}
|
||||
|
||||
fs.writeFileSync(filePath, fileContent, 'utf8');
|
||||
fs.writeFileSync(filePath, fileContent, "utf8");
|
||||
console.log(`Dumped data to ${filename}`);
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
import { deduplicateObjectsArray } from './deduplicate-objs-array';
|
||||
import { deduplicateObjectsArray } from "./deduplicate-objs-array";
|
||||
|
||||
/**
|
||||
* Convert "null" strings to actual null values for easier comparison.
|
||||
@ -25,16 +25,16 @@ function areMergeable(obj1: any, obj2: any): boolean {
|
||||
const allKeys = new Set([...Object.keys(obj1), ...Object.keys(obj2)]);
|
||||
let matchingNonNullValues = 0;
|
||||
let nonNullComparisons = 0;
|
||||
|
||||
|
||||
for (const key of allKeys) {
|
||||
const val1 = obj1[key];
|
||||
const val2 = obj2[key];
|
||||
|
||||
|
||||
// Skip array comparisons - they'll be merged separately
|
||||
if (Array.isArray(val1) || Array.isArray(val2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// If both values exist and are not null
|
||||
if (val1 !== null && val2 !== null) {
|
||||
nonNullComparisons++;
|
||||
@ -43,7 +43,7 @@ function areMergeable(obj1: any, obj2: any): boolean {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Objects are mergeable if they have at least one matching non-null value
|
||||
// and all their non-null values match when both objects have them
|
||||
return nonNullComparisons > 0 && matchingNonNullValues === nonNullComparisons;
|
||||
@ -56,7 +56,10 @@ function mergeArrays(arr1: any[], arr2: any[]): any[] {
|
||||
const combined = [...arr1, ...arr2];
|
||||
return combined.filter((item, index) => {
|
||||
const stringified = JSON.stringify(item);
|
||||
return combined.findIndex(other => JSON.stringify(other) === stringified) === index;
|
||||
return (
|
||||
combined.findIndex((other) => JSON.stringify(other) === stringified) ===
|
||||
index
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
@ -78,9 +81,9 @@ function mergeObjects(obj1: any, obj2: any): any {
|
||||
// If only obj2's value is an array, use it
|
||||
result[key] = [...obj2[key]];
|
||||
}
|
||||
} else if (typeof obj2[key] === 'object') {
|
||||
} else if (typeof obj2[key] === "object") {
|
||||
// If both are objects (but not arrays), merge them
|
||||
if (typeof result[key] === 'object' && !Array.isArray(result[key])) {
|
||||
if (typeof result[key] === "object" && !Array.isArray(result[key])) {
|
||||
result[key] = mergeObjects(result[key], obj2[key]);
|
||||
} else {
|
||||
result[key] = { ...obj2[key] };
|
||||
@ -101,13 +104,17 @@ function mergeObjects(obj1: any, obj2: any): any {
|
||||
* null-equivalent fields, filling in null fields with the corresponding
|
||||
* non-null fields from the other object.
|
||||
*/
|
||||
export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: string]: any[] } {
|
||||
export function mergeNullValObjs(objArray: { [key: string]: any[] }): {
|
||||
[key: string]: any[];
|
||||
} {
|
||||
const result: { [key: string]: any[] } = {};
|
||||
|
||||
for (const key in objArray) {
|
||||
if (Array.isArray(objArray[key])) {
|
||||
// If array contains only primitive values, return as is
|
||||
if (objArray[key].every(item => typeof item !== 'object' || item === null)) {
|
||||
if (
|
||||
objArray[key].every((item) => typeof item !== "object" || item === null)
|
||||
) {
|
||||
result[key] = [...objArray[key]];
|
||||
continue;
|
||||
}
|
||||
@ -117,7 +124,7 @@ export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: st
|
||||
|
||||
for (const item of items) {
|
||||
let merged = false;
|
||||
|
||||
|
||||
for (let i = 0; i < mergedItems.length; i++) {
|
||||
if (areMergeable(mergedItems[i], item)) {
|
||||
mergedItems[i] = mergeObjects(mergedItems[i], item);
|
||||
@ -125,7 +132,7 @@ export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: st
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (!merged) {
|
||||
mergedItems.push({ ...item });
|
||||
}
|
||||
@ -134,10 +141,13 @@ export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: st
|
||||
// Final deduplication pass
|
||||
result[key] = deduplicateObjectsArray({ [key]: mergedItems })[key];
|
||||
} else {
|
||||
console.warn(`Expected an array at objArray[${key}], but found:`, objArray[key]);
|
||||
console.warn(
|
||||
`Expected an array at objArray[${key}], but found:`,
|
||||
objArray[key],
|
||||
);
|
||||
return objArray;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
export async function mixSchemaObjects(
|
||||
finalSchema: any,
|
||||
singleAnswerResult: any,
|
||||
multiEntityResult: any
|
||||
multiEntityResult: any,
|
||||
) {
|
||||
const finalResult: any = {};
|
||||
|
||||
@ -9,14 +9,20 @@ export async function mixSchemaObjects(
|
||||
function mergeResults(schema: any, singleResult: any, multiResult: any) {
|
||||
const result: any = {};
|
||||
for (const key in schema.properties) {
|
||||
if (schema.properties[key].type === 'object' && schema.properties[key].properties) {
|
||||
if (
|
||||
schema.properties[key].type === "object" &&
|
||||
schema.properties[key].properties
|
||||
) {
|
||||
// If the property is an object, recursively merge its properties
|
||||
result[key] = mergeResults(
|
||||
schema.properties[key],
|
||||
singleResult[key] || {},
|
||||
multiResult[key] || {}
|
||||
multiResult[key] || {},
|
||||
);
|
||||
} else if (schema.properties[key].type === 'array' && Array.isArray(multiResult[key])) {
|
||||
} else if (
|
||||
schema.properties[key].type === "array" &&
|
||||
Array.isArray(multiResult[key])
|
||||
) {
|
||||
// If the property is an array, flatten the arrays from multiResult
|
||||
result[key] = multiResult[key].flat();
|
||||
} else if (singleResult.hasOwnProperty(key)) {
|
||||
@ -29,7 +35,10 @@ export async function mixSchemaObjects(
|
||||
}
|
||||
|
||||
// Merge the properties from the final schema
|
||||
Object.assign(finalResult, mergeResults(finalSchema, singleAnswerResult, multiEntityResult));
|
||||
Object.assign(
|
||||
finalResult,
|
||||
mergeResults(finalSchema, singleAnswerResult, multiEntityResult),
|
||||
);
|
||||
|
||||
return finalResult;
|
||||
}
|
||||
}
|
||||
|
@@ -1,4 +1,7 @@
export async function spreadSchemas(schema: any, keys: string[]): Promise<{
export async function spreadSchemas(
schema: any,
keys: string[],
): Promise<{
singleAnswerSchema: any;
multiEntitySchema: any;
}> {
@@ -32,7 +35,7 @@ export async function spreadSchemas(schema: any, keys: string[]): Promise<{
if (Object.keys(singleAnswerSchema.properties).length === 0) {
singleAnswerSchema = {};
}

if (Object.keys(multiEntitySchema.properties).length === 0) {
multiEntitySchema = {};
}
@@ -41,4 +44,4 @@ export async function spreadSchemas(schema: any, keys: string[]): Promise<{
singleAnswerSchema,
multiEntitySchema,
};
}
}
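(Aside, not part of the diff: a usage sketch mirroring the spreadSchemas tests above; the schema is invented for illustration.)

// Properties named in "keys" go to multiEntitySchema; everything else stays
// in singleAnswerSchema. If either side ends up empty it becomes {}.
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
  {
    type: "object",
    properties: {
      company_name: { type: "string" },
      products: {
        type: "array",
        items: { type: "object", properties: { name: { type: "string" } } },
      },
    },
  },
  ["products"],
);
// singleAnswerSchema keeps company_name; multiEntitySchema keeps products.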
@ -1,21 +1,21 @@
|
||||
import isEqual from 'lodash/isEqual';
|
||||
import isEqual from "lodash/isEqual";
|
||||
|
||||
export function transformArrayToObject(
|
||||
originalSchema: any,
|
||||
arrayData: any[]
|
||||
arrayData: any[],
|
||||
): any {
|
||||
if (Object.keys(originalSchema).length == 0) {
|
||||
return {};
|
||||
}
|
||||
|
||||
|
||||
const transformedResult: any = {};
|
||||
|
||||
// Function to find the array key in a nested schema
|
||||
function findArrayKey(schema: any): string | null {
|
||||
for (const key in schema.properties) {
|
||||
if (schema.properties[key].type === 'array') {
|
||||
if (schema.properties[key].type === "array") {
|
||||
return key;
|
||||
} else if (schema.properties[key].type === 'object') {
|
||||
} else if (schema.properties[key].type === "object") {
|
||||
const nestedKey = findArrayKey(schema.properties[key]);
|
||||
if (nestedKey) {
|
||||
return `${key}.${nestedKey}`;
|
||||
@ -31,7 +31,10 @@ export function transformArrayToObject(
|
||||
for (const key in item) {
|
||||
if (!acc[key]) {
|
||||
acc[key] = item[key];
|
||||
} else if (typeof acc[key] === 'object' && typeof item[key] === 'object') {
|
||||
} else if (
|
||||
typeof acc[key] === "object" &&
|
||||
typeof item[key] === "object"
|
||||
) {
|
||||
acc[key] = { ...acc[key], ...item[key] };
|
||||
}
|
||||
}
|
||||
@ -39,13 +42,16 @@ export function transformArrayToObject(
|
||||
}, {});
|
||||
}
|
||||
|
||||
const arrayKeyParts = arrayKeyPath.split('.');
|
||||
const arrayKeyParts = arrayKeyPath.split(".");
|
||||
const arrayKey = arrayKeyParts.pop();
|
||||
if (!arrayKey) {
|
||||
throw new Error("Array key not found in schema");
|
||||
}
|
||||
|
||||
const parentSchema = arrayKeyParts.reduce((schema, key) => schema.properties[key], originalSchema);
|
||||
const parentSchema = arrayKeyParts.reduce(
|
||||
(schema, key) => schema.properties[key],
|
||||
originalSchema,
|
||||
);
|
||||
const itemSchema = parentSchema.properties[arrayKey].items;
|
||||
if (!itemSchema) {
|
||||
throw new Error("Item schema not found for array key");
|
||||
@ -53,7 +59,7 @@ export function transformArrayToObject(
|
||||
|
||||
// Initialize the array in the transformed result
|
||||
let currentLevel = transformedResult;
|
||||
arrayKeyParts.forEach(part => {
|
||||
arrayKeyParts.forEach((part) => {
|
||||
if (!currentLevel[part]) {
|
||||
currentLevel[part] = {};
|
||||
}
|
||||
@ -63,20 +69,23 @@ export function transformArrayToObject(
|
||||
|
||||
// Helper function to check if an object is already in the array
|
||||
function isDuplicateObject(array: any[], obj: any): boolean {
|
||||
return array.some(existingItem => isEqual(existingItem, obj));
|
||||
return array.some((existingItem) => isEqual(existingItem, obj));
|
||||
}
|
||||
|
||||
// Helper function to validate if an object follows the schema
|
||||
function isValidObject(obj: any, schema: any): boolean {
|
||||
return Object.keys(schema.properties).every(key => {
|
||||
return obj.hasOwnProperty(key) && typeof obj[key] === schema.properties[key].type;
|
||||
return Object.keys(schema.properties).every((key) => {
|
||||
return (
|
||||
obj.hasOwnProperty(key) &&
|
||||
typeof obj[key] === schema.properties[key].type
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
// Iterate over each item in the arrayData
|
||||
arrayData.forEach(item => {
|
||||
arrayData.forEach((item) => {
|
||||
let currentItem = item;
|
||||
arrayKeyParts.forEach(part => {
|
||||
arrayKeyParts.forEach((part) => {
|
||||
if (currentItem[part]) {
|
||||
currentItem = currentItem[part];
|
||||
}
|
||||
@ -84,43 +93,63 @@ export function transformArrayToObject(
|
||||
|
||||
// Copy non-array properties from the parent object
|
||||
for (const key in parentSchema.properties) {
|
||||
if (key !== arrayKey && currentItem.hasOwnProperty(key) && !currentLevel.hasOwnProperty(key)) {
|
||||
if (
|
||||
key !== arrayKey &&
|
||||
currentItem.hasOwnProperty(key) &&
|
||||
!currentLevel.hasOwnProperty(key)
|
||||
) {
|
||||
currentLevel[key] = currentItem[key];
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that the currentItem[arrayKey] is an array before mapping
|
||||
if (Array.isArray(currentItem[arrayKey])) {
|
||||
currentItem[arrayKey].forEach((subItem: any) => {
|
||||
if (typeof subItem === 'object' && subItem !== null && isValidObject(subItem, itemSchema)) {
|
||||
// For arrays of objects, add only unique objects
|
||||
const transformedItem: any = {};
|
||||
let hasValidData = false;
|
||||
// Ensure that the currentItem[arrayKey] is an array before mapping
|
||||
if (Array.isArray(currentItem[arrayKey])) {
|
||||
currentItem[arrayKey].forEach((subItem: any) => {
|
||||
if (
|
||||
typeof subItem === "object" &&
|
||||
subItem !== null &&
|
||||
isValidObject(subItem, itemSchema)
|
||||
) {
|
||||
// For arrays of objects, add only unique objects
|
||||
const transformedItem: any = {};
|
||||
let hasValidData = false;
|
||||
|
||||
for (const key in itemSchema.properties) {
|
||||
if (subItem.hasOwnProperty(key) && subItem[key] !== undefined) {
|
||||
transformedItem[key] = subItem[key];
|
||||
hasValidData = true;
|
||||
for (const key in itemSchema.properties) {
|
||||
if (subItem.hasOwnProperty(key) && subItem[key] !== undefined) {
|
||||
transformedItem[key] = subItem[key];
|
||||
hasValidData = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (
|
||||
hasValidData &&
|
||||
!isDuplicateObject(currentLevel[arrayKey], transformedItem)
|
||||
) {
|
||||
currentLevel[arrayKey].push(transformedItem);
|
||||
}
|
||||
}
|
||||
|
||||
if (hasValidData && !isDuplicateObject(currentLevel[arrayKey], transformedItem)) {
|
||||
currentLevel[arrayKey].push(transformedItem);
|
||||
}
|
||||
}
|
||||
});
|
||||
} else {
|
||||
console.warn(`Expected an array at ${arrayKey}, but found:`, currentItem[arrayKey]);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
console.warn(
|
||||
`Expected an array at ${arrayKey}, but found:`,
|
||||
currentItem[arrayKey],
|
||||
);
|
||||
}
|
||||
|
||||
// Handle merging of array properties
|
||||
for (const key in parentSchema.properties) {
|
||||
if (parentSchema.properties[key].type === 'array' && Array.isArray(currentItem[key])) {
|
||||
if (
|
||||
parentSchema.properties[key].type === "array" &&
|
||||
Array.isArray(currentItem[key])
|
||||
) {
|
||||
if (!currentLevel[key]) {
|
||||
currentLevel[key] = [];
|
||||
}
|
||||
currentItem[key].forEach((value: any) => {
|
||||
if (!currentLevel[key].includes(value) && !isDuplicateObject(currentLevel[arrayKey], value)) {
|
||||
if (
|
||||
!currentLevel[key].includes(value) &&
|
||||
!isDuplicateObject(currentLevel[arrayKey], value)
|
||||
) {
|
||||
currentLevel[key].push(value);
|
||||
}
|
||||
});
|
||||
@ -129,4 +158,4 @@ export function transformArrayToObject(
|
||||
});
|
||||
|
||||
return transformedResult;
|
||||
}
|
||||
}
|
||||
|
@ -91,7 +91,8 @@ export async function indexPage({
|
||||
url: normalizedUrl,
|
||||
originUrl: normalizeUrl(originUrl),
|
||||
title: document.metadata.title ?? document.metadata.ogTitle ?? "",
|
||||
description: document.metadata.description ?? document.metadata.ogDescription ?? "",
|
||||
description:
|
||||
document.metadata.description ?? document.metadata.ogDescription ?? "",
|
||||
crawlId,
|
||||
teamId,
|
||||
markdown: trimmedMarkdown,
|
||||
@ -126,7 +127,7 @@ export async function indexPage({
|
||||
export async function searchSimilarPages(
|
||||
query: string,
|
||||
originUrl?: string,
|
||||
limit: number = 1000
|
||||
limit: number = 1000,
|
||||
): Promise<any[]> {
|
||||
try {
|
||||
const index = pinecone.index(INDEX_NAME);
|
||||
|
@ -59,7 +59,7 @@ export async function rerankLinks(
|
||||
const linksAndScores = await performRanking(
|
||||
mappedLinksRerank,
|
||||
mappedLinks.map((l) => l.url),
|
||||
searchQuery
|
||||
searchQuery,
|
||||
);
|
||||
|
||||
// First try with high threshold
|
||||
@ -109,8 +109,11 @@ export async function rerankLinks(
|
||||
}
|
||||
});
|
||||
|
||||
const rankedLinks = filteredLinks.slice(0, extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE);
|
||||
|
||||
const rankedLinks = filteredLinks.slice(
|
||||
0,
|
||||
extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE,
|
||||
);
|
||||
|
||||
// Mark URLs that will be used in completion
|
||||
rankedLinks.forEach((link) => {
|
||||
const trace = urlTraces.find((t) => t.url === link.url);
|
||||
@ -120,13 +123,15 @@ export async function rerankLinks(
|
||||
});
|
||||
|
||||
// Mark URLs that were dropped due to ranking limit
|
||||
filteredLinks.slice(extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE).forEach(link => {
|
||||
const trace = urlTraces.find(t => t.url === link.url);
|
||||
if (trace) {
|
||||
trace.warning = "Excluded due to ranking limit";
|
||||
trace.usedInCompletion = false;
|
||||
}
|
||||
});
|
||||
filteredLinks
|
||||
.slice(extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE)
|
||||
.forEach((link) => {
|
||||
const trace = urlTraces.find((t) => t.url === link.url);
|
||||
if (trace) {
|
||||
trace.warning = "Excluded due to ranking limit";
|
||||
trace.usedInCompletion = false;
|
||||
}
|
||||
});
|
||||
|
||||
// console.log("Reranked links: ", rankedLinks.length);
|
||||
|
||||
@ -155,7 +160,7 @@ function filterAndProcessLinks(
|
||||
export type RerankerResult = {
|
||||
mapDocument: MapDocument[];
|
||||
tokensUsed: number;
|
||||
}
|
||||
};
|
||||
|
||||
export async function rerankLinksWithLLM(
|
||||
mappedLinks: MapDocument[],
|
||||
@ -167,7 +172,7 @@ export async function rerankLinksWithLLM(
|
||||
const TIMEOUT_MS = 20000;
|
||||
const MAX_RETRIES = 2;
|
||||
let totalTokensUsed = 0;
|
||||
|
||||
|
||||
// Split mappedLinks into chunks of 200
|
||||
for (let i = 0; i < mappedLinks.length; i += chunkSize) {
|
||||
chunks.push(mappedLinks.slice(i, i + chunkSize));
|
||||
@ -184,23 +189,25 @@ export async function rerankLinksWithLLM(
|
||||
type: "object",
|
||||
properties: {
|
||||
url: { type: "string" },
|
||||
relevanceScore: { type: "number" }
|
||||
relevanceScore: { type: "number" },
|
||||
},
|
||||
required: ["url", "relevanceScore"]
|
||||
}
|
||||
}
|
||||
required: ["url", "relevanceScore"],
|
||||
},
|
||||
},
|
||||
},
|
||||
required: ["relevantLinks"]
|
||||
required: ["relevantLinks"],
|
||||
};
|
||||
|
||||
|
||||
const results = await Promise.all(
|
||||
chunks.map(async (chunk, chunkIndex) => {
|
||||
// console.log(`Processing chunk ${chunkIndex + 1}/${chunks.length} with ${chunk.length} links`);
|
||||
|
||||
const linksContent = chunk.map(link =>
|
||||
`URL: ${link.url}${link.title ? `\nTitle: ${link.title}` : ''}${link.description ? `\nDescription: ${link.description}` : ''}`
|
||||
).join("\n\n");
|
||||
|
||||
const linksContent = chunk
|
||||
.map(
|
||||
(link) =>
|
||||
`URL: ${link.url}${link.title ? `\nTitle: ${link.title}` : ""}${link.description ? `\nDescription: ${link.description}` : ""}`,
|
||||
)
|
||||
.join("\n\n");
|
||||
|
||||
for (let retry = 0; retry <= MAX_RETRIES; retry++) {
|
||||
try {
|
||||
@ -208,22 +215,28 @@ export async function rerankLinksWithLLM(
|
||||
setTimeout(() => resolve(null), TIMEOUT_MS);
|
||||
});
|
||||
|
||||
|
||||
const completionPromise = generateOpenAICompletions(
|
||||
logger.child({ method: "rerankLinksWithLLM", chunk: chunkIndex + 1, retry }),
|
||||
logger.child({
|
||||
method: "rerankLinksWithLLM",
|
||||
chunk: chunkIndex + 1,
|
||||
retry,
|
||||
}),
|
||||
{
|
||||
mode: "llm",
|
||||
systemPrompt: buildRerankerSystemPrompt(),
|
||||
prompt: buildRerankerUserPrompt(searchQuery),
|
||||
schema: schema
|
||||
schema: schema,
|
||||
},
|
||||
linksContent,
|
||||
undefined,
|
||||
true
|
||||
true,
|
||||
);
|
||||
|
||||
const completion = await Promise.race([completionPromise, timeoutPromise]);
|
||||
|
||||
const completion = await Promise.race([
|
||||
completionPromise,
|
||||
timeoutPromise,
|
||||
]);
|
||||
|
||||
if (!completion) {
|
||||
// console.log(`Chunk ${chunkIndex + 1}: Timeout on attempt ${retry + 1}`);
|
||||
continue;
|
||||
@ -237,9 +250,11 @@ export async function rerankLinksWithLLM(
|
||||
totalTokensUsed += completion.numTokens || 0;
|
||||
// console.log(`Chunk ${chunkIndex + 1}: Found ${completion.extract.relevantLinks.length} relevant links`);
|
||||
return completion.extract.relevantLinks;
|
||||
|
||||
} catch (error) {
|
||||
console.warn(`Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`, error);
|
||||
console.warn(
|
||||
`Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`,
|
||||
error,
|
||||
);
|
||||
if (retry === MAX_RETRIES) {
|
||||
// console.log(`Chunk ${chunkIndex + 1}: Max retries reached, returning empty array`);
|
||||
return [];
|
||||
@ -247,18 +262,20 @@ export async function rerankLinksWithLLM(
|
||||
}
|
||||
}
|
||||
return [];
|
||||
})
|
||||
}),
|
||||
);
|
||||
|
||||
// console.log(`Processed ${results.length} chunks`);
|
||||
|
||||
// Flatten results and sort by relevance score
|
||||
const flattenedResults = results.flat().sort((a, b) => b.relevanceScore - a.relevanceScore);
|
||||
const flattenedResults = results
|
||||
.flat()
|
||||
.sort((a, b) => b.relevanceScore - a.relevanceScore);
|
||||
// console.log(`Total relevant links found: ${flattenedResults.length}`);
|
||||
|
||||
// Map back to MapDocument format, keeping only relevant links
|
||||
const relevantLinks = flattenedResults
|
||||
.map(result => mappedLinks.find(link => link.url === result.url))
|
||||
.map((result) => mappedLinks.find((link) => link.url === result.url))
|
||||
.filter((link): link is MapDocument => link !== undefined);
|
||||
|
||||
// console.log(`Returning ${relevantLinks.length} relevant links`);
|
||||
|
@ -184,8 +184,6 @@ export async function processUrl(
|
||||
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
|
||||
// );
|
||||
|
||||
|
||||
|
||||
const rerankerResult = await rerankLinksWithLLM(
|
||||
mappedLinks,
|
||||
rephrasedPrompt,
|
||||
|
@@ -12,7 +12,9 @@ const tokenPerCharacter = 4;
const baseTokenCost = 300;

export function calculateFinalResultCost(data: any): number {
return Math.floor((JSON.stringify(data).length / tokenPerCharacter) + baseTokenCost);
return Math.floor(
JSON.stringify(data).length / tokenPerCharacter + baseTokenCost,
);
}

export function estimateTotalCost(tokenUsage: TokenUsage[]): number {
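(Aside, not part of the diff: a quick worked example of calculateFinalResultCost with made-up numbers. The constant name notwithstanding, tokenPerCharacter divides the character count, so a roughly 2,000-character result bills as Math.floor(2000 / 4 + 300) = 800 tokens.)

const sample = { items: "x".repeat(1988) }; // JSON.stringify(sample).length === 2000
calculateFinalResultCost(sample); // Math.floor(2000 / 4 + 300) === 800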
File diff suppressed because it is too large
@ -55,9 +55,9 @@ async function performRanking(
|
||||
|
||||
// Generate embeddings for each link and calculate similarity in parallel
|
||||
const linksAndScores = await Promise.all(
|
||||
linksWithContext.map((linkWithContext, index) =>
|
||||
linksWithContext.map((linkWithContext, index) =>
|
||||
getEmbedding(linkWithContext)
|
||||
.then(linkEmbedding => {
|
||||
.then((linkEmbedding) => {
|
||||
const score = cosineSimilarity(queryEmbedding, linkEmbedding);
|
||||
return {
|
||||
link: links[index],
|
||||
@ -71,8 +71,8 @@ async function performRanking(
|
||||
linkWithContext,
|
||||
score: 0,
|
||||
originalIndex: index,
|
||||
}))
|
||||
)
|
||||
})),
|
||||
),
|
||||
);
|
||||
|
||||
// Sort links based on similarity scores while preserving original order for equal scores
|
||||
|
@ -252,20 +252,19 @@ export class WebCrawler {
|
||||
};
|
||||
|
||||
const timeoutPromise = new Promise((_, reject) => {
|
||||
setTimeout(() => reject(new Error('Sitemap fetch timeout')), timeout);
|
||||
setTimeout(() => reject(new Error("Sitemap fetch timeout")), timeout);
|
||||
});
|
||||
|
||||
try {
|
||||
let count = await Promise.race([
|
||||
let count = (await Promise.race([
|
||||
Promise.all([
|
||||
this.tryFetchSitemapLinks(
|
||||
this.initialUrl,
|
||||
_urlsHandler,
|
||||
),
|
||||
...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
|
||||
]).then(results => results.reduce((a,x) => a+x, 0)),
|
||||
timeoutPromise
|
||||
]) as number;
|
||||
this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler),
|
||||
...this.robots
|
||||
.getSitemaps()
|
||||
.map((x) => this.tryFetchSitemapLinks(x, _urlsHandler)),
|
||||
]).then((results) => results.reduce((a, x) => a + x, 0)),
|
||||
timeoutPromise,
|
||||
])) as number;
|
||||
|
||||
if (count > 0) {
|
||||
if (
|
||||
@ -281,14 +280,14 @@ export class WebCrawler {
|
||||
|
||||
return count;
|
||||
} catch (error) {
|
||||
if (error.message === 'Sitemap fetch timeout') {
|
||||
this.logger.warn('Sitemap fetch timed out', {
|
||||
if (error.message === "Sitemap fetch timeout") {
|
||||
this.logger.warn("Sitemap fetch timed out", {
|
||||
method: "tryGetSitemap",
|
||||
timeout,
|
||||
});
|
||||
return 0;
|
||||
}
|
||||
this.logger.error('Error fetching sitemap', {
|
||||
this.logger.error("Error fetching sitemap", {
|
||||
method: "tryGetSitemap",
|
||||
error,
|
||||
});
|
||||
@ -328,9 +327,16 @@ export class WebCrawler {
|
||||
!this.matchesExcludes(path) &&
|
||||
!this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
|
||||
) {
|
||||
(async() => {
|
||||
await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
|
||||
await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
|
||||
(async () => {
|
||||
await redisConnection.sadd(
|
||||
"crawl:" + this.jobId + ":robots_blocked",
|
||||
fullUrl,
|
||||
);
|
||||
await redisConnection.expire(
|
||||
"crawl:" + this.jobId + ":robots_blocked",
|
||||
24 * 60 * 60,
|
||||
"NX",
|
||||
);
|
||||
})();
|
||||
}
|
||||
} else {
|
||||
|
@ -1,5 +1,8 @@
|
||||
import { logger } from "../../lib/logger";
|
||||
import { normalizeUrl, normalizeUrlOnlyHostname } from "../../lib/canonical-url";
|
||||
import {
|
||||
normalizeUrl,
|
||||
normalizeUrlOnlyHostname,
|
||||
} from "../../lib/canonical-url";
|
||||
import { supabase_service } from "../../services/supabase";
|
||||
|
||||
/**
|
||||
@ -28,13 +31,19 @@ async function querySitemapIndexFunction(url: string) {
|
||||
return { urls: [], lastUpdated: new Date(0) };
|
||||
}
|
||||
|
||||
const allUrls = [...new Set(data.map((entry) => entry.urls).flat().map(url => normalizeUrl(url)))];
|
||||
const allUrls = [
|
||||
...new Set(
|
||||
data
|
||||
.map((entry) => entry.urls)
|
||||
.flat()
|
||||
.map((url) => normalizeUrl(url)),
|
||||
),
|
||||
];
|
||||
return { urls: allUrls, lastUpdated: data[0].updated_at };
|
||||
|
||||
} catch (error) {
|
||||
logger.error("(sitemap-index) Error querying the index", {
|
||||
logger.error("(sitemap-index) Error querying the index", {
|
||||
error,
|
||||
attempt
|
||||
attempt,
|
||||
});
|
||||
|
||||
if (attempt === 3) {
|
||||
@ -46,4 +55,7 @@ async function querySitemapIndexFunction(url: string) {
|
||||
return { urls: [], lastUpdated: new Date(0) };
|
||||
}
|
||||
|
||||
export const querySitemapIndex = withAuth(querySitemapIndexFunction, { urls: [], lastUpdated: new Date(0) });
|
||||
export const querySitemapIndex = withAuth(querySitemapIndexFunction, {
|
||||
urls: [],
|
||||
lastUpdated: new Date(0),
|
||||
});
|
||||
|
@ -24,55 +24,79 @@ export async function getLinksFromSitemap(
|
||||
try {
|
||||
if (mode === "fire-engine" && useFireEngine) {
|
||||
const fetchResponse = await scrapeURL(
|
||||
"sitemap",
|
||||
"sitemap",
|
||||
sitemapUrl,
|
||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||
{ forceEngine: "fetch" },
|
||||
);
|
||||
|
||||
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
|
||||
if (
|
||||
fetchResponse.success &&
|
||||
fetchResponse.document.metadata.statusCode >= 200 &&
|
||||
fetchResponse.document.metadata.statusCode < 300
|
||||
) {
|
||||
content = fetchResponse.document.rawHtml!;
|
||||
} else {
|
||||
logger.debug(
|
||||
"Failed to scrape sitemap via fetch, falling back to TLSClient...",
|
||||
{ error: fetchResponse.success ? fetchResponse.document : fetchResponse.error },
|
||||
{
|
||||
error: fetchResponse.success
|
||||
? fetchResponse.document
|
||||
: fetchResponse.error,
|
||||
},
|
||||
);
|
||||
|
||||
const tlsResponse = await scrapeURL(
|
||||
"sitemap",
|
||||
"sitemap",
|
||||
sitemapUrl,
|
||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
|
||||
);
|
||||
|
||||
if (tlsResponse.success && (tlsResponse.document.metadata.statusCode >= 200 && tlsResponse.document.metadata.statusCode < 300)) {
|
||||
if (
|
||||
tlsResponse.success &&
|
||||
tlsResponse.document.metadata.statusCode >= 200 &&
|
||||
tlsResponse.document.metadata.statusCode < 300
|
||||
) {
|
||||
content = tlsResponse.document.rawHtml!;
|
||||
} else {
|
||||
logger.error(`Request failed for ${sitemapUrl}, ran out of engines!`, {
|
||||
method: "getLinksFromSitemap",
|
||||
mode,
|
||||
sitemapUrl,
|
||||
error: tlsResponse.success ? tlsResponse.document : tlsResponse.error,
|
||||
});
|
||||
logger.error(
|
||||
`Request failed for ${sitemapUrl}, ran out of engines!`,
|
||||
{
|
||||
method: "getLinksFromSitemap",
|
||||
mode,
|
||||
sitemapUrl,
|
||||
error: tlsResponse.success
|
||||
? tlsResponse.document
|
||||
: tlsResponse.error,
|
||||
},
|
||||
);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const fetchResponse = await scrapeURL(
|
||||
"sitemap",
|
||||
"sitemap",
|
||||
sitemapUrl,
|
||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||
{ forceEngine: "fetch" },
|
||||
);
|
||||
|
||||
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
|
||||
if (
|
||||
fetchResponse.success &&
|
||||
fetchResponse.document.metadata.statusCode >= 200 &&
|
||||
fetchResponse.document.metadata.statusCode < 300
|
||||
) {
|
||||
content = fetchResponse.document.rawHtml!;
|
||||
} else {
|
||||
logger.error(`Request failed for ${sitemapUrl}, ran out of engines!`, {
|
||||
method: "getLinksFromSitemap",
|
||||
mode,
|
||||
sitemapUrl,
|
||||
});
|
||||
logger.error(
|
||||
`Request failed for ${sitemapUrl}, ran out of engines!`,
|
||||
{
|
||||
method: "getLinksFromSitemap",
|
||||
mode,
|
||||
sitemapUrl,
|
||||
},
|
||||
);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@ -165,13 +189,20 @@ export const fetchSitemapData = async (
|
||||
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
|
||||
try {
|
||||
const fetchResponse = await scrapeURL(
|
||||
"sitemap",
|
||||
"sitemap",
|
||||
sitemapUrl,
|
||||
scrapeOptions.parse({ formats: ["rawHtml"], timeout: timeout || axiosTimeout }),
|
||||
scrapeOptions.parse({
|
||||
formats: ["rawHtml"],
|
||||
timeout: timeout || axiosTimeout,
|
||||
}),
|
||||
{ forceEngine: "fetch" },
|
||||
);
|
||||
|
||||
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
|
||||
if (
|
||||
fetchResponse.success &&
|
||||
fetchResponse.document.metadata.statusCode >= 200 &&
|
||||
fetchResponse.document.metadata.statusCode < 300
|
||||
) {
|
||||
const xml = fetchResponse.document.rawHtml!;
|
||||
const parsedXml = await parseStringPromise(xml);
|
||||
|
||||
|
@@ -17,7 +17,6 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
throw new EngineError("Cache hit but HTML is too short to be useful");
}

// Set fromCache flag to indicate this document was retrieved from cache
meta.internalOptions.fromCache = true;
@@ -3,7 +3,12 @@ import * as Sentry from "@sentry/node";
import { z } from "zod";

import { robustFetch } from "../../lib/fetch";
import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
import {
ActionError,
EngineError,
SiteError,
UnsupportedFileError,
} from "../../error";
import { MockState } from "../../lib/mock";

const successSchema = z.object({
@@ -4,7 +4,11 @@ import * as Sentry from "@sentry/node";
import { robustFetch } from "../../lib/fetch";
import { MockState } from "../../lib/mock";

export async function fireEngineDelete(logger: Logger, jobId: string, mock: MockState | null) {
export async function fireEngineDelete(
logger: Logger,
jobId: string,
mock: MockState | null,
) {
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

await Sentry.startSpan(
@@ -143,7 +143,10 @@ async function buildMetaObject(
logger,
logs,
featureFlags: buildFeatureFlags(url, options, internalOptions),
mock: options.useMock !== undefined ? await loadMock(options.useMock, _logger) : null,
mock:
options.useMock !== undefined
? await loadMock(options.useMock, _logger)
: null,
};
}
@ -34,7 +34,7 @@ export async function robustFetch<
|
||||
requestId = crypto.randomUUID(),
|
||||
tryCount = 1,
|
||||
tryCooldown,
|
||||
mock
|
||||
mock,
|
||||
}: RobustFetchParams<Schema>): Promise<Output> {
|
||||
const params = {
|
||||
url,
|
||||
@ -51,8 +51,8 @@ export async function robustFetch<
|
||||
|
||||
let response: {
|
||||
status: number;
|
||||
headers: Headers,
|
||||
body: string,
|
||||
headers: Headers;
|
||||
body: string;
|
||||
};
|
||||
|
||||
if (mock === null) {
|
||||
@ -123,25 +123,33 @@ export async function robustFetch<
|
||||
return null as Output;
|
||||
}
|
||||
|
||||
const makeRequestTypeId = (request: typeof mock["requests"][number]["options"]) => {
|
||||
const makeRequestTypeId = (
|
||||
request: (typeof mock)["requests"][number]["options"],
|
||||
) => {
|
||||
let out = request.url + ";" + request.method;
|
||||
if (process.env.FIRE_ENGINE_BETA_URL && url.startsWith(process.env.FIRE_ENGINE_BETA_URL) && request.method === "POST") {
|
||||
if (
|
||||
process.env.FIRE_ENGINE_BETA_URL &&
|
||||
url.startsWith(process.env.FIRE_ENGINE_BETA_URL) &&
|
||||
request.method === "POST"
|
||||
) {
|
||||
out += "f-e;" + request.body?.engine + ";" + request.body?.url;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
const thisId = makeRequestTypeId(params);
|
||||
const matchingMocks = mock.requests.filter(x => makeRequestTypeId(x.options) === thisId).sort((a,b) => a.time - b.time);
|
||||
const matchingMocks = mock.requests
|
||||
.filter((x) => makeRequestTypeId(x.options) === thisId)
|
||||
.sort((a, b) => a.time - b.time);
|
||||
const nextI = mock.tracker[thisId] ?? 0;
|
||||
mock.tracker[thisId] = nextI + 1;
|
||||
|
||||
|
||||
if (!matchingMocks[nextI]) {
|
||||
throw new Error("Failed to mock request -- no mock targets found.");
|
||||
}
|
||||
|
||||
response = {
|
||||
...(matchingMocks[nextI].result),
|
||||
...matchingMocks[nextI].result,
|
||||
headers: new Headers(matchingMocks[nextI].result.headers),
|
||||
};
|
||||
}
|
||||
@ -180,12 +188,15 @@ export async function robustFetch<
|
||||
}
|
||||
|
||||
if (mock === null) {
|
||||
await saveMock({
|
||||
...params,
|
||||
logger: undefined,
|
||||
schema: undefined,
|
||||
headers: undefined,
|
||||
}, response);
|
||||
await saveMock(
|
||||
{
|
||||
...params,
|
||||
logger: undefined,
|
||||
schema: undefined,
|
||||
headers: undefined,
|
||||
},
|
||||
response,
|
||||
);
|
||||
}
|
||||
|
||||
let data: Output;
|
||||
|
@@ -6,55 +6,70 @@ const saveMocksDirPath = path.join(__dirname, "../mocks/").replace("dist/", "");
const loadMocksDirPath = path.join(__dirname, "../../../__tests__/snips/mocks");

export async function saveMock(options: unknown, result: unknown) {
  if (process.env.FIRECRAWL_SAVE_MOCKS !== "true") return;
  if (process.env.FIRECRAWL_SAVE_MOCKS !== "true") return;

  await fs.mkdir(saveMocksDirPath, { recursive: true });
  await fs.mkdir(saveMocksDirPath, { recursive: true });

  const fileName = Date.now() + "-" + crypto.randomUUID() + ".json";
  const filePath = path.join(saveMocksDirPath, fileName);
  console.log(filePath);
  const fileName = Date.now() + "-" + crypto.randomUUID() + ".json";
  const filePath = path.join(saveMocksDirPath, fileName);
  console.log(filePath);

  await fs.writeFile(filePath, JSON.stringify({
  await fs.writeFile(
    filePath,
    JSON.stringify(
      {
        time: Date.now(),
        options,
        result,
      }, undefined, 4));
      },
      undefined,
      4,
    ),
  );
}

export type MockState = {
  requests: {
    time: number,
    options: {
      url: string,
      method: string,
      body?: any,
      ignoreResponse: boolean,
      ignoreFailure: boolean,
      tryCount: number,
      tryCooldown?: number,
    },
    result: any,
  }[],
  tracker: Record<string, number>,
}
  requests: {
    time: number;
    options: {
      url: string;
      method: string;
      body?: any;
      ignoreResponse: boolean;
      ignoreFailure: boolean;
      tryCount: number;
      tryCooldown?: number;
    };
    result: any;
  }[];
  tracker: Record<string, number>;
};

export async function loadMock(name: string, logger: Logger = _logger): Promise<MockState | null> {
  try {
    const mockPath = path.join(loadMocksDirPath, name + ".json");
export async function loadMock(
  name: string,
  logger: Logger = _logger,
): Promise<MockState | null> {
  try {
    const mockPath = path.join(loadMocksDirPath, name + ".json");

    const relative = path.relative(loadMocksDirPath, mockPath);
    if (!relative || relative.startsWith("..") || path.isAbsolute(relative)) {
      // directory moving
      return null;
    }

    const load = JSON.parse(await fs.readFile(mockPath, "utf8"));
    return {
      requests: load,
      tracker: {},
    };
  } catch (error) {
    logger.warn("Failed to load mock file!", { name, module: "scrapeURL:mock", method: "loadMock", error });
    return null;
    const relative = path.relative(loadMocksDirPath, mockPath);
    if (!relative || relative.startsWith("..") || path.isAbsolute(relative)) {
      // directory moving
      return null;
    }

    const load = JSON.parse(await fs.readFile(mockPath, "utf8"));
    return {
      requests: load,
      tracker: {},
    };
  } catch (error) {
    logger.warn("Failed to load mock file!", {
      name,
      module: "scrapeURL:mock",
      method: "loadMock",
      error,
    });
    return null;
  }
}
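
For orientation, a minimal sketch of how these recorded mocks get replayed by robustFetch, assuming the MockState shape above; replayMock and the "./mock" import path are illustrative helpers, not code from this commit.

import type { MockState } from "./mock"; // assumed path, for illustration only

// Replay recordings in capture order, keyed the same way makeRequestTypeId
// keys them (url + ";" + method); the tracker hands out the next recording
// each time the same request type is made again.
function replayMock(mock: MockState, url: string, method: string) {
  const typeId = url + ";" + method;
  const matching = mock.requests
    .filter((x) => x.options.url + ";" + x.options.method === typeId)
    .sort((a, b) => a.time - b.time);
  const nextI = mock.tracker[typeId] ?? 0;
  mock.tracker[typeId] = nextI + 1;
  if (!matching[nextI]) {
    throw new Error("No mock recording left for " + typeId);
  }
  return matching[nextI].result;
}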
@@ -119,16 +119,16 @@ export const htmlTransform = (

  // always return biggest image
  soup("img[srcset]").each((_, el) => {
    const sizes = el.attribs.srcset.split(",").map(x => {
    const sizes = el.attribs.srcset.split(",").map((x) => {
      const tok = x.trim().split(" ");
      return {
        url: tok[0],
        size: parseInt((tok[1] ?? "1x").slice(0, -1), 10),
        isX: (tok[1] ?? "").endsWith("x")
        isX: (tok[1] ?? "").endsWith("x"),
      };
    });

    if (sizes.every(x => x.isX) && el.attribs.src) {
    if (sizes.every((x) => x.isX) && el.attribs.src) {
      sizes.push({
        url: el.attribs.src,
        size: 1,
@@ -136,7 +136,7 @@ export const htmlTransform = (
      });
    }

    sizes.sort((a,b) => b.size - a.size);
    sizes.sort((a, b) => b.size - a.size);

    el.attribs.src = sizes[0]?.url;
  });
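
The srcset handling above can also be read in isolation; here is a minimal standalone sketch of the same selection logic. pickBiggestSrc is a hypothetical helper for illustration only; the real code mutates el.attribs inside the cheerio pass.

function pickBiggestSrc(srcset: string, src?: string): string | undefined {
  // Parse each srcset candidate into { url, numeric size, isX density flag }.
  const sizes = srcset.split(",").map((x) => {
    const tok = x.trim().split(" ");
    return {
      url: tok[0],
      size: parseInt((tok[1] ?? "1x").slice(0, -1), 10),
      isX: (tok[1] ?? "").endsWith("x"),
    };
  });
  // When every candidate is a density descriptor, the plain src counts as 1x.
  if (sizes.every((x) => x.isX) && src) {
    sizes.push({ url: src, size: 1, isX: true });
  }
  // Largest first; the winner becomes the single src.
  sizes.sort((a, b) => b.size - a.size);
  return sizes[0]?.url;
}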
@@ -41,7 +41,11 @@ export function deriveHTMLFromRawHTML(
    );
  }

  document.html = htmlTransform(document.rawHtml, document.metadata.url ?? document.metadata.sourceURL ?? meta.url, meta.options);
  document.html = htmlTransform(
    document.rawHtml,
    document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
    meta.options,
  );
  return document;
}
@@ -1,7 +1,11 @@
import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { Document, ExtractOptions, TokenUsage } from "../../../controllers/v1/types";
import {
  Document,
  ExtractOptions,
  TokenUsage,
} from "../../../controllers/v1/types";
import { Logger } from "winston";
import { EngineResultsTracker, Meta } from "..";
import { logger } from "../../../lib/logger";
@@ -72,14 +76,20 @@ export async function generateOpenAICompletions(
  markdown?: string,
  previousWarning?: string,
  isExtractEndpoint?: boolean,
  model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini",
): Promise<{ extract: any; numTokens: number; warning: string | undefined; totalUsage: TokenUsage, model: string }> {
  model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ??
    "gpt-4o-mini",
): Promise<{
  extract: any;
  numTokens: number;
  warning: string | undefined;
  totalUsage: TokenUsage;
  model: string;
}> {
  let extract: any;
  let warning: string | undefined;

  const openai = new OpenAI();

  if (markdown === undefined) {
    throw new Error("document.markdown is undefined -- this is unexpected");
  }
@@ -208,8 +218,8 @@ export async function generateOpenAICompletions(
    }
  }

  const promptTokens = (jsonCompletion.usage?.prompt_tokens ?? 0);
  const completionTokens = (jsonCompletion.usage?.completion_tokens ?? 0);
  const promptTokens = jsonCompletion.usage?.prompt_tokens ?? 0;
  const completionTokens = jsonCompletion.usage?.completion_tokens ?? 0;

  // If the users actually wants the items object, they can specify it as 'required' in the schema
  // otherwise, we just return the items array
@@ -222,7 +232,17 @@ export async function generateOpenAICompletions(
  }
  // num tokens (just user prompt tokenized) | deprecated
  // totalTokens = promptTokens + completionTokens
  return { extract, warning, numTokens, totalUsage: { promptTokens, completionTokens, totalTokens: promptTokens + completionTokens }, model };
  return {
    extract,
    warning,
    numTokens,
    totalUsage: {
      promptTokens,
      completionTokens,
      totalTokens: promptTokens + completionTokens,
    },
    model,
  };
}

export async function performLLMExtract(
@@ -238,7 +258,7 @@ export async function performLLMExtract(
    document.markdown,
    document.warning,
  );

  if (meta.options.formats.includes("json")) {
    document.json = extract;
  } else {
@@ -32,7 +32,7 @@ export async function autoCharge(
  const resource = `auto-recharge:${chunk.team_id}`;
  const cooldownKey = `auto-recharge-cooldown:${chunk.team_id}`;

  if(chunk.team_id === "285bb597-6eaf-4b96-801c-51461fc3c543"){
  if (chunk.team_id === "285bb597-6eaf-4b96-801c-51461fc3c543") {
    return {
      success: false,
      message: "Auto-recharge failed",
@@ -107,15 +107,15 @@ async function processBatch() {
      // Keep most recent entry and mark others for deletion
      const [mostRecent, ...duplicates] = existingForOrigin;
      if (duplicates.length > 0) {
        duplicatesToDelete.push(...duplicates.map(d => d.id));
        duplicatesToDelete.push(...duplicates.map((d) => d.id));
      }

      // Merge and deduplicate URLs
      const mergedUrls = [
        ...new Set([
          ...mostRecent.urls,
          ...op.standardizedUrls.map(url => normalizeUrl(url))
        ])
          ...op.standardizedUrls.map((url) => normalizeUrl(url)),
        ]),
      ];

      updates.push({
@@ -127,7 +127,9 @@ async function processBatch() {
      });
    } else {
      // Prepare insert with deduplicated URLs
      const deduplicatedUrls = [...new Set(op.standardizedUrls.map(url => normalizeUrl(url)))];
      const deduplicatedUrls = [
        ...new Set(op.standardizedUrls.map((url) => normalizeUrl(url))),
      ];
      inserts.push({
        origin_url: op.originUrl,
        urls: deduplicatedUrls,
@@ -140,8 +142,10 @@ async function processBatch() {

  // Delete duplicate entries
  if (duplicatesToDelete.length > 0) {
    logger.info(`🗑️ Deleting ${duplicatesToDelete.length} duplicate crawl maps in batches of 100`);
    logger.info(
      `🗑️ Deleting ${duplicatesToDelete.length} duplicate crawl maps in batches of 100`,
    );

    // Delete in batches of 100
    for (let i = 0; i < duplicatesToDelete.length; i += 100) {
      const batch = duplicatesToDelete.slice(i, i + 100);
@@ -151,11 +155,14 @@ async function processBatch() {
        .in("id", batch);

      if (deleteError) {
        logger.error(`Failed to delete batch ${i/100 + 1} of duplicate crawl maps`, {
          error: deleteError,
          batchSize: batch.length,
          startIndex: i
        });
        logger.error(
          `Failed to delete batch ${i / 100 + 1} of duplicate crawl maps`,
          {
            error: deleteError,
            batchSize: batch.length,
            startIndex: i,
          },
        );
      }
    }
  }
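
A minimal sketch of the batching pattern used above, with deleteBatch standing in for the supabase_service .delete().in("id", batch) call; the helper name and signature are hypothetical, shown only to make the chunking explicit.

async function deleteInBatches(
  ids: string[],
  deleteBatch: (batch: string[]) => Promise<void>,
  batchSize = 100,
) {
  // Walk the id list in fixed-size slices so no single delete grows unbounded.
  for (let i = 0; i < ids.length; i += batchSize) {
    await deleteBatch(ids.slice(i, i + batchSize));
  }
}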
@@ -165,7 +172,7 @@ async function processBatch() {
    logger.info(`🔄 Updating ${updates.length} existing crawl maps`, {
      origins: updates.map((u) => u.origin_url),
    });

    // Process updates one at a time to avoid conflicts
    for (const update of updates) {
      const { error: updateError } = await supabase_service
@@ -175,7 +182,7 @@ async function processBatch() {
      if (updateError) {
        logger.error("Failed to update crawl map", {
          error: updateError,
          origin: update.origin_url
          origin: update.origin_url,
        });
      }
    }
@@ -3,18 +3,27 @@ import "../sentry";
import * as Sentry from "@sentry/node";
import { Job, Queue, Worker } from "bullmq";
import { logger as _logger, logger } from "../../lib/logger";
import { redisConnection, indexQueueName, getIndexQueue } from "../queue-service";
import {
  redisConnection,
  indexQueueName,
  getIndexQueue,
} from "../queue-service";
import { saveCrawlMap } from "./crawl-maps-index";
import systemMonitor from "../system-monitor";
import { v4 as uuidv4 } from "uuid";

const workerLockDuration = Number(process.env.WORKER_LOCK_DURATION) || 60000;
const workerStalledCheckInterval = Number(process.env.WORKER_STALLED_CHECK_INTERVAL) || 30000;
const jobLockExtendInterval = Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000;
const jobLockExtensionTime = Number(process.env.JOB_LOCK_EXTENSION_TIME) || 60000;
const workerStalledCheckInterval =
  Number(process.env.WORKER_STALLED_CHECK_INTERVAL) || 30000;
const jobLockExtendInterval =
  Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000;
const jobLockExtensionTime =
  Number(process.env.JOB_LOCK_EXTENSION_TIME) || 60000;

const cantAcceptConnectionInterval = Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000;
const connectionMonitorInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10;
const cantAcceptConnectionInterval =
  Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000;
const connectionMonitorInterval =
  Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10;
const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;

const runningJobs: Set<string> = new Set();
@@ -88,7 +97,7 @@ const workerFun = async (queue: Queue) => {

    const token = uuidv4();
    const canAcceptConnection = await monitor.acceptConnection();

    if (!canAcceptConnection) {
      logger.info("Cant accept connection");
      cantAcceptConnectionCount++;
@@ -100,7 +109,9 @@ const workerFun = async (queue: Queue) => {
        });
      }

      await new Promise(resolve => setTimeout(resolve, cantAcceptConnectionInterval));
      await new Promise((resolve) =>
        setTimeout(resolve, cantAcceptConnectionInterval),
      );
      continue;
    } else {
      cantAcceptConnectionCount = 0;
@@ -141,15 +152,17 @@ const workerFun = async (queue: Queue) => {
        runningJobs.delete(job.id);
      }

      await new Promise(resolve => setTimeout(resolve, gotJobInterval));
      await new Promise((resolve) => setTimeout(resolve, gotJobInterval));
    } else {
      await new Promise(resolve => setTimeout(resolve, connectionMonitorInterval));
      await new Promise((resolve) =>
        setTimeout(resolve, connectionMonitorInterval),
      );
    }
  }

  logger.info("Worker loop ended. Waiting for running jobs to finish...");
  while (runningJobs.size > 0) {
    await new Promise(resolve => setTimeout(resolve, 500));
    await new Promise((resolve) => setTimeout(resolve, 500));
  }
  logger.info("All jobs finished. Worker exiting!");
  process.exit(0);
@@ -158,4 +171,4 @@ const workerFun = async (queue: Queue) => {
// Start the worker
(async () => {
  await workerFun(getIndexQueue());
})();
})();
@@ -93,7 +93,9 @@ const runningJobs: Set<string> = new Set();
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
  if (await finishCrawl(job.data.crawl_id)) {
    (async () => {
      const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
      const originUrl = sc.originUrl
        ? normalizeUrlOnlyHostname(sc.originUrl)
        : undefined;
      // Get all visited unique URLs from Redis
      const visitedUrls = await redisConnection.smembers(
        "crawl:" + job.data.crawl_id + ":visited_unique",
@@ -113,7 +115,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
          },
          {
            priority: 10,
          }
          },
        );
      }
    })();
@@ -315,11 +317,14 @@ const processExtractJobInternal = async (
      return result;
    } else {
      // throw new Error(result.error || "Unknown error during extraction");

      await job.moveToCompleted(result, token, false);
      await updateExtract(job.data.extractId, {
        status: "failed",
        error: result.error ?? "Unknown error, please contact help@firecrawl.com. Extract id: " + job.data.extractId,
        error:
          result.error ??
          "Unknown error, please contact help@firecrawl.com. Extract id: " +
            job.data.extractId,
      });

      return result;
@@ -348,7 +353,14 @@ const processExtractJobInternal = async (
        "Unknown error, please contact help@firecrawl.com. Extract id: " +
          job.data.extractId,
    });
    return { success: false, error: error.error ?? error ?? "Unknown error, please contact help@firecrawl.com. Extract id: " + job.data.extractId };
    return {
      success: false,
      error:
        error.error ??
        error ??
        "Unknown error, please contact help@firecrawl.com. Extract id: " +
          job.data.extractId,
    };
    // throw error;
  } finally {
    clearInterval(extendLockInterval);
@@ -949,13 +961,15 @@ async function processJob(job: Job & { id: string }, token: string) {
      }

      if (job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID!) {
        billTeam(job.data.team_id, undefined, creditsToBeBilled, logger).catch((error) => {
          logger.error(
            `Failed to bill team ${job.data.team_id} for ${creditsToBeBilled} credits`,
            { error },
          );
          // Optionally, you could notify an admin or add to a retry queue here
        });
        billTeam(job.data.team_id, undefined, creditsToBeBilled, logger).catch(
          (error) => {
            logger.error(
              `Failed to bill team ${job.data.team_id} for ${creditsToBeBilled} credits`,
              { error },
            );
            // Optionally, you could notify an admin or add to a retry queue here
          },
        );
      }
    }
@@ -974,11 +988,12 @@ async function processJob(job: Job & { id: string }, token: string) {

      await finishCrawlIfNeeded(job, sc);
    }

    const isEarlyTimeout =
      error instanceof Error && error.message === "timeout";
    const isCancelled =
      error instanceof Error && error.message === "Parent crawl/batch scrape was cancelled";
      error instanceof Error &&
      error.message === "Parent crawl/batch scrape was cancelled";

    if (isEarlyTimeout) {
      logger.error(`🐂 Job timed out ${job.id}`);