Nick: formatting done

This commit is contained in:
Nicolas 2025-01-22 18:47:44 -03:00
parent 994e1eb502
commit 498558d358
53 changed files with 10672 additions and 10329 deletions

View File

@ -1,8 +1,6 @@
import request from "supertest"; import request from "supertest";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import { import { ScrapeRequestInput } from "../../controllers/v1/types";
ScrapeRequestInput,
} from "../../controllers/v1/types";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
configDotenv(); configDotenv();
@ -19,8 +17,7 @@ describe("E2E Tests for v1 API Routes", () => {
describe("GET /is-production", () => { describe("GET /is-production", () => {
it.concurrent("should return the production status", async () => { it.concurrent("should return the production status", async () => {
const response: any = const response: any = await request(TEST_URL).get("/is-production");
await request(TEST_URL).get("/is-production");
console.log( console.log(
"process.env.USE_DB_AUTHENTICATION", "process.env.USE_DB_AUTHENTICATION",
@ -274,12 +271,11 @@ describe("E2E Tests for v1 API Routes", () => {
url: "https://www.scrapethissite.com/", url: "https://www.scrapethissite.com/",
onlyMainContent: false, // default is true onlyMainContent: false, // default is true
}; };
const responseWithoutRemoveTags: any = const responseWithoutRemoveTags: any = await request(TEST_URL)
await request(TEST_URL) .post("/v1/scrape")
.post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json")
.set("Content-Type", "application/json") .send(scrapeRequest);
.send(scrapeRequest);
expect(responseWithoutRemoveTags.statusCode).toBe(200); expect(responseWithoutRemoveTags.statusCode).toBe(200);
expect(responseWithoutRemoveTags.body).toHaveProperty("data"); expect(responseWithoutRemoveTags.body).toHaveProperty("data");

View File

@ -1,8 +1,6 @@
import request from "supertest"; import request from "supertest";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import { import { ScrapeRequest } from "../../controllers/v1/types";
ScrapeRequest,
} from "../../controllers/v1/types";
configDotenv(); configDotenv();
const FIRECRAWL_API_URL = "http://127.0.0.1:3002"; const FIRECRAWL_API_URL = "http://127.0.0.1:3002";
@ -12,9 +10,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent( it.concurrent(
"should return a successful response for a scrape with 403 page", "should return a successful response for a scrape with 403 page",
async () => { async () => {
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -39,9 +35,7 @@ describe("E2E Tests for v1 API Routes", () => {
url: E2E_TEST_SERVER_URL, url: E2E_TEST_SERVER_URL,
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -86,9 +80,7 @@ describe("E2E Tests for v1 API Routes", () => {
formats: ["html"], formats: ["html"],
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -121,9 +113,7 @@ describe("E2E Tests for v1 API Routes", () => {
formats: ["rawHtml"], formats: ["rawHtml"],
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -159,9 +149,7 @@ describe("E2E Tests for v1 API Routes", () => {
headers: { "e2e-header-test": "firecrawl" }, headers: { "e2e-header-test": "firecrawl" },
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -188,9 +176,7 @@ describe("E2E Tests for v1 API Routes", () => {
includeTags: ["#content-1"], includeTags: ["#content-1"],
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -220,9 +206,7 @@ describe("E2E Tests for v1 API Routes", () => {
excludeTags: ["#content-1"], excludeTags: ["#content-1"],
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -253,9 +237,7 @@ describe("E2E Tests for v1 API Routes", () => {
onlyMainContent: false, onlyMainContent: false,
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -285,9 +267,7 @@ describe("E2E Tests for v1 API Routes", () => {
timeout: 500, timeout: 500,
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -312,9 +292,7 @@ describe("E2E Tests for v1 API Routes", () => {
mobile: true, mobile: true,
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -335,9 +313,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent( it.concurrent(
"should handle 'parsePDF' parameter correctly", "should handle 'parsePDF' parameter correctly",
async () => { async () => {
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -357,9 +333,7 @@ describe("E2E Tests for v1 API Routes", () => {
"h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm", "h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm",
); );
const responseNoParsePDF: any = await request( const responseNoParsePDF: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -410,9 +384,7 @@ describe("E2E Tests for v1 API Routes", () => {
timeout: 120000, timeout: 120000,
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -432,12 +404,13 @@ describe("E2E Tests for v1 API Routes", () => {
timeout: 120000, timeout: 120000,
} as ScrapeRequest; } as ScrapeRequest;
const responseWithSkipTlsVerification: any = const responseWithSkipTlsVerification: any = await request(
await request(FIRECRAWL_API_URL) FIRECRAWL_API_URL,
.post("/v1/scrape") )
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .post("/v1/scrape")
.set("Content-Type", "application/json") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.send(scrapeRequestWithSkipTlsVerification); .set("Content-Type", "application/json")
.send(scrapeRequestWithSkipTlsVerification);
console.log("Error1b"); console.log("Error1b");
// console.log(responseWithSkipTlsVerification.body) // console.log(responseWithSkipTlsVerification.body)
@ -461,9 +434,7 @@ describe("E2E Tests for v1 API Routes", () => {
removeBase64Images: true, removeBase64Images: true,
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -493,9 +464,7 @@ describe("E2E Tests for v1 API Routes", () => {
], ],
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -526,9 +495,7 @@ describe("E2E Tests for v1 API Routes", () => {
], ],
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -569,9 +536,7 @@ describe("E2E Tests for v1 API Routes", () => {
], ],
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -619,9 +584,7 @@ describe("E2E Tests for v1 API Routes", () => {
], ],
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -657,9 +620,7 @@ describe("E2E Tests for v1 API Routes", () => {
], ],
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -692,9 +653,7 @@ describe("E2E Tests for v1 API Routes", () => {
], ],
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
@ -731,9 +690,7 @@ describe("E2E Tests for v1 API Routes", () => {
], ],
} as ScrapeRequest; } as ScrapeRequest;
const response: any = await request( const response: any = await request(FIRECRAWL_API_URL)
FIRECRAWL_API_URL,
)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")

View File

@ -23,8 +23,7 @@ describe("E2E Tests for v0 API Routes", () => {
describe("POST /v0/scrape", () => { describe("POST /v0/scrape", () => {
it.concurrent("should require authorization", async () => { it.concurrent("should require authorization", async () => {
const response: any = const response: any = await request(TEST_URL).post("/v0/scrape");
await request(TEST_URL).post("/v0/scrape");
expect(response.statusCode).toBe(401); expect(response.statusCode).toBe(401);
}); });
@ -159,12 +158,11 @@ describe("E2E Tests for v0 API Routes", () => {
it.concurrent( it.concurrent(
"should return a successful response with a valid API key with removeTags option", "should return a successful response with a valid API key with removeTags option",
async () => { async () => {
const responseWithoutRemoveTags: any = const responseWithoutRemoveTags: any = await request(TEST_URL)
await request(TEST_URL) .post("/v0/scrape")
.post("/v0/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json")
.set("Content-Type", "application/json") .send({ url: "https://www.scrapethissite.com/" });
.send({ url: "https://www.scrapethissite.com/" });
expect(responseWithoutRemoveTags.statusCode).toBe(200); expect(responseWithoutRemoveTags.statusCode).toBe(200);
expect(responseWithoutRemoveTags.body).toHaveProperty("data"); expect(responseWithoutRemoveTags.body).toHaveProperty("data");
expect(responseWithoutRemoveTags.body.data).toHaveProperty("content"); expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
@ -332,8 +330,7 @@ describe("E2E Tests for v0 API Routes", () => {
describe("POST /v0/crawl", () => { describe("POST /v0/crawl", () => {
it.concurrent("should require authorization", async () => { it.concurrent("should require authorization", async () => {
const response: any = const response: any = await request(TEST_URL).post("/v0/crawl");
await request(TEST_URL).post("/v0/crawl");
expect(response.statusCode).toBe(401); expect(response.statusCode).toBe(401);
}); });
@ -461,9 +458,7 @@ describe("E2E Tests for v0 API Routes", () => {
} }
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
const completedResponse: any = await request( const completedResponse: any = await request(TEST_URL)
TEST_URL,
)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
@ -509,9 +504,7 @@ describe("E2E Tests for v0 API Routes", () => {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
} }
} }
const completedResponse: any = await request( const completedResponse: any = await request(TEST_URL)
TEST_URL,
)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

View File

@ -6,31 +6,33 @@ configDotenv();
const TEST_URL = "http://127.0.0.1:3002"; const TEST_URL = "http://127.0.0.1:3002";
async function scrape(body: ScrapeRequestInput) { async function scrape(body: ScrapeRequestInput) {
return await request(TEST_URL) return await request(TEST_URL)
.post("/v1/scrape") .post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send(body); .send(body);
} }
function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) { function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
expect(response.statusCode).toBe(200); expect(response.statusCode).toBe(200);
expect(response.body.success).toBe(true); expect(response.body.success).toBe(true);
expect(typeof response.body.data).toBe("object"); expect(typeof response.body.data).toBe("object");
} }
describe("Scrape tests", () => { describe("Scrape tests", () => {
it("mocking works properly", async () => { it("mocking works properly", async () => {
// depends on falsified mock mocking-works-properly // depends on falsified mock mocking-works-properly
// this test will fail if mock is bypassed with real data -- firecrawl.dev will never have // this test will fail if mock is bypassed with real data -- firecrawl.dev will never have
// that as its actual markdown output // that as its actual markdown output
const response = await scrape({ const response = await scrape({
url: "http://firecrawl.dev", url: "http://firecrawl.dev",
useMock: "mocking-works-properly", useMock: "mocking-works-properly",
});
expectScrapeToSucceed(response);
expect(response.body.data.markdown).toBe("this is fake data coming from the mocking system!");
}); });
});
expectScrapeToSucceed(response);
expect(response.body.data.markdown).toBe(
"this is fake data coming from the mocking system!",
);
});
});

View File

@ -4,9 +4,11 @@ const fs = require("fs");
const mocksDirPath = path.join(__dirname, "../../../scraper/scrapeURL/mocks"); const mocksDirPath = path.join(__dirname, "../../../scraper/scrapeURL/mocks");
const files = fs.readdirSync(mocksDirPath); const files = fs.readdirSync(mocksDirPath);
const contents = files.map(x => JSON.parse(fs.readFileSync(path.join(mocksDirPath, x), "utf8"))); const contents = files.map((x) =>
JSON.parse(fs.readFileSync(path.join(mocksDirPath, x), "utf8")),
);
fs.writeFileSync( fs.writeFileSync(
path.join(__dirname, "../mocks/" + process.argv[2] + ".json"), path.join(__dirname, "../mocks/" + process.argv[2] + ".json"),
JSON.stringify(contents, undefined, 4), JSON.stringify(contents, undefined, 4),
); );

View File

@ -105,7 +105,6 @@ export async function getACUC(
{ get: true }, { get: true },
)); ));
if (!error) { if (!error) {
break; break;
} }
@ -146,7 +145,7 @@ export async function clearACUC(api_key: string): Promise<void> {
modes.map(async (mode) => { modes.map(async (mode) => {
const cacheKey = `acuc_${api_key}_${mode}`; const cacheKey = `acuc_${api_key}_${mode}`;
await deleteKey(cacheKey); await deleteKey(cacheKey);
}) }),
); );
// Also clear the base cache key // Also clear the base cache key
@ -232,7 +231,6 @@ export async function supaAuthenticateUser(
teamId = chunk.team_id; teamId = chunk.team_id;
priceId = chunk.price_id; priceId = chunk.price_id;
plan = getPlanByPriceId(priceId); plan = getPlanByPriceId(priceId);
subscriptionData = { subscriptionData = {
team_id: teamId, team_id: teamId,

View File

@ -16,7 +16,7 @@ export async function checkFireEngine(req: Request, res: Response) {
const timeout = setTimeout(() => controller.abort(), 30000); const timeout = setTimeout(() => controller.abort(), 30000);
const urls = ["https://roastmywebsite.ai", "https://example.com"]; const urls = ["https://roastmywebsite.ai", "https://example.com"];
let lastError : string | null = null; let lastError: string | null = null;
for (const url of urls) { for (const url of urls) {
try { try {
@ -62,7 +62,6 @@ export async function checkFireEngine(req: Request, res: Response) {
success: false, success: false,
error: "Internal server error - all retry attempts failed", error: "Internal server error - all retry attempts failed",
}); });
} catch (error) { } catch (error) {
logger.error(error); logger.error(error);
Sentry.captureException(error); Sentry.captureException(error);

View File

@ -227,7 +227,7 @@ export async function crawlController(req: Request, res: Response) {
await addScrapeJob(job.data as any, {}, job.opts.jobId); await addScrapeJob(job.data as any, {}, job.opts.jobId);
} }
}); });
if (sitemap === 0) { if (sitemap === 0) {
await lockURL(id, sc, url); await lockURL(id, sc, url);

View File

@ -1,6 +1,6 @@
import { Response } from "express"; import { Response } from "express";
import { import {
CrawlErrorsResponse, CrawlErrorsResponse,
CrawlStatusParams, CrawlStatusParams,
CrawlStatusResponse, CrawlStatusResponse,
ErrorResponse, ErrorResponse,
@ -62,20 +62,23 @@ export async function crawlErrorsController(
const failedJobIDs: string[] = []; const failedJobIDs: string[] = [];
for (const [id, status] of jobStatuses) { for (const [id, status] of jobStatuses) {
if ( if (status === "failed") {
status === "failed"
) {
failedJobIDs.push(id); failedJobIDs.push(id);
} }
} }
res.status(200).json({ res.status(200).json({
errors: (await getJobs(failedJobIDs)).map(x => ({ errors: (await getJobs(failedJobIDs)).map((x) => ({
id: x.id, id: x.id,
timestamp: x.finishedOn !== undefined ? (new Date(x.finishedOn).toISOString()) : undefined, timestamp:
url: x.data.url, x.finishedOn !== undefined
error: x.failedReason, ? new Date(x.finishedOn).toISOString()
: undefined,
url: x.data.url,
error: x.failedReason,
})), })),
robotsBlocked: await redisConnection.smembers("crawl:" + req.params.jobId + ":robots_blocked"), robotsBlocked: await redisConnection.smembers(
"crawl:" + req.params.jobId + ":robots_blocked",
),
}); });
} }

View File

@ -116,7 +116,10 @@ export async function crawlStatusController(
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
sc.cancelled sc.cancelled
? "cancelled" ? "cancelled"
: validJobStatuses.every((x) => x[1] === "completed") && (sc.crawlerOptions ? await isCrawlKickoffFinished(req.params.jobId) : true) : validJobStatuses.every((x) => x[1] === "completed") &&
(sc.crawlerOptions
? await isCrawlKickoffFinished(req.params.jobId)
: true)
? "completed" ? "completed"
: "scraping"; : "scraping";

View File

@ -101,7 +101,7 @@ export async function getMapResults({
}, },
true, true,
true, true,
30000 30000,
); );
if (sitemap > 0) { if (sitemap > 0) {
links = links links = links
@ -164,20 +164,24 @@ export async function getMapResults({
const twoDaysAgo = new Date(); const twoDaysAgo = new Date();
twoDaysAgo.setDate(twoDaysAgo.getDate() - 2); twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
// If sitemap is not ignored and either we have few URLs (<100) or the data is stale (>2 days old), fetch fresh sitemap // If sitemap is not ignored and either we have few URLs (<100) or the data is stale (>2 days old), fetch fresh sitemap
if ( if (
!ignoreSitemap && !ignoreSitemap &&
(sitemapIndexResult.urls.length < 100 || (sitemapIndexResult.urls.length < 100 ||
new Date(sitemapIndexResult.lastUpdated) < twoDaysAgo) new Date(sitemapIndexResult.lastUpdated) < twoDaysAgo)
) { ) {
try { try {
await crawler.tryGetSitemap(urls => { await crawler.tryGetSitemap(
links.push(...urls); (urls) => {
}, true, false, 30000); links.push(...urls);
},
true,
false,
30000,
);
} catch (e) { } catch (e) {
logger.warn("tryGetSitemap threw an error", { error: e }); logger.warn("tryGetSitemap threw an error", { error: e });
} }
} }
if (!cachedResult) { if (!cachedResult) {
@ -253,7 +257,7 @@ export async function getMapResults({
}, },
{ {
priority: 10, priority: 10,
} },
); );
return { return {

View File

@ -33,7 +33,6 @@ export async function scrapeController(
basePriority: 10, basePriority: 10,
}); });
await addScrapeJob( await addScrapeJob(
{ {
url: req.body.url, url: req.body.url,
@ -97,7 +96,7 @@ export async function scrapeController(
// Don't bill if we're early returning // Don't bill if we're early returning
return; return;
} }
if (req.body.extract && req.body.formats.includes("extract") ) { if (req.body.extract && req.body.formats.includes("extract")) {
creditsToBeBilled = 5; creditsToBeBilled = 5;
} }

View File

@ -125,7 +125,7 @@ export const scrapeOptions = z
"screenshot", "screenshot",
"screenshot@fullPage", "screenshot@fullPage",
"extract", "extract",
"json" "json",
]) ])
.array() .array()
.optional() .optional()
@ -233,7 +233,7 @@ export const extractV1Options = z
.strict(strictMessage) .strict(strictMessage)
.transform((obj) => ({ .transform((obj) => ({
...obj, ...obj,
allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
})); }));
export type ExtractV1Options = z.infer<typeof extractV1Options>; export type ExtractV1Options = z.infer<typeof extractV1Options>;
@ -268,11 +268,17 @@ export const scrapeRequestSchema = scrapeOptions
) )
.transform((obj) => { .transform((obj) => {
// Handle timeout // Handle timeout
if ((obj.formats?.includes("extract") || obj.extract || obj.formats?.includes("json") || obj.jsonOptions) && !obj.timeout) { if (
(obj.formats?.includes("extract") ||
obj.extract ||
obj.formats?.includes("json") ||
obj.jsonOptions) &&
!obj.timeout
) {
obj = { ...obj, timeout: 60000 }; obj = { ...obj, timeout: 60000 };
} }
if(obj.formats?.includes("json")) { if (obj.formats?.includes("json")) {
obj.formats.push("extract"); obj.formats.push("extract");
} }
@ -284,8 +290,8 @@ export const scrapeRequestSchema = scrapeOptions
prompt: obj.jsonOptions.prompt, prompt: obj.jsonOptions.prompt,
systemPrompt: obj.jsonOptions.systemPrompt, systemPrompt: obj.jsonOptions.systemPrompt,
schema: obj.jsonOptions.schema, schema: obj.jsonOptions.schema,
mode: "llm" mode: "llm",
} },
}; };
} }
@ -602,15 +608,14 @@ export type CrawlStatusResponse =
data: Document[]; data: Document[];
}; };
export type CrawlErrorsResponse = export type CrawlErrorsResponse =
| ErrorResponse | ErrorResponse
| { | {
errors: { errors: {
id: string, id: string;
timestamp?: string, timestamp?: string;
url: string, url: string;
error: string, error: string;
}[]; }[];
robotsBlocked: string[]; robotsBlocked: string[];
}; };
@ -888,7 +893,6 @@ export type SearchResponse =
data: Document[]; data: Document[];
}; };
export type TokenUsage = { export type TokenUsage = {
promptTokens: number; promptTokens: number;
completionTokens: number; completionTokens: number;

View File

@ -4,7 +4,11 @@ import * as Sentry from "@sentry/node";
import express, { NextFunction, Request, Response } from "express"; import express, { NextFunction, Request, Response } from "express";
import bodyParser from "body-parser"; import bodyParser from "body-parser";
import cors from "cors"; import cors from "cors";
import { getExtractQueue, getScrapeQueue, getIndexQueue } from "./services/queue-service"; import {
getExtractQueue,
getScrapeQueue,
getIndexQueue,
} from "./services/queue-service";
import { v0Router } from "./routes/v0"; import { v0Router } from "./routes/v0";
import os from "os"; import os from "os";
import { logger } from "./lib/logger"; import { logger } from "./lib/logger";

View File

@ -3,101 +3,101 @@ import { deduplicateObjectsArray } from "../extract/helpers/deduplicate-objs-arr
describe("deduplicateObjectsArray", () => { describe("deduplicateObjectsArray", () => {
it("should deduplicate the array", async () => { it("should deduplicate the array", async () => {
const objArray = { const objArray = {
"lawyers": [ lawyers: [
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": null, "phone-number": null,
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
}, },
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": null, "phone-number": null,
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
}, },
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": null, "phone-number": null,
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
} },
] ],
} };
const expected = { const expected = {
"lawyers": [ lawyers: [
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": null, "phone-number": null,
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
} },
] ],
} };
const result = await deduplicateObjectsArray(objArray); const result = await deduplicateObjectsArray(objArray);
expect(result).toEqual(expected); expect(result).toEqual(expected);
}) });
it("should not deduplicate if not necessary", async () => { it("should not deduplicate if not necessary", async () => {
const objArray = { const objArray = {
"lawyers": [ lawyers: [
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": null, "phone-number": null,
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
}, },
{ {
"name": "John Doe", name: "John Doe",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": null, "phone-number": null,
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
} },
] ],
} };
const result = await deduplicateObjectsArray(objArray); const result = await deduplicateObjectsArray(objArray);
expect(result).toEqual(objArray); expect(result).toEqual(objArray);
}) });
it("should handle an empty array", async () => { it("should handle an empty array", async () => {
const objArray = { "lawyers": [] }; const objArray = { lawyers: [] };
const expected = { "lawyers": [] }; const expected = { lawyers: [] };
const result = await deduplicateObjectsArray(objArray); const result = await deduplicateObjectsArray(objArray);
@ -106,35 +106,35 @@ describe("deduplicateObjectsArray", () => {
it("should handle objects with different properties", async () => { it("should handle objects with different properties", async () => {
const objArray = { const objArray = {
"lawyers": [ lawyers: [
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": "james@example.com", email: "james@example.com",
"title": "Personal Injury Attorney" title: "Personal Injury Attorney",
}, },
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": "james@example.com", email: "james@example.com",
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": "123-456-7890" "phone-number": "123-456-7890",
} },
] ],
}; };
const expected = { const expected = {
"lawyers": [ lawyers: [
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": "james@example.com", email: "james@example.com",
"title": "Personal Injury Attorney" title: "Personal Injury Attorney",
}, },
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": "james@example.com", email: "james@example.com",
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": "123-456-7890" "phone-number": "123-456-7890",
} },
] ],
}; };
const result = await deduplicateObjectsArray(objArray); const result = await deduplicateObjectsArray(objArray);
@ -144,33 +144,33 @@ describe("deduplicateObjectsArray", () => {
it("should handle objects with same properties but different values", async () => { it("should handle objects with same properties but different values", async () => {
const objArray = { const objArray = {
"lawyers": [ lawyers: [
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": "james1@example.com", email: "james1@example.com",
"title": "Personal Injury Attorney" title: "Personal Injury Attorney",
}, },
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": "james2@example.com", email: "james2@example.com",
"title": "Personal Injury Attorney" title: "Personal Injury Attorney",
} },
] ],
}; };
const expected = { const expected = {
"lawyers": [ lawyers: [
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": "james1@example.com", email: "james1@example.com",
"title": "Personal Injury Attorney" title: "Personal Injury Attorney",
}, },
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": "james2@example.com", email: "james2@example.com",
"title": "Personal Injury Attorney" title: "Personal Injury Attorney",
} },
] ],
}; };
const result = await deduplicateObjectsArray(objArray); const result = await deduplicateObjectsArray(objArray);
@ -180,47 +180,47 @@ describe("deduplicateObjectsArray", () => {
it("should handle nested identical objects", async () => { it("should handle nested identical objects", async () => {
const objArray = { const objArray = {
"lawyers": [ lawyers: [
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
}, },
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
} },
] ],
}; };
const expected = { const expected = {
"lawyers": [ lawyers: [
{ {
"name": "James D. Schull", name: "James D. Schull",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
} },
] ],
}; };
const result = await deduplicateObjectsArray(objArray); const result = await deduplicateObjectsArray(objArray);
expect(result).toEqual(expected); expect(result).toEqual(expected);
}); });
}) });

View File

@ -3,292 +3,292 @@ import { mergeNullValObjs } from "../extract/helpers/merge-null-val-objs";
describe("mergeNullValObjs", () => { describe("mergeNullValObjs", () => {
it("should merge the objects with null values", async () => { it("should merge the objects with null values", async () => {
const objArray = { const objArray = {
"lawyers": [ lawyers: [
{ {
"name": "Frank Giunta", name: "Frank Giunta",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": "214.370.5200", "phone-number": "214.370.5200",
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
}, },
{ {
"name": "Frank Giunta", name: "Frank Giunta",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": "214.370.5200", "phone-number": "214.370.5200",
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
} },
] ],
} };
const expected = { const expected = {
"lawyers": [ lawyers: [
{ {
"name": "Frank Giunta", name: "Frank Giunta",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": "214.370.5200", "phone-number": "214.370.5200",
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
} },
] ],
} };
const result = mergeNullValObjs(objArray); const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected); expect(result).toEqual(expected);
}) });
it("should handle empty object array", async () => { it("should handle empty object array", async () => {
const objArray = { const objArray = {
"lawyers": [] lawyers: [],
} };
const expected = { const expected = {
"lawyers": [] lawyers: [],
} };
const result = mergeNullValObjs(objArray); const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected); expect(result).toEqual(expected);
}) });
it("should handle object array with no null values", async () => { it("should handle object array with no null values", async () => {
const objArray = { const objArray = {
"lawyers": [ lawyers: [
{ {
"name": "John Doe", name: "John Doe",
"email": "john.doe@example.com", email: "john.doe@example.com",
"title": "Attorney", title: "Attorney",
"phone-number": "123.456.7890", "phone-number": "123.456.7890",
"practice-areas": [ "practice-areas": [
{ {
"area": "Corporate Law" area: "Corporate Law",
} },
] ],
}
]
}
const expected = {
"lawyers": [
{
"name": "John Doe",
"email": "john.doe@example.com",
"title": "Attorney",
"phone-number": "123.456.7890",
"practice-areas": [
{
"area": "Corporate Law"
}
]
}
]
}
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
})
it("should merge objects with different null values", async () => {
const objArray = {
"lawyers": [
{
"name": "Jane Smith",
"email": "null",
"title": "Attorney",
"description": null,
"phone-number": "987.654.3210",
"practice-areas": [
{
"area": "Family Law"
}
]
}, },
{ ],
"name": "Jane Smith",
"email": "jane.smith@example.com",
"title": null,
"description": "Jane Smith is an attorney specializing in Family Law.",
"phone-number": "987.654.3210",
"practice-areas": [
{
"area": "Family Law"
}
]
}
]
}
const expected = {
"lawyers": [
{
"name": "Jane Smith",
"email": "jane.smith@example.com",
"title": "Attorney",
"description": "Jane Smith is an attorney specializing in Family Law.",
"phone-number": "987.654.3210",
"practice-areas": [
{
"area": "Family Law"
}
]
}
]
}
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
})
it("should merge objects with different null values", async () => {
const objArray = {
"lawyers": [
{
"name": "Frank Giunta",
"email": "frank.giunta@example.com",
"title": "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
"area": "Personal Injury"
}
]
},
{
"name": "Frank Giunta",
"email": null,
"title": "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
"area": "Personal Injury"
}
]
},
{
"name": "Dale R. Rose",
"email": null,
"title": "Personal Injury Attorney",
"phone-number": "972.562.0266",
"practice-areas": [
{
"area": "Personal Injury"
}
]
}
]
}; };
const expected = { const expected = {
"lawyers": [ lawyers: [
{ {
"name": "Frank Giunta", name: "John Doe",
"email": "frank.giunta@example.com", email: "john.doe@example.com",
"title": "Personal Injury Attorney", title: "Attorney",
"phone-number": "214.370.5200", "phone-number": "123.456.7890",
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Corporate Law",
} },
] ],
}, },
{ ],
"name": "Dale R. Rose",
"email": null,
"title": "Personal Injury Attorney",
"phone-number": "972.562.0266",
"practice-areas": [
{
"area": "Personal Injury"
}
]
}
]
}; };
const result = mergeNullValObjs(objArray); const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected); expect(result).toEqual(expected);
}) });
it("should merge objects with different null values", async () => {
const objArray = {
lawyers: [
{
name: "Jane Smith",
email: "null",
title: "Attorney",
description: null,
"phone-number": "987.654.3210",
"practice-areas": [
{
area: "Family Law",
},
],
},
{
name: "Jane Smith",
email: "jane.smith@example.com",
title: null,
description: "Jane Smith is an attorney specializing in Family Law.",
"phone-number": "987.654.3210",
"practice-areas": [
{
area: "Family Law",
},
],
},
],
};
const expected = {
lawyers: [
{
name: "Jane Smith",
email: "jane.smith@example.com",
title: "Attorney",
description: "Jane Smith is an attorney specializing in Family Law.",
"phone-number": "987.654.3210",
"practice-areas": [
{
area: "Family Law",
},
],
},
],
};
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
});
it("should merge objects with different null values", async () => {
const objArray = {
lawyers: [
{
name: "Frank Giunta",
email: "frank.giunta@example.com",
title: "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
area: "Personal Injury",
},
],
},
{
name: "Frank Giunta",
email: null,
title: "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
area: "Personal Injury",
},
],
},
{
name: "Dale R. Rose",
email: null,
title: "Personal Injury Attorney",
"phone-number": "972.562.0266",
"practice-areas": [
{
area: "Personal Injury",
},
],
},
],
};
const expected = {
lawyers: [
{
name: "Frank Giunta",
email: "frank.giunta@example.com",
title: "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
area: "Personal Injury",
},
],
},
{
name: "Dale R. Rose",
email: null,
title: "Personal Injury Attorney",
"phone-number": "972.562.0266",
"practice-areas": [
{
area: "Personal Injury",
},
],
},
],
};
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
});
it("should correctly merge and deduplicate objects", async () => { it("should correctly merge and deduplicate objects", async () => {
const objArray = { const objArray = {
"lawyers": [ lawyers: [
{ {
"name": "Frank Giunta", name: "Frank Giunta",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": "214.370.5200", "phone-number": "214.370.5200",
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
}, },
{ {
"name": "Frank Giunta", name: "Frank Giunta",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": "214.370.5200", "phone-number": "214.370.5200",
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
}, },
{ {
"name": "Dale R. Rose", name: "Dale R. Rose",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": "972.562.0266", "phone-number": "972.562.0266",
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
} },
] ],
}; };
const expected = { const expected = {
"lawyers": [ lawyers: [
{ {
"name": "Frank Giunta", name: "Frank Giunta",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": "214.370.5200", "phone-number": "214.370.5200",
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
}, },
{ {
"name": "Dale R. Rose", name: "Dale R. Rose",
"email": null, email: null,
"title": "Personal Injury Attorney", title: "Personal Injury Attorney",
"phone-number": "972.562.0266", "phone-number": "972.562.0266",
"practice-areas": [ "practice-areas": [
{ {
"area": "Personal Injury" area: "Personal Injury",
} },
] ],
} },
] ],
}; };
const result = mergeNullValObjs(objArray); const result = mergeNullValObjs(objArray);
@ -298,177 +298,172 @@ describe("mergeNullValObjs", () => {
it("should merge arrays of similar objects", async () => { it("should merge arrays of similar objects", async () => {
const objArray = { const objArray = {
"lawyers": [ lawyers: [
{ {
"name": "Allen Cox", name: "Allen Cox",
"email": null, email: null,
"title": "Personal Injury Lawyer", title: "Personal Injury Lawyer",
"phone-number": "972.606.9000", "phone-number": "972.606.9000",
"practice-areas": [ "practice-areas": [{ area: "Personal Injury" }],
{ "area": "Personal Injury" }
]
}, },
{ {
"name": "Allen Cox", name: "Allen Cox",
"email": "allen.cox@example.com", email: "allen.cox@example.com",
"title": "Personal Injury Lawyer", title: "Personal Injury Lawyer",
"phone-number": null, "phone-number": null,
"practice-areas": [ "practice-areas": [
{ "area": "Automobile accidents" }, { area: "Automobile accidents" },
{ "area": "Truck accidents" }, { area: "Truck accidents" },
{ "area": "Amusement park injury" }, { area: "Amusement park injury" },
{ "area": "Bus accident" }, { area: "Bus accident" },
{ "area": "Industrial accidents" }, { area: "Industrial accidents" },
{ "area": "Product defects" }, { area: "Product defects" },
{ "area": "Food poisoning" }, { area: "Food poisoning" },
{ "area": "Workplace accidents" }, { area: "Workplace accidents" },
{ "area": "Wrongful death" }, { area: "Wrongful death" },
{ "area": "Swimming pool accidents" }, { area: "Swimming pool accidents" },
{ "area": "Premises accidents" }, { area: "Premises accidents" },
{ "area": "Aircraft accidents" }, { area: "Aircraft accidents" },
{ "area": "Animal and dog bites" } { area: "Animal and dog bites" },
] ],
} },
] ],
} };
const expected = { const expected = {
"lawyers": [ lawyers: [
{ {
"name": "Allen Cox", name: "Allen Cox",
"email": "allen.cox@example.com", email: "allen.cox@example.com",
"title": "Personal Injury Lawyer", title: "Personal Injury Lawyer",
"phone-number": "972.606.9000", "phone-number": "972.606.9000",
"practice-areas": [ "practice-areas": [
{ "area": "Personal Injury" }, { area: "Personal Injury" },
{ "area": "Automobile accidents" }, { area: "Automobile accidents" },
{ "area": "Truck accidents" }, { area: "Truck accidents" },
{ "area": "Amusement park injury" }, { area: "Amusement park injury" },
{ "area": "Bus accident" }, { area: "Bus accident" },
{ "area": "Industrial accidents" }, { area: "Industrial accidents" },
{ "area": "Product defects" }, { area: "Product defects" },
{ "area": "Food poisoning" }, { area: "Food poisoning" },
{ "area": "Workplace accidents" }, { area: "Workplace accidents" },
{ "area": "Wrongful death" }, { area: "Wrongful death" },
{ "area": "Swimming pool accidents" }, { area: "Swimming pool accidents" },
{ "area": "Premises accidents" }, { area: "Premises accidents" },
{ "area": "Aircraft accidents" }, { area: "Aircraft accidents" },
{ "area": "Animal and dog bites" } { area: "Animal and dog bites" },
] ],
} },
] ],
} };
const result = mergeNullValObjs(objArray); const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected); expect(result).toEqual(expected);
}) });
it("should merge arrays of similar objects with different key names", async () => { it("should merge arrays of similar objects with different key names", async () => {
const objArray = { const objArray = {
"attorneys": [ attorneys: [
{ {
"fullName": "Allen Cox", fullName: "Allen Cox",
"contactEmail": null, contactEmail: null,
"position": "Personal Injury Lawyer", position: "Personal Injury Lawyer",
"contactNumber": "972.606.9000", contactNumber: "972.606.9000",
"specializations": [ specializations: [{ field: "Personal Injury" }],
{ "field": "Personal Injury" }
]
}, },
{ {
"fullName": "Allen Cox", fullName: "Allen Cox",
"contactEmail": "allen.cox@example.com", contactEmail: "allen.cox@example.com",
"position": "Personal Injury Lawyer", position: "Personal Injury Lawyer",
"contactNumber": null, contactNumber: null,
"specializations": [ specializations: [
{ "field": "Automobile accidents" }, { field: "Automobile accidents" },
{ "field": "Truck accidents" }, { field: "Truck accidents" },
{ "field": "Amusement park injury" }, { field: "Amusement park injury" },
{ "field": "Bus accident" }, { field: "Bus accident" },
{ "field": "Industrial accidents" }, { field: "Industrial accidents" },
{ "field": "Product defects" }, { field: "Product defects" },
{ "field": "Food poisoning" }, { field: "Food poisoning" },
{ "field": "Workplace accidents" }, { field: "Workplace accidents" },
{ "field": "Wrongful death" }, { field: "Wrongful death" },
{ "field": "Swimming pool accidents" }, { field: "Swimming pool accidents" },
{ "field": "Premises accidents" }, { field: "Premises accidents" },
{ "field": "Aircraft accidents" }, { field: "Aircraft accidents" },
{ "field": "Animal and dog bites" } { field: "Animal and dog bites" },
] ],
} },
] ],
} };
const expected = { const expected = {
"attorneys": [ attorneys: [
{ {
"fullName": "Allen Cox", fullName: "Allen Cox",
"contactEmail": "allen.cox@example.com", contactEmail: "allen.cox@example.com",
"position": "Personal Injury Lawyer", position: "Personal Injury Lawyer",
"contactNumber": "972.606.9000", contactNumber: "972.606.9000",
"specializations": [ specializations: [
{ "field": "Personal Injury" }, { field: "Personal Injury" },
{ "field": "Automobile accidents" }, { field: "Automobile accidents" },
{ "field": "Truck accidents" }, { field: "Truck accidents" },
{ "field": "Amusement park injury" }, { field: "Amusement park injury" },
{ "field": "Bus accident" }, { field: "Bus accident" },
{ "field": "Industrial accidents" }, { field: "Industrial accidents" },
{ "field": "Product defects" }, { field: "Product defects" },
{ "field": "Food poisoning" }, { field: "Food poisoning" },
{ "field": "Workplace accidents" }, { field: "Workplace accidents" },
{ "field": "Wrongful death" }, { field: "Wrongful death" },
{ "field": "Swimming pool accidents" }, { field: "Swimming pool accidents" },
{ "field": "Premises accidents" }, { field: "Premises accidents" },
{ "field": "Aircraft accidents" }, { field: "Aircraft accidents" },
{ "field": "Animal and dog bites" } { field: "Animal and dog bites" },
] ],
} },
] ],
} };
const result = mergeNullValObjs(objArray); const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected); expect(result).toEqual(expected);
}) });
it ("should deal with not array values", async () => { it("should deal with not array values", async () => {
const objArray = { const objArray = {
"lawyers": { lawyers: {
"name": "not an array" name: "not an array",
}, },
"attorneys": { attorneys: {
"name": "not an array" name: "not an array",
} },
} };
const expected = { const expected = {
"lawyers": { lawyers: {
"name": "not an array" name: "not an array",
}, },
"attorneys": { attorneys: {
"name": "not an array" name: "not an array",
} },
} };
// @ts-expect-error // @ts-expect-error
const result = mergeNullValObjs(objArray); const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected); expect(result).toEqual(expected);
}) });
it ("should deal with arrays of strings", async () => { it("should deal with arrays of strings", async () => {
const objArray = { const objArray = {
"lawyers": ["res1", "res2", "res3"] lawyers: ["res1", "res2", "res3"],
} };
const expected = { const expected = {
"lawyers": ["res1", "res2", "res3"] lawyers: ["res1", "res2", "res3"],
} };
const result = mergeNullValObjs(objArray); const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected); expect(result).toEqual(expected);
}) });
});
})

File diff suppressed because it is too large Load Diff

View File

@ -2,7 +2,7 @@ import { spreadSchemas } from "../extract/helpers/spread-schemas";
describe("spreadSchemas", () => { describe("spreadSchemas", () => {
it("should spread kyb schema (id: 1)", async () => { it("should spread kyb schema (id: 1)", async () => {
const keys = ["owners"] const keys = ["owners"];
const schema = { const schema = {
type: "object", type: "object",
properties: { properties: {
@ -21,13 +21,13 @@ describe("spreadSchemas", () => {
city: { type: "string" }, city: { type: "string" },
state: { type: "string" }, state: { type: "string" },
country: { type: "string" }, country: { type: "string" },
postal_code: { type: "string" } postal_code: { type: "string" },
}, },
}, },
incorporation_date: { type: "string", format: "date" }, incorporation_date: { type: "string", format: "date" },
phone: { type: "string" }, phone: { type: "string" },
email: { type: "string", format: "email" } email: { type: "string", format: "email" },
} },
}, },
owners: { owners: {
type: "array", type: "array",
@ -43,18 +43,21 @@ describe("spreadSchemas", () => {
city: { type: "string" }, city: { type: "string" },
state: { type: "string" }, state: { type: "string" },
country: { type: "string" }, country: { type: "string" },
postal_code: { type: "string" } postal_code: { type: "string" },
}, },
}, },
phone: { type: "string" }, phone: { type: "string" },
email: { type: "string", format: "email" } email: { type: "string", format: "email" },
} },
} },
} },
} },
} };
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys) const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
schema,
keys,
);
expect(singleAnswerSchema).toEqual({ expect(singleAnswerSchema).toEqual({
type: "object", type: "object",
@ -74,16 +77,16 @@ describe("spreadSchemas", () => {
city: { type: "string" }, city: { type: "string" },
state: { type: "string" }, state: { type: "string" },
country: { type: "string" }, country: { type: "string" },
postal_code: { type: "string" } postal_code: { type: "string" },
} },
}, },
incorporation_date: { type: "string", format: "date" }, incorporation_date: { type: "string", format: "date" },
phone: { type: "string" }, phone: { type: "string" },
email: { type: "string", format: "email" } email: { type: "string", format: "email" },
} },
}, },
}, },
}) });
expect(multiEntitySchema).toEqual({ expect(multiEntitySchema).toEqual({
type: "object", type: "object",
@ -102,20 +105,20 @@ describe("spreadSchemas", () => {
city: { type: "string" }, city: { type: "string" },
state: { type: "string" }, state: { type: "string" },
country: { type: "string" }, country: { type: "string" },
postal_code: { type: "string" } postal_code: { type: "string" },
} },
}, },
phone: { type: "string" }, phone: { type: "string" },
email: { type: "string", format: "email" } email: { type: "string", format: "email" },
} },
} },
} },
} },
}) });
}) });
it("should spread lawyers schema (id: 9)", async () => { it("should spread lawyers schema (id: 9)", async () => {
const keys = ["lawyers"] const keys = ["lawyers"];
const schema = { const schema = {
type: "object", type: "object",
properties: { properties: {
@ -133,22 +136,25 @@ describe("spreadSchemas", () => {
items: { items: {
type: "object", type: "object",
properties: { properties: {
area: { type: "string" } area: { type: "string" },
}, },
}, },
alias: "practice-areas" alias: "practice-areas",
} },
}, },
} },
} },
} },
}; };
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys) const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
schema,
keys,
);
expect(singleAnswerSchema).toEqual({}) expect(singleAnswerSchema).toEqual({});
expect(multiEntitySchema).toEqual(schema) expect(multiEntitySchema).toEqual(schema);
}) });
it("shoud spread (id: 26)", async () => { it("shoud spread (id: 26)", async () => {
const schema = { const schema = {
@ -161,19 +167,22 @@ describe("spreadSchemas", () => {
properties: { properties: {
name: { type: "string" }, name: { type: "string" },
price: { type: "string" }, price: { type: "string" },
description: { type: "string" } description: { type: "string" },
} },
} },
} },
} },
} };
const keys = ["products"] const keys = ["products"];
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys) const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
schema,
keys,
);
expect(singleAnswerSchema).toEqual({}) expect(singleAnswerSchema).toEqual({});
expect(multiEntitySchema).toEqual(schema) expect(multiEntitySchema).toEqual(schema);
}) });
it("shoud spread categories and products", async () => { it("shoud spread categories and products", async () => {
const schema = { const schema = {
@ -182,8 +191,8 @@ describe("spreadSchemas", () => {
categories: { categories: {
type: "array", type: "array",
items: { items: {
type: "string" type: "string",
} },
}, },
products: { products: {
type: "array", type: "array",
@ -192,19 +201,22 @@ describe("spreadSchemas", () => {
properties: { properties: {
name: { type: "string" }, name: { type: "string" },
price: { type: "string" }, price: { type: "string" },
description: { type: "string" } description: { type: "string" },
} },
} },
} },
} },
} };
const keys = ["products", "categories"] const keys = ["products", "categories"];
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys) const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
schema,
keys,
);
expect(singleAnswerSchema).toEqual({}) expect(singleAnswerSchema).toEqual({});
expect(multiEntitySchema).toEqual(schema) expect(multiEntitySchema).toEqual(schema);
}) });
it("should spread (id: 29)", async () => { it("should spread (id: 29)", async () => {
const schema = { const schema = {
@ -220,50 +232,55 @@ describe("spreadSchemas", () => {
offers_cmmc: { type: "boolean" }, offers_cmmc: { type: "boolean" },
has_soc_2_cert: { type: "boolean" }, has_soc_2_cert: { type: "boolean" },
offers_office365: { type: "boolean" }, offers_office365: { type: "boolean" },
offers_endpoint_security: { type: "boolean" } offers_endpoint_security: { type: "boolean" },
} },
} };
const keys = [] const keys = [];
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys) const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
schema,
keys,
);
expect(singleAnswerSchema).toEqual(schema) expect(singleAnswerSchema).toEqual(schema);
expect(multiEntitySchema).toEqual({}) expect(multiEntitySchema).toEqual({});
}) });
it("should spread kyb schema (id: 29)", async () => { it("should spread kyb schema (id: 29)", async () => {
const schema = { const schema = {
"type": "object", type: "object",
"properties": { properties: {
"lawyers": { lawyers: {
"type": "array", type: "array",
"items": { items: {
"type": "object", type: "object",
"properties": { properties: {
"name": { "type": "string" }, name: { type: "string" },
"email": { "type": ["string", "null"] }, email: { type: ["string", "null"] },
"phone-number": { "type": "string" }, "phone-number": { type: "string" },
"practice-areas": { "practice-areas": {
"type": "array", type: "array",
"items": { items: {
"type": "object", type: "object",
"properties": { properties: {
"area": { "type": "string" } area: { type: "string" },
} },
} },
}, },
"title": { "type": ["string", "null"] } title: { type: ["string", "null"] },
}, },
} },
} },
} },
} };
const keys = ["lawyers"] const keys = ["lawyers"];
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys) const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
schema,
keys,
);
expect(singleAnswerSchema).toEqual({}) expect(singleAnswerSchema).toEqual({});
expect(multiEntitySchema).toEqual(schema) expect(multiEntitySchema).toEqual(schema);
}) });
}) });

File diff suppressed because it is too large Load Diff

View File

@ -42,7 +42,10 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) {
if (!cacheRedis) return; if (!cacheRedis) return;
if (!entry.html || entry.html.length < 100) { if (!entry.html || entry.html.length < 100) {
logger.warn("Skipping cache save for short HTML", { key, htmlLength: entry.html?.length }); logger.warn("Skipping cache save for short HTML", {
key,
htmlLength: entry.html?.length,
});
return; return;
} }

View File

@ -127,13 +127,15 @@ export async function getDoneJobsOrdered(
export async function isCrawlFinished(id: string) { export async function isCrawlFinished(id: string) {
return ( return (
(await redisConnection.scard("crawl:" + id + ":jobs_done")) === (await redisConnection.scard("crawl:" + id + ":jobs_done")) ===
(await redisConnection.scard("crawl:" + id + ":jobs")) (await redisConnection.scard("crawl:" + id + ":jobs")) &&
&& (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
); );
} }
export async function isCrawlKickoffFinished(id: string) { export async function isCrawlKickoffFinished(id: string) {
return await redisConnection.get("crawl:" + id + ":kickoff:finish") !== null return (
(await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
);
} }
export async function isCrawlFinishedLocked(id: string) { export async function isCrawlFinishedLocked(id: string) {
@ -141,7 +143,12 @@ export async function isCrawlFinishedLocked(id: string) {
} }
export async function finishCrawlKickoff(id: string) { export async function finishCrawlKickoff(id: string) {
await redisConnection.set("crawl:" + id + ":kickoff:finish", "yes", "EX", 24 * 60 * 60); await redisConnection.set(
"crawl:" + id + ":kickoff:finish",
"yes",
"EX",
24 * 60 * 60,
);
} }
export async function finishCrawl(id: string) { export async function finishCrawl(id: string) {
@ -161,9 +168,10 @@ export async function finishCrawl(id: string) {
module: "crawl-redis", module: "crawl-redis",
method: "finishCrawl", method: "finishCrawl",
crawlId: id, crawlId: id,
jobs_done: (await redisConnection.scard("crawl:" + id + ":jobs_done")), jobs_done: await redisConnection.scard("crawl:" + id + ":jobs_done"),
jobs: (await redisConnection.scard("crawl:" + id + ":jobs")), jobs: await redisConnection.scard("crawl:" + id + ":jobs"),
kickoff_finished: (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null, kickoff_finished:
(await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
}); });
} }
} }

View File

@ -1,81 +1,81 @@
// const id = crypto.randomUUID(); // const id = crypto.randomUUID();
// const sc: StoredCrawl = { // const sc: StoredCrawl = {
// originUrl: request.urls[0].replace("/*",""), // originUrl: request.urls[0].replace("/*",""),
// crawlerOptions: toLegacyCrawlerOptions({ // crawlerOptions: toLegacyCrawlerOptions({
// maxDepth: 15, // maxDepth: 15,
// limit: 5000, // limit: 5000,
// includePaths: [], // includePaths: [],
// excludePaths: [], // excludePaths: [],
// ignoreSitemap: false, // ignoreSitemap: false,
// allowExternalLinks: false, // allowExternalLinks: false,
// allowBackwardLinks: true, // allowBackwardLinks: true,
// allowSubdomains: false, // allowSubdomains: false,
// ignoreRobotsTxt: false, // ignoreRobotsTxt: false,
// deduplicateSimilarURLs: false, // deduplicateSimilarURLs: false,
// ignoreQueryParameters: false // ignoreQueryParameters: false
// }), // }),
// scrapeOptions: { // scrapeOptions: {
// formats: ["markdown"], // formats: ["markdown"],
// onlyMainContent: true, // onlyMainContent: true,
// waitFor: 0, // waitFor: 0,
// mobile: false, // mobile: false,
// removeBase64Images: true, // removeBase64Images: true,
// fastMode: false, // fastMode: false,
// parsePDF: true, // parsePDF: true,
// skipTlsVerification: false, // skipTlsVerification: false,
// }, // },
// internalOptions: { // internalOptions: {
// disableSmartWaitCache: true, // disableSmartWaitCache: true,
// isBackgroundIndex: true // isBackgroundIndex: true
// }, // },
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, // team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
// createdAt: Date.now(), // createdAt: Date.now(),
// plan: "hobby", // make it a low concurrency // plan: "hobby", // make it a low concurrency
// }; // };
// // Save the crawl configuration // // Save the crawl configuration
// await saveCrawl(id, sc); // await saveCrawl(id, sc);
// // Then kick off the job // // Then kick off the job
// await _addScrapeJobToBullMQ({ // await _addScrapeJobToBullMQ({
// url: request.urls[0].replace("/*",""), // url: request.urls[0].replace("/*",""),
// mode: "kickoff" as const, // mode: "kickoff" as const,
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, // team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
// plan: "hobby", // make it a low concurrency // plan: "hobby", // make it a low concurrency
// crawlerOptions: sc.crawlerOptions, // crawlerOptions: sc.crawlerOptions,
// scrapeOptions: sc.scrapeOptions, // scrapeOptions: sc.scrapeOptions,
// internalOptions: sc.internalOptions, // internalOptions: sc.internalOptions,
// origin: "index", // origin: "index",
// crawl_id: id, // crawl_id: id,
// webhook: null, // webhook: null,
// v1: true, // v1: true,
// }, {}, crypto.randomUUID(), 50); // }, {}, crypto.randomUUID(), 50);
// we restructure and make all of the arrays we need to fill into objects, // we restructure and make all of the arrays we need to fill into objects,
// adding them to a single object so the llm can fill them one at a time // adding them to a single object so the llm can fill them one at a time
// TODO: make this work for more complex schemas where arrays are not first level // TODO: make this work for more complex schemas where arrays are not first level
// let schemasForLLM: {} = {}; // let schemasForLLM: {} = {};
// for (const key in largeArraysSchema) { // for (const key in largeArraysSchema) {
// const originalSchema = structuredClone(largeArraysSchema[key].items); // const originalSchema = structuredClone(largeArraysSchema[key].items);
// console.log( // console.log(
// "key", // "key",
// key, // key,
// "\noriginalSchema", // "\noriginalSchema",
// JSON.stringify(largeArraysSchema[key], null, 2), // JSON.stringify(largeArraysSchema[key], null, 2),
// ); // );
// let clonedObj = { // let clonedObj = {
// type: "object", // type: "object",
// properties: { // properties: {
// informationFilled: { // informationFilled: {
// type: "boolean", // type: "boolean",
// }, // },
// data: { // data: {
// type: "object", // type: "object",
// properties: originalSchema.properties, // properties: originalSchema.properties,
// }, // },
// }, // },
// }; // };
// schemasForLLM[key] = clonedObj; // schemasForLLM[key] = clonedObj;
// } // }

View File

@ -59,11 +59,11 @@ export async function updateExtract(
// Limit links in steps to 500 // Limit links in steps to 500
if (extract.steps) { if (extract.steps) {
extract.steps = extract.steps.map(step => { extract.steps = extract.steps.map((step) => {
if (step.discoveredLinks && step.discoveredLinks.length > 500) { if (step.discoveredLinks && step.discoveredLinks.length > 500) {
return { return {
...step, ...step,
discoveredLinks: step.discoveredLinks.slice(0, 500) discoveredLinks: step.discoveredLinks.slice(0, 500),
}; };
} }
return step; return step;

View File

@ -32,7 +32,11 @@ import { ExtractStep, updateExtract } from "./extract-redis";
import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array"; import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array";
import { mergeNullValObjs } from "./helpers/merge-null-val-objs"; import { mergeNullValObjs } from "./helpers/merge-null-val-objs";
import { CUSTOM_U_TEAMS, extractConfig } from "./config"; import { CUSTOM_U_TEAMS, extractConfig } from "./config";
import { calculateFinalResultCost, estimateCost, estimateTotalCost } from "./usage/llm-cost"; import {
calculateFinalResultCost,
estimateCost,
estimateTotalCost,
} from "./usage/llm-cost";
import { numTokensFromString } from "../LLM-extraction/helpers"; import { numTokensFromString } from "../LLM-extraction/helpers";
interface ExtractServiceOptions { interface ExtractServiceOptions {
@ -147,7 +151,13 @@ Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`,
totalTokens: result.usage?.total_tokens ?? 0, totalTokens: result.usage?.total_tokens ?? 0,
model: model, model: model,
}; };
return { isMultiEntity, multiEntityKeys, reasoning, keyIndicators, tokenUsage }; return {
isMultiEntity,
multiEntityKeys,
reasoning,
keyIndicators,
tokenUsage,
};
} }
type completions = { type completions = {
@ -187,7 +197,7 @@ export async function performExtraction(
method: "performExtraction", method: "performExtraction",
extractId, extractId,
}); });
// Token tracking // Token tracking
let tokenUsage: TokenUsage[] = []; let tokenUsage: TokenUsage[] = [];
@ -246,7 +256,7 @@ export async function performExtraction(
"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.", "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
extractId, extractId,
urlTrace: urlTraces, urlTrace: urlTraces,
totalUrlsScraped: 0 totalUrlsScraped: 0,
}; };
} }
@ -277,8 +287,13 @@ export async function performExtraction(
// 1. the first one is a completion that will extract the array of items // 1. the first one is a completion that will extract the array of items
// 2. the second one is multiple completions that will extract the items from the array // 2. the second one is multiple completions that will extract the items from the array
let startAnalyze = Date.now(); let startAnalyze = Date.now();
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators, tokenUsage: schemaAnalysisTokenUsage } = const {
await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? ""); isMultiEntity,
multiEntityKeys,
reasoning,
keyIndicators,
tokenUsage: schemaAnalysisTokenUsage,
} = await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
// Track schema analysis tokens // Track schema analysis tokens
tokenUsage.push(schemaAnalysisTokenUsage); tokenUsage.push(schemaAnalysisTokenUsage);
@ -540,7 +555,7 @@ export async function performExtraction(
"An unexpected error occurred. Please contact help@firecrawl.com for help.", "An unexpected error occurred. Please contact help@firecrawl.com for help.",
extractId, extractId,
urlTrace: urlTraces, urlTrace: urlTraces,
totalUrlsScraped totalUrlsScraped,
}; };
} }
} }
@ -592,17 +607,18 @@ export async function performExtraction(
} }
} }
const validResults = results.filter((doc): doc is Document => doc !== null); const validResults = results.filter(
(doc): doc is Document => doc !== null,
);
singleAnswerDocs.push(...validResults); singleAnswerDocs.push(...validResults);
totalUrlsScraped += validResults.length; totalUrlsScraped += validResults.length;
} catch (error) { } catch (error) {
return { return {
success: false, success: false,
error: error.message, error: error.message,
extractId, extractId,
urlTrace: urlTraces, urlTrace: urlTraces,
totalUrlsScraped totalUrlsScraped,
}; };
} }
@ -614,7 +630,7 @@ export async function performExtraction(
"All provided URLs are invalid. Please check your input and try again.", "All provided URLs are invalid. Please check your input and try again.",
extractId, extractId,
urlTrace: request.urlTrace ? urlTraces : undefined, urlTrace: request.urlTrace ? urlTraces : undefined,
totalUrlsScraped: 0 totalUrlsScraped: 0,
}; };
} }
@ -679,12 +695,12 @@ export async function performExtraction(
: singleAnswerResult || multiEntityResult; : singleAnswerResult || multiEntityResult;
// Tokenize final result to get token count // Tokenize final result to get token count
let finalResultTokens = 0; // let finalResultTokens = 0;
if (finalResult) { // if (finalResult) {
const finalResultStr = JSON.stringify(finalResult); // const finalResultStr = JSON.stringify(finalResult);
finalResultTokens = numTokensFromString(finalResultStr, "gpt-4o"); // finalResultTokens = numTokensFromString(finalResultStr, "gpt-4o");
} // }
// // Deduplicate and validate final result against schema // // Deduplicate and validate final result against schema
// if (reqSchema && finalResult && finalResult.length <= extractConfig.DEDUPLICATION.MAX_TOKENS) { // if (reqSchema && finalResult && finalResult.length <= extractConfig.DEDUPLICATION.MAX_TOKENS) {
// const schemaValidation = await generateOpenAICompletions( // const schemaValidation = await generateOpenAICompletions(
@ -695,7 +711,7 @@ export async function performExtraction(
// 1. Remove any duplicate entries in the data extracted by merging that into a single object according to the provided shcema // 1. Remove any duplicate entries in the data extracted by merging that into a single object according to the provided shcema
// 2. Ensure all data matches the provided schema // 2. Ensure all data matches the provided schema
// 3. Keep only the highest quality and most complete entries when duplicates are found. // 3. Keep only the highest quality and most complete entries when duplicates are found.
// Do not change anything else. If data is null keep it null. If the schema is not provided, return the data as is.`, // Do not change anything else. If data is null keep it null. If the schema is not provided, return the data as is.`,
// prompt: `Please validate and merge the duplicate entries in this data according to the schema provided:\n // prompt: `Please validate and merge the duplicate entries in this data according to the schema provided:\n
@ -732,12 +748,10 @@ export async function performExtraction(
const llmUsage = estimateTotalCost(tokenUsage); const llmUsage = estimateTotalCost(tokenUsage);
let tokensToBill = calculateFinalResultCost(finalResult); let tokensToBill = calculateFinalResultCost(finalResult);
if (CUSTOM_U_TEAMS.includes(teamId)) { if (CUSTOM_U_TEAMS.includes(teamId)) {
tokensToBill = 1; tokensToBill = 1;
} }
// Bill team for usage // Bill team for usage
billTeam(teamId, subId, tokensToBill, logger, true).catch((error) => { billTeam(teamId, subId, tokensToBill, logger, true).catch((error) => {
logger.error( logger.error(
@ -745,7 +759,6 @@ export async function performExtraction(
); );
}); });
// Log job with token usage // Log job with token usage
logJob({ logJob({
job_id: extractId, job_id: extractId,
@ -779,6 +792,6 @@ export async function performExtraction(
warning: undefined, // TODO FIX warning: undefined, // TODO FIX
urlTrace: request.urlTrace ? urlTraces : undefined, urlTrace: request.urlTrace ? urlTraces : undefined,
llmUsage, llmUsage,
totalUrlsScraped totalUrlsScraped,
}; };
} }

View File

@ -1,10 +1,12 @@
export function deduplicateObjectsArray(objArray: { [key: string]: any[] }): { [key: string]: any[] } { export function deduplicateObjectsArray(objArray: { [key: string]: any[] }): {
[key: string]: any[];
} {
const deduplicatedObjArray: { [key: string]: any[] } = {}; const deduplicatedObjArray: { [key: string]: any[] } = {};
for (const key in objArray) { for (const key in objArray) {
if (Array.isArray(objArray[key])) { if (Array.isArray(objArray[key])) {
const seen = new Set(); const seen = new Set();
deduplicatedObjArray[key] = objArray[key].filter(item => { deduplicatedObjArray[key] = objArray[key].filter((item) => {
// Create a unique identifier for each item based on its properties // Create a unique identifier for each item based on its properties
const identifier = JSON.stringify(item); const identifier = JSON.stringify(item);
@ -24,4 +26,4 @@ export function deduplicateObjectsArray(objArray: { [key: string]: any[] }): { [
} }
return deduplicatedObjArray; return deduplicatedObjArray;
} }

View File

@ -7,4 +7,4 @@ export async function dereferenceSchema(schema: any): Promise<any> {
console.error("Failed to dereference schema:", error); console.error("Failed to dereference schema:", error);
throw error; throw error;
} }
} }

View File

@ -1,5 +1,5 @@
import * as fs from 'fs'; import * as fs from "fs";
import * as path from 'path'; import * as path from "path";
/** /**
* Helper function to dump data to a file for debugging/logging purposes * Helper function to dump data to a file for debugging/logging purposes
@ -10,17 +10,19 @@ import * as path from 'path';
export function dumpToFile<T>( export function dumpToFile<T>(
filename: string, filename: string,
data: T[], data: T[],
formatter?: (item: T, index: number) => string formatter?: (item: T, index: number) => string,
) { ) {
const filePath = path.join(__dirname, filename); const filePath = path.join(__dirname, filename);
let fileContent: string; let fileContent: string;
if (formatter) { if (formatter) {
fileContent = data.map((item, index) => formatter(item, index)).join('\n'); fileContent = data.map((item, index) => formatter(item, index)).join("\n");
} else { } else {
fileContent = data.map((item, index) => `${index + 1}. ${JSON.stringify(item)}`).join('\n'); fileContent = data
.map((item, index) => `${index + 1}. ${JSON.stringify(item)}`)
.join("\n");
} }
fs.writeFileSync(filePath, fileContent, 'utf8'); fs.writeFileSync(filePath, fileContent, "utf8");
console.log(`Dumped data to ${filename}`); console.log(`Dumped data to ${filename}`);
} }

View File

@ -1,4 +1,4 @@
import { deduplicateObjectsArray } from './deduplicate-objs-array'; import { deduplicateObjectsArray } from "./deduplicate-objs-array";
/** /**
* Convert "null" strings to actual null values for easier comparison. * Convert "null" strings to actual null values for easier comparison.
@ -25,16 +25,16 @@ function areMergeable(obj1: any, obj2: any): boolean {
const allKeys = new Set([...Object.keys(obj1), ...Object.keys(obj2)]); const allKeys = new Set([...Object.keys(obj1), ...Object.keys(obj2)]);
let matchingNonNullValues = 0; let matchingNonNullValues = 0;
let nonNullComparisons = 0; let nonNullComparisons = 0;
for (const key of allKeys) { for (const key of allKeys) {
const val1 = obj1[key]; const val1 = obj1[key];
const val2 = obj2[key]; const val2 = obj2[key];
// Skip array comparisons - they'll be merged separately // Skip array comparisons - they'll be merged separately
if (Array.isArray(val1) || Array.isArray(val2)) { if (Array.isArray(val1) || Array.isArray(val2)) {
continue; continue;
} }
// If both values exist and are not null // If both values exist and are not null
if (val1 !== null && val2 !== null) { if (val1 !== null && val2 !== null) {
nonNullComparisons++; nonNullComparisons++;
@ -43,7 +43,7 @@ function areMergeable(obj1: any, obj2: any): boolean {
} }
} }
} }
// Objects are mergeable if they have at least one matching non-null value // Objects are mergeable if they have at least one matching non-null value
// and all their non-null values match when both objects have them // and all their non-null values match when both objects have them
return nonNullComparisons > 0 && matchingNonNullValues === nonNullComparisons; return nonNullComparisons > 0 && matchingNonNullValues === nonNullComparisons;
@ -56,7 +56,10 @@ function mergeArrays(arr1: any[], arr2: any[]): any[] {
const combined = [...arr1, ...arr2]; const combined = [...arr1, ...arr2];
return combined.filter((item, index) => { return combined.filter((item, index) => {
const stringified = JSON.stringify(item); const stringified = JSON.stringify(item);
return combined.findIndex(other => JSON.stringify(other) === stringified) === index; return (
combined.findIndex((other) => JSON.stringify(other) === stringified) ===
index
);
}); });
} }
@ -78,9 +81,9 @@ function mergeObjects(obj1: any, obj2: any): any {
// If only obj2's value is an array, use it // If only obj2's value is an array, use it
result[key] = [...obj2[key]]; result[key] = [...obj2[key]];
} }
} else if (typeof obj2[key] === 'object') { } else if (typeof obj2[key] === "object") {
// If both are objects (but not arrays), merge them // If both are objects (but not arrays), merge them
if (typeof result[key] === 'object' && !Array.isArray(result[key])) { if (typeof result[key] === "object" && !Array.isArray(result[key])) {
result[key] = mergeObjects(result[key], obj2[key]); result[key] = mergeObjects(result[key], obj2[key]);
} else { } else {
result[key] = { ...obj2[key] }; result[key] = { ...obj2[key] };
@ -101,13 +104,17 @@ function mergeObjects(obj1: any, obj2: any): any {
* null-equivalent fields, filling in null fields with the corresponding * null-equivalent fields, filling in null fields with the corresponding
* non-null fields from the other object. * non-null fields from the other object.
*/ */
export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: string]: any[] } { export function mergeNullValObjs(objArray: { [key: string]: any[] }): {
[key: string]: any[];
} {
const result: { [key: string]: any[] } = {}; const result: { [key: string]: any[] } = {};
for (const key in objArray) { for (const key in objArray) {
if (Array.isArray(objArray[key])) { if (Array.isArray(objArray[key])) {
// If array contains only primitive values, return as is // If array contains only primitive values, return as is
if (objArray[key].every(item => typeof item !== 'object' || item === null)) { if (
objArray[key].every((item) => typeof item !== "object" || item === null)
) {
result[key] = [...objArray[key]]; result[key] = [...objArray[key]];
continue; continue;
} }
@ -117,7 +124,7 @@ export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: st
for (const item of items) { for (const item of items) {
let merged = false; let merged = false;
for (let i = 0; i < mergedItems.length; i++) { for (let i = 0; i < mergedItems.length; i++) {
if (areMergeable(mergedItems[i], item)) { if (areMergeable(mergedItems[i], item)) {
mergedItems[i] = mergeObjects(mergedItems[i], item); mergedItems[i] = mergeObjects(mergedItems[i], item);
@ -125,7 +132,7 @@ export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: st
break; break;
} }
} }
if (!merged) { if (!merged) {
mergedItems.push({ ...item }); mergedItems.push({ ...item });
} }
@ -134,10 +141,13 @@ export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: st
// Final deduplication pass // Final deduplication pass
result[key] = deduplicateObjectsArray({ [key]: mergedItems })[key]; result[key] = deduplicateObjectsArray({ [key]: mergedItems })[key];
} else { } else {
console.warn(`Expected an array at objArray[${key}], but found:`, objArray[key]); console.warn(
`Expected an array at objArray[${key}], but found:`,
objArray[key],
);
return objArray; return objArray;
} }
} }
return result; return result;
} }

View File

@ -1,7 +1,7 @@
export async function mixSchemaObjects( export async function mixSchemaObjects(
finalSchema: any, finalSchema: any,
singleAnswerResult: any, singleAnswerResult: any,
multiEntityResult: any multiEntityResult: any,
) { ) {
const finalResult: any = {}; const finalResult: any = {};
@ -9,14 +9,20 @@ export async function mixSchemaObjects(
function mergeResults(schema: any, singleResult: any, multiResult: any) { function mergeResults(schema: any, singleResult: any, multiResult: any) {
const result: any = {}; const result: any = {};
for (const key in schema.properties) { for (const key in schema.properties) {
if (schema.properties[key].type === 'object' && schema.properties[key].properties) { if (
schema.properties[key].type === "object" &&
schema.properties[key].properties
) {
// If the property is an object, recursively merge its properties // If the property is an object, recursively merge its properties
result[key] = mergeResults( result[key] = mergeResults(
schema.properties[key], schema.properties[key],
singleResult[key] || {}, singleResult[key] || {},
multiResult[key] || {} multiResult[key] || {},
); );
} else if (schema.properties[key].type === 'array' && Array.isArray(multiResult[key])) { } else if (
schema.properties[key].type === "array" &&
Array.isArray(multiResult[key])
) {
// If the property is an array, flatten the arrays from multiResult // If the property is an array, flatten the arrays from multiResult
result[key] = multiResult[key].flat(); result[key] = multiResult[key].flat();
} else if (singleResult.hasOwnProperty(key)) { } else if (singleResult.hasOwnProperty(key)) {
@ -29,7 +35,10 @@ export async function mixSchemaObjects(
} }
// Merge the properties from the final schema // Merge the properties from the final schema
Object.assign(finalResult, mergeResults(finalSchema, singleAnswerResult, multiEntityResult)); Object.assign(
finalResult,
mergeResults(finalSchema, singleAnswerResult, multiEntityResult),
);
return finalResult; return finalResult;
} }

View File

@ -1,4 +1,7 @@
export async function spreadSchemas(schema: any, keys: string[]): Promise<{ export async function spreadSchemas(
schema: any,
keys: string[],
): Promise<{
singleAnswerSchema: any; singleAnswerSchema: any;
multiEntitySchema: any; multiEntitySchema: any;
}> { }> {
@ -32,7 +35,7 @@ export async function spreadSchemas(schema: any, keys: string[]): Promise<{
if (Object.keys(singleAnswerSchema.properties).length === 0) { if (Object.keys(singleAnswerSchema.properties).length === 0) {
singleAnswerSchema = {}; singleAnswerSchema = {};
} }
if (Object.keys(multiEntitySchema.properties).length === 0) { if (Object.keys(multiEntitySchema.properties).length === 0) {
multiEntitySchema = {}; multiEntitySchema = {};
} }
@ -41,4 +44,4 @@ export async function spreadSchemas(schema: any, keys: string[]): Promise<{
singleAnswerSchema, singleAnswerSchema,
multiEntitySchema, multiEntitySchema,
}; };
} }

View File

@ -1,21 +1,21 @@
import isEqual from 'lodash/isEqual'; import isEqual from "lodash/isEqual";
export function transformArrayToObject( export function transformArrayToObject(
originalSchema: any, originalSchema: any,
arrayData: any[] arrayData: any[],
): any { ): any {
if (Object.keys(originalSchema).length == 0) { if (Object.keys(originalSchema).length == 0) {
return {}; return {};
} }
const transformedResult: any = {}; const transformedResult: any = {};
// Function to find the array key in a nested schema // Function to find the array key in a nested schema
function findArrayKey(schema: any): string | null { function findArrayKey(schema: any): string | null {
for (const key in schema.properties) { for (const key in schema.properties) {
if (schema.properties[key].type === 'array') { if (schema.properties[key].type === "array") {
return key; return key;
} else if (schema.properties[key].type === 'object') { } else if (schema.properties[key].type === "object") {
const nestedKey = findArrayKey(schema.properties[key]); const nestedKey = findArrayKey(schema.properties[key]);
if (nestedKey) { if (nestedKey) {
return `${key}.${nestedKey}`; return `${key}.${nestedKey}`;
@ -31,7 +31,10 @@ export function transformArrayToObject(
for (const key in item) { for (const key in item) {
if (!acc[key]) { if (!acc[key]) {
acc[key] = item[key]; acc[key] = item[key];
} else if (typeof acc[key] === 'object' && typeof item[key] === 'object') { } else if (
typeof acc[key] === "object" &&
typeof item[key] === "object"
) {
acc[key] = { ...acc[key], ...item[key] }; acc[key] = { ...acc[key], ...item[key] };
} }
} }
@ -39,13 +42,16 @@ export function transformArrayToObject(
}, {}); }, {});
} }
const arrayKeyParts = arrayKeyPath.split('.'); const arrayKeyParts = arrayKeyPath.split(".");
const arrayKey = arrayKeyParts.pop(); const arrayKey = arrayKeyParts.pop();
if (!arrayKey) { if (!arrayKey) {
throw new Error("Array key not found in schema"); throw new Error("Array key not found in schema");
} }
const parentSchema = arrayKeyParts.reduce((schema, key) => schema.properties[key], originalSchema); const parentSchema = arrayKeyParts.reduce(
(schema, key) => schema.properties[key],
originalSchema,
);
const itemSchema = parentSchema.properties[arrayKey].items; const itemSchema = parentSchema.properties[arrayKey].items;
if (!itemSchema) { if (!itemSchema) {
throw new Error("Item schema not found for array key"); throw new Error("Item schema not found for array key");
@ -53,7 +59,7 @@ export function transformArrayToObject(
// Initialize the array in the transformed result // Initialize the array in the transformed result
let currentLevel = transformedResult; let currentLevel = transformedResult;
arrayKeyParts.forEach(part => { arrayKeyParts.forEach((part) => {
if (!currentLevel[part]) { if (!currentLevel[part]) {
currentLevel[part] = {}; currentLevel[part] = {};
} }
@ -63,20 +69,23 @@ export function transformArrayToObject(
// Helper function to check if an object is already in the array // Helper function to check if an object is already in the array
function isDuplicateObject(array: any[], obj: any): boolean { function isDuplicateObject(array: any[], obj: any): boolean {
return array.some(existingItem => isEqual(existingItem, obj)); return array.some((existingItem) => isEqual(existingItem, obj));
} }
// Helper function to validate if an object follows the schema // Helper function to validate if an object follows the schema
function isValidObject(obj: any, schema: any): boolean { function isValidObject(obj: any, schema: any): boolean {
return Object.keys(schema.properties).every(key => { return Object.keys(schema.properties).every((key) => {
return obj.hasOwnProperty(key) && typeof obj[key] === schema.properties[key].type; return (
obj.hasOwnProperty(key) &&
typeof obj[key] === schema.properties[key].type
);
}); });
} }
// Iterate over each item in the arrayData // Iterate over each item in the arrayData
arrayData.forEach(item => { arrayData.forEach((item) => {
let currentItem = item; let currentItem = item;
arrayKeyParts.forEach(part => { arrayKeyParts.forEach((part) => {
if (currentItem[part]) { if (currentItem[part]) {
currentItem = currentItem[part]; currentItem = currentItem[part];
} }
@ -84,43 +93,63 @@ export function transformArrayToObject(
// Copy non-array properties from the parent object // Copy non-array properties from the parent object
for (const key in parentSchema.properties) { for (const key in parentSchema.properties) {
if (key !== arrayKey && currentItem.hasOwnProperty(key) && !currentLevel.hasOwnProperty(key)) { if (
key !== arrayKey &&
currentItem.hasOwnProperty(key) &&
!currentLevel.hasOwnProperty(key)
) {
currentLevel[key] = currentItem[key]; currentLevel[key] = currentItem[key];
} }
} }
// Ensure that the currentItem[arrayKey] is an array before mapping // Ensure that the currentItem[arrayKey] is an array before mapping
if (Array.isArray(currentItem[arrayKey])) { if (Array.isArray(currentItem[arrayKey])) {
currentItem[arrayKey].forEach((subItem: any) => { currentItem[arrayKey].forEach((subItem: any) => {
if (typeof subItem === 'object' && subItem !== null && isValidObject(subItem, itemSchema)) { if (
// For arrays of objects, add only unique objects typeof subItem === "object" &&
const transformedItem: any = {}; subItem !== null &&
let hasValidData = false; isValidObject(subItem, itemSchema)
) {
// For arrays of objects, add only unique objects
const transformedItem: any = {};
let hasValidData = false;
for (const key in itemSchema.properties) { for (const key in itemSchema.properties) {
if (subItem.hasOwnProperty(key) && subItem[key] !== undefined) { if (subItem.hasOwnProperty(key) && subItem[key] !== undefined) {
transformedItem[key] = subItem[key]; transformedItem[key] = subItem[key];
hasValidData = true; hasValidData = true;
}
}
if (
hasValidData &&
!isDuplicateObject(currentLevel[arrayKey], transformedItem)
) {
currentLevel[arrayKey].push(transformedItem);
} }
} }
});
if (hasValidData && !isDuplicateObject(currentLevel[arrayKey], transformedItem)) { } else {
currentLevel[arrayKey].push(transformedItem); console.warn(
} `Expected an array at ${arrayKey}, but found:`,
} currentItem[arrayKey],
}); );
} else { }
console.warn(`Expected an array at ${arrayKey}, but found:`, currentItem[arrayKey]);
}
// Handle merging of array properties // Handle merging of array properties
for (const key in parentSchema.properties) { for (const key in parentSchema.properties) {
if (parentSchema.properties[key].type === 'array' && Array.isArray(currentItem[key])) { if (
parentSchema.properties[key].type === "array" &&
Array.isArray(currentItem[key])
) {
if (!currentLevel[key]) { if (!currentLevel[key]) {
currentLevel[key] = []; currentLevel[key] = [];
} }
currentItem[key].forEach((value: any) => { currentItem[key].forEach((value: any) => {
if (!currentLevel[key].includes(value) && !isDuplicateObject(currentLevel[arrayKey], value)) { if (
!currentLevel[key].includes(value) &&
!isDuplicateObject(currentLevel[arrayKey], value)
) {
currentLevel[key].push(value); currentLevel[key].push(value);
} }
}); });
@ -129,4 +158,4 @@ export function transformArrayToObject(
}); });
return transformedResult; return transformedResult;
} }

View File

@ -91,7 +91,8 @@ export async function indexPage({
url: normalizedUrl, url: normalizedUrl,
originUrl: normalizeUrl(originUrl), originUrl: normalizeUrl(originUrl),
title: document.metadata.title ?? document.metadata.ogTitle ?? "", title: document.metadata.title ?? document.metadata.ogTitle ?? "",
description: document.metadata.description ?? document.metadata.ogDescription ?? "", description:
document.metadata.description ?? document.metadata.ogDescription ?? "",
crawlId, crawlId,
teamId, teamId,
markdown: trimmedMarkdown, markdown: trimmedMarkdown,
@ -126,7 +127,7 @@ export async function indexPage({
export async function searchSimilarPages( export async function searchSimilarPages(
query: string, query: string,
originUrl?: string, originUrl?: string,
limit: number = 1000 limit: number = 1000,
): Promise<any[]> { ): Promise<any[]> {
try { try {
const index = pinecone.index(INDEX_NAME); const index = pinecone.index(INDEX_NAME);

View File

@ -59,7 +59,7 @@ export async function rerankLinks(
const linksAndScores = await performRanking( const linksAndScores = await performRanking(
mappedLinksRerank, mappedLinksRerank,
mappedLinks.map((l) => l.url), mappedLinks.map((l) => l.url),
searchQuery searchQuery,
); );
// First try with high threshold // First try with high threshold
@ -109,8 +109,11 @@ export async function rerankLinks(
} }
}); });
const rankedLinks = filteredLinks.slice(0, extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE); const rankedLinks = filteredLinks.slice(
0,
extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE,
);
// Mark URLs that will be used in completion // Mark URLs that will be used in completion
rankedLinks.forEach((link) => { rankedLinks.forEach((link) => {
const trace = urlTraces.find((t) => t.url === link.url); const trace = urlTraces.find((t) => t.url === link.url);
@ -120,13 +123,15 @@ export async function rerankLinks(
}); });
// Mark URLs that were dropped due to ranking limit // Mark URLs that were dropped due to ranking limit
filteredLinks.slice(extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE).forEach(link => { filteredLinks
const trace = urlTraces.find(t => t.url === link.url); .slice(extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE)
if (trace) { .forEach((link) => {
trace.warning = "Excluded due to ranking limit"; const trace = urlTraces.find((t) => t.url === link.url);
trace.usedInCompletion = false; if (trace) {
} trace.warning = "Excluded due to ranking limit";
}); trace.usedInCompletion = false;
}
});
// console.log("Reranked links: ", rankedLinks.length); // console.log("Reranked links: ", rankedLinks.length);
@ -155,7 +160,7 @@ function filterAndProcessLinks(
export type RerankerResult = { export type RerankerResult = {
mapDocument: MapDocument[]; mapDocument: MapDocument[];
tokensUsed: number; tokensUsed: number;
} };
export async function rerankLinksWithLLM( export async function rerankLinksWithLLM(
mappedLinks: MapDocument[], mappedLinks: MapDocument[],
@ -167,7 +172,7 @@ export async function rerankLinksWithLLM(
const TIMEOUT_MS = 20000; const TIMEOUT_MS = 20000;
const MAX_RETRIES = 2; const MAX_RETRIES = 2;
let totalTokensUsed = 0; let totalTokensUsed = 0;
// Split mappedLinks into chunks of 200 // Split mappedLinks into chunks of 200
for (let i = 0; i < mappedLinks.length; i += chunkSize) { for (let i = 0; i < mappedLinks.length; i += chunkSize) {
chunks.push(mappedLinks.slice(i, i + chunkSize)); chunks.push(mappedLinks.slice(i, i + chunkSize));
@ -184,23 +189,25 @@ export async function rerankLinksWithLLM(
type: "object", type: "object",
properties: { properties: {
url: { type: "string" }, url: { type: "string" },
relevanceScore: { type: "number" } relevanceScore: { type: "number" },
}, },
required: ["url", "relevanceScore"] required: ["url", "relevanceScore"],
} },
} },
}, },
required: ["relevantLinks"] required: ["relevantLinks"],
}; };
const results = await Promise.all( const results = await Promise.all(
chunks.map(async (chunk, chunkIndex) => { chunks.map(async (chunk, chunkIndex) => {
// console.log(`Processing chunk ${chunkIndex + 1}/${chunks.length} with ${chunk.length} links`); // console.log(`Processing chunk ${chunkIndex + 1}/${chunks.length} with ${chunk.length} links`);
const linksContent = chunk.map(link => const linksContent = chunk
`URL: ${link.url}${link.title ? `\nTitle: ${link.title}` : ''}${link.description ? `\nDescription: ${link.description}` : ''}` .map(
).join("\n\n"); (link) =>
`URL: ${link.url}${link.title ? `\nTitle: ${link.title}` : ""}${link.description ? `\nDescription: ${link.description}` : ""}`,
)
.join("\n\n");
for (let retry = 0; retry <= MAX_RETRIES; retry++) { for (let retry = 0; retry <= MAX_RETRIES; retry++) {
try { try {
@ -208,22 +215,28 @@ export async function rerankLinksWithLLM(
setTimeout(() => resolve(null), TIMEOUT_MS); setTimeout(() => resolve(null), TIMEOUT_MS);
}); });
const completionPromise = generateOpenAICompletions( const completionPromise = generateOpenAICompletions(
logger.child({ method: "rerankLinksWithLLM", chunk: chunkIndex + 1, retry }), logger.child({
method: "rerankLinksWithLLM",
chunk: chunkIndex + 1,
retry,
}),
{ {
mode: "llm", mode: "llm",
systemPrompt: buildRerankerSystemPrompt(), systemPrompt: buildRerankerSystemPrompt(),
prompt: buildRerankerUserPrompt(searchQuery), prompt: buildRerankerUserPrompt(searchQuery),
schema: schema schema: schema,
}, },
linksContent, linksContent,
undefined, undefined,
true true,
); );
const completion = await Promise.race([completionPromise, timeoutPromise]); const completion = await Promise.race([
completionPromise,
timeoutPromise,
]);
if (!completion) { if (!completion) {
// console.log(`Chunk ${chunkIndex + 1}: Timeout on attempt ${retry + 1}`); // console.log(`Chunk ${chunkIndex + 1}: Timeout on attempt ${retry + 1}`);
continue; continue;
@ -237,9 +250,11 @@ export async function rerankLinksWithLLM(
totalTokensUsed += completion.numTokens || 0; totalTokensUsed += completion.numTokens || 0;
// console.log(`Chunk ${chunkIndex + 1}: Found ${completion.extract.relevantLinks.length} relevant links`); // console.log(`Chunk ${chunkIndex + 1}: Found ${completion.extract.relevantLinks.length} relevant links`);
return completion.extract.relevantLinks; return completion.extract.relevantLinks;
} catch (error) { } catch (error) {
console.warn(`Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`, error); console.warn(
`Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`,
error,
);
if (retry === MAX_RETRIES) { if (retry === MAX_RETRIES) {
// console.log(`Chunk ${chunkIndex + 1}: Max retries reached, returning empty array`); // console.log(`Chunk ${chunkIndex + 1}: Max retries reached, returning empty array`);
return []; return [];
@ -247,18 +262,20 @@ export async function rerankLinksWithLLM(
} }
} }
return []; return [];
}) }),
); );
// console.log(`Processed ${results.length} chunks`); // console.log(`Processed ${results.length} chunks`);
// Flatten results and sort by relevance score // Flatten results and sort by relevance score
const flattenedResults = results.flat().sort((a, b) => b.relevanceScore - a.relevanceScore); const flattenedResults = results
.flat()
.sort((a, b) => b.relevanceScore - a.relevanceScore);
// console.log(`Total relevant links found: ${flattenedResults.length}`); // console.log(`Total relevant links found: ${flattenedResults.length}`);
// Map back to MapDocument format, keeping only relevant links // Map back to MapDocument format, keeping only relevant links
const relevantLinks = flattenedResults const relevantLinks = flattenedResults
.map(result => mappedLinks.find(link => link.url === result.url)) .map((result) => mappedLinks.find((link) => link.url === result.url))
.filter((link): link is MapDocument => link !== undefined); .filter((link): link is MapDocument => link !== undefined);
// console.log(`Returning ${relevantLinks.length} relevant links`); // console.log(`Returning ${relevantLinks.length} relevant links`);

View File

@ -184,8 +184,6 @@ export async function processUrl(
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}` // (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
// ); // );
const rerankerResult = await rerankLinksWithLLM( const rerankerResult = await rerankLinksWithLLM(
mappedLinks, mappedLinks,
rephrasedPrompt, rephrasedPrompt,

View File

@ -12,7 +12,9 @@ const tokenPerCharacter = 4;
const baseTokenCost = 300; const baseTokenCost = 300;
export function calculateFinalResultCost(data: any): number { export function calculateFinalResultCost(data: any): number {
return Math.floor((JSON.stringify(data).length / tokenPerCharacter) + baseTokenCost); return Math.floor(
JSON.stringify(data).length / tokenPerCharacter + baseTokenCost,
);
} }
export function estimateTotalCost(tokenUsage: TokenUsage[]): number { export function estimateTotalCost(tokenUsage: TokenUsage[]): number {

File diff suppressed because it is too large Load Diff

View File

@ -55,9 +55,9 @@ async function performRanking(
// Generate embeddings for each link and calculate similarity in parallel // Generate embeddings for each link and calculate similarity in parallel
const linksAndScores = await Promise.all( const linksAndScores = await Promise.all(
linksWithContext.map((linkWithContext, index) => linksWithContext.map((linkWithContext, index) =>
getEmbedding(linkWithContext) getEmbedding(linkWithContext)
.then(linkEmbedding => { .then((linkEmbedding) => {
const score = cosineSimilarity(queryEmbedding, linkEmbedding); const score = cosineSimilarity(queryEmbedding, linkEmbedding);
return { return {
link: links[index], link: links[index],
@ -71,8 +71,8 @@ async function performRanking(
linkWithContext, linkWithContext,
score: 0, score: 0,
originalIndex: index, originalIndex: index,
})) })),
) ),
); );
// Sort links based on similarity scores while preserving original order for equal scores // Sort links based on similarity scores while preserving original order for equal scores

View File

@ -252,20 +252,19 @@ export class WebCrawler {
}; };
const timeoutPromise = new Promise((_, reject) => { const timeoutPromise = new Promise((_, reject) => {
setTimeout(() => reject(new Error('Sitemap fetch timeout')), timeout); setTimeout(() => reject(new Error("Sitemap fetch timeout")), timeout);
}); });
try { try {
let count = await Promise.race([ let count = (await Promise.race([
Promise.all([ Promise.all([
this.tryFetchSitemapLinks( this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler),
this.initialUrl, ...this.robots
_urlsHandler, .getSitemaps()
), .map((x) => this.tryFetchSitemapLinks(x, _urlsHandler)),
...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)), ]).then((results) => results.reduce((a, x) => a + x, 0)),
]).then(results => results.reduce((a,x) => a+x, 0)), timeoutPromise,
timeoutPromise ])) as number;
]) as number;
if (count > 0) { if (count > 0) {
if ( if (
@ -281,14 +280,14 @@ export class WebCrawler {
return count; return count;
} catch (error) { } catch (error) {
if (error.message === 'Sitemap fetch timeout') { if (error.message === "Sitemap fetch timeout") {
this.logger.warn('Sitemap fetch timed out', { this.logger.warn("Sitemap fetch timed out", {
method: "tryGetSitemap", method: "tryGetSitemap",
timeout, timeout,
}); });
return 0; return 0;
} }
this.logger.error('Error fetching sitemap', { this.logger.error("Error fetching sitemap", {
method: "tryGetSitemap", method: "tryGetSitemap",
error, error,
}); });
@ -328,9 +327,16 @@ export class WebCrawler {
!this.matchesExcludes(path) && !this.matchesExcludes(path) &&
!this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt) !this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
) { ) {
(async() => { (async () => {
await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl); await redisConnection.sadd(
await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX"); "crawl:" + this.jobId + ":robots_blocked",
fullUrl,
);
await redisConnection.expire(
"crawl:" + this.jobId + ":robots_blocked",
24 * 60 * 60,
"NX",
);
})(); })();
} }
} else { } else {

View File

@ -1,5 +1,8 @@
import { logger } from "../../lib/logger"; import { logger } from "../../lib/logger";
import { normalizeUrl, normalizeUrlOnlyHostname } from "../../lib/canonical-url"; import {
normalizeUrl,
normalizeUrlOnlyHostname,
} from "../../lib/canonical-url";
import { supabase_service } from "../../services/supabase"; import { supabase_service } from "../../services/supabase";
/** /**
@ -28,13 +31,19 @@ async function querySitemapIndexFunction(url: string) {
return { urls: [], lastUpdated: new Date(0) }; return { urls: [], lastUpdated: new Date(0) };
} }
const allUrls = [...new Set(data.map((entry) => entry.urls).flat().map(url => normalizeUrl(url)))]; const allUrls = [
...new Set(
data
.map((entry) => entry.urls)
.flat()
.map((url) => normalizeUrl(url)),
),
];
return { urls: allUrls, lastUpdated: data[0].updated_at }; return { urls: allUrls, lastUpdated: data[0].updated_at };
} catch (error) { } catch (error) {
logger.error("(sitemap-index) Error querying the index", { logger.error("(sitemap-index) Error querying the index", {
error, error,
attempt attempt,
}); });
if (attempt === 3) { if (attempt === 3) {
@ -46,4 +55,7 @@ async function querySitemapIndexFunction(url: string) {
return { urls: [], lastUpdated: new Date(0) }; return { urls: [], lastUpdated: new Date(0) };
} }
export const querySitemapIndex = withAuth(querySitemapIndexFunction, { urls: [], lastUpdated: new Date(0) }); export const querySitemapIndex = withAuth(querySitemapIndexFunction, {
urls: [],
lastUpdated: new Date(0),
});

View File

@ -24,55 +24,79 @@ export async function getLinksFromSitemap(
try { try {
if (mode === "fire-engine" && useFireEngine) { if (mode === "fire-engine" && useFireEngine) {
const fetchResponse = await scrapeURL( const fetchResponse = await scrapeURL(
"sitemap", "sitemap",
sitemapUrl, sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }), scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fetch" }, { forceEngine: "fetch" },
); );
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) { if (
fetchResponse.success &&
fetchResponse.document.metadata.statusCode >= 200 &&
fetchResponse.document.metadata.statusCode < 300
) {
content = fetchResponse.document.rawHtml!; content = fetchResponse.document.rawHtml!;
} else { } else {
logger.debug( logger.debug(
"Failed to scrape sitemap via fetch, falling back to TLSClient...", "Failed to scrape sitemap via fetch, falling back to TLSClient...",
{ error: fetchResponse.success ? fetchResponse.document : fetchResponse.error }, {
error: fetchResponse.success
? fetchResponse.document
: fetchResponse.error,
},
); );
const tlsResponse = await scrapeURL( const tlsResponse = await scrapeURL(
"sitemap", "sitemap",
sitemapUrl, sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }), scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }, { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
); );
if (tlsResponse.success && (tlsResponse.document.metadata.statusCode >= 200 && tlsResponse.document.metadata.statusCode < 300)) { if (
tlsResponse.success &&
tlsResponse.document.metadata.statusCode >= 200 &&
tlsResponse.document.metadata.statusCode < 300
) {
content = tlsResponse.document.rawHtml!; content = tlsResponse.document.rawHtml!;
} else { } else {
logger.error(`Request failed for ${sitemapUrl}, ran out of engines!`, { logger.error(
method: "getLinksFromSitemap", `Request failed for ${sitemapUrl}, ran out of engines!`,
mode, {
sitemapUrl, method: "getLinksFromSitemap",
error: tlsResponse.success ? tlsResponse.document : tlsResponse.error, mode,
}); sitemapUrl,
error: tlsResponse.success
? tlsResponse.document
: tlsResponse.error,
},
);
return 0; return 0;
} }
} }
} else { } else {
const fetchResponse = await scrapeURL( const fetchResponse = await scrapeURL(
"sitemap", "sitemap",
sitemapUrl, sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }), scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fetch" }, { forceEngine: "fetch" },
); );
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) { if (
fetchResponse.success &&
fetchResponse.document.metadata.statusCode >= 200 &&
fetchResponse.document.metadata.statusCode < 300
) {
content = fetchResponse.document.rawHtml!; content = fetchResponse.document.rawHtml!;
} else { } else {
logger.error(`Request failed for ${sitemapUrl}, ran out of engines!`, { logger.error(
method: "getLinksFromSitemap", `Request failed for ${sitemapUrl}, ran out of engines!`,
mode, {
sitemapUrl, method: "getLinksFromSitemap",
}); mode,
sitemapUrl,
},
);
return 0; return 0;
} }
} }
@ -165,13 +189,20 @@ export const fetchSitemapData = async (
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`; const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
try { try {
const fetchResponse = await scrapeURL( const fetchResponse = await scrapeURL(
"sitemap", "sitemap",
sitemapUrl, sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"], timeout: timeout || axiosTimeout }), scrapeOptions.parse({
formats: ["rawHtml"],
timeout: timeout || axiosTimeout,
}),
{ forceEngine: "fetch" }, { forceEngine: "fetch" },
); );
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) { if (
fetchResponse.success &&
fetchResponse.document.metadata.statusCode >= 200 &&
fetchResponse.document.metadata.statusCode < 300
) {
const xml = fetchResponse.document.rawHtml!; const xml = fetchResponse.document.rawHtml!;
const parsedXml = await parseStringPromise(xml); const parsedXml = await parseStringPromise(xml);

View File

@ -17,7 +17,6 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
throw new EngineError("Cache hit but HTML is too short to be useful"); throw new EngineError("Cache hit but HTML is too short to be useful");
} }
// Set fromCache flag to indicate this document was retrieved from cache // Set fromCache flag to indicate this document was retrieved from cache
meta.internalOptions.fromCache = true; meta.internalOptions.fromCache = true;

View File

@ -3,7 +3,12 @@ import * as Sentry from "@sentry/node";
import { z } from "zod"; import { z } from "zod";
import { robustFetch } from "../../lib/fetch"; import { robustFetch } from "../../lib/fetch";
import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error"; import {
ActionError,
EngineError,
SiteError,
UnsupportedFileError,
} from "../../error";
import { MockState } from "../../lib/mock"; import { MockState } from "../../lib/mock";
const successSchema = z.object({ const successSchema = z.object({

View File

@ -4,7 +4,11 @@ import * as Sentry from "@sentry/node";
import { robustFetch } from "../../lib/fetch"; import { robustFetch } from "../../lib/fetch";
import { MockState } from "../../lib/mock"; import { MockState } from "../../lib/mock";
export async function fireEngineDelete(logger: Logger, jobId: string, mock: MockState | null) { export async function fireEngineDelete(
logger: Logger,
jobId: string,
mock: MockState | null,
) {
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
await Sentry.startSpan( await Sentry.startSpan(

View File

@ -143,7 +143,10 @@ async function buildMetaObject(
logger, logger,
logs, logs,
featureFlags: buildFeatureFlags(url, options, internalOptions), featureFlags: buildFeatureFlags(url, options, internalOptions),
mock: options.useMock !== undefined ? await loadMock(options.useMock, _logger) : null, mock:
options.useMock !== undefined
? await loadMock(options.useMock, _logger)
: null,
}; };
} }

View File

@ -34,7 +34,7 @@ export async function robustFetch<
requestId = crypto.randomUUID(), requestId = crypto.randomUUID(),
tryCount = 1, tryCount = 1,
tryCooldown, tryCooldown,
mock mock,
}: RobustFetchParams<Schema>): Promise<Output> { }: RobustFetchParams<Schema>): Promise<Output> {
const params = { const params = {
url, url,
@ -51,8 +51,8 @@ export async function robustFetch<
let response: { let response: {
status: number; status: number;
headers: Headers, headers: Headers;
body: string, body: string;
}; };
if (mock === null) { if (mock === null) {
@ -123,25 +123,33 @@ export async function robustFetch<
return null as Output; return null as Output;
} }
const makeRequestTypeId = (request: typeof mock["requests"][number]["options"]) => { const makeRequestTypeId = (
request: (typeof mock)["requests"][number]["options"],
) => {
let out = request.url + ";" + request.method; let out = request.url + ";" + request.method;
if (process.env.FIRE_ENGINE_BETA_URL && url.startsWith(process.env.FIRE_ENGINE_BETA_URL) && request.method === "POST") { if (
process.env.FIRE_ENGINE_BETA_URL &&
url.startsWith(process.env.FIRE_ENGINE_BETA_URL) &&
request.method === "POST"
) {
out += "f-e;" + request.body?.engine + ";" + request.body?.url; out += "f-e;" + request.body?.engine + ";" + request.body?.url;
} }
return out; return out;
} };
const thisId = makeRequestTypeId(params); const thisId = makeRequestTypeId(params);
const matchingMocks = mock.requests.filter(x => makeRequestTypeId(x.options) === thisId).sort((a,b) => a.time - b.time); const matchingMocks = mock.requests
.filter((x) => makeRequestTypeId(x.options) === thisId)
.sort((a, b) => a.time - b.time);
const nextI = mock.tracker[thisId] ?? 0; const nextI = mock.tracker[thisId] ?? 0;
mock.tracker[thisId] = nextI + 1; mock.tracker[thisId] = nextI + 1;
if (!matchingMocks[nextI]) { if (!matchingMocks[nextI]) {
throw new Error("Failed to mock request -- no mock targets found."); throw new Error("Failed to mock request -- no mock targets found.");
} }
response = { response = {
...(matchingMocks[nextI].result), ...matchingMocks[nextI].result,
headers: new Headers(matchingMocks[nextI].result.headers), headers: new Headers(matchingMocks[nextI].result.headers),
}; };
} }
@ -180,12 +188,15 @@ export async function robustFetch<
} }
if (mock === null) { if (mock === null) {
await saveMock({ await saveMock(
...params, {
logger: undefined, ...params,
schema: undefined, logger: undefined,
headers: undefined, schema: undefined,
}, response); headers: undefined,
},
response,
);
} }
let data: Output; let data: Output;

View File

@ -6,55 +6,70 @@ const saveMocksDirPath = path.join(__dirname, "../mocks/").replace("dist/", "");
const loadMocksDirPath = path.join(__dirname, "../../../__tests__/snips/mocks"); const loadMocksDirPath = path.join(__dirname, "../../../__tests__/snips/mocks");
export async function saveMock(options: unknown, result: unknown) { export async function saveMock(options: unknown, result: unknown) {
if (process.env.FIRECRAWL_SAVE_MOCKS !== "true") return; if (process.env.FIRECRAWL_SAVE_MOCKS !== "true") return;
await fs.mkdir(saveMocksDirPath, { recursive: true }); await fs.mkdir(saveMocksDirPath, { recursive: true });
const fileName = Date.now() + "-" + crypto.randomUUID() + ".json"; const fileName = Date.now() + "-" + crypto.randomUUID() + ".json";
const filePath = path.join(saveMocksDirPath, fileName); const filePath = path.join(saveMocksDirPath, fileName);
console.log(filePath); console.log(filePath);
await fs.writeFile(filePath, JSON.stringify({ await fs.writeFile(
filePath,
JSON.stringify(
{
time: Date.now(), time: Date.now(),
options, options,
result, result,
}, undefined, 4)); },
undefined,
4,
),
);
} }
export type MockState = { export type MockState = {
requests: { requests: {
time: number, time: number;
options: { options: {
url: string, url: string;
method: string, method: string;
body?: any, body?: any;
ignoreResponse: boolean, ignoreResponse: boolean;
ignoreFailure: boolean, ignoreFailure: boolean;
tryCount: number, tryCount: number;
tryCooldown?: number, tryCooldown?: number;
}, };
result: any, result: any;
}[], }[];
tracker: Record<string, number>, tracker: Record<string, number>;
} };
export async function loadMock(name: string, logger: Logger = _logger): Promise<MockState | null> { export async function loadMock(
try { name: string,
const mockPath = path.join(loadMocksDirPath, name + ".json"); logger: Logger = _logger,
): Promise<MockState | null> {
try {
const mockPath = path.join(loadMocksDirPath, name + ".json");
const relative = path.relative(loadMocksDirPath, mockPath); const relative = path.relative(loadMocksDirPath, mockPath);
if (!relative || relative.startsWith("..") || path.isAbsolute(relative)) { if (!relative || relative.startsWith("..") || path.isAbsolute(relative)) {
// directory moving // directory moving
return null; return null;
}
const load = JSON.parse(await fs.readFile(mockPath, "utf8"));
return {
requests: load,
tracker: {},
};
} catch (error) {
logger.warn("Failed to load mock file!", { name, module: "scrapeURL:mock", method: "loadMock", error });
return null;
} }
const load = JSON.parse(await fs.readFile(mockPath, "utf8"));
return {
requests: load,
tracker: {},
};
} catch (error) {
logger.warn("Failed to load mock file!", {
name,
module: "scrapeURL:mock",
method: "loadMock",
error,
});
return null;
}
} }

View File

@ -119,16 +119,16 @@ export const htmlTransform = (
// always return biggest image // always return biggest image
soup("img[srcset]").each((_, el) => { soup("img[srcset]").each((_, el) => {
const sizes = el.attribs.srcset.split(",").map(x => { const sizes = el.attribs.srcset.split(",").map((x) => {
const tok = x.trim().split(" "); const tok = x.trim().split(" ");
return { return {
url: tok[0], url: tok[0],
size: parseInt((tok[1] ?? "1x").slice(0, -1), 10), size: parseInt((tok[1] ?? "1x").slice(0, -1), 10),
isX: (tok[1] ?? "").endsWith("x") isX: (tok[1] ?? "").endsWith("x"),
}; };
}); });
if (sizes.every(x => x.isX) && el.attribs.src) { if (sizes.every((x) => x.isX) && el.attribs.src) {
sizes.push({ sizes.push({
url: el.attribs.src, url: el.attribs.src,
size: 1, size: 1,
@ -136,7 +136,7 @@ export const htmlTransform = (
}); });
} }
sizes.sort((a,b) => b.size - a.size); sizes.sort((a, b) => b.size - a.size);
el.attribs.src = sizes[0]?.url; el.attribs.src = sizes[0]?.url;
}); });

View File

@ -41,7 +41,11 @@ export function deriveHTMLFromRawHTML(
); );
} }
document.html = htmlTransform(document.rawHtml, document.metadata.url ?? document.metadata.sourceURL ?? meta.url, meta.options); document.html = htmlTransform(
document.rawHtml,
document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
meta.options,
);
return document; return document;
} }

View File

@ -1,7 +1,11 @@
import OpenAI from "openai"; import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken"; import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken"; import { TiktokenModel } from "@dqbd/tiktoken";
import { Document, ExtractOptions, TokenUsage } from "../../../controllers/v1/types"; import {
Document,
ExtractOptions,
TokenUsage,
} from "../../../controllers/v1/types";
import { Logger } from "winston"; import { Logger } from "winston";
import { EngineResultsTracker, Meta } from ".."; import { EngineResultsTracker, Meta } from "..";
import { logger } from "../../../lib/logger"; import { logger } from "../../../lib/logger";
@ -72,14 +76,20 @@ export async function generateOpenAICompletions(
markdown?: string, markdown?: string,
previousWarning?: string, previousWarning?: string,
isExtractEndpoint?: boolean, isExtractEndpoint?: boolean,
model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini", model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ??
): Promise<{ extract: any; numTokens: number; warning: string | undefined; totalUsage: TokenUsage, model: string }> { "gpt-4o-mini",
): Promise<{
extract: any;
numTokens: number;
warning: string | undefined;
totalUsage: TokenUsage;
model: string;
}> {
let extract: any; let extract: any;
let warning: string | undefined; let warning: string | undefined;
const openai = new OpenAI(); const openai = new OpenAI();
if (markdown === undefined) { if (markdown === undefined) {
throw new Error("document.markdown is undefined -- this is unexpected"); throw new Error("document.markdown is undefined -- this is unexpected");
} }
@ -208,8 +218,8 @@ export async function generateOpenAICompletions(
} }
} }
const promptTokens = (jsonCompletion.usage?.prompt_tokens ?? 0); const promptTokens = jsonCompletion.usage?.prompt_tokens ?? 0;
const completionTokens = (jsonCompletion.usage?.completion_tokens ?? 0); const completionTokens = jsonCompletion.usage?.completion_tokens ?? 0;
// If the users actually wants the items object, they can specify it as 'required' in the schema // If the users actually wants the items object, they can specify it as 'required' in the schema
// otherwise, we just return the items array // otherwise, we just return the items array
@ -222,7 +232,17 @@ export async function generateOpenAICompletions(
} }
// num tokens (just user prompt tokenized) | deprecated // num tokens (just user prompt tokenized) | deprecated
// totalTokens = promptTokens + completionTokens // totalTokens = promptTokens + completionTokens
return { extract, warning, numTokens, totalUsage: { promptTokens, completionTokens, totalTokens: promptTokens + completionTokens }, model }; return {
extract,
warning,
numTokens,
totalUsage: {
promptTokens,
completionTokens,
totalTokens: promptTokens + completionTokens,
},
model,
};
} }
export async function performLLMExtract( export async function performLLMExtract(
@ -238,7 +258,7 @@ export async function performLLMExtract(
document.markdown, document.markdown,
document.warning, document.warning,
); );
if (meta.options.formats.includes("json")) { if (meta.options.formats.includes("json")) {
document.json = extract; document.json = extract;
} else { } else {

View File

@ -32,7 +32,7 @@ export async function autoCharge(
const resource = `auto-recharge:${chunk.team_id}`; const resource = `auto-recharge:${chunk.team_id}`;
const cooldownKey = `auto-recharge-cooldown:${chunk.team_id}`; const cooldownKey = `auto-recharge-cooldown:${chunk.team_id}`;
if(chunk.team_id === "285bb597-6eaf-4b96-801c-51461fc3c543"){ if (chunk.team_id === "285bb597-6eaf-4b96-801c-51461fc3c543") {
return { return {
success: false, success: false,
message: "Auto-recharge failed", message: "Auto-recharge failed",

View File

@ -107,15 +107,15 @@ async function processBatch() {
// Keep most recent entry and mark others for deletion // Keep most recent entry and mark others for deletion
const [mostRecent, ...duplicates] = existingForOrigin; const [mostRecent, ...duplicates] = existingForOrigin;
if (duplicates.length > 0) { if (duplicates.length > 0) {
duplicatesToDelete.push(...duplicates.map(d => d.id)); duplicatesToDelete.push(...duplicates.map((d) => d.id));
} }
// Merge and deduplicate URLs // Merge and deduplicate URLs
const mergedUrls = [ const mergedUrls = [
...new Set([ ...new Set([
...mostRecent.urls, ...mostRecent.urls,
...op.standardizedUrls.map(url => normalizeUrl(url)) ...op.standardizedUrls.map((url) => normalizeUrl(url)),
]) ]),
]; ];
updates.push({ updates.push({
@ -127,7 +127,9 @@ async function processBatch() {
}); });
} else { } else {
// Prepare insert with deduplicated URLs // Prepare insert with deduplicated URLs
const deduplicatedUrls = [...new Set(op.standardizedUrls.map(url => normalizeUrl(url)))]; const deduplicatedUrls = [
...new Set(op.standardizedUrls.map((url) => normalizeUrl(url))),
];
inserts.push({ inserts.push({
origin_url: op.originUrl, origin_url: op.originUrl,
urls: deduplicatedUrls, urls: deduplicatedUrls,
@ -140,8 +142,10 @@ async function processBatch() {
// Delete duplicate entries // Delete duplicate entries
if (duplicatesToDelete.length > 0) { if (duplicatesToDelete.length > 0) {
logger.info(`🗑️ Deleting ${duplicatesToDelete.length} duplicate crawl maps in batches of 100`); logger.info(
`🗑️ Deleting ${duplicatesToDelete.length} duplicate crawl maps in batches of 100`,
);
// Delete in batches of 100 // Delete in batches of 100
for (let i = 0; i < duplicatesToDelete.length; i += 100) { for (let i = 0; i < duplicatesToDelete.length; i += 100) {
const batch = duplicatesToDelete.slice(i, i + 100); const batch = duplicatesToDelete.slice(i, i + 100);
@ -151,11 +155,14 @@ async function processBatch() {
.in("id", batch); .in("id", batch);
if (deleteError) { if (deleteError) {
logger.error(`Failed to delete batch ${i/100 + 1} of duplicate crawl maps`, { logger.error(
error: deleteError, `Failed to delete batch ${i / 100 + 1} of duplicate crawl maps`,
batchSize: batch.length, {
startIndex: i error: deleteError,
}); batchSize: batch.length,
startIndex: i,
},
);
} }
} }
} }
@ -165,7 +172,7 @@ async function processBatch() {
logger.info(`🔄 Updating ${updates.length} existing crawl maps`, { logger.info(`🔄 Updating ${updates.length} existing crawl maps`, {
origins: updates.map((u) => u.origin_url), origins: updates.map((u) => u.origin_url),
}); });
// Process updates one at a time to avoid conflicts // Process updates one at a time to avoid conflicts
for (const update of updates) { for (const update of updates) {
const { error: updateError } = await supabase_service const { error: updateError } = await supabase_service
@ -175,7 +182,7 @@ async function processBatch() {
if (updateError) { if (updateError) {
logger.error("Failed to update crawl map", { logger.error("Failed to update crawl map", {
error: updateError, error: updateError,
origin: update.origin_url origin: update.origin_url,
}); });
} }
} }

View File

@ -3,18 +3,27 @@ import "../sentry";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { Job, Queue, Worker } from "bullmq"; import { Job, Queue, Worker } from "bullmq";
import { logger as _logger, logger } from "../../lib/logger"; import { logger as _logger, logger } from "../../lib/logger";
import { redisConnection, indexQueueName, getIndexQueue } from "../queue-service"; import {
redisConnection,
indexQueueName,
getIndexQueue,
} from "../queue-service";
import { saveCrawlMap } from "./crawl-maps-index"; import { saveCrawlMap } from "./crawl-maps-index";
import systemMonitor from "../system-monitor"; import systemMonitor from "../system-monitor";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
const workerLockDuration = Number(process.env.WORKER_LOCK_DURATION) || 60000; const workerLockDuration = Number(process.env.WORKER_LOCK_DURATION) || 60000;
const workerStalledCheckInterval = Number(process.env.WORKER_STALLED_CHECK_INTERVAL) || 30000; const workerStalledCheckInterval =
const jobLockExtendInterval = Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000; Number(process.env.WORKER_STALLED_CHECK_INTERVAL) || 30000;
const jobLockExtensionTime = Number(process.env.JOB_LOCK_EXTENSION_TIME) || 60000; const jobLockExtendInterval =
Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000;
const jobLockExtensionTime =
Number(process.env.JOB_LOCK_EXTENSION_TIME) || 60000;
const cantAcceptConnectionInterval = Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000; const cantAcceptConnectionInterval =
const connectionMonitorInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10; Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000;
const connectionMonitorInterval =
Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10;
const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
const runningJobs: Set<string> = new Set(); const runningJobs: Set<string> = new Set();
@ -88,7 +97,7 @@ const workerFun = async (queue: Queue) => {
const token = uuidv4(); const token = uuidv4();
const canAcceptConnection = await monitor.acceptConnection(); const canAcceptConnection = await monitor.acceptConnection();
if (!canAcceptConnection) { if (!canAcceptConnection) {
logger.info("Cant accept connection"); logger.info("Cant accept connection");
cantAcceptConnectionCount++; cantAcceptConnectionCount++;
@ -100,7 +109,9 @@ const workerFun = async (queue: Queue) => {
}); });
} }
await new Promise(resolve => setTimeout(resolve, cantAcceptConnectionInterval)); await new Promise((resolve) =>
setTimeout(resolve, cantAcceptConnectionInterval),
);
continue; continue;
} else { } else {
cantAcceptConnectionCount = 0; cantAcceptConnectionCount = 0;
@ -141,15 +152,17 @@ const workerFun = async (queue: Queue) => {
runningJobs.delete(job.id); runningJobs.delete(job.id);
} }
await new Promise(resolve => setTimeout(resolve, gotJobInterval)); await new Promise((resolve) => setTimeout(resolve, gotJobInterval));
} else { } else {
await new Promise(resolve => setTimeout(resolve, connectionMonitorInterval)); await new Promise((resolve) =>
setTimeout(resolve, connectionMonitorInterval),
);
} }
} }
logger.info("Worker loop ended. Waiting for running jobs to finish..."); logger.info("Worker loop ended. Waiting for running jobs to finish...");
while (runningJobs.size > 0) { while (runningJobs.size > 0) {
await new Promise(resolve => setTimeout(resolve, 500)); await new Promise((resolve) => setTimeout(resolve, 500));
} }
logger.info("All jobs finished. Worker exiting!"); logger.info("All jobs finished. Worker exiting!");
process.exit(0); process.exit(0);
@ -158,4 +171,4 @@ const workerFun = async (queue: Queue) => {
// Start the worker // Start the worker
(async () => { (async () => {
await workerFun(getIndexQueue()); await workerFun(getIndexQueue());
})(); })();

View File

@ -93,7 +93,9 @@ const runningJobs: Set<string> = new Set();
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
if (await finishCrawl(job.data.crawl_id)) { if (await finishCrawl(job.data.crawl_id)) {
(async () => { (async () => {
const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined; const originUrl = sc.originUrl
? normalizeUrlOnlyHostname(sc.originUrl)
: undefined;
// Get all visited unique URLs from Redis // Get all visited unique URLs from Redis
const visitedUrls = await redisConnection.smembers( const visitedUrls = await redisConnection.smembers(
"crawl:" + job.data.crawl_id + ":visited_unique", "crawl:" + job.data.crawl_id + ":visited_unique",
@ -113,7 +115,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
}, },
{ {
priority: 10, priority: 10,
} },
); );
} }
})(); })();
@ -315,11 +317,14 @@ const processExtractJobInternal = async (
return result; return result;
} else { } else {
// throw new Error(result.error || "Unknown error during extraction"); // throw new Error(result.error || "Unknown error during extraction");
await job.moveToCompleted(result, token, false); await job.moveToCompleted(result, token, false);
await updateExtract(job.data.extractId, { await updateExtract(job.data.extractId, {
status: "failed", status: "failed",
error: result.error ?? "Unknown error, please contact help@firecrawl.com. Extract id: " + job.data.extractId, error:
result.error ??
"Unknown error, please contact help@firecrawl.com. Extract id: " +
job.data.extractId,
}); });
return result; return result;
@ -348,7 +353,14 @@ const processExtractJobInternal = async (
"Unknown error, please contact help@firecrawl.com. Extract id: " + "Unknown error, please contact help@firecrawl.com. Extract id: " +
job.data.extractId, job.data.extractId,
}); });
return { success: false, error: error.error ?? error ?? "Unknown error, please contact help@firecrawl.com. Extract id: " + job.data.extractId }; return {
success: false,
error:
error.error ??
error ??
"Unknown error, please contact help@firecrawl.com. Extract id: " +
job.data.extractId,
};
// throw error; // throw error;
} finally { } finally {
clearInterval(extendLockInterval); clearInterval(extendLockInterval);
@ -949,13 +961,15 @@ async function processJob(job: Job & { id: string }, token: string) {
} }
if (job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID!) { if (job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID!) {
billTeam(job.data.team_id, undefined, creditsToBeBilled, logger).catch((error) => { billTeam(job.data.team_id, undefined, creditsToBeBilled, logger).catch(
logger.error( (error) => {
`Failed to bill team ${job.data.team_id} for ${creditsToBeBilled} credits`, logger.error(
{ error }, `Failed to bill team ${job.data.team_id} for ${creditsToBeBilled} credits`,
); { error },
// Optionally, you could notify an admin or add to a retry queue here );
}); // Optionally, you could notify an admin or add to a retry queue here
},
);
} }
} }
@ -974,11 +988,12 @@ async function processJob(job: Job & { id: string }, token: string) {
await finishCrawlIfNeeded(job, sc); await finishCrawlIfNeeded(job, sc);
} }
const isEarlyTimeout = const isEarlyTimeout =
error instanceof Error && error.message === "timeout"; error instanceof Error && error.message === "timeout";
const isCancelled = const isCancelled =
error instanceof Error && error.message === "Parent crawl/batch scrape was cancelled"; error instanceof Error &&
error.message === "Parent crawl/batch scrape was cancelled";
if (isEarlyTimeout) { if (isEarlyTimeout) {
logger.error(`🐂 Job timed out ${job.id}`); logger.error(`🐂 Job timed out ${job.id}`);