Nick: formatting done

Nicolas 2025-01-22 18:47:44 -03:00
parent 994e1eb502
commit 498558d358
53 changed files with 10672 additions and 10329 deletions

View File

@@ -1,8 +1,6 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import {
ScrapeRequestInput,
} from "../../controllers/v1/types";
import { ScrapeRequestInput } from "../../controllers/v1/types";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
configDotenv();
@@ -19,8 +17,7 @@ describe("E2E Tests for v1 API Routes", () => {
describe("GET /is-production", () => {
it.concurrent("should return the production status", async () => {
const response: any =
await request(TEST_URL).get("/is-production");
const response: any = await request(TEST_URL).get("/is-production");
console.log(
"process.env.USE_DB_AUTHENTICATION",
@@ -274,12 +271,11 @@ describe("E2E Tests for v1 API Routes", () => {
url: "https://www.scrapethissite.com/",
onlyMainContent: false, // default is true
};
const responseWithoutRemoveTags: any =
await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequest);
const responseWithoutRemoveTags: any = await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequest);
expect(responseWithoutRemoveTags.statusCode).toBe(200);
expect(responseWithoutRemoveTags.body).toHaveProperty("data");

View File

@@ -1,8 +1,6 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import {
ScrapeRequest,
} from "../../controllers/v1/types";
import { ScrapeRequest } from "../../controllers/v1/types";
configDotenv();
const FIRECRAWL_API_URL = "http://127.0.0.1:3002";
@@ -12,9 +10,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent(
"should return a successful response for a scrape with 403 page",
async () => {
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -39,9 +35,7 @@ describe("E2E Tests for v1 API Routes", () => {
url: E2E_TEST_SERVER_URL,
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -86,9 +80,7 @@ describe("E2E Tests for v1 API Routes", () => {
formats: ["html"],
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -121,9 +113,7 @@ describe("E2E Tests for v1 API Routes", () => {
formats: ["rawHtml"],
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -159,9 +149,7 @@ describe("E2E Tests for v1 API Routes", () => {
headers: { "e2e-header-test": "firecrawl" },
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -188,9 +176,7 @@ describe("E2E Tests for v1 API Routes", () => {
includeTags: ["#content-1"],
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -220,9 +206,7 @@ describe("E2E Tests for v1 API Routes", () => {
excludeTags: ["#content-1"],
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -253,9 +237,7 @@ describe("E2E Tests for v1 API Routes", () => {
onlyMainContent: false,
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -285,9 +267,7 @@ describe("E2E Tests for v1 API Routes", () => {
timeout: 500,
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -312,9 +292,7 @@ describe("E2E Tests for v1 API Routes", () => {
mobile: true,
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -335,9 +313,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent(
"should handle 'parsePDF' parameter correctly",
async () => {
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -357,9 +333,7 @@ describe("E2E Tests for v1 API Routes", () => {
"h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm",
);
const responseNoParsePDF: any = await request(
FIRECRAWL_API_URL,
)
const responseNoParsePDF: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -410,9 +384,7 @@ describe("E2E Tests for v1 API Routes", () => {
timeout: 120000,
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -432,12 +404,13 @@ describe("E2E Tests for v1 API Routes", () => {
timeout: 120000,
} as ScrapeRequest;
const responseWithSkipTlsVerification: any =
await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequestWithSkipTlsVerification);
const responseWithSkipTlsVerification: any = await request(
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequestWithSkipTlsVerification);
console.log("Error1b");
// console.log(responseWithSkipTlsVerification.body)
@@ -461,9 +434,7 @@ describe("E2E Tests for v1 API Routes", () => {
removeBase64Images: true,
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -493,9 +464,7 @@ describe("E2E Tests for v1 API Routes", () => {
],
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -526,9 +495,7 @@ describe("E2E Tests for v1 API Routes", () => {
],
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -569,9 +536,7 @@ describe("E2E Tests for v1 API Routes", () => {
],
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -619,9 +584,7 @@ describe("E2E Tests for v1 API Routes", () => {
],
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -657,9 +620,7 @@ describe("E2E Tests for v1 API Routes", () => {
],
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -692,9 +653,7 @@ describe("E2E Tests for v1 API Routes", () => {
],
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@@ -731,9 +690,7 @@ describe("E2E Tests for v1 API Routes", () => {
],
} as ScrapeRequest;
const response: any = await request(
FIRECRAWL_API_URL,
)
const response: any = await request(FIRECRAWL_API_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")

View File

@@ -23,8 +23,7 @@ describe("E2E Tests for v0 API Routes", () => {
describe("POST /v0/scrape", () => {
it.concurrent("should require authorization", async () => {
const response: any =
await request(TEST_URL).post("/v0/scrape");
const response: any = await request(TEST_URL).post("/v0/scrape");
expect(response.statusCode).toBe(401);
});
@@ -159,12 +158,11 @@ describe("E2E Tests for v0 API Routes", () => {
it.concurrent(
"should return a successful response with a valid API key with removeTags option",
async () => {
const responseWithoutRemoveTags: any =
await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://www.scrapethissite.com/" });
const responseWithoutRemoveTags: any = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://www.scrapethissite.com/" });
expect(responseWithoutRemoveTags.statusCode).toBe(200);
expect(responseWithoutRemoveTags.body).toHaveProperty("data");
expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
@@ -332,8 +330,7 @@ describe("E2E Tests for v0 API Routes", () => {
describe("POST /v0/crawl", () => {
it.concurrent("should require authorization", async () => {
const response: any =
await request(TEST_URL).post("/v0/crawl");
const response: any = await request(TEST_URL).post("/v0/crawl");
expect(response.statusCode).toBe(401);
});
@@ -461,9 +458,7 @@ describe("E2E Tests for v0 API Routes", () => {
}
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
const completedResponse: any = await request(
TEST_URL,
)
const completedResponse: any = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
@@ -509,9 +504,7 @@ describe("E2E Tests for v0 API Routes", () => {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
const completedResponse: any = await request(
TEST_URL,
)
const completedResponse: any = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

View File

@@ -6,31 +6,33 @@ configDotenv();
const TEST_URL = "http://127.0.0.1:3002";
async function scrape(body: ScrapeRequestInput) {
return await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(body);
}
function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
expect(response.statusCode).toBe(200);
expect(response.body.success).toBe(true);
expect(typeof response.body.data).toBe("object");
}
describe("Scrape tests", () => {
it("mocking works properly", async () => {
// depends on falsified mock mocking-works-properly
// this test will fail if mock is bypassed with real data -- firecrawl.dev will never have
// that as its actual markdown output
it("mocking works properly", async () => {
// depends on falsified mock mocking-works-properly
// this test will fail if mock is bypassed with real data -- firecrawl.dev will never have
// that as its actual markdown output
const response = await scrape({
url: "http://firecrawl.dev",
useMock: "mocking-works-properly",
});
expectScrapeToSucceed(response);
expect(response.body.data.markdown).toBe("this is fake data coming from the mocking system!");
const response = await scrape({
url: "http://firecrawl.dev",
useMock: "mocking-works-properly",
});
});
expectScrapeToSucceed(response);
expect(response.body.data.markdown).toBe(
"this is fake data coming from the mocking system!",
);
});
});

View File

@@ -4,9 +4,11 @@ const fs = require("fs");
const mocksDirPath = path.join(__dirname, "../../../scraper/scrapeURL/mocks");
const files = fs.readdirSync(mocksDirPath);
const contents = files.map(x => JSON.parse(fs.readFileSync(path.join(mocksDirPath, x), "utf8")));
const contents = files.map((x) =>
JSON.parse(fs.readFileSync(path.join(mocksDirPath, x), "utf8")),
);
fs.writeFileSync(
path.join(__dirname, "../mocks/" + process.argv[2] + ".json"),
JSON.stringify(contents, undefined, 4),
);

View File

@@ -105,7 +105,6 @@ export async function getACUC(
{ get: true },
));
if (!error) {
break;
}
@@ -146,7 +145,7 @@ export async function clearACUC(api_key: string): Promise<void> {
modes.map(async (mode) => {
const cacheKey = `acuc_${api_key}_${mode}`;
await deleteKey(cacheKey);
})
}),
);
// Also clear the base cache key
@@ -232,7 +231,6 @@ export async function supaAuthenticateUser(
teamId = chunk.team_id;
priceId = chunk.price_id;
plan = getPlanByPriceId(priceId);
subscriptionData = {
team_id: teamId,

View File

@@ -16,7 +16,7 @@ export async function checkFireEngine(req: Request, res: Response) {
const timeout = setTimeout(() => controller.abort(), 30000);
const urls = ["https://roastmywebsite.ai", "https://example.com"];
let lastError : string | null = null;
let lastError: string | null = null;
for (const url of urls) {
try {
@@ -62,7 +62,6 @@ export async function checkFireEngine(req: Request, res: Response) {
success: false,
error: "Internal server error - all retry attempts failed",
});
} catch (error) {
logger.error(error);
Sentry.captureException(error);

View File

@@ -227,7 +227,7 @@ export async function crawlController(req: Request, res: Response) {
await addScrapeJob(job.data as any, {}, job.opts.jobId);
}
});
if (sitemap === 0) {
await lockURL(id, sc, url);

View File

@@ -1,6 +1,6 @@
import { Response } from "express";
import {
CrawlErrorsResponse,
CrawlStatusParams,
CrawlStatusResponse,
ErrorResponse,
@@ -62,20 +62,23 @@ export async function crawlErrorsController(
const failedJobIDs: string[] = [];
for (const [id, status] of jobStatuses) {
if (
status === "failed"
) {
if (status === "failed") {
failedJobIDs.push(id);
}
}
res.status(200).json({
errors: (await getJobs(failedJobIDs)).map(x => ({
id: x.id,
timestamp: x.finishedOn !== undefined ? (new Date(x.finishedOn).toISOString()) : undefined,
url: x.data.url,
error: x.failedReason,
errors: (await getJobs(failedJobIDs)).map((x) => ({
id: x.id,
timestamp:
x.finishedOn !== undefined
? new Date(x.finishedOn).toISOString()
: undefined,
url: x.data.url,
error: x.failedReason,
})),
robotsBlocked: await redisConnection.smembers("crawl:" + req.params.jobId + ":robots_blocked"),
robotsBlocked: await redisConnection.smembers(
"crawl:" + req.params.jobId + ":robots_blocked",
),
});
}

View File

@@ -116,7 +116,10 @@ export async function crawlStatusController(
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
sc.cancelled
? "cancelled"
: validJobStatuses.every((x) => x[1] === "completed") && (sc.crawlerOptions ? await isCrawlKickoffFinished(req.params.jobId) : true)
: validJobStatuses.every((x) => x[1] === "completed") &&
(sc.crawlerOptions
? await isCrawlKickoffFinished(req.params.jobId)
: true)
? "completed"
: "scraping";

View File

@@ -101,7 +101,7 @@ export async function getMapResults({
},
true,
true,
30000
30000,
);
if (sitemap > 0) {
links = links
@@ -164,20 +164,24 @@ export async function getMapResults({
const twoDaysAgo = new Date();
twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
// If sitemap is not ignored and either we have few URLs (<100) or the data is stale (>2 days old), fetch fresh sitemap
if (
!ignoreSitemap &&
(sitemapIndexResult.urls.length < 100 ||
new Date(sitemapIndexResult.lastUpdated) < twoDaysAgo)
) {
try {
await crawler.tryGetSitemap(urls => {
links.push(...urls);
}, true, false, 30000);
await crawler.tryGetSitemap(
(urls) => {
links.push(...urls);
},
true,
false,
30000,
);
} catch (e) {
logger.warn("tryGetSitemap threw an error", { error: e });
}
}
}
if (!cachedResult) {
@@ -253,7 +257,7 @@ export async function getMapResults({
},
{
priority: 10,
}
},
);
return {

View File

@@ -33,7 +33,6 @@ export async function scrapeController(
basePriority: 10,
});
await addScrapeJob(
{
url: req.body.url,
@@ -97,7 +96,7 @@
// Don't bill if we're early returning
return;
}
if (req.body.extract && req.body.formats.includes("extract") ) {
if (req.body.extract && req.body.formats.includes("extract")) {
creditsToBeBilled = 5;
}

View File

@@ -125,7 +125,7 @@ export const scrapeOptions = z
"screenshot",
"screenshot@fullPage",
"extract",
"json"
"json",
])
.array()
.optional()
@@ -233,7 +233,7 @@
.strict(strictMessage)
.transform((obj) => ({
...obj,
allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch
allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
}));
export type ExtractV1Options = z.infer<typeof extractV1Options>;
@@ -268,11 +268,17 @@ export const scrapeRequestSchema = scrapeOptions
)
.transform((obj) => {
// Handle timeout
if ((obj.formats?.includes("extract") || obj.extract || obj.formats?.includes("json") || obj.jsonOptions) && !obj.timeout) {
if (
(obj.formats?.includes("extract") ||
obj.extract ||
obj.formats?.includes("json") ||
obj.jsonOptions) &&
!obj.timeout
) {
obj = { ...obj, timeout: 60000 };
}
if(obj.formats?.includes("json")) {
if (obj.formats?.includes("json")) {
obj.formats.push("extract");
}
@@ -284,8 +290,8 @@ export const scrapeRequestSchema = scrapeOptions
prompt: obj.jsonOptions.prompt,
systemPrompt: obj.jsonOptions.systemPrompt,
schema: obj.jsonOptions.schema,
mode: "llm"
}
mode: "llm",
},
};
}
@@ -602,15 +608,14 @@ export type CrawlStatusResponse =
data: Document[];
};
export type CrawlErrorsResponse =
| ErrorResponse
| {
errors: {
id: string,
timestamp?: string,
url: string,
error: string,
id: string;
timestamp?: string;
url: string;
error: string;
}[];
robotsBlocked: string[];
};
@@ -888,7 +893,6 @@ export type SearchResponse =
data: Document[];
};
export type TokenUsage = {
promptTokens: number;
completionTokens: number;

View File

@@ -4,7 +4,11 @@ import * as Sentry from "@sentry/node";
import express, { NextFunction, Request, Response } from "express";
import bodyParser from "body-parser";
import cors from "cors";
import { getExtractQueue, getScrapeQueue, getIndexQueue } from "./services/queue-service";
import {
getExtractQueue,
getScrapeQueue,
getIndexQueue,
} from "./services/queue-service";
import { v0Router } from "./routes/v0";
import os from "os";
import { logger } from "./lib/logger";

View File

@@ -3,101 +3,101 @@ import { deduplicateObjectsArray } from "../extract/helpers/deduplicate-objs-arr
describe("deduplicateObjectsArray", () => {
it("should deduplicate the array", async () => {
const objArray = {
"lawyers": [
lawyers: [
{
"name": "James D. Schull",
"email": null,
"title": "Personal Injury Attorney",
name: "James D. Schull",
email: null,
title: "Personal Injury Attorney",
"phone-number": null,
"practice-areas": [
{
"area": "Personal Injury"
}
]
area: "Personal Injury",
},
],
},
{
"name": "James D. Schull",
"email": null,
"title": "Personal Injury Attorney",
name: "James D. Schull",
email: null,
title: "Personal Injury Attorney",
"phone-number": null,
"practice-areas": [
{
"area": "Personal Injury"
}
]
area: "Personal Injury",
},
],
},
{
"name": "James D. Schull",
"email": null,
"title": "Personal Injury Attorney",
name: "James D. Schull",
email: null,
title: "Personal Injury Attorney",
"phone-number": null,
"practice-areas": [
{
"area": "Personal Injury"
}
]
}
]
}
area: "Personal Injury",
},
],
},
],
};
const expected = {
"lawyers": [
lawyers: [
{
"name": "James D. Schull",
"email": null,
"title": "Personal Injury Attorney",
name: "James D. Schull",
email: null,
title: "Personal Injury Attorney",
"phone-number": null,
"practice-areas": [
{
"area": "Personal Injury"
}
]
}
]
}
area: "Personal Injury",
},
],
},
],
};
const result = await deduplicateObjectsArray(objArray);
expect(result).toEqual(expected);
})
});
it("should not deduplicate if not necessary", async () => {
const objArray = {
"lawyers": [
lawyers: [
{
"name": "James D. Schull",
"email": null,
"title": "Personal Injury Attorney",
name: "James D. Schull",
email: null,
title: "Personal Injury Attorney",
"phone-number": null,
"practice-areas": [
{
"area": "Personal Injury"
}
]
area: "Personal Injury",
},
],
},
{
"name": "John Doe",
"email": null,
"title": "Personal Injury Attorney",
name: "John Doe",
email: null,
title: "Personal Injury Attorney",
"phone-number": null,
"practice-areas": [
{
"area": "Personal Injury"
}
]
}
]
}
area: "Personal Injury",
},
],
},
],
};
const result = await deduplicateObjectsArray(objArray);
expect(result).toEqual(objArray);
})
});
it("should handle an empty array", async () => {
const objArray = { "lawyers": [] };
const objArray = { lawyers: [] };
const expected = { "lawyers": [] };
const expected = { lawyers: [] };
const result = await deduplicateObjectsArray(objArray);
@@ -106,35 +106,35 @@ describe("deduplicateObjectsArray", () => {
it("should handle objects with different properties", async () => {
const objArray = {
"lawyers": [
lawyers: [
{
"name": "James D. Schull",
"email": "james@example.com",
"title": "Personal Injury Attorney"
name: "James D. Schull",
email: "james@example.com",
title: "Personal Injury Attorney",
},
{
"name": "James D. Schull",
"email": "james@example.com",
"title": "Personal Injury Attorney",
"phone-number": "123-456-7890"
}
]
name: "James D. Schull",
email: "james@example.com",
title: "Personal Injury Attorney",
"phone-number": "123-456-7890",
},
],
};
const expected = {
"lawyers": [
lawyers: [
{
"name": "James D. Schull",
"email": "james@example.com",
"title": "Personal Injury Attorney"
name: "James D. Schull",
email: "james@example.com",
title: "Personal Injury Attorney",
},
{
"name": "James D. Schull",
"email": "james@example.com",
"title": "Personal Injury Attorney",
"phone-number": "123-456-7890"
}
]
name: "James D. Schull",
email: "james@example.com",
title: "Personal Injury Attorney",
"phone-number": "123-456-7890",
},
],
};
const result = await deduplicateObjectsArray(objArray);
@@ -144,33 +144,33 @@
it("should handle objects with same properties but different values", async () => {
const objArray = {
"lawyers": [
lawyers: [
{
"name": "James D. Schull",
"email": "james1@example.com",
"title": "Personal Injury Attorney"
name: "James D. Schull",
email: "james1@example.com",
title: "Personal Injury Attorney",
},
{
"name": "James D. Schull",
"email": "james2@example.com",
"title": "Personal Injury Attorney"
}
]
name: "James D. Schull",
email: "james2@example.com",
title: "Personal Injury Attorney",
},
],
};
const expected = {
"lawyers": [
lawyers: [
{
"name": "James D. Schull",
"email": "james1@example.com",
"title": "Personal Injury Attorney"
name: "James D. Schull",
email: "james1@example.com",
title: "Personal Injury Attorney",
},
{
"name": "James D. Schull",
"email": "james2@example.com",
"title": "Personal Injury Attorney"
}
]
name: "James D. Schull",
email: "james2@example.com",
title: "Personal Injury Attorney",
},
],
};
const result = await deduplicateObjectsArray(objArray);
@@ -180,47 +180,47 @@
it("should handle nested identical objects", async () => {
const objArray = {
"lawyers": [
lawyers: [
{
"name": "James D. Schull",
"email": null,
"title": "Personal Injury Attorney",
name: "James D. Schull",
email: null,
title: "Personal Injury Attorney",
"practice-areas": [
{
"area": "Personal Injury"
}
]
area: "Personal Injury",
},
],
},
{
"name": "James D. Schull",
"email": null,
"title": "Personal Injury Attorney",
name: "James D. Schull",
email: null,
title: "Personal Injury Attorney",
"practice-areas": [
{
"area": "Personal Injury"
}
]
}
]
area: "Personal Injury",
},
],
},
],
};
const expected = {
"lawyers": [
lawyers: [
{
"name": "James D. Schull",
"email": null,
"title": "Personal Injury Attorney",
name: "James D. Schull",
email: null,
title: "Personal Injury Attorney",
"practice-areas": [
{
"area": "Personal Injury"
}
]
}
]
area: "Personal Injury",
},
],
},
],
};
const result = await deduplicateObjectsArray(objArray);
expect(result).toEqual(expected);
});
})
});

View File

@@ -3,292 +3,292 @@ import { mergeNullValObjs } from "../extract/helpers/merge-null-val-objs";
describe("mergeNullValObjs", () => {
it("should merge the objects with null values", async () => {
const objArray = {
"lawyers": [
lawyers: [
{
"name": "Frank Giunta",
"email": null,
"title": "Personal Injury Attorney",
name: "Frank Giunta",
email: null,
title: "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
"area": "Personal Injury"
}
]
area: "Personal Injury",
},
],
},
{
"name": "Frank Giunta",
"email": null,
"title": "Personal Injury Attorney",
name: "Frank Giunta",
email: null,
title: "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
"area": "Personal Injury"
}
]
}
]
}
area: "Personal Injury",
},
],
},
],
};
const expected = {
"lawyers": [
lawyers: [
{
"name": "Frank Giunta",
"email": null,
"title": "Personal Injury Attorney",
name: "Frank Giunta",
email: null,
title: "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
"area": "Personal Injury"
}
]
}
]
}
area: "Personal Injury",
},
],
},
],
};
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
})
});
it("should handle empty object array", async () => {
const objArray = {
"lawyers": []
}
lawyers: [],
};
const expected = {
"lawyers": []
}
lawyers: [],
};
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
})
});
it("should handle object array with no null values", async () => {
const objArray = {
"lawyers": [
lawyers: [
{
"name": "John Doe",
"email": "john.doe@example.com",
"title": "Attorney",
name: "John Doe",
email: "john.doe@example.com",
title: "Attorney",
"phone-number": "123.456.7890",
"practice-areas": [
{
"area": "Corporate Law"
}
]
}
]
}
const expected = {
"lawyers": [
{
"name": "John Doe",
"email": "john.doe@example.com",
"title": "Attorney",
"phone-number": "123.456.7890",
"practice-areas": [
{
"area": "Corporate Law"
}
]
}
]
}
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
})
it("should merge objects with different null values", async () => {
const objArray = {
"lawyers": [
{
"name": "Jane Smith",
"email": "null",
"title": "Attorney",
"description": null,
"phone-number": "987.654.3210",
"practice-areas": [
{
"area": "Family Law"
}
]
area: "Corporate Law",
},
],
},
{
"name": "Jane Smith",
"email": "jane.smith@example.com",
"title": null,
"description": "Jane Smith is an attorney specializing in Family Law.",
"phone-number": "987.654.3210",
"practice-areas": [
{
"area": "Family Law"
}
]
}
]
}
const expected = {
"lawyers": [
{
"name": "Jane Smith",
"email": "jane.smith@example.com",
"title": "Attorney",
"description": "Jane Smith is an attorney specializing in Family Law.",
"phone-number": "987.654.3210",
"practice-areas": [
{
"area": "Family Law"
}
]
}
]
}
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
})
it("should merge objects with different null values", async () => {
const objArray = {
"lawyers": [
{
"name": "Frank Giunta",
"email": "frank.giunta@example.com",
"title": "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
"area": "Personal Injury"
}
]
},
{
"name": "Frank Giunta",
"email": null,
"title": "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
"area": "Personal Injury"
}
]
},
{
"name": "Dale R. Rose",
"email": null,
"title": "Personal Injury Attorney",
"phone-number": "972.562.0266",
"practice-areas": [
{
"area": "Personal Injury"
}
]
}
]
],
};
const expected = {
"lawyers": [
lawyers: [
{
"name": "Frank Giunta",
"email": "frank.giunta@example.com",
"title": "Personal Injury Attorney",
"phone-number": "214.370.5200",
name: "John Doe",
email: "john.doe@example.com",
title: "Attorney",
"phone-number": "123.456.7890",
"practice-areas": [
{
"area": "Personal Injury"
}
]
area: "Corporate Law",
},
],
},
{
"name": "Dale R. Rose",
"email": null,
"title": "Personal Injury Attorney",
"phone-number": "972.562.0266",
"practice-areas": [
{
"area": "Personal Injury"
}
]
}
]
],
};
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
})
});
it("should merge objects with different null values", async () => {
const objArray = {
lawyers: [
{
name: "Jane Smith",
email: "null",
title: "Attorney",
description: null,
"phone-number": "987.654.3210",
"practice-areas": [
{
area: "Family Law",
},
],
},
{
name: "Jane Smith",
email: "jane.smith@example.com",
title: null,
description: "Jane Smith is an attorney specializing in Family Law.",
"phone-number": "987.654.3210",
"practice-areas": [
{
area: "Family Law",
},
],
},
],
};
const expected = {
lawyers: [
{
name: "Jane Smith",
email: "jane.smith@example.com",
title: "Attorney",
description: "Jane Smith is an attorney specializing in Family Law.",
"phone-number": "987.654.3210",
"practice-areas": [
{
area: "Family Law",
},
],
},
],
};
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
});
it("should merge objects with different null values", async () => {
const objArray = {
lawyers: [
{
name: "Frank Giunta",
email: "frank.giunta@example.com",
title: "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
area: "Personal Injury",
},
],
},
{
name: "Frank Giunta",
email: null,
title: "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
area: "Personal Injury",
},
],
},
{
name: "Dale R. Rose",
email: null,
title: "Personal Injury Attorney",
"phone-number": "972.562.0266",
"practice-areas": [
{
area: "Personal Injury",
},
],
},
],
};
const expected = {
lawyers: [
{
name: "Frank Giunta",
email: "frank.giunta@example.com",
title: "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
area: "Personal Injury",
},
],
},
{
name: "Dale R. Rose",
email: null,
title: "Personal Injury Attorney",
"phone-number": "972.562.0266",
"practice-areas": [
{
area: "Personal Injury",
},
],
},
],
};
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
});
it("should correctly merge and deduplicate objects", async () => {
const objArray = {
"lawyers": [
lawyers: [
{
"name": "Frank Giunta",
"email": null,
"title": "Personal Injury Attorney",
name: "Frank Giunta",
email: null,
title: "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
"area": "Personal Injury"
}
]
area: "Personal Injury",
},
],
},
{
"name": "Frank Giunta",
"email": null,
"title": "Personal Injury Attorney",
name: "Frank Giunta",
email: null,
title: "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
"area": "Personal Injury"
}
]
area: "Personal Injury",
},
],
},
{
"name": "Dale R. Rose",
"email": null,
"title": "Personal Injury Attorney",
name: "Dale R. Rose",
email: null,
title: "Personal Injury Attorney",
"phone-number": "972.562.0266",
"practice-areas": [
{
"area": "Personal Injury"
}
]
}
]
area: "Personal Injury",
},
],
},
],
};
const expected = {
"lawyers": [
lawyers: [
{
"name": "Frank Giunta",
"email": null,
"title": "Personal Injury Attorney",
name: "Frank Giunta",
email: null,
title: "Personal Injury Attorney",
"phone-number": "214.370.5200",
"practice-areas": [
{
"area": "Personal Injury"
}
]
area: "Personal Injury",
},
],
},
{
"name": "Dale R. Rose",
"email": null,
"title": "Personal Injury Attorney",
name: "Dale R. Rose",
email: null,
title: "Personal Injury Attorney",
"phone-number": "972.562.0266",
"practice-areas": [
{
"area": "Personal Injury"
}
]
}
]
area: "Personal Injury",
},
],
},
],
};
const result = mergeNullValObjs(objArray);
@@ -298,177 +298,172 @@ describe("mergeNullValObjs", () => {
it("should merge arrays of similar objects", async () => {
const objArray = {
"lawyers": [
lawyers: [
{
"name": "Allen Cox",
"email": null,
"title": "Personal Injury Lawyer",
name: "Allen Cox",
email: null,
title: "Personal Injury Lawyer",
"phone-number": "972.606.9000",
"practice-areas": [
{ "area": "Personal Injury" }
]
"practice-areas": [{ area: "Personal Injury" }],
},
{
"name": "Allen Cox",
"email": "allen.cox@example.com",
"title": "Personal Injury Lawyer",
name: "Allen Cox",
email: "allen.cox@example.com",
title: "Personal Injury Lawyer",
"phone-number": null,
"practice-areas": [
{ "area": "Automobile accidents" },
{ "area": "Truck accidents" },
{ "area": "Amusement park injury" },
{ "area": "Bus accident" },
{ "area": "Industrial accidents" },
{ "area": "Product defects" },
{ "area": "Food poisoning" },
{ "area": "Workplace accidents" },
{ "area": "Wrongful death" },
{ "area": "Swimming pool accidents" },
{ "area": "Premises accidents" },
{ "area": "Aircraft accidents" },
{ "area": "Animal and dog bites" }
]
}
]
}
{ area: "Automobile accidents" },
{ area: "Truck accidents" },
{ area: "Amusement park injury" },
{ area: "Bus accident" },
{ area: "Industrial accidents" },
{ area: "Product defects" },
{ area: "Food poisoning" },
{ area: "Workplace accidents" },
{ area: "Wrongful death" },
{ area: "Swimming pool accidents" },
{ area: "Premises accidents" },
{ area: "Aircraft accidents" },
{ area: "Animal and dog bites" },
],
},
],
};
const expected = {
"lawyers": [
lawyers: [
{
"name": "Allen Cox",
"email": "allen.cox@example.com",
"title": "Personal Injury Lawyer",
name: "Allen Cox",
email: "allen.cox@example.com",
title: "Personal Injury Lawyer",
"phone-number": "972.606.9000",
"practice-areas": [
{ "area": "Personal Injury" },
{ "area": "Automobile accidents" },
{ "area": "Truck accidents" },
{ "area": "Amusement park injury" },
{ "area": "Bus accident" },
{ "area": "Industrial accidents" },
{ "area": "Product defects" },
{ "area": "Food poisoning" },
{ "area": "Workplace accidents" },
{ "area": "Wrongful death" },
{ "area": "Swimming pool accidents" },
{ "area": "Premises accidents" },
{ "area": "Aircraft accidents" },
{ "area": "Animal and dog bites" }
]
}
]
}
{ area: "Personal Injury" },
{ area: "Automobile accidents" },
{ area: "Truck accidents" },
{ area: "Amusement park injury" },
{ area: "Bus accident" },
{ area: "Industrial accidents" },
{ area: "Product defects" },
{ area: "Food poisoning" },
{ area: "Workplace accidents" },
{ area: "Wrongful death" },
{ area: "Swimming pool accidents" },
{ area: "Premises accidents" },
{ area: "Aircraft accidents" },
{ area: "Animal and dog bites" },
],
},
],
};
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
})
});
it("should merge arrays of similar objects with different key names", async () => {
const objArray = {
"attorneys": [
attorneys: [
{
"fullName": "Allen Cox",
"contactEmail": null,
"position": "Personal Injury Lawyer",
"contactNumber": "972.606.9000",
"specializations": [
{ "field": "Personal Injury" }
]
fullName: "Allen Cox",
contactEmail: null,
position: "Personal Injury Lawyer",
contactNumber: "972.606.9000",
specializations: [{ field: "Personal Injury" }],
},
{
"fullName": "Allen Cox",
"contactEmail": "allen.cox@example.com",
"position": "Personal Injury Lawyer",
"contactNumber": null,
"specializations": [
{ "field": "Automobile accidents" },
{ "field": "Truck accidents" },
{ "field": "Amusement park injury" },
{ "field": "Bus accident" },
{ "field": "Industrial accidents" },
{ "field": "Product defects" },
{ "field": "Food poisoning" },
{ "field": "Workplace accidents" },
{ "field": "Wrongful death" },
{ "field": "Swimming pool accidents" },
{ "field": "Premises accidents" },
{ "field": "Aircraft accidents" },
{ "field": "Animal and dog bites" }
]
}
]
}
fullName: "Allen Cox",
contactEmail: "allen.cox@example.com",
position: "Personal Injury Lawyer",
contactNumber: null,
specializations: [
{ field: "Automobile accidents" },
{ field: "Truck accidents" },
{ field: "Amusement park injury" },
{ field: "Bus accident" },
{ field: "Industrial accidents" },
{ field: "Product defects" },
{ field: "Food poisoning" },
{ field: "Workplace accidents" },
{ field: "Wrongful death" },
{ field: "Swimming pool accidents" },
{ field: "Premises accidents" },
{ field: "Aircraft accidents" },
{ field: "Animal and dog bites" },
],
},
],
};
const expected = {
"attorneys": [
attorneys: [
{
"fullName": "Allen Cox",
"contactEmail": "allen.cox@example.com",
"position": "Personal Injury Lawyer",
"contactNumber": "972.606.9000",
"specializations": [
{ "field": "Personal Injury" },
{ "field": "Automobile accidents" },
{ "field": "Truck accidents" },
{ "field": "Amusement park injury" },
{ "field": "Bus accident" },
{ "field": "Industrial accidents" },
{ "field": "Product defects" },
{ "field": "Food poisoning" },
{ "field": "Workplace accidents" },
{ "field": "Wrongful death" },
{ "field": "Swimming pool accidents" },
{ "field": "Premises accidents" },
{ "field": "Aircraft accidents" },
{ "field": "Animal and dog bites" }
]
}
]
}
fullName: "Allen Cox",
contactEmail: "allen.cox@example.com",
position: "Personal Injury Lawyer",
contactNumber: "972.606.9000",
specializations: [
{ field: "Personal Injury" },
{ field: "Automobile accidents" },
{ field: "Truck accidents" },
{ field: "Amusement park injury" },
{ field: "Bus accident" },
{ field: "Industrial accidents" },
{ field: "Product defects" },
{ field: "Food poisoning" },
{ field: "Workplace accidents" },
{ field: "Wrongful death" },
{ field: "Swimming pool accidents" },
{ field: "Premises accidents" },
{ field: "Aircraft accidents" },
{ field: "Animal and dog bites" },
],
},
],
};
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
})
});
it ("should deal with not array values", async () => {
it("should deal with not array values", async () => {
const objArray = {
"lawyers": {
"name": "not an array"
lawyers: {
name: "not an array",
},
"attorneys": {
"name": "not an array"
}
}
attorneys: {
name: "not an array",
},
};
const expected = {
"lawyers": {
"name": "not an array"
lawyers: {
name: "not an array",
},
"attorneys": {
"name": "not an array"
}
}
attorneys: {
name: "not an array",
},
};
// @ts-expect-error
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
})
});
it ("should deal with arrays of strings", async () => {
const objArray = {
"lawyers": ["res1", "res2", "res3"]
}
it("should deal with arrays of strings", async () => {
const objArray = {
lawyers: ["res1", "res2", "res3"],
};
const expected = {
"lawyers": ["res1", "res2", "res3"]
}
const expected = {
lawyers: ["res1", "res2", "res3"],
};
const result = mergeNullValObjs(objArray);
expect(result).toEqual(expected);
})
})
expect(result).toEqual(expected);
});
});

File diff suppressed because it is too large

View File

@@ -2,7 +2,7 @@ import { spreadSchemas } from "../extract/helpers/spread-schemas";
describe("spreadSchemas", () => {
it("should spread kyb schema (id: 1)", async () => {
const keys = ["owners"]
const keys = ["owners"];
const schema = {
type: "object",
properties: {
@@ -21,13 +21,13 @@ describe("spreadSchemas", () => {
city: { type: "string" },
state: { type: "string" },
country: { type: "string" },
postal_code: { type: "string" }
postal_code: { type: "string" },
},
},
incorporation_date: { type: "string", format: "date" },
phone: { type: "string" },
email: { type: "string", format: "email" }
}
email: { type: "string", format: "email" },
},
},
owners: {
type: "array",
@@ -43,18 +43,21 @@ describe("spreadSchemas", () => {
city: { type: "string" },
state: { type: "string" },
country: { type: "string" },
postal_code: { type: "string" }
postal_code: { type: "string" },
},
},
phone: { type: "string" },
email: { type: "string", format: "email" }
}
}
}
}
}
email: { type: "string", format: "email" },
},
},
},
},
};
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
schema,
keys,
);
expect(singleAnswerSchema).toEqual({
type: "object",
@@ -74,16 +77,16 @@
city: { type: "string" },
state: { type: "string" },
country: { type: "string" },
postal_code: { type: "string" }
}
postal_code: { type: "string" },
},
},
incorporation_date: { type: "string", format: "date" },
phone: { type: "string" },
email: { type: "string", format: "email" }
}
email: { type: "string", format: "email" },
},
},
},
})
});
expect(multiEntitySchema).toEqual({
type: "object",
@@ -102,20 +105,20 @@
city: { type: "string" },
state: { type: "string" },
country: { type: "string" },
postal_code: { type: "string" }
}
postal_code: { type: "string" },
},
},
phone: { type: "string" },
email: { type: "string", format: "email" }
}
}
}
}
})
})
email: { type: "string", format: "email" },
},
},
},
},
});
});
it("should spread lawyers schema (id: 9)", async () => {
const keys = ["lawyers"]
const keys = ["lawyers"];
const schema = {
type: "object",
properties: {
@@ -133,22 +136,25 @@
items: {
type: "object",
properties: {
area: { type: "string" }
area: { type: "string" },
},
},
alias: "practice-areas"
}
alias: "practice-areas",
},
},
}
}
}
},
},
},
};
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
schema,
keys,
);
expect(singleAnswerSchema).toEqual({})
expect(multiEntitySchema).toEqual(schema)
})
expect(singleAnswerSchema).toEqual({});
expect(multiEntitySchema).toEqual(schema);
});
it("shoud spread (id: 26)", async () => {
const schema = {
@@ -161,19 +167,22 @@
properties: {
name: { type: "string" },
price: { type: "string" },
description: { type: "string" }
}
}
}
}
}
description: { type: "string" },
},
},
},
},
};
const keys = ["products"]
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
const keys = ["products"];
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
schema,
keys,
);
expect(singleAnswerSchema).toEqual({})
expect(multiEntitySchema).toEqual(schema)
})
expect(singleAnswerSchema).toEqual({});
expect(multiEntitySchema).toEqual(schema);
});
it("shoud spread categories and products", async () => {
const schema = {
@@ -182,8 +191,8 @@
categories: {
type: "array",
items: {
type: "string"
}
type: "string",
},
},
products: {
type: "array",
@@ -192,19 +201,22 @@
properties: {
name: { type: "string" },
price: { type: "string" },
description: { type: "string" }
}
}
}
}
}
description: { type: "string" },
},
},
},
},
};
const keys = ["products", "categories"]
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
const keys = ["products", "categories"];
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
schema,
keys,
);
expect(singleAnswerSchema).toEqual({})
expect(multiEntitySchema).toEqual(schema)
})
expect(singleAnswerSchema).toEqual({});
expect(multiEntitySchema).toEqual(schema);
});
it("should spread (id: 29)", async () => {
const schema = {
@@ -220,50 +232,55 @@
offers_cmmc: { type: "boolean" },
has_soc_2_cert: { type: "boolean" },
offers_office365: { type: "boolean" },
offers_endpoint_security: { type: "boolean" }
}
}
offers_endpoint_security: { type: "boolean" },
},
};
const keys = []
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
const keys = [];
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
schema,
keys,
);
expect(singleAnswerSchema).toEqual(schema)
expect(multiEntitySchema).toEqual({})
})
expect(singleAnswerSchema).toEqual(schema);
expect(multiEntitySchema).toEqual({});
});
it("should spread kyb schema (id: 29)", async () => {
const schema = {
"type": "object",
"properties": {
"lawyers": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": { "type": "string" },
"email": { "type": ["string", "null"] },
"phone-number": { "type": "string" },
type: "object",
properties: {
lawyers: {
type: "array",
items: {
type: "object",
properties: {
name: { type: "string" },
email: { type: ["string", "null"] },
"phone-number": { type: "string" },
"practice-areas": {
"type": "array",
"items": {
"type": "object",
"properties": {
"area": { "type": "string" }
}
}
type: "array",
items: {
type: "object",
properties: {
area: { type: "string" },
},
},
},
"title": { "type": ["string", "null"] }
title: { type: ["string", "null"] },
},
}
}
}
}
},
},
},
};
const keys = ["lawyers"]
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, keys)
const keys = ["lawyers"];
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
schema,
keys,
);
expect(singleAnswerSchema).toEqual({})
expect(multiEntitySchema).toEqual(schema)
})
})
expect(singleAnswerSchema).toEqual({});
expect(multiEntitySchema).toEqual(schema);
});
});

File diff suppressed because it is too large

View File

@@ -42,7 +42,10 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) {
if (!cacheRedis) return;
if (!entry.html || entry.html.length < 100) {
logger.warn("Skipping cache save for short HTML", { key, htmlLength: entry.html?.length });
logger.warn("Skipping cache save for short HTML", {
key,
htmlLength: entry.html?.length,
});
return;
}

View File

@@ -127,13 +127,15 @@ export async function getDoneJobsOrdered(
export async function isCrawlFinished(id: string) {
return (
(await redisConnection.scard("crawl:" + id + ":jobs_done")) ===
(await redisConnection.scard("crawl:" + id + ":jobs"))
&& (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
(await redisConnection.scard("crawl:" + id + ":jobs")) &&
(await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
);
}
export async function isCrawlKickoffFinished(id: string) {
return await redisConnection.get("crawl:" + id + ":kickoff:finish") !== null
return (
(await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
);
}
export async function isCrawlFinishedLocked(id: string) {
@@ -141,7 +143,12 @@ export async function isCrawlFinishedLocked(id: string) {
}
export async function finishCrawlKickoff(id: string) {
await redisConnection.set("crawl:" + id + ":kickoff:finish", "yes", "EX", 24 * 60 * 60);
await redisConnection.set(
"crawl:" + id + ":kickoff:finish",
"yes",
"EX",
24 * 60 * 60,
);
}
export async function finishCrawl(id: string) {
@@ -161,9 +168,10 @@ export async function finishCrawl(id: string) {
module: "crawl-redis",
method: "finishCrawl",
crawlId: id,
jobs_done: (await redisConnection.scard("crawl:" + id + ":jobs_done")),
jobs: (await redisConnection.scard("crawl:" + id + ":jobs")),
kickoff_finished: (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
jobs_done: await redisConnection.scard("crawl:" + id + ":jobs_done"),
jobs: await redisConnection.scard("crawl:" + id + ":jobs"),
kickoff_finished:
(await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
});
}
}

View File

@@ -1,81 +1,81 @@
// const id = crypto.randomUUID();
// const sc: StoredCrawl = {
// originUrl: request.urls[0].replace("/*",""),
// crawlerOptions: toLegacyCrawlerOptions({
// maxDepth: 15,
// limit: 5000,
// includePaths: [],
// excludePaths: [],
// ignoreSitemap: false,
// allowExternalLinks: false,
// allowBackwardLinks: true,
// allowSubdomains: false,
// ignoreRobotsTxt: false,
// deduplicateSimilarURLs: false,
// ignoreQueryParameters: false
// }),
// scrapeOptions: {
// formats: ["markdown"],
// onlyMainContent: true,
// waitFor: 0,
// mobile: false,
// removeBase64Images: true,
// fastMode: false,
// parsePDF: true,
// skipTlsVerification: false,
// },
// internalOptions: {
// disableSmartWaitCache: true,
// isBackgroundIndex: true
// },
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
// createdAt: Date.now(),
// plan: "hobby", // make it a low concurrency
// };
// // Save the crawl configuration
// await saveCrawl(id, sc);
// // Then kick off the job
// await _addScrapeJobToBullMQ({
// url: request.urls[0].replace("/*",""),
// mode: "kickoff" as const,
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
// plan: "hobby", // make it a low concurrency
// crawlerOptions: sc.crawlerOptions,
// scrapeOptions: sc.scrapeOptions,
// internalOptions: sc.internalOptions,
// origin: "index",
// crawl_id: id,
// webhook: null,
// v1: true,
// }, {}, crypto.randomUUID(), 50);
// we restructure and make all of the arrays we need to fill into objects,
// adding them to a single object so the llm can fill them one at a time
// TODO: make this work for more complex schemas where arrays are not first level
// let schemasForLLM: {} = {};
// for (const key in largeArraysSchema) {
// const originalSchema = structuredClone(largeArraysSchema[key].items);
// console.log(
// "key",
// key,
// "\noriginalSchema",
// JSON.stringify(largeArraysSchema[key], null, 2),
// );
// let clonedObj = {
// type: "object",
// properties: {
// informationFilled: {
// type: "boolean",
// },
// data: {
// type: "object",
// properties: originalSchema.properties,
// },
// },
// };
// schemasForLLM[key] = clonedObj;
// }

View File

@@ -59,11 +59,11 @@ export async function updateExtract(
// Limit links in steps to 500
if (extract.steps) {
extract.steps = extract.steps.map(step => {
extract.steps = extract.steps.map((step) => {
if (step.discoveredLinks && step.discoveredLinks.length > 500) {
return {
...step,
discoveredLinks: step.discoveredLinks.slice(0, 500)
discoveredLinks: step.discoveredLinks.slice(0, 500),
};
}
return step;

View File

@@ -32,7 +32,11 @@ import { ExtractStep, updateExtract } from "./extract-redis";
import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array";
import { mergeNullValObjs } from "./helpers/merge-null-val-objs";
import { CUSTOM_U_TEAMS, extractConfig } from "./config";
import { calculateFinalResultCost, estimateCost, estimateTotalCost } from "./usage/llm-cost";
import {
calculateFinalResultCost,
estimateCost,
estimateTotalCost,
} from "./usage/llm-cost";
import { numTokensFromString } from "../LLM-extraction/helpers";
interface ExtractServiceOptions {
@@ -147,7 +151,13 @@ Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`,
totalTokens: result.usage?.total_tokens ?? 0,
model: model,
};
return { isMultiEntity, multiEntityKeys, reasoning, keyIndicators, tokenUsage };
return {
isMultiEntity,
multiEntityKeys,
reasoning,
keyIndicators,
tokenUsage,
};
}
type completions = {
@@ -187,7 +197,7 @@ export async function performExtraction(
method: "performExtraction",
extractId,
});
// Token tracking
let tokenUsage: TokenUsage[] = [];
@@ -246,7 +256,7 @@
"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
extractId,
urlTrace: urlTraces,
totalUrlsScraped: 0
totalUrlsScraped: 0,
};
}
@@ -277,8 +287,13 @@
// 1. the first one is a completion that will extract the array of items
// 2. the second one is multiple completions that will extract the items from the array
let startAnalyze = Date.now();
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators, tokenUsage: schemaAnalysisTokenUsage } =
await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
const {
isMultiEntity,
multiEntityKeys,
reasoning,
keyIndicators,
tokenUsage: schemaAnalysisTokenUsage,
} = await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
// Track schema analysis tokens
tokenUsage.push(schemaAnalysisTokenUsage);
@@ -540,7 +555,7 @@
"An unexpected error occurred. Please contact help@firecrawl.com for help.",
extractId,
urlTrace: urlTraces,
totalUrlsScraped
totalUrlsScraped,
};
}
}
@@ -592,17 +607,18 @@
}
}
const validResults = results.filter((doc): doc is Document => doc !== null);
const validResults = results.filter(
(doc): doc is Document => doc !== null,
);
singleAnswerDocs.push(...validResults);
totalUrlsScraped += validResults.length;
} catch (error) {
return {
success: false,
error: error.message,
extractId,
urlTrace: urlTraces,
totalUrlsScraped
totalUrlsScraped,
};
}
@@ -614,7 +630,7 @@
"All provided URLs are invalid. Please check your input and try again.",
extractId,
urlTrace: request.urlTrace ? urlTraces : undefined,
totalUrlsScraped: 0
totalUrlsScraped: 0,
};
}
@@ -679,12 +695,12 @@
: singleAnswerResult || multiEntityResult;
// Tokenize final result to get token count
let finalResultTokens = 0;
if (finalResult) {
const finalResultStr = JSON.stringify(finalResult);
finalResultTokens = numTokensFromString(finalResultStr, "gpt-4o");
// let finalResultTokens = 0;
// if (finalResult) {
// const finalResultStr = JSON.stringify(finalResult);
// finalResultTokens = numTokensFromString(finalResultStr, "gpt-4o");
}
// }
// // Deduplicate and validate final result against schema
// if (reqSchema && finalResult && finalResult.length <= extractConfig.DEDUPLICATION.MAX_TOKENS) {
// const schemaValidation = await generateOpenAICompletions(
@ -695,7 +711,7 @@ export async function performExtraction(
// 1. Remove any duplicate entries in the data extracted by merging that into a single object according to the provided schema
// 2. Ensure all data matches the provided schema
// 3. Keep only the highest quality and most complete entries when duplicates are found.
// Do not change anything else. If data is null keep it null. If the schema is not provided, return the data as is.`,
// prompt: `Please validate and merge the duplicate entries in this data according to the schema provided:\n
@ -732,12 +748,10 @@ export async function performExtraction(
const llmUsage = estimateTotalCost(tokenUsage);
let tokensToBill = calculateFinalResultCost(finalResult);
if (CUSTOM_U_TEAMS.includes(teamId)) {
tokensToBill = 1;
}
// Bill team for usage
billTeam(teamId, subId, tokensToBill, logger, true).catch((error) => {
logger.error(
@ -745,7 +759,6 @@ export async function performExtraction(
);
});
// Log job with token usage
logJob({
job_id: extractId,
@ -779,6 +792,6 @@ export async function performExtraction(
warning: undefined, // TODO FIX
urlTrace: request.urlTrace ? urlTraces : undefined,
llmUsage,
totalUrlsScraped
totalUrlsScraped,
};
}

View File

@ -1,10 +1,12 @@
export function deduplicateObjectsArray(objArray: { [key: string]: any[] }): { [key: string]: any[] } {
export function deduplicateObjectsArray(objArray: { [key: string]: any[] }): {
[key: string]: any[];
} {
const deduplicatedObjArray: { [key: string]: any[] } = {};
for (const key in objArray) {
if (Array.isArray(objArray[key])) {
const seen = new Set();
deduplicatedObjArray[key] = objArray[key].filter(item => {
deduplicatedObjArray[key] = objArray[key].filter((item) => {
// Create a unique identifier for each item based on its properties
const identifier = JSON.stringify(item);
@ -24,4 +26,4 @@ export function deduplicateObjectsArray(objArray: { [key: string]: any[] }): { [
}
return deduplicatedObjArray;
}
}
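
For reference, a minimal usage sketch of deduplicateObjectsArray as reformatted above. The input shape and the JSON.stringify-based identity come straight from the function; the sample data is illustrative:

const input = {
  products: [
    { name: "Widget", price: 10 },
    { name: "Widget", price: 10 }, // exact duplicate, dropped
    { name: "Gadget", price: 25 },
  ],
};

const output = deduplicateObjectsArray(input);
// => { products: [{ name: "Widget", price: 10 }, { name: "Gadget", price: 25 }] }

Since identity is the JSON.stringify of each item, two objects with the same keys in a different order do not count as duplicates.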

View File

@ -7,4 +7,4 @@ export async function dereferenceSchema(schema: any): Promise<any> {
console.error("Failed to dereference schema:", error);
throw error;
}
}
}

View File

@ -1,5 +1,5 @@
import * as fs from 'fs';
import * as path from 'path';
import * as fs from "fs";
import * as path from "path";
/**
* Helper function to dump data to a file for debugging/logging purposes
@ -10,17 +10,19 @@ import * as path from 'path';
export function dumpToFile<T>(
filename: string,
data: T[],
formatter?: (item: T, index: number) => string
formatter?: (item: T, index: number) => string,
) {
const filePath = path.join(__dirname, filename);
let fileContent: string;
if (formatter) {
fileContent = data.map((item, index) => formatter(item, index)).join('\n');
fileContent = data.map((item, index) => formatter(item, index)).join("\n");
} else {
fileContent = data.map((item, index) => `${index + 1}. ${JSON.stringify(item)}`).join('\n');
fileContent = data
.map((item, index) => `${index + 1}. ${JSON.stringify(item)}`)
.join("\n");
}
fs.writeFileSync(filePath, fileContent, 'utf8');
fs.writeFileSync(filePath, fileContent, "utf8");
console.log(`Dumped data to ${filename}`);
}
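
A quick usage sketch for dumpToFile, assuming the function above; filenames and items are illustrative:

// Default formatting: one "N. {json}" line per item
dumpToFile("links.txt", [{ url: "https://example.com" }]);

// Custom formatter
dumpToFile(
  "scores.txt",
  [{ url: "https://example.com", score: 0.92 }],
  (item, index) => `${index + 1}: ${item.url} (${item.score})`,
);

Output lands next to the compiled module (path.join(__dirname, filename)), which is worth keeping in mind when running from dist/.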

View File

@ -1,4 +1,4 @@
import { deduplicateObjectsArray } from './deduplicate-objs-array';
import { deduplicateObjectsArray } from "./deduplicate-objs-array";
/**
* Convert "null" strings to actual null values for easier comparison.
@ -25,16 +25,16 @@ function areMergeable(obj1: any, obj2: any): boolean {
const allKeys = new Set([...Object.keys(obj1), ...Object.keys(obj2)]);
let matchingNonNullValues = 0;
let nonNullComparisons = 0;
for (const key of allKeys) {
const val1 = obj1[key];
const val2 = obj2[key];
// Skip array comparisons - they'll be merged separately
if (Array.isArray(val1) || Array.isArray(val2)) {
continue;
}
// If both values exist and are not null
if (val1 !== null && val2 !== null) {
nonNullComparisons++;
@ -43,7 +43,7 @@ function areMergeable(obj1: any, obj2: any): boolean {
}
}
}
// Objects are mergeable if they have at least one matching non-null value
// and all their non-null values match when both objects have them
return nonNullComparisons > 0 && matchingNonNullValues === nonNullComparisons;
@ -56,7 +56,10 @@ function mergeArrays(arr1: any[], arr2: any[]): any[] {
const combined = [...arr1, ...arr2];
return combined.filter((item, index) => {
const stringified = JSON.stringify(item);
return combined.findIndex(other => JSON.stringify(other) === stringified) === index;
return (
combined.findIndex((other) => JSON.stringify(other) === stringified) ===
index
);
});
}
@ -78,9 +81,9 @@ function mergeObjects(obj1: any, obj2: any): any {
// If only obj2's value is an array, use it
result[key] = [...obj2[key]];
}
} else if (typeof obj2[key] === 'object') {
} else if (typeof obj2[key] === "object") {
// If both are objects (but not arrays), merge them
if (typeof result[key] === 'object' && !Array.isArray(result[key])) {
if (typeof result[key] === "object" && !Array.isArray(result[key])) {
result[key] = mergeObjects(result[key], obj2[key]);
} else {
result[key] = { ...obj2[key] };
@ -101,13 +104,17 @@ function mergeObjects(obj1: any, obj2: any): any {
* null-equivalent fields, filling in null fields with the corresponding
* non-null fields from the other object.
*/
export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: string]: any[] } {
export function mergeNullValObjs(objArray: { [key: string]: any[] }): {
[key: string]: any[];
} {
const result: { [key: string]: any[] } = {};
for (const key in objArray) {
if (Array.isArray(objArray[key])) {
// If array contains only primitive values, return as is
if (objArray[key].every(item => typeof item !== 'object' || item === null)) {
if (
objArray[key].every((item) => typeof item !== "object" || item === null)
) {
result[key] = [...objArray[key]];
continue;
}
@ -117,7 +124,7 @@ export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: st
for (const item of items) {
let merged = false;
for (let i = 0; i < mergedItems.length; i++) {
if (areMergeable(mergedItems[i], item)) {
mergedItems[i] = mergeObjects(mergedItems[i], item);
@ -125,7 +132,7 @@ export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: st
break;
}
}
if (!merged) {
mergedItems.push({ ...item });
}
@ -134,10 +141,13 @@ export function mergeNullValObjs(objArray: { [key: string]: any[] }): { [key: st
// Final deduplication pass
result[key] = deduplicateObjectsArray({ [key]: mergedItems })[key];
} else {
console.warn(`Expected an array at objArray[${key}], but found:`, objArray[key]);
console.warn(
`Expected an array at objArray[${key}], but found:`,
objArray[key],
);
return objArray;
}
}
return result;
}
}
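
A small worked example of what mergeNullValObjs does, based on the helpers above: two entries that agree on every shared non-null field are merged, with each null filled from the other side, then the result is deduplicated. The records are illustrative:

const input = {
  contacts: [
    { name: "Ada", email: null, phone: "555-0100" },
    { name: "Ada", email: "ada@example.com", phone: null },
  ],
};

const output = mergeNullValObjs(input);
// areMergeable: "name" matches and no non-null field conflicts, so the
// two entries collapse into one with the nulls filled in:
// => { contacts: [{ name: "Ada", email: "ada@example.com", phone: "555-0100" }] }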

View File

@ -1,7 +1,7 @@
export async function mixSchemaObjects(
finalSchema: any,
singleAnswerResult: any,
multiEntityResult: any
multiEntityResult: any,
) {
const finalResult: any = {};
@ -9,14 +9,20 @@ export async function mixSchemaObjects(
function mergeResults(schema: any, singleResult: any, multiResult: any) {
const result: any = {};
for (const key in schema.properties) {
if (schema.properties[key].type === 'object' && schema.properties[key].properties) {
if (
schema.properties[key].type === "object" &&
schema.properties[key].properties
) {
// If the property is an object, recursively merge its properties
result[key] = mergeResults(
schema.properties[key],
singleResult[key] || {},
multiResult[key] || {}
multiResult[key] || {},
);
} else if (schema.properties[key].type === 'array' && Array.isArray(multiResult[key])) {
} else if (
schema.properties[key].type === "array" &&
Array.isArray(multiResult[key])
) {
// If the property is an array, flatten the arrays from multiResult
result[key] = multiResult[key].flat();
} else if (singleResult.hasOwnProperty(key)) {
@ -29,7 +35,10 @@ export async function mixSchemaObjects(
}
// Merge the properties from the final schema
Object.assign(finalResult, mergeResults(finalSchema, singleAnswerResult, multiEntityResult));
Object.assign(
finalResult,
mergeResults(finalSchema, singleAnswerResult, multiEntityResult),
);
return finalResult;
}
}
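
A worked example for mixSchemaObjects above: single-answer values and multi-entity arrays are recombined under one schema, with per-page arrays flattened. Schema and results are illustrative, and the call assumes an async context:

const finalSchema = {
  properties: {
    company: { type: "string" },
    products: { type: "array", items: { type: "object" } },
  },
};

const merged = await mixSchemaObjects(
  finalSchema,
  { company: "Acme" }, // single-answer extraction
  { products: [[{ name: "Widget" }], [{ name: "Gadget" }]] }, // per-page arrays
);
// "company" comes from the single result; "products" is multiResult.products.flat():
// => { company: "Acme", products: [{ name: "Widget" }, { name: "Gadget" }] }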

View File

@ -1,4 +1,7 @@
export async function spreadSchemas(schema: any, keys: string[]): Promise<{
export async function spreadSchemas(
schema: any,
keys: string[],
): Promise<{
singleAnswerSchema: any;
multiEntitySchema: any;
}> {
@ -32,7 +35,7 @@ export async function spreadSchemas(schema: any, keys: string[]): Promise<{
if (Object.keys(singleAnswerSchema.properties).length === 0) {
singleAnswerSchema = {};
}
if (Object.keys(multiEntitySchema.properties).length === 0) {
multiEntitySchema = {};
}
@ -41,4 +44,4 @@ export async function spreadSchemas(schema: any, keys: string[]): Promise<{
singleAnswerSchema,
multiEntitySchema,
};
}
}
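
Based on the fragments above, spreadSchemas splits one schema into a single-answer part and a multi-entity part keyed by the detected multi-entity keys, collapsing an empty side to {}. A sketch of the expected behavior, with illustrative inputs:

const schema = {
  type: "object",
  properties: {
    company: { type: "string" },
    products: { type: "array", items: { type: "object" } },
  },
};

// "products" stands in for the keys analyzeSchemaAndPrompt flagged as multi-entity
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(schema, [
  "products",
]);
// singleAnswerSchema keeps "company"; multiEntitySchema keeps "products".
// A side left with no properties becomes {} (per the check above).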

View File

@ -1,21 +1,21 @@
import isEqual from 'lodash/isEqual';
import isEqual from "lodash/isEqual";
export function transformArrayToObject(
originalSchema: any,
arrayData: any[]
arrayData: any[],
): any {
if (Object.keys(originalSchema).length == 0) {
return {};
}
const transformedResult: any = {};
// Function to find the array key in a nested schema
function findArrayKey(schema: any): string | null {
for (const key in schema.properties) {
if (schema.properties[key].type === 'array') {
if (schema.properties[key].type === "array") {
return key;
} else if (schema.properties[key].type === 'object') {
} else if (schema.properties[key].type === "object") {
const nestedKey = findArrayKey(schema.properties[key]);
if (nestedKey) {
return `${key}.${nestedKey}`;
@ -31,7 +31,10 @@ export function transformArrayToObject(
for (const key in item) {
if (!acc[key]) {
acc[key] = item[key];
} else if (typeof acc[key] === 'object' && typeof item[key] === 'object') {
} else if (
typeof acc[key] === "object" &&
typeof item[key] === "object"
) {
acc[key] = { ...acc[key], ...item[key] };
}
}
@ -39,13 +42,16 @@ export function transformArrayToObject(
}, {});
}
const arrayKeyParts = arrayKeyPath.split('.');
const arrayKeyParts = arrayKeyPath.split(".");
const arrayKey = arrayKeyParts.pop();
if (!arrayKey) {
throw new Error("Array key not found in schema");
}
const parentSchema = arrayKeyParts.reduce((schema, key) => schema.properties[key], originalSchema);
const parentSchema = arrayKeyParts.reduce(
(schema, key) => schema.properties[key],
originalSchema,
);
const itemSchema = parentSchema.properties[arrayKey].items;
if (!itemSchema) {
throw new Error("Item schema not found for array key");
@ -53,7 +59,7 @@ export function transformArrayToObject(
// Initialize the array in the transformed result
let currentLevel = transformedResult;
arrayKeyParts.forEach(part => {
arrayKeyParts.forEach((part) => {
if (!currentLevel[part]) {
currentLevel[part] = {};
}
@ -63,20 +69,23 @@ export function transformArrayToObject(
// Helper function to check if an object is already in the array
function isDuplicateObject(array: any[], obj: any): boolean {
return array.some(existingItem => isEqual(existingItem, obj));
return array.some((existingItem) => isEqual(existingItem, obj));
}
// Helper function to validate if an object follows the schema
function isValidObject(obj: any, schema: any): boolean {
return Object.keys(schema.properties).every(key => {
return obj.hasOwnProperty(key) && typeof obj[key] === schema.properties[key].type;
return Object.keys(schema.properties).every((key) => {
return (
obj.hasOwnProperty(key) &&
typeof obj[key] === schema.properties[key].type
);
});
}
// Iterate over each item in the arrayData
arrayData.forEach(item => {
arrayData.forEach((item) => {
let currentItem = item;
arrayKeyParts.forEach(part => {
arrayKeyParts.forEach((part) => {
if (currentItem[part]) {
currentItem = currentItem[part];
}
@ -84,43 +93,63 @@ export function transformArrayToObject(
// Copy non-array properties from the parent object
for (const key in parentSchema.properties) {
if (key !== arrayKey && currentItem.hasOwnProperty(key) && !currentLevel.hasOwnProperty(key)) {
if (
key !== arrayKey &&
currentItem.hasOwnProperty(key) &&
!currentLevel.hasOwnProperty(key)
) {
currentLevel[key] = currentItem[key];
}
}
// Ensure that the currentItem[arrayKey] is an array before mapping
if (Array.isArray(currentItem[arrayKey])) {
currentItem[arrayKey].forEach((subItem: any) => {
if (typeof subItem === 'object' && subItem !== null && isValidObject(subItem, itemSchema)) {
// For arrays of objects, add only unique objects
const transformedItem: any = {};
let hasValidData = false;
// Ensure that the currentItem[arrayKey] is an array before mapping
if (Array.isArray(currentItem[arrayKey])) {
currentItem[arrayKey].forEach((subItem: any) => {
if (
typeof subItem === "object" &&
subItem !== null &&
isValidObject(subItem, itemSchema)
) {
// For arrays of objects, add only unique objects
const transformedItem: any = {};
let hasValidData = false;
for (const key in itemSchema.properties) {
if (subItem.hasOwnProperty(key) && subItem[key] !== undefined) {
transformedItem[key] = subItem[key];
hasValidData = true;
for (const key in itemSchema.properties) {
if (subItem.hasOwnProperty(key) && subItem[key] !== undefined) {
transformedItem[key] = subItem[key];
hasValidData = true;
}
}
if (
hasValidData &&
!isDuplicateObject(currentLevel[arrayKey], transformedItem)
) {
currentLevel[arrayKey].push(transformedItem);
}
}
if (hasValidData && !isDuplicateObject(currentLevel[arrayKey], transformedItem)) {
currentLevel[arrayKey].push(transformedItem);
}
}
});
} else {
console.warn(`Expected an array at ${arrayKey}, but found:`, currentItem[arrayKey]);
}
});
} else {
console.warn(
`Expected an array at ${arrayKey}, but found:`,
currentItem[arrayKey],
);
}
// Handle merging of array properties
for (const key in parentSchema.properties) {
if (parentSchema.properties[key].type === 'array' && Array.isArray(currentItem[key])) {
if (
parentSchema.properties[key].type === "array" &&
Array.isArray(currentItem[key])
) {
if (!currentLevel[key]) {
currentLevel[key] = [];
}
currentItem[key].forEach((value: any) => {
if (!currentLevel[key].includes(value) && !isDuplicateObject(currentLevel[arrayKey], value)) {
if (
!currentLevel[key].includes(value) &&
!isDuplicateObject(currentLevel[arrayKey], value)
) {
currentLevel[key].push(value);
}
});
@ -129,4 +158,4 @@ export function transformArrayToObject(
});
return transformedResult;
}
}
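
A worked example for transformArrayToObject above: per-page results that each carry a partial array are folded into one object, keeping only schema-valid, non-duplicate items. Schema and data are illustrative:

const originalSchema = {
  properties: {
    products: {
      type: "array",
      items: { properties: { name: { type: "string" } } },
    },
  },
};

const arrayData = [
  { products: [{ name: "Widget" }] },
  { products: [{ name: "Widget" }, { name: "Gadget" }] }, // duplicate Widget dropped
];

const result = transformArrayToObject(originalSchema, arrayData);
// => { products: [{ name: "Widget" }, { name: "Gadget" }] }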

View File

@ -91,7 +91,8 @@ export async function indexPage({
url: normalizedUrl,
originUrl: normalizeUrl(originUrl),
title: document.metadata.title ?? document.metadata.ogTitle ?? "",
description: document.metadata.description ?? document.metadata.ogDescription ?? "",
description:
document.metadata.description ?? document.metadata.ogDescription ?? "",
crawlId,
teamId,
markdown: trimmedMarkdown,
@ -126,7 +127,7 @@ export async function indexPage({
export async function searchSimilarPages(
query: string,
originUrl?: string,
limit: number = 1000
limit: number = 1000,
): Promise<any[]> {
try {
const index = pinecone.index(INDEX_NAME);

View File

@ -59,7 +59,7 @@ export async function rerankLinks(
const linksAndScores = await performRanking(
mappedLinksRerank,
mappedLinks.map((l) => l.url),
searchQuery
searchQuery,
);
// First try with high threshold
@ -109,8 +109,11 @@ export async function rerankLinks(
}
});
const rankedLinks = filteredLinks.slice(0, extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE);
const rankedLinks = filteredLinks.slice(
0,
extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE,
);
// Mark URLs that will be used in completion
rankedLinks.forEach((link) => {
const trace = urlTraces.find((t) => t.url === link.url);
@ -120,13 +123,15 @@ export async function rerankLinks(
});
// Mark URLs that were dropped due to ranking limit
filteredLinks.slice(extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE).forEach(link => {
const trace = urlTraces.find(t => t.url === link.url);
if (trace) {
trace.warning = "Excluded due to ranking limit";
trace.usedInCompletion = false;
}
});
filteredLinks
.slice(extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE)
.forEach((link) => {
const trace = urlTraces.find((t) => t.url === link.url);
if (trace) {
trace.warning = "Excluded due to ranking limit";
trace.usedInCompletion = false;
}
});
// console.log("Reranked links: ", rankedLinks.length);
@ -155,7 +160,7 @@ function filterAndProcessLinks(
export type RerankerResult = {
mapDocument: MapDocument[];
tokensUsed: number;
}
};
export async function rerankLinksWithLLM(
mappedLinks: MapDocument[],
@ -167,7 +172,7 @@ export async function rerankLinksWithLLM(
const TIMEOUT_MS = 20000;
const MAX_RETRIES = 2;
let totalTokensUsed = 0;
// Split mappedLinks into chunks of 200
for (let i = 0; i < mappedLinks.length; i += chunkSize) {
chunks.push(mappedLinks.slice(i, i + chunkSize));
@ -184,23 +189,25 @@ export async function rerankLinksWithLLM(
type: "object",
properties: {
url: { type: "string" },
relevanceScore: { type: "number" }
relevanceScore: { type: "number" },
},
required: ["url", "relevanceScore"]
}
}
required: ["url", "relevanceScore"],
},
},
},
required: ["relevantLinks"]
required: ["relevantLinks"],
};
const results = await Promise.all(
chunks.map(async (chunk, chunkIndex) => {
// console.log(`Processing chunk ${chunkIndex + 1}/${chunks.length} with ${chunk.length} links`);
const linksContent = chunk.map(link =>
`URL: ${link.url}${link.title ? `\nTitle: ${link.title}` : ''}${link.description ? `\nDescription: ${link.description}` : ''}`
).join("\n\n");
const linksContent = chunk
.map(
(link) =>
`URL: ${link.url}${link.title ? `\nTitle: ${link.title}` : ""}${link.description ? `\nDescription: ${link.description}` : ""}`,
)
.join("\n\n");
for (let retry = 0; retry <= MAX_RETRIES; retry++) {
try {
@ -208,22 +215,28 @@ export async function rerankLinksWithLLM(
setTimeout(() => resolve(null), TIMEOUT_MS);
});
const completionPromise = generateOpenAICompletions(
logger.child({ method: "rerankLinksWithLLM", chunk: chunkIndex + 1, retry }),
logger.child({
method: "rerankLinksWithLLM",
chunk: chunkIndex + 1,
retry,
}),
{
mode: "llm",
systemPrompt: buildRerankerSystemPrompt(),
prompt: buildRerankerUserPrompt(searchQuery),
schema: schema
schema: schema,
},
linksContent,
undefined,
true
true,
);
const completion = await Promise.race([completionPromise, timeoutPromise]);
const completion = await Promise.race([
completionPromise,
timeoutPromise,
]);
if (!completion) {
// console.log(`Chunk ${chunkIndex + 1}: Timeout on attempt ${retry + 1}`);
continue;
@ -237,9 +250,11 @@ export async function rerankLinksWithLLM(
totalTokensUsed += completion.numTokens || 0;
// console.log(`Chunk ${chunkIndex + 1}: Found ${completion.extract.relevantLinks.length} relevant links`);
return completion.extract.relevantLinks;
} catch (error) {
console.warn(`Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`, error);
console.warn(
`Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`,
error,
);
if (retry === MAX_RETRIES) {
// console.log(`Chunk ${chunkIndex + 1}: Max retries reached, returning empty array`);
return [];
@ -247,18 +262,20 @@ export async function rerankLinksWithLLM(
}
}
return [];
})
}),
);
// console.log(`Processed ${results.length} chunks`);
// Flatten results and sort by relevance score
const flattenedResults = results.flat().sort((a, b) => b.relevanceScore - a.relevanceScore);
const flattenedResults = results
.flat()
.sort((a, b) => b.relevanceScore - a.relevanceScore);
// console.log(`Total relevant links found: ${flattenedResults.length}`);
// Map back to MapDocument format, keeping only relevant links
const relevantLinks = flattenedResults
.map(result => mappedLinks.find(link => link.url === result.url))
.map((result) => mappedLinks.find((link) => link.url === result.url))
.filter((link): link is MapDocument => link !== undefined);
// console.log(`Returning ${relevantLinks.length} relevant links`);
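
The scaffolding above reduces to a reusable pattern: chunk the input, race each chunk's LLM call against a timer, and treat a timeout as an empty result. A standalone sketch, with processChunk standing in for the generateOpenAICompletions call (the real code also retries each chunk up to MAX_RETRIES):

async function processInChunksWithTimeout<T, R>(
  items: T[],
  chunkSize: number,
  timeoutMs: number,
  processChunk: (chunk: T[]) => Promise<R[]>,
): Promise<R[]> {
  const chunks: T[][] = [];
  for (let i = 0; i < items.length; i += chunkSize) {
    chunks.push(items.slice(i, i + chunkSize));
  }
  const results = await Promise.all(
    chunks.map(async (chunk) => {
      const timeout = new Promise<null>((resolve) =>
        setTimeout(() => resolve(null), timeoutMs),
      );
      // Whichever settles first wins; a timed-out chunk contributes nothing
      const result = await Promise.race([processChunk(chunk), timeout]);
      return result ?? [];
    }),
  );
  return results.flat();
}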

View File

@ -184,8 +184,6 @@ export async function processUrl(
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
// );
const rerankerResult = await rerankLinksWithLLM(
mappedLinks,
rephrasedPrompt,

View File

@ -12,7 +12,9 @@ const tokenPerCharacter = 4;
const baseTokenCost = 300;
export function calculateFinalResultCost(data: any): number {
return Math.floor((JSON.stringify(data).length / tokenPerCharacter) + baseTokenCost);
return Math.floor(
JSON.stringify(data).length / tokenPerCharacter + baseTokenCost,
);
}
export function estimateTotalCost(tokenUsage: TokenUsage[]): number {
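
For concreteness, the billing formula above is tokens = floor(length(JSON.stringify(data)) / 4 + 300): four characters per token plus a 300-token base cost (note the constant is named tokenPerCharacter but is used as characters-per-token). A quick worked example:

const data = { products: [{ name: "Widget", price: 10 }] };
const len = JSON.stringify(data).length; // 43 characters
const tokensToBill = Math.floor(len / 4 + 300); // floor(10.75 + 300) = 310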

File diff suppressed because it is too large

View File

@ -55,9 +55,9 @@ async function performRanking(
// Generate embeddings for each link and calculate similarity in parallel
const linksAndScores = await Promise.all(
linksWithContext.map((linkWithContext, index) =>
linksWithContext.map((linkWithContext, index) =>
getEmbedding(linkWithContext)
.then(linkEmbedding => {
.then((linkEmbedding) => {
const score = cosineSimilarity(queryEmbedding, linkEmbedding);
return {
link: links[index],
@ -71,8 +71,8 @@ async function performRanking(
linkWithContext,
score: 0,
originalIndex: index,
}))
)
})),
),
);
// Sort links based on similarity scores while preserving original order for equal scores
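
performRanking above scores each link by cosine similarity between the query embedding and the link embedding. The helper itself is defined elsewhere in the codebase, so this is only the standard formula it is assumed to compute:

function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  // The || 1 guard (an assumption here) avoids NaN for zero vectors
  return dot / (Math.sqrt(normA) * Math.sqrt(normB) || 1);
}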

View File

@ -252,20 +252,19 @@ export class WebCrawler {
};
const timeoutPromise = new Promise((_, reject) => {
setTimeout(() => reject(new Error('Sitemap fetch timeout')), timeout);
setTimeout(() => reject(new Error("Sitemap fetch timeout")), timeout);
});
try {
let count = await Promise.race([
let count = (await Promise.race([
Promise.all([
this.tryFetchSitemapLinks(
this.initialUrl,
_urlsHandler,
),
...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
]).then(results => results.reduce((a,x) => a+x, 0)),
timeoutPromise
]) as number;
this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler),
...this.robots
.getSitemaps()
.map((x) => this.tryFetchSitemapLinks(x, _urlsHandler)),
]).then((results) => results.reduce((a, x) => a + x, 0)),
timeoutPromise,
])) as number;
if (count > 0) {
if (
@ -281,14 +280,14 @@ export class WebCrawler {
return count;
} catch (error) {
if (error.message === 'Sitemap fetch timeout') {
this.logger.warn('Sitemap fetch timed out', {
if (error.message === "Sitemap fetch timeout") {
this.logger.warn("Sitemap fetch timed out", {
method: "tryGetSitemap",
timeout,
});
return 0;
}
this.logger.error('Error fetching sitemap', {
this.logger.error("Error fetching sitemap", {
method: "tryGetSitemap",
error,
});
@ -328,9 +327,16 @@ export class WebCrawler {
!this.matchesExcludes(path) &&
!this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
) {
(async() => {
await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
(async () => {
await redisConnection.sadd(
"crawl:" + this.jobId + ":robots_blocked",
fullUrl,
);
await redisConnection.expire(
"crawl:" + this.jobId + ":robots_blocked",
24 * 60 * 60,
"NX",
);
})();
}
} else {
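
The robots-blocked bookkeeping above is a fire-and-forget async IIFE: add the URL to a per-crawl Redis set, then give the set a 24-hour TTL with NX so the expiry is only set once, when the first blocked URL arrives. The same pattern as a sketch, assuming an ioredis-style client (the .catch is an addition for safety, not in the original):

function recordBlockedUrl(redis: any, jobId: string, fullUrl: string) {
  (async () => {
    const key = "crawl:" + jobId + ":robots_blocked";
    await redis.sadd(key, fullUrl);
    // NX: keep the TTL set by the first write; later writes don't reset it
    await redis.expire(key, 24 * 60 * 60, "NX");
  })().catch(() => {});
}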

View File

@ -1,5 +1,8 @@
import { logger } from "../../lib/logger";
import { normalizeUrl, normalizeUrlOnlyHostname } from "../../lib/canonical-url";
import {
normalizeUrl,
normalizeUrlOnlyHostname,
} from "../../lib/canonical-url";
import { supabase_service } from "../../services/supabase";
/**
@ -28,13 +31,19 @@ async function querySitemapIndexFunction(url: string) {
return { urls: [], lastUpdated: new Date(0) };
}
const allUrls = [...new Set(data.map((entry) => entry.urls).flat().map(url => normalizeUrl(url)))];
const allUrls = [
...new Set(
data
.map((entry) => entry.urls)
.flat()
.map((url) => normalizeUrl(url)),
),
];
return { urls: allUrls, lastUpdated: data[0].updated_at };
} catch (error) {
logger.error("(sitemap-index) Error querying the index", {
logger.error("(sitemap-index) Error querying the index", {
error,
attempt
attempt,
});
if (attempt === 3) {
@ -46,4 +55,7 @@ async function querySitemapIndexFunction(url: string) {
return { urls: [], lastUpdated: new Date(0) };
}
export const querySitemapIndex = withAuth(querySitemapIndexFunction, { urls: [], lastUpdated: new Date(0) });
export const querySitemapIndex = withAuth(querySitemapIndexFunction, {
urls: [],
lastUpdated: new Date(0),
});

View File

@ -24,55 +24,79 @@ export async function getLinksFromSitemap(
try {
if (mode === "fire-engine" && useFireEngine) {
const fetchResponse = await scrapeURL(
"sitemap",
"sitemap",
sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fetch" },
);
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
if (
fetchResponse.success &&
fetchResponse.document.metadata.statusCode >= 200 &&
fetchResponse.document.metadata.statusCode < 300
) {
content = fetchResponse.document.rawHtml!;
} else {
logger.debug(
"Failed to scrape sitemap via fetch, falling back to TLSClient...",
{ error: fetchResponse.success ? fetchResponse.document : fetchResponse.error },
{
error: fetchResponse.success
? fetchResponse.document
: fetchResponse.error,
},
);
const tlsResponse = await scrapeURL(
"sitemap",
"sitemap",
sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
);
if (tlsResponse.success && (tlsResponse.document.metadata.statusCode >= 200 && tlsResponse.document.metadata.statusCode < 300)) {
if (
tlsResponse.success &&
tlsResponse.document.metadata.statusCode >= 200 &&
tlsResponse.document.metadata.statusCode < 300
) {
content = tlsResponse.document.rawHtml!;
} else {
logger.error(`Request failed for ${sitemapUrl}, ran out of engines!`, {
method: "getLinksFromSitemap",
mode,
sitemapUrl,
error: tlsResponse.success ? tlsResponse.document : tlsResponse.error,
});
logger.error(
`Request failed for ${sitemapUrl}, ran out of engines!`,
{
method: "getLinksFromSitemap",
mode,
sitemapUrl,
error: tlsResponse.success
? tlsResponse.document
: tlsResponse.error,
},
);
return 0;
}
}
} else {
const fetchResponse = await scrapeURL(
"sitemap",
"sitemap",
sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fetch" },
);
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
if (
fetchResponse.success &&
fetchResponse.document.metadata.statusCode >= 200 &&
fetchResponse.document.metadata.statusCode < 300
) {
content = fetchResponse.document.rawHtml!;
} else {
logger.error(`Request failed for ${sitemapUrl}, ran out of engines!`, {
method: "getLinksFromSitemap",
mode,
sitemapUrl,
});
logger.error(
`Request failed for ${sitemapUrl}, ran out of engines!`,
{
method: "getLinksFromSitemap",
mode,
sitemapUrl,
},
);
return 0;
}
}
@ -165,13 +189,20 @@ export const fetchSitemapData = async (
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
try {
const fetchResponse = await scrapeURL(
"sitemap",
"sitemap",
sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"], timeout: timeout || axiosTimeout }),
scrapeOptions.parse({
formats: ["rawHtml"],
timeout: timeout || axiosTimeout,
}),
{ forceEngine: "fetch" },
);
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
if (
fetchResponse.success &&
fetchResponse.document.metadata.statusCode >= 200 &&
fetchResponse.document.metadata.statusCode < 300
) {
const xml = fetchResponse.document.rawHtml!;
const parsedXml = await parseStringPromise(xml);
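
The sitemap fetching above follows a try-the-cheap-engine-first shape: plain fetch, then the TLS client, giving up only when every engine has failed. Reduced to a sketch (scrapeWith is a stand-in, not the real scrapeURL signature; "ok" here stands for success plus a 2xx status):

async function fetchWithFallback(
  url: string,
  scrapeWith: (url: string, engine: string) => Promise<{ ok: boolean; html?: string }>,
): Promise<string | null> {
  for (const engine of ["fetch", "fire-engine;tlsclient"]) {
    const res = await scrapeWith(url, engine);
    if (res.ok && res.html) return res.html;
  }
  return null; // ran out of engines
}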

View File

@ -17,7 +17,6 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
throw new EngineError("Cache hit but HTML is too short to be useful");
}
// Set fromCache flag to indicate this document was retrieved from cache
meta.internalOptions.fromCache = true;

View File

@ -3,7 +3,12 @@ import * as Sentry from "@sentry/node";
import { z } from "zod";
import { robustFetch } from "../../lib/fetch";
import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
import {
ActionError,
EngineError,
SiteError,
UnsupportedFileError,
} from "../../error";
import { MockState } from "../../lib/mock";
const successSchema = z.object({

View File

@ -4,7 +4,11 @@ import * as Sentry from "@sentry/node";
import { robustFetch } from "../../lib/fetch";
import { MockState } from "../../lib/mock";
export async function fireEngineDelete(logger: Logger, jobId: string, mock: MockState | null) {
export async function fireEngineDelete(
logger: Logger,
jobId: string,
mock: MockState | null,
) {
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
await Sentry.startSpan(

View File

@ -143,7 +143,10 @@ async function buildMetaObject(
logger,
logs,
featureFlags: buildFeatureFlags(url, options, internalOptions),
mock: options.useMock !== undefined ? await loadMock(options.useMock, _logger) : null,
mock:
options.useMock !== undefined
? await loadMock(options.useMock, _logger)
: null,
};
}

View File

@ -34,7 +34,7 @@ export async function robustFetch<
requestId = crypto.randomUUID(),
tryCount = 1,
tryCooldown,
mock
mock,
}: RobustFetchParams<Schema>): Promise<Output> {
const params = {
url,
@ -51,8 +51,8 @@ export async function robustFetch<
let response: {
status: number;
headers: Headers,
body: string,
headers: Headers;
body: string;
};
if (mock === null) {
@ -123,25 +123,33 @@ export async function robustFetch<
return null as Output;
}
const makeRequestTypeId = (request: typeof mock["requests"][number]["options"]) => {
const makeRequestTypeId = (
request: (typeof mock)["requests"][number]["options"],
) => {
let out = request.url + ";" + request.method;
if (process.env.FIRE_ENGINE_BETA_URL && url.startsWith(process.env.FIRE_ENGINE_BETA_URL) && request.method === "POST") {
if (
process.env.FIRE_ENGINE_BETA_URL &&
url.startsWith(process.env.FIRE_ENGINE_BETA_URL) &&
request.method === "POST"
) {
out += "f-e;" + request.body?.engine + ";" + request.body?.url;
}
return out;
}
};
const thisId = makeRequestTypeId(params);
const matchingMocks = mock.requests.filter(x => makeRequestTypeId(x.options) === thisId).sort((a,b) => a.time - b.time);
const matchingMocks = mock.requests
.filter((x) => makeRequestTypeId(x.options) === thisId)
.sort((a, b) => a.time - b.time);
const nextI = mock.tracker[thisId] ?? 0;
mock.tracker[thisId] = nextI + 1;
if (!matchingMocks[nextI]) {
throw new Error("Failed to mock request -- no mock targets found.");
}
response = {
...(matchingMocks[nextI].result),
...matchingMocks[nextI].result,
headers: new Headers(matchingMocks[nextI].result.headers),
};
}
@ -180,12 +188,15 @@ export async function robustFetch<
}
if (mock === null) {
await saveMock({
...params,
logger: undefined,
schema: undefined,
headers: undefined,
}, response);
await saveMock(
{
...params,
logger: undefined,
schema: undefined,
headers: undefined,
},
response,
);
}
let data: Output;
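
The mock-replay bookkeeping above is compact enough to restate: recorded requests are keyed by URL and method (plus engine details for fire-engine POSTs), and a per-key counter hands back recorded results in timestamp order. A simplified sketch without the fire-engine special case:

type Recorded = {
  time: number;
  options: { url: string; method: string };
  result: any;
};

function nextMockResult(
  mock: { requests: Recorded[]; tracker: Record<string, number> },
  url: string,
  method: string,
): any {
  const id = url + ";" + method;
  const matching = mock.requests
    .filter((r) => r.options.url + ";" + r.options.method === id)
    .sort((a, b) => a.time - b.time);
  const nextI = mock.tracker[id] ?? 0;
  mock.tracker[id] = nextI + 1;
  if (!matching[nextI]) {
    throw new Error("Failed to mock request -- no mock targets found.");
  }
  return matching[nextI].result;
}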

View File

@ -6,55 +6,70 @@ const saveMocksDirPath = path.join(__dirname, "../mocks/").replace("dist/", "");
const loadMocksDirPath = path.join(__dirname, "../../../__tests__/snips/mocks");
export async function saveMock(options: unknown, result: unknown) {
if (process.env.FIRECRAWL_SAVE_MOCKS !== "true") return;
if (process.env.FIRECRAWL_SAVE_MOCKS !== "true") return;
await fs.mkdir(saveMocksDirPath, { recursive: true });
await fs.mkdir(saveMocksDirPath, { recursive: true });
const fileName = Date.now() + "-" + crypto.randomUUID() + ".json";
const filePath = path.join(saveMocksDirPath, fileName);
console.log(filePath);
const fileName = Date.now() + "-" + crypto.randomUUID() + ".json";
const filePath = path.join(saveMocksDirPath, fileName);
console.log(filePath);
await fs.writeFile(filePath, JSON.stringify({
await fs.writeFile(
filePath,
JSON.stringify(
{
time: Date.now(),
options,
result,
}, undefined, 4));
},
undefined,
4,
),
);
}
export type MockState = {
requests: {
time: number,
options: {
url: string,
method: string,
body?: any,
ignoreResponse: boolean,
ignoreFailure: boolean,
tryCount: number,
tryCooldown?: number,
},
result: any,
}[],
tracker: Record<string, number>,
}
requests: {
time: number;
options: {
url: string;
method: string;
body?: any;
ignoreResponse: boolean;
ignoreFailure: boolean;
tryCount: number;
tryCooldown?: number;
};
result: any;
}[];
tracker: Record<string, number>;
};
export async function loadMock(name: string, logger: Logger = _logger): Promise<MockState | null> {
try {
const mockPath = path.join(loadMocksDirPath, name + ".json");
export async function loadMock(
name: string,
logger: Logger = _logger,
): Promise<MockState | null> {
try {
const mockPath = path.join(loadMocksDirPath, name + ".json");
const relative = path.relative(loadMocksDirPath, mockPath);
if (!relative || relative.startsWith("..") || path.isAbsolute(relative)) {
// reject path traversal out of the mocks directory
return null;
}
const load = JSON.parse(await fs.readFile(mockPath, "utf8"));
return {
requests: load,
tracker: {},
};
} catch (error) {
logger.warn("Failed to load mock file!", { name, module: "scrapeURL:mock", method: "loadMock", error });
return null;
const relative = path.relative(loadMocksDirPath, mockPath);
if (!relative || relative.startsWith("..") || path.isAbsolute(relative)) {
// reject path traversal out of the mocks directory
return null;
}
const load = JSON.parse(await fs.readFile(mockPath, "utf8"));
return {
requests: load,
tracker: {},
};
} catch (error) {
logger.warn("Failed to load mock file!", {
name,
module: "scrapeURL:mock",
method: "loadMock",
error,
});
return null;
}
}
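
The path.relative check in loadMock above is a path traversal guard: the requested mock name must resolve to a file inside the mocks directory. The same check as a standalone helper:

import * as path from "path";

function isInsideDir(baseDir: string, candidate: string): boolean {
  const relative = path.relative(baseDir, candidate);
  return (
    relative !== "" && !relative.startsWith("..") && !path.isAbsolute(relative)
  );
}

// isInsideDir("/mocks", "/mocks/a.json")        -> true
// isInsideDir("/mocks", "/mocks/../etc/passwd") -> false (resolves outside)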

View File

@ -119,16 +119,16 @@ export const htmlTransform = (
// always return biggest image
soup("img[srcset]").each((_, el) => {
const sizes = el.attribs.srcset.split(",").map(x => {
const sizes = el.attribs.srcset.split(",").map((x) => {
const tok = x.trim().split(" ");
return {
url: tok[0],
size: parseInt((tok[1] ?? "1x").slice(0, -1), 10),
isX: (tok[1] ?? "").endsWith("x")
isX: (tok[1] ?? "").endsWith("x"),
};
});
if (sizes.every(x => x.isX) && el.attribs.src) {
if (sizes.every((x) => x.isX) && el.attribs.src) {
sizes.push({
url: el.attribs.src,
size: 1,
@ -136,7 +136,7 @@ export const htmlTransform = (
});
}
sizes.sort((a,b) => b.size - a.size);
sizes.sort((a, b) => b.size - a.size);
el.attribs.src = sizes[0]?.url;
});
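
The srcset handling above parses each candidate, treats "x" descriptors as pixel densities, appends the plain src as a 1x fallback when every candidate is density-based, and keeps the largest. Run on a sample string (illustrative):

const srcset = "small.jpg 1x, medium.jpg 2x, large.jpg 3x";
const sizes = srcset.split(",").map((x) => {
  const tok = x.trim().split(" ");
  return {
    url: tok[0],
    size: parseInt((tok[1] ?? "1x").slice(0, -1), 10),
    isX: (tok[1] ?? "").endsWith("x"),
  };
});
sizes.sort((a, b) => b.size - a.size);
console.log(sizes[0]?.url); // "large.jpg"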

View File

@ -41,7 +41,11 @@ export function deriveHTMLFromRawHTML(
);
}
document.html = htmlTransform(document.rawHtml, document.metadata.url ?? document.metadata.sourceURL ?? meta.url, meta.options);
document.html = htmlTransform(
document.rawHtml,
document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
meta.options,
);
return document;
}

View File

@ -1,7 +1,11 @@
import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { Document, ExtractOptions, TokenUsage } from "../../../controllers/v1/types";
import {
Document,
ExtractOptions,
TokenUsage,
} from "../../../controllers/v1/types";
import { Logger } from "winston";
import { EngineResultsTracker, Meta } from "..";
import { logger } from "../../../lib/logger";
@ -72,14 +76,20 @@ export async function generateOpenAICompletions(
markdown?: string,
previousWarning?: string,
isExtractEndpoint?: boolean,
model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini",
): Promise<{ extract: any; numTokens: number; warning: string | undefined; totalUsage: TokenUsage, model: string }> {
model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ??
"gpt-4o-mini",
): Promise<{
extract: any;
numTokens: number;
warning: string | undefined;
totalUsage: TokenUsage;
model: string;
}> {
let extract: any;
let warning: string | undefined;
const openai = new OpenAI();
if (markdown === undefined) {
throw new Error("document.markdown is undefined -- this is unexpected");
}
@ -208,8 +218,8 @@ export async function generateOpenAICompletions(
}
}
const promptTokens = (jsonCompletion.usage?.prompt_tokens ?? 0);
const completionTokens = (jsonCompletion.usage?.completion_tokens ?? 0);
const promptTokens = jsonCompletion.usage?.prompt_tokens ?? 0;
const completionTokens = jsonCompletion.usage?.completion_tokens ?? 0;
// If the users actually wants the items object, they can specify it as 'required' in the schema
// otherwise, we just return the items array
@ -222,7 +232,17 @@ export async function generateOpenAICompletions(
}
// num tokens (just user prompt tokenized) | deprecated
// totalTokens = promptTokens + completionTokens
return { extract, warning, numTokens, totalUsage: { promptTokens, completionTokens, totalTokens: promptTokens + completionTokens }, model };
return {
extract,
warning,
numTokens,
totalUsage: {
promptTokens,
completionTokens,
totalTokens: promptTokens + completionTokens,
},
model,
};
}
export async function performLLMExtract(
@ -238,7 +258,7 @@ export async function performLLMExtract(
document.markdown,
document.warning,
);
if (meta.options.formats.includes("json")) {
document.json = extract;
} else {
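
generateOpenAICompletions above returns one totalUsage entry per call; those entries are what estimateTotalCost presumably tallies (its pricing logic is not shown in this diff). A minimal sum over the TokenUsage shape used in the return value above:

type TokenUsage = {
  promptTokens: number;
  completionTokens: number;
  totalTokens: number;
};

function sumTokenUsage(usage: TokenUsage[]): TokenUsage {
  return usage.reduce(
    (acc, u) => ({
      promptTokens: acc.promptTokens + u.promptTokens,
      completionTokens: acc.completionTokens + u.completionTokens,
      totalTokens: acc.totalTokens + u.totalTokens,
    }),
    { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
  );
}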

View File

@ -32,7 +32,7 @@ export async function autoCharge(
const resource = `auto-recharge:${chunk.team_id}`;
const cooldownKey = `auto-recharge-cooldown:${chunk.team_id}`;
if(chunk.team_id === "285bb597-6eaf-4b96-801c-51461fc3c543"){
if (chunk.team_id === "285bb597-6eaf-4b96-801c-51461fc3c543") {
return {
success: false,
message: "Auto-recharge failed",

View File

@ -107,15 +107,15 @@ async function processBatch() {
// Keep most recent entry and mark others for deletion
const [mostRecent, ...duplicates] = existingForOrigin;
if (duplicates.length > 0) {
duplicatesToDelete.push(...duplicates.map(d => d.id));
duplicatesToDelete.push(...duplicates.map((d) => d.id));
}
// Merge and deduplicate URLs
const mergedUrls = [
...new Set([
...mostRecent.urls,
...op.standardizedUrls.map(url => normalizeUrl(url))
])
...op.standardizedUrls.map((url) => normalizeUrl(url)),
]),
];
updates.push({
@ -127,7 +127,9 @@ async function processBatch() {
});
} else {
// Prepare insert with deduplicated URLs
const deduplicatedUrls = [...new Set(op.standardizedUrls.map(url => normalizeUrl(url)))];
const deduplicatedUrls = [
...new Set(op.standardizedUrls.map((url) => normalizeUrl(url))),
];
inserts.push({
origin_url: op.originUrl,
urls: deduplicatedUrls,
@ -140,8 +142,10 @@ async function processBatch() {
// Delete duplicate entries
if (duplicatesToDelete.length > 0) {
logger.info(`🗑️ Deleting ${duplicatesToDelete.length} duplicate crawl maps in batches of 100`);
logger.info(
`🗑️ Deleting ${duplicatesToDelete.length} duplicate crawl maps in batches of 100`,
);
// Delete in batches of 100
for (let i = 0; i < duplicatesToDelete.length; i += 100) {
const batch = duplicatesToDelete.slice(i, i + 100);
@ -151,11 +155,14 @@ async function processBatch() {
.in("id", batch);
if (deleteError) {
logger.error(`Failed to delete batch ${i/100 + 1} of duplicate crawl maps`, {
error: deleteError,
batchSize: batch.length,
startIndex: i
});
logger.error(
`Failed to delete batch ${i / 100 + 1} of duplicate crawl maps`,
{
error: deleteError,
batchSize: batch.length,
startIndex: i,
},
);
}
}
}
@ -165,7 +172,7 @@ async function processBatch() {
logger.info(`🔄 Updating ${updates.length} existing crawl maps`, {
origins: updates.map((u) => u.origin_url),
});
// Process updates one at a time to avoid conflicts
for (const update of updates) {
const { error: updateError } = await supabase_service
@ -175,7 +182,7 @@ async function processBatch() {
if (updateError) {
logger.error("Failed to update crawl map", {
error: updateError,
origin: update.origin_url
origin: update.origin_url,
});
}
}

View File

@ -3,18 +3,27 @@ import "../sentry";
import * as Sentry from "@sentry/node";
import { Job, Queue, Worker } from "bullmq";
import { logger as _logger, logger } from "../../lib/logger";
import { redisConnection, indexQueueName, getIndexQueue } from "../queue-service";
import {
redisConnection,
indexQueueName,
getIndexQueue,
} from "../queue-service";
import { saveCrawlMap } from "./crawl-maps-index";
import systemMonitor from "../system-monitor";
import { v4 as uuidv4 } from "uuid";
const workerLockDuration = Number(process.env.WORKER_LOCK_DURATION) || 60000;
const workerStalledCheckInterval = Number(process.env.WORKER_STALLED_CHECK_INTERVAL) || 30000;
const jobLockExtendInterval = Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000;
const jobLockExtensionTime = Number(process.env.JOB_LOCK_EXTENSION_TIME) || 60000;
const workerStalledCheckInterval =
Number(process.env.WORKER_STALLED_CHECK_INTERVAL) || 30000;
const jobLockExtendInterval =
Number(process.env.JOB_LOCK_EXTEND_INTERVAL) || 15000;
const jobLockExtensionTime =
Number(process.env.JOB_LOCK_EXTENSION_TIME) || 60000;
const cantAcceptConnectionInterval = Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000;
const connectionMonitorInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10;
const cantAcceptConnectionInterval =
Number(process.env.CANT_ACCEPT_CONNECTION_INTERVAL) || 2000;
const connectionMonitorInterval =
Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10;
const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
const runningJobs: Set<string> = new Set();
@ -88,7 +97,7 @@ const workerFun = async (queue: Queue) => {
const token = uuidv4();
const canAcceptConnection = await monitor.acceptConnection();
if (!canAcceptConnection) {
logger.info("Cant accept connection");
cantAcceptConnectionCount++;
@ -100,7 +109,9 @@ const workerFun = async (queue: Queue) => {
});
}
await new Promise(resolve => setTimeout(resolve, cantAcceptConnectionInterval));
await new Promise((resolve) =>
setTimeout(resolve, cantAcceptConnectionInterval),
);
continue;
} else {
cantAcceptConnectionCount = 0;
@ -141,15 +152,17 @@ const workerFun = async (queue: Queue) => {
runningJobs.delete(job.id);
}
await new Promise(resolve => setTimeout(resolve, gotJobInterval));
await new Promise((resolve) => setTimeout(resolve, gotJobInterval));
} else {
await new Promise(resolve => setTimeout(resolve, connectionMonitorInterval));
await new Promise((resolve) =>
setTimeout(resolve, connectionMonitorInterval),
);
}
}
logger.info("Worker loop ended. Waiting for running jobs to finish...");
while (runningJobs.size > 0) {
await new Promise(resolve => setTimeout(resolve, 500));
await new Promise((resolve) => setTimeout(resolve, 500));
}
logger.info("All jobs finished. Worker exiting!");
process.exit(0);
@ -158,4 +171,4 @@ const workerFun = async (queue: Queue) => {
// Start the worker
(async () => {
await workerFun(getIndexQueue());
})();
})();
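
The await new Promise((resolve) => setTimeout(resolve, ms)) idiom recurs throughout this worker loop; a tiny named helper would keep the call sites readable (a sketch, not something this diff introduces):

const sleep = (ms: number): Promise<void> =>
  new Promise((resolve) => setTimeout(resolve, ms));

// e.g. await sleep(cantAcceptConnectionInterval);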

View File

@ -93,7 +93,9 @@ const runningJobs: Set<string> = new Set();
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
if (await finishCrawl(job.data.crawl_id)) {
(async () => {
const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
const originUrl = sc.originUrl
? normalizeUrlOnlyHostname(sc.originUrl)
: undefined;
// Get all visited unique URLs from Redis
const visitedUrls = await redisConnection.smembers(
"crawl:" + job.data.crawl_id + ":visited_unique",
@ -113,7 +115,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
},
{
priority: 10,
}
},
);
}
})();
@ -315,11 +317,14 @@ const processExtractJobInternal = async (
return result;
} else {
// throw new Error(result.error || "Unknown error during extraction");
await job.moveToCompleted(result, token, false);
await updateExtract(job.data.extractId, {
status: "failed",
error: result.error ?? "Unknown error, please contact help@firecrawl.com. Extract id: " + job.data.extractId,
error:
result.error ??
"Unknown error, please contact help@firecrawl.com. Extract id: " +
job.data.extractId,
});
return result;
@ -348,7 +353,14 @@ const processExtractJobInternal = async (
"Unknown error, please contact help@firecrawl.com. Extract id: " +
job.data.extractId,
});
return { success: false, error: error.error ?? error ?? "Unknown error, please contact help@firecrawl.com. Extract id: " + job.data.extractId };
return {
success: false,
error:
error.error ??
error ??
"Unknown error, please contact help@firecrawl.com. Extract id: " +
job.data.extractId,
};
// throw error;
} finally {
clearInterval(extendLockInterval);
@ -949,13 +961,15 @@ async function processJob(job: Job & { id: string }, token: string) {
}
if (job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID!) {
billTeam(job.data.team_id, undefined, creditsToBeBilled, logger).catch((error) => {
logger.error(
`Failed to bill team ${job.data.team_id} for ${creditsToBeBilled} credits`,
{ error },
);
// Optionally, you could notify an admin or add to a retry queue here
});
billTeam(job.data.team_id, undefined, creditsToBeBilled, logger).catch(
(error) => {
logger.error(
`Failed to bill team ${job.data.team_id} for ${creditsToBeBilled} credits`,
{ error },
);
// Optionally, you could notify an admin or add to a retry queue here
},
);
}
}
@ -974,11 +988,12 @@ async function processJob(job: Job & { id: string }, token: string) {
await finishCrawlIfNeeded(job, sc);
}
const isEarlyTimeout =
error instanceof Error && error.message === "timeout";
const isCancelled =
error instanceof Error && error.message === "Parent crawl/batch scrape was cancelled";
error instanceof Error &&
error.message === "Parent crawl/batch scrape was cancelled";
if (isEarlyTimeout) {
logger.error(`🐂 Job timed out ${job.id}`);