From e74e4bcefc5ebf97ef8fbe726c21e924bfef7b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 13 Dec 2024 23:46:33 +0100 Subject: [PATCH 01/65] feat(runWebScraper): retry a scrape max 3 times in a crawl if the status code is failure --- apps/api/logview.js | 16 +- apps/api/src/controllers/v0/scrape.ts | 16 +- apps/api/src/controllers/v1/extract.ts | 4 +- apps/api/src/controllers/v1/types.ts | 2 +- apps/api/src/main/runWebScraper.ts | 139 ++++++++++-------- .../scraper/scrapeURL/lib/extractMetadata.ts | 2 +- apps/api/src/types.ts | 1 + 7 files changed, 108 insertions(+), 72 deletions(-) diff --git a/apps/api/logview.js b/apps/api/logview.js index 232d2cda..3c0db523 100644 --- a/apps/api/logview.js +++ b/apps/api/logview.js @@ -1,7 +1,19 @@ const fs = require("fs"); -const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8") - .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x)); +// METHOD: Winston log file +// const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8") +// .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x)); + +// METHOD: GCloud export +const logs = [ + "downloaded-logs-20241213-225607.json", + "downloaded-logs-20241213-225654.json", + "downloaded-logs-20241213-225720.json", + "downloaded-logs-20241213-225758.json", + "downloaded-logs-20241213-225825.json", + "downloaded-logs-20241213-225843.json", +].flatMap(x => JSON.parse(fs.readFileSync(x, "utf8"))).map(x => x.jsonPayload); + const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))]; diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 8501e502..96e6ea4f 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -8,7 +8,6 @@ import { authenticateUser } from "../auth"; import { PlanType, RateLimiterMode } from "../../types"; import { logJob } from "../../services/logging/log_job"; import { - Document, fromLegacyCombo, toLegacyDocument, url as urlSchema, @@ -29,6 +28,7 @@ import * as Sentry from "@sentry/node"; import { getJobPriority } from "../../lib/job-priority"; import { fromLegacyScrapeOptions } from "../v1/types"; import { ZodError } from "zod"; +import { Document as V0Document } from "./../../lib/entities"; export async function scrapeHelper( jobId: string, @@ -42,7 +42,7 @@ export async function scrapeHelper( ): Promise<{ success: boolean; error?: string; - data?: Document | { url: string }; + data?: V0Document | { url: string }; returnCode: number; }> { const url = urlSchema.parse(req.body.url); @@ -241,9 +241,9 @@ export async function scrapeController(req: Request, res: Response) { const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; const numTokens = - result.data && (result.data as Document).markdown + result.data && (result.data as V0Document).markdown ? 
numTokensFromString( - (result.data as Document).markdown!, + (result.data as V0Document).markdown!, "gpt-3.5-turbo", ) : 0; @@ -276,14 +276,14 @@ export async function scrapeController(req: Request, res: Response) { let doc = result.data; if (!pageOptions || !pageOptions.includeRawHtml) { - if (doc && (doc as Document).rawHtml) { - delete (doc as Document).rawHtml; + if (doc && (doc as V0Document).rawHtml) { + delete (doc as V0Document).rawHtml; } } if (pageOptions && pageOptions.includeExtract) { - if (!pageOptions.includeMarkdown && doc && (doc as Document).markdown) { - delete (doc as Document).markdown; + if (!pageOptions.includeMarkdown && doc && (doc as V0Document).markdown) { + delete (doc as V0Document).markdown; } } diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index 0c286253..d05dbf6e 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -1,6 +1,6 @@ import { Request, Response } from "express"; import { - // Document, + Document, RequestWithAuth, ExtractRequest, extractRequestSchema, @@ -8,7 +8,7 @@ import { MapDocument, scrapeOptions, } from "./types"; -import { Document } from "../../lib/entities"; +// import { Document } from "../../lib/entities"; import Redis from "ioredis"; import { configDotenv } from "dotenv"; import { performRanking } from "../../lib/ranker"; diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 076d8b0b..d3f110c8 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -396,7 +396,7 @@ export type Document = { articleSection?: string; url?: string; sourceURL?: string; - statusCode?: number; + statusCode: number; error?: string; [key: string]: string | string[] | number | undefined; }; diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index dc907371..411acfe6 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -49,6 +49,7 @@ export async function startWebScraperPipeline({ bull_job_id: job.id.toString(), priority: job.opts.priority, is_scrape: job.data.is_scrape ?? false, + is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null), }); } @@ -63,73 +64,63 @@ export async function runWebScraper({ bull_job_id, priority, is_scrape = false, + is_crawl = false, }: RunWebScraperParams): Promise { + const tries = is_crawl ? 3 : 1; + let response: ScrapeUrlResponse | undefined = undefined; let engines: EngineResultsTracker = {}; - try { - response = await scrapeURL(bull_job_id, url, scrapeOptions, { - priority, - ...internalOptions, - }); - if (!response.success) { - if (response.error instanceof Error) { - throw response.error; - } else { - throw new Error( - "scrapeURL error: " + - (Array.isArray(response.error) - ? JSON.stringify(response.error) - : typeof response.error === "object" - ? 
JSON.stringify({ ...response.error }) - : response.error), - ); - } + let error: any = undefined; + + for (let i = 0; i < tries; i++) { + if (i > 0) { + logger.debug("Retrying scrape...", { scrapeId: bull_job_id, jobId: bull_job_id, method: "runWebScraper", module: "runWebScraper", tries, i, previousStatusCode: (response as any)?.document?.metadata?.statusCode, previousError: error }); } - if (is_scrape === false) { - let creditsToBeBilled = 1; // Assuming 1 credit per document - if (scrapeOptions.extract) { - creditsToBeBilled = 5; - } + response = undefined; + engines = {}; + error = undefined; - billTeam(team_id, undefined, creditsToBeBilled).catch((error) => { - logger.error( - `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, - ); - // Optionally, you could notify an admin or add to a retry queue here + try { + response = await scrapeURL(bull_job_id, url, scrapeOptions, { + priority, + ...internalOptions, }); + if (!response.success) { + if (response.error instanceof Error) { + throw response.error; + } else { + throw new Error( + "scrapeURL error: " + + (Array.isArray(response.error) + ? JSON.stringify(response.error) + : typeof response.error === "object" + ? JSON.stringify({ ...response.error }) + : response.error), + ); + } + } + + // This is where the returnvalue from the job is set + // onSuccess(response.document, mode); + + engines = response.engines; + + if ((response.document.metadata.statusCode >= 200 && response.document.metadata.statusCode < 300) || response.document.metadata.statusCode === 304) { + // status code is good -- do not attempt retry + break; + } + } catch (error) { + engines = + response !== undefined + ? response.engines + : typeof error === "object" && error !== null + ? ((error as any).results ?? {}) + : {}; } + } - // This is where the returnvalue from the job is set - // onSuccess(response.document, mode); - - engines = response.engines; - return response; - } catch (error) { - engines = - response !== undefined - ? response.engines - : typeof error === "object" && error !== null - ? ((error as any).results ?? 
{}) - : {}; - - if (response !== undefined) { - return { - ...response, - success: false, - error, - }; - } else { - return { - success: false, - error, - logs: ["no logs -- error coming from runWebScraper"], - engines, - }; - } - // onError(error); - } finally { - const engineOrder = Object.entries(engines) + const engineOrder = Object.entries(engines) .sort((a, b) => a[1].startedAt - b[1].startedAt) .map((x) => x[0]) as Engine[]; @@ -158,6 +149,38 @@ export async function runWebScraper({ }, }); } + + if (error === undefined && response?.success) { + if (is_scrape === false) { + let creditsToBeBilled = 1; // Assuming 1 credit per document + if (scrapeOptions.extract) { + creditsToBeBilled = 5; + } + + billTeam(team_id, undefined, creditsToBeBilled).catch((error) => { + logger.error( + `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, + ); + // Optionally, you could notify an admin or add to a retry queue here + }); + } + + return response; + } else { + if (response !== undefined) { + return { + ...response, + success: false, + error, + }; + } else { + return { + success: false, + error, + logs: ["no logs -- error coming from runWebScraper"], + engines, + }; + } } } diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts index 040bf0ee..c67f9cbd 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts @@ -5,7 +5,7 @@ import { Meta } from ".."; export function extractMetadata( meta: Meta, html: string, -): Document["metadata"] { +): Partial { let title: string | undefined = undefined; let description: string | undefined = undefined; let language: string | undefined = undefined; diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 5325a0ad..9db79bc5 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -55,6 +55,7 @@ export interface RunWebScraperParams { bull_job_id: string; priority?: number; is_scrape?: boolean; + is_crawl?: boolean; } export type RunWebScraperResult = From 4b5014d7fe1336129f91e97b99a0fd495a4e019b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 14 Dec 2024 01:11:43 +0100 Subject: [PATCH 02/65] feat(v1/batch/scrape): add ignoreInvalidURLs option --- apps/api/src/controllers/v1/batch-scrape.ts | 40 +++++++++++++++++---- apps/api/src/controllers/v1/types.ts | 34 ++++++++++++++++++ apps/api/src/lib/crawl-redis.ts | 4 +++ apps/api/src/services/queue-jobs.ts | 1 + 4 files changed, 72 insertions(+), 7 deletions(-) diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index 89fa6741..19ce3ba0 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -3,9 +3,11 @@ import { v4 as uuidv4 } from "uuid"; import { BatchScrapeRequest, batchScrapeRequestSchema, - CrawlResponse, + batchScrapeRequestSchemaNoURLValidation, + url as urlSchema, RequestWithAuth, ScrapeOptions, + BatchScrapeResponse, } from "./types"; import { addCrawlJobs, @@ -21,10 +23,14 @@ import { callWebhook } from "../../services/webhook"; import { logger as _logger } from "../../lib/logger"; export async function batchScrapeController( - req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>, - res: Response, + req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>, + res: Response, ) { - req.body = batchScrapeRequestSchema.parse(req.body); + if (req.body?.ignoreInvalidURLs === true) { + 
req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body); + } else { + req.body = batchScrapeRequestSchema.parse(req.body); + } const id = req.body.appendToId ?? uuidv4(); const logger = _logger.child({ @@ -35,8 +41,27 @@ export async function batchScrapeController( teamId: req.auth.team_id, plan: req.auth.plan, }); + + let urls = req.body.urls; + let invalidURLs: string[] | undefined = undefined; + + if (req.body.ignoreInvalidURLs) { + invalidURLs = []; + + let pendingURLs = urls; + urls = []; + for (const u of pendingURLs) { + try { + const nu = urlSchema.parse(u); + urls.push(nu); + } catch (_) { + invalidURLs.push(u); + } + } + } + logger.debug("Batch scrape " + id + " starting", { - urlsLength: req.body.urls, + urlsLength: urls, appendToId: req.body.appendToId, account: req.account, }); @@ -70,7 +95,7 @@ export async function batchScrapeController( // If it is over 1000, we need to get the job priority, // otherwise we can use the default priority of 20 - if (req.body.urls.length > 1000) { + if (urls.length > 1000) { // set base to 21 jobPriority = await getJobPriority({ plan: req.auth.plan, @@ -84,7 +109,7 @@ export async function batchScrapeController( delete (scrapeOptions as any).urls; delete (scrapeOptions as any).appendToId; - const jobs = req.body.urls.map((x) => { + const jobs = urls.map((x) => { return { data: { url: x, @@ -140,5 +165,6 @@ export async function batchScrapeController( success: true, id, url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`, + invalidURLs, }); } diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index d3f110c8..f7226338 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -262,6 +262,31 @@ export const batchScrapeRequestSchema = scrapeOptions origin: z.string().optional().default("api"), webhook: webhookSchema.optional(), appendToId: z.string().uuid().optional(), + ignoreInvalidURLs: z.boolean().default(false), + }) + .strict(strictMessage) + .refine( + (obj) => { + const hasExtractFormat = obj.formats?.includes("extract"); + const hasExtractOptions = obj.extract !== undefined; + return ( + (hasExtractFormat && hasExtractOptions) || + (!hasExtractFormat && !hasExtractOptions) + ); + }, + { + message: + "When 'extract' format is specified, 'extract' options must be provided, and vice versa", + }, + ); + +export const batchScrapeRequestSchemaNoURLValidation = scrapeOptions + .extend({ + urls: z.string().array(), + origin: z.string().optional().default("api"), + webhook: webhookSchema.optional(), + appendToId: z.string().uuid().optional(), + ignoreInvalidURLs: z.boolean().default(false), }) .strict(strictMessage) .refine( @@ -446,6 +471,15 @@ export type CrawlResponse = url: string; }; +export type BatchScrapeResponse = + | ErrorResponse + | { + success: true; + id: string; + url: string; + invalidURLs?: string[]; + }; + export type MapResponse = | ErrorResponse | { diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 6ccb9436..3fcd9f67 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -60,6 +60,8 @@ export async function addCrawlJob(id: string, job_id: string) { } export async function addCrawlJobs(id: string, job_ids: string[]) { + if (job_ids.length === 0) return true; + _logger.debug("Adding crawl jobs to Redis...", { jobIds: job_ids, module: "crawl-redis", @@ -261,6 +263,8 @@ export async function lockURLs( sc: StoredCrawl, urls: string[], ): Promise { + if (urls.length === 0) 
return true; + urls = urls.map((url) => normalizeURL(url, sc)); const logger = _logger.child({ crawlId: id, diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index bd2b9121..ee9e6177 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -108,6 +108,7 @@ export async function addScrapeJobs( }; }[], ) { + if (jobs.length === 0) return true; // TODO: better await Promise.all( jobs.map((job) => From 9cc6576571477e28504c9cdb906e8922d8773c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 14 Dec 2024 01:16:09 +0100 Subject: [PATCH 03/65] feat(js-sdk/batch/scrape): add ignoreInvalidURLs option --- apps/js-sdk/firecrawl/src/index.ts | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 37fc5ef0..020a2293 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -183,6 +183,7 @@ export interface BatchScrapeResponse { url?: string; success: true; error?: string; + invalidURLs?: string[]; } /** @@ -576,9 +577,10 @@ export default class FirecrawlApp { pollInterval: number = 2, idempotencyKey?: string, webhook?: CrawlParams["webhook"], + ignoreInvalidURLs?: boolean, ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { urls, ...params }; + let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params }; if (jsonData?.extract?.schema) { let schema = jsonData.extract.schema; @@ -621,10 +623,12 @@ export default class FirecrawlApp { async asyncBatchScrapeUrls( urls: string[], params?: ScrapeParams, - idempotencyKey?: string + idempotencyKey?: string, + webhook?: CrawlParams["webhook"], + ignoreInvalidURLs?: boolean, ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { urls, ...(params ?? {}) }; + let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...(params ?? 
{}) }; try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/batch/scrape`, @@ -657,8 +661,10 @@ export default class FirecrawlApp { urls: string[], params?: ScrapeParams, idempotencyKey?: string, + webhook?: CrawlParams["webhook"], + ignoreInvalidURLs?: boolean, ) { - const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey); + const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs); if (crawl.success && crawl.id) { const id = crawl.id; From ccbae4b15568dfa02aa6745f97c95e44a68ade5c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 14 Dec 2024 00:20:14 -0300 Subject: [PATCH 04/65] Update auth.ts --- apps/api/src/controllers/auth.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index f865984a..d344625d 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -351,6 +351,7 @@ function getPlanByPriceId(price_id: string | null): PlanType { case process.env.STRIPE_PRICE_ID_ETIER1A_MONTHLY: //ocqh return "etier1a"; case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_MONTHLY: + case process.env.STRIPE_PRICE_ID_ETIER_SCALE_1_YEARLY: return "etierscale1"; default: return "free"; From c325c3aa337a1f5a6a1924f168233e1565594eda Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 14 Dec 2024 14:55:40 -0300 Subject: [PATCH 05/65] Nick: node sdk patch --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 30277cc3..74dfcb02 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.3", + "version": "1.9.4", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 664ba69f08e441e0ee109f51ef7a7a3c57b83c23 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 14 Dec 2024 21:40:46 -0300 Subject: [PATCH 06/65] Nick: f-eng monitoring test --- .../controllers/v0/admin/check-fire-engine.ts | 62 +++++++++++++++++++ apps/api/src/routes/admin.ts | 6 ++ 2 files changed, 68 insertions(+) create mode 100644 apps/api/src/controllers/v0/admin/check-fire-engine.ts diff --git a/apps/api/src/controllers/v0/admin/check-fire-engine.ts b/apps/api/src/controllers/v0/admin/check-fire-engine.ts new file mode 100644 index 00000000..8e69d106 --- /dev/null +++ b/apps/api/src/controllers/v0/admin/check-fire-engine.ts @@ -0,0 +1,62 @@ +import { logger } from "../../../lib/logger"; +import * as Sentry from "@sentry/node"; +import { Request, Response } from "express"; + + +export async function checkFireEngine(req: Request, res: Response) { + try { + if (!process.env.FIRE_ENGINE_BETA_URL) { + logger.warn("Fire engine beta URL not configured"); + return res.status(500).json({ + success: false, + error: "Fire engine beta URL not configured", + }); + } + + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 30000); + + try { + const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/scrape`, { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-Disable-Cache": "true", + }, + body: JSON.stringify({ + url: "https://example.com", + }), + signal: controller.signal, + }); + + clearTimeout(timeout); + + if (response.ok) { + const responseData = await response.json(); + return res.status(200).json({ + data: 
responseData, + }); + } else { + return res.status(response.status).json({ + success: false, + error: `Fire engine returned status ${response.status}`, + }); + } + } catch (error) { + if (error.name === 'AbortError') { + return res.status(504).json({ + success: false, + error: "Request timed out after 30 seconds", + }); + } + throw error; + } + } catch (error) { + logger.error(error); + Sentry.captureException(error); + return res.status(500).json({ + success: false, + error: "Internal server error", + }); + } +} diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts index ec9967b8..1901c6f2 100644 --- a/apps/api/src/routes/admin.ts +++ b/apps/api/src/routes/admin.ts @@ -8,6 +8,7 @@ import { } from "../controllers/v0/admin/queue"; import { wrap } from "./v1"; import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear"; +import { checkFireEngine } from "../controllers/v0/admin/check-fire-engine"; export const adminRouter = express.Router(); @@ -37,3 +38,8 @@ adminRouter.post( `/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`, wrap(acucCacheClearController), ); + +adminRouter.get( + `/admin/${process.env.BULL_AUTH_KEY}/feng-check`, + wrap(checkFireEngine), +); From 4987880b32f5841b4a2cfb05753b0cc99f4d9f03 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 15 Dec 2024 02:52:06 -0300 Subject: [PATCH 07/65] Nick: random fixes --- .../src/scraper/WebScraper/utils/blocklist.ts | 26 +++++++++++++++- apps/api/src/services/queue-worker.ts | 31 +++++++++---------- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index 0a3ef705..ba382040 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -6,6 +6,15 @@ configDotenv(); const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8"); const algorithm = "aes-256-ecb"; +function encryptAES(plaintext: string, key: Buffer): string { + const cipher = crypto.createCipheriv(algorithm, key, null); + const encrypted = Buffer.concat([ + cipher.update(plaintext, "utf-8"), + cipher.final() + ]); + return encrypted.toString("base64"); +} + function decryptAES(ciphertext: string, key: Buffer): string { const decipher = crypto.createDecipheriv(algorithm, key, null); const decrypted = Buffer.concat([ @@ -42,6 +51,21 @@ const urlBlocklist = [ "PTbGg8PK/h0Seyw4HEpK4Q==", "lZdQMknjHb7+4+sjF3qNTw==", "LsgSq54q5oDysbva29JxnQ==", + "KZfBtpwjOpdSoqacRbz7og==", + "Indtl4yxJMHCKBGF4KABCQ==", + "e3HFXLVgxhaVoadYpwb2BA==", + "b+asgLayXQ5Jq+se+q56jA==", + "86ZDUI7vmp4MvNq3fvZrGQ==", + "sEGFoYZ6GEg4Zocd+TiyfQ==", + "6OOL72eXthgnJ1Hj4PfOQQ==", + "g/ME+Sh1CAFboKrwkVb+5Q==", + "Pw+xawUoX8xBYbX2yqqGWQ==", + "k6vBalxYFhAvkPsF19t9gQ==", + "e3HFXLVgxhaVoadYpwb2BA==", + "b+asgLayXQ5Jq+se+q56jA==", + "KKttwRz4w+AMJrZcB828WQ==", + "vMdzZ33BXoyWVZnAPOBcrg==", + "l8GDVI8w/ueHnNzdN1ODuQ==", ]; const decryptedBlocklist = hashKey.length > 0 ? 
urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey)) : []; @@ -104,4 +128,4 @@ export function isUrlBlocked(url: string): boolean { logger.error(`Error parsing the following URL: ${url}`); return false; } -} +} \ No newline at end of file diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 29f4b84f..9fd8861b 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -391,22 +391,21 @@ async function processJob(job: Job & { id: string }, token: string) { // Check if the job URL is researchhub and block it immediately // TODO: remove this once solve the root issue - if ( - job.data.url && - (job.data.url.includes("researchhub.com") || - job.data.url.includes("ebay.com") || - job.data.url.includes("youtube.com")) - ) { - logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`); - const data = { - success: false, - document: null, - project_id: job.data.project_id, - error: - "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.", - }; - return data; - } + // if ( + // job.data.url && + // (job.data.url.includes("researchhub.com") || + // job.data.url.includes("ebay.com")) + // ) { + // logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`); + // const data = { + // success: false, + // document: null, + // project_id: job.data.project_id, + // error: + // "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.", + // }; + // return data; + // } try { job.updateProgress({ From 588f747ee87e2fcdc0ddf05bc483bca3d9a7451a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 15 Dec 2024 02:54:49 -0300 Subject: [PATCH 08/65] chore: formatting --- .../controllers/v0/admin/check-fire-engine.ts | 26 ++++--- apps/api/src/main/runWebScraper.ts | 75 +++++++++++-------- .../src/scraper/WebScraper/utils/blocklist.ts | 9 ++- 3 files changed, 64 insertions(+), 46 deletions(-) diff --git a/apps/api/src/controllers/v0/admin/check-fire-engine.ts b/apps/api/src/controllers/v0/admin/check-fire-engine.ts index 8e69d106..0671f7a9 100644 --- a/apps/api/src/controllers/v0/admin/check-fire-engine.ts +++ b/apps/api/src/controllers/v0/admin/check-fire-engine.ts @@ -2,7 +2,6 @@ import { logger } from "../../../lib/logger"; import * as Sentry from "@sentry/node"; import { Request, Response } from "express"; - export async function checkFireEngine(req: Request, res: Response) { try { if (!process.env.FIRE_ENGINE_BETA_URL) { @@ -17,17 +16,20 @@ export async function checkFireEngine(req: Request, res: Response) { const timeout = setTimeout(() => controller.abort(), 30000); try { - const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/scrape`, { - method: "POST", - headers: { - "Content-Type": "application/json", - "X-Disable-Cache": "true", + const response = await fetch( + `${process.env.FIRE_ENGINE_BETA_URL}/scrape`, + { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-Disable-Cache": "true", + }, + body: JSON.stringify({ + url: "https://example.com", + }), + signal: controller.signal, }, - body: JSON.stringify({ - url: "https://example.com", - }), - signal: controller.signal, - }); + ); clearTimeout(timeout); @@ -43,7 +45,7 @@ export async function checkFireEngine(req: Request, res: Response) { }); } } catch (error) { - if (error.name === 'AbortError') { + if (error.name === "AbortError") { return res.status(504).json({ success: false, error: "Request timed out 
after 30 seconds", diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 411acfe6..63063576 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -74,7 +74,16 @@ export async function runWebScraper({ for (let i = 0; i < tries; i++) { if (i > 0) { - logger.debug("Retrying scrape...", { scrapeId: bull_job_id, jobId: bull_job_id, method: "runWebScraper", module: "runWebScraper", tries, i, previousStatusCode: (response as any)?.document?.metadata?.statusCode, previousError: error }); + logger.debug("Retrying scrape...", { + scrapeId: bull_job_id, + jobId: bull_job_id, + method: "runWebScraper", + module: "runWebScraper", + tries, + i, + previousStatusCode: (response as any)?.document?.metadata?.statusCode, + previousError: error, + }); } response = undefined; @@ -100,13 +109,17 @@ export async function runWebScraper({ ); } } - + // This is where the returnvalue from the job is set // onSuccess(response.document, mode); - + engines = response.engines; - if ((response.document.metadata.statusCode >= 200 && response.document.metadata.statusCode < 300) || response.document.metadata.statusCode === 304) { + if ( + (response.document.metadata.statusCode >= 200 && + response.document.metadata.statusCode < 300) || + response.document.metadata.statusCode === 304 + ) { // status code is good -- do not attempt retry break; } @@ -121,34 +134,34 @@ export async function runWebScraper({ } const engineOrder = Object.entries(engines) - .sort((a, b) => a[1].startedAt - b[1].startedAt) - .map((x) => x[0]) as Engine[]; + .sort((a, b) => a[1].startedAt - b[1].startedAt) + .map((x) => x[0]) as Engine[]; - for (const engine of engineOrder) { - const result = engines[engine] as Exclude< - EngineResultsTracker[Engine], - undefined - >; - ScrapeEvents.insert(bull_job_id, { - type: "scrape", - url, - method: engine, - result: { - success: result.state === "success", - response_code: - result.state === "success" ? result.result.statusCode : undefined, - response_size: - result.state === "success" ? result.result.html.length : undefined, - error: - result.state === "error" - ? result.error - : result.state === "timeout" - ? "Timed out" - : undefined, - time_taken: result.finishedAt - result.startedAt, - }, - }); - } + for (const engine of engineOrder) { + const result = engines[engine] as Exclude< + EngineResultsTracker[Engine], + undefined + >; + ScrapeEvents.insert(bull_job_id, { + type: "scrape", + url, + method: engine, + result: { + success: result.state === "success", + response_code: + result.state === "success" ? result.result.statusCode : undefined, + response_size: + result.state === "success" ? result.result.html.length : undefined, + error: + result.state === "error" + ? result.error + : result.state === "timeout" + ? 
"Timed out" + : undefined, + time_taken: result.finishedAt - result.startedAt, + }, + }); + } if (error === undefined && response?.success) { if (is_scrape === false) { diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index ba382040..16e9e45f 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -10,7 +10,7 @@ function encryptAES(plaintext: string, key: Buffer): string { const cipher = crypto.createCipheriv(algorithm, key, null); const encrypted = Buffer.concat([ cipher.update(plaintext, "utf-8"), - cipher.final() + cipher.final(), ]); return encrypted.toString("base64"); } @@ -68,7 +68,10 @@ const urlBlocklist = [ "l8GDVI8w/ueHnNzdN1ODuQ==", ]; -const decryptedBlocklist = hashKey.length > 0 ? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey)) : []; +const decryptedBlocklist = + hashKey.length > 0 + ? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey)) + : []; const allowedKeywords = [ "pulse", @@ -128,4 +131,4 @@ export function isUrlBlocked(url: string): boolean { logger.error(`Error parsing the following URL: ${url}`); return false; } -} \ No newline at end of file +} From 842b522b445d1abcbae2f1649b5968cdce4a2835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 13 Dec 2024 22:30:57 +0100 Subject: [PATCH 09/65] feat: add scrapeOptions.fastMode --- apps/api/src/controllers/v1/types.ts | 3 ++- apps/api/src/lib/cache.ts | 2 +- apps/api/src/scraper/scrapeURL/index.ts | 3 +-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index f7226338..2c054560 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -182,6 +182,7 @@ export const scrapeOptions = z .optional(), skipTlsVerification: z.boolean().default(false), removeBase64Images: z.boolean().default(true), + fastMode: z.boolean().default(false), }) .strict(strictMessage); @@ -685,11 +686,11 @@ export function fromLegacyScrapeOptions( } : undefined, mobile: pageOptions.mobile, + fastMode: pageOptions.useFastMode, }), internalOptions: { atsv: pageOptions.atsv, v0DisableJsDom: pageOptions.disableJsDom, - v0UseFastMode: pageOptions.useFastMode, }, // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks }; diff --git a/apps/api/src/lib/cache.ts b/apps/api/src/lib/cache.ts index 7dcbf88b..cbab4e05 100644 --- a/apps/api/src/lib/cache.ts +++ b/apps/api/src/lib/cache.ts @@ -21,7 +21,7 @@ export function cacheKey( if ( internalOptions.v0CrawlOnlyUrls || internalOptions.forceEngine || - internalOptions.v0UseFastMode || + scrapeOptions.fastMode || internalOptions.atsv || (scrapeOptions.actions && scrapeOptions.actions.length > 0) ) { diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index a3eb6f1e..d3b33418 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -86,7 +86,7 @@ function buildFeatureFlags( flags.add("skipTlsVerification"); } - if (internalOptions.v0UseFastMode) { + if (options.fastMode) { flags.add("useFastMode"); } @@ -148,7 +148,6 @@ export type InternalOptions = { atsv?: boolean; // anti-bot solver, beta v0CrawlOnlyUrls?: boolean; - v0UseFastMode?: boolean; v0DisableJsDom?: boolean; disableSmartWaitCache?: boolean; // Passed along to fire-engine From 5e267f92ffce404e779db3788656152db7e110a5 Mon Sep 
17 00:00:00 2001 From: NBR0KN Date: Sat, 14 Dec 2024 20:36:43 +0100 Subject: [PATCH 10/65] fix: adjust Playwright service response to match API schema expectations --- apps/playwright-service-ts/api.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/playwright-service-ts/api.ts b/apps/playwright-service-ts/api.ts index 90a4eb87..eacb35ff 100644 --- a/apps/playwright-service-ts/api.ts +++ b/apps/playwright-service-ts/api.ts @@ -196,7 +196,7 @@ app.post('/scrape', async (req: Request, res: Response) => { } } - const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : false; + const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : undefined; if (!pageError) { console.log(`✅ Scrape successful!`); @@ -209,7 +209,7 @@ app.post('/scrape', async (req: Request, res: Response) => { res.json({ content: pageContent, pageStatusCode, - pageError + ...(pageError && { pageError }) }); }); From afbd01299af9a56250b3cf56e97fc93b48476cb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 15 Dec 2024 15:58:27 +0100 Subject: [PATCH 11/65] fix(scrapeURL/fire-engine): timeouts --- apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 3fc32835..a5ebb9e9 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -136,7 +136,7 @@ export async function scrapeURLWithFireEngineChromeCDP( priority: meta.internalOptions.priority, geolocation: meta.options.geolocation, mobile: meta.options.mobile, - timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic + timeout: meta.options.timeout === undefined ? 300000 : meta.options.timeout, // TODO: better timeout logic disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, // TODO: scrollXPaths }; @@ -220,7 +220,7 @@ export async function scrapeURLWithFireEnginePlaywright( wait: meta.options.waitFor, geolocation: meta.options.geolocation, - timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic + timeout: meta.options.timeout === undefined ? 300000 : meta.options.timeout, // TODO: better timeout logic }; let response = await performFireEngineScrape( @@ -279,7 +279,7 @@ export async function scrapeURLWithFireEngineTLSClient( geolocation: meta.options.geolocation, disableJsDom: meta.internalOptions.v0DisableJsDom, - timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic + timeout: meta.options.timeout === undefined ? 
30000 : meta.options.timeout, // TODO: better timeout logic }; let response = await performFireEngineScrape( From b4a5e1a6e9d022d1b7e2163c5996f75ce6be4c5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 15 Dec 2024 16:04:17 +0100 Subject: [PATCH 12/65] fix(scrapeURL/fire-engine): timeout handling --- .../scrapeURL/engines/fire-engine/index.ts | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index a5ebb9e9..2b67c4d6 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -120,6 +120,8 @@ export async function scrapeURLWithFireEngineChromeCDP( // Include specified actions ...(meta.options.actions ?? []), ]; + + const timeout = (meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3)); const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestChromeCDP = { @@ -136,7 +138,7 @@ export async function scrapeURLWithFireEngineChromeCDP( priority: meta.internalOptions.priority, geolocation: meta.options.geolocation, mobile: meta.options.mobile, - timeout: meta.options.timeout === undefined ? 300000 : meta.options.timeout, // TODO: better timeout logic + timeout, // TODO: better timeout logic disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, // TODO: scrollXPaths }; @@ -152,7 +154,7 @@ export async function scrapeURLWithFireEngineChromeCDP( request, }), request, - meta.options.timeout !== undefined ? defaultTimeout + totalWait : Infinity, // TODO: better timeout handling + timeout + totalWait, ); specialtyScrapeCheck( @@ -207,6 +209,8 @@ export async function scrapeURLWithFireEngineChromeCDP( export async function scrapeURLWithFireEnginePlaywright( meta: Meta, ): Promise { + const timeout = meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3); + const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = { url: meta.url, @@ -220,7 +224,7 @@ export async function scrapeURLWithFireEnginePlaywright( wait: meta.options.waitFor, geolocation: meta.options.geolocation, - timeout: meta.options.timeout === undefined ? 300000 : meta.options.timeout, // TODO: better timeout logic + timeout, }; let response = await performFireEngineScrape( @@ -229,9 +233,7 @@ export async function scrapeURLWithFireEnginePlaywright( request, }), request, - meta.options.timeout !== undefined - ? defaultTimeout + meta.options.waitFor - : Infinity, // TODO: better timeout handling + timeout + meta.options.waitFor, ); specialtyScrapeCheck( @@ -266,6 +268,8 @@ export async function scrapeURLWithFireEnginePlaywright( export async function scrapeURLWithFireEngineTLSClient( meta: Meta, ): Promise { + const timeout = meta.options.timeout === undefined ? 30000 : Math.round(meta.options.timeout / 3); + const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestTLSClient = { url: meta.url, @@ -279,7 +283,7 @@ export async function scrapeURLWithFireEngineTLSClient( geolocation: meta.options.geolocation, disableJsDom: meta.internalOptions.v0DisableJsDom, - timeout: meta.options.timeout === undefined ? 30000 : meta.options.timeout, // TODO: better timeout logic + timeout, }; let response = await performFireEngineScrape( @@ -288,7 +292,7 @@ export async function scrapeURLWithFireEngineTLSClient( request, }), request, - meta.options.timeout !== undefined ? 
defaultTimeout : Infinity, // TODO: better timeout handling + timeout, ); specialtyScrapeCheck( From 98f27b0acc19ba0f7e956d862db4f1465114fad1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 15 Dec 2024 16:29:09 +0100 Subject: [PATCH 13/65] fix(crawl-redis/addCrawlJobDone): further ensure that completed doesn't go over total --- apps/api/src/lib/crawl-redis.ts | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 3fcd9f67..0c9e0ff0 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -92,12 +92,16 @@ export async function addCrawlJobDone( if (success) { await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id); - await redisConnection.expire( - "crawl:" + id + ":jobs_done_ordered", - 24 * 60 * 60, - "NX", - ); + } else { + // in case it's already been pushed, make sure it's removed + await redisConnection.lrem("crawl:" + id + ":jobs_done_ordered", -1, job_id); } + + await redisConnection.expire( + "crawl:" + id + ":jobs_done_ordered", + 24 * 60 * 60, + "NX", + ); } export async function getDoneJobsOrderedLength(id: string): Promise { From a5256827c0e92e48913bb514e5fa439083d25ce2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 15 Dec 2024 14:36:09 -0300 Subject: [PATCH 14/65] Update index.ts --- apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 2b67c4d6..b84b1e90 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -51,7 +51,11 @@ async function performFireEngineScrape< }); } - if (Date.now() - startTime > timeout) { + const userParam = request.timeout ?? 0; + // Use 70% of the user-provided timeout as the timeout for fire-engine check status + const fireEngineTimeout = timeout + Math.round(userParam * 0.7); + const fullTimeout = Math.max(fireEngineTimeout, timeout); + if (Date.now() - startTime > fullTimeout) { logger.info( "Fire-engine was unable to scrape the page before timing out.", { errors, timeout }, From 0f3a27bf2760c5df0e7c93143dd0ca72c335734c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 15 Dec 2024 18:58:29 +0100 Subject: [PATCH 15/65] fix(scrapeURL/engines): better timeouts --- .../scraper/scrapeURL/engines/fetch/index.ts | 3 ++- .../scrapeURL/engines/fire-engine/index.ts | 19 ++++++++----------- .../src/scraper/scrapeURL/engines/index.ts | 5 +++-- .../scraper/scrapeURL/engines/pdf/index.ts | 7 +++++-- .../scrapeURL/engines/playwright/index.ts | 5 +++-- .../scrapeURL/engines/scrapingbee/index.ts | 9 +++++---- apps/api/src/scraper/scrapeURL/index.ts | 6 +++++- 7 files changed, 31 insertions(+), 23 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts index af6f57c0..168d9b8f 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts @@ -5,8 +5,9 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler"; export async function scrapeURLWithFetch( meta: Meta, + timeToRun: number | undefined ): Promise { - const timeout = 20000; + const timeout = timeToRun ?? 
300000; const response = await Promise.race([ fetch(meta.url, { diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index b84b1e90..ef0b41fc 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -18,8 +18,6 @@ import * as Sentry from "@sentry/node"; import { Action } from "../../../../lib/entities"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; -export const defaultTimeout = 10000; - // This function does not take `Meta` on purpose. It may not access any // meta values to construct the request -- that must be done by the // `scrapeURLWithFireEngine*` functions. @@ -31,7 +29,7 @@ async function performFireEngineScrape< >( logger: Logger, request: FireEngineScrapeRequestCommon & Engine, - timeout = defaultTimeout, + timeout: number, ): Promise { const scrape = await fireEngineScrape( logger.child({ method: "fireEngineScrape" }), @@ -51,11 +49,7 @@ async function performFireEngineScrape< }); } - const userParam = request.timeout ?? 0; - // Use 70% of the user-provided timeout as the timeout for fire-engine check status - const fireEngineTimeout = timeout + Math.round(userParam * 0.7); - const fullTimeout = Math.max(fireEngineTimeout, timeout); - if (Date.now() - startTime > fullTimeout) { + if (Date.now() - startTime > timeout) { logger.info( "Fire-engine was unable to scrape the page before timing out.", { errors, timeout }, @@ -98,6 +92,7 @@ async function performFireEngineScrape< export async function scrapeURLWithFireEngineChromeCDP( meta: Meta, + timeToRun: number | undefined, ): Promise { const actions: Action[] = [ // Transform waitFor option into an action (unsupported by chrome-cdp) @@ -125,7 +120,7 @@ export async function scrapeURLWithFireEngineChromeCDP( ...(meta.options.actions ?? []), ]; - const timeout = (meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3)); + const timeout = timeToRun ?? 300000; const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestChromeCDP = { @@ -212,8 +207,9 @@ export async function scrapeURLWithFireEngineChromeCDP( export async function scrapeURLWithFireEnginePlaywright( meta: Meta, + timeToRun: number | undefined, ): Promise { - const timeout = meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3); + const timeout = timeToRun ?? 300000; const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = { @@ -271,8 +267,9 @@ export async function scrapeURLWithFireEnginePlaywright( export async function scrapeURLWithFireEngineTLSClient( meta: Meta, + timeToRun: number | undefined, ): Promise { - const timeout = meta.options.timeout === undefined ? 30000 : Math.round(meta.options.timeout / 3); + const timeout = timeToRun ?? 
30000; const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestTLSClient = { diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 01ac0be9..14f263f3 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -105,7 +105,7 @@ export type EngineScrapeResult = { }; const engineHandlers: { - [E in Engine]: (meta: Meta) => Promise; + [E in Engine]: (meta: Meta, timeToRun: number | undefined) => Promise; } = { cache: scrapeCache, "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP, @@ -372,6 +372,7 @@ export function buildFallbackList(meta: Meta): { export async function scrapeURLWithEngine( meta: Meta, engine: Engine, + timeToRun: number | undefined ): Promise { const fn = engineHandlers[engine]; const logger = meta.logger.child({ @@ -383,5 +384,5 @@ export async function scrapeURLWithEngine( logger, }; - return await fn(_meta); + return await fn(_meta, timeToRun); } diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 341a4f1a..24d5f002 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -15,6 +15,7 @@ type PDFProcessorResult = { html: string; markdown?: string }; async function scrapePDFWithLlamaParse( meta: Meta, tempFilePath: string, + timeToRun: number | undefined, ): Promise { meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath, @@ -63,8 +64,9 @@ async function scrapePDFWithLlamaParse( // TODO: timeout, retries const startedAt = Date.now(); + const timeout = timeToRun ?? 300000; - while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) { + while (Date.now() <= startedAt + timeout) { try { const result = await robustFetch({ url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`, @@ -122,7 +124,7 @@ async function scrapePDFWithParsePDF( }; } -export async function scrapePDF(meta: Meta): Promise { +export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Promise { if (!meta.options.parsePDF) { const file = await fetchFileToBuffer(meta.url); const content = file.buffer.toString("base64"); @@ -148,6 +150,7 @@ export async function scrapePDF(meta: Meta): Promise { }), }, tempFilePath, + timeToRun, ); } catch (error) { if (error instanceof Error && error.message === "LlamaParse timed out") { diff --git a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts index c92b1d90..edcd50c0 100644 --- a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts @@ -6,8 +6,9 @@ import { robustFetch } from "../../lib/fetch"; export async function scrapeURLWithPlaywright( meta: Meta, + timeToRun: number | undefined, ): Promise { - const timeout = 20000 + meta.options.waitFor; + const timeout = (timeToRun ?? 
300000) + meta.options.waitFor; const response = await Promise.race([ await robustFetch({ @@ -30,7 +31,7 @@ export async function scrapeURLWithPlaywright( }), }), (async () => { - await new Promise((resolve) => setTimeout(() => resolve(null), 20000)); + await new Promise((resolve) => setTimeout(() => resolve(null), timeout)); throw new TimeoutError( "Playwright was unable to scrape the page before timing out", { cause: { timeout } }, diff --git a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts index 50ac502b..db702a44 100644 --- a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts @@ -9,16 +9,17 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!); export function scrapeURLWithScrapingBee( wait_browser: "domcontentloaded" | "networkidle2", -): (meta: Meta) => Promise { - return async (meta: Meta): Promise => { +): (meta: Meta, timeToRun: number | undefined) => Promise { + return async (meta: Meta, timeToRun: number | undefined): Promise => { let response: AxiosResponse; + const timeout = (timeToRun ?? 300000) + meta.options.waitFor; try { response = await client.get({ url: meta.url, params: { - timeout: 15000, // TODO: dynamic timeout based on request timeout + timeout, wait_browser: wait_browser, - wait: Math.min(meta.options.waitFor, 35000), + wait: meta.options.waitFor, transparent_status_code: true, json_response: true, screenshot: meta.options.formats.includes("screenshot"), diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index d3b33418..c0b6d4e5 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -202,11 +202,15 @@ async function scrapeURLLoop(meta: Meta): Promise { const results: EngineResultsTracker = {}; let result: EngineScrapeResultWithContext | null = null; + const timeToRun = meta.options.timeout !== undefined + ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3)) + : undefined + for (const { engine, unsupportedFeatures } of fallbackList) { const startedAt = Date.now(); try { meta.logger.info("Scraping via " + engine + "..."); - const _engineResult = await scrapeURLWithEngine(meta, engine); + const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun); if (_engineResult.markdown === undefined) { // Some engines emit Markdown directly. 
_engineResult.markdown = await parseMarkdown(_engineResult.html); From 1214d219e12cdec4fff6257b0ef2a5c873818f17 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 15 Dec 2024 15:43:12 -0300 Subject: [PATCH 16/65] Nick: fix actions errors --- apps/api/src/main/runWebScraper.ts | 1 + .../scrapeURL/engines/fire-engine/checkStatus.ts | 8 +++++++- .../src/scraper/scrapeURL/engines/fire-engine/index.ts | 8 ++++++-- apps/api/src/scraper/scrapeURL/error.ts | 10 ++++++++++ apps/api/src/scraper/scrapeURL/index.ts | 5 +++++ 5 files changed, 29 insertions(+), 3 deletions(-) diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 63063576..83e899bb 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -96,6 +96,7 @@ export async function runWebScraper({ ...internalOptions, }); if (!response.success) { + error = response.error; if (response.error instanceof Error) { throw response.error; } else { diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index 328931ba..6f65db98 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -3,7 +3,7 @@ import * as Sentry from "@sentry/node"; import { z } from "zod"; import { robustFetch } from "../../lib/fetch"; -import { EngineError, SiteError } from "../../error"; +import { ActionError, EngineError, SiteError } from "../../error"; const successSchema = z.object({ jobId: z.string(), @@ -111,6 +111,12 @@ export async function fireEngineCheckStatus( status.error.includes("Chrome error: ") ) { throw new SiteError(status.error.split("Chrome error: ")[1]); + } else if ( + typeof status.error === "string" && + // TODO: improve this later + status.error.includes("Element") + ) { + throw new ActionError(status.error.split("Error: ")[1]); } else { throw new EngineError("Scrape job failed", { cause: { diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index ef0b41fc..a2deeed2 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -13,7 +13,7 @@ import { FireEngineCheckStatusSuccess, StillProcessingError, } from "./checkStatus"; -import { EngineError, SiteError, TimeoutError } from "../../error"; +import { ActionError, EngineError, SiteError, TimeoutError } from "../../error"; import * as Sentry from "@sentry/node"; import { Action } from "../../../../lib/entities"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; @@ -68,7 +68,11 @@ async function performFireEngineScrape< } catch (error) { if (error instanceof StillProcessingError) { // nop - } else if (error instanceof EngineError || error instanceof SiteError) { + } else if ( + error instanceof EngineError || + error instanceof SiteError || + error instanceof ActionError + ) { logger.debug("Fire-engine scrape job failed.", { error, jobId: scrape.jobId, diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts index ec044745..0a4f6e5b 100644 --- a/apps/api/src/scraper/scrapeURL/error.ts +++ b/apps/api/src/scraper/scrapeURL/error.ts @@ -56,3 +56,13 @@ export class SiteError extends Error { this.code = code; } } + +export class ActionError extends Error { + public code: string; + constructor(code: string) { + super( + "Action(s) failed to 
complete. Error code: " + code, + ); + this.code = code; + } +} diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index c0b6d4e5..800457a8 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -12,6 +12,7 @@ import { } from "./engines"; import { parseMarkdown } from "../../lib/html-to-markdown"; import { + ActionError, AddFeatureError, EngineError, NoEnginesLeftError, @@ -288,6 +289,8 @@ async function scrapeURLLoop(meta: Meta): Promise { throw error; } else if (error instanceof SiteError) { throw error; + } else if (error instanceof ActionError) { + throw error; } else { Sentry.captureException(error); meta.logger.info( @@ -408,6 +411,8 @@ export async function scrapeURL( // TODO: results? } else if (error instanceof SiteError) { meta.logger.warn("scrapeURL: Site failed to load in browser", { error }); + } else if (error instanceof ActionError) { + meta.logger.warn("scrapeURL: Action(s) failed to complete", { error }); } else { Sentry.captureException(error); meta.logger.error("scrapeURL: Unexpected error happened", { error }); From 126b46ee2c16cc7d734d75bc73c090444c29afd1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 15 Dec 2024 15:53:24 -0300 Subject: [PATCH 17/65] Update issue_credits.ts --- apps/api/src/services/billing/issue_credits.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/services/billing/issue_credits.ts b/apps/api/src/services/billing/issue_credits.ts index ce84db1b..2ca057dd 100644 --- a/apps/api/src/services/billing/issue_credits.ts +++ b/apps/api/src/services/billing/issue_credits.ts @@ -9,6 +9,7 @@ export async function issueCredits(team_id: string, credits: number) { status: "active", // indicates that this coupon was issued from auto recharge from_auto_recharge: true, + initial_credits: credits, }); if (error) { From 30fa78cd9e950899a2359122048e7231edd6bf90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 15 Dec 2024 20:16:29 +0100 Subject: [PATCH 18/65] feat(queue-worker): fix redirect slipping --- apps/api/src/services/queue-worker.ts | 40 ++++++++++++------------- apps/api/{ => utils}/logview.js | 0 apps/api/utils/urldump-redis.js | 14 +++++++++ apps/api/utils/urldump.js | 43 +++++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 21 deletions(-) rename apps/api/{ => utils}/logview.js (100%) create mode 100644 apps/api/utils/urldump-redis.js create mode 100644 apps/api/utils/urldump.js diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 9fd8861b..c2d2e2c6 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -481,33 +481,30 @@ async function processJob(job: Job & { id: string }, token: string) { normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc) ) { - logger.debug( - "Was redirected, removing old URL and locking new URL...", - { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url }, - ); - // Remove the old URL from visited unique due to checking for limit - // Do not remove from :visited otherwise it will keep crawling the original URL (sourceURL) - await redisConnection.srem( - "crawl:" + job.data.crawl_id + ":visited_unique", - normalizeURL(doc.metadata.sourceURL, sc), - ); - const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc)); const p2 = generateURLPermutations( normalizeURL(doc.metadata.sourceURL, sc), ); - // In crawls, we should only crawl a redirected page once, no 
matter how many; times it is redirected to, or if it's been discovered by the crawler before. - // This can prevent flakiness with race conditions. - // Lock the new URL - const lockRes = await lockURL(job.data.crawl_id, sc, doc.metadata.url); - if ( - job.data.crawlerOptions !== null && - !lockRes && - JSON.stringify(p1) !== JSON.stringify(p2) - ) { - throw new RacedRedirectError(); + if (JSON.stringify(p1) !== JSON.stringify(p2)) { + logger.debug( + "Was redirected, removing old URL and locking new URL...", + { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url }, + ); + + // Prevent redirect target from being visited in the crawl again + // See lockURL + const x = await redisConnection.sadd( + "crawl:" + job.data.crawl_id + ":visited", + ...p1.map(x => x.href), + ); + const lockRes = x === p1.length; + + if (job.data.crawlerOptions !== null && !lockRes) { + throw new RacedRedirectError(); + } } + } logger.debug("Logging job to DB..."); @@ -678,6 +675,7 @@ async function processJob(job: Job & { id: string }, token: string) { logger.debug("Declaring job as done..."); await addCrawlJobDone(job.data.crawl_id, job.id, false); + await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(job.data.url, sc)); logger.debug("Logging job to DB..."); await logJob( diff --git a/apps/api/logview.js b/apps/api/utils/logview.js similarity index 100% rename from apps/api/logview.js rename to apps/api/utils/logview.js diff --git a/apps/api/utils/urldump-redis.js b/apps/api/utils/urldump-redis.js new file mode 100644 index 00000000..fdd6090c --- /dev/null +++ b/apps/api/utils/urldump-redis.js @@ -0,0 +1,14 @@ +require("dotenv").config(); +const Redis = require("ioredis"); + +const crawlId = process.argv[2]; + +const redisConnection = new Redis(process.env.REDIS_URL, { + maxRetriesPerRequest: null, +}); + +(async () => { + const res = await redisConnection.sscan("crawl:" + crawlId + ":visited_unique", 0, "COUNT", 999); + await require("fs/promises").writeFile(crawlId + "-visited.txt", res[1].map(x => x.split("://").slice(1).join("://")).sort().join("\n")); + process.exit(0); +})(); \ No newline at end of file diff --git a/apps/api/utils/urldump.js b/apps/api/utils/urldump.js new file mode 100644 index 00000000..3583f7c6 --- /dev/null +++ b/apps/api/utils/urldump.js @@ -0,0 +1,43 @@ +require("dotenv").config(); + +//const baseUrl = "https://api.firecrawl.dev"; +const baseUrl = "http://localhost:3002"; +const crawlId = process.argv[2]; + +(async () => { + let url = baseUrl + "/v1/crawl/" + crawlId; + let urls = []; + + while (url) { + let res; + + while (true) { + try { + res = (await (await fetch(url, { + headers: { + "Authorization": "Bearer " + process.env.TEST_API_KEY + } + })).json()); + break; + } catch (e) { + console.error(e); + } + } + + console.log(res.data.length); + if (res.data.length === 0) { + break; + } + + urls.push(...res.data.map(x => x.metadata.url ?? 
x.metadata.sourceURL)); + + url = res.next; + if (url !== undefined) { + const o = new URL(url) + o.protocol = new URL(baseUrl).protocol; + url = o.href; + } + } + + await require("fs/promises").writeFile(crawlId + "-urls.txt", urls.map(x => x.split("://").slice(1).join("://")).sort().join("\n")); +})(); \ No newline at end of file From 37f58efe457dd985e3c01aabeed23d48cf3bc99d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 15 Dec 2024 21:01:31 +0100 Subject: [PATCH 19/65] fix(crawl-redis/lockURL): only add to visited_unique if lock succeeds --- apps/api/src/lib/crawl-redis.ts | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 0c9e0ff0..602d13b3 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -233,13 +233,6 @@ export async function lockURL( url = normalizeURL(url, sc); logger = logger.child({ url }); - await redisConnection.sadd("crawl:" + id + ":visited_unique", url); - await redisConnection.expire( - "crawl:" + id + ":visited_unique", - 24 * 60 * 60, - "NX", - ); - let res: boolean; if (!sc.crawlerOptions?.deduplicateSimilarURLs) { res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0; @@ -255,6 +248,15 @@ export async function lockURL( await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); + if (res) { + await redisConnection.sadd("crawl:" + id + ":visited_unique", url); + await redisConnection.expire( + "crawl:" + id + ":visited_unique", + 24 * 60 * 60, + "NX", + ); + } + logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, { res, }); From e97ee4a4be9b856707cae547eedd6d9f015d94e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 15 Dec 2024 22:33:36 +0100 Subject: [PATCH 20/65] fix(WebScraper/tryGetSitemap): deduplicate sitemap links list --- apps/api/src/scraper/WebScraper/crawler.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 19b0b5b4..2e47d352 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -210,7 +210,7 @@ export class WebCrawler { } if (sitemapLinks.length > 0) { let filteredLinks = this.filterLinks( - sitemapLinks, + [...new Set(sitemapLinks)], this.limit, this.maxCrawledDepth, fromMap, From 72d6a8179e35eaeff4b6db159c6320bdce2e22f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 15 Dec 2024 23:08:23 +0100 Subject: [PATCH 21/65] fix(rate-limiter): raise crawlStatus limits --- apps/api/src/services/rate-limiter.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 5b8e39ca..21025589 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -80,8 +80,8 @@ const RATE_LIMITS = { default: 100, }, crawlStatus: { - free: 300, - default: 500, + free: 500, + default: 5000, }, testSuite: { free: 10000, From 2de659d81050903014709f22a6cc7170625e47c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 15 Dec 2024 23:54:52 +0100 Subject: [PATCH 22/65] fix(queue-jobs): fix concurrency limit --- apps/api/src/controllers/v1/crawl.ts | 8 +- apps/api/src/services/queue-jobs.ts | 150 ++++++++++++++++++++++----- 2 files changed, 126 insertions(+), 32 deletions(-) diff 
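NOTE (annotation, not part of the patch): the diff that follows stops adding crawl jobs to BullMQ one by one and instead sizes the whole batch against the team's concurrency limit up front. A rough sketch of the split it introduces, using placeholder names (`jobs`, `limit`, `activeCount`); the real code derives the limit and active count from getConcurrencyLimitMax and getConcurrencyLimitActiveJobs.

    // Sketch only: jobs that fit under the concurrency cap go straight to BullMQ,
    // the rest are parked in the concurrency-limited queue until slots free up.
    function splitByConcurrency<T>(jobs: T[], limit: number, activeCount: number) {
      const slots = Math.max(limit - activeCount, 0);
      return {
        direct: jobs.slice(0, slots),   // added to the scrape queue immediately
        deferred: jobs.slice(slots),    // pushed via the concurrency-limited queue
      };
    }
    // e.g. splitByConcurrency(jobs, 10, 7) queues 3 jobs now and defers the rest.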
--git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 1fb470f9..c2e3369f 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -18,7 +18,7 @@ import { } from "../../lib/crawl-redis"; import { logCrawl } from "../../services/logging/crawl_log"; import { getScrapeQueue } from "../../services/queue-service"; -import { addScrapeJob } from "../../services/queue-jobs"; +import { addScrapeJob, addScrapeJobs } from "../../services/queue-jobs"; import { logger as _logger } from "../../lib/logger"; import { getJobPriority } from "../../lib/job-priority"; import { callWebhook } from "../../services/webhook"; @@ -139,9 +139,9 @@ export async function crawlController( name: uuid, data: { url, - mode: "single_urls", + mode: "single_urls" as const, team_id: req.auth.team_id, - plan: req.auth.plan, + plan: req.auth.plan!, crawlerOptions, scrapeOptions, internalOptions: sc.internalOptions, @@ -170,7 +170,7 @@ export async function crawlController( jobs.map((x) => x.opts.jobId), ); logger.debug("Adding scrape jobs to BullMQ..."); - await getScrapeQueue().addBulk(jobs); + await addScrapeJobs(jobs); } else { logger.debug("Sitemap not found or ignored.", { ignoreSitemap: sc.crawlerOptions.ignoreSitemap, diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index ee9e6177..6ce48a81 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -11,11 +11,50 @@ import { pushConcurrencyLimitedJob, } from "../lib/concurrency-limit"; +async function _addScrapeJobToConcurrencyQueue( + webScraperOptions: any, + options: any, + jobId: string, + jobPriority: number, +) { + await pushConcurrencyLimitedJob(webScraperOptions.team_id, { + id: jobId, + data: webScraperOptions, + opts: { + ...options, + priority: jobPriority, + jobId: jobId, + }, + priority: jobPriority, + }); +} + +async function _addScrapeJobToBullMQ( + webScraperOptions: any, + options: any, + jobId: string, + jobPriority: number, +) { + if ( + webScraperOptions && + webScraperOptions.team_id && + webScraperOptions.plan + ) { + await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId); + } + + await getScrapeQueue().add(jobId, webScraperOptions, { + ...options, + priority: jobPriority, + jobId, + }); +} + async function addScrapeJobRaw( webScraperOptions: any, options: any, jobId: string, - jobPriority: number = 10, + jobPriority: number, ) { let concurrencyLimited = false; @@ -33,30 +72,9 @@ async function addScrapeJobRaw( } if (concurrencyLimited) { - await pushConcurrencyLimitedJob(webScraperOptions.team_id, { - id: jobId, - data: webScraperOptions, - opts: { - ...options, - priority: jobPriority, - jobId: jobId, - }, - priority: jobPriority, - }); + await _addScrapeJobToConcurrencyQueue(webScraperOptions, options, jobId, jobPriority); } else { - if ( - webScraperOptions && - webScraperOptions.team_id && - webScraperOptions.plan - ) { - await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId); - } - - await getScrapeQueue().add(jobId, webScraperOptions, { - ...options, - priority: jobPriority, - jobId, - }); + await _addScrapeJobToBullMQ(webScraperOptions, options, jobId, jobPriority); } } @@ -109,11 +127,87 @@ export async function addScrapeJobs( }[], ) { if (jobs.length === 0) return true; - // TODO: better + + let countCanBeDirectlyAdded = Infinity; + + if ( + jobs[0].data && + jobs[0].data.team_id && + jobs[0].data.plan + ) { + const now = Date.now(); + const limit = await 
getConcurrencyLimitMax(jobs[0].data.plan); + console.log("CC limit", limit); + cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now); + + countCanBeDirectlyAdded = Math.max(limit - (await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length, 0); + } + + const addToBull = jobs.slice(0, countCanBeDirectlyAdded); + const addToCQ = jobs.slice(countCanBeDirectlyAdded); + await Promise.all( - jobs.map((job) => - addScrapeJob(job.data, job.opts, job.opts.jobId, job.opts.priority), - ), + addToBull.map(async (job) => { + const size = JSON.stringify(job.data).length; + return await Sentry.startSpan( + { + name: "Add scrape job", + op: "queue.publish", + attributes: { + "messaging.message.id": job.opts.jobId, + "messaging.destination.name": getScrapeQueue().name, + "messaging.message.body.size": size, + }, + }, + async (span) => { + await _addScrapeJobToBullMQ( + { + ...job.data, + sentry: { + trace: Sentry.spanToTraceHeader(span), + baggage: Sentry.spanToBaggageHeader(span), + size, + }, + }, + job.opts, + job.opts.jobId, + job.opts.priority, + ); + }, + ); + }), + ); + + await Promise.all( + addToCQ.map(async (job) => { + const size = JSON.stringify(job.data).length; + return await Sentry.startSpan( + { + name: "Add scrape job", + op: "queue.publish", + attributes: { + "messaging.message.id": job.opts.jobId, + "messaging.destination.name": getScrapeQueue().name, + "messaging.message.body.size": size, + }, + }, + async (span) => { + await _addScrapeJobToConcurrencyQueue( + { + ...job.data, + sentry: { + trace: Sentry.spanToTraceHeader(span), + baggage: Sentry.spanToBaggageHeader(span), + size, + }, + }, + job.opts, + job.opts.jobId, + job.opts.priority, + ); + }, + ); + }), ); } From 139e2c9a0591bbbbf09554fccb11ea5c23bd8163 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 16 Dec 2024 22:24:00 +0100 Subject: [PATCH 23/65] fix(runWebScraper): proper error handling --- apps/api/src/main/runWebScraper.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 83e899bb..5fb574d4 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -96,7 +96,6 @@ export async function runWebScraper({ ...internalOptions, }); if (!response.success) { - error = response.error; if (response.error instanceof Error) { throw response.error; } else { @@ -124,7 +123,8 @@ export async function runWebScraper({ // status code is good -- do not attempt retry break; } - } catch (error) { + } catch (_error) { + error = _error; engines = response !== undefined ? 
response.engines From 0013bdfcb4bd63d6901bc3778fea01371c0276db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 16 Dec 2024 22:42:51 +0100 Subject: [PATCH 24/65] feat(v1/scrape): add more context to timeout logs --- apps/api/src/controllers/v1/scrape.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index ddd5da74..f1fe3431 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -60,7 +60,7 @@ export async function scrapeController( try { doc = await waitForJob(jobId, timeout + totalWait); // TODO: better types for this } catch (e) { - logger.error(`Error in scrapeController: ${e}`); + logger.error(`Error in scrapeController: ${e}`, { jobId, scrapeId: jobId, startTime }); if ( e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout") From 284a6ccedd1baede825571ee933eb7e4f773e2de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 16 Dec 2024 23:01:34 +0100 Subject: [PATCH 25/65] fix(scrapeURL): better timeToRun distribution --- apps/api/src/scraper/scrapeURL/index.ts | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 800457a8..31f7e2f2 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -203,15 +203,20 @@ async function scrapeURLLoop(meta: Meta): Promise { const results: EngineResultsTracker = {}; let result: EngineScrapeResultWithContext | null = null; - const timeToRun = meta.options.timeout !== undefined - ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3)) - : undefined + let ttrInstanceCount = Math.min(fallbackList.length, 3); + let ttrRatios = new Array(ttrInstanceCount).fill(0).map((_, i) => ttrInstanceCount - i); + let ttrRatioSum = ttrRatios.reduce((a, x) => a + x, 0); - for (const { engine, unsupportedFeatures } of fallbackList) { + const timeToRun = meta.options.timeout !== undefined + ? ttrRatios.map(ratio => Math.round(meta.options.timeout! * ratio / ttrRatioSum)).map(ratio => isNaN(ratio) ? undefined : ratio) + : [undefined] + + for (const i in fallbackList) { + const { engine, unsupportedFeatures } = fallbackList[i]; const startedAt = Date.now(); try { meta.logger.info("Scraping via " + engine + "..."); - const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun); + const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun[i] ?? timeToRun.slice(-1)[0]); if (_engineResult.markdown === undefined) { // Some engines emit Markdown directly. _engineResult.markdown = await parseMarkdown(_engineResult.html); From 7f57c868be83f2c1c5e92f0eaba81dcdde0f1836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 16 Dec 2024 23:08:20 +0100 Subject: [PATCH 26/65] Revert "fix(scrapeURL): better timeToRun distribution" This reverts commit 284a6ccedd1baede825571ee933eb7e4f773e2de. 
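NOTE (annotation, not part of the original commit message): this revert drops the weighted time split and goes back to giving every engine attempt an equal share of the request timeout. A later patch in this series only lowers the divisor cap from 3 to 2, and the fire-engine timeout patch that follows adds explicit wait/action time on top of the budget instead of letting it eat into an engine's share. A minimal sketch of the restored calculation, with `timeout` and `fallbackCount` as stand-ins for meta.options.timeout and the fallback engine list length:

    // Sketch only: even split of the overall timeout across engine attempts,
    // dividing by at most 3 (a later patch lowers this cap to 2).
    function engineTimeBudget(timeout: number | undefined, fallbackCount: number): number | undefined {
      return timeout !== undefined
        ? Math.round(timeout / Math.min(fallbackCount, 3))
        : undefined;
    }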
--- apps/api/src/scraper/scrapeURL/index.ts | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 31f7e2f2..800457a8 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -203,20 +203,15 @@ async function scrapeURLLoop(meta: Meta): Promise { const results: EngineResultsTracker = {}; let result: EngineScrapeResultWithContext | null = null; - let ttrInstanceCount = Math.min(fallbackList.length, 3); - let ttrRatios = new Array(ttrInstanceCount).fill(0).map((_, i) => ttrInstanceCount - i); - let ttrRatioSum = ttrRatios.reduce((a, x) => a + x, 0); - const timeToRun = meta.options.timeout !== undefined - ? ttrRatios.map(ratio => Math.round(meta.options.timeout! * ratio / ttrRatioSum)).map(ratio => isNaN(ratio) ? undefined : ratio) - : [undefined] + ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3)) + : undefined - for (const i in fallbackList) { - const { engine, unsupportedFeatures } = fallbackList[i]; + for (const { engine, unsupportedFeatures } of fallbackList) { const startedAt = Date.now(); try { meta.logger.info("Scraping via " + engine + "..."); - const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun[i] ?? timeToRun.slice(-1)[0]); + const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun); if (_engineResult.markdown === undefined) { // Some engines emit Markdown directly. _engineResult.markdown = await parseMarkdown(_engineResult.html); From 47b968fedea4e5bfe38b80891b0c983b698f204a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 17 Dec 2024 13:17:55 +0100 Subject: [PATCH 27/65] fix(scrapeURL/fire-engine): timeout calculation issues --- .../scrapeURL/engines/fire-engine/index.ts | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index a2deeed2..2dc134c9 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -123,8 +123,13 @@ export async function scrapeURLWithFireEngineChromeCDP( // Include specified actions ...(meta.options.actions ?? []), ]; + + const totalWait = actions.reduce( + (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a), + 0, + ); - const timeout = timeToRun ?? 300000; + const timeout = (timeToRun ?? 300000) + totalWait; const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestChromeCDP = { @@ -146,18 +151,13 @@ export async function scrapeURLWithFireEngineChromeCDP( // TODO: scrollXPaths }; - const totalWait = actions.reduce( - (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a), - 0, - ); - let response = await performFireEngineScrape( meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request, }), request, - timeout + totalWait, + timeout, ); specialtyScrapeCheck( @@ -213,7 +213,8 @@ export async function scrapeURLWithFireEnginePlaywright( meta: Meta, timeToRun: number | undefined, ): Promise { - const timeout = timeToRun ?? 300000; + const totalWait = meta.options.waitFor; + const timeout = (timeToRun ?? 
300000) + totalWait; const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = { @@ -237,7 +238,7 @@ export async function scrapeURLWithFireEnginePlaywright( request, }), request, - timeout + meta.options.waitFor, + timeout, ); specialtyScrapeCheck( From 654d6c6e0b128e65f41f40b419b957428ad5b659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 17 Dec 2024 13:21:24 +0100 Subject: [PATCH 28/65] fix(scrapeURL): increase timeToRun --- apps/api/src/scraper/scrapeURL/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 800457a8..93bdb71b 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -204,7 +204,7 @@ async function scrapeURLLoop(meta: Meta): Promise { let result: EngineScrapeResultWithContext | null = null; const timeToRun = meta.options.timeout !== undefined - ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3)) + ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2)) : undefined for (const { engine, unsupportedFeatures } of fallbackList) { From ed7d15d2af23409e351df5a3574ef565f06a29d6 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 17 Dec 2024 09:50:29 -0500 Subject: [PATCH 29/65] Update index.ts --- .../scraper/scrapeURL/engines/pdf/index.ts | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 24d5f002..38dc6b5e 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -140,9 +140,23 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom const { response, tempFilePath } = await downloadFile(meta.id, meta.url); let result: PDFProcessorResult | null = null; - if (process.env.LLAMAPARSE_API_KEY) { + + // First, try parsing with PdfParse + result = await scrapePDFWithParsePDF( + { + ...meta, + logger: meta.logger.child({ + method: "scrapePDF/scrapePDFWithParsePDF", + }), + }, + tempFilePath, + ); + + + // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse + if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) { try { - result = await scrapePDFWithLlamaParse( + const llamaResult = await scrapePDFWithLlamaParse( { ...meta, logger: meta.logger.child({ @@ -152,16 +166,17 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom tempFilePath, timeToRun, ); + result = llamaResult; // Use LlamaParse result if successful } catch (error) { if (error instanceof Error && error.message === "LlamaParse timed out") { - meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { + meta.logger.warn("LlamaParse timed out -- using parse-pdf result", { error, }); } else if (error instanceof RemoveFeatureError) { throw error; } else { meta.logger.warn( - "LlamaParse failed to parse PDF -- falling back to parse-pdf", + "LlamaParse failed to parse PDF -- using parse-pdf result", { error }, ); Sentry.captureException(error); @@ -169,18 +184,6 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom } } - if (result === null) { - result = await scrapePDFWithParsePDF( - { - ...meta, - logger: meta.logger.child({ - method: "scrapePDF/scrapePDFWithParsePDF", - }), - }, - tempFilePath, - ); - } - 
await fs.unlink(tempFilePath); return { From 1402831a0aef09b5a74fed5711efd4627af34e2f Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 17 Dec 2024 09:59:52 -0500 Subject: [PATCH 30/65] Replace pdf parse with pdf to md --- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 288 ++++++++++++++++++ .../scraper/scrapeURL/engines/pdf/index.ts | 15 +- 3 files changed, 297 insertions(+), 7 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index c4e70901..f4ac48ff 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -58,6 +58,7 @@ "@devil7softwares/pos": "^1.0.2", "@dqbd/tiktoken": "^1.0.17", "@nangohq/node": "^0.40.8", + "@opendocsg/pdf2md": "^0.2.1", "@sentry/cli": "^2.33.1", "@sentry/node": "^8.26.0", "@sentry/profiling-node": "^8.26.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 6d971708..68d43655 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -29,6 +29,9 @@ importers: '@nangohq/node': specifier: ^0.40.8 version: 0.40.8 + '@opendocsg/pdf2md': + specifier: ^0.2.1 + version: 0.2.1 '@sentry/cli': specifier: ^2.33.1 version: 2.33.1 @@ -794,6 +797,10 @@ packages: resolution: {integrity: sha512-cXWgKE3sdWLSqAa8ykbCcUsUF1Kyr5J3HOWYGuobhPEycXW4WI++d5DhzdpL238mzoEXTi90VqfSCra37l5YqA==} engines: {node: '>=18'} + '@mapbox/node-pre-gyp@1.0.11': + resolution: {integrity: sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==} + hasBin: true + '@mixmark-io/domino@2.2.0': resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} @@ -837,6 +844,10 @@ packages: '@one-ini/wasm@0.1.1': resolution: {integrity: sha512-XuySG1E38YScSJoMlqovLru4KTUNSjgVTIjyh7qMX6aNN5HY5Ct5LhRJdxO79JtTzKfzV/bnWpz+zquYrISsvw==} + '@opendocsg/pdf2md@0.2.1': + resolution: {integrity: sha512-k/yvfrTb+GPTIIm/bMm5IsenTqAFl+IqvkBgFwFlmflS5TT7FOfyRLp8MypVWLAG4G9AnT7AZFbdQYgN/CR5BA==} + hasBin: true + '@opentelemetry/api-logs@0.52.1': resolution: {integrity: sha512-qnSqB2DQ9TPP96dl8cDubDvrUyWc0/sK81xHTK8eSUspzDM3bsewX903qclQFvVhgStjRWdC5bLb3kQqMkfV5A==} engines: {node: '>=14'} @@ -1615,6 +1626,9 @@ packages: resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==} engines: {node: '>=10.0.0'} + abbrev@1.1.1: + resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==} + abbrev@2.0.0: resolution: {integrity: sha512-6/mh1E2u2YgEsCHdY0Yx5oW+61gZU+1vXaoiHHrpKeuRNNgFvS+/jrwHiQhB5apAf5oB7UB7E19ol2R2LKH8hQ==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -1708,6 +1722,14 @@ packages: resolution: {integrity: sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==} engines: {node: '>=0.2.6'} + aproba@2.0.0: + resolution: {integrity: sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==} + + are-we-there-yet@2.0.0: + resolution: {integrity: sha512-Ci/qENmwHnsYo9xKIcUJN5LeDKdJ6R1Z1j9V/J5wyq8nh/mYPEpIKJbBZXtZjG04HiK7zV/p6Vs9952MrMeUIw==} + engines: {node: '>=10'} + deprecated: This package is no longer supported. 
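NOTE (annotation, not part of the lockfile diff it sits in): the PDF engine changes in this stretch of the series flip the parser order: the cheap local parse (pdf-parse, swapped to @opendocsg/pdf2md and later swapped back) now always runs first, and LlamaParse is only attempted when the local markdown looks too thin (under 500 characters) and LLAMAPARSE_API_KEY is set, keeping the local result if LlamaParse times out or fails. A condensed sketch of that decision; parseLocally and parseWithLlamaParse are stand-in parameters, not functions from the codebase.

    // Sketch only: prefer the local parse, escalate to LlamaParse for thin results.
    async function parsePdf(
      tempFilePath: string,
      parseLocally: (path: string) => Promise<{ markdown?: string }>,
      parseWithLlamaParse: (path: string) => Promise<{ markdown?: string }>,
    ): Promise<{ markdown?: string }> {
      let result = await parseLocally(tempFilePath);
      if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
        try {
          result = await parseWithLlamaParse(tempFilePath);
        } catch (error) {
          // keep the local result; the real patch still rethrows RemoveFeatureError
          // and reports other failures to Sentry before falling back.
        }
      }
      return result;
    }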
+ arg@4.1.3: resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==} @@ -1908,6 +1930,10 @@ packages: caniuse-lite@1.0.30001627: resolution: {integrity: sha512-4zgNiB8nTyV/tHhwZrFs88ryjls/lHiqFhrxCW4qSTeuRByBVnPYpDInchOIySWknznucaf31Z4KYqjfbrecVw==} + canvas@2.11.2: + resolution: {integrity: sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==} + engines: {node: '>=6'} + chalk@2.4.2: resolution: {integrity: sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==} engines: {node: '>=4'} @@ -1938,6 +1964,10 @@ packages: resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==} engines: {node: '>= 8.10.0'} + chownr@2.0.0: + resolution: {integrity: sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==} + engines: {node: '>=10'} + chownr@3.0.0: resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==} engines: {node: '>=18'} @@ -1995,6 +2025,10 @@ packages: color-string@1.9.1: resolution: {integrity: sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==} + color-support@1.1.3: + resolution: {integrity: sha512-qiBjkpbMLO/HL68y+lh4q0/O1MZFj2RX6X/KmMa3+gJD3z+WwI1ZzDHysvqHGS3mP6mznPckpXmw1nI9cJjyRg==} + hasBin: true + color@3.2.1: resolution: {integrity: sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==} @@ -2021,6 +2055,9 @@ packages: config-chain@1.1.13: resolution: {integrity: sha512-qj+f8APARXHrM0hraqXYb2/bOVSV4PvJQlNZ/DVj0QrmNM2q2euizkeuVckQ57J+W0mRH6Hvi+k50M4Jul2VRQ==} + console-control-strings@1.1.0: + resolution: {integrity: sha512-ty/fTekppD2fIwRvnZAVdeOiGd1c7YXEixbgJTNzqcxJWKQnjJ/V1bNEEE6hygpM3WjwHFUVK6HTjWSzV4a8sQ==} + content-disposition@0.5.4: resolution: {integrity: sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==} engines: {node: '>= 0.6'} @@ -2147,6 +2184,10 @@ packages: resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==} engines: {node: '>=10'} + decompress-response@4.2.1: + resolution: {integrity: sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==} + engines: {node: '>=8'} + dedent@1.5.3: resolution: {integrity: sha512-NHQtfOOW68WD8lgypbLA5oT+Bt0xXJhiYvoR6SmmNXZfpzOGXwdKWmcwG8N7PwVVWV3eF/68nmD9BaJSsTBhyQ==} peerDependencies: @@ -2171,6 +2212,9 @@ packages: resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==} engines: {node: '>=0.4.0'} + delegates@1.0.0: + resolution: {integrity: sha512-bd2L678uiWATM6m5Z1VzNCErI3jiGzt6HGY8OVICs40JQq/HALfbyNJmp0UDakEY4pMMaN0Ly5om/B1VI/+xfQ==} + denque@2.1.0: resolution: {integrity: sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw==} engines: {node: '>=0.10'} @@ -2283,6 +2327,9 @@ packages: resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} engines: {node: '>=0.12'} + enumify@1.0.4: + resolution: {integrity: sha512-5mwWXaVzJaqyUdOW/PDH5QySRgmQ8VvujmxmvXoXj9w0n+6omhVuyD56eI37FMqy/LxueJzsQ4DrHVQzuT/TXg==} + env-paths@2.2.1: resolution: {integrity: 
sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==} engines: {node: '>=6'} @@ -2484,6 +2531,10 @@ packages: resolution: {integrity: sha512-PmDi3uwK5nFuXh7XDTlVnS17xJS7vW36is2+w3xcv8SVxiB4NyATf4ctkVY5bkSjX0Y4nbvZCq1/EjtEyr9ktw==} engines: {node: '>=14.14'} + fs-minipass@2.1.0: + resolution: {integrity: sha512-V/JgOLFCS+R6Vcq0slCuaeWEdNC3ouDlJMNIsacH2VtALiu9mV4LPrHc5cDl8k5aw6J8jwgWWpiTo5RYhmIzvg==} + engines: {node: '>= 8'} + fs.realpath@1.0.0: resolution: {integrity: sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==} @@ -2495,6 +2546,11 @@ packages: function-bind@1.1.2: resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==} + gauge@3.0.2: + resolution: {integrity: sha512-+5J6MS/5XksCuXq++uFRsnUd7Ovu1XenbeuIuNRJxYWjgQbPuFhT14lAvsWfqfAmnwluf1OwMjz39HjfLPci0Q==} + engines: {node: '>=10'} + deprecated: This package is no longer supported. + generic-pool@3.9.0: resolution: {integrity: sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==} engines: {node: '>= 4'} @@ -2578,6 +2634,9 @@ packages: resolution: {integrity: sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==} engines: {node: '>= 0.4'} + has-unicode@2.0.1: + resolution: {integrity: sha512-8Rf9Y83NBReMnx0gFzA8JImQACstCYWUplepDa9xprwwtmgEZUF0h/i5xSA625zB/I37EtrswSST6OXxwaaIJQ==} + hasown@2.0.2: resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==} engines: {node: '>= 0.4'} @@ -3256,6 +3315,10 @@ packages: resolution: {integrity: sha512-zobTr7akeGHnv7eBOXcRgMeCP6+uyYsczwmeRCauvpvaAltgNyTbLH/+VaEAPUeWBT+1GuNmz4wC/6jtQzbbVA==} engines: {node: '>=12'} + make-dir@3.1.0: + resolution: {integrity: sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==} + engines: {node: '>=8'} + make-dir@4.0.0: resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==} engines: {node: '>=10'} @@ -3326,6 +3389,10 @@ packages: resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==} engines: {node: '>=6'} + mimic-response@2.1.0: + resolution: {integrity: sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==} + engines: {node: '>=8'} + minimatch@3.1.2: resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==} @@ -3344,10 +3411,22 @@ packages: minimist@1.2.8: resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} + minipass@3.3.6: + resolution: {integrity: sha512-DxiNidxSEK+tHG6zOIklvNOwm3hvCrbUrdtzY74U6HKTJxvIDfOUL5W5P2Ghd3DTkhhKPYGqeNUIh5qcM4YBfw==} + engines: {node: '>=8'} + + minipass@5.0.0: + resolution: {integrity: sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ==} + engines: {node: '>=8'} + minipass@7.1.2: resolution: {integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==} engines: {node: '>=16 || 14 >=14.17'} + minizlib@2.1.2: + resolution: {integrity: sha512-bAxsR8BVfj60DWXHE3u30oHzfl4G7khkSuPW+qvpd7jFRHm7dLxOjUk1EHACJ/hxLY8phGJ0YhYHZo7jil7Qdg==} + engines: {node: '>= 8'} + minizlib@3.0.1: resolution: {integrity: 
sha512-umcy022ILvb5/3Djuu8LWeqUa8D68JaBzlttKeMWen48SjabqS3iY5w/vzeMzMUNhLDifyhbOwKDSznB1vvrwg==} engines: {node: '>= 18'} @@ -3359,6 +3438,11 @@ packages: resolution: {integrity: sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==} hasBin: true + mkdirp@1.0.4: + resolution: {integrity: sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw==} + engines: {node: '>=10'} + hasBin: true + mkdirp@3.0.1: resolution: {integrity: sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==} engines: {node: '>=10'} @@ -3447,6 +3531,9 @@ packages: resolution: {integrity: sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==} hasBin: true + nan@2.22.0: + resolution: {integrity: sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw==} + natural-compare@1.4.0: resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} @@ -3507,6 +3594,11 @@ packages: engines: {node: '>=8.10.0'} hasBin: true + nopt@5.0.0: + resolution: {integrity: sha512-Tbj67rffqceeLpcRXrT7vKAN8CwfPeIBgM7E6iBkmKLV7bEMwpGgYLGv0jACUsECaa/vuxP0IjEont6umdMgtQ==} + engines: {node: '>=6'} + hasBin: true + nopt@7.2.1: resolution: {integrity: sha512-taM24ViiimT/XntxbPyJQzCG+p4EKOpgD3mxFwW38mGjVUrfERQOeY4EDHjdnptttfHuHQXFx+lTP08Q+mLa/w==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -3524,6 +3616,10 @@ packages: resolution: {integrity: sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==} engines: {node: '>=8'} + npmlog@5.0.1: + resolution: {integrity: sha512-AqZtDUWOMKs1G/8lwylVjrdYgqA4d9nu8hc+0gzRxlDb1I10+FHBGMXs6aiQHFdCUUlqH99MUMuLfzWDNDtfxw==} + deprecated: This package is no longer supported. 
+ nth-check@2.1.1: resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} @@ -3949,6 +4045,11 @@ packages: resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==} engines: {node: '>= 4'} + rimraf@3.0.2: + resolution: {integrity: sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==} + deprecated: Rimraf versions prior to v4 are no longer supported + hasBin: true + rimraf@5.0.7: resolution: {integrity: sha512-nV6YcJo5wbLW77m+8KjH8aB/7/rxQy9SZ0HY5shnwULfS+9nmTtVXAJET5NdZmCzA4fPI/Hm1wo/Po/4mopOdg==} engines: {node: '>=14.18'} @@ -4019,6 +4120,9 @@ packages: resolution: {integrity: sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==} engines: {node: '>= 0.8.0'} + set-blocking@2.0.0: + resolution: {integrity: sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==} + set-function-length@1.2.2: resolution: {integrity: sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==} engines: {node: '>= 0.4'} @@ -4057,6 +4161,12 @@ packages: resolution: {integrity: sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==} engines: {node: '>=14'} + simple-concat@1.0.1: + resolution: {integrity: sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==} + + simple-get@3.1.1: + resolution: {integrity: sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==} + simple-swizzle@0.2.2: resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==} @@ -4215,6 +4325,10 @@ packages: tar-stream@3.1.7: resolution: {integrity: sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==} + tar@6.2.1: + resolution: {integrity: sha512-DZ4yORTwrbTj/7MZYq2w+/ZFdI6OZ/f9SFHR+71gIVUZhOQPHzVCLpvRnPgyaMpfWxxk/4ONva3GQSyNIKRv6A==} + engines: {node: '>=10'} + tar@7.2.0: resolution: {integrity: sha512-hctwP0Nb4AB60bj8WQgRYaMOuJYRAPMGiQUAotms5igN8ppfQM+IvjQ5HcKu1MaZh2Wy2KWVTe563Yj8dfc14w==} engines: {node: '>=18'} @@ -4369,6 +4483,9 @@ packages: resolution: {integrity: sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==} engines: {node: '>= 10.0.0'} + unpdf@0.12.1: + resolution: {integrity: sha512-ktP8+TTLDBrlu/j8rQVNbHoMMpFXzkVAkb1rt/JdshFC3jOHdZjuGCNl/voPL0kraUrUOH7ZC88kVxMvlvDBzA==} + unpipe@1.0.0: resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==} engines: {node: '>= 0.8'} @@ -4456,6 +4573,9 @@ packages: engines: {node: '>= 8'} hasBin: true + wide-align@1.1.5: + resolution: {integrity: sha512-eDMORYaPNZ4sQIuuYPDHdQvf4gyCF9rEEV/yPxGfwPkRodwEgiMUUXTx/dex+Me0wxx53S+NgUHaP7y3MGlDmg==} + winston-transport@4.8.0: resolution: {integrity: sha512-qxSTKswC6llEMZKgCQdaWgDuMJQnhuvF5f2Nk3SNXc4byfQ+voo2mX1Px9dkNOuR8p0KAjfPG29PuYUSIb+vSA==} engines: {node: '>= 12.0.0'} @@ -5607,6 +5727,22 @@ snapshots: - langchain - openai + '@mapbox/node-pre-gyp@1.0.11': + dependencies: + detect-libc: 2.0.3 + https-proxy-agent: 5.0.1 + make-dir: 3.1.0 + node-fetch: 2.7.0 + nopt: 5.0.0 + npmlog: 5.0.1 + rimraf: 3.0.2 + semver: 7.6.2 + tar: 6.2.1 + transitivePeerDependencies: + - encoding + - supports-color + optional: true + 
'@mixmark-io/domino@2.2.0': {} '@mongodb-js/saslprep@1.1.7': @@ -5639,6 +5775,15 @@ snapshots: '@one-ini/wasm@0.1.1': {} + '@opendocsg/pdf2md@0.2.1': + dependencies: + enumify: 1.0.4 + minimist: 1.2.8 + unpdf: 0.12.1 + transitivePeerDependencies: + - encoding + - supports-color + '@opentelemetry/api-logs@0.52.1': dependencies: '@opentelemetry/api': 1.9.0 @@ -6673,6 +6818,9 @@ snapshots: '@xmldom/xmldom@0.8.10': {} + abbrev@1.1.1: + optional: true + abbrev@2.0.0: {} abort-controller@3.0.0: @@ -6755,6 +6903,15 @@ snapshots: dependencies: sylvester: 0.0.12 + aproba@2.0.0: + optional: true + + are-we-there-yet@2.0.0: + dependencies: + delegates: 1.0.0 + readable-stream: 3.6.2 + optional: true + arg@4.1.3: {} argparse@1.0.10: @@ -7008,6 +7165,16 @@ snapshots: caniuse-lite@1.0.30001627: {} + canvas@2.11.2: + dependencies: + '@mapbox/node-pre-gyp': 1.0.11 + nan: 2.22.0 + simple-get: 3.1.1 + transitivePeerDependencies: + - encoding + - supports-color + optional: true + chalk@2.4.2: dependencies: ansi-styles: 3.2.1 @@ -7056,6 +7223,9 @@ snapshots: optionalDependencies: fsevents: 2.3.3 + chownr@2.0.0: + optional: true + chownr@3.0.0: {} chromium-bidi@0.5.24(devtools-protocol@0.0.1299070): @@ -7121,6 +7291,9 @@ snapshots: color-name: 1.1.4 simple-swizzle: 0.2.2 + color-support@1.1.3: + optional: true + color@3.2.1: dependencies: color-convert: 1.9.3 @@ -7148,6 +7321,9 @@ snapshots: ini: 1.3.8 proto-list: 1.2.4 + console-control-strings@1.1.0: + optional: true + content-disposition@0.5.4: dependencies: safe-buffer: 5.2.1 @@ -7255,6 +7431,11 @@ snapshots: decamelize@4.0.0: {} + decompress-response@4.2.1: + dependencies: + mimic-response: 2.1.0 + optional: true + dedent@1.5.3: {} deepmerge@4.3.1: {} @@ -7273,6 +7454,9 @@ snapshots: delayed-stream@1.0.0: {} + delegates@1.0.0: + optional: true + denque@2.1.0: {} depd@2.0.0: {} @@ -7364,6 +7548,8 @@ snapshots: entities@4.5.0: {} + enumify@1.0.4: {} + env-paths@2.2.1: {} error-ex@1.3.2: @@ -7589,6 +7775,11 @@ snapshots: jsonfile: 6.1.0 universalify: 2.0.1 + fs-minipass@2.1.0: + dependencies: + minipass: 3.3.6 + optional: true + fs.realpath@1.0.0: {} fsevents@2.3.3: @@ -7596,6 +7787,19 @@ snapshots: function-bind@1.1.2: {} + gauge@3.0.2: + dependencies: + aproba: 2.0.0 + color-support: 1.1.3 + console-control-strings: 1.1.0 + has-unicode: 2.0.1 + object-assign: 4.1.1 + signal-exit: 3.0.7 + string-width: 4.2.3 + strip-ansi: 6.0.1 + wide-align: 1.1.5 + optional: true + generic-pool@3.9.0: {} gensync@1.0.0-beta.2: {} @@ -7683,6 +7887,9 @@ snapshots: has-symbols@1.0.3: {} + has-unicode@2.0.1: + optional: true + hasown@2.0.2: dependencies: function-bind: 1.1.2 @@ -8463,6 +8670,11 @@ snapshots: luxon@3.4.4: {} + make-dir@3.1.0: + dependencies: + semver: 6.3.1 + optional: true + make-dir@4.0.0: dependencies: semver: 7.6.2 @@ -8523,6 +8735,9 @@ snapshots: mimic-fn@2.1.0: {} + mimic-response@2.1.0: + optional: true + minimatch@3.1.2: dependencies: brace-expansion: 1.1.11 @@ -8541,8 +8756,22 @@ snapshots: minimist@1.2.8: {} + minipass@3.3.6: + dependencies: + yallist: 4.0.0 + optional: true + + minipass@5.0.0: + optional: true + minipass@7.1.2: {} + minizlib@2.1.2: + dependencies: + minipass: 3.3.6 + yallist: 4.0.0 + optional: true + minizlib@3.0.1: dependencies: minipass: 7.1.2 @@ -8554,6 +8783,9 @@ snapshots: dependencies: minimist: 1.2.8 + mkdirp@1.0.4: + optional: true + mkdirp@3.0.1: {} ml-array-mean@1.1.6: @@ -8646,6 +8878,9 @@ snapshots: mustache@4.2.0: {} + nan@2.22.0: + optional: true + natural-compare@1.4.0: {} 
natural@7.0.7(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3): @@ -8726,6 +8961,11 @@ snapshots: touch: 3.1.1 undefsafe: 2.0.5 + nopt@5.0.0: + dependencies: + abbrev: 1.1.1 + optional: true + nopt@7.2.1: dependencies: abbrev: 2.0.0 @@ -8738,6 +8978,14 @@ snapshots: dependencies: path-key: 3.1.1 + npmlog@5.0.1: + dependencies: + are-we-there-yet: 2.0.0 + console-control-strings: 1.1.0 + gauge: 3.0.2 + set-blocking: 2.0.0 + optional: true + nth-check@2.1.1: dependencies: boolbase: 1.0.0 @@ -9210,6 +9458,11 @@ snapshots: retry@0.13.1: {} + rimraf@3.0.2: + dependencies: + glob: 7.2.3 + optional: true + rimraf@5.0.7: dependencies: glob: 10.4.2 @@ -9285,6 +9538,9 @@ snapshots: transitivePeerDependencies: - supports-color + set-blocking@2.0.0: + optional: true + set-function-length@1.2.2: dependencies: define-data-property: 1.1.4 @@ -9321,6 +9577,16 @@ snapshots: signal-exit@4.1.0: {} + simple-concat@1.0.1: + optional: true + + simple-get@3.1.1: + dependencies: + decompress-response: 4.2.1 + once: 1.4.0 + simple-concat: 1.0.1 + optional: true + simple-swizzle@0.2.2: dependencies: is-arrayish: 0.3.2 @@ -9494,6 +9760,16 @@ snapshots: fast-fifo: 1.3.2 streamx: 2.18.0 + tar@6.2.1: + dependencies: + chownr: 2.0.0 + fs-minipass: 2.1.0 + minipass: 5.0.0 + minizlib: 2.1.2 + mkdirp: 1.0.4 + yallist: 4.0.0 + optional: true + tar@7.2.0: dependencies: '@isaacs/fs-minipass': 4.0.1 @@ -9626,6 +9902,13 @@ snapshots: universalify@2.0.1: {} + unpdf@0.12.1: + optionalDependencies: + canvas: 2.11.2 + transitivePeerDependencies: + - encoding + - supports-color + unpipe@1.0.0: {} unstructured-client@0.11.3(zod@3.23.8): @@ -9698,6 +9981,11 @@ snapshots: dependencies: isexe: 2.0.0 + wide-align@1.1.5: + dependencies: + string-width: 4.2.3 + optional: true + winston-transport@4.8.0: dependencies: logform: 2.6.1 diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 38dc6b5e..c3baa694 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -6,7 +6,7 @@ import { robustFetch } from "../../lib/fetch"; import { z } from "zod"; import * as Sentry from "@sentry/node"; import escapeHtml from "escape-html"; -import PdfParse from "pdf-parse"; +import pdf2md from "@opendocsg/pdf2md"; import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; import { RemoveFeatureError } from "../../error"; @@ -113,10 +113,11 @@ async function scrapePDFWithParsePDF( meta: Meta, tempFilePath: string, ): Promise { - meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath }); + meta.logger.debug("Processing PDF document with pdf2md", { tempFilePath }); - const result = await PdfParse(await fs.readFile(tempFilePath)); - const escaped = escapeHtml(result.text); + const pdfBuffer = await fs.readFile(tempFilePath); + const markdown = await pdf2md(pdfBuffer); + const escaped = escapeHtml(markdown); return { markdown: escaped, @@ -141,7 +142,7 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom let result: PDFProcessorResult | null = null; - // First, try parsing with PdfParse + // First, try parsing with pdf2md result = await scrapePDFWithParsePDF( { ...meta, @@ -169,14 +170,14 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom result = llamaResult; // Use LlamaParse result if successful } catch (error) { if (error instanceof Error && error.message === 
"LlamaParse timed out") { - meta.logger.warn("LlamaParse timed out -- using parse-pdf result", { + meta.logger.warn("LlamaParse timed out -- using pdf2md result", { error, }); } else if (error instanceof RemoveFeatureError) { throw error; } else { meta.logger.warn( - "LlamaParse failed to parse PDF -- using parse-pdf result", + "LlamaParse failed to parse PDF -- using pdf2md result", { error }, ); Sentry.captureException(error); From 194353af0db6f4734845940589ae233eb1a124f3 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 17 Dec 2024 10:04:20 -0500 Subject: [PATCH 31/65] Remove pdf parse --- apps/api/package.json | 1 - apps/api/pnpm-lock.yaml | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index f4ac48ff..1ccfee5e 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -101,7 +101,6 @@ "mongoose": "^8.4.4", "natural": "^7.0.7", "openai": "^4.57.0", - "pdf-parse": "^1.1.1", "pos": "^0.4.2", "posthog-node": "^4.0.1", "promptable": "^0.0.10", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 68d43655..9014c42e 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -158,9 +158,6 @@ importers: openai: specifier: ^4.57.0 version: 4.57.0(zod@3.23.8) - pdf-parse: - specifier: ^1.1.1 - version: 1.1.1 pos: specifier: ^0.4.2 version: 0.4.2 @@ -8927,7 +8924,8 @@ snapshots: node-domexception@1.0.0: {} - node-ensure@0.0.0: {} + node-ensure@0.0.0: + optional: true node-fetch@2.7.0: dependencies: @@ -9157,6 +9155,7 @@ snapshots: node-ensure: 0.0.0 transitivePeerDependencies: - supports-color + optional: true peberminta@0.9.0: {} From a20a003c740307945ba752d31c3912ba485963a5 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 17 Dec 2024 12:12:22 -0500 Subject: [PATCH 32/65] revert to pdf parse --- apps/api/package.json | 2 +- apps/api/pnpm-lock.yaml | 309 +----------------- .../scraper/scrapeURL/engines/pdf/index.ts | 17 +- apps/api/tsconfig.json | 3 +- 4 files changed, 22 insertions(+), 309 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 1ccfee5e..c4e70901 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -58,7 +58,6 @@ "@devil7softwares/pos": "^1.0.2", "@dqbd/tiktoken": "^1.0.17", "@nangohq/node": "^0.40.8", - "@opendocsg/pdf2md": "^0.2.1", "@sentry/cli": "^2.33.1", "@sentry/node": "^8.26.0", "@sentry/profiling-node": "^8.26.0", @@ -101,6 +100,7 @@ "mongoose": "^8.4.4", "natural": "^7.0.7", "openai": "^4.57.0", + "pdf-parse": "^1.1.1", "pos": "^0.4.2", "posthog-node": "^4.0.1", "promptable": "^0.0.10", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 9014c42e..569eafd9 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -29,9 +29,6 @@ importers: '@nangohq/node': specifier: ^0.40.8 version: 0.40.8 - '@opendocsg/pdf2md': - specifier: ^0.2.1 - version: 0.2.1 '@sentry/cli': specifier: ^2.33.1 version: 2.33.1 @@ -158,6 +155,9 @@ importers: openai: specifier: ^4.57.0 version: 4.57.0(zod@3.23.8) + pdf-parse: + specifier: ^1.1.1 + version: 1.1.1 pos: specifier: ^0.4.2 version: 0.4.2 @@ -794,10 +794,6 @@ packages: resolution: {integrity: sha512-cXWgKE3sdWLSqAa8ykbCcUsUF1Kyr5J3HOWYGuobhPEycXW4WI++d5DhzdpL238mzoEXTi90VqfSCra37l5YqA==} engines: {node: '>=18'} - '@mapbox/node-pre-gyp@1.0.11': - resolution: {integrity: sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==} - hasBin: true - '@mixmark-io/domino@2.2.0': resolution: {integrity: 
sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} @@ -841,10 +837,6 @@ packages: '@one-ini/wasm@0.1.1': resolution: {integrity: sha512-XuySG1E38YScSJoMlqovLru4KTUNSjgVTIjyh7qMX6aNN5HY5Ct5LhRJdxO79JtTzKfzV/bnWpz+zquYrISsvw==} - '@opendocsg/pdf2md@0.2.1': - resolution: {integrity: sha512-k/yvfrTb+GPTIIm/bMm5IsenTqAFl+IqvkBgFwFlmflS5TT7FOfyRLp8MypVWLAG4G9AnT7AZFbdQYgN/CR5BA==} - hasBin: true - '@opentelemetry/api-logs@0.52.1': resolution: {integrity: sha512-qnSqB2DQ9TPP96dl8cDubDvrUyWc0/sK81xHTK8eSUspzDM3bsewX903qclQFvVhgStjRWdC5bLb3kQqMkfV5A==} engines: {node: '>=14'} @@ -1623,9 +1615,6 @@ packages: resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==} engines: {node: '>=10.0.0'} - abbrev@1.1.1: - resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==} - abbrev@2.0.0: resolution: {integrity: sha512-6/mh1E2u2YgEsCHdY0Yx5oW+61gZU+1vXaoiHHrpKeuRNNgFvS+/jrwHiQhB5apAf5oB7UB7E19ol2R2LKH8hQ==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -1719,14 +1708,6 @@ packages: resolution: {integrity: sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==} engines: {node: '>=0.2.6'} - aproba@2.0.0: - resolution: {integrity: sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==} - - are-we-there-yet@2.0.0: - resolution: {integrity: sha512-Ci/qENmwHnsYo9xKIcUJN5LeDKdJ6R1Z1j9V/J5wyq8nh/mYPEpIKJbBZXtZjG04HiK7zV/p6Vs9952MrMeUIw==} - engines: {node: '>=10'} - deprecated: This package is no longer supported. - arg@4.1.3: resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==} @@ -1927,10 +1908,6 @@ packages: caniuse-lite@1.0.30001627: resolution: {integrity: sha512-4zgNiB8nTyV/tHhwZrFs88ryjls/lHiqFhrxCW4qSTeuRByBVnPYpDInchOIySWknznucaf31Z4KYqjfbrecVw==} - canvas@2.11.2: - resolution: {integrity: sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==} - engines: {node: '>=6'} - chalk@2.4.2: resolution: {integrity: sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==} engines: {node: '>=4'} @@ -1961,10 +1938,6 @@ packages: resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==} engines: {node: '>= 8.10.0'} - chownr@2.0.0: - resolution: {integrity: sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==} - engines: {node: '>=10'} - chownr@3.0.0: resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==} engines: {node: '>=18'} @@ -2022,10 +1995,6 @@ packages: color-string@1.9.1: resolution: {integrity: sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==} - color-support@1.1.3: - resolution: {integrity: sha512-qiBjkpbMLO/HL68y+lh4q0/O1MZFj2RX6X/KmMa3+gJD3z+WwI1ZzDHysvqHGS3mP6mznPckpXmw1nI9cJjyRg==} - hasBin: true - color@3.2.1: resolution: {integrity: sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==} @@ -2052,9 +2021,6 @@ packages: config-chain@1.1.13: resolution: {integrity: sha512-qj+f8APARXHrM0hraqXYb2/bOVSV4PvJQlNZ/DVj0QrmNM2q2euizkeuVckQ57J+W0mRH6Hvi+k50M4Jul2VRQ==} - console-control-strings@1.1.0: - resolution: {integrity: 
sha512-ty/fTekppD2fIwRvnZAVdeOiGd1c7YXEixbgJTNzqcxJWKQnjJ/V1bNEEE6hygpM3WjwHFUVK6HTjWSzV4a8sQ==} - content-disposition@0.5.4: resolution: {integrity: sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==} engines: {node: '>= 0.6'} @@ -2181,10 +2147,6 @@ packages: resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==} engines: {node: '>=10'} - decompress-response@4.2.1: - resolution: {integrity: sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==} - engines: {node: '>=8'} - dedent@1.5.3: resolution: {integrity: sha512-NHQtfOOW68WD8lgypbLA5oT+Bt0xXJhiYvoR6SmmNXZfpzOGXwdKWmcwG8N7PwVVWV3eF/68nmD9BaJSsTBhyQ==} peerDependencies: @@ -2209,9 +2171,6 @@ packages: resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==} engines: {node: '>=0.4.0'} - delegates@1.0.0: - resolution: {integrity: sha512-bd2L678uiWATM6m5Z1VzNCErI3jiGzt6HGY8OVICs40JQq/HALfbyNJmp0UDakEY4pMMaN0Ly5om/B1VI/+xfQ==} - denque@2.1.0: resolution: {integrity: sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw==} engines: {node: '>=0.10'} @@ -2324,9 +2283,6 @@ packages: resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} engines: {node: '>=0.12'} - enumify@1.0.4: - resolution: {integrity: sha512-5mwWXaVzJaqyUdOW/PDH5QySRgmQ8VvujmxmvXoXj9w0n+6omhVuyD56eI37FMqy/LxueJzsQ4DrHVQzuT/TXg==} - env-paths@2.2.1: resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==} engines: {node: '>=6'} @@ -2528,10 +2484,6 @@ packages: resolution: {integrity: sha512-PmDi3uwK5nFuXh7XDTlVnS17xJS7vW36is2+w3xcv8SVxiB4NyATf4ctkVY5bkSjX0Y4nbvZCq1/EjtEyr9ktw==} engines: {node: '>=14.14'} - fs-minipass@2.1.0: - resolution: {integrity: sha512-V/JgOLFCS+R6Vcq0slCuaeWEdNC3ouDlJMNIsacH2VtALiu9mV4LPrHc5cDl8k5aw6J8jwgWWpiTo5RYhmIzvg==} - engines: {node: '>= 8'} - fs.realpath@1.0.0: resolution: {integrity: sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==} @@ -2543,11 +2495,6 @@ packages: function-bind@1.1.2: resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==} - gauge@3.0.2: - resolution: {integrity: sha512-+5J6MS/5XksCuXq++uFRsnUd7Ovu1XenbeuIuNRJxYWjgQbPuFhT14lAvsWfqfAmnwluf1OwMjz39HjfLPci0Q==} - engines: {node: '>=10'} - deprecated: This package is no longer supported. 
- generic-pool@3.9.0: resolution: {integrity: sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==} engines: {node: '>= 4'} @@ -2631,9 +2578,6 @@ packages: resolution: {integrity: sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==} engines: {node: '>= 0.4'} - has-unicode@2.0.1: - resolution: {integrity: sha512-8Rf9Y83NBReMnx0gFzA8JImQACstCYWUplepDa9xprwwtmgEZUF0h/i5xSA625zB/I37EtrswSST6OXxwaaIJQ==} - hasown@2.0.2: resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==} engines: {node: '>= 0.4'} @@ -3312,10 +3256,6 @@ packages: resolution: {integrity: sha512-zobTr7akeGHnv7eBOXcRgMeCP6+uyYsczwmeRCauvpvaAltgNyTbLH/+VaEAPUeWBT+1GuNmz4wC/6jtQzbbVA==} engines: {node: '>=12'} - make-dir@3.1.0: - resolution: {integrity: sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==} - engines: {node: '>=8'} - make-dir@4.0.0: resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==} engines: {node: '>=10'} @@ -3386,10 +3326,6 @@ packages: resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==} engines: {node: '>=6'} - mimic-response@2.1.0: - resolution: {integrity: sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==} - engines: {node: '>=8'} - minimatch@3.1.2: resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==} @@ -3408,22 +3344,10 @@ packages: minimist@1.2.8: resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} - minipass@3.3.6: - resolution: {integrity: sha512-DxiNidxSEK+tHG6zOIklvNOwm3hvCrbUrdtzY74U6HKTJxvIDfOUL5W5P2Ghd3DTkhhKPYGqeNUIh5qcM4YBfw==} - engines: {node: '>=8'} - - minipass@5.0.0: - resolution: {integrity: sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ==} - engines: {node: '>=8'} - minipass@7.1.2: resolution: {integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==} engines: {node: '>=16 || 14 >=14.17'} - minizlib@2.1.2: - resolution: {integrity: sha512-bAxsR8BVfj60DWXHE3u30oHzfl4G7khkSuPW+qvpd7jFRHm7dLxOjUk1EHACJ/hxLY8phGJ0YhYHZo7jil7Qdg==} - engines: {node: '>= 8'} - minizlib@3.0.1: resolution: {integrity: sha512-umcy022ILvb5/3Djuu8LWeqUa8D68JaBzlttKeMWen48SjabqS3iY5w/vzeMzMUNhLDifyhbOwKDSznB1vvrwg==} engines: {node: '>= 18'} @@ -3435,11 +3359,6 @@ packages: resolution: {integrity: sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==} hasBin: true - mkdirp@1.0.4: - resolution: {integrity: sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw==} - engines: {node: '>=10'} - hasBin: true - mkdirp@3.0.1: resolution: {integrity: sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==} engines: {node: '>=10'} @@ -3528,9 +3447,6 @@ packages: resolution: {integrity: sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==} hasBin: true - nan@2.22.0: - resolution: {integrity: sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw==} - natural-compare@1.4.0: resolution: {integrity: 
sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} @@ -3591,11 +3507,6 @@ packages: engines: {node: '>=8.10.0'} hasBin: true - nopt@5.0.0: - resolution: {integrity: sha512-Tbj67rffqceeLpcRXrT7vKAN8CwfPeIBgM7E6iBkmKLV7bEMwpGgYLGv0jACUsECaa/vuxP0IjEont6umdMgtQ==} - engines: {node: '>=6'} - hasBin: true - nopt@7.2.1: resolution: {integrity: sha512-taM24ViiimT/XntxbPyJQzCG+p4EKOpgD3mxFwW38mGjVUrfERQOeY4EDHjdnptttfHuHQXFx+lTP08Q+mLa/w==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -3613,10 +3524,6 @@ packages: resolution: {integrity: sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==} engines: {node: '>=8'} - npmlog@5.0.1: - resolution: {integrity: sha512-AqZtDUWOMKs1G/8lwylVjrdYgqA4d9nu8hc+0gzRxlDb1I10+FHBGMXs6aiQHFdCUUlqH99MUMuLfzWDNDtfxw==} - deprecated: This package is no longer supported. - nth-check@2.1.1: resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} @@ -4042,11 +3949,6 @@ packages: resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==} engines: {node: '>= 4'} - rimraf@3.0.2: - resolution: {integrity: sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==} - deprecated: Rimraf versions prior to v4 are no longer supported - hasBin: true - rimraf@5.0.7: resolution: {integrity: sha512-nV6YcJo5wbLW77m+8KjH8aB/7/rxQy9SZ0HY5shnwULfS+9nmTtVXAJET5NdZmCzA4fPI/Hm1wo/Po/4mopOdg==} engines: {node: '>=14.18'} @@ -4117,9 +4019,6 @@ packages: resolution: {integrity: sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==} engines: {node: '>= 0.8.0'} - set-blocking@2.0.0: - resolution: {integrity: sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==} - set-function-length@1.2.2: resolution: {integrity: sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==} engines: {node: '>= 0.4'} @@ -4158,12 +4057,6 @@ packages: resolution: {integrity: sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==} engines: {node: '>=14'} - simple-concat@1.0.1: - resolution: {integrity: sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==} - - simple-get@3.1.1: - resolution: {integrity: sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==} - simple-swizzle@0.2.2: resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==} @@ -4322,10 +4215,6 @@ packages: tar-stream@3.1.7: resolution: {integrity: sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==} - tar@6.2.1: - resolution: {integrity: sha512-DZ4yORTwrbTj/7MZYq2w+/ZFdI6OZ/f9SFHR+71gIVUZhOQPHzVCLpvRnPgyaMpfWxxk/4ONva3GQSyNIKRv6A==} - engines: {node: '>=10'} - tar@7.2.0: resolution: {integrity: sha512-hctwP0Nb4AB60bj8WQgRYaMOuJYRAPMGiQUAotms5igN8ppfQM+IvjQ5HcKu1MaZh2Wy2KWVTe563Yj8dfc14w==} engines: {node: '>=18'} @@ -4480,9 +4369,6 @@ packages: resolution: {integrity: sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==} engines: {node: '>= 10.0.0'} - unpdf@0.12.1: - resolution: {integrity: sha512-ktP8+TTLDBrlu/j8rQVNbHoMMpFXzkVAkb1rt/JdshFC3jOHdZjuGCNl/voPL0kraUrUOH7ZC88kVxMvlvDBzA==} - unpipe@1.0.0: 
resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==} engines: {node: '>= 0.8'} @@ -4570,9 +4456,6 @@ packages: engines: {node: '>= 8'} hasBin: true - wide-align@1.1.5: - resolution: {integrity: sha512-eDMORYaPNZ4sQIuuYPDHdQvf4gyCF9rEEV/yPxGfwPkRodwEgiMUUXTx/dex+Me0wxx53S+NgUHaP7y3MGlDmg==} - winston-transport@4.8.0: resolution: {integrity: sha512-qxSTKswC6llEMZKgCQdaWgDuMJQnhuvF5f2Nk3SNXc4byfQ+voo2mX1Px9dkNOuR8p0KAjfPG29PuYUSIb+vSA==} engines: {node: '>= 12.0.0'} @@ -5724,22 +5607,6 @@ snapshots: - langchain - openai - '@mapbox/node-pre-gyp@1.0.11': - dependencies: - detect-libc: 2.0.3 - https-proxy-agent: 5.0.1 - make-dir: 3.1.0 - node-fetch: 2.7.0 - nopt: 5.0.0 - npmlog: 5.0.1 - rimraf: 3.0.2 - semver: 7.6.2 - tar: 6.2.1 - transitivePeerDependencies: - - encoding - - supports-color - optional: true - '@mixmark-io/domino@2.2.0': {} '@mongodb-js/saslprep@1.1.7': @@ -5772,15 +5639,6 @@ snapshots: '@one-ini/wasm@0.1.1': {} - '@opendocsg/pdf2md@0.2.1': - dependencies: - enumify: 1.0.4 - minimist: 1.2.8 - unpdf: 0.12.1 - transitivePeerDependencies: - - encoding - - supports-color - '@opentelemetry/api-logs@0.52.1': dependencies: '@opentelemetry/api': 1.9.0 @@ -6815,9 +6673,6 @@ snapshots: '@xmldom/xmldom@0.8.10': {} - abbrev@1.1.1: - optional: true - abbrev@2.0.0: {} abort-controller@3.0.0: @@ -6900,15 +6755,6 @@ snapshots: dependencies: sylvester: 0.0.12 - aproba@2.0.0: - optional: true - - are-we-there-yet@2.0.0: - dependencies: - delegates: 1.0.0 - readable-stream: 3.6.2 - optional: true - arg@4.1.3: {} argparse@1.0.10: @@ -7162,16 +7008,6 @@ snapshots: caniuse-lite@1.0.30001627: {} - canvas@2.11.2: - dependencies: - '@mapbox/node-pre-gyp': 1.0.11 - nan: 2.22.0 - simple-get: 3.1.1 - transitivePeerDependencies: - - encoding - - supports-color - optional: true - chalk@2.4.2: dependencies: ansi-styles: 3.2.1 @@ -7220,9 +7056,6 @@ snapshots: optionalDependencies: fsevents: 2.3.3 - chownr@2.0.0: - optional: true - chownr@3.0.0: {} chromium-bidi@0.5.24(devtools-protocol@0.0.1299070): @@ -7288,9 +7121,6 @@ snapshots: color-name: 1.1.4 simple-swizzle: 0.2.2 - color-support@1.1.3: - optional: true - color@3.2.1: dependencies: color-convert: 1.9.3 @@ -7318,9 +7148,6 @@ snapshots: ini: 1.3.8 proto-list: 1.2.4 - console-control-strings@1.1.0: - optional: true - content-disposition@0.5.4: dependencies: safe-buffer: 5.2.1 @@ -7428,11 +7255,6 @@ snapshots: decamelize@4.0.0: {} - decompress-response@4.2.1: - dependencies: - mimic-response: 2.1.0 - optional: true - dedent@1.5.3: {} deepmerge@4.3.1: {} @@ -7451,9 +7273,6 @@ snapshots: delayed-stream@1.0.0: {} - delegates@1.0.0: - optional: true - denque@2.1.0: {} depd@2.0.0: {} @@ -7545,8 +7364,6 @@ snapshots: entities@4.5.0: {} - enumify@1.0.4: {} - env-paths@2.2.1: {} error-ex@1.3.2: @@ -7661,7 +7478,7 @@ snapshots: extract-zip@2.0.1: dependencies: - debug: 4.3.4 + debug: 4.3.5 get-stream: 5.2.0 yauzl: 2.10.0 optionalDependencies: @@ -7772,11 +7589,6 @@ snapshots: jsonfile: 6.1.0 universalify: 2.0.1 - fs-minipass@2.1.0: - dependencies: - minipass: 3.3.6 - optional: true - fs.realpath@1.0.0: {} fsevents@2.3.3: @@ -7784,19 +7596,6 @@ snapshots: function-bind@1.1.2: {} - gauge@3.0.2: - dependencies: - aproba: 2.0.0 - color-support: 1.1.3 - console-control-strings: 1.1.0 - has-unicode: 2.0.1 - object-assign: 4.1.1 - signal-exit: 3.0.7 - string-width: 4.2.3 - strip-ansi: 6.0.1 - wide-align: 1.1.5 - optional: true - generic-pool@3.9.0: {} gensync@1.0.0-beta.2: {} @@ -7823,7 +7622,7 @@ snapshots: 
dependencies: basic-ftp: 5.0.5 data-uri-to-buffer: 6.0.2 - debug: 4.3.4 + debug: 4.3.5 fs-extra: 11.2.0 transitivePeerDependencies: - supports-color @@ -7884,9 +7683,6 @@ snapshots: has-symbols@1.0.3: {} - has-unicode@2.0.1: - optional: true - hasown@2.0.2: dependencies: function-bind: 1.1.2 @@ -7927,7 +7723,7 @@ snapshots: http-proxy-agent@7.0.2: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 transitivePeerDependencies: - supports-color @@ -7975,7 +7771,7 @@ snapshots: https-proxy-agent@7.0.5: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 transitivePeerDependencies: - supports-color @@ -8667,11 +8463,6 @@ snapshots: luxon@3.4.4: {} - make-dir@3.1.0: - dependencies: - semver: 6.3.1 - optional: true - make-dir@4.0.0: dependencies: semver: 7.6.2 @@ -8732,9 +8523,6 @@ snapshots: mimic-fn@2.1.0: {} - mimic-response@2.1.0: - optional: true - minimatch@3.1.2: dependencies: brace-expansion: 1.1.11 @@ -8753,22 +8541,8 @@ snapshots: minimist@1.2.8: {} - minipass@3.3.6: - dependencies: - yallist: 4.0.0 - optional: true - - minipass@5.0.0: - optional: true - minipass@7.1.2: {} - minizlib@2.1.2: - dependencies: - minipass: 3.3.6 - yallist: 4.0.0 - optional: true - minizlib@3.0.1: dependencies: minipass: 7.1.2 @@ -8780,9 +8554,6 @@ snapshots: dependencies: minimist: 1.2.8 - mkdirp@1.0.4: - optional: true - mkdirp@3.0.1: {} ml-array-mean@1.1.6: @@ -8875,9 +8646,6 @@ snapshots: mustache@4.2.0: {} - nan@2.22.0: - optional: true - natural-compare@1.4.0: {} natural@7.0.7(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3): @@ -8924,8 +8692,7 @@ snapshots: node-domexception@1.0.0: {} - node-ensure@0.0.0: - optional: true + node-ensure@0.0.0: {} node-fetch@2.7.0: dependencies: @@ -8959,11 +8726,6 @@ snapshots: touch: 3.1.1 undefsafe: 2.0.5 - nopt@5.0.0: - dependencies: - abbrev: 1.1.1 - optional: true - nopt@7.2.1: dependencies: abbrev: 2.0.0 @@ -8976,14 +8738,6 @@ snapshots: dependencies: path-key: 3.1.1 - npmlog@5.0.1: - dependencies: - are-we-there-yet: 2.0.0 - console-control-strings: 1.1.0 - gauge: 3.0.2 - set-blocking: 2.0.0 - optional: true - nth-check@2.1.1: dependencies: boolbase: 1.0.0 @@ -9082,7 +8836,7 @@ snapshots: dependencies: '@tootallnate/quickjs-emscripten': 0.23.0 agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 get-uri: 6.0.3 http-proxy-agent: 7.0.2 https-proxy-agent: 7.0.5 @@ -9155,7 +8909,6 @@ snapshots: node-ensure: 0.0.0 transitivePeerDependencies: - supports-color - optional: true peberminta@0.9.0: {} @@ -9278,7 +9031,7 @@ snapshots: proxy-agent@6.4.0: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 http-proxy-agent: 7.0.2 https-proxy-agent: 7.0.5 lru-cache: 7.18.3 @@ -9457,11 +9210,6 @@ snapshots: retry@0.13.1: {} - rimraf@3.0.2: - dependencies: - glob: 7.2.3 - optional: true - rimraf@5.0.7: dependencies: glob: 10.4.2 @@ -9537,9 +9285,6 @@ snapshots: transitivePeerDependencies: - supports-color - set-blocking@2.0.0: - optional: true - set-function-length@1.2.2: dependencies: define-data-property: 1.1.4 @@ -9576,16 +9321,6 @@ snapshots: signal-exit@4.1.0: {} - simple-concat@1.0.1: - optional: true - - simple-get@3.1.1: - dependencies: - decompress-response: 4.2.1 - once: 1.4.0 - simple-concat: 1.0.1 - optional: true - simple-swizzle@0.2.2: dependencies: is-arrayish: 0.3.2 @@ -9603,7 +9338,7 @@ snapshots: socks-proxy-agent@8.0.4: dependencies: agent-base: 7.1.1 - debug: 4.3.4 + debug: 4.3.5 socks: 2.8.3 transitivePeerDependencies: - supports-color @@ -9759,16 +9494,6 @@ snapshots: 
fast-fifo: 1.3.2 streamx: 2.18.0 - tar@6.2.1: - dependencies: - chownr: 2.0.0 - fs-minipass: 2.1.0 - minipass: 5.0.0 - minizlib: 2.1.2 - mkdirp: 1.0.4 - yallist: 4.0.0 - optional: true - tar@7.2.0: dependencies: '@isaacs/fs-minipass': 4.0.1 @@ -9901,13 +9626,6 @@ snapshots: universalify@2.0.1: {} - unpdf@0.12.1: - optionalDependencies: - canvas: 2.11.2 - transitivePeerDependencies: - - encoding - - supports-color - unpipe@1.0.0: {} unstructured-client@0.11.3(zod@3.23.8): @@ -9980,11 +9698,6 @@ snapshots: dependencies: isexe: 2.0.0 - wide-align@1.1.5: - dependencies: - string-width: 4.2.3 - optional: true - winston-transport@4.8.0: dependencies: logform: 2.6.1 diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index c3baa694..0983e4b1 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -6,7 +6,7 @@ import { robustFetch } from "../../lib/fetch"; import { z } from "zod"; import * as Sentry from "@sentry/node"; import escapeHtml from "escape-html"; -import pdf2md from "@opendocsg/pdf2md"; +import PdfParse from "pdf-parse"; import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; import { RemoveFeatureError } from "../../error"; @@ -113,11 +113,10 @@ async function scrapePDFWithParsePDF( meta: Meta, tempFilePath: string, ): Promise { - meta.logger.debug("Processing PDF document with pdf2md", { tempFilePath }); + meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath }); - const pdfBuffer = await fs.readFile(tempFilePath); - const markdown = await pdf2md(pdfBuffer); - const escaped = escapeHtml(markdown); + const result = await PdfParse(await fs.readFile(tempFilePath)); + const escaped = escapeHtml(result.text); return { markdown: escaped, @@ -142,7 +141,7 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom let result: PDFProcessorResult | null = null; - // First, try parsing with pdf2md + // First, try parsing with PdfParse result = await scrapePDFWithParsePDF( { ...meta, @@ -170,14 +169,14 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom result = llamaResult; // Use LlamaParse result if successful } catch (error) { if (error instanceof Error && error.message === "LlamaParse timed out") { - meta.logger.warn("LlamaParse timed out -- using pdf2md result", { + meta.logger.warn("LlamaParse timed out -- using parse-pdf result", { error, }); } else if (error instanceof RemoveFeatureError) { throw error; } else { meta.logger.warn( - "LlamaParse failed to parse PDF -- using pdf2md result", + "LlamaParse failed to parse PDF -- using parse-pdf result", { error }, ); Sentry.captureException(error); @@ -194,4 +193,4 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom html: result.html, markdown: result.markdown, }; -} +} \ No newline at end of file diff --git a/apps/api/tsconfig.json b/apps/api/tsconfig.json index 29093be6..ab2a9546 100644 --- a/apps/api/tsconfig.json +++ b/apps/api/tsconfig.json @@ -3,6 +3,7 @@ "rootDir": "./src", "lib": ["ES2022", "DOM"], + // or higher "target": "ES2022", @@ -18,7 +19,7 @@ "*": ["node_modules/*", "src/types/*"], }, - "inlineSources": true + "inlineSources": true, }, "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"] } From 0f8b8a717d6de3a0e1373f3ca69554819533d37c Mon Sep 17 
00:00:00 2001 From: Nicolas Date: Tue, 17 Dec 2024 15:19:52 -0300 Subject: [PATCH 33/65] Update map.ts --- apps/api/src/controllers/v1/map.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index cd302708..39393313 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -28,7 +28,7 @@ const redis = new Redis(process.env.REDIS_URL!); // Max Links that /map can return const MAX_MAP_LIMIT = 5000; // Max Links that "Smart /map" can return -const MAX_FIRE_ENGINE_RESULTS = 1000; +const MAX_FIRE_ENGINE_RESULTS = 500; interface MapResult { success: boolean; From 6d77879d6813ad926c9016239285090d9ed38599 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 17 Dec 2024 15:22:25 -0300 Subject: [PATCH 34/65] Update extract.ts --- apps/api/src/controllers/v1/extract.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index d05dbf6e..ed3f149c 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -308,3 +308,4 @@ function filterAndProcessLinks( x !== undefined && x.url !== undefined && !isUrlBlocked(x.url), ); } + From 79e335636a146c744650fd4daadc478447a775b1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 17 Dec 2024 16:40:45 -0300 Subject: [PATCH 35/65] Nick: fixed extract issues --- apps/api/src/controllers/v1/extract.ts | 12 ++++++++++-- apps/api/src/controllers/v1/types.ts | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index ed3f149c..48cba606 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -61,7 +61,7 @@ export async function extractController( const baseUrl = url.replace("/*", ""); // const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any - const allowExternalLinks = req.body.allowExternalLinks ?? 
true; + const allowExternalLinks = req.body.allowExternalLinks; let urlWithoutWww = baseUrl.replace("www.", ""); let mapUrl = req.body.prompt && allowExternalLinks @@ -84,6 +84,8 @@ export async function extractController( includeSubdomains: req.body.includeSubdomains, }); + // console.log("mapResults", mapResults); + let mappedLinks = mapResults.links as MapDocument[]; // Limit number of links to MAX_EXTRACT_LIMIT mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT); @@ -92,6 +94,7 @@ export async function extractController( (x) => `url: ${x.url}, title: ${x.title}, description: ${x.description}`, ); + // console.log("mappedLinksRerank", mappedLinksRerank); // Filter by path prefix if present // wrong @@ -150,15 +153,20 @@ export async function extractController( } else { // Handle direct URLs without glob pattern if (!isUrlBlocked(url)) { + // console.log("url", url); return [url]; } return []; } }); + // console.log("urlPromises", urlPromises.length); + // Wait for all URL processing to complete and flatten results const processedUrls = await Promise.all(urlPromises); - links.push(...processedUrls.flat()); + const flattenedUrls = processedUrls.flat().filter(url => url); // Filter out any null/undefined values + links.push(...flattenedUrls); + // console.log("links", links.length, "flattenedUrls", flattenedUrls.length); if (links.length === 0) { return res.status(400).json({ diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 2c054560..5ba36ea7 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -198,7 +198,7 @@ export const extractV1Options = z limit: z.number().int().positive().finite().safe().optional(), ignoreSitemap: z.boolean().default(false), includeSubdomains: z.boolean().default(true), - allowExternalLinks: z.boolean().default(true), + allowExternalLinks: z.boolean().default(false), origin: z.string().optional().default("api"), timeout: z.number().int().positive().finite().safe().default(60000), }) From b9f621bed5cf0b5e1490e44f7891bf27ec848727 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 17 Dec 2024 16:58:35 -0300 Subject: [PATCH 36/65] Nick: extract fixes --- apps/api/src/controllers/v1/extract.ts | 51 ++++++++++++----------- apps/api/src/lib/LLM-extraction/index.ts | 14 +++++++ apps/api/src/lib/extract/build-prompts.ts | 14 +++++++ apps/api/src/lib/extract/completions.ts | 1 + 4 files changed, 55 insertions(+), 25 deletions(-) create mode 100644 apps/api/src/lib/extract/build-prompts.ts diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index 48cba606..f3e94b77 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -24,6 +24,8 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { getMapResults } from "./map"; import { buildDocument } from "../../lib/extract/build-document"; +import { generateBasicCompletion } from "../../lib/LLM-extraction"; +import { buildRefrasedPrompt } from "../../lib/extract/build-prompts"; configDotenv(); const redis = new Redis(process.env.REDIS_URL!); @@ -63,30 +65,35 @@ export async function extractController( const allowExternalLinks = req.body.allowExternalLinks; let urlWithoutWww = baseUrl.replace("www.", ""); - let mapUrl = - req.body.prompt && allowExternalLinks - ? `${req.body.prompt} ${urlWithoutWww}` - : req.body.prompt - ? 
`${req.body.prompt} site:${urlWithoutWww}` - : `site:${urlWithoutWww}`; + + let rephrasedPrompt = req.body.prompt; + if (req.body.prompt) { + rephrasedPrompt = + (await generateBasicCompletion( + buildRefrasedPrompt(req.body.prompt, baseUrl), + )) ?? req.body.prompt; + } const mapResults = await getMapResults({ url: baseUrl, - search: req.body.prompt, + search: rephrasedPrompt, teamId: req.auth.team_id, plan: req.auth.plan, allowExternalLinks, origin: req.body.origin, limit: req.body.limit, // If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping - ignoreSitemap: !selfHosted ? true : false, + ignoreSitemap: false, includeMetadata: true, includeSubdomains: req.body.includeSubdomains, }); - // console.log("mapResults", mapResults); - let mappedLinks = mapResults.links as MapDocument[]; + + if (mappedLinks.length === 0) { + mappedLinks = [{ url: baseUrl, title: "", description: "" }]; + } + // Limit number of links to MAX_EXTRACT_LIMIT mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT); @@ -94,20 +101,19 @@ export async function extractController( (x) => `url: ${x.url}, title: ${x.title}, description: ${x.description}`, ); - // console.log("mappedLinksRerank", mappedLinksRerank); - - // Filter by path prefix if present - // wrong - // if (pathPrefix) { - // mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`)); - // } if (req.body.prompt) { + let searchQuery = + req.body.prompt && allowExternalLinks + ? `${req.body.prompt} ${urlWithoutWww}` + : req.body.prompt + ? `${req.body.prompt} site:${urlWithoutWww}` + : `site:${urlWithoutWww}`; // Get similarity scores between the search query and each link's context const linksAndScores = await performRanking( mappedLinksRerank, mappedLinks.map((l) => l.url), - mapUrl, + searchQuery, ); // First try with high threshold @@ -153,20 +159,16 @@ export async function extractController( } else { // Handle direct URLs without glob pattern if (!isUrlBlocked(url)) { - // console.log("url", url); return [url]; } return []; } }); - // console.log("urlPromises", urlPromises.length); - // Wait for all URL processing to complete and flatten results const processedUrls = await Promise.all(urlPromises); - const flattenedUrls = processedUrls.flat().filter(url => url); // Filter out any null/undefined values + const flattenedUrls = processedUrls.flat().filter((url) => url); // Filter out any null/undefined values links.push(...flattenedUrls); - // console.log("links", links.length, "flattenedUrls", flattenedUrls.length); if (links.length === 0) { return res.status(400).json({ @@ -212,7 +214,7 @@ export async function extractController( } return doc; } catch (e) { - logger.error(`Error in scrapeController: ${e}`); + logger.error(`Error in extractController: ${e}`); if ( e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout") @@ -316,4 +318,3 @@ function filterAndProcessLinks( x !== undefined && x.url !== undefined && !isUrlBlocked(x.url), ); } - diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index de7017ea..22e2bd04 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -62,3 +62,17 @@ export async function generateCompletions( return completions; } + +// generate basic completion + +export async function generateBasicCompletion(prompt: string) { + const openai = new OpenAI(); + const model = process.env.MODEL_NAME ?? 
"gpt-4o-mini"; + + const completion = await openai.chat.completions.create({ + model, + messages: [{ role: "user", content: prompt }], + }); + + return completion.choices[0].message.content; +} diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts new file mode 100644 index 00000000..1ab117c2 --- /dev/null +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -0,0 +1,14 @@ +export function buildRefrasedPrompt(prompt: string, url: string): string { + return `You are a search query optimizer. Your task is to rephrase the following prompt into an effective search query that will find relevant results about this topic on ${url}. + +Original prompt: "${prompt}" + +Provide a rephrased search query that: +1. Maintains the core intent of the original prompt +2. Uses relevant keywords +3. Is optimized for search engine results +4. Is concise and focused +5. Short is better than long + +Return only the rephrased search query, without any explanation or additional text.`; +} diff --git a/apps/api/src/lib/extract/completions.ts b/apps/api/src/lib/extract/completions.ts index 34a5a215..8d1b95c9 100644 --- a/apps/api/src/lib/extract/completions.ts +++ b/apps/api/src/lib/extract/completions.ts @@ -122,3 +122,4 @@ // }, // }; // } + From 3b6edef9fa8f48cf826f6f2d77796c8e24423cca Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 17 Dec 2024 16:58:57 -0300 Subject: [PATCH 37/65] chore: formatting --- apps/api/src/controllers/v1/scrape.ts | 6 +++++- apps/api/src/lib/crawl-redis.ts | 8 ++++++-- apps/api/src/lib/extract/completions.ts | 1 - .../scraper/scrapeURL/engines/fetch/index.ts | 2 +- .../scrapeURL/engines/fire-engine/index.ts | 2 +- .../src/scraper/scrapeURL/engines/index.ts | 7 +++++-- .../scraper/scrapeURL/engines/pdf/index.ts | 14 ++++++++++---- .../scrapeURL/engines/scrapingbee/index.ts | 5 ++++- apps/api/src/scraper/scrapeURL/error.ts | 4 +--- apps/api/src/scraper/scrapeURL/index.ts | 7 ++++--- apps/api/src/services/queue-jobs.ts | 19 ++++++++++++------- apps/api/src/services/queue-worker.ts | 10 ++++++---- 12 files changed, 55 insertions(+), 30 deletions(-) diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index f1fe3431..1ea28995 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -60,7 +60,11 @@ export async function scrapeController( try { doc = await waitForJob(jobId, timeout + totalWait); // TODO: better types for this } catch (e) { - logger.error(`Error in scrapeController: ${e}`, { jobId, scrapeId: jobId, startTime }); + logger.error(`Error in scrapeController: ${e}`, { + jobId, + scrapeId: jobId, + startTime, + }); if ( e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout") diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 602d13b3..6ecb0b8f 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -94,9 +94,13 @@ export async function addCrawlJobDone( await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id); } else { // in case it's already been pushed, make sure it's removed - await redisConnection.lrem("crawl:" + id + ":jobs_done_ordered", -1, job_id); + await redisConnection.lrem( + "crawl:" + id + ":jobs_done_ordered", + -1, + job_id, + ); } - + await redisConnection.expire( "crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, diff --git a/apps/api/src/lib/extract/completions.ts b/apps/api/src/lib/extract/completions.ts index 8d1b95c9..34a5a215 100644 
--- a/apps/api/src/lib/extract/completions.ts +++ b/apps/api/src/lib/extract/completions.ts @@ -122,4 +122,3 @@ // }, // }; // } - diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts index 168d9b8f..a0c8eaba 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts @@ -5,7 +5,7 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler"; export async function scrapeURLWithFetch( meta: Meta, - timeToRun: number | undefined + timeToRun: number | undefined, ): Promise { const timeout = timeToRun ?? 300000; diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 2dc134c9..14abf9a9 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -128,7 +128,7 @@ export async function scrapeURLWithFireEngineChromeCDP( (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a), 0, ); - + const timeout = (timeToRun ?? 300000) + totalWait; const request: FireEngineScrapeRequestCommon & diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 14f263f3..bb0c485c 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -105,7 +105,10 @@ export type EngineScrapeResult = { }; const engineHandlers: { - [E in Engine]: (meta: Meta, timeToRun: number | undefined) => Promise; + [E in Engine]: ( + meta: Meta, + timeToRun: number | undefined, + ) => Promise; } = { cache: scrapeCache, "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP, @@ -372,7 +375,7 @@ export function buildFallbackList(meta: Meta): { export async function scrapeURLWithEngine( meta: Meta, engine: Engine, - timeToRun: number | undefined + timeToRun: number | undefined, ): Promise { const fn = engineHandlers[engine]; const logger = meta.logger.child({ diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 0983e4b1..9d2f11b1 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -124,7 +124,10 @@ async function scrapePDFWithParsePDF( }; } -export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Promise { +export async function scrapePDF( + meta: Meta, + timeToRun: number | undefined, +): Promise { if (!meta.options.parsePDF) { const file = await fetchFileToBuffer(meta.url); const content = file.buffer.toString("base64"); @@ -152,9 +155,12 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom tempFilePath, ); - // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse - if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) { + if ( + result.markdown && + result.markdown.length < 500 && + process.env.LLAMAPARSE_API_KEY + ) { try { const llamaResult = await scrapePDFWithLlamaParse( { @@ -193,4 +199,4 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom html: result.html, markdown: result.markdown, }; -} \ No newline at end of file +} diff --git a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts index db702a44..38c43878 100644 --- 
a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts @@ -10,7 +10,10 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!); export function scrapeURLWithScrapingBee( wait_browser: "domcontentloaded" | "networkidle2", ): (meta: Meta, timeToRun: number | undefined) => Promise { - return async (meta: Meta, timeToRun: number | undefined): Promise => { + return async ( + meta: Meta, + timeToRun: number | undefined, + ): Promise => { let response: AxiosResponse; const timeout = (timeToRun ?? 300000) + meta.options.waitFor; try { diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts index 0a4f6e5b..689f90c8 100644 --- a/apps/api/src/scraper/scrapeURL/error.ts +++ b/apps/api/src/scraper/scrapeURL/error.ts @@ -60,9 +60,7 @@ export class SiteError extends Error { export class ActionError extends Error { public code: string; constructor(code: string) { - super( - "Action(s) failed to complete. Error code: " + code, - ); + super("Action(s) failed to complete. Error code: " + code); this.code = code; } } diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 93bdb71b..1df812bd 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -203,9 +203,10 @@ async function scrapeURLLoop(meta: Meta): Promise { const results: EngineResultsTracker = {}; let result: EngineScrapeResultWithContext | null = null; - const timeToRun = meta.options.timeout !== undefined - ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2)) - : undefined + const timeToRun = + meta.options.timeout !== undefined + ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2)) + : undefined; for (const { engine, unsupportedFeatures } of fallbackList) { const startedAt = Date.now(); diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 6ce48a81..654f6cda 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -72,7 +72,12 @@ async function addScrapeJobRaw( } if (concurrencyLimited) { - await _addScrapeJobToConcurrencyQueue(webScraperOptions, options, jobId, jobPriority); + await _addScrapeJobToConcurrencyQueue( + webScraperOptions, + options, + jobId, + jobPriority, + ); } else { await _addScrapeJobToBullMQ(webScraperOptions, options, jobId, jobPriority); } @@ -130,17 +135,17 @@ export async function addScrapeJobs( let countCanBeDirectlyAdded = Infinity; - if ( - jobs[0].data && - jobs[0].data.team_id && - jobs[0].data.plan - ) { + if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) { const now = Date.now(); const limit = await getConcurrencyLimitMax(jobs[0].data.plan); console.log("CC limit", limit); cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now); - countCanBeDirectlyAdded = Math.max(limit - (await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length, 0); + countCanBeDirectlyAdded = Math.max( + limit - + (await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length, + 0, + ); } const addToBull = jobs.slice(0, countCanBeDirectlyAdded); diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index c2d2e2c6..705a06c7 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -496,15 +496,14 @@ async function processJob(job: Job & { id: string }, token: string) { // See lockURL const x = await 
redisConnection.sadd( "crawl:" + job.data.crawl_id + ":visited", - ...p1.map(x => x.href), + ...p1.map((x) => x.href), ); const lockRes = x === p1.length; - + if (job.data.crawlerOptions !== null && !lockRes) { throw new RacedRedirectError(); } } - } logger.debug("Logging job to DB..."); @@ -675,7 +674,10 @@ async function processJob(job: Job & { id: string }, token: string) { logger.debug("Declaring job as done..."); await addCrawlJobDone(job.data.crawl_id, job.id, false); - await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(job.data.url, sc)); + await redisConnection.srem( + "crawl:" + job.data.crawl_id + ":visited_unique", + normalizeURL(job.data.url, sc), + ); logger.debug("Logging job to DB..."); await logJob( From ac187452c3b73d647dfba11b8ff2531d582eda02 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 17 Dec 2024 17:34:55 -0300 Subject: [PATCH 38/65] Nick: better filtering for urls that should be scraped --- apps/api/src/controllers/v1/extract.ts | 18 +++++++++++++++++- apps/api/src/controllers/v1/map.ts | 6 ++++-- apps/api/src/lib/extract/build-prompts.ts | 4 +++- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index f3e94b77..c0e06a2d 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -26,6 +26,7 @@ import { getMapResults } from "./map"; import { buildDocument } from "../../lib/extract/build-document"; import { generateBasicCompletion } from "../../lib/LLM-extraction"; import { buildRefrasedPrompt } from "../../lib/extract/build-prompts"; +import { removeDuplicateUrls } from "../../lib/validateUrl"; configDotenv(); const redis = new Redis(process.env.REDIS_URL!); @@ -88,8 +89,22 @@ export async function extractController( includeSubdomains: req.body.includeSubdomains, }); - let mappedLinks = mapResults.links as MapDocument[]; + let mappedLinks = mapResults.mapResults as MapDocument[]; + // Remove duplicates between mapResults.links and mappedLinks + const allUrls = [...mappedLinks.map(m => m.url), ...mapResults.links]; + const uniqueUrls = removeDuplicateUrls(allUrls); + + // Only add URLs from mapResults.links that aren't already in mappedLinks + const existingUrls = new Set(mappedLinks.map(m => m.url)); + const newUrls = uniqueUrls.filter(url => !existingUrls.has(url)); + + mappedLinks = [ + ...mappedLinks, + ...newUrls.map(url => ({ url, title: "", description: "" })) + ]; + + if (mappedLinks.length === 0) { mappedLinks = [{ url: baseUrl, title: "", description: "" }]; } @@ -102,6 +117,7 @@ export async function extractController( `url: ${x.url}, title: ${x.title}, description: ${x.description}`, ); + if (req.body.prompt) { let searchQuery = req.body.prompt && allowExternalLinks diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 39393313..27a926fc 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -32,10 +32,11 @@ const MAX_FIRE_ENGINE_RESULTS = 500; interface MapResult { success: boolean; - links: string[] | any[]; + links: string[]; scrape_id?: string; job_id: string; time_taken: number; + mapResults: MapDocument[]; } export async function getMapResults({ @@ -215,7 +216,8 @@ export async function getMapResults({ return { success: true, - links: includeMetadata ? mapResults : linksToReturn, + links: linksToReturn, + mapResults: mapResults, scrape_id: origin?.includes("website") ? 
id : undefined, job_id: id, time_taken: (new Date().getTime() - Date.now()) / 1000, diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts index 1ab117c2..f554eadc 100644 --- a/apps/api/src/lib/extract/build-prompts.ts +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -4,11 +4,13 @@ export function buildRefrasedPrompt(prompt: string, url: string): string { Original prompt: "${prompt}" Provide a rephrased search query that: -1. Maintains the core intent of the original prompt +1. Maintains the core intent of the original prompt with ONLY the keywords 2. Uses relevant keywords 3. Is optimized for search engine results 4. Is concise and focused 5. Short is better than long +6. It is a search engine, not a chatbot +7. Concise Return only the rephrased search query, without any explanation or additional text.`; } From 780442d73b2f1a01b66f3cf224cd71f910061f3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= Date: Tue, 17 Dec 2024 22:01:41 +0100 Subject: [PATCH 39/65] feat: improve billing logging --- apps/api/src/controllers/v0/scrape.ts | 7 +++--- apps/api/src/main/runWebScraper.ts | 24 ++++++++++++------- .../src/services/billing/credit_billing.ts | 13 ++++++++-- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 96e6ea4f..2d8acf5f 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -265,9 +265,10 @@ export async function scrapeController(req: Request, res: Response) { } if (creditsToBeBilled > 0) { // billing for doc done on queue end, bill only for llm extraction - billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch((error) => { + billTeam(team_id, chunk?.sub_id, creditsToBeBilled, logger).catch((error) => { logger.error( - `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, + `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`, + { error } ); // Optionally, you could notify an admin or add to a retry queue here }); @@ -312,7 +313,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(result.returnCode).json(result); } catch (error) { Sentry.captureException(error); - logger.error(error); + logger.error("Scrape error occcurred", { error }); return res.status(500).json({ error: error instanceof ZodError diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 5fb574d4..c50ab9c9 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -7,7 +7,7 @@ import { import { billTeam } from "../services/billing/credit_billing"; import { Document } from "../controllers/v1/types"; import { supabase_service } from "../services/supabase"; -import { logger } from "../lib/logger"; +import { logger as _logger } from "../lib/logger"; import { ScrapeEvents } from "../lib/scrape-events"; import { configDotenv } from "dotenv"; import { @@ -66,6 +66,12 @@ export async function runWebScraper({ is_scrape = false, is_crawl = false, }: RunWebScraperParams): Promise { + const logger = _logger.child({ + method: "runWebScraper", + module: "runWebscraper", + scrapeId: bull_job_id, + jobId: bull_job_id, + }) const tries = is_crawl ? 
3 : 1; let response: ScrapeUrlResponse | undefined = undefined; @@ -75,10 +81,6 @@ export async function runWebScraper({ for (let i = 0; i < tries; i++) { if (i > 0) { logger.debug("Retrying scrape...", { - scrapeId: bull_job_id, - jobId: bull_job_id, - method: "runWebScraper", - module: "runWebScraper", tries, i, previousStatusCode: (response as any)?.document?.metadata?.statusCode, @@ -171,9 +173,10 @@ export async function runWebScraper({ creditsToBeBilled = 5; } - billTeam(team_id, undefined, creditsToBeBilled).catch((error) => { + billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => { logger.error( - `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, + `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`, + { error } ); // Optionally, you could notify an admin or add to a retry queue here }); @@ -232,6 +235,11 @@ const saveJob = async ( } ScrapeEvents.logJobEvent(job, "completed"); } catch (error) { - logger.error(`🐂 Failed to update job status: ${error}`); + _logger.error(`🐂 Failed to update job status`, { + module: "runWebScraper", + method: "saveJob", + jobId: job.id, + scrapeId: job.id, + }); } }; diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index bbd04cc0..c2671034 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -10,6 +10,7 @@ import { issueCredits } from "./issue_credits"; import { redlock } from "../redlock"; import { autoCharge } from "./auto_charge"; import { getValue, setValue } from "../redis"; +import type { Logger } from "winston"; const FREE_CREDITS = 500; @@ -20,22 +21,30 @@ export async function billTeam( team_id: string, subscription_id: string | null | undefined, credits: number, + logger?: Logger, ) { return withAuth(supaBillTeam, { success: true, message: "No DB, bypassed." })( team_id, subscription_id, credits, + logger, ); } export async function supaBillTeam( team_id: string, subscription_id: string | null | undefined, credits: number, + __logger?: Logger, ) { + const _logger = (__logger ?? logger).child({ + module: "credit_billing", + method: "supaBillTeam", + }); + if (team_id === "preview") { return { success: true, message: "Preview team, no credits used" }; } - logger.info(`Billing team ${team_id} for ${credits} credits`); + _logger.info(`Billing team ${team_id} for ${credits} credits`, { team_id, credits }); const { data, error } = await supabase_service.rpc("bill_team", { _team_id: team_id, @@ -46,7 +55,7 @@ export async function supaBillTeam( if (error) { Sentry.captureException(error); - logger.error("Failed to bill team: " + JSON.stringify(error)); + _logger.error("Failed to bill team.", { error }); return; } From bd36c441d3a4237b6d0159f58335185fc0fb78df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= Date: Tue, 17 Dec 2024 22:06:36 +0100 Subject: [PATCH 40/65] feat(queue-worker): improve team-based logging --- apps/api/src/services/queue-worker.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 705a06c7..4ef9610d 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -386,6 +386,7 @@ async function processJob(job: Job & { id: string }, token: string) { jobId: job.id, scrapeId: job.id, crawlId: job.data?.crawl_id ?? undefined, + teamId: job.data?.team_id ?? 
undefined, }); logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url }); From e776847c71a393d9fc49f6e1883d3911170a5ba7 Mon Sep 17 00:00:00 2001 From: RutamBhagat Date: Tue, 17 Dec 2024 11:00:13 -0800 Subject: [PATCH 41/65] feat(js-sdk): improve API key handling for cloud vs self-hosted services in FirecrawlApp --- .../firecrawl/src/__tests__/index.test.ts | 27 +++++++---- .../__tests__/v1/e2e_withAuth/index.test.ts | 46 ++++++++++++++----- apps/js-sdk/firecrawl/src/index.ts | 12 +++-- 3 files changed, 61 insertions(+), 24 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts index 92951237..6958abf8 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts @@ -1,9 +1,9 @@ -import { describe, test, expect, jest } from '@jest/globals'; -import axios from 'axios'; -import FirecrawlApp from '../index'; +import { describe, expect, jest, test } from '@jest/globals'; -import { readFile } from 'fs/promises'; +import FirecrawlApp from '../index'; +import axios from 'axios'; import { join } from 'path'; +import { readFile } from 'fs/promises'; // Mock jest and set the type jest.mock('axios'); @@ -14,13 +14,22 @@ async function loadFixture(name: string): Promise { return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8') } +const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev"; + describe('the firecrawl JS SDK', () => { - test('Should require an API key to instantiate FirecrawlApp', async () => { - const fn = () => { - new FirecrawlApp({ apiKey: undefined }); - }; - expect(fn).toThrow('No API key provided'); + test('Should require an API key only for cloud service', async () => { + if (API_URL.includes('api.firecrawl.dev')) { + // Should throw for cloud service + expect(() => { + new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL }); + }).toThrow('No API key provided'); + } else { + // Should not throw for self-hosted + expect(() => { + new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL }); + }).not.toThrow(); + } }); test('Should return scraped data from a /scrape API call', async () => { diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index dea55846..60d0b44f 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -9,15 +9,28 @@ const TEST_API_KEY = process.env.TEST_API_KEY; const API_URL = process.env.API_URL ?? 
"https://api.firecrawl.dev"; describe('FirecrawlApp E2E Tests', () => { - test.concurrent('should throw error for no API key', async () => { - expect(() => { - new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); - }).toThrow("No API key provided"); + test.concurrent('should throw error for no API key only for cloud service', async () => { + if (API_URL.includes('api.firecrawl.dev')) { + // Should throw for cloud service + expect(() => { + new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); + }).toThrow("No API key provided"); + } else { + // Should not throw for self-hosted + expect(() => { + new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); + }).not.toThrow(); + } }); test.concurrent('should throw error for invalid API key on scrape', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + if (API_URL.includes('api.firecrawl.dev')) { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 404"); + } else { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).resolves.not.toThrow(); + } }); test.concurrent('should throw error for blocklisted URL on scrape', async () => { @@ -155,8 +168,13 @@ describe('FirecrawlApp E2E Tests', () => { }, 30000); // 30 seconds timeout test.concurrent('should throw error for invalid API key on crawl', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + if (API_URL.includes('api.firecrawl.dev')) { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404"); + } else { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).resolves.not.toThrow(); + } }); test.concurrent('should throw error for blocklisted URL on crawl', async () => { @@ -337,8 +355,13 @@ describe('FirecrawlApp E2E Tests', () => { }, 60000); // 60 seconds timeout test.concurrent('should throw error for invalid API key on map', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + if (API_URL.includes('api.firecrawl.dev')) { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404"); + } else { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).resolves.not.toThrow(); + } }); test.concurrent('should throw error for blocklisted URL on map', async () => { @@ -355,8 +378,7 @@ describe('FirecrawlApp E2E Tests', () => { }, 30000); // 30 seconds timeout test.concurrent('should return successful response for valid map', async () => { - const 
app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; expect(response).not.toBeNull(); expect(response.links?.length).toBeGreaterThan(0); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 020a2293..6d9a0a73 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -289,17 +289,23 @@ export default class FirecrawlApp { public apiKey: string; public apiUrl: string; + private isCloudService(url: string): boolean { + return url.includes('api.firecrawl.dev'); + } + /** * Initializes a new instance of the FirecrawlApp class. * @param config - Configuration options for the FirecrawlApp instance. */ constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) { - if (typeof apiKey !== "string") { + const baseUrl = apiUrl || "https://api.firecrawl.dev"; + + if (this.isCloudService(baseUrl) && typeof apiKey !== "string") { throw new FirecrawlError("No API key provided", 401); } - this.apiKey = apiKey; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; + this.apiKey = apiKey || ''; + this.apiUrl = baseUrl; } /** From e899ecbe44ded76c2a957a1a9cd8683db1708e70 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 18 Dec 2024 16:52:05 -0300 Subject: [PATCH 42/65] Update llmExtract.ts --- apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 6380edb8..9b23af2c 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -159,8 +159,8 @@ export async function generateOpenAICompletions( role: "user", content: options.prompt !== undefined - ? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}` - : "Transform the above content into structured JSON output.", + ? 
`Transform the above content into structured JSON output based on the provided schema if any and the following user request: ${options.prompt}` + : "Transform the above content into structured JSON output based on the provided schema if any.", }, ], response_format: options.schema From 19246f6289172ea356bbcc45d40797bbf1838425 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 18 Dec 2024 18:36:04 -0300 Subject: [PATCH 43/65] feat-SDK/added crawl id to ws --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 15 +++++++++++++-- apps/python-sdk/firecrawl/__init__.py | 2 +- apps/python-sdk/firecrawl/firecrawl.py | 8 ++++---- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 74dfcb02..1c7f082f 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.4", + "version": "1.9.5", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 020a2293..44063097 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -934,9 +934,11 @@ export class CrawlWatcher extends TypedEventTarget { private ws: WebSocket; public data: FirecrawlDocument[]; public status: CrawlStatusResponse["status"]; + public id: string; constructor(id: string, app: FirecrawlApp) { super(); + this.id = id; this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey); this.status = "scraping"; this.data = []; @@ -967,6 +969,7 @@ export class CrawlWatcher extends TypedEventTarget { detail: { status: this.status, data: this.data, + id: this.id, }, })); } else if (msg.type === "error") { @@ -976,6 +979,7 @@ export class CrawlWatcher extends TypedEventTarget { status: this.status, data: this.data, error: msg.error, + id: this.id, }, })); } else if (msg.type === "catchup") { @@ -983,12 +987,18 @@ export class CrawlWatcher extends TypedEventTarget { this.data.push(...(msg.data.data ?? 
[])); for (const doc of this.data) { this.dispatchTypedEvent("document", new CustomEvent("document", { - detail: doc, + detail: { + ...doc, + id: this.id, + }, })); } } else if (msg.type === "document") { this.dispatchTypedEvent("document", new CustomEvent("document", { - detail: msg.data, + detail: { + ...msg.data, + id: this.id, + }, })); } } @@ -1015,6 +1025,7 @@ export class CrawlWatcher extends TypedEventTarget { status: this.status, data: this.data, error: "WebSocket error", + id: this.id, }, })); }).bind(this); diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 31d68095..8c5d1b44 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.6.4" +__version__ = "1.6.5" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 45ed27d8..7ac2d2dc 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -704,15 +704,15 @@ class CrawlWatcher: async def _handle_message(self, msg: Dict[str, Any]): if msg['type'] == 'done': self.status = 'completed' - self.dispatch_event('done', {'status': self.status, 'data': self.data}) + self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id}) elif msg['type'] == 'error': self.status = 'failed' - self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error']}) + self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id}) elif msg['type'] == 'catchup': self.status = msg['data']['status'] self.data.extend(msg['data'].get('data', [])) for doc in self.data: - self.dispatch_event('document', doc) + self.dispatch_event('document', {'data': doc, 'id': self.id}) elif msg['type'] == 'document': self.data.append(msg['data']) - self.dispatch_event('document', msg['data']) + self.dispatch_event('document', {'data': msg['data'], 'id': self.id}) From 94267ff406d005351ef2b4aadc70b09bec687ac3 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Wed, 18 Dec 2024 17:31:45 -0500 Subject: [PATCH 44/65] Create o1_web_extractor.py --- examples/o1_web_extractor/o1_web_extractor.py | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 examples/o1_web_extractor/o1_web_extractor.py diff --git a/examples/o1_web_extractor/o1_web_extractor.py b/examples/o1_web_extractor/o1_web_extractor.py new file mode 100644 index 00000000..34857ff9 --- /dev/null +++ b/examples/o1_web_extractor/o1_web_extractor.py @@ -0,0 +1,147 @@ +import os +import json +import requests +from dotenv import load_dotenv +from openai import OpenAI +from serpapi import GoogleSearch + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Initialize clients +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") + +def search_google(query): + """Search Google using SerpAPI and return top results.""" + print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}") + search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")}) + return search.get_dict().get("organic_results", []) + +def 
select_urls_with_o1(company, objective, serp_results): + """ + Use O1 to select the most relevant URLs from SERP results for the given company and objective. + Returns a JSON object with a "selected_urls" property that is an array of strings. + """ + try: + # Prepare the data for O1 + serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")} + for r in serp_results if r.get("link")] + + response = client.chat.completions.create( + model="o1-2024-12-17", + messages=[ + { + "role": "developer", + "content": "You select URLs from the SERP results relevant to the company and objective." + }, + { + "role": "user", + "content": ( + f"Company: {company}\n" + f"Objective: {objective}\n" + f"SERP Results: {json.dumps(serp_data)}\n\n" + "Return a JSON object with a property 'selected_urls' that contains an array " + "of URLs most likely to help meet the objective. Add a /* to the end of the URL if you think it should search all of the pages in the site. Do not return any social media links. For example: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}" + ) + } + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "selected_urls_object", + "schema": { + "type": "object", + "properties": { + "selected_urls": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["selected_urls"], + "additionalProperties": False + } + } + } + ) + + # The response is guaranteed to follow the specified JSON schema + result = json.loads(response.choices[0].message.content) + urls = result.get("selected_urls", []) + return urls + + except Exception as e: + print(f"{Colors.RED}Error selecting URLs with O1: {e}{Colors.RESET}") + return [] + + + +def extract_company_info(urls, prompt, company, api_key): + """Use requests to call Firecrawl's extract endpoint with selected URLs.""" + print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}") + + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {api_key}' + } + + payload = { + "urls": urls, + "prompt": prompt + " for " + company + } + + try: + response = requests.post( + "https://api.firecrawl.dev/v1/extract", + headers=headers, + json=payload + ) + response.raise_for_status() + data = response.json() + return data + except Exception as e: + print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}") + return None + +def main(): + company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}") + + serp_results = search_google(f"{company}") + if not serp_results: + print(f"{Colors.RED}No search results found.{Colors.RESET}") + return + + # Ask O1 to select URLs + selected_urls = select_urls_with_o1(company, objective, serp_results) + + if not selected_urls: + print(f"{Colors.RED}O1 did not return any URLs.{Colors.RESET}") + return + + print(f"{Colors.CYAN}Selected URLs for extraction by O1:{Colors.RESET}") + for url in selected_urls: + print(f"- {url}") + + data = extract_company_info(selected_urls, objective, company, firecrawl_api_key) + + if data and data.get('success') and data.get('data'): + print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}") + print(json.dumps(data['data'], indent=2)) + else: + print(f"{Colors.RED}Failed to extract the requested information. 
Try refining your prompt or choosing a different company.{Colors.RESET}") + +if __name__ == "__main__": + main() From a759a7ab7a3de7dd579658b27ed00d909fb78541 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 18 Dec 2024 21:45:06 -0300 Subject: [PATCH 45/65] Nick: small improvements --- apps/api/src/controllers/v1/extract.ts | 2 +- apps/api/src/lib/LLM-extraction/index.ts | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index c0e06a2d..a6f50fae 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -263,7 +263,7 @@ export async function extractController( { mode: "llm", systemPrompt: - "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided. Here are the urls the user provided of which he wants to extract information from: " + + "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " + links.join(", "), prompt: req.body.prompt, schema: req.body.schema, diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index 22e2bd04..3a98ffc9 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -67,12 +67,11 @@ export async function generateCompletions( export async function generateBasicCompletion(prompt: string) { const openai = new OpenAI(); - const model = process.env.MODEL_NAME ?? "gpt-4o-mini"; + const model = "gpt-4o"; const completion = await openai.chat.completions.create({ model, messages: [{ role: "user", content: prompt }], }); - return completion.choices[0].message.content; } From 2d37dca9dcc1095729af6d2f340a0dc1f0a43426 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 18 Dec 2024 22:10:41 -0300 Subject: [PATCH 46/65] Nick: introduced system prompt to /extract --- apps/api/src/controllers/v1/extract.ts | 1 + apps/api/src/controllers/v1/types.ts | 1 + apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index a6f50fae..e7462ca2 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -263,6 +263,7 @@ export async function extractController( { mode: "llm", systemPrompt: + (req.body.systemPrompt ? `${req.body.systemPrompt}\n` : "") + "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema always if provided. 
Here are the urls the user provided of which he wants to extract information from: " + links.join(", "), prompt: req.body.prompt, diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 5ba36ea7..06605eb9 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -194,6 +194,7 @@ export const extractV1Options = z .array() .max(10, "Maximum of 10 URLs allowed per request while in beta."), prompt: z.string().optional(), + systemPrompt: z.string().optional(), schema: z.any().optional(), limit: z.number().int().positive().finite().safe().optional(), ignoreSitemap: z.boolean().default(false), diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 74dfcb02..1c7f082f 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.4", + "version": "1.9.5", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 020a2293..0d19ab60 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -247,6 +247,7 @@ export interface ExtractParams { schema?: LLMSchema; systemPrompt?: string; allowExternalLinks?: boolean; + includeSubdomains?: boolean; } /** From 05605112bb8c98f2a444c0621ca0dbe809a0b99a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 18 Dec 2024 23:34:07 -0300 Subject: [PATCH 47/65] Update extract.ts --- apps/api/src/controllers/v1/extract.ts | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index e7462ca2..1188bafb 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -231,20 +231,7 @@ export async function extractController( return doc; } catch (e) { logger.error(`Error in extractController: ${e}`); - if ( - e instanceof Error && - (e.message.startsWith("Job wait") || e.message === "timeout") - ) { - throw { - status: 408, - error: "Request timed out", - }; - } else { - throw { - status: 500, - error: `(Internal server error) - ${e && e.message ? e.message : e}`, - }; - } + return null; } }); From 066071cd54a90d559e52f5077f80c942ab618ed0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 18 Dec 2024 23:45:43 -0300 Subject: [PATCH 48/65] Update llmExtract.ts --- apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 9b23af2c..759f87e2 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -159,7 +159,7 @@ export async function generateOpenAICompletions( role: "user", content: options.prompt !== undefined - ? `Transform the above content into structured JSON output based on the provided schema if any and the following user request: ${options.prompt}` + ? `Transform the above content into structured JSON output based on the provided schema if any and the following user request: ${options.prompt}. 
If schema is provided, strictly follow it.` : "Transform the above content into structured JSON output based on the provided schema if any.", }, ], From cf2ec7713166d6f9a7b6c7218e125e82d43aba40 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 19 Dec 2024 08:32:10 -0300 Subject: [PATCH 49/65] fixed title extra info --- apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts index c67f9cbd..66cf30cc 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts @@ -40,7 +40,7 @@ export function extractMetadata( const soup = load(html); try { - title = soup("title").text() || undefined; + title = soup("title").first().text().trim() || undefined; description = soup('meta[name="description"]').attr("content") || undefined; // Assuming the language is part of the URL as per the regex pattern From 6002bf322872f1ad849bbecc0c26636e3d22b10f Mon Sep 17 00:00:00 2001 From: Thomas Kosmas Date: Thu, 19 Dec 2024 14:52:43 +0200 Subject: [PATCH 50/65] feat: dynamically import WebSocket module with error handling --- apps/js-sdk/firecrawl/src/index.ts | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 0d19ab60..7eef05f8 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,7 +1,24 @@ import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios"; import type * as zt from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; -import { WebSocket } from "isows"; + +import type { WebSocket as IsowsWebSocket } from 'isows'; +/** + * Dynamically imports the WebSocket class from 'isows'. + * If the import fails, WebSocket is set to null. + * This approach is used because some environments, such as Firebase Functions, + * might not support WebSocket natively. + */ +const WebSocket: typeof IsowsWebSocket | null = await (async () => { + try { + const module = await import('isows'); + return module.WebSocket; + } catch (error) { + console.error("Failed to load 'isows' module:", error); + return null; + } +})(); + import { TypedEventTarget } from "typescript-event-target"; /** @@ -938,6 +955,8 @@ export class CrawlWatcher extends TypedEventTarget { constructor(id: string, app: FirecrawlApp) { super(); + if(!WebSocket) + throw new FirecrawlError("WebSocket module failed to load. 
Your system might not support WebSocket.", 500); this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey); this.status = "scraping"; this.data = []; From 071b9a01c35613eab2a3905343199bf9f065d569 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 19 Dec 2024 18:22:54 +0100 Subject: [PATCH 51/65] fix(scrapeURL/fire-engine): pass geolocation --- apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 14abf9a9..d753465d 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -144,7 +144,7 @@ export async function scrapeURLWithFireEngineChromeCDP( } : {}), priority: meta.internalOptions.priority, - geolocation: meta.options.geolocation, + geolocation: meta.options.geolocation ?? meta.options.location, mobile: meta.options.mobile, timeout, // TODO: better timeout logic disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, @@ -227,7 +227,7 @@ export async function scrapeURLWithFireEnginePlaywright( screenshot: meta.options.formats.includes("screenshot"), fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"), wait: meta.options.waitFor, - geolocation: meta.options.geolocation, + geolocation: meta.options.geolocation ?? meta.options.location, timeout, }; @@ -286,7 +286,7 @@ export async function scrapeURLWithFireEngineTLSClient( priority: meta.internalOptions.priority, atsv: meta.internalOptions.atsv, - geolocation: meta.options.geolocation, + geolocation: meta.options.geolocation ?? meta.options.location, disableJsDom: meta.internalOptions.v0DisableJsDom, timeout, From c8cd0148dd86e8903a3b8cf16b87841262d3c1e6 Mon Sep 17 00:00:00 2001 From: Thomas Kosmas Date: Thu, 19 Dec 2024 20:39:30 +0200 Subject: [PATCH 52/65] refactor: remove error logging for 'isows' module import in WebSocket initialization --- apps/js-sdk/firecrawl/src/index.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 7eef05f8..9e3a849f 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -14,7 +14,6 @@ const WebSocket: typeof IsowsWebSocket | null = await (async () => { const module = await import('isows'); return module.WebSocket; } catch (error) { - console.error("Failed to load 'isows' module:", error); return null; } })(); From 4fddc86e66143c20adaa5ff6422c24d17c0a08d1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 19 Dec 2024 16:09:08 -0300 Subject: [PATCH 53/65] Update package.json --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 1c7f082f..a6ed595e 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.5", + "version": "1.9.6", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 525a71d789bdfece2a21a40024e19f9cea7fcefb Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 19 Dec 2024 16:10:42 -0300 Subject: [PATCH 54/65] Update __init__.py --- apps/python-sdk/firecrawl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 8c5d1b44..19a33d17 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.6.5" +__version__ = "1.6.7" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From 8063474c85bff7e91ee100cf54b2d809cddecb98 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 19 Dec 2024 16:14:17 -0300 Subject: [PATCH 55/65] Update __init__.py --- apps/python-sdk/firecrawl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 19a33d17..5f592c2c 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.6.7" +__version__ = "1.6.8" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From 3e60f175bbf3fa2f7510d1544d311f44ccadba9c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 19 Dec 2024 16:14:49 -0300 Subject: [PATCH 56/65] Nick: prompt should be optional on /extract sdks --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 2 +- apps/python-sdk/firecrawl/firecrawl.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index a6ed595e..1296aedb 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.6", + "version": "1.9.7", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 08f7b183..2772466c 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -243,7 +243,7 @@ export interface MapResponse { * Defines options for extracting information from URLs. */ export interface ExtractParams { - prompt: string; + prompt?: string; schema?: LLMSchema; systemPrompt?: string; allowExternalLinks?: boolean; diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 7ac2d2dc..e4ac2726 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -26,7 +26,7 @@ class FirecrawlApp: """ Parameters for the extract operation. 
""" - prompt: str + prompt: Optional[str] = None schema_: Optional[Any] = pydantic.Field(None, alias='schema') system_prompt: Optional[str] = None allow_external_links: Optional[bool] = False From 63bbeadbfcdb26c7e43fbe99eb92a1b251f94a9b Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 20 Dec 2024 08:05:13 -0300 Subject: [PATCH 57/65] Added try catch to message handler --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 1296aedb..3334abe6 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.7", + "version": "1.9.8", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 2772466c..43b77825 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1009,14 +1009,21 @@ export class CrawlWatcher extends TypedEventTarget { this.ws.close(); return; } - - const msg = JSON.parse(ev.data) as Message; - messageHandler(msg); + try { + const msg = JSON.parse(ev.data) as Message; + messageHandler(msg); + } catch (error) { + console.error("Error on message", error); + } }).bind(this); this.ws.onclose = ((ev: CloseEvent) => { - const msg = JSON.parse(ev.reason) as Message; - messageHandler(msg); + try { + const msg = JSON.parse(ev.reason) as Message; + messageHandler(msg); + } catch (error) { + console.error("Error on close", error); + } }).bind(this); this.ws.onerror = ((_: Event) => { From 62221522497ab9bb7890dc76de59af31f04ba6b5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 20 Dec 2024 15:44:17 -0300 Subject: [PATCH 58/65] Nick: credit usage endpoint --- apps/api/src/controllers/v1/credit-usage.ts | 45 +++++++++++++++++++++ apps/api/src/routes/v1.ts | 8 ++++ apps/api/src/services/rate-limiter.ts | 2 +- 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 apps/api/src/controllers/v1/credit-usage.ts diff --git a/apps/api/src/controllers/v1/credit-usage.ts b/apps/api/src/controllers/v1/credit-usage.ts new file mode 100644 index 00000000..da522c13 --- /dev/null +++ b/apps/api/src/controllers/v1/credit-usage.ts @@ -0,0 +1,45 @@ +import { Request, Response } from "express"; +import { RequestWithAuth } from "./types"; +import { getACUC } from "../auth"; +import { logger } from "../../lib/logger"; + +export async function creditUsageController( + req: RequestWithAuth, + res: Response, +): Promise { + try { + // If we already have the credit usage info from auth, use it + if (req.acuc) { + res.json({ + success: true, + data: { + remaining_credits: req.acuc.remaining_credits, + }, + }); + return; + } + + // Otherwise fetch fresh data + const chunk = await getACUC(req.auth.team_id); + if (!chunk) { + res.status(404).json({ + success: false, + error: "Could not find credit usage information", + }); + return; + } + + res.json({ + success: true, + data: { + remaining_credits: chunk.remaining_credits, + }, + }); + } catch (error) { + logger.error("Error in credit usage controller:", error); + res.status(500).json({ + success: false, + error: "Internal server error while fetching credit usage", + }); + } +} diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts 
index f09573d9..76427114 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -31,6 +31,7 @@ import { extractController } from "../controllers/v1/extract"; // import { keyAuthController } from "../../src/controllers/v1/keyAuth"; // import { livenessController } from "../controllers/v1/liveness"; // import { readinessController } from "../controllers/v1/readiness"; +import { creditUsageController } from "../controllers/v1/credit-usage"; function checkCreditsMiddleware( minimum?: number, @@ -224,3 +225,10 @@ v1Router.delete( // Health/Probe routes // v1Router.get("/health/liveness", livenessController); // v1Router.get("/health/readiness", readinessController); + +v1Router.get( + "/team/credit-usage", + authMiddleware(RateLimiterMode.CrawlStatus), + wrap(creditUsageController), +); + diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 21025589..d88ff7ad 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -86,7 +86,7 @@ const RATE_LIMITS = { testSuite: { free: 10000, default: 10000, - }, + } }; export const redisRateLimitClient = new Redis( From ba95df96b1fdcfdf1530d1cdd6415058f86b38a3 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 20 Dec 2024 15:45:44 -0300 Subject: [PATCH 59/65] Update rate-limiter.ts --- apps/api/src/services/rate-limiter.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index d88ff7ad..21025589 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -86,7 +86,7 @@ const RATE_LIMITS = { testSuite: { free: 10000, default: 10000, - } + }, }; export const redisRateLimitClient = new Redis( From d1f3e26f9ed8eb099d3076c137c66334d3619efd Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 20 Dec 2024 18:09:49 -0300 Subject: [PATCH 60/65] Nick: blocklist string --- .../__tests__/e2e_full_withAuth/index.test.ts | 9 ++-- .../src/__tests__/e2e_noAuth/index.test.ts | 13 ++--- .../__tests__/e2e_v1_withAuth/index.test.ts | 9 ++-- apps/api/src/controllers/v0/crawl.ts | 4 +- apps/api/src/controllers/v0/crawlPreview.ts | 4 +- apps/api/src/controllers/v0/scrape.ts | 20 +++---- .../v1/__tests__/urlValidation.test.ts | 9 ++-- apps/api/src/controllers/v1/extract.ts | 14 +++-- apps/api/src/controllers/v1/types.ts | 6 +-- apps/api/src/lib/strings.ts | 2 + apps/api/src/main/runWebScraper.ts | 4 +- apps/api/src/routes/v1.ts | 5 +- .../src/services/billing/credit_billing.ts | 5 +- .../__tests__/v1/e2e_withAuth/index.test.ts | 6 --- .../firecrawl/__tests__/e2e_withAuth/test.py | 24 ++++----- .../__tests__/v1/e2e_withAuth/test.py | 36 ++++++------- apps/rust-sdk/tests/e2e_with_auth.rs | 53 ++++++++++--------- 17 files changed, 105 insertions(+), 118 deletions(-) create mode 100644 apps/api/src/lib/strings.ts diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts index 45b3c31e..40686c45 100644 --- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts @@ -1,6 +1,7 @@ import request from "supertest"; import dotenv from "dotenv"; import { v4 as uuidv4 } from "uuid"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; dotenv.config(); @@ -58,9 +59,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - 
expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - ); + expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE); }); // tested on rate limit test @@ -480,9 +479,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - ); + expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE); }); it.concurrent( diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index e30352a5..9d5dc554 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -1,5 +1,6 @@ import request from "supertest"; import dotenv from "dotenv"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; const fs = require("fs"); const path = require("path"); @@ -61,9 +62,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - ); + expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE); }); it("should return a successful response", async () => { @@ -88,9 +87,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - ); + expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE); }); it("should return a successful response", async () => { @@ -119,9 +116,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - ); + expect(response.body.error).toContain(BLOCKLISTED_URL_MESSAGE); }); it("should return a successful response", async () => { diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 35ee2d89..39e0aa85 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -4,6 +4,7 @@ import { ScrapeRequestInput, ScrapeResponseRequestTest, } from "../../controllers/v1/types"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; configDotenv(); const TEST_URL = "http://127.0.0.1:3002"; @@ -57,9 +58,7 @@ describe("E2E Tests for v1 API Routes", () => { .send(scrapeRequest); expect(response.statusCode).toBe(403); - expect(response.body.error).toBe( - "URL is blocked. 
Firecrawl currently does not support social media scraping due to policy restrictions.", - ); + expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE); }); it.concurrent( @@ -756,9 +755,7 @@ describe("E2E Tests for v1 API Routes", () => { .send(scrapeRequest); expect(response.statusCode).toBe(403); - expect(response.body.error).toBe( - "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.", - ); + expect(response.body.error).toBe(BLOCKLISTED_URL_MESSAGE); }); it.concurrent( diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index 36b8309f..ceeaa436 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -29,6 +29,7 @@ import * as Sentry from "@sentry/node"; import { getJobPriority } from "../../lib/job-priority"; import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types"; import { ZodError } from "zod"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; export async function crawlController(req: Request, res: Response) { try { @@ -112,8 +113,7 @@ export async function crawlController(req: Request, res: Response) { if (isUrlBlocked(url)) { return res.status(403).json({ - error: - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + error: BLOCKLISTED_URL_MESSAGE, }); } diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index 405e49c2..f9462c3d 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -15,6 +15,7 @@ import { addScrapeJob } from "../../../src/services/queue-jobs"; import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import * as Sentry from "@sentry/node"; import { fromLegacyScrapeOptions } from "../v1/types"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; export async function crawlPreviewController(req: Request, res: Response) { try { @@ -42,8 +43,7 @@ export async function crawlPreviewController(req: Request, res: Response) { if (isUrlBlocked(url)) { return res.status(403).json({ - error: - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + error: BLOCKLISTED_URL_MESSAGE, }); } diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 2d8acf5f..05bf364b 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -29,6 +29,7 @@ import { getJobPriority } from "../../lib/job-priority"; import { fromLegacyScrapeOptions } from "../v1/types"; import { ZodError } from "zod"; import { Document as V0Document } from "./../../lib/entities"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; export async function scrapeHelper( jobId: string, @@ -53,8 +54,7 @@ export async function scrapeHelper( if (isUrlBlocked(url)) { return { success: false, - error: - "Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it.", + error: BLOCKLISTED_URL_MESSAGE, returnCode: 403, }; } @@ -265,13 +265,15 @@ export async function scrapeController(req: Request, res: Response) { } if (creditsToBeBilled > 0) { // billing for doc done on queue end, bill only for llm extraction - billTeam(team_id, chunk?.sub_id, creditsToBeBilled, logger).catch((error) => { - logger.error( - `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`, - { error } - ); - // Optionally, you could notify an admin or add to a retry queue here - }); + billTeam(team_id, chunk?.sub_id, creditsToBeBilled, logger).catch( + (error) => { + logger.error( + `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`, + { error }, + ); + // Optionally, you could notify an admin or add to a retry queue here + }, + ); } } diff --git a/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts index b455e5ab..afa44e58 100644 --- a/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts +++ b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts @@ -1,4 +1,5 @@ import { url } from "../types"; +import { BLOCKLISTED_URL_MESSAGE } from "../../../lib/strings"; describe("URL Schema Validation", () => { beforeEach(() => { @@ -31,7 +32,7 @@ describe("URL Schema Validation", () => { it("should reject blocked URLs", () => { expect(() => url.parse("https://facebook.com")).toThrow( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + BLOCKLISTED_URL_MESSAGE, ); }); @@ -47,16 +48,16 @@ describe("URL Schema Validation", () => { it("should handle URLs with subdomains that are blocked", () => { expect(() => url.parse("https://sub.facebook.com")).toThrow( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + BLOCKLISTED_URL_MESSAGE, ); }); it("should handle URLs with paths that are blocked", () => { expect(() => url.parse("http://facebook.com/path")).toThrow( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + BLOCKLISTED_URL_MESSAGE, ); expect(() => url.parse("https://facebook.com/another/path")).toThrow( - "Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it.", + BLOCKLISTED_URL_MESSAGE, ); }); diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index 1188bafb..58e75751 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -92,19 +92,18 @@ export async function extractController( let mappedLinks = mapResults.mapResults as MapDocument[]; // Remove duplicates between mapResults.links and mappedLinks - const allUrls = [...mappedLinks.map(m => m.url), ...mapResults.links]; + const allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links]; const uniqueUrls = removeDuplicateUrls(allUrls); - + // Only add URLs from mapResults.links that aren't already in mappedLinks - const existingUrls = new Set(mappedLinks.map(m => m.url)); - const newUrls = uniqueUrls.filter(url => !existingUrls.has(url)); - + const existingUrls = new Set(mappedLinks.map((m) => m.url)); + const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url)); + mappedLinks = [ ...mappedLinks, - ...newUrls.map(url => ({ url, title: "", description: "" })) + ...newUrls.map((url) => ({ url, title: "", description: "" })), ]; - if (mappedLinks.length === 0) { mappedLinks = [{ url: baseUrl, title: "", description: "" }]; } @@ -117,7 +116,6 @@ export async function extractController( `url: ${x.url}, title: ${x.title}, description: ${x.description}`, ); - if (req.body.prompt) { let searchQuery = req.body.prompt && allowExternalLinks diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 06605eb9..114c115e 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -11,6 +11,7 @@ import { Document as V0Document, } from "../../lib/entities"; import { InternalOptions } from "../../scraper/scrapeURL"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; export type Format = | "markdown" @@ -44,10 +45,7 @@ export const url = z.preprocess( return false; } }, "Invalid URL") - .refine( - (x) => !isUrlBlocked(x as string), - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - ), + .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE), ); const strictMessage = diff --git a/apps/api/src/lib/strings.ts b/apps/api/src/lib/strings.ts new file mode 100644 index 00000000..4e278d2b --- /dev/null +++ b/apps/api/src/lib/strings.ts @@ -0,0 +1,2 @@ +export const BLOCKLISTED_URL_MESSAGE = + "This website is no longer supported, please reach out to help@firecrawl.com for more info on how to activate it on your account."; diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index c50ab9c9..0f3b8524 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -71,7 +71,7 @@ export async function runWebScraper({ module: "runWebscraper", scrapeId: bull_job_id, jobId: bull_job_id, - }) + }); const tries = is_crawl ? 
3 : 1; let response: ScrapeUrlResponse | undefined = undefined; @@ -176,7 +176,7 @@ export async function runWebScraper({ billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => { logger.error( `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`, - { error } + { error }, ); // Optionally, you could notify an admin or add to a retry queue here }); diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 76427114..1ee191ef 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -32,6 +32,7 @@ import { extractController } from "../controllers/v1/extract"; // import { livenessController } from "../controllers/v1/liveness"; // import { readinessController } from "../controllers/v1/readiness"; import { creditUsageController } from "../controllers/v1/credit-usage"; +import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; function checkCreditsMiddleware( minimum?: number, @@ -123,8 +124,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { if (!res.headersSent) { return res.status(403).json({ success: false, - error: - "URL is blocked intentionally. Firecrawl currently does not support scraping this site due to policy restrictions.", + error: BLOCKLISTED_URL_MESSAGE, }); } } @@ -231,4 +231,3 @@ v1Router.get( authMiddleware(RateLimiterMode.CrawlStatus), wrap(creditUsageController), ); - diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index c2671034..5eb541fd 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -44,7 +44,10 @@ export async function supaBillTeam( if (team_id === "preview") { return { success: true, message: "Preview team, no credits used" }; } - _logger.info(`Billing team ${team_id} for ${credits} credits`, { team_id, credits }); + _logger.info(`Billing team ${team_id} for ${credits} credits`, { + team_id, + credits, + }); const { data, error } = await supabase_service.rpc("bill_team", { _team_id: team_id, diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index dea55846..81b0a523 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -159,12 +159,6 @@ describe('FirecrawlApp E2E Tests', () => { await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); }); - test.concurrent('should throw error for blocklisted URL on crawl', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const blocklistedUrl = "https://twitter.com/fake-test"; - await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. 
Firecrawl currently does not support social media scraping due to policy restrictions."); - }); - test.concurrent('should return successful response for crawl and wait for completion', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.crawlUrl('https://roastmywebsite.ai', {}, 30) as CrawlStatusResponse; diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py index 8945d74d..50d5306f 100644 --- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py @@ -29,12 +29,12 @@ def test_scrape_url_invalid_api_key(): invalid_app.scrape_url('https://firecrawl.dev') assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) -def test_blocklisted_url(): - blocklisted_url = "https://facebook.com/fake-test" - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') - with pytest.raises(Exception) as excinfo: - app.scrape_url(blocklisted_url) - assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) +# def test_blocklisted_url(): +# blocklisted_url = "https://facebook.com/fake-test" +# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') +# with pytest.raises(Exception) as excinfo: +# app.scrape_url(blocklisted_url) +# assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) def test_successful_response_with_valid_preview_token(): app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0') @@ -90,12 +90,12 @@ def test_crawl_url_invalid_api_key(): invalid_app.crawl_url('https://firecrawl.dev') assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) -def test_should_return_error_for_blocklisted_url(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') - blocklisted_url = "https://twitter.com/fake-test" - with pytest.raises(Exception) as excinfo: - app.crawl_url(blocklisted_url) - assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) +# def test_should_return_error_for_blocklisted_url(): +# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') +# blocklisted_url = "https://twitter.com/fake-test" +# with pytest.raises(Exception) as excinfo: +# app.crawl_url(blocklisted_url) +# assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." 
in str(excinfo.value) def test_crawl_url_wait_for_completion_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py index 12fa10ce..0ada6c1d 100644 --- a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py @@ -30,12 +30,12 @@ def test_scrape_url_invalid_api_key(): invalid_app.scrape_url('https://firecrawl.dev') assert "Unauthorized: Invalid token" in str(excinfo.value) -def test_blocklisted_url(): - blocklisted_url = "https://facebook.com/fake-test" - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) - with pytest.raises(Exception) as excinfo: - app.scrape_url(blocklisted_url) - assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) +# def test_blocklisted_url(): +# blocklisted_url = "https://facebook.com/fake-test" +# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) +# with pytest.raises(Exception) as excinfo: +# app.scrape_url(blocklisted_url) +# assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) def test_successful_response_with_valid_preview_token(): app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") @@ -136,12 +136,12 @@ def test_crawl_url_invalid_api_key(): invalid_app.crawl_url('https://firecrawl.dev') assert "Unauthorized: Invalid token" in str(excinfo.value) -def test_should_return_error_for_blocklisted_url(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) - blocklisted_url = "https://twitter.com/fake-test" - with pytest.raises(Exception) as excinfo: - app.crawl_url(blocklisted_url) - assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) +# def test_should_return_error_for_blocklisted_url(): +# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) +# blocklisted_url = "https://twitter.com/fake-test" +# with pytest.raises(Exception) as excinfo: +# app.crawl_url(blocklisted_url) +# assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) def test_crawl_url_wait_for_completion_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) @@ -296,12 +296,12 @@ def test_invalid_api_key_on_map(): invalid_app.map_url('https://roastmywebsite.ai') assert "Unauthorized: Invalid token" in str(excinfo.value) -def test_blocklisted_url_on_map(): - app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) - blocklisted_url = "https://facebook.com/fake-test" - with pytest.raises(Exception) as excinfo: - app.map_url(blocklisted_url) - assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) +# def test_blocklisted_url_on_map(): +# app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) +# blocklisted_url = "https://facebook.com/fake-test" +# with pytest.raises(Exception) as excinfo: +# app.map_url(blocklisted_url) +# assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." 
in str(excinfo.value) def test_successful_response_with_valid_preview_token_on_map(): app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL) diff --git a/apps/rust-sdk/tests/e2e_with_auth.rs b/apps/rust-sdk/tests/e2e_with_auth.rs index 75568f92..92b202cb 100644 --- a/apps/rust-sdk/tests/e2e_with_auth.rs +++ b/apps/rust-sdk/tests/e2e_with_auth.rs @@ -5,20 +5,20 @@ use firecrawl::FirecrawlApp; use serde_json::json; use std::env; -#[tokio::test] -async fn test_blocklisted_url() { - dotenv().ok(); - let api_url = env::var("API_URL").unwrap(); - let api_key = env::var("TEST_API_KEY").ok(); - let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap(); - let blocklisted_url = "https://facebook.com/fake-test"; - let result = app.scrape_url(blocklisted_url, None).await; +// #[tokio::test] +// async fn test_blocklisted_url() { +// dotenv().ok(); +// let api_url = env::var("API_URL").unwrap(); +// let api_key = env::var("TEST_API_KEY").ok(); +// let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap(); +// let blocklisted_url = "https://facebook.com/fake-test"; +// let result = app.scrape_url(blocklisted_url, None).await; - assert_matches!( - result, - Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions") - ); -} +// assert_matches!( +// result, +// Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions") +// ); +// } #[tokio::test] async fn test_successful_response_with_valid_preview_token() { @@ -103,20 +103,21 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explici .contains("We present spectrophotometric observations of the Broad Line Radio Galaxy")); } -#[tokio::test] -async fn test_should_return_error_for_blocklisted_url() { - dotenv().ok(); - let api_url = env::var("API_URL").unwrap(); - let api_key = env::var("TEST_API_KEY").ok(); - let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap(); - let blocklisted_url = "https://twitter.com/fake-test"; - let result = app.crawl_url(blocklisted_url, None).await; - assert_matches!( - result, - Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions.") - ); -} +// #[tokio::test] +// async fn test_should_return_error_for_blocklisted_url() { +// dotenv().ok(); +// let api_url = env::var("API_URL").unwrap(); +// let api_key = env::var("TEST_API_KEY").ok(); +// let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap(); +// let blocklisted_url = "https://twitter.com/fake-test"; +// let result = app.crawl_url(blocklisted_url, None).await; + +// assert_matches!( +// result, +// Err(e) if e.to_string().contains("Firecrawl currently does not support social media scraping due to policy restrictions.") +// ); +// } #[tokio::test] async fn test_llm_extraction() { From f043f5fd61d229f08dbba3f16079061a0f2cecbf Mon Sep 17 00:00:00 2001 From: Thomas Kosmas Date: Sat, 21 Dec 2024 02:27:22 +0200 Subject: [PATCH 61/65] Enhance error handling in E2E tests and introduce CrawlWatcher tests - Updated error messages in E2E tests to provide clearer feedback for blocked URLs and invalid API keys. - Added new test suite for CrawlWatcher to ensure proper instantiation and error handling when WebSocket is unavailable. - Improved test conditions for URL scraping and crawling to reflect updated error responses. 
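A minimal consumer-side sketch of the behavior these tests cover, assuming the published package re-exports `CrawlWatcher` and `FirecrawlError` the same way `src/index.ts` does (the `watchOrNull` helper and the polling fallback are illustrative assumptions, not part of the SDK):

```ts
import FirecrawlApp, { CrawlWatcher, FirecrawlError } from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

// Hypothetical helper: CrawlWatcher throws a FirecrawlError when the isows
// WebSocket module cannot be loaded, so environments without WebSocket
// support (e.g. some serverless runtimes) can catch it and fall back to
// polling the crawl status endpoint instead of streaming updates.
function watchOrNull(crawlId: string): CrawlWatcher | null {
  try {
    return new CrawlWatcher(crawlId, app);
  } catch (err) {
    if (err instanceof FirecrawlError) return null;
    throw err;
  }
}
```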
--- .../__tests__/e2e_v1_withAuth/index.test.ts | 4 +-- .../src/__tests__/CrawlWatcher.test.ts | 35 +++++++++++++++++++ .../src/__tests__/e2e_withAuth/index.test.ts | 8 ++--- .../__tests__/v1/e2e_withAuth/index.test.ts | 12 +++---- 4 files changed, 47 insertions(+), 12 deletions(-) create mode 100644 apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 35ee2d89..f5fc5d5d 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -58,7 +58,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.statusCode).toBe(403); expect(response.body.error).toBe( - "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.", + "Request failed with status code 403. Error: URL is blocked intentionally. Firecrawl currently does not support scraping this site due to policy restrictions. ", ); }); @@ -757,7 +757,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.statusCode).toBe(403); expect(response.body.error).toBe( - "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.", + "Request failed with status code 403. Error: URL is blocked intentionally. Firecrawl currently does not support scraping this site due to policy restrictions. ", ); }); diff --git a/apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts b/apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts new file mode 100644 index 00000000..7f53828d --- /dev/null +++ b/apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts @@ -0,0 +1,35 @@ +import { jest } from '@jest/globals'; + +describe('CrawlWatcher', () => { + const mockApiUrl = 'https://api.firecrawl.dev'; + const mockApiKey = 'test-api-key'; + + beforeEach(() => { + jest.resetModules(); + }); + + test('should create a CrawlWatcher instance successfully when isows is available', async () => { + await jest.unstable_mockModule('isows', () => ({ + WebSocket: jest.fn(), + })); + + const { default: FirecrawlApp, CrawlWatcher } = await import('../index'); + const app = new FirecrawlApp({ apiKey: mockApiKey, apiUrl: mockApiUrl }); + + const watcher = new CrawlWatcher('test-id', app); + expect(watcher).toBeInstanceOf(CrawlWatcher); + }); + + test('should throw when WebSocket is not available (isows import fails)', async () => { + await jest.unstable_mockModule('isows', () => { + throw new Error('Module not found'); + }); + + const { default: FirecrawlApp, CrawlWatcher, FirecrawlError } = await import('../index'); + const app = new FirecrawlApp({ apiKey: mockApiKey, apiUrl: mockApiUrl }); + + expect(() => { + new CrawlWatcher('test-id', app); + }).toThrow(FirecrawlError); + }); +}); diff --git a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts index 7d107afe..6db51775 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts @@ -32,7 +32,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { }); await expect( invalidApp.scrapeUrl("https://roastmywebsite.ai") - ).rejects.toThrow("Request failed with status code 401"); + ).rejects.toThrow("Unexpected error occurred while trying to scrape URL. 
Status code: 401"); } ); @@ -46,7 +46,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { }); const blocklistedUrl = "https://facebook.com/fake-test"; await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow( - "Request failed with status code 403" + "Unexpected error occurred while trying to scrape URL. Status code: 403" ); } ); @@ -169,7 +169,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { }); const blocklistedUrl = "https://twitter.com/fake-test"; await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow( - "Request failed with status code 403" + "Unexpected error occurred while trying to scrape URL. Status code: 403" ); } ); @@ -242,7 +242,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { const maxChecks = 15; let checks = 0; - while (statusResponse.status === "active" && checks < maxChecks) { + while ((statusResponse.status === "active" || statusResponse.status === "scraping" ) && checks < maxChecks) { await new Promise((resolve) => setTimeout(resolve, 5000)); expect(statusResponse.partial_data).not.toBeNull(); // expect(statusResponse.current).toBeGreaterThanOrEqual(1); diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index dea55846..76dc7f73 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -17,13 +17,13 @@ describe('FirecrawlApp E2E Tests', () => { test.concurrent('should throw error for invalid API key on scrape', async () => { const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 401"); }); test.concurrent('should throw error for blocklisted URL on scrape', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const blocklistedUrl = "https://facebook.com/fake-test"; - await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); + await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 403"); }); test.concurrent('should return successful response with valid preview token', async () => { @@ -61,7 +61,7 @@ describe('FirecrawlApp E2E Tests', () => { 'https://roastmywebsite.ai', { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], headers: { "x-key": "test" }, - includeTags: ['h1'], + // includeTags: ['h1'], excludeTags: ['h2'], onlyMainContent: true, timeout: 30000, @@ -162,7 +162,7 @@ describe('FirecrawlApp E2E Tests', () => { test.concurrent('should throw error for blocklisted URL on crawl', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const blocklistedUrl = "https://twitter.com/fake-test"; - await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."); + await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403. Error: This website is no longer supported, please reach out to help@firecrawl.com for more info on how to activate it on your account. 
"); }); test.concurrent('should return successful response for crawl and wait for completion', async () => { @@ -212,7 +212,7 @@ describe('FirecrawlApp E2E Tests', () => { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], headers: { "x-key": "test" }, - includeTags: ['h1'], + // includeTags: ['h1'], excludeTags: ['h2'], onlyMainContent: true, waitFor: 1000 @@ -334,7 +334,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(statusResponse.data[0].metadata).not.toHaveProperty("error"); } } - }, 60000); // 60 seconds timeout + }, 120000); // 120 seconds timeout test.concurrent('should throw error for invalid API key on map', async () => { const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); From 7366f36e397669fcb4260617707f63aa38ced375 Mon Sep 17 00:00:00 2001 From: RutamBhagat Date: Sat, 21 Dec 2024 07:03:16 -0800 Subject: [PATCH 62/65] docs(CONTRIBUTING.md): Add Docker Compose setup instructions to CONTRIBUTING.md --- CONTRIBUTING.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 773454e5..ce82236d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -111,6 +111,20 @@ curl -X POST http://localhost:3002/v1/crawl \ }' ``` +### Alternative: Using Docker Compose + +For a simpler setup, you can use Docker Compose to run all services: + +1. Prerequisites: Make sure you have Docker and Docker Compose installed +2. Copy the `.env.example` file to `.env` in the `/apps/api/` directory and configure as needed +3. From the root directory, run: + +```bash +docker compose up +``` + +This will start Redis, the API server, and workers automatically in the correct configuration. + ## Tests: The best way to do this is run the test with `npm run test:local-no-auth` if you'd like to run the tests without authentication. From 18ceaf10a5fa162ee66a33967bffd682be1746f8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 23 Dec 2024 18:42:05 -0300 Subject: [PATCH 63/65] Update .gitignore --- apps/api/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/api/.gitignore b/apps/api/.gitignore index d9639687..52345155 100644 --- a/apps/api/.gitignore +++ b/apps/api/.gitignore @@ -9,3 +9,5 @@ dump.rdb .rdb .sentryclirc + +.env.* \ No newline at end of file From b1a5625b2208ea34096096bfdd1685f9879a1d1b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 23 Dec 2024 18:45:51 -0300 Subject: [PATCH 64/65] Revert "Merge pull request #997 from mendableai/feat/sdk-without-ws" This reverts commit 53cda5f81c53d3de35925c610ce083923ca09fbe, reversing changes made to 51f79b55efadc53243a8c22d86bb2d08d878d524. 
--- .../src/__tests__/CrawlWatcher.test.ts | 35 ------------------- .../src/__tests__/e2e_withAuth/index.test.ts | 8 ++--- .../__tests__/v1/e2e_withAuth/index.test.ts | 8 ++--- apps/js-sdk/firecrawl/src/index.ts | 20 +---------- 4 files changed, 9 insertions(+), 62 deletions(-) delete mode 100644 apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts diff --git a/apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts b/apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts deleted file mode 100644 index 7f53828d..00000000 --- a/apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts +++ /dev/null @@ -1,35 +0,0 @@ -import { jest } from '@jest/globals'; - -describe('CrawlWatcher', () => { - const mockApiUrl = 'https://api.firecrawl.dev'; - const mockApiKey = 'test-api-key'; - - beforeEach(() => { - jest.resetModules(); - }); - - test('should create a CrawlWatcher instance successfully when isows is available', async () => { - await jest.unstable_mockModule('isows', () => ({ - WebSocket: jest.fn(), - })); - - const { default: FirecrawlApp, CrawlWatcher } = await import('../index'); - const app = new FirecrawlApp({ apiKey: mockApiKey, apiUrl: mockApiUrl }); - - const watcher = new CrawlWatcher('test-id', app); - expect(watcher).toBeInstanceOf(CrawlWatcher); - }); - - test('should throw when WebSocket is not available (isows import fails)', async () => { - await jest.unstable_mockModule('isows', () => { - throw new Error('Module not found'); - }); - - const { default: FirecrawlApp, CrawlWatcher, FirecrawlError } = await import('../index'); - const app = new FirecrawlApp({ apiKey: mockApiKey, apiUrl: mockApiUrl }); - - expect(() => { - new CrawlWatcher('test-id', app); - }).toThrow(FirecrawlError); - }); -}); diff --git a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts index 6db51775..7d107afe 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts @@ -32,7 +32,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { }); await expect( invalidApp.scrapeUrl("https://roastmywebsite.ai") - ).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 401"); + ).rejects.toThrow("Request failed with status code 401"); } ); @@ -46,7 +46,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { }); const blocklistedUrl = "https://facebook.com/fake-test"; await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow( - "Unexpected error occurred while trying to scrape URL. Status code: 403" + "Request failed with status code 403" ); } ); @@ -169,7 +169,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { }); const blocklistedUrl = "https://twitter.com/fake-test"; await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow( - "Unexpected error occurred while trying to scrape URL. 
Status code: 403" + "Request failed with status code 403" ); } ); @@ -242,7 +242,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { const maxChecks = 15; let checks = 0; - while ((statusResponse.status === "active" || statusResponse.status === "scraping" ) && checks < maxChecks) { + while (statusResponse.status === "active" && checks < maxChecks) { await new Promise((resolve) => setTimeout(resolve, 5000)); expect(statusResponse.partial_data).not.toBeNull(); // expect(statusResponse.current).toBeGreaterThanOrEqual(1); diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index 4f3a9cb2..e5c04209 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -36,7 +36,7 @@ describe('FirecrawlApp E2E Tests', () => { test.concurrent('should throw error for blocklisted URL on scrape', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const blocklistedUrl = "https://facebook.com/fake-test"; - await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 403"); + await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); }); test.concurrent('should return successful response with valid preview token', async () => { @@ -74,7 +74,7 @@ describe('FirecrawlApp E2E Tests', () => { 'https://roastmywebsite.ai', { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], headers: { "x-key": "test" }, - // includeTags: ['h1'], + includeTags: ['h1'], excludeTags: ['h2'], onlyMainContent: true, timeout: 30000, @@ -224,7 +224,7 @@ describe('FirecrawlApp E2E Tests', () => { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], headers: { "x-key": "test" }, - // includeTags: ['h1'], + includeTags: ['h1'], excludeTags: ['h2'], onlyMainContent: true, waitFor: 1000 @@ -346,7 +346,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(statusResponse.data[0].metadata).not.toHaveProperty("error"); } } - }, 120000); // 120 seconds timeout + }, 60000); // 60 seconds timeout test.concurrent('should throw error for invalid API key on map', async () => { if (API_URL.includes('api.firecrawl.dev')) { diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index feb69f03..d3ae630b 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,23 +1,7 @@ import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios"; import type * as zt from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; - -import type { WebSocket as IsowsWebSocket } from 'isows'; -/** - * Dynamically imports the WebSocket class from 'isows'. - * If the import fails, WebSocket is set to null. - * This approach is used because some environments, such as Firebase Functions, - * might not support WebSocket natively. 
- */ -const WebSocket: typeof IsowsWebSocket | null = await (async () => { - try { - const module = await import('isows'); - return module.WebSocket; - } catch (error) { - return null; - } -})(); - +import { WebSocket } from "isows"; import { TypedEventTarget } from "typescript-event-target"; /** @@ -961,8 +945,6 @@ export class CrawlWatcher extends TypedEventTarget { constructor(id: string, app: FirecrawlApp) { super(); - if(!WebSocket) - throw new FirecrawlError("WebSocket module failed to load. Your system might not support WebSocket.", 500); this.id = id; this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey); this.status = "scraping"; From c911aad228ebb76384833e8e95e2a074f9d78030 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 23 Dec 2024 18:48:03 -0300 Subject: [PATCH 65/65] Update package.json --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 3334abe6..8945f3fa 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.8", + "version": "1.10.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts",
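
Note on the revert in [PATCH 64/65]: before the revert, the SDK loaded `isows` dynamically so the package could still be imported in runtimes without WebSocket support (for example, Firebase Functions), with `CrawlWatcher` throwing a `FirecrawlError` only when actually constructed. The sketch below restates that reverted pattern in a self-contained form; the `Watcher` class and its `(url, protocol)` constructor are illustrative stand-ins rather than the SDK's real `CrawlWatcher` API, and a plain `Error` is used in place of `FirecrawlError`.

```typescript
import type { WebSocket as IsowsWebSocket } from "isows";

// Resolve the WebSocket implementation once, at module load time.
// Top-level await requires an ESM build targeting ES2022 or later.
const WebSocket: typeof IsowsWebSocket | null = await (async () => {
  try {
    const mod = await import("isows");
    return mod.WebSocket;
  } catch {
    // Import failed (e.g. no WebSocket support in this runtime): degrade
    // gracefully instead of breaking every consumer at import time.
    return null;
  }
})();

// Illustrative consumer: only the WebSocket-backed feature fails, not the whole SDK.
export class Watcher {
  private ws: IsowsWebSocket;

  constructor(url: string, protocol: string) {
    if (!WebSocket) {
      // The SDK throws FirecrawlError here; a plain Error keeps this sketch standalone.
      throw new Error(
        "WebSocket module failed to load. Your system might not support WebSocket."
      );
    }
    this.ws = new WebSocket(url, protocol);
  }

  close(): void {
    this.ws.close();
  }
}
```

Reverting to the static `import { WebSocket } from "isows"` trades that graceful degradation for a simpler module graph without top-level await, which is the state restored by this patch series.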