From a36c6a4f401d06f8d27b135727bf10edded431d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Tue, 27 May 2025 21:33:44 +0200
Subject: [PATCH] feat(scrapeURL): add unnormalizedSourceURL for url matching
 DX (FIR-2137) (#1601)

* feat(scrapeURL): add unnormalizedSourceURL for url matching DX

* fix(tests): fixc
---
 apps/api/src/__tests__/snips/batch-scrape.test.ts |  8 ++++++++
 apps/api/src/__tests__/snips/scrape.test.ts       |  8 ++++++++
 apps/api/src/controllers/v1/batch-scrape.ts       | 15 +++++++--------
 apps/api/src/controllers/v1/scrape.ts             |  1 +
 apps/api/src/scraper/scrapeURL/index.ts           |  3 ++-
 5 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/apps/api/src/__tests__/snips/batch-scrape.test.ts b/apps/api/src/__tests__/snips/batch-scrape.test.ts
index f3e9e585..8d4c3946 100644
--- a/apps/api/src/__tests__/snips/batch-scrape.test.ts
+++ b/apps/api/src/__tests__/snips/batch-scrape.test.ts
@@ -48,4 +48,12 @@ describe("Batch scrape tests", () => {
       }, 180000);
     });
   }
+
+  it.concurrent("sourceURL stays unnormalized", async () => {
+    const response = await batchScrape({
+      urls: ["https://firecrawl.dev/?pagewanted=all&et_blog"],
+    });
+
+    expect(response.body.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
+  }, 35000);
 });
diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts
index 4703964e..9c2a9e2d 100644
--- a/apps/api/src/__tests__/snips/scrape.test.ts
+++ b/apps/api/src/__tests__/snips/scrape.test.ts
@@ -366,4 +366,12 @@ describe("Scrape tests", () => {
       }, 30000);
     });
   }
+
+  it.concurrent("sourceURL stays unnormalized", async () => {
+    const response = await scrape({
+      url: "https://firecrawl.dev/?pagewanted=all&et_blog",
+    });
+
+    expect(response.metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
+  }, 35000);
 });
diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts
index 326aba5f..d14f8cd7 100644
--- a/apps/api/src/controllers/v1/batch-scrape.ts
+++ b/apps/api/src/controllers/v1/batch-scrape.ts
@@ -22,7 +22,6 @@
 import { getJobPriority } from "../../lib/job-priority";
 import { addScrapeJobs } from "../../services/queue-jobs";
 import { callWebhook } from "../../services/webhook";
 import { logger as _logger } from "../../lib/logger";
-import { CostTracking } from "../../lib/extract/extraction-service";
 import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
@@ -30,6 +29,8 @@ export async function batchScrapeController(
   req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
   res: Response,
 ) {
+  const preNormalizedBody = { ...req.body };
+
   if (req.body?.ignoreInvalidURLs === true) {
     req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body);
   } else {
@@ -46,6 +47,7 @@
   });
 
   let urls = req.body.urls;
+  let unnormalizedURLs = preNormalizedBody.urls;
   let invalidURLs: string[] | undefined = undefined;
 
   if (req.body.ignoreInvalidURLs) {
@@ -53,11 +55,13 @@
 
     let pendingURLs = urls;
     urls = [];
+    unnormalizedURLs = [];
     for (const u of pendingURLs) {
       try {
         const nu = urlSchema.parse(u);
         if (!isUrlBlocked(nu, req.acuc?.flags ?? null)) {
           urls.push(nu);
+          unnormalizedURLs.push(u);
         } else {
           invalidURLs.push(u);
         }
@@ -86,12 +90,6 @@
     await logCrawl(id, req.auth.team_id);
   }
 
-  let { remainingCredits } = req.account!;
-  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
-  if (!useDbAuthentication) {
-    remainingCredits = Infinity;
-  }
-
   const sc: StoredCrawl = req.body.appendToId
     ? ((await getCrawl(req.body.appendToId)) as StoredCrawl)
     : {
@@ -127,7 +125,7 @@
   delete (scrapeOptions as any).urls;
   delete (scrapeOptions as any).appendToId;
 
-  const jobs = urls.map((x) => {
+  const jobs = urls.map((x, i) => {
     return {
       data: {
         url: x,
@@ -142,6 +140,7 @@
         webhook: req.body.webhook,
         internalOptions: {
           saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+          unnormalizedSourceURL: unnormalizedURLs[i],
         },
       },
       opts: {
diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts
index 8da72dae..092d86e8 100644
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@@ -51,6 +51,7 @@ export async function scrapeController(
       internalOptions: {
         teamId: req.auth.team_id,
         saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+        unnormalizedSourceURL: preNormalizedBody.url,
       },
       origin: req.body.origin,
       is_scrape: true,
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index 85254f37..fbc0c53b 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -182,6 +182,7 @@ export type InternalOptions = {
   fromCache?: boolean; // Indicates if the document was retrieved from cache
   abort?: AbortSignal;
   urlInvisibleInCurrentCrawl?: boolean;
+  unnormalizedSourceURL?: string;
   saveScrapeResultToGCS?: boolean; // Passed along to fire-engine
 };
 
@@ -373,7 +374,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
       screenshot: result.result.screenshot,
       actions: result.result.actions,
       metadata: {
-        sourceURL: meta.url,
+        sourceURL: meta.internalOptions.unnormalizedSourceURL ?? meta.url,
         url: result.result.url,
         statusCode: result.result.statusCode,
         error: result.result.error,
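
Taken together, the patch makes `metadata.sourceURL` echo the exact URL string the caller submitted, falling back to the normalized `meta.url` when no unnormalized source exists (e.g. pages discovered during a crawl rather than submitted directly). The DX win is that callers can match batch results back to their inputs by plain string equality instead of re-implementing the server's URL normalization. A minimal client-side sketch of that matching, assuming a simplified document shape — `Doc` and `matchResultsToInputs` are illustrative names, not part of the Firecrawl API:

```ts
// Pared-down stand-in for the API's document shape (sketch only).
type Doc = { metadata: { sourceURL: string }; markdown?: string };

// Key each result by the exact string the caller submitted. Because
// sourceURL is no longer normalized, plain string equality is enough;
// no URL re-parsing or fuzzy matching is needed.
function matchResultsToInputs(
  urls: string[],
  results: Doc[],
): Map<string, Doc | undefined> {
  const bySourceURL = new Map(
    results.map((d) => [d.metadata.sourceURL, d] as const),
  );
  return new Map(urls.map((u) => [u, bySourceURL.get(u)] as const));
}

// Example: query strings like "?pagewanted=all&et_blog" survive the
// round-trip verbatim, so the submitted URL works as a lookup key.
const submitted = ["https://firecrawl.dev/?pagewanted=all&et_blog"];
// matchResultsToInputs(submitted, response.body.data) would map each
// submitted URL to its scraped document, mirroring the new tests above.
```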