feat(scrapeURL): add unnormalizedSourceURL for url matching DX (FIR-2137) (#1601)

* feat(scrapeURL): add unnormalizedSourceURL for url matching DX

* fix(tests): fix
Gergő Móricz 2025-05-27 21:33:44 +02:00 committed by GitHub
parent 474e5a0543
commit a36c6a4f40
5 changed files with 26 additions and 9 deletions
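
The DX point of the change: metadata.sourceURL now echoes the exact string the caller submitted, so responses can be matched back to the caller's own URL list by plain string comparison instead of re-normalizing on the client. A minimal consumer-side sketch of that matching, assuming a batchScrape helper shaped like the one used in the tests below (the declared signature is illustrative only, not the real helper):

// Hypothetical helper signature, mirroring the test helper used in this PR.
declare function batchScrape(body: {
  urls: string[];
}): Promise<{ body: { data: { metadata: { sourceURL: string } }[] } }>;

async function matchDocsToInputs(inputUrls: string[]) {
  const response = await batchScrape({ urls: inputUrls });

  // Because sourceURL is no longer normalized, the original input string
  // can be used directly as the lookup key.
  const byInput = new Map(
    response.body.data.map((doc) => [doc.metadata.sourceURL, doc] as const),
  );

  for (const url of inputUrls) {
    if (!byInput.has(url)) {
      console.warn(`no document returned for ${url}`);
    }
  }

  return byInput;
}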

View File

@@ -48,4 +48,12 @@ describe("Batch scrape tests", () => {
       }, 180000);
     });
   }
+
+  it.concurrent("sourceURL stays unnormalized", async () => {
+    const response = await batchScrape({
+      urls: ["https://firecrawl.dev/?pagewanted=all&et_blog"],
+    });
+
+    expect(response.body.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
+  }, 35000);
 });

View File

@@ -366,4 +366,12 @@ describe("Scrape tests", () => {
       }, 30000);
     });
   }
+
+  it.concurrent("sourceURL stays unnormalized", async () => {
+    const response = await scrape({
+      url: "https://firecrawl.dev/?pagewanted=all&et_blog",
+    });
+
+    expect(response.metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
+  }, 35000);
 });

View File

@@ -22,7 +22,6 @@ import { getJobPriority } from "../../lib/job-priority";
 import { addScrapeJobs } from "../../services/queue-jobs";
 import { callWebhook } from "../../services/webhook";
 import { logger as _logger } from "../../lib/logger";
-import { CostTracking } from "../../lib/extract/extraction-service";
 import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
@@ -30,6 +29,8 @@ export async function batchScrapeController(
   req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
   res: Response<BatchScrapeResponse>,
 ) {
+  const preNormalizedBody = { ...req.body };
+
   if (req.body?.ignoreInvalidURLs === true) {
     req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body);
   } else {
@@ -46,6 +47,7 @@ export async function batchScrapeController(
   });

   let urls = req.body.urls;
+  let unnormalizedURLs = preNormalizedBody.urls;
   let invalidURLs: string[] | undefined = undefined;

   if (req.body.ignoreInvalidURLs) {
@@ -53,11 +55,13 @@
     let pendingURLs = urls;
     urls = [];
+    unnormalizedURLs = [];

     for (const u of pendingURLs) {
       try {
         const nu = urlSchema.parse(u);
         if (!isUrlBlocked(nu, req.acuc?.flags ?? null)) {
           urls.push(nu);
+          unnormalizedURLs.push(u);
         } else {
           invalidURLs.push(u);
         }
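
The two arrays are rebuilt in lockstep inside the filter loop, so urls[i] is always the normalized form of unnormalizedURLs[i] and the jobs built further down can pair them by index. A stand-alone sketch of that invariant, with normalize and isBlocked as stand-ins for urlSchema.parse and isUrlBlocked (not the controller's actual code):

function splitUrls(
  pending: string[],
  normalize: (u: string) => string,  // stand-in for urlSchema.parse
  isBlocked: (u: string) => boolean, // stand-in for isUrlBlocked
) {
  const urls: string[] = [];             // normalized, used for scraping
  const unnormalizedURLs: string[] = []; // original input, echoed back later
  const invalidURLs: string[] = [];

  for (const u of pending) {
    try {
      const nu = normalize(u);
      if (!isBlocked(nu)) {
        urls.push(nu);
        unnormalizedURLs.push(u); // same index as its normalized twin
      } else {
        invalidURLs.push(u);
      }
    } catch {
      invalidURLs.push(u);
    }
  }

  return { urls, unnormalizedURLs, invalidURLs };
}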
@@ -86,12 +90,6 @@
     await logCrawl(id, req.auth.team_id);
   }

-  let { remainingCredits } = req.account!;
-  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
-  if (!useDbAuthentication) {
-    remainingCredits = Infinity;
-  }
-
   const sc: StoredCrawl = req.body.appendToId
     ? ((await getCrawl(req.body.appendToId)) as StoredCrawl)
     : {
@@ -127,7 +125,7 @@
   delete (scrapeOptions as any).urls;
   delete (scrapeOptions as any).appendToId;

-  const jobs = urls.map((x) => {
+  const jobs = urls.map((x, i) => {
     return {
       data: {
         url: x,
@@ -142,6 +140,7 @@
         webhook: req.body.webhook,
         internalOptions: {
           saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+          unnormalizedSourceURL: unnormalizedURLs[i],
         },
       },
       opts: {

View File

@@ -51,6 +51,7 @@ export async function scrapeController(
       internalOptions: {
         teamId: req.auth.team_id,
         saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+        unnormalizedSourceURL: preNormalizedBody.url,
       },
       origin: req.body.origin,
       is_scrape: true,
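
The preNormalizedBody referenced here is presumably the same raw-body snapshot the batch scrape controller takes above, captured before schema validation normalizes req.body.url. A minimal stand-alone sketch of that pattern (names are stand-ins, not the controller's actual code):

// Assumed pattern, mirroring the batch controller earlier in this PR: take a
// shallow copy of the raw body before the request schema normalizes url, so
// the caller's original string survives as preNormalizedBody.url.
function snapshotThenValidate<T extends { url?: string }>(
  rawBody: T,
  parse: (body: T) => T, // stand-in for the request schema's parse()
) {
  const preNormalizedBody = { ...rawBody };
  const body = parse(rawBody);
  return { preNormalizedBody, body };
}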

View File

@@ -182,6 +182,7 @@ export type InternalOptions = {
   fromCache?: boolean; // Indicates if the document was retrieved from cache
   abort?: AbortSignal;
   urlInvisibleInCurrentCrawl?: boolean;
+  unnormalizedSourceURL?: string;
   saveScrapeResultToGCS?: boolean; // Passed along to fire-engine
 };
@@ -373,7 +374,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
       screenshot: result.result.screenshot,
       actions: result.result.actions,
       metadata: {
-        sourceURL: meta.url,
+        sourceURL: meta.internalOptions.unnormalizedSourceURL ?? meta.url,
         url: result.result.url,
         statusCode: result.result.statusCode,
         error: result.result.error,
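
Net effect in scrapeURL: document metadata prefers the caller's original string and only falls back to the normalized meta.url when nothing was threaded through, so paths that never set the option (e.g. crawl-spawned scrapes) behave exactly as before. A tiny illustration of that precedence (helper name is illustrative):

// Sketch of the new precedence when building metadata.sourceURL.
function resolveSourceURL(
  internalOptions: { unnormalizedSourceURL?: string },
  metaUrl: string,
): string {
  return internalOptions.unnormalizedSourceURL ?? metaUrl;
}

// Set by the /scrape and batch scrape controllers in this PR:
resolveSourceURL(
  { unnormalizedSourceURL: "https://firecrawl.dev/?pagewanted=all&et_blog" },
  "https://firecrawl.dev/?pagewanted=all&et_blog",
); // -> the caller's original string

// Any path that leaves the option unset keeps the old behaviour:
resolveSourceURL({}, "https://firecrawl.dev/"); // -> "https://firecrawl.dev/"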