Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl, synced 2025-08-14 06:25:56 +08:00
feat(scrapeURL): add unnormalizedSourceURL for url matching DX (FIR-2137) (#1601)
* feat(scrapeURL): add unnormalizedSourceURL for url matching DX
* fix(tests): fix
This commit is contained in:
parent 474e5a0543
commit a36c6a4f40
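The DX motivation: the API normalizes URLs before scraping (see the `urlSchema.parse` call below), so `metadata.sourceURL` could come back in a different form than the URL the caller submitted, which makes it awkward to match batch results back to inputs. With this change, the pre-normalization URL is threaded through `internalOptions.unnormalizedSourceURL` and echoed back as `metadata.sourceURL`. A minimal client-side sketch of the intended matching, assuming only the response shape exercised by the tests in this commit (`data[*].metadata.sourceURL`); the `matchBySourceURL` helper and the trimmed-down types are illustrative, not part of the diff:

// Illustrative, trimmed-down types; the real response carries many more fields.
type Document = { metadata: { sourceURL: string } };
type BatchScrapeData = { data: Document[] };

// Because sourceURL now stays unnormalized, a plain string comparison is enough
// to map each submitted URL back to its scraped document.
function matchBySourceURL(
  submittedUrls: string[],
  result: BatchScrapeData,
): Map<string, Document | undefined> {
  const byUrl = new Map(
    result.data.map((doc) => [doc.metadata.sourceURL, doc] as [string, Document]),
  );
  return new Map(
    submittedUrls.map((url) => [url, byUrl.get(url)] as [string, Document | undefined]),
  );
}

// Example: the URL used in the tests below maps back to itself verbatim,
// even though the scraper normalizes it internally.
const matched = matchBySourceURL(
  ["https://firecrawl.dev/?pagewanted=all&et_blog"],
  { data: [{ metadata: { sourceURL: "https://firecrawl.dev/?pagewanted=all&et_blog" } }] },
);
console.log(matched.get("https://firecrawl.dev/?pagewanted=all&et_blog") !== undefined); // true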
@ -48,4 +48,12 @@ describe("Batch scrape tests", () => {
|
|||||||
}, 180000);
|
}, 180000);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
it.concurrent("sourceURL stays unnormalized", async () => {
|
||||||
|
const response = await batchScrape({
|
||||||
|
urls: ["https://firecrawl.dev/?pagewanted=all&et_blog"],
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(response.body.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
|
||||||
|
}, 35000);
|
||||||
});
|
});
|
||||||
|
@@ -366,4 +366,12 @@ describe("Scrape tests", () => {
       }, 30000);
     });
   }
+
+  it.concurrent("sourceURL stays unnormalized", async () => {
+    const response = await scrape({
+      url: "https://firecrawl.dev/?pagewanted=all&et_blog",
+    });
+
+    expect(response.metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
+  }, 35000);
 });
@@ -22,7 +22,6 @@ import { getJobPriority } from "../../lib/job-priority";
 import { addScrapeJobs } from "../../services/queue-jobs";
 import { callWebhook } from "../../services/webhook";
 import { logger as _logger } from "../../lib/logger";
-import { CostTracking } from "../../lib/extract/extraction-service";
 import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 
@@ -30,6 +29,8 @@ export async function batchScrapeController(
   req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
   res: Response<BatchScrapeResponse>,
 ) {
+  const preNormalizedBody = { ...req.body };
+
   if (req.body?.ignoreInvalidURLs === true) {
     req.body = batchScrapeRequestSchemaNoURLValidation.parse(req.body);
   } else {
@@ -46,6 +47,7 @@ export async function batchScrapeController(
   });
 
   let urls = req.body.urls;
+  let unnormalizedURLs = preNormalizedBody.urls;
   let invalidURLs: string[] | undefined = undefined;
 
   if (req.body.ignoreInvalidURLs) {
@@ -53,11 +55,13 @@ export async function batchScrapeController(
 
     let pendingURLs = urls;
     urls = [];
+    unnormalizedURLs = [];
     for (const u of pendingURLs) {
       try {
         const nu = urlSchema.parse(u);
         if (!isUrlBlocked(nu, req.acuc?.flags ?? null)) {
           urls.push(nu);
+          unnormalizedURLs.push(u);
         } else {
           invalidURLs.push(u);
         }
@@ -86,12 +90,6 @@ export async function batchScrapeController(
     await logCrawl(id, req.auth.team_id);
   }
 
-  let { remainingCredits } = req.account!;
-  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
-  if (!useDbAuthentication) {
-    remainingCredits = Infinity;
-  }
-
   const sc: StoredCrawl = req.body.appendToId
     ? ((await getCrawl(req.body.appendToId)) as StoredCrawl)
     : {
@@ -127,7 +125,7 @@ export async function batchScrapeController(
   delete (scrapeOptions as any).urls;
   delete (scrapeOptions as any).appendToId;
 
-  const jobs = urls.map((x) => {
+  const jobs = urls.map((x, i) => {
     return {
       data: {
         url: x,
@@ -142,6 +140,7 @@ export async function batchScrapeController(
         webhook: req.body.webhook,
         internalOptions: {
           saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+          unnormalizedSourceURL: unnormalizedURLs[i],
         },
       },
       opts: {
@@ -51,6 +51,7 @@ export async function scrapeController(
     internalOptions: {
       teamId: req.auth.team_id,
       saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+      unnormalizedSourceURL: preNormalizedBody.url,
     },
     origin: req.body.origin,
     is_scrape: true,
@@ -182,6 +182,7 @@ export type InternalOptions = {
   fromCache?: boolean; // Indicates if the document was retrieved from cache
   abort?: AbortSignal;
   urlInvisibleInCurrentCrawl?: boolean;
+  unnormalizedSourceURL?: string;
 
   saveScrapeResultToGCS?: boolean; // Passed along to fire-engine
 };
@@ -373,7 +374,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
       screenshot: result.result.screenshot,
       actions: result.result.actions,
       metadata: {
-        sourceURL: meta.url,
+        sourceURL: meta.internalOptions.unnormalizedSourceURL ?? meta.url,
        url: result.result.url,
         statusCode: result.result.statusCode,
         error: result.result.error,
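The `??` fallback above keeps prior behavior for code paths that never set the new option: when `internalOptions.unnormalizedSourceURL` is absent, `metadata.sourceURL` still reports the normalized `meta.url`. A minimal sketch of that resolution, with the `Meta` shape trimmed down for illustration (the real type has many more fields):

// Illustrative, trimmed-down shapes; values below are hypothetical examples.
type InternalOptions = { unnormalizedSourceURL?: string };
type Meta = { url: string; internalOptions: InternalOptions };

// Mirrors the fallback in scrapeURLLoop: prefer the caller's original URL,
// otherwise report the normalized URL the scraper actually used.
function resolveSourceURL(meta: Meta): string {
  return meta.internalOptions.unnormalizedSourceURL ?? meta.url;
}

console.log(resolveSourceURL({ url: "https://firecrawl.dev/", internalOptions: {} }));
// -> "https://firecrawl.dev/"
console.log(resolveSourceURL({
  url: "https://firecrawl.dev/",
  internalOptions: { unnormalizedSourceURL: "https://firecrawl.dev/?pagewanted=all&et_blog" },
}));
// -> "https://firecrawl.dev/?pagewanted=all&et_blog"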