mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 18:06:00 +08:00
fix(crawl): disable smart wait
This increases the reliability and determinism of crawls.
This commit is contained in:
parent
5d90a6c1cd
commit
85cbfbb5bb
@ -1,6 +1,6 @@
|
||||
// Ad-hoc analysis script: splits a combined scrape log (newline-delimited JSON)
// into per-crawl log files and, for each crawl, writes a Markdown report with a
// Mermaid flowchart of which job spawned which, plus URL statistics.
const fs = require("fs");

// Parse the NDJSON log file: one JSON object per non-empty line.
const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
    .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));

// Distinct crawl IDs present in the log (entries without a crawlId are dropped).
const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))];

// Shorten a URL to its pathname (without the leading slash) for node labels;
// the site root would be empty, so it is labeled "root" instead.
const urlFilter = x => new URL(x).pathname.slice(1) || "root";

for (const crawlId of crawlIds) {
    const crawlLogs = logs.filter(x => x.crawlId === crawlId);
    // Dump this crawl's raw entries to their own NDJSON file.
    fs.writeFileSync("crawl-" + crawlId + ".log", crawlLogs.map(x => JSON.stringify(x)).join("\n"));

    // Entries for jobs being queued, and for jobs actually picked up by a worker.
    const jobAdds = crawlLogs.filter(x => x.jobPriority !== undefined && x.message.startsWith("Added job for URL "));
    const jobStarts = crawlLogs.filter(x => x.message.startsWith("🐂 Worker taking job"));
    // Deduplicated list of URLs the crawler tried to lock.
    const ttl = [...new Set(crawlLogs.filter(x => x.method === "lockURL" && x.res !== undefined).map(x => x.url))]

    // Label for a parent job in the flowchart. FIX: the parent job may have
    // been queued but never taken by a worker (no jobStarts entry), in which
    // case the original `.find(...).url` threw a TypeError; fall back to the
    // raw job ID so the report still renders.
    const parentLabel = jobId => {
        const start = jobStarts.find(y => y.jobId === jobId);
        return start !== undefined ? urlFilter(start.url) : jobId;
    };

    fs.writeFileSync(crawlId + ".md",
        "```mermaid\nflowchart LR\n "
        + jobStarts.map(x => `${x.jobId}[${urlFilter(x.url)}]`).join("\n ") + "\n "
        + jobAdds.map(x => `${x.jobId}[${parentLabel(x.jobId)}] --> ${x.newJobId}[${urlFilter(x.url)}]`).join("\n ")
        + "\n```\n\nURLs scraped: (" + jobStarts.length + ")\n"
        + jobStarts.map(x => "- " + x.url).join("\n") + "\n\nURLs tried to lock: (" + ttl.length + ")\n"
        // For each locked URL: (total lock attempts; successful lock attempts).
        + ttl.map(x => "- " + x + " ("+ crawlLogs.filter(y => y.method === "lockURL" && y.res !== undefined && y.url === x).length + "; " + crawlLogs.filter(y => y.method === "lockURL" && y.res === true && y.url === x).length + ")").join("\n")
    );
}
|
||||
|
@ -137,6 +137,7 @@ export async function crawlController(req: Request, res: Response) {
|
||||
await logCrawl(id, team_id);
|
||||
|
||||
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
|
||||
internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
||||
|
||||
delete (scrapeOptions as any).timeout;
|
||||
|
||||
|
@ -43,7 +43,7 @@ export async function batchScrapeController(
|
||||
const sc: StoredCrawl = req.body.appendToId ? await getCrawl(req.body.appendToId) as StoredCrawl : {
|
||||
crawlerOptions: null,
|
||||
scrapeOptions: req.body,
|
||||
internalOptions: {},
|
||||
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
|
@ -79,7 +79,7 @@ export async function crawlController(
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
|
||||
scrapeOptions,
|
||||
internalOptions: {},
|
||||
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
@ -122,6 +122,7 @@ export async function crawlController(
|
||||
plan: req.auth.plan,
|
||||
crawlerOptions,
|
||||
scrapeOptions,
|
||||
internalOptions: sc.internalOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
@ -162,6 +163,7 @@ export async function crawlController(
|
||||
team_id: req.auth.team_id,
|
||||
crawlerOptions,
|
||||
scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
|
||||
internalOptions: sc.internalOptions,
|
||||
plan: req.auth.plan!,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
|
@ -88,6 +88,7 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<Engi
|
||||
geolocation: meta.options.geolocation,
|
||||
mobile: meta.options.mobile,
|
||||
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
|
||||
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
|
||||
// TODO: scrollXPaths
|
||||
};
|
||||
|
||||
|
@ -35,6 +35,7 @@ export type FireEngineScrapeRequestChromeCDP = {
|
||||
actions?: Action[];
|
||||
blockMedia?: true; // cannot be false
|
||||
mobile?: boolean;
|
||||
disableSmartWaitCache?: boolean;
|
||||
};
|
||||
|
||||
export type FireEngineScrapeRequestPlaywright = {
|
||||
|
@ -114,6 +114,8 @@ export type InternalOptions = {
|
||||
v0CrawlOnlyUrls?: boolean;
|
||||
v0UseFastMode?: boolean;
|
||||
v0DisableJsDom?: boolean;
|
||||
|
||||
disableSmartWaitCache?: boolean; // Passed along to fire-engine
|
||||
};
|
||||
|
||||
export type EngineResultsTracker = { [E in Engine]?: ({
|
||||
|
Loading…
x
Reference in New Issue
Block a user