fix(crawl): disable smart wait

This increases the reliability and determinism of crawls.
This commit is contained in:
Gergő Móricz 2024-12-10 21:12:31 +01:00
parent 5d90a6c1cd
commit 85cbfbb5bb
7 changed files with 14 additions and 4 deletions

View File

@ -1,6 +1,6 @@
const fs = require("fs");
const logs = fs.readFileSync("log-20780c8a-52f5-4af7-ac48-62997d11ec9b.log", "utf8")
const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
.split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))];
@ -9,15 +9,18 @@ const urlFilter = x => new URL(x).pathname.slice(1) || "root"
for (const crawlId of crawlIds) {
const crawlLogs = logs.filter(x => x.crawlId === crawlId);
fs.writeFileSync("crawl-" + crawlId + ".log", crawlLogs.map(x => JSON.stringify(x)).join("\n"));
const jobAdds = crawlLogs.filter(x => x.jobPriority !== undefined && x.message.startsWith("Added job for URL "));
const jobStarts = crawlLogs.filter(x => x.message.startsWith("🐂 Worker taking job"));
const ttl = [...new Set(crawlLogs.filter(x => x.method === "lockURL" && x.res !== undefined).map(x => x.url))]
fs.writeFileSync(crawlId + ".md",
"```mermaid\nflowchart LR\n "
+ jobStarts.map(x => `${x.jobId}[${urlFilter(x.url)}]`).join("\n ") + "\n "
+ jobAdds.map(x => `${x.jobId}[${urlFilter(jobStarts.find(y => y.jobId === x.jobId).url)}] --> ${x.newJobId}[${urlFilter(x.url)}]`).join("\n ")
+ "\n```\n\nURLs scraped: (" + jobStarts.length + ")\n"
+ jobStarts.map(x => "- " + x.url).join("\n")
+ jobStarts.map(x => "- " + x.url).join("\n") + "\n\nURLs tried to lock: (" + ttl.length + ")\n"
+ ttl.map(x => "- " + x + " ("+ crawlLogs.filter(y => y.method === "lockURL" && y.res !== undefined && y.url === x).length + "; " + crawlLogs.filter(y => y.method === "lockURL" && y.res === true && y.url === x).length + ")").join("\n")
);
}

View File

@ -137,6 +137,7 @@ export async function crawlController(req: Request, res: Response) {
await logCrawl(id, team_id);
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
delete (scrapeOptions as any).timeout;

View File

@ -43,7 +43,7 @@ export async function batchScrapeController(
const sc: StoredCrawl = req.body.appendToId ? await getCrawl(req.body.appendToId) as StoredCrawl : {
crawlerOptions: null,
scrapeOptions: req.body,
internalOptions: {},
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,

View File

@ -79,7 +79,7 @@ export async function crawlController(
originUrl: req.body.url,
crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
scrapeOptions,
internalOptions: {},
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
@ -122,6 +122,7 @@ export async function crawlController(
plan: req.auth.plan,
crawlerOptions,
scrapeOptions,
internalOptions: sc.internalOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
@ -162,6 +163,7 @@ export async function crawlController(
team_id: req.auth.team_id,
crawlerOptions,
scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
internalOptions: sc.internalOptions,
plan: req.auth.plan!,
origin: "api",
crawl_id: id,

View File

@ -88,6 +88,7 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<Engi
geolocation: meta.options.geolocation,
mobile: meta.options.mobile,
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
// TODO: scrollXPaths
};

View File

@ -35,6 +35,7 @@ export type FireEngineScrapeRequestChromeCDP = {
actions?: Action[];
blockMedia?: true; // cannot be false
mobile?: boolean;
disableSmartWaitCache?: boolean;
};
export type FireEngineScrapeRequestPlaywright = {

View File

@ -114,6 +114,8 @@ export type InternalOptions = {
v0CrawlOnlyUrls?: boolean;
v0UseFastMode?: boolean;
v0DisableJsDom?: boolean;
disableSmartWaitCache?: boolean; // Passed along to fire-engine
};
export type EngineResultsTracker = { [E in Engine]?: ({