diff --git a/apps/api/logview.js b/apps/api/logview.js
index 17032b2e..232d2cda 100644
--- a/apps/api/logview.js
+++ b/apps/api/logview.js
@@ -1,6 +1,6 @@
 const fs = require("fs");
 
-const logs = fs.readFileSync("log-20780c8a-52f5-4af7-ac48-62997d11ec9b.log", "utf8")
+const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
     .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
 
 const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))];
@@ -9,15 +9,18 @@
 const urlFilter = x => new URL(x).pathname.slice(1) || "root"
 
 for (const crawlId of crawlIds) {
     const crawlLogs = logs.filter(x => x.crawlId === crawlId);
+    fs.writeFileSync("crawl-" + crawlId + ".log", crawlLogs.map(x => JSON.stringify(x)).join("\n"));
 
     const jobAdds = crawlLogs.filter(x => x.jobPriority !== undefined && x.message.startsWith("Added job for URL "));
     const jobStarts = crawlLogs.filter(x => x.message.startsWith("🐂 Worker taking job"));
+    const ttl = [...new Set(crawlLogs.filter(x => x.method === "lockURL" && x.res !== undefined).map(x => x.url))]
 
     fs.writeFileSync(crawlId + ".md", "```mermaid\nflowchart LR\n "
         + jobStarts.map(x => `${x.jobId}[${urlFilter(x.url)}]`).join("\n ")
         + "\n " + jobAdds.map(x => `${x.jobId}[${urlFilter(jobStarts.find(y => y.jobId === x.jobId).url)}] --> ${x.newJobId}[${urlFilter(x.url)}]`).join("\n ")
         + "\n```\n\nURLs scraped: (" + jobStarts.length + ")\n"
-        + jobStarts.map(x => "- " + x.url).join("\n")
+        + jobStarts.map(x => "- " + x.url).join("\n") + "\n\nURLs tried to lock: (" + ttl.length + ")\n"
+        + ttl.map(x => "- " + x + " ("+ crawlLogs.filter(y => y.method === "lockURL" && y.res !== undefined && y.url === x).length + "; " + crawlLogs.filter(y => y.method === "lockURL" && y.res === true && y.url === x).length + ")").join("\n")
     );
 }
diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts
index 5de5eccf..06a86f92 100644
--- a/apps/api/src/controllers/v0/crawl.ts
+++ b/apps/api/src/controllers/v0/crawl.ts
@@ -137,6 +137,7 @@ export async function crawlController(req: Request, res: Response) {
   await logCrawl(id, team_id);
 
   const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
+  internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
 
   delete (scrapeOptions as any).timeout;
 
diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts
index 133977be..064ee73b 100644
--- a/apps/api/src/controllers/v1/batch-scrape.ts
+++ b/apps/api/src/controllers/v1/batch-scrape.ts
@@ -43,7 +43,7 @@ export async function batchScrapeController(
   const sc: StoredCrawl = req.body.appendToId ? await getCrawl(req.body.appendToId) as StoredCrawl : {
     crawlerOptions: null,
     scrapeOptions: req.body,
-    internalOptions: {},
+    internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
     team_id: req.auth.team_id,
     createdAt: Date.now(),
     plan: req.auth.plan,
diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts
index 08630bfa..3db518d0 100644
--- a/apps/api/src/controllers/v1/crawl.ts
+++ b/apps/api/src/controllers/v1/crawl.ts
@@ -79,7 +79,7 @@ export async function crawlController(
     originUrl: req.body.url,
     crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
     scrapeOptions,
-    internalOptions: {},
+    internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
     team_id: req.auth.team_id,
     createdAt: Date.now(),
     plan: req.auth.plan,
@@ -122,6 +122,7 @@ export async function crawlController(
       plan: req.auth.plan,
       crawlerOptions,
       scrapeOptions,
+      internalOptions: sc.internalOptions,
       origin: "api",
       crawl_id: id,
       sitemapped: true,
@@ -162,6 +163,7 @@ export async function crawlController(
         team_id: req.auth.team_id,
         crawlerOptions,
         scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
+        internalOptions: sc.internalOptions,
         plan: req.auth.plan!,
         origin: "api",
         crawl_id: id,
diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
index 106803ac..ae953c1b 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
@@ -88,6 +88,7 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise
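
The last hunk is cut off at the `scrapeURLWithFireEngineChromeCDP` signature, so the engine-side handling of the new flag isn't visible here. As a minimal sketch of what that consumption could look like, assuming the fire-engine request payload accepts a same-named pass-through field (the request field and its shape are illustrative assumptions, not the actual fire-engine API):

```ts
// Sketch only: `disableSmartWaitCache` on the request payload is assumed;
// the diff above only establishes `meta.internalOptions.disableSmartWaitCache`.
const request = {
  url: meta.url,
  // When set, fire-engine would skip its cached smart-wait heuristics and do a
  // full wait on every page load: slower, but the scrape is reliably contentful.
  disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache ?? false,
  // ...remaining engine parameters elided...
};
```

Taken together with the NOTE comments, the pattern is consistent: every crawl and batch scrape is stored with `disableSmartWaitCache: true`, and the v1 crawl controller now threads `sc.internalOptions` into the jobs it enqueues, so sitemap-discovered URLs inherit the same behavior.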