diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts
index f1fe3431..1ea28995 100644
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@@ -60,7 +60,11 @@ export async function scrapeController(
   try {
     doc = await waitForJob(jobId, timeout + totalWait); // TODO: better types for this
   } catch (e) {
-    logger.error(`Error in scrapeController: ${e}`, { jobId, scrapeId: jobId, startTime });
+    logger.error(`Error in scrapeController: ${e}`, {
+      jobId,
+      scrapeId: jobId,
+      startTime,
+    });
     if (
       e instanceof Error &&
       (e.message.startsWith("Job wait") || e.message === "timeout")
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index 602d13b3..6ecb0b8f 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -94,9 +94,13 @@ export async function addCrawlJobDone(
     await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
   } else {
     // in case it's already been pushed, make sure it's removed
-    await redisConnection.lrem("crawl:" + id + ":jobs_done_ordered", -1, job_id);
+    await redisConnection.lrem(
+      "crawl:" + id + ":jobs_done_ordered",
+      -1,
+      job_id,
+    );
   }
-  
+
   await redisConnection.expire(
     "crawl:" + id + ":jobs_done_ordered",
     24 * 60 * 60,
diff --git a/apps/api/src/lib/extract/completions.ts b/apps/api/src/lib/extract/completions.ts
index 8d1b95c9..34a5a215 100644
--- a/apps/api/src/lib/extract/completions.ts
+++ b/apps/api/src/lib/extract/completions.ts
@@ -122,4 +122,3 @@
 //     },
 //   };
 // }
-
diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
index 168d9b8f..a0c8eaba 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
@@ -5,7 +5,7 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler";
 
 export async function scrapeURLWithFetch(
   meta: Meta,
-  timeToRun: number | undefined
+  timeToRun: number | undefined,
 ): Promise<EngineScrapeResult> {
   const timeout = timeToRun ?? 300000;
 
diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
index 2dc134c9..14abf9a9 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
@@ -128,7 +128,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
     (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
     0,
   );
-  
+
   const timeout = (timeToRun ?? 300000) + totalWait;
 
   const request: FireEngineScrapeRequestCommon &
diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts
index 14f263f3..bb0c485c 100644
--- a/apps/api/src/scraper/scrapeURL/engines/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/index.ts
@@ -105,7 +105,10 @@ export type EngineScrapeResult = {
 };
 
 const engineHandlers: {
-  [E in Engine]: (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult>;
+  [E in Engine]: (
+    meta: Meta,
+    timeToRun: number | undefined,
+  ) => Promise<EngineScrapeResult>;
 } = {
   cache: scrapeCache,
   "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
@@ -372,7 +375,7 @@ export function buildFallbackList(meta: Meta): {
 export async function scrapeURLWithEngine(
   meta: Meta,
   engine: Engine,
-  timeToRun: number | undefined
+  timeToRun: number | undefined,
 ): Promise<EngineScrapeResult> {
   const fn = engineHandlers[engine];
   const logger = meta.logger.child({
diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
index 0983e4b1..9d2f11b1 100644
--- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
@@ -124,7 +124,10 @@ async function scrapePDFWithParsePDF(
   };
 }
 
-export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
+export async function scrapePDF(
+  meta: Meta,
+  timeToRun: number | undefined,
+): Promise<EngineScrapeResult> {
   if (!meta.options.parsePDF) {
     const file = await fetchFileToBuffer(meta.url);
     const content = file.buffer.toString("base64");
@@ -152,9 +155,12 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
     tempFilePath,
   );
 
-  // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
-  if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
+  if (
+    result.markdown &&
+    result.markdown.length < 500 &&
+    process.env.LLAMAPARSE_API_KEY
+  ) {
     try {
       const llamaResult = await scrapePDFWithLlamaParse(
         {
@@ -193,4 +199,4 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
     html: result.html,
     markdown: result.markdown,
   };
-}
\ No newline at end of file
+}
diff --git a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
index db702a44..38c43878 100644
--- a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
@@ -10,7 +10,10 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
 export function scrapeURLWithScrapingBee(
   wait_browser: "domcontentloaded" | "networkidle2",
 ): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
-  return async (meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> => {
+  return async (
+    meta: Meta,
+    timeToRun: number | undefined,
+  ): Promise<EngineScrapeResult> => {
     let response: AxiosResponse;
     const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
     try {
diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts
index 0a4f6e5b..689f90c8 100644
--- a/apps/api/src/scraper/scrapeURL/error.ts
+++ b/apps/api/src/scraper/scrapeURL/error.ts
@@ -60,9 +60,7 @@ export class SiteError extends Error {
 export class ActionError extends Error {
   public code: string;
   constructor(code: string) {
-    super(
-      "Action(s) failed to complete. Error code: " + code,
-    );
+    super("Action(s) failed to complete. Error code: " + code);
     this.code = code;
   }
 }
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index 93bdb71b..1df812bd 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -203,9 +203,10 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
   const results: EngineResultsTracker = {};
   let result: EngineScrapeResultWithContext | null = null;
 
-  const timeToRun = meta.options.timeout !== undefined
-    ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
-    : undefined
+  const timeToRun =
+    meta.options.timeout !== undefined
+      ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
+      : undefined;
 
   for (const { engine, unsupportedFeatures } of fallbackList) {
     const startedAt = Date.now();
diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts
index 6ce48a81..654f6cda 100644
--- a/apps/api/src/services/queue-jobs.ts
+++ b/apps/api/src/services/queue-jobs.ts
@@ -72,7 +72,12 @@ async function addScrapeJobRaw(
   }
 
   if (concurrencyLimited) {
-    await _addScrapeJobToConcurrencyQueue(webScraperOptions, options, jobId, jobPriority);
+    await _addScrapeJobToConcurrencyQueue(
+      webScraperOptions,
+      options,
+      jobId,
+      jobPriority,
+    );
   } else {
     await _addScrapeJobToBullMQ(webScraperOptions, options, jobId, jobPriority);
   }
@@ -130,17 +135,17 @@ export async function addScrapeJobs(
 
   let countCanBeDirectlyAdded = Infinity;
 
-  if (
-    jobs[0].data &&
-    jobs[0].data.team_id &&
-    jobs[0].data.plan
-  ) {
+  if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) {
     const now = Date.now();
     const limit = await getConcurrencyLimitMax(jobs[0].data.plan);
     console.log("CC limit", limit);
     cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now);
 
-    countCanBeDirectlyAdded = Math.max(limit - (await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length, 0);
+    countCanBeDirectlyAdded = Math.max(
+      limit -
+        (await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length,
+      0,
+    );
   }
 
   const addToBull = jobs.slice(0, countCanBeDirectlyAdded);
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index c2d2e2c6..705a06c7 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -496,15 +496,14 @@ async function processJob(job: Job & { id: string }, token: string) {
         // See lockURL
         const x = await redisConnection.sadd(
           "crawl:" + job.data.crawl_id + ":visited",
-          ...p1.map(x => x.href),
+          ...p1.map((x) => x.href),
         );
         const lockRes = x === p1.length;
-        
+
         if (job.data.crawlerOptions !== null && !lockRes) {
          throw new RacedRedirectError();
        }
      }
-
    }
 
    logger.debug("Logging job to DB...");
@@ -675,7 +674,10 @@ async function processJob(job: Job & { id: string }, token: string) {
 
        logger.debug("Declaring job as done...");
        await addCrawlJobDone(job.data.crawl_id, job.id, false);
-        await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(job.data.url, sc));
+        await redisConnection.srem(
+          "crawl:" + job.data.crawl_id + ":visited_unique",
+          normalizeURL(job.data.url, sc),
+        );
 
        logger.debug("Logging job to DB...");
        await logJob(