chore: formatting

Nicolas 2024-12-17 16:58:57 -03:00
parent b9f621bed5
commit 3b6edef9fa
12 changed files with 55 additions and 30 deletions

View File

@@ -60,7 +60,11 @@ export async function scrapeController(
   try {
     doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
   } catch (e) {
-    logger.error(`Error in scrapeController: ${e}`, { jobId, scrapeId: jobId, startTime });
+    logger.error(`Error in scrapeController: ${e}`, {
+      jobId,
+      scrapeId: jobId,
+      startTime,
+    });
     if (
       e instanceof Error &&
       (e.message.startsWith("Job wait") || e.message === "timeout")
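A side note on the catch branch above: it distinguishes deadline failures by inspecting the error message. A minimal sketch of a wait-with-deadline helper that produces such errors — an illustration only, not the project's actual waitForJob:

    // Race a pending job against a deadline; reject with a recognizable
    // message so callers can branch on timeouts, as the catch above does.
    function waitWithDeadline<T>(job: Promise<T>, ms: number): Promise<T> {
      return new Promise<T>((resolve, reject) => {
        const timer = setTimeout(
          () => reject(new Error("Job wait exceeded " + ms + "ms")),
          ms,
        );
        job.then(
          (value) => {
            clearTimeout(timer);
            resolve(value);
          },
          (err) => {
            clearTimeout(timer);
            reject(err);
          },
        );
      });
    }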

View File

@@ -94,7 +94,11 @@ export async function addCrawlJobDone(
     await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
   } else {
     // in case it's already been pushed, make sure it's removed
-    await redisConnection.lrem("crawl:" + id + ":jobs_done_ordered", -1, job_id);
+    await redisConnection.lrem(
+      "crawl:" + id + ":jobs_done_ordered",
+      -1,
+      job_id,
+    );
   }
   await redisConnection.expire(
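For context on the call being wrapped here: Redis LREM takes a count whose sign sets the scan direction, and -1 removes a single occurrence scanning from the tail. A minimal sketch of the same push-or-withdraw pattern with ioredis (key layout and names are illustrative):

    import Redis from "ioredis";

    const redis = new Redis();

    // Record a finished job in order, or withdraw it if it was pushed earlier.
    async function markJobDone(crawlId: string, jobId: string, success: boolean) {
      const key = "crawl:" + crawlId + ":jobs_done_ordered";
      if (success) {
        await redis.rpush(key, jobId); // append to the tail, preserving finish order
      } else {
        // count = -1: remove one matching element, scanning tail -> head
        await redis.lrem(key, -1, jobId);
      }
    }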

View File

@@ -122,4 +122,3 @@
 // },
 // };
 // }
-

View File

@@ -5,7 +5,7 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler";
 export async function scrapeURLWithFetch(
   meta: Meta,
-  timeToRun: number | undefined
+  timeToRun: number | undefined,
 ): Promise<EngineScrapeResult> {
   const timeout = timeToRun ?? 300000;

View File

@@ -105,7 +105,10 @@ export type EngineScrapeResult = {
 };
 const engineHandlers: {
-  [E in Engine]: (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult>;
+  [E in Engine]: (
+    meta: Meta,
+    timeToRun: number | undefined,
+  ) => Promise<EngineScrapeResult>;
 } = {
   cache: scrapeCache,
   "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
@@ -372,7 +375,7 @@ export function buildFallbackList(meta: Meta): {
 export async function scrapeURLWithEngine(
   meta: Meta,
   engine: Engine,
-  timeToRun: number | undefined
+  timeToRun: number | undefined,
 ): Promise<EngineScrapeResult> {
   const fn = engineHandlers[engine];
   const logger = meta.logger.child({
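Worth noting about the first hunk in this file: the handler table uses a mapped type over the Engine union, so the compiler rejects the object unless every engine name has a handler of the right shape. A reduced sketch of the pattern (the union members and handler bodies are illustrative):

    type Engine = "cache" | "fetch" | "pdf"; // stand-in; the real union is larger

    type EngineScrapeResult = { url: string; html: string; statusCode: number };

    // One entry per union member; omitting any key is a compile-time error.
    const handlers: {
      [E in Engine]: (url: string) => Promise<EngineScrapeResult>;
    } = {
      cache: async (url) => ({ url, html: "", statusCode: 200 }),
      fetch: async (url) => ({ url, html: "", statusCode: 200 }),
      pdf: async (url) => ({ url, html: "", statusCode: 200 }),
    };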

View File

@@ -124,7 +124,10 @@ async function scrapePDFWithParsePDF(
   };
 }
-export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
+export async function scrapePDF(
+  meta: Meta,
+  timeToRun: number | undefined,
+): Promise<EngineScrapeResult> {
   if (!meta.options.parsePDF) {
     const file = await fetchFileToBuffer(meta.url);
     const content = file.buffer.toString("base64");
@@ -152,9 +155,12 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
     tempFilePath,
   );
   // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
-  if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
+  if (
+    result.markdown &&
+    result.markdown.length < 500 &&
+    process.env.LLAMAPARSE_API_KEY
+  ) {
     try {
       const llamaResult = await scrapePDFWithLlamaParse(
         {
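The guard reformatted in the second hunk is a quality fallback: when the local parse yields suspiciously little text and a LlamaParse key is configured, retry with the hosted parser. A minimal sketch of that control flow (parseLocally and parseWithLlama are hypothetical stand-ins for the module's real helpers):

    declare function parseLocally(path: string): Promise<string>; // hypothetical
    declare function parseWithLlama(path: string): Promise<string>; // hypothetical

    async function parsePdf(path: string): Promise<string> {
      let markdown = await parseLocally(path);
      // Under ~500 chars usually signals a scanned or image-heavy PDF.
      if (markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
        try {
          markdown = await parseWithLlama(path); // hosted, OCR-capable
        } catch {
          // keep the local result if the fallback fails
        }
      }
      return markdown;
    }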

View File

@@ -10,7 +10,10 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
 export function scrapeURLWithScrapingBee(
   wait_browser: "domcontentloaded" | "networkidle2",
 ): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
-  return async (meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> => {
+  return async (
+    meta: Meta,
+    timeToRun: number | undefined,
+  ): Promise<EngineScrapeResult> => {
     let response: AxiosResponse<any>;
     const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
     try {
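The function above is a factory: it fixes the wait_browser strategy once and returns an engine-shaped closure, letting two ScrapingBee variants share one implementation. A stripped-down sketch of the shape (the types are simplified stand-ins):

    type Scraper = (url: string, timeToRun: number | undefined) => Promise<string>;

    // Fix one configuration value up front; each call yields a ready engine.
    function makeScraper(
      waitBrowser: "domcontentloaded" | "networkidle2",
    ): Scraper {
      return async (url, timeToRun) => {
        const timeout = timeToRun ?? 300000; // same default as the hunk above
        return `fetched ${url} (wait=${waitBrowser}, timeout=${timeout})`;
      };
    }

    const scrapeDom = makeScraper("domcontentloaded");
    const scrapeIdle = makeScraper("networkidle2");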

View File

@@ -60,9 +60,7 @@ export class SiteError extends Error {
 export class ActionError extends Error {
   public code: string;
   constructor(code: string) {
-    super(
-      "Action(s) failed to complete. Error code: " + code,
-    );
+    super("Action(s) failed to complete. Error code: " + code);
     this.code = code;
   }
 }
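ActionError follows the usual pattern of an Error subclass carrying a machine-readable code, so callers can branch on it after an instanceof check (the code value below is illustrative):

    try {
      throw new ActionError("ERR_CLICK_TIMEOUT");
    } catch (e) {
      if (e instanceof ActionError) {
        console.log(e.code); // "ERR_CLICK_TIMEOUT"
      }
    }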

View File

@@ -203,9 +203,10 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
   const results: EngineResultsTracker = {};
   let result: EngineScrapeResultWithContext | null = null;
-  const timeToRun = meta.options.timeout !== undefined
+  const timeToRun =
+    meta.options.timeout !== undefined
       ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
-      : undefined
+      : undefined;
   for (const { engine, unsupportedFeatures } of fallbackList) {
     const startedAt = Date.now();
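The expression being reformatted splits the caller's timeout budget across fallback engines, but never across more than two: with a 30000 ms timeout and three engines, each attempt gets Math.round(30000 / Math.min(3, 2)) = 15000 ms. The same arithmetic as a standalone helper:

    // Split an overall timeout across fallback attempts, capping the divisor
    // at 2 so each engine keeps a meaningful share of the budget.
    function perEngineBudget(
      timeout: number | undefined,
      engineCount: number,
    ): number | undefined {
      return timeout !== undefined
        ? Math.round(timeout / Math.min(engineCount, 2))
        : undefined;
    }

    // perEngineBudget(30000, 3) === 15000; perEngineBudget(30000, 1) === 30000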

View File

@@ -72,7 +72,12 @@ async function addScrapeJobRaw(
   }
   if (concurrencyLimited) {
-    await _addScrapeJobToConcurrencyQueue(webScraperOptions, options, jobId, jobPriority);
+    await _addScrapeJobToConcurrencyQueue(
+      webScraperOptions,
+      options,
+      jobId,
+      jobPriority,
+    );
   } else {
     await _addScrapeJobToBullMQ(webScraperOptions, options, jobId, jobPriority);
   }
@@ -130,17 +135,17 @@ export async function addScrapeJobs(
   let countCanBeDirectlyAdded = Infinity;
-  if (
-    jobs[0].data &&
-    jobs[0].data.team_id &&
-    jobs[0].data.plan
-  ) {
+  if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) {
     const now = Date.now();
     const limit = await getConcurrencyLimitMax(jobs[0].data.plan);
     console.log("CC limit", limit);
     cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now);
-    countCanBeDirectlyAdded = Math.max(limit - (await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length, 0);
+    countCanBeDirectlyAdded = Math.max(
+      limit -
+        (await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length,
+      0,
+    );
   }
   const addToBull = jobs.slice(0, countCanBeDirectlyAdded);
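The computation reformatted in the second hunk is a headroom check: the number of jobs allowed to bypass the concurrency queue is the plan's limit minus the currently active jobs, floored at zero, and the rest of the batch is deferred. A minimal sketch (names are illustrative):

    // Decide how many incoming jobs may run immediately under a concurrency cap.
    function splitByHeadroom<T>(
      jobs: T[],
      limit: number,
      activeCount: number,
    ): { runNow: T[]; defer: T[] } {
      const headroom = Math.max(limit - activeCount, 0);
      return {
        runNow: jobs.slice(0, headroom), // straight to the work queue
        defer: jobs.slice(headroom), // parked until capacity frees up
      };
    }

    // With limit 10 and 7 active, only the first 3 jobs of a batch run now.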

View File

@@ -496,7 +496,7 @@ async function processJob(job: Job & { id: string }, token: string) {
   // See lockURL
   const x = await redisConnection.sadd(
     "crawl:" + job.data.crawl_id + ":visited",
-    ...p1.map(x => x.href),
+    ...p1.map((x) => x.href),
   );
   const lockRes = x === p1.length;
@@ -504,7 +504,6 @@ async function processJob(job: Job & { id: string }, token: string) {
         throw new RacedRedirectError();
       }
     }
   }
-
   logger.debug("Logging job to DB...");
@@ -675,7 +674,10 @@ async function processJob(job: Job & { id: string }, token: string) {
   logger.debug("Declaring job as done...");
   await addCrawlJobDone(job.data.crawl_id, job.id, false);
-  await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(job.data.url, sc));
+  await redisConnection.srem(
+    "crawl:" + job.data.crawl_id + ":visited_unique",
+    normalizeURL(job.data.url, sc),
+  );
   logger.debug("Logging job to DB...");
   await logJob(
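One technique worth spelling out from the first hunk of this file: SADD returns how many of the offered members were newly added, so comparing that count against the number of URLs detects whether another worker already claimed any of them. A minimal sketch with ioredis (key layout is illustrative):

    import Redis from "ioredis";

    const redis = new Redis();

    // Atomically claim a batch of URLs for a crawl. SADD counts newly added
    // members, so a shortfall means some URL was already visited elsewhere.
    async function lockUrls(crawlId: string, urls: string[]): Promise<boolean> {
      const added = await redis.sadd("crawl:" + crawlId + ":visited", ...urls);
      return added === urls.length;
    }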