Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-12 21:39:06 +08:00)
chore: formatting
This commit is contained in:
parent
b9f621bed5
commit
3b6edef9fa
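For context, every hunk below is a pure formatting change: long calls and signatures are wrapped, trailing commas are added to multi-line argument lists, single-parameter arrows get parentheses, and stray blank lines are dropped. That is the shape of output Prettier produces at its defaults; a minimal config sketch follows (an assumption for illustration, not taken from the repository):

// prettier.config.cjs — illustrative only; the repo's real config may differ
module.exports = {
  printWidth: 80,        // wrap lines longer than 80 columns
  trailingComma: "all",  // trailing commas on multi-line calls and params
  arrowParens: "always", // (x) => x.href instead of x => x.href
  semi: true,            // keep terminating semicolons
};

With a config like this, a run of `npx prettier --write .` yields diffs of exactly this shape.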
@@ -60,7 +60,11 @@ export async function scrapeController(
   try {
     doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
   } catch (e) {
-    logger.error(`Error in scrapeController: ${e}`, { jobId, scrapeId: jobId, startTime });
+    logger.error(`Error in scrapeController: ${e}`, {
+      jobId,
+      scrapeId: jobId,
+      startTime,
+    });
     if (
       e instanceof Error &&
       (e.message.startsWith("Job wait") || e.message === "timeout")
@@ -94,7 +94,11 @@ export async function addCrawlJobDone(
     await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
   } else {
     // in case it's already been pushed, make sure it's removed
-    await redisConnection.lrem("crawl:" + id + ":jobs_done_ordered", -1, job_id);
+    await redisConnection.lrem(
+      "crawl:" + id + ":jobs_done_ordered",
+      -1,
+      job_id,
+    );
   }
 
   await redisConnection.expire(
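For reference, Redis `LREM key count element` with a negative count removes matches while scanning from the tail of the list, so the `-1` here deletes at most one occurrence of `job_id`, starting from the most recently pushed end. A standalone ioredis sketch of the push/remove pair (the `markDone` helper and its `failed` flag are hypothetical, not the project's actual wiring):

import Redis from "ioredis";

const redis = new Redis();

async function markDone(crawlId: string, jobId: string, failed: boolean) {
  const key = "crawl:" + crawlId + ":jobs_done_ordered";
  if (!failed) {
    await redis.rpush(key, jobId); // append to the ordered done-list
  } else {
    // remove one occurrence, scanning from the tail (count = -1)
    await redis.lrem(key, -1, jobId);
  }
}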
@@ -122,4 +122,3 @@
 // },
 // };
 // }
-
@@ -5,7 +5,7 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler";
 
 export async function scrapeURLWithFetch(
   meta: Meta,
-  timeToRun: number | undefined
+  timeToRun: number | undefined,
 ): Promise<EngineScrapeResult> {
   const timeout = timeToRun ?? 300000;
 
@@ -105,7 +105,10 @@ export type EngineScrapeResult = {
 };
 
 const engineHandlers: {
-  [E in Engine]: (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult>;
+  [E in Engine]: (
+    meta: Meta,
+    timeToRun: number | undefined,
+  ) => Promise<EngineScrapeResult>;
 } = {
   cache: scrapeCache,
   "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
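The `[E in Engine]:` mapped type makes the handler table exhaustive: if a member is added to the `Engine` union without a matching entry in `engineHandlers`, the object literal no longer type-checks. A reduced sketch of the pattern, with made-up engine names and simplified types:

type Engine = "fetch" | "playwright" | "pdf"; // illustrative union, not the real one

type Meta = { url: string };
type EngineScrapeResult = { html: string };

// Every key of the union must be present, and each value must match the signature.
const handlers: {
  [E in Engine]: (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult>;
} = {
  fetch: async (meta) => ({ html: `<fetched ${meta.url}>` }),
  playwright: async (meta) => ({ html: `<rendered ${meta.url}>` }),
  pdf: async (meta) => ({ html: `<pdf ${meta.url}>` }),
};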
@@ -372,7 +375,7 @@ export function buildFallbackList(meta: Meta): {
 export async function scrapeURLWithEngine(
   meta: Meta,
   engine: Engine,
-  timeToRun: number | undefined
+  timeToRun: number | undefined,
 ): Promise<EngineScrapeResult> {
   const fn = engineHandlers[engine];
   const logger = meta.logger.child({
@@ -124,7 +124,10 @@ async function scrapePDFWithParsePDF(
   };
 }
 
-export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
+export async function scrapePDF(
+  meta: Meta,
+  timeToRun: number | undefined,
+): Promise<EngineScrapeResult> {
   if (!meta.options.parsePDF) {
     const file = await fetchFileToBuffer(meta.url);
     const content = file.buffer.toString("base64");
@@ -152,9 +155,12 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
     tempFilePath,
   );
 
-
   // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
-  if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
+  if (
+    result.markdown &&
+    result.markdown.length < 500 &&
+    process.env.LLAMAPARSE_API_KEY
+  ) {
     try {
       const llamaResult = await scrapePDFWithLlamaParse(
         {
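The comment in this hunk captures the control flow: parse the PDF locally first, and only if the result looks too thin (under 500 characters of markdown) and a LlamaParse key is configured, retry with the hosted parser. A hedged sketch of that fallback shape, with hypothetical helper names standing in for the real parsers:

// Hypothetical stand-ins for the real parsers (names are illustrative).
async function parseLocally(path: string): Promise<{ markdown: string }> {
  return { markdown: `local parse of ${path}` };
}
async function parseWithLlamaParse(path: string): Promise<{ markdown: string }> {
  return { markdown: `llamaparse result for ${path}` };
}

async function parsePdf(path: string): Promise<{ markdown: string }> {
  const local = await parseLocally(path);

  // Cheap path produced substantial output, or no key configured: keep it.
  if (local.markdown.length >= 500 || !process.env.LLAMAPARSE_API_KEY) {
    return local;
  }

  try {
    // Thin output and a key is available: try the hosted parser instead.
    return await parseWithLlamaParse(path);
  } catch {
    // In this sketch, fall back to whatever the local parse produced.
    return local;
  }
}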
@@ -10,7 +10,10 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
 export function scrapeURLWithScrapingBee(
   wait_browser: "domcontentloaded" | "networkidle2",
 ): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
-  return async (meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> => {
+  return async (
+    meta: Meta,
+    timeToRun: number | undefined,
+  ): Promise<EngineScrapeResult> => {
     let response: AxiosResponse<any>;
     const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
     try {
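`scrapeURLWithScrapingBee` is a factory: it is called once per `wait_browser` mode and returns a closure with the common `(meta, timeToRun)` handler signature, so both variants can sit in the `engineHandlers` table. A stripped-down sketch of that shape (types and the `makeScrapingBeeHandler` name are illustrative):

type Meta = { url: string };
type EngineScrapeResult = { html: string };
type Handler = (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult>;

// Factory: captures the wait mode and returns a handler with the shared signature.
function makeScrapingBeeHandler(
  waitBrowser: "domcontentloaded" | "networkidle2",
): Handler {
  return async (meta, timeToRun) => {
    const timeout = timeToRun ?? 300_000;
    // A real implementation would call the ScrapingBee API here.
    return { html: `<scraped ${meta.url} wait=${waitBrowser} timeout=${timeout}>` };
  };
}

const domHandler = makeScrapingBeeHandler("domcontentloaded");
const idleHandler = makeScrapingBeeHandler("networkidle2");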
@@ -60,9 +60,7 @@ export class SiteError extends Error {
 export class ActionError extends Error {
   public code: string;
   constructor(code: string) {
-    super(
-      "Action(s) failed to complete. Error code: " + code,
-    );
+    super("Action(s) failed to complete. Error code: " + code);
     this.code = code;
   }
 }
@@ -203,9 +203,10 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
   const results: EngineResultsTracker = {};
   let result: EngineScrapeResultWithContext | null = null;
 
-  const timeToRun = meta.options.timeout !== undefined
-    ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
-    : undefined
+  const timeToRun =
+    meta.options.timeout !== undefined
+      ? Math.round(meta.options.timeout / Math.min(fallbackList.length, 2))
+      : undefined;
 
   for (const { engine, unsupportedFeatures } of fallbackList) {
     const startedAt = Date.now();
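The budget arithmetic here is worth spelling out: the caller's overall `timeout` is divided by `Math.min(fallbackList.length, 2)`, so a single-engine list gets the whole budget and any longer list gives each attempt half of it. For example, with a 30 000 ms timeout and three engines in the fallback list, each attempt is capped at `Math.round(30000 / 2)` = 15 000 ms. A tiny sketch of the same calculation (the `perEngineBudget` helper is just for illustration):

// Mirrors the per-engine budget split shown in the hunk above.
function perEngineBudget(timeout: number | undefined, fallbackCount: number): number | undefined {
  return timeout !== undefined
    ? Math.round(timeout / Math.min(fallbackCount, 2))
    : undefined;
}

console.log(perEngineBudget(30_000, 1));    // 30000 – single engine gets the whole budget
console.log(perEngineBudget(30_000, 3));    // 15000 – capped at a half-share
console.log(perEngineBudget(undefined, 3)); // undefined – no timeout configured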
@@ -72,7 +72,12 @@ async function addScrapeJobRaw(
   }
 
   if (concurrencyLimited) {
-    await _addScrapeJobToConcurrencyQueue(webScraperOptions, options, jobId, jobPriority);
+    await _addScrapeJobToConcurrencyQueue(
+      webScraperOptions,
+      options,
+      jobId,
+      jobPriority,
+    );
   } else {
     await _addScrapeJobToBullMQ(webScraperOptions, options, jobId, jobPriority);
   }
@@ -130,17 +135,17 @@ export async function addScrapeJobs(
 
   let countCanBeDirectlyAdded = Infinity;
 
-  if (
-    jobs[0].data &&
-    jobs[0].data.team_id &&
-    jobs[0].data.plan
-  ) {
+  if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) {
     const now = Date.now();
     const limit = await getConcurrencyLimitMax(jobs[0].data.plan);
     console.log("CC limit", limit);
     cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now);
 
-    countCanBeDirectlyAdded = Math.max(limit - (await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length, 0);
+    countCanBeDirectlyAdded = Math.max(
+      limit -
+        (await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length,
+      0,
+    );
   }
 
   const addToBull = jobs.slice(0, countCanBeDirectlyAdded);
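The intent of `countCanBeDirectlyAdded` is a simple headroom calculation: the plan's concurrency limit minus the team's currently active jobs, clamped at zero; that many jobs go straight to BullMQ and the rest wait behind the concurrency queue. A small worked sketch with made-up numbers (variable names beyond `countCanBeDirectlyAdded` and `addToBull` are illustrative):

// Illustrative numbers: plan allows 10 concurrent jobs, 7 are already active.
const limit = 10;
const activeJobs = 7;
const jobs = Array.from({ length: 5 }, (_, i) => ({ id: `job-${i}` }));

const countCanBeDirectlyAdded = Math.max(limit - activeJobs, 0); // 3

const addToBull = jobs.slice(0, countCanBeDirectlyAdded);          // job-0..job-2 run now
const heldForConcurrencyQueue = jobs.slice(countCanBeDirectlyAdded); // job-3, job-4 wait

console.log(addToBull.length, heldForConcurrencyQueue.length); // 3 2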
@@ -496,7 +496,7 @@ async function processJob(job: Job & { id: string }, token: string) {
       // See lockURL
       const x = await redisConnection.sadd(
         "crawl:" + job.data.crawl_id + ":visited",
-        ...p1.map(x => x.href),
+        ...p1.map((x) => x.href),
       );
       const lockRes = x === p1.length;
 
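The `lockRes` check relies on Redis `SADD` semantics: the command returns the number of members that were actually added, not counting ones already in the set, so `x === p1.length` holds only if every href was newly recorded in the `:visited` set. A minimal ioredis illustration (keys and URLs are made up):

import Redis from "ioredis";

const redis = new Redis();

async function demo() {
  await redis.del("visited-demo");
  await redis.sadd("visited-demo", "https://a.example/");

  // One of the two members is already present, so only 1 is newly added.
  const added = await redis.sadd("visited-demo", "https://a.example/", "https://b.example/");
  console.log(added); // 1 — not equal to 2, so not every URL was newly "locked" here
}

demo().finally(() => redis.disconnect());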
@@ -504,7 +504,6 @@ async function processJob(job: Job & { id: string }, token: string) {
         throw new RacedRedirectError();
       }
     }
-
   }
 
   logger.debug("Logging job to DB...");
@@ -675,7 +674,10 @@ async function processJob(job: Job & { id: string }, token: string) {
 
     logger.debug("Declaring job as done...");
     await addCrawlJobDone(job.data.crawl_id, job.id, false);
-    await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(job.data.url, sc));
+    await redisConnection.srem(
+      "crawl:" + job.data.crawl_id + ":visited_unique",
+      normalizeURL(job.data.url, sc),
+    );
 
     logger.debug("Logging job to DB...");
     await logJob(