Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-14 13:45:57 +08:00)
feat(runWebScraper): retry a scrape up to 3 times in a crawl if the status code indicates failure
parent 6b17a53d4b
commit e74e4bcefc
@@ -1,7 +1,19 @@
 const fs = require("fs");
 
-const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
-    .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
+// METHOD: Winston log file
+// const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
+//     .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
+
+// METHOD: GCloud export
+const logs = [
+    "downloaded-logs-20241213-225607.json",
+    "downloaded-logs-20241213-225654.json",
+    "downloaded-logs-20241213-225720.json",
+    "downloaded-logs-20241213-225758.json",
+    "downloaded-logs-20241213-225825.json",
+    "downloaded-logs-20241213-225843.json",
+].flatMap(x => JSON.parse(fs.readFileSync(x, "utf8"))).map(x => x.jsonPayload);
 
 const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))];
 
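The hunk above switches the ad-hoc debugging script from reading a single Winston log file to aggregating several GCloud log export files into one `logs` array (keeping only each entry's `jsonPayload`) before collecting the distinct crawl IDs. A minimal sketch of how the aggregated entries could then be grouped per crawl for inspection; the grouping code below is illustrative only and not part of the commit:

// Illustrative follow-on: bucket the aggregated log entries by crawlId so each
// crawl can be inspected on its own. Assumes the same `logs` and `crawlIds`
// values built by the script above.
const logsByCrawl = new Map();
for (const entry of logs) {
  if (!entry.crawlId) continue;
  const bucket = logsByCrawl.get(entry.crawlId) ?? [];
  bucket.push(entry);
  logsByCrawl.set(entry.crawlId, bucket);
}
for (const crawlId of crawlIds) {
  console.log(crawlId, (logsByCrawl.get(crawlId) ?? []).length, "entries");
}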
@@ -8,7 +8,6 @@ import { authenticateUser } from "../auth";
 import { PlanType, RateLimiterMode } from "../../types";
 import { logJob } from "../../services/logging/log_job";
 import {
-  Document,
   fromLegacyCombo,
   toLegacyDocument,
   url as urlSchema,
@@ -29,6 +28,7 @@ import * as Sentry from "@sentry/node";
 import { getJobPriority } from "../../lib/job-priority";
 import { fromLegacyScrapeOptions } from "../v1/types";
 import { ZodError } from "zod";
+import { Document as V0Document } from "./../../lib/entities";
 
 export async function scrapeHelper(
   jobId: string,
@@ -42,7 +42,7 @@ export async function scrapeHelper(
 ): Promise<{
   success: boolean;
   error?: string;
-  data?: Document | { url: string };
+  data?: V0Document | { url: string };
   returnCode: number;
 }> {
   const url = urlSchema.parse(req.body.url);
@@ -241,9 +241,9 @@ export async function scrapeController(req: Request, res: Response) {
   const endTime = new Date().getTime();
   const timeTakenInSeconds = (endTime - startTime) / 1000;
   const numTokens =
-    result.data && (result.data as Document).markdown
+    result.data && (result.data as V0Document).markdown
       ? numTokensFromString(
-          (result.data as Document).markdown!,
+          (result.data as V0Document).markdown!,
           "gpt-3.5-turbo",
         )
       : 0;
@@ -276,14 +276,14 @@ export async function scrapeController(req: Request, res: Response) {
 
   let doc = result.data;
   if (!pageOptions || !pageOptions.includeRawHtml) {
-    if (doc && (doc as Document).rawHtml) {
-      delete (doc as Document).rawHtml;
+    if (doc && (doc as V0Document).rawHtml) {
+      delete (doc as V0Document).rawHtml;
     }
   }
 
   if (pageOptions && pageOptions.includeExtract) {
-    if (!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
-      delete (doc as Document).markdown;
+    if (!pageOptions.includeMarkdown && doc && (doc as V0Document).markdown) {
+      delete (doc as V0Document).markdown;
     }
   }
 
@@ -1,6 +1,6 @@
 import { Request, Response } from "express";
 import {
-  // Document,
+  Document,
   RequestWithAuth,
   ExtractRequest,
   extractRequestSchema,
@@ -8,7 +8,7 @@ import {
   MapDocument,
   scrapeOptions,
 } from "./types";
-import { Document } from "../../lib/entities";
+// import { Document } from "../../lib/entities";
 import Redis from "ioredis";
 import { configDotenv } from "dotenv";
 import { performRanking } from "../../lib/ranker";
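Taken together, these import hunks make the v1 controllers use the `Document` type exported from `./types` directly, while the v0 scrape controller keeps the legacy entity under the alias `V0Document`. A self-contained sketch of the aliasing pattern; the type bodies below are illustrative stand-ins, not the real Firecrawl types:

// When two modules export the same identifier, an import alias keeps both
// usable in one file, e.g. `import { Document as V0Document } from ...`.
// Illustrative shapes only:
type V1DocumentShape = { markdown?: string; metadata: { statusCode: number } };
type V0DocumentShape = { markdown?: string; rawHtml?: string };

function hasMarkdown(doc: V1DocumentShape | V0DocumentShape): boolean {
  // Both shapes expose an optional markdown field, so one helper can accept either.
  return typeof doc.markdown === "string";
}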
@@ -396,7 +396,7 @@ export type Document = {
   articleSection?: string;
   url?: string;
   sourceURL?: string;
-  statusCode?: number;
+  statusCode: number;
   error?: string;
   [key: string]: string | string[] | number | undefined;
 };
@@ -49,6 +49,7 @@ export async function startWebScraperPipeline({
     bull_job_id: job.id.toString(),
     priority: job.opts.priority,
     is_scrape: job.data.is_scrape ?? false,
+    is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null),
   });
 }
 
@@ -63,9 +64,23 @@ export async function runWebScraper({
   bull_job_id,
   priority,
   is_scrape = false,
+  is_crawl = false,
 }: RunWebScraperParams): Promise<ScrapeUrlResponse> {
+  const tries = is_crawl ? 3 : 1;
+
   let response: ScrapeUrlResponse | undefined = undefined;
   let engines: EngineResultsTracker = {};
+  let error: any = undefined;
+
+  for (let i = 0; i < tries; i++) {
+    if (i > 0) {
+      logger.debug("Retrying scrape...", { scrapeId: bull_job_id, jobId: bull_job_id, method: "runWebScraper", module: "runWebScraper", tries, i, previousStatusCode: (response as any)?.document?.metadata?.statusCode, previousError: error });
+    }
+
+    response = undefined;
+    engines = {};
+    error = undefined;
+
     try {
       response = await scrapeURL(bull_job_id, url, scrapeOptions, {
         priority,
@@ -86,25 +101,15 @@ export async function runWebScraper({
       }
     }
 
-    if (is_scrape === false) {
-      let creditsToBeBilled = 1; // Assuming 1 credit per document
-      if (scrapeOptions.extract) {
-        creditsToBeBilled = 5;
-      }
-
-      billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
-        logger.error(
-          `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
-        );
-        // Optionally, you could notify an admin or add to a retry queue here
-      });
-    }
-
     // This is where the returnvalue from the job is set
     // onSuccess(response.document, mode);
 
     engines = response.engines;
-    return response;
+
+    if ((response.document.metadata.statusCode >= 200 && response.document.metadata.statusCode < 300) || response.document.metadata.statusCode === 304) {
+      // status code is good -- do not attempt retry
+      break;
+    }
   } catch (error) {
     engines =
       response !== undefined
@@ -112,23 +117,9 @@ export async function runWebScraper({
         : typeof error === "object" && error !== null
           ? ((error as any).results ?? {})
           : {};
-
-    if (response !== undefined) {
-      return {
-        ...response,
-        success: false,
-        error,
-      };
-    } else {
-      return {
-        success: false,
-        error,
-        logs: ["no logs -- error coming from runWebScraper"],
-        engines,
-      };
-    }
     }
-    // onError(error);
-  } finally {
+  }
+
   const engineOrder = Object.entries(engines)
     .sort((a, b) => a[1].startedAt - b[1].startedAt)
     .map((x) => x[0]) as Engine[];
@@ -158,6 +149,38 @@ export async function runWebScraper({
       },
     });
   }
 
+  if (error === undefined && response?.success) {
+    if (is_scrape === false) {
+      let creditsToBeBilled = 1; // Assuming 1 credit per document
+      if (scrapeOptions.extract) {
+        creditsToBeBilled = 5;
+      }
+
+      billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
+        logger.error(
+          `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
+        );
+        // Optionally, you could notify an admin or add to a retry queue here
+      });
+    }
+
+    return response;
+  } else {
+    if (response !== undefined) {
+      return {
+        ...response,
+        success: false,
+        error,
+      };
+    } else {
+      return {
+        success: false,
+        error,
+        logs: ["no logs -- error coming from runWebScraper"],
+        engines,
+      };
+    }
+  }
 }
 
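The net effect of these hunks: the existing try/catch around `scrapeURL` is wrapped in a bounded loop (up to 3 attempts when the job belongs to a crawl, 1 otherwise), a 2xx or 304 status breaks out early, and billing plus the final success/failure return are deferred until after the loop so only the last attempt is billed and reported. A standalone sketch of that control flow; `attemptScrape` and `Attempt` are illustrative stand-ins, not part of the Firecrawl API:

// Standalone sketch of the retry control flow added in this commit.
type Attempt = { success: boolean; statusCode: number };

async function scrapeWithRetry(
  attemptScrape: () => Promise<Attempt>,
  isCrawl: boolean,
): Promise<Attempt | undefined> {
  const tries = isCrawl ? 3 : 1;
  let last: Attempt | undefined;

  for (let i = 0; i < tries; i++) {
    try {
      last = await attemptScrape();
      // 2xx and 304 are treated as good: stop retrying.
      if ((last.statusCode >= 200 && last.statusCode < 300) || last.statusCode === 304) {
        break;
      }
    } catch {
      last = undefined; // record the failed attempt and let the next iteration retry
    }
  }
  return last;
}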
@@ -5,7 +5,7 @@ import { Meta } from "..";
 export function extractMetadata(
   meta: Meta,
   html: string,
-): Document["metadata"] {
+): Partial<Document["metadata"]> {
   let title: string | undefined = undefined;
   let description: string | undefined = undefined;
   let language: string | undefined = undefined;
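Widening the return type of `extractMetadata` to `Partial<Document["metadata"]>` is the counterpart to making `statusCode` required on `Document` above: the HTML parser cannot supply the status code, which comes from the HTTP response, so the complete metadata object is only assembled once both sources are merged. A minimal sketch of that composition, with illustrative names only:

// Illustrative only: page-derived fields are Partial, and the required
// statusCode is filled in where the HTTP response is known.
type PageMetadata = { title?: string; statusCode: number };

function metadataFromHtml(html: string): Partial<PageMetadata> {
  const match = html.match(/<title>(.*?)<\/title>/i);
  return match ? { title: match[1] } : {};
}

function buildMetadata(html: string, statusCode: number): PageMetadata {
  return { ...metadataFromHtml(html), statusCode };
}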
@@ -55,6 +55,7 @@ export interface RunWebScraperParams {
   bull_job_id: string;
   priority?: number;
   is_scrape?: boolean;
+  is_crawl?: boolean;
 }
 
 export type RunWebScraperResult =