Merge pull request #978 from mendableai/nsc/timeout-fixes

Timeout fixes for user-defined timeouts
This commit is contained in:
Nicolas 2024-12-15 15:02:46 -03:00 committed by GitHub
commit 20f89c3478
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 30 additions and 18 deletions

View File

@ -5,8 +5,9 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler";
export async function scrapeURLWithFetch( export async function scrapeURLWithFetch(
meta: Meta, meta: Meta,
timeToRun: number | undefined
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const timeout = 20000; const timeout = timeToRun ?? 300000;
const response = await Promise.race([ const response = await Promise.race([
fetch(meta.url, { fetch(meta.url, {

View File

@ -18,8 +18,6 @@ import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities"; import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler"; import { specialtyScrapeCheck } from "../utils/specialtyHandler";
export const defaultTimeout = 10000;
// This function does not take `Meta` on purpose. It may not access any // This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the // meta values to construct the request -- that must be done by the
// `scrapeURLWithFireEngine*` functions. // `scrapeURLWithFireEngine*` functions.
@ -31,7 +29,7 @@ async function performFireEngineScrape<
>( >(
logger: Logger, logger: Logger,
request: FireEngineScrapeRequestCommon & Engine, request: FireEngineScrapeRequestCommon & Engine,
timeout = defaultTimeout, timeout: number,
): Promise<FireEngineCheckStatusSuccess> { ): Promise<FireEngineCheckStatusSuccess> {
const scrape = await fireEngineScrape( const scrape = await fireEngineScrape(
logger.child({ method: "fireEngineScrape" }), logger.child({ method: "fireEngineScrape" }),
@ -94,6 +92,7 @@ async function performFireEngineScrape<
export async function scrapeURLWithFireEngineChromeCDP( export async function scrapeURLWithFireEngineChromeCDP(
meta: Meta, meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const actions: Action[] = [ const actions: Action[] = [
// Transform waitFor option into an action (unsupported by chrome-cdp) // Transform waitFor option into an action (unsupported by chrome-cdp)
@ -121,7 +120,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
...(meta.options.actions ?? []), ...(meta.options.actions ?? []),
]; ];
const timeout = (meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3)); const timeout = timeToRun ?? 300000;
const request: FireEngineScrapeRequestCommon & const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestChromeCDP = { FireEngineScrapeRequestChromeCDP = {
@ -208,8 +207,9 @@ export async function scrapeURLWithFireEngineChromeCDP(
export async function scrapeURLWithFireEnginePlaywright( export async function scrapeURLWithFireEnginePlaywright(
meta: Meta, meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const timeout = meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3); const timeout = timeToRun ?? 300000;
const request: FireEngineScrapeRequestCommon & const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestPlaywright = { FireEngineScrapeRequestPlaywright = {
@ -267,8 +267,9 @@ export async function scrapeURLWithFireEnginePlaywright(
export async function scrapeURLWithFireEngineTLSClient( export async function scrapeURLWithFireEngineTLSClient(
meta: Meta, meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const timeout = meta.options.timeout === undefined ? 30000 : Math.round(meta.options.timeout / 3); const timeout = timeToRun ?? 30000;
const request: FireEngineScrapeRequestCommon & const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestTLSClient = { FireEngineScrapeRequestTLSClient = {

View File

@ -105,7 +105,7 @@ export type EngineScrapeResult = {
}; };
const engineHandlers: { const engineHandlers: {
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>; [E in Engine]: (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult>;
} = { } = {
cache: scrapeCache, cache: scrapeCache,
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP, "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
@ -372,6 +372,7 @@ export function buildFallbackList(meta: Meta): {
export async function scrapeURLWithEngine( export async function scrapeURLWithEngine(
meta: Meta, meta: Meta,
engine: Engine, engine: Engine,
timeToRun: number | undefined
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const fn = engineHandlers[engine]; const fn = engineHandlers[engine];
const logger = meta.logger.child({ const logger = meta.logger.child({
@ -383,5 +384,5 @@ export async function scrapeURLWithEngine(
logger, logger,
}; };
return await fn(_meta); return await fn(_meta, timeToRun);
} }

View File

@ -15,6 +15,7 @@ type PDFProcessorResult = { html: string; markdown?: string };
async function scrapePDFWithLlamaParse( async function scrapePDFWithLlamaParse(
meta: Meta, meta: Meta,
tempFilePath: string, tempFilePath: string,
timeToRun: number | undefined,
): Promise<PDFProcessorResult> { ): Promise<PDFProcessorResult> {
meta.logger.debug("Processing PDF document with LlamaIndex", { meta.logger.debug("Processing PDF document with LlamaIndex", {
tempFilePath, tempFilePath,
@ -63,8 +64,9 @@ async function scrapePDFWithLlamaParse(
// TODO: timeout, retries // TODO: timeout, retries
const startedAt = Date.now(); const startedAt = Date.now();
const timeout = timeToRun ?? 300000;
while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) { while (Date.now() <= startedAt + timeout) {
try { try {
const result = await robustFetch({ const result = await robustFetch({
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`, url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
@ -122,7 +124,7 @@ async function scrapePDFWithParsePDF(
}; };
} }
export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> { export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
if (!meta.options.parsePDF) { if (!meta.options.parsePDF) {
const file = await fetchFileToBuffer(meta.url); const file = await fetchFileToBuffer(meta.url);
const content = file.buffer.toString("base64"); const content = file.buffer.toString("base64");
@ -148,6 +150,7 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
}), }),
}, },
tempFilePath, tempFilePath,
timeToRun,
); );
} catch (error) { } catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") { if (error instanceof Error && error.message === "LlamaParse timed out") {

View File

@ -6,8 +6,9 @@ import { robustFetch } from "../../lib/fetch";
export async function scrapeURLWithPlaywright( export async function scrapeURLWithPlaywright(
meta: Meta, meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
const timeout = 20000 + meta.options.waitFor; const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
const response = await Promise.race([ const response = await Promise.race([
await robustFetch({ await robustFetch({
@ -30,7 +31,7 @@ export async function scrapeURLWithPlaywright(
}), }),
}), }),
(async () => { (async () => {
await new Promise((resolve) => setTimeout(() => resolve(null), 20000)); await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
throw new TimeoutError( throw new TimeoutError(
"Playwright was unable to scrape the page before timing out", "Playwright was unable to scrape the page before timing out",
{ cause: { timeout } }, { cause: { timeout } },

View File

@ -9,16 +9,17 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
export function scrapeURLWithScrapingBee( export function scrapeURLWithScrapingBee(
wait_browser: "domcontentloaded" | "networkidle2", wait_browser: "domcontentloaded" | "networkidle2",
): (meta: Meta) => Promise<EngineScrapeResult> { ): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
return async (meta: Meta): Promise<EngineScrapeResult> => { return async (meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> => {
let response: AxiosResponse<any>; let response: AxiosResponse<any>;
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
try { try {
response = await client.get({ response = await client.get({
url: meta.url, url: meta.url,
params: { params: {
timeout: 15000, // TODO: dynamic timeout based on request timeout timeout,
wait_browser: wait_browser, wait_browser: wait_browser,
wait: Math.min(meta.options.waitFor, 35000), wait: meta.options.waitFor,
transparent_status_code: true, transparent_status_code: true,
json_response: true, json_response: true,
screenshot: meta.options.formats.includes("screenshot"), screenshot: meta.options.formats.includes("screenshot"),

View File

@ -202,11 +202,15 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
const results: EngineResultsTracker = {}; const results: EngineResultsTracker = {};
let result: EngineScrapeResultWithContext | null = null; let result: EngineScrapeResultWithContext | null = null;
const timeToRun = meta.options.timeout !== undefined
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3))
: undefined
for (const { engine, unsupportedFeatures } of fallbackList) { for (const { engine, unsupportedFeatures } of fallbackList) {
const startedAt = Date.now(); const startedAt = Date.now();
try { try {
meta.logger.info("Scraping via " + engine + "..."); meta.logger.info("Scraping via " + engine + "...");
const _engineResult = await scrapeURLWithEngine(meta, engine); const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun);
if (_engineResult.markdown === undefined) { if (_engineResult.markdown === undefined) {
// Some engines emit Markdown directly. // Some engines emit Markdown directly.
_engineResult.markdown = await parseMarkdown(_engineResult.html); _engineResult.markdown = await parseMarkdown(_engineResult.html);