Merge pull request #978 from mendableai/nsc/timeout-fixes

Timeout fixes on user-defined timeouts
This commit is contained in:
Nicolas 2024-12-15 15:02:46 -03:00 committed by GitHub
commit 20f89c3478
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 30 additions and 18 deletions

View File

@ -5,8 +5,9 @@ import { specialtyScrapeCheck } from "../utils/specialtyHandler";
export async function scrapeURLWithFetch(
meta: Meta,
timeToRun: number | undefined
): Promise<EngineScrapeResult> {
const timeout = 20000;
const timeout = timeToRun ?? 300000;
const response = await Promise.race([
fetch(meta.url, {

View File

@ -18,8 +18,6 @@ import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
export const defaultTimeout = 10000;
// This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the
// `scrapeURLWithFireEngine*` functions.
@ -31,7 +29,7 @@ async function performFireEngineScrape<
>(
logger: Logger,
request: FireEngineScrapeRequestCommon & Engine,
timeout = defaultTimeout,
timeout: number,
): Promise<FireEngineCheckStatusSuccess> {
const scrape = await fireEngineScrape(
logger.child({ method: "fireEngineScrape" }),
@ -94,6 +92,7 @@ async function performFireEngineScrape<
export async function scrapeURLWithFireEngineChromeCDP(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const actions: Action[] = [
// Transform waitFor option into an action (unsupported by chrome-cdp)
@ -121,7 +120,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
...(meta.options.actions ?? []),
];
const timeout = (meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3));
const timeout = timeToRun ?? 300000;
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestChromeCDP = {
@ -208,8 +207,9 @@ export async function scrapeURLWithFireEngineChromeCDP(
export async function scrapeURLWithFireEnginePlaywright(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = meta.options.timeout === undefined ? 300000 : Math.round(meta.options.timeout / 3);
const timeout = timeToRun ?? 300000;
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestPlaywright = {
@ -267,8 +267,9 @@ export async function scrapeURLWithFireEnginePlaywright(
export async function scrapeURLWithFireEngineTLSClient(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = meta.options.timeout === undefined ? 30000 : Math.round(meta.options.timeout / 3);
const timeout = timeToRun ?? 30000;
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestTLSClient = {

View File

@ -105,7 +105,7 @@ export type EngineScrapeResult = {
};
const engineHandlers: {
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>;
[E in Engine]: (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult>;
} = {
cache: scrapeCache,
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
@ -372,6 +372,7 @@ export function buildFallbackList(meta: Meta): {
export async function scrapeURLWithEngine(
meta: Meta,
engine: Engine,
timeToRun: number | undefined
): Promise<EngineScrapeResult> {
const fn = engineHandlers[engine];
const logger = meta.logger.child({
@ -383,5 +384,5 @@ export async function scrapeURLWithEngine(
logger,
};
return await fn(_meta);
return await fn(_meta, timeToRun);
}

View File

@ -15,6 +15,7 @@ type PDFProcessorResult = { html: string; markdown?: string };
async function scrapePDFWithLlamaParse(
meta: Meta,
tempFilePath: string,
timeToRun: number | undefined,
): Promise<PDFProcessorResult> {
meta.logger.debug("Processing PDF document with LlamaIndex", {
tempFilePath,
@ -63,8 +64,9 @@ async function scrapePDFWithLlamaParse(
// TODO: timeout, retries
const startedAt = Date.now();
const timeout = timeToRun ?? 300000;
while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) {
while (Date.now() <= startedAt + timeout) {
try {
const result = await robustFetch({
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
@ -122,7 +124,7 @@ async function scrapePDFWithParsePDF(
};
}
export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
if (!meta.options.parsePDF) {
const file = await fetchFileToBuffer(meta.url);
const content = file.buffer.toString("base64");
@ -148,6 +150,7 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
}),
},
tempFilePath,
timeToRun,
);
} catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") {

View File

@ -6,8 +6,9 @@ import { robustFetch } from "../../lib/fetch";
export async function scrapeURLWithPlaywright(
meta: Meta,
timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
const timeout = 20000 + meta.options.waitFor;
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
const response = await Promise.race([
await robustFetch({
@ -30,7 +31,7 @@ export async function scrapeURLWithPlaywright(
}),
}),
(async () => {
await new Promise((resolve) => setTimeout(() => resolve(null), 20000));
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
throw new TimeoutError(
"Playwright was unable to scrape the page before timing out",
{ cause: { timeout } },

View File

@ -9,16 +9,17 @@ const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
export function scrapeURLWithScrapingBee(
wait_browser: "domcontentloaded" | "networkidle2",
): (meta: Meta) => Promise<EngineScrapeResult> {
return async (meta: Meta): Promise<EngineScrapeResult> => {
): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
return async (meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> => {
let response: AxiosResponse<any>;
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
try {
response = await client.get({
url: meta.url,
params: {
timeout: 15000, // TODO: dynamic timeout based on request timeout
timeout,
wait_browser: wait_browser,
wait: Math.min(meta.options.waitFor, 35000),
wait: meta.options.waitFor,
transparent_status_code: true,
json_response: true,
screenshot: meta.options.formats.includes("screenshot"),

View File

@ -202,11 +202,15 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
const results: EngineResultsTracker = {};
let result: EngineScrapeResultWithContext | null = null;
const timeToRun = meta.options.timeout !== undefined
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3))
: undefined
for (const { engine, unsupportedFeatures } of fallbackList) {
const startedAt = Date.now();
try {
meta.logger.info("Scraping via " + engine + "...");
const _engineResult = await scrapeURLWithEngine(meta, engine);
const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun);
if (_engineResult.markdown === undefined) {
// Some engines emit Markdown directly.
_engineResult.markdown = await parseMarkdown(_engineResult.html);