feat(v1/map): stop mapping if timed out via AbortController (#1205)

Gergő Móricz authored on 2025-02-20 00:42:13 +01:00, committed by GitHub
parent 2200f084f3
commit 46b187bc64
13 changed files with 137 additions and 105 deletions

View File

@@ -5,6 +5,7 @@ import {
   mapRequestSchema,
   RequestWithAuth,
   scrapeOptions,
+  TimeoutSignal,
 } from "./types";
 import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
 import { MapResponse, MapRequest } from "./types";
@@ -53,6 +54,7 @@ export async function getMapResults({
   origin,
   includeMetadata = false,
   allowExternalLinks,
+  abort = new AbortController().signal, // noop
 }: {
   url: string;
   search?: string;
@@ -65,6 +67,7 @@
   origin?: string;
   includeMetadata?: boolean;
   allowExternalLinks?: boolean;
+  abort?: AbortSignal;
 }): Promise<MapResult> {
   const id = uuidv4();
   let links: string[] = [url];
@@ -87,8 +90,8 @@
   const crawler = crawlToCrawler(id, sc);

   try {
-    sc.robots = await crawler.getRobotsTxt();
-    await crawler.importRobotsTxt(sc.robots);
+    sc.robots = await crawler.getRobotsTxt(false, abort);
+    crawler.importRobotsTxt(sc.robots);
   } catch (_) {}

   // If sitemapOnly is true, only get links from sitemap
@@ -102,6 +105,7 @@
       true,
       true,
       30000,
+      abort,
     );
     if (sitemap > 0) {
       links = links
@@ -144,7 +148,7 @@
       return fireEngineMap(mapUrl, {
         numResults: resultsPerPage,
         page: page,
-      });
+      }, abort);
     };

     pagePromises = Array.from({ length: maxPages }, (_, i) =>
@@ -157,7 +161,7 @@
     // Parallelize sitemap index query with search results
     const [sitemapIndexResult, ...searchResults] = await Promise.all([
-      querySitemapIndex(url),
+      querySitemapIndex(url, abort),
       ...(cachedResult ? [] : pagePromises),
     ]);
@@ -178,6 +182,7 @@
         true,
         false,
         30000,
+        abort,
       );
     } catch (e) {
       logger.warn("tryGetSitemap threw an error", { error: e });
@@ -277,6 +282,7 @@ export async function mapController(
   req.body = mapRequestSchema.parse(req.body);

   let result: Awaited<ReturnType<typeof getMapResults>>;
+  const abort = new AbortController();
   try {
     result = await Promise.race([
       getMapResults({
@@ -289,13 +295,17 @@
         origin: req.body.origin,
         teamId: req.auth.team_id,
         plan: req.auth.plan,
+        abort: abort.signal,
       }),
       ...(req.body.timeout !== undefined ? [
-        new Promise((resolve, reject) => setTimeout(() => reject("timeout"), req.body.timeout))
+        new Promise((resolve, reject) => setTimeout(() => {
+          abort.abort(new TimeoutSignal());
+          reject(new TimeoutSignal());
+        }, req.body.timeout))
       ] : []),
     ]) as any;
   } catch (error) {
-    if (error === "timeout") {
+    if (error instanceof TimeoutSignal || error === "timeout") {
       return res.status(408).json({
         success: false,
         error: "Request timed out",

View File

@@ -1004,3 +1004,9 @@ export const generateLLMsTextRequestSchema = z.object({
 export type GenerateLLMsTextRequest = z.infer<
   typeof generateLLMsTextRequestSchema
 >;
+
+export class TimeoutSignal extends Error {
+  constructor() {
+    super("Operation timed out")
+  }
+}
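
AbortController.abort(reason) stores the reason on its signal, and signal.throwIfAborted() rethrows that exact value, so a TimeoutSignal created at the timer arrives intact at whichever catch block finally inspects it. A standalone illustration (restating the class so the snippet runs on its own):

class TimeoutSignal extends Error {
  constructor() {
    super("Operation timed out");
  }
}

const controller = new AbortController();
controller.abort(new TimeoutSignal()); // the reason is stored on the signal

try {
  controller.signal.throwIfAborted(); // rethrows the stored reason as-is
} catch (error) {
  console.log(error instanceof TimeoutSignal); // true
}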

View File

@@ -9,6 +9,7 @@ import { logger as _logger } from "../../lib/logger";
 import https from "https";
 import { redisConnection } from "../../services/queue-service";
 import { extractLinks } from "../../lib/html-transformer";
+import { TimeoutSignal } from "../../controllers/v1/types";

 export class WebCrawler {
   private jobId: string;
   private initialUrl: string;
@@ -182,7 +183,7 @@
       .slice(0, limit);
   }

-  public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
+  public async getRobotsTxt(skipTlsVerification = false, abort?: AbortSignal): Promise<string> {
     let extraArgs = {};
     if (skipTlsVerification) {
       extraArgs["httpsAgent"] = new https.Agent({
@@ -191,6 +192,7 @@
     }
     const response = await axios.get(this.robotsTxtUrl, {
       timeout: axiosTimeout,
+      signal: abort,
       ...extraArgs,
     });
     return response.data;
@@ -205,6 +207,7 @@
     fromMap: boolean = false,
     onlySitemap: boolean = false,
     timeout: number = 120000,
+    abort?: AbortSignal,
   ): Promise<number> {
     this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
       method: "tryGetSitemap",
@@ -260,10 +263,10 @@
     try {
       let count = (await Promise.race([
         Promise.all([
-          this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler),
+          this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler, abort),
           ...this.robots
             .getSitemaps()
-            .map((x) => this.tryFetchSitemapLinks(x, _urlsHandler)),
+            .map((x) => this.tryFetchSitemapLinks(x, _urlsHandler, abort)),
         ]).then((results) => results.reduce((a, x) => a + x, 0)),
         timeoutPromise,
       ])) as number;
@@ -555,6 +558,7 @@
   private async tryFetchSitemapLinks(
     url: string,
     urlsHandler: (urls: string[]) => unknown,
+    abort?: AbortSignal,
   ): Promise<number> {
     const sitemapUrl = url.endsWith(".xml")
       ? url
@@ -569,13 +573,18 @@
         this.logger,
         this.jobId,
         this.sitemapsHit,
+        abort,
       );
     } catch (error) {
-      this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
-        method: "tryFetchSitemapLinks",
-        sitemapUrl,
-        error,
-      });
+      if (error instanceof TimeoutSignal) {
+        throw error;
+      } else {
+        this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
+          method: "tryFetchSitemapLinks",
+          sitemapUrl,
+          error,
+        });
+      }
     }

     // If this is a subdomain, also try to get sitemap from the main domain
@@ -611,20 +620,29 @@
           this.logger,
           this.jobId,
           this.sitemapsHit,
+          abort,
         );
       } catch (error) {
-        this.logger.debug(
-          `Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
-          { method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
-        );
+        if (error instanceof TimeoutSignal) {
+          throw error;
+        } else {
+          this.logger.debug(
+            `Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
+            { method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
+          );
+        }
       }
     }
   } catch (error) {
-    this.logger.debug(`Error processing main domain sitemap`, {
-      method: "tryFetchSitemapLinks",
-      url,
-      error,
-    });
+    if (error instanceof TimeoutSignal) {
+      throw error;
+    } else {
+      this.logger.debug(`Error processing main domain sitemap`, {
+        method: "tryFetchSitemapLinks",
+        url,
+        error,
+      });
+    }
   }

   // If no sitemap found yet, try the baseUrl as a last resort
@@ -636,22 +654,28 @@
       this.logger,
       this.jobId,
       this.sitemapsHit,
+      abort,
     );
   } catch (error) {
-    this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
-      method: "tryFetchSitemapLinks",
-      sitemapUrl: baseUrlSitemap,
-      error,
-    });
-    if (error instanceof AxiosError && error.response?.status === 404) {
-      // ignore 404
+    if (error instanceof TimeoutSignal) {
+      throw error;
     } else {
-      sitemapCount += await getLinksFromSitemap(
-        { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
-        this.logger,
-        this.jobId,
-        this.sitemapsHit,
-      );
+      this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
+        method: "tryFetchSitemapLinks",
+        sitemapUrl: baseUrlSitemap,
+        error,
+      });
+      if (error instanceof AxiosError && error.response?.status === 404) {
+        // ignore 404
+      } else {
+        sitemapCount += await getLinksFromSitemap(
+          { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
+          this.logger,
+          this.jobId,
+          this.sitemapsHit,
+          abort,
+        );
+      }
     }
   }
 }
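
getRobotsTxt now combines two mechanisms: axios' own per-request timeout and the caller's AbortSignal, passed through the signal config option axios supports. A reduced sketch (fetchRobotsTxt and the timeout value are illustrative, not from the codebase):

import axios from "axios";

// Sketch: a per-request timeout plus a caller-supplied signal for the
// overall deadline. Axios rejects the request once either one fires.
async function fetchRobotsTxt(robotsUrl: string, abort?: AbortSignal): Promise<string> {
  const response = await axios.get(robotsUrl, {
    timeout: 10_000, // axios' own socket/response timeout (value illustrative)
    signal: abort,   // the caller's broader deadline
  });
  return response.data;
}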

View File

@@ -12,10 +12,11 @@ import { supabase_service } from "../../services/supabase";
  */
 import { withAuth } from "../../lib/withAuth";

-async function querySitemapIndexFunction(url: string) {
+async function querySitemapIndexFunction(url: string, abort?: AbortSignal) {
   const originUrl = normalizeUrlOnlyHostname(url);

   for (let attempt = 1; attempt <= 3; attempt++) {
+    abort?.throwIfAborted();
     try {
       const { data, error } = await supabase_service
         .from("crawl_maps")

View File

@@ -1,8 +1,7 @@
-import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";
 import { WebCrawler } from "./crawler";
 import { scrapeURL } from "../scrapeURL";
-import { scrapeOptions } from "../../controllers/v1/types";
+import { scrapeOptions, TimeoutSignal } from "../../controllers/v1/types";
 import type { Logger } from "winston";

 const useFireEngine =
   process.env.FIRE_ENGINE_BETA_URL !== "" &&
@@ -20,6 +19,7 @@ export async function getLinksFromSitemap(
   logger: Logger,
   crawlId: string,
   sitemapsHit: Set<string>,
+  abort?: AbortSignal,
 ): Promise<number> {
   if (sitemapsHit.size >= 20) {
     return 0;
@@ -44,7 +44,8 @@
         "fetch",
         ...((mode === "fire-engine" && useFireEngine) ? ["fire-engine;tlsclient" as const] : []),
       ],
-      v0DisableJsDom: true
+      v0DisableJsDom: true,
+      abort,
     },
   );
@@ -69,14 +70,18 @@
       return 0;
     }
   } catch (error) {
-    logger.error(`Request failed for sitemap fetch`, {
-      method: "getLinksFromSitemap",
-      mode,
-      sitemapUrl,
-      error,
-    });
-
-    return 0;
+    if (error instanceof TimeoutSignal) {
+      throw error;
+    } else {
+      logger.error(`Request failed for sitemap fetch`, {
+        method: "getLinksFromSitemap",
+        mode,
+        sitemapUrl,
+        error,
+      });
+
+      return 0;
+    }
   }

   const parsed = await parseStringPromise(content);
@@ -90,7 +95,7 @@
       .map((sitemap) => sitemap.loc[0].trim());

     const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
-      getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit),
+      getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit, abort),
     );

     const results = await Promise.all(sitemapPromises);
@@ -114,6 +119,7 @@
           logger,
           crawlId,
           sitemapsHit,
+          abort,
         ),
       );
       count += (await Promise.all(sitemapPromises)).reduce(
@@ -151,56 +157,3 @@
   return 0;
 }
-
-export const fetchSitemapData = async (
-  url: string,
-  timeout?: number,
-): Promise<SitemapEntry[] | null> => {
-  const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
-  try {
-    const fetchResponse = await scrapeURL(
-      "sitemap",
-      sitemapUrl,
-      scrapeOptions.parse({
-        formats: ["rawHtml"],
-        timeout: timeout || axiosTimeout,
-      }),
-      { forceEngine: "fetch" },
-    );
-    if (
-      fetchResponse.success &&
-      fetchResponse.document.metadata.statusCode >= 200 &&
-      fetchResponse.document.metadata.statusCode < 300
-    ) {
-      const xml = fetchResponse.document.rawHtml!;
-      const parsedXml = await parseStringPromise(xml);
-
-      const sitemapData: SitemapEntry[] = [];
-      if (parsedXml.urlset && parsedXml.urlset.url) {
-        for (const urlElement of parsedXml.urlset.url) {
-          const sitemapEntry: SitemapEntry = { loc: urlElement.loc[0] };
-          if (urlElement.lastmod) sitemapEntry.lastmod = urlElement.lastmod[0];
-          if (urlElement.changefreq)
-            sitemapEntry.changefreq = urlElement.changefreq[0];
-          if (urlElement.priority)
-            sitemapEntry.priority = Number(urlElement.priority[0]);
-          sitemapData.push(sitemapEntry);
-        }
-      }
-
-      return sitemapData;
-    }
-    return null;
-  } catch (error) {
-    // Error handling for failed sitemap fetch
-  }
-  return [];
-};
-
-export interface SitemapEntry {
-  loc: string;
-  lastmod?: string;
-  changefreq?: string;
-  priority?: number;
-}
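
getLinksFromSitemap threads the same signal into every recursive call for nested sitemap indexes, so aborting one controller stops the whole fan-out, and rethrowing TimeoutSignal keeps a single timeout from being logged as many independent fetch failures. (The hunk above also deletes the now-unused fetchSitemapData helper and its SitemapEntry type.) A reduced sketch of the recursive shape (walkSitemap, fetchXml, and childSitemapUrls are illustrative):

// Sketch: one AbortSignal threaded through a recursive sitemap-index walk.
async function walkSitemap(
  url: string,
  fetchXml: (url: string, abort?: AbortSignal) => Promise<string>,
  childSitemapUrls: (xml: string) => string[],
  abort?: AbortSignal,
): Promise<number> {
  abort?.throwIfAborted(); // stop before fetching once the deadline has passed
  const xml = await fetchXml(url, abort);
  // Every child shares the signal, so a single abort stops all branches at once.
  const counts = await Promise.all(
    childSitemapUrls(xml).map((child) =>
      walkSitemap(child, fetchXml, childSitemapUrls, abort),
    ),
  );
  return counts.reduce((a, x) => a + x, 1);
}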

View File

@@ -21,6 +21,7 @@ export async function scrapeURLWithFetch(
       dispatcher: await makeSecureDispatcher(meta.url),
       redirect: "follow",
       headers: meta.options.headers,
+      signal: meta.internalOptions.abort,
     }),
     (async () => {
       await new Promise((resolve) =>
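
For the plain fetch engine the change is a single line: WHATWG fetch (undici in Node) rejects an in-flight request with the signal's abort reason, so a TimeoutSignal used as that reason can surface directly to the caller. A sketch (fetchPage is illustrative):

// Sketch: native fetch rejects with the signal's abort reason once it fires.
async function fetchPage(url: string, abort?: AbortSignal): Promise<string> {
  const response = await fetch(url, {
    redirect: "follow",
    signal: abort,
  });
  return await response.text();
}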

View File

@@ -85,6 +85,7 @@ export async function fireEngineCheckStatus(
   logger: Logger,
   jobId: string,
   mock: MockState | null,
+  abort?: AbortSignal,
 ): Promise<FireEngineCheckStatusSuccess> {
   const status = await Sentry.startSpan(
     {

View File

@@ -24,8 +24,9 @@ import * as Sentry from "@sentry/node";
 import { Action } from "../../../../lib/entities";
 import { specialtyScrapeCheck } from "../utils/specialtyHandler";
 import { fireEngineDelete } from "./delete";
-import { MockState, saveMock } from "../../lib/mock";
+import { MockState } from "../../lib/mock";
 import { getInnerJSON } from "../../../../lib/html-transformer";
+import { TimeoutSignal } from "../../../../controllers/v1/types";

 // This function does not take `Meta` on purpose. It may not access any
 // meta values to construct the request -- that must be done by the
@@ -40,6 +41,7 @@
   request: FireEngineScrapeRequestCommon & Engine,
   timeout: number,
   mock: MockState | null,
+  abort?: AbortSignal,
 ): Promise<FireEngineCheckStatusSuccess> {
   const scrape = await fireEngineScrape(
     logger.child({ method: "fireEngineScrape" }),
@@ -84,6 +86,7 @@
         logger.child({ method: "fireEngineCheckStatus" }),
         scrape.jobId,
         mock,
+        abort,
       );
     } catch (error) {
       if (error instanceof StillProcessingError) {
@@ -107,6 +110,16 @@
           jobId: scrape.jobId,
         });
         throw error;
+      } else if (error instanceof TimeoutSignal) {
+        fireEngineDelete(
+          logger.child({
+            method: "performFireEngineScrape/fireEngineDelete",
+            afterError: error,
+          }),
+          scrape.jobId,
+          mock,
+        );
+        throw error;
       } else {
         Sentry.captureException(error);
         errors.push(error);
@@ -219,6 +232,7 @@
     request,
     timeout,
     meta.mock,
+    meta.internalOptions.abort,
   );

   if (
@@ -298,6 +312,7 @@
     request,
     timeout,
     meta.mock,
+    meta.internalOptions.abort,
   );

   if (!response.url) {
@@ -353,6 +368,7 @@
     request,
     timeout,
     meta.mock,
+    meta.internalOptions.abort,
   );

   if (!response.url) {
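
The new TimeoutSignal branch in performFireEngineScrape encodes a cleanup rule: when the deadline fires mid-poll, the remote fire-engine job is still deleted, and the delete is deliberately not awaited so slow cleanup cannot delay the timeout response. A condensed sketch (pollWithCleanup, pollOnce, and deleteRemoteJob are illustrative):

class TimeoutSignal extends Error {} // stand-in for the class defined in types above

// Sketch: if the deadline interrupts polling, still delete the remote job,
// but don't await the delete -- cleanup must not delay the timeout response.
async function pollWithCleanup<T>(
  pollOnce: () => Promise<T>,
  deleteRemoteJob: () => Promise<void>,
): Promise<T> {
  try {
    return await pollOnce();
  } catch (error) {
    if (error instanceof TimeoutSignal) {
      deleteRemoteJob().catch(() => {}); // fire-and-forget
    }
    throw error;
  }
}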

View File

@@ -76,6 +76,7 @@ export async function fireEngineScrape<
   logger: Logger,
   request: FireEngineScrapeRequestCommon & Engine,
   mock: MockState | null,
+  abort?: AbortSignal,
 ): Promise<z.infer<typeof schema>> {
   const scrapeRequest = await Sentry.startSpan(
     {
@@ -101,6 +102,7 @@
       schema,
       tryCount: 3,
       mock,
+      abort,
     });
   },
 );

View File

@@ -1,7 +1,7 @@
 import { Logger } from "winston";
 import * as Sentry from "@sentry/node";
-import { Document, ScrapeOptions } from "../../controllers/v1/types";
+import { Document, ScrapeOptions, TimeoutSignal } from "../../controllers/v1/types";
 import { logger as _logger } from "../../lib/logger";
 import {
   buildFallbackList,
@@ -165,6 +165,7 @@ export type InternalOptions = {
   disableSmartWaitCache?: boolean; // Passed along to fire-engine
   isBackgroundIndex?: boolean;
   fromCache?: boolean; // Indicates if the document was retrieved from cache
+  abort?: AbortSignal;
 };

 export type EngineResultsTracker = {
@@ -222,6 +223,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
     : undefined;

   for (const { engine, unsupportedFeatures } of fallbackList) {
+    meta.internalOptions.abort?.throwIfAborted();
     const startedAt = Date.now();
     try {
       meta.logger.info("Scraping via " + engine + "...");
@@ -307,6 +309,8 @@
       throw error;
     } else if (error instanceof UnsupportedFileError) {
       throw error;
+    } else if (error instanceof TimeoutSignal) {
+      throw error;
     } else {
       Sentry.captureException(error);
       meta.logger.warn(
@@ -433,6 +437,8 @@
       meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
         error,
       });
+    } else if (error instanceof TimeoutSignal) {
+      throw error;
     } else {
       Sentry.captureException(error);
       meta.logger.error("scrapeURL: Unexpected error happened", { error });
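
scrapeURL's engine loop normally swallows a failure and falls through to the next engine in the fallback list; the added instanceof TimeoutSignal branches exempt timeouts from that recovery, since retrying on another engine after the deadline has passed is wasted work. The control flow, reduced to a sketch (scrapeWithFallbacks and engines are illustrative):

class TimeoutSignal extends Error {} // stand-in for the class defined in types above

// Sketch: try engines in order, but never "recover" from a timeout.
async function scrapeWithFallbacks<T>(
  engines: Array<() => Promise<T>>,
  abort?: AbortSignal,
): Promise<T> {
  for (const runEngine of engines) {
    abort?.throwIfAborted(); // don't start another engine past the deadline
    try {
      return await runEngine();
    } catch (error) {
      if (error instanceof TimeoutSignal) throw error; // timeout is terminal
      // any other failure: fall through to the next engine
    }
  }
  throw new Error("all engines failed");
}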

View File

@@ -2,6 +2,7 @@ import { Logger } from "winston";
 import { z, ZodError } from "zod";
 import * as Sentry from "@sentry/node";
 import { MockState, saveMock } from "./mock";
+import { TimeoutSignal } from "../../../controllers/v1/types";
 import { fireEngineURL } from "../engines/fire-engine/scrape";

 export type RobustFetchParams<Schema extends z.Schema<any>> = {
@@ -18,6 +19,7 @@
   tryCount?: number;
   tryCooldown?: number;
   mock: MockState | null;
+  abort?: AbortSignal;
 };

 export async function robustFetch<
@@ -36,7 +38,10 @@
   tryCount = 1,
   tryCooldown,
   mock,
+  abort,
 }: RobustFetchParams<Schema>): Promise<Output> {
+  abort?.throwIfAborted();
+
   const params = {
     url,
     logger,
@@ -48,6 +53,7 @@
     ignoreFailure,
     tryCount,
     tryCooldown,
+    abort,
   };

   let response: {
@@ -71,6 +77,7 @@
         : {}),
       ...(headers !== undefined ? headers : {}),
     },
+    signal: abort,
     ...(body instanceof FormData
       ? {
           body,
@@ -82,7 +89,9 @@
       : {}),
   });
 } catch (error) {
-  if (!ignoreFailure) {
+  if (error instanceof TimeoutSignal) {
+    throw error;
+  } else if (!ignoreFailure) {
     Sentry.captureException(error);
     if (tryCount > 1) {
       logger.debug(
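
robustFetch applies the same three moves at the lowest level: check the signal before doing any work, hand it to fetch, and exempt TimeoutSignal from the retry/ignore machinery. Because abort is also copied into the params bundle, retried calls can inherit the deadline as well. A compact sketch (fetchWithRetry is illustrative):

// Sketch: an abort-aware fetch wrapper whose retries inherit the same deadline.
async function fetchWithRetry(
  url: string,
  tryCount: number,
  abort?: AbortSignal,
): Promise<Response> {
  abort?.throwIfAborted(); // cheap early exit before any network work
  try {
    return await fetch(url, { signal: abort });
  } catch (error) {
    if (abort?.aborted || tryCount <= 1) throw error; // aborts are terminal
    return await fetchWithRetry(url, tryCount - 1, abort);
  }
}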

View File

@@ -305,6 +305,7 @@ export async function performLLMExtract(
   document: Document,
 ): Promise<Document> {
   if (meta.options.formats.includes("extract")) {
+    meta.internalOptions.abort?.throwIfAborted();
     const { extract, warning } = await generateOpenAICompletions(
       meta.logger.child({
         method: "performLLMExtract/generateOpenAICompletions",

View File

@@ -16,6 +16,7 @@ export async function fireEngineMap(
     numResults: number;
     page?: number;
   },
+  abort?: AbortSignal,
 ): Promise<SearchResult[]> {
   try {
     let data = JSON.stringify({
@@ -40,6 +41,7 @@
         "X-Disable-Cache": "true",
       },
       body: data,
+      signal: abort,
     });

     if (response.ok) {