feat(v1/map): stop mapping if timed out via AbortController (#1205)

parent 2200f084f3
commit 46b187bc64
@@ -5,6 +5,7 @@ import {
   mapRequestSchema,
   RequestWithAuth,
   scrapeOptions,
+  TimeoutSignal,
 } from "./types";
 import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
 import { MapResponse, MapRequest } from "./types";
@@ -53,6 +54,7 @@ export async function getMapResults({
   origin,
   includeMetadata = false,
   allowExternalLinks,
+  abort = new AbortController().signal, // noop
 }: {
   url: string;
   search?: string;
@@ -65,6 +67,7 @@
   origin?: string;
   includeMetadata?: boolean;
   allowExternalLinks?: boolean;
+  abort?: AbortSignal;
 }): Promise<MapResult> {
   const id = uuidv4();
   let links: string[] = [url];
@@ -87,8 +90,8 @@
   const crawler = crawlToCrawler(id, sc);

   try {
-    sc.robots = await crawler.getRobotsTxt();
-    await crawler.importRobotsTxt(sc.robots);
+    sc.robots = await crawler.getRobotsTxt(false, abort);
+    crawler.importRobotsTxt(sc.robots);
   } catch (_) {}

   // If sitemapOnly is true, only get links from sitemap
@@ -102,6 +105,7 @@
       true,
       true,
       30000,
+      abort,
     );
     if (sitemap > 0) {
       links = links
@@ -144,7 +148,7 @@
       return fireEngineMap(mapUrl, {
         numResults: resultsPerPage,
         page: page,
-      });
+      }, abort);
     };

     pagePromises = Array.from({ length: maxPages }, (_, i) =>
@@ -157,7 +161,7 @@

     // Parallelize sitemap index query with search results
     const [sitemapIndexResult, ...searchResults] = await Promise.all([
-      querySitemapIndex(url),
+      querySitemapIndex(url, abort),
       ...(cachedResult ? [] : pagePromises),
     ]);

@@ -178,6 +182,7 @@
       true,
       false,
       30000,
+      abort,
     );
   } catch (e) {
     logger.warn("tryGetSitemap threw an error", { error: e });
@@ -277,6 +282,7 @@ export async function mapController(
   req.body = mapRequestSchema.parse(req.body);

   let result: Awaited<ReturnType<typeof getMapResults>>;
+  const abort = new AbortController();
   try {
     result = await Promise.race([
       getMapResults({
@@ -289,13 +295,17 @@
         origin: req.body.origin,
         teamId: req.auth.team_id,
         plan: req.auth.plan,
+        abort: abort.signal,
       }),
       ...(req.body.timeout !== undefined ? [
-        new Promise((resolve, reject) => setTimeout(() => reject("timeout"), req.body.timeout))
+        new Promise((resolve, reject) => setTimeout(() => {
+          abort.abort(new TimeoutSignal());
+          reject(new TimeoutSignal());
+        }, req.body.timeout))
       ] : []),
     ]) as any;
   } catch (error) {
-    if (error === "timeout") {
+    if (error instanceof TimeoutSignal || error === "timeout") {
       return res.status(408).json({
         success: false,
         error: "Request timed out",
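Note: the controller change above is the heart of the commit: the map request is raced against a timer, and when the timer fires it both rejects the race and aborts a shared AbortController so in-flight work stops instead of running to completion. A minimal sketch of that pattern, with simplified, hypothetical names (`work` stands in for `getMapResults`):

    class TimeoutSignal extends Error {
      constructor() { super("Operation timed out"); }
    }

    async function raceWithAbort<T>(
      work: (abort: AbortSignal) => Promise<T>,
      timeoutMs: number,
    ): Promise<T> {
      const controller = new AbortController();
      return Promise.race([
        work(controller.signal),
        new Promise<never>((_, reject) =>
          setTimeout(() => {
            controller.abort(new TimeoutSignal()); // cancels in-flight requests
            reject(new TimeoutSignal());           // loses the race for the caller
          }, timeoutMs),
        ),
      ]);
    }

Passing a reason to `controller.abort(reason)` matters: Node's `fetch` (undici) rejects with that reason, so the same `TimeoutSignal` instance surfaces in downstream catch blocks, while axios aborts too but wraps cancellation in its own `CanceledError`.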
@@ -1004,3 +1004,9 @@ export const generateLLMsTextRequestSchema = z.object({
 export type GenerateLLMsTextRequest = z.infer<
   typeof generateLLMsTextRequestSchema
 >;
+
+export class TimeoutSignal extends Error {
+  constructor() {
+    super("Operation timed out")
+  }
+}
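Note: because `TimeoutSignal` extends `Error`, every layer can tell deliberate cancellation apart from an ordinary failure with a plain `instanceof` check and rethrow it instead of logging and carrying on, which is the shape the crawler and sitemap handlers below take. A hedged sketch (the helper is hypothetical):

    class TimeoutSignal extends Error {} // stand-in for the class added above

    async function safeFetchSitemap(
      url: string,
      abort: AbortSignal,
      fetchOneSitemap: (u: string, s: AbortSignal) => Promise<void>, // hypothetical
    ): Promise<void> {
      try {
        await fetchOneSitemap(url, abort);
      } catch (error) {
        if (error instanceof TimeoutSignal) {
          throw error; // cancellation must propagate up to the controller
        }
        // anything else is non-fatal here: note it and keep crawling
        console.debug("sitemap fetch failed", { url, error });
      }
    }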
@@ -9,6 +9,7 @@ import { logger as _logger } from "../../lib/logger";
 import https from "https";
 import { redisConnection } from "../../services/queue-service";
 import { extractLinks } from "../../lib/html-transformer";
+import { TimeoutSignal } from "../../controllers/v1/types";
 export class WebCrawler {
   private jobId: string;
   private initialUrl: string;
@@ -182,7 +183,7 @@
       .slice(0, limit);
   }

-  public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
+  public async getRobotsTxt(skipTlsVerification = false, abort?: AbortSignal): Promise<string> {
     let extraArgs = {};
     if (skipTlsVerification) {
       extraArgs["httpsAgent"] = new https.Agent({
@@ -191,6 +192,7 @@
     }
     const response = await axios.get(this.robotsTxtUrl, {
       timeout: axiosTimeout,
+      signal: abort,
       ...extraArgs,
     });
     return response.data;
@@ -205,6 +207,7 @@
     fromMap: boolean = false,
     onlySitemap: boolean = false,
     timeout: number = 120000,
+    abort?: AbortSignal
   ): Promise<number> {
     this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
       method: "tryGetSitemap",
@@ -260,10 +263,10 @@
     try {
       let count = (await Promise.race([
         Promise.all([
-          this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler),
+          this.tryFetchSitemapLinks(this.initialUrl, _urlsHandler, abort),
           ...this.robots
             .getSitemaps()
-            .map((x) => this.tryFetchSitemapLinks(x, _urlsHandler)),
+            .map((x) => this.tryFetchSitemapLinks(x, _urlsHandler, abort)),
         ]).then((results) => results.reduce((a, x) => a + x, 0)),
         timeoutPromise,
       ])) as number;
@@ -555,6 +558,7 @@
   private async tryFetchSitemapLinks(
     url: string,
     urlsHandler: (urls: string[]) => unknown,
+    abort?: AbortSignal,
   ): Promise<number> {
     const sitemapUrl = url.endsWith(".xml")
       ? url
@@ -569,13 +573,18 @@
         this.logger,
         this.jobId,
         this.sitemapsHit,
+        abort,
       );
     } catch (error) {
-      this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
-        method: "tryFetchSitemapLinks",
-        sitemapUrl,
-        error,
-      });
+      if (error instanceof TimeoutSignal) {
+        throw error;
+      } else {
+        this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
+          method: "tryFetchSitemapLinks",
+          sitemapUrl,
+          error,
+        });
+      }
     }

     // If this is a subdomain, also try to get sitemap from the main domain
@@ -611,20 +620,29 @@
           this.logger,
           this.jobId,
           this.sitemapsHit,
+          abort,
         );
       } catch (error) {
-        this.logger.debug(
-          `Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
-          { method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
-        );
+        if (error instanceof TimeoutSignal) {
+          throw error;
+        } else {
+          this.logger.debug(
+            `Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
+            { method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
+          );
+        }
       }
     }
   } catch (error) {
-    this.logger.debug(`Error processing main domain sitemap`, {
-      method: "tryFetchSitemapLinks",
-      url,
-      error,
-    });
+    if (error instanceof TimeoutSignal) {
+      throw error;
+    } else {
+      this.logger.debug(`Error processing main domain sitemap`, {
+        method: "tryFetchSitemapLinks",
+        url,
+        error,
+      });
+    }
   }

   // If no sitemap found yet, try the baseUrl as a last resort
@@ -636,22 +654,28 @@
           this.logger,
           this.jobId,
           this.sitemapsHit,
+          abort,
         );
       } catch (error) {
-        this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
-          method: "tryFetchSitemapLinks",
-          sitemapUrl: baseUrlSitemap,
-          error,
-        });
-        if (error instanceof AxiosError && error.response?.status === 404) {
-          // ignore 404
+        if (error instanceof TimeoutSignal) {
+          throw error;
         } else {
-          sitemapCount += await getLinksFromSitemap(
-            { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
-            this.logger,
-            this.jobId,
-            this.sitemapsHit,
-          );
+          this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
+            method: "tryFetchSitemapLinks",
+            sitemapUrl: baseUrlSitemap,
+            error,
+          });
+          if (error instanceof AxiosError && error.response?.status === 404) {
+            // ignore 404
+          } else {
+            sitemapCount += await getLinksFromSitemap(
+              { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
+              this.logger,
+              this.jobId,
+              this.sitemapsHit,
+              abort,
+            );
+          }
         }
       }
     }
   }
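Note: axios, like `fetch`, accepts an `AbortSignal` through the `signal` request option (axios 0.22+), so the per-request `timeout` and an external deadline compose: whichever fires first cancels the HTTP request. A small sketch, assuming a reachable robots.txt URL:

    import axios from "axios";

    const controller = new AbortController();

    // Both limits apply: axios's own timeout, plus external cancellation.
    const robotsPromise = axios.get("https://example.com/robots.txt", {
      timeout: 10_000,           // per-request cap, in milliseconds
      signal: controller.signal, // honored when the mapping deadline hits
    });

    // Later, on a global deadline:
    controller.abort(); // robotsPromise rejects with axios's CanceledError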
@@ -12,10 +12,11 @@ import { supabase_service } from "../../services/supabase";
  */
 import { withAuth } from "../../lib/withAuth";

-async function querySitemapIndexFunction(url: string) {
+async function querySitemapIndexFunction(url: string, abort?: AbortSignal) {
   const originUrl = normalizeUrlOnlyHostname(url);

   for (let attempt = 1; attempt <= 3; attempt++) {
+    abort?.throwIfAborted();
     try {
       const { data, error } = await supabase_service
         .from("crawl_maps")
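Note: `AbortSignal#throwIfAborted()` (Node 17.3+) synchronously throws the signal's abort reason, which makes it a cheap guard at the top of a retry loop: once the deadline has passed, no further attempts start. A sketch under that assumption:

    async function queryWithRetries<T>(
      attempt: () => Promise<T>,
      abort?: AbortSignal,
    ): Promise<T | null> {
      for (let i = 1; i <= 3; i++) {
        abort?.throwIfAborted(); // throws the reason passed to abort()
        try {
          return await attempt();
        } catch {
          // transient failure: fall through to the next attempt
        }
      }
      return null;
    }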
@@ -1,8 +1,7 @@
-import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";
 import { WebCrawler } from "./crawler";
 import { scrapeURL } from "../scrapeURL";
-import { scrapeOptions } from "../../controllers/v1/types";
+import { scrapeOptions, TimeoutSignal } from "../../controllers/v1/types";
 import type { Logger } from "winston";
 const useFireEngine =
   process.env.FIRE_ENGINE_BETA_URL !== "" &&
@@ -20,6 +19,7 @@ export async function getLinksFromSitemap(
   logger: Logger,
   crawlId: string,
   sitemapsHit: Set<string>,
+  abort?: AbortSignal,
 ): Promise<number> {
   if (sitemapsHit.size >= 20) {
     return 0;
@@ -44,7 +44,8 @@
           "fetch",
           ...((mode === "fire-engine" && useFireEngine) ? ["fire-engine;tlsclient" as const] : []),
         ],
-        v0DisableJsDom: true
+        v0DisableJsDom: true,
+        abort,
       },
     );

@@ -69,14 +70,18 @@
       return 0;
     }
   } catch (error) {
-    logger.error(`Request failed for sitemap fetch`, {
-      method: "getLinksFromSitemap",
-      mode,
-      sitemapUrl,
-      error,
-    });
-
-    return 0;
+    if (error instanceof TimeoutSignal) {
+      throw error;
+    } else {
+      logger.error(`Request failed for sitemap fetch`, {
+        method: "getLinksFromSitemap",
+        mode,
+        sitemapUrl,
+        error,
+      });
+
+      return 0;
+    }
   }

   const parsed = await parseStringPromise(content);
@@ -90,7 +95,7 @@
       .map((sitemap) => sitemap.loc[0].trim());

     const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
-      getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit),
+      getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit, abort),
     );

     const results = await Promise.all(sitemapPromises);
@@ -114,6 +119,7 @@
           logger,
           crawlId,
           sitemapsHit,
+          abort,
         ),
       );
       count += (await Promise.all(sitemapPromises)).reduce(
@@ -151,56 +157,3 @@

   return 0;
 }
-
-export const fetchSitemapData = async (
-  url: string,
-  timeout?: number,
-): Promise<SitemapEntry[] | null> => {
-  const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
-  try {
-    const fetchResponse = await scrapeURL(
-      "sitemap",
-      sitemapUrl,
-      scrapeOptions.parse({
-        formats: ["rawHtml"],
-        timeout: timeout || axiosTimeout,
-      }),
-      { forceEngine: "fetch" },
-    );
-
-    if (
-      fetchResponse.success &&
-      fetchResponse.document.metadata.statusCode >= 200 &&
-      fetchResponse.document.metadata.statusCode < 300
-    ) {
-      const xml = fetchResponse.document.rawHtml!;
-      const parsedXml = await parseStringPromise(xml);
-
-      const sitemapData: SitemapEntry[] = [];
-      if (parsedXml.urlset && parsedXml.urlset.url) {
-        for (const urlElement of parsedXml.urlset.url) {
-          const sitemapEntry: SitemapEntry = { loc: urlElement.loc[0] };
-          if (urlElement.lastmod) sitemapEntry.lastmod = urlElement.lastmod[0];
-          if (urlElement.changefreq)
-            sitemapEntry.changefreq = urlElement.changefreq[0];
-          if (urlElement.priority)
-            sitemapEntry.priority = Number(urlElement.priority[0]);
-          sitemapData.push(sitemapEntry);
-        }
-      }
-
-      return sitemapData;
-    }
-    return null;
-  } catch (error) {
-    // Error handling for failed sitemap fetch
-  }
-  return [];
-};
-
-export interface SitemapEntry {
-  loc: string;
-  lastmod?: string;
-  changefreq?: string;
-  priority?: number;
-}
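Note: sitemap indexes nest, so the signal has to ride along on every recursive `getLinksFromSitemap` call; a single call site that drops it leaves an uncancellable branch of the recursion. A reduced sketch of the threading (the `fetchChildSitemaps` helper is hypothetical):

    async function collectSitemapLinks(
      sitemapUrl: string,
      seen: Set<string>,
      fetchChildSitemaps: (u: string, s?: AbortSignal) => Promise<string[]>,
      abort?: AbortSignal,
    ): Promise<string[]> {
      abort?.throwIfAborted(); // stop before starting more I/O
      if (seen.has(sitemapUrl) || seen.size >= 20) return [];
      seen.add(sitemapUrl);
      const children = await fetchChildSitemaps(sitemapUrl, abort);
      const nested = await Promise.all(
        children.map((child) =>
          collectSitemapLinks(child, seen, fetchChildSitemaps, abort), // keep threading
        ),
      );
      return nested.flat();
    }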
@@ -21,6 +21,7 @@ export async function scrapeURLWithFetch(
       dispatcher: await makeSecureDispatcher(meta.url),
       redirect: "follow",
       headers: meta.options.headers,
+      signal: meta.internalOptions.abort,
     }),
     (async () => {
       await new Promise((resolve) =>
@@ -85,6 +85,7 @@ export async function fireEngineCheckStatus(
   logger: Logger,
   jobId: string,
   mock: MockState | null,
+  abort?: AbortSignal,
 ): Promise<FireEngineCheckStatusSuccess> {
   const status = await Sentry.startSpan(
     {
@@ -24,8 +24,9 @@ import * as Sentry from "@sentry/node";
 import { Action } from "../../../../lib/entities";
 import { specialtyScrapeCheck } from "../utils/specialtyHandler";
 import { fireEngineDelete } from "./delete";
-import { MockState, saveMock } from "../../lib/mock";
+import { MockState } from "../../lib/mock";
 import { getInnerJSON } from "../../../../lib/html-transformer";
+import { TimeoutSignal } from "../../../../controllers/v1/types";

 // This function does not take `Meta` on purpose. It may not access any
 // meta values to construct the request -- that must be done by the
@@ -40,6 +41,7 @@ async function performFireEngineScrape<
   request: FireEngineScrapeRequestCommon & Engine,
   timeout: number,
   mock: MockState | null,
+  abort?: AbortSignal,
 ): Promise<FireEngineCheckStatusSuccess> {
   const scrape = await fireEngineScrape(
     logger.child({ method: "fireEngineScrape" }),
@@ -84,6 +86,7 @@
         logger.child({ method: "fireEngineCheckStatus" }),
         scrape.jobId,
         mock,
+        abort,
       );
     } catch (error) {
       if (error instanceof StillProcessingError) {
@@ -107,6 +110,16 @@
           jobId: scrape.jobId,
         });
         throw error;
+      } else if (error instanceof TimeoutSignal) {
+        fireEngineDelete(
+          logger.child({
+            method: "performFireEngineScrape/fireEngineDelete",
+            afterError: error,
+          }),
+          scrape.jobId,
+          mock,
+        );
+        throw error;
       } else {
         Sentry.captureException(error);
         errors.push(error);
@@ -219,6 +232,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
     request,
     timeout,
     meta.mock,
+    meta.internalOptions.abort,
   );

   if (
@@ -298,6 +312,7 @@ export async function scrapeURLWithFireEnginePlaywright(
     request,
     timeout,
     meta.mock,
+    meta.internalOptions.abort,
   );

   if (!response.url) {
@@ -353,6 +368,7 @@ export async function scrapeURLWithFireEngineTLSClient(
     request,
     timeout,
     meta.mock,
+    meta.internalOptions.abort,
   );

   if (!response.url) {
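Note: the `performFireEngineScrape` change above does more than rethrow: on timeout it also issues a best-effort `fireEngineDelete` so the remote job stops consuming resources after the client has given up, and the call is deliberately not awaited. A sketch of that cleanup-then-rethrow shape (all names assumed):

    class TimeoutSignal extends Error {} // stand-in for the class from controllers/v1/types

    async function pollWithCleanup<T>(
      poll: () => Promise<T>,            // hypothetical status-poll helper
      cancelRemote: () => Promise<void>, // hypothetical remote-job delete
    ): Promise<T> {
      try {
        return await poll();
      } catch (error) {
        if (error instanceof TimeoutSignal) {
          void cancelRemote(); // fire-and-forget: don't delay the rethrow
        }
        throw error;
      }
    }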
@@ -76,6 +76,7 @@ export async function fireEngineScrape<
   logger: Logger,
   request: FireEngineScrapeRequestCommon & Engine,
   mock: MockState | null,
+  abort?: AbortSignal,
 ): Promise<z.infer<typeof schema>> {
   const scrapeRequest = await Sentry.startSpan(
     {
@@ -101,6 +102,7 @@
         schema,
         tryCount: 3,
         mock,
+        abort,
       });
     },
   );
@@ -1,7 +1,7 @@
 import { Logger } from "winston";
 import * as Sentry from "@sentry/node";

-import { Document, ScrapeOptions } from "../../controllers/v1/types";
+import { Document, ScrapeOptions, TimeoutSignal } from "../../controllers/v1/types";
 import { logger as _logger } from "../../lib/logger";
 import {
   buildFallbackList,
@@ -165,6 +165,7 @@ export type InternalOptions = {
   disableSmartWaitCache?: boolean; // Passed along to fire-engine
   isBackgroundIndex?: boolean;
   fromCache?: boolean; // Indicates if the document was retrieved from cache
+  abort?: AbortSignal;
 };

 export type EngineResultsTracker = {
@@ -222,6 +223,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
     : undefined;

   for (const { engine, unsupportedFeatures } of fallbackList) {
+    meta.internalOptions.abort?.throwIfAborted();
     const startedAt = Date.now();
     try {
       meta.logger.info("Scraping via " + engine + "...");
@@ -307,6 +309,8 @@
       throw error;
     } else if (error instanceof UnsupportedFileError) {
       throw error;
+    } else if (error instanceof TimeoutSignal) {
+      throw error;
     } else {
       Sentry.captureException(error);
       meta.logger.warn(
@@ -433,6 +437,8 @@ export async function scrapeURL(
       meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
         error,
       });
+    } else if (error instanceof TimeoutSignal) {
+      throw error;
     } else {
       Sentry.captureException(error);
       meta.logger.error("scrapeURL: Unexpected error happened", { error });
@@ -2,6 +2,7 @@ import { Logger } from "winston";
 import { z, ZodError } from "zod";
 import * as Sentry from "@sentry/node";
 import { MockState, saveMock } from "./mock";
+import { TimeoutSignal } from "../../../controllers/v1/types";
 import { fireEngineURL } from "../engines/fire-engine/scrape";

 export type RobustFetchParams<Schema extends z.Schema<any>> = {
@@ -18,6 +19,7 @@ export type RobustFetchParams<Schema extends z.Schema<any>> = {
   tryCount?: number;
   tryCooldown?: number;
   mock: MockState | null;
+  abort?: AbortSignal;
 };

 export async function robustFetch<
@@ -36,7 +38,10 @@
   tryCount = 1,
   tryCooldown,
   mock,
+  abort,
 }: RobustFetchParams<Schema>): Promise<Output> {
+  abort?.throwIfAborted();
+
   const params = {
     url,
     logger,
@@ -48,6 +53,7 @@
     ignoreFailure,
     tryCount,
     tryCooldown,
+    abort,
   };

   let response: {
@@ -71,6 +77,7 @@
         : {}),
       ...(headers !== undefined ? headers : {}),
     },
+    signal: abort,
     ...(body instanceof FormData
       ? {
           body,
@@ -82,7 +89,9 @@
         : {}),
     });
   } catch (error) {
-    if (!ignoreFailure) {
+    if (error instanceof TimeoutSignal) {
+      throw error;
+    } else if (!ignoreFailure) {
       Sentry.captureException(error);
       if (tryCount > 1) {
         logger.debug(
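Note: `robustFetch` retries transient failures, but a timeout must never be retried: by the time `TimeoutSignal` appears, the whole operation has already been abandoned upstream. The patch therefore checks for it before the retry/ignore logic. With Node's `fetch`, aborting with a reason makes the promise reject with that very reason, so the check is a direct `instanceof`. Sketch under those assumptions:

    class TimeoutSignal extends Error {} // stand-in for the class from controllers/v1/types

    async function fetchNoRetryOnTimeout(
      url: string,
      tries: number,
      abort?: AbortSignal,
    ): Promise<Response> {
      let lastError: unknown;
      for (let i = 0; i < tries; i++) {
        abort?.throwIfAborted();
        try {
          return await fetch(url, { signal: abort });
        } catch (error) {
          if (error instanceof TimeoutSignal) throw error; // never retry a timeout
          lastError = error; // transient: try again
        }
      }
      throw lastError;
    }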
@@ -305,6 +305,7 @@ export async function performLLMExtract(
   document: Document,
 ): Promise<Document> {
   if (meta.options.formats.includes("extract")) {
+    meta.internalOptions.abort?.throwIfAborted();
     const { extract, warning } = await generateOpenAICompletions(
       meta.logger.child({
         method: "performLLMExtract/generateOpenAICompletions",
@@ -16,6 +16,7 @@ export async function fireEngineMap(
     numResults: number;
     page?: number;
   },
+  abort?: AbortSignal,
 ): Promise<SearchResult[]> {
   try {
     let data = JSON.stringify({
@@ -40,6 +41,7 @@
         "X-Disable-Cache": "true",
       },
       body: data,
+      signal: abort,
     });

     if (response.ok) {