feat(scrapeURL): proxy auto mode (FIR-1853) (#1551)

* feat(scrapeURL): proxy auto mode

* feat(api/tests/snips/proxy/auto): add test for stealth pick
This commit is contained in:
Gergő Móricz 2025-05-19 19:43:03 +02:00 committed by GitHub
parent 8eeb3c5cd4
commit fab4f00536
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 131 additions and 16 deletions

View File

@ -31,6 +31,11 @@ function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>)
/**
 * Test helper: performs a scrape via `scrapeRaw`, asserts that the request
 * succeeded, and returns the scraped document from the response body.
 *
 * It also verifies that the reported `metadata.proxyUsed` agrees with the
 * requested proxy: "stealth" when stealth was requested, "basic" when no proxy
 * or "basic" was requested. Any other proxy value (e.g. "auto") is not checked
 * here, so callers can assert on the outcome themselves.
 */
export async function scrape(body: ScrapeRequestInput): Promise<Document> {
  const raw = await scrapeRaw(body);
  expectScrapeToSucceed(raw);

  // Work out which proxy the response must report, if the request pins one.
  const requestedProxy = body.proxy;
  let expectedProxy: "basic" | "stealth" | undefined;
  if (requestedProxy === "stealth") {
    expectedProxy = "stealth";
  } else if (!requestedProxy || requestedProxy === "basic") {
    expectedProxy = "basic";
  }

  if (expectedProxy !== undefined) {
    expect(raw.body.data.metadata.proxyUsed).toBe(expectedProxy);
  }

  return raw.body.data;
}

View File

@ -275,6 +275,26 @@ describe("Scrape tests", () => {
timeout: 120000,
});
}, 130000);
// With proxy "auto", a site that the basic proxy can scrape should be
// reported as scraped through the basic proxy.
it.concurrent("auto works properly on non-stealth site", async () => {
  const { metadata } = await scrape({
    url: "http://firecrawl.dev",
    proxy: "auto",
    timeout: 120000,
  });

  expect(metadata.proxyUsed).toBe("basic");
}, 130000);
// With proxy "auto", a response that looks proxy-blocked should end up being
// reported as scraped through the stealth proxy.
// NOTE(review): this relies on httpstat.us/403 answering with HTTP 403 — confirm
// the endpoint stays available, since the test depends on an external service.
it.concurrent("auto works properly on 'stealth' site (faked for reliabile testing)", async () => {
  const { metadata } = await scrape({
    url: "https://httpstat.us/403",
    proxy: "auto",
    timeout: 120000,
  });

  expect(metadata.proxyUsed).toBe("stealth");
}, 130000);
});
// Temporarily disabled, too flaky

View File

@ -140,6 +140,7 @@ export async function scrapeController(
if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) {
creditsToBeBilled = 5;
}
if (req.body.agent?.model?.toLowerCase() === "fire-1" || req.body.extract?.agent?.model?.toLowerCase() === "fire-1" || req.body.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") {
if (process.env.USE_DB_AUTHENTICATION === "true") {
// @Nick this is a hack pushed at 2AM pls help - mogery
@ -155,7 +156,7 @@ export async function scrapeController(
}
}
if (req.body.proxy === "stealth") {
if (doc?.metadata?.proxyUsed === "stealth") {
creditsToBeBilled += 4;
}

View File

@ -145,6 +145,7 @@ async function scrapeSearchResult(
metadata: {
statusCode,
error: error.message,
proxyUsed: "basic",
},
};
}

View File

@ -308,7 +308,7 @@ const baseScrapeOptions = z
fastMode: z.boolean().default(false),
useMock: z.string().optional(),
blockAds: z.boolean().default(true),
proxy: z.enum(["basic", "stealth"]).optional(),
proxy: z.enum(["basic", "stealth", "auto"]).optional(),
})
.strict(strictMessage);
@ -360,7 +360,7 @@ const extractTransform = (obj) => {
obj = { ...obj, timeout: 300000 };
}
if (obj.proxy === "stealth" && obj.timeout === 30000) {
if ((obj.proxy === "stealth" || obj.proxy === "auto") && obj.timeout === 30000) {
obj = { ...obj, timeout: 120000 };
}
@ -748,6 +748,7 @@ export type Document = {
statusCode: number;
scrapeId?: string;
error?: string;
proxyUsed: "basic" | "stealth";
// [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
};
serpResults?: {

View File

@ -223,7 +223,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
timeout, // TODO: better timeout logic
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
blockAds: meta.options.blockAds,
mobileProxy: meta.options.proxy === undefined ? undefined : meta.options.proxy === "stealth" ? true : false,
mobileProxy: meta.featureFlags.has("stealthProxy"),
saveScrapeResultToGCS: meta.internalOptions.saveScrapeResultToGCS,
// TODO: scrollXPaths
};
@ -304,7 +304,7 @@ export async function scrapeURLWithFireEnginePlaywright(
wait: meta.options.waitFor,
geolocation: meta.options.geolocation ?? meta.options.location,
blockAds: meta.options.blockAds,
mobileProxy: meta.options.proxy === undefined ? undefined : meta.options.proxy === "stealth" ? true : false,
mobileProxy: meta.featureFlags.has("stealthProxy"),
timeout,
};
@ -360,7 +360,7 @@ export async function scrapeURLWithFireEngineTLSClient(
atsv: meta.internalOptions.atsv,
geolocation: meta.options.geolocation ?? meta.options.location,
disableJsDom: meta.internalOptions.v0DisableJsDom,
mobileProxy: meta.options.proxy === undefined ? undefined : meta.options.proxy === "stealth" ? true : false,
mobileProxy: meta.featureFlags.has("stealthProxy"),
timeout,
};

View File

@ -14,8 +14,12 @@ import { scrapeCache } from "./cache";
export type Engine =
| "fire-engine;chrome-cdp"
| "fire-engine(retry);chrome-cdp"
| "fire-engine;chrome-cdp;stealth"
| "fire-engine(retry);chrome-cdp;stealth"
| "fire-engine;playwright"
| "fire-engine;playwright;stealth"
| "fire-engine;tlsclient"
| "fire-engine;tlsclient;stealth"
| "playwright"
| "fetch"
| "pdf"
@ -37,9 +41,13 @@ export const engines: Engine[] = [
...(useFireEngine
? [
"fire-engine;chrome-cdp" as const,
"fire-engine;chrome-cdp;stealth" as const,
"fire-engine(retry);chrome-cdp" as const,
"fire-engine(retry);chrome-cdp;stealth" as const,
"fire-engine;playwright" as const,
"fire-engine;playwright;stealth" as const,
"fire-engine;tlsclient" as const,
"fire-engine;tlsclient;stealth" as const,
]
: []),
...(usePlaywright ? ["playwright" as const] : []),
@ -112,8 +120,12 @@ const engineHandlers: {
cache: scrapeCache,
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
"fire-engine(retry);chrome-cdp": scrapeURLWithFireEngineChromeCDP,
"fire-engine;chrome-cdp;stealth": scrapeURLWithFireEngineChromeCDP,
"fire-engine(retry);chrome-cdp;stealth": scrapeURLWithFireEngineChromeCDP,
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
"fire-engine;playwright;stealth": scrapeURLWithFireEnginePlaywright,
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
"fire-engine;tlsclient;stealth": scrapeURLWithFireEngineTLSClient,
playwright: scrapeURLWithPlaywright,
fetch: scrapeURLWithFetch,
pdf: scrapePDF,
@ -126,7 +138,7 @@ export const engineOptions: {
features: { [F in FeatureFlag]: boolean };
// This defines the order of engines in general. The engine with the highest quality will be used the most.
// Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX
// Negative quality numbers are reserved for specialty engines, e.g. PDF, DOCX, stealth proxies
quality: number;
};
} = {
@ -160,7 +172,7 @@ export const engineOptions: {
mobile: true,
skipTlsVerification: true,
useFastMode: false,
stealthProxy: true,
stealthProxy: false,
},
quality: 50,
},
@ -177,10 +189,44 @@ export const engineOptions: {
mobile: true,
skipTlsVerification: true,
useFastMode: false,
stealthProxy: true,
stealthProxy: false,
},
quality: 45,
},
"fire-engine;chrome-cdp;stealth": {
features: {
actions: true,
waitFor: true, // through actions transform
screenshot: true, // through actions transform
"screenshot@fullScreen": true, // through actions transform
pdf: false,
docx: false,
atsv: false,
location: true,
mobile: true,
skipTlsVerification: true,
useFastMode: false,
stealthProxy: true,
},
quality: -1,
},
"fire-engine(retry);chrome-cdp;stealth": {
features: {
actions: true,
waitFor: true, // through actions transform
screenshot: true, // through actions transform
"screenshot@fullScreen": true, // through actions transform
pdf: false,
docx: false,
atsv: false,
location: true,
mobile: true,
skipTlsVerification: true,
useFastMode: false,
stealthProxy: true,
},
quality: -5,
},
"fire-engine;playwright": {
features: {
actions: false,
@ -194,10 +240,27 @@ export const engineOptions: {
mobile: false,
skipTlsVerification: false,
useFastMode: false,
stealthProxy: true,
stealthProxy: false,
},
quality: 40,
},
"fire-engine;playwright;stealth": {
features: {
actions: false,
waitFor: true,
screenshot: true,
"screenshot@fullScreen": true,
pdf: false,
docx: false,
atsv: false,
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: false,
stealthProxy: true,
},
quality: -10,
},
playwright: {
features: {
actions: false,
@ -228,10 +291,27 @@ export const engineOptions: {
mobile: false,
skipTlsVerification: false,
useFastMode: true,
stealthProxy: true,
stealthProxy: false,
},
quality: 10,
},
"fire-engine;tlsclient;stealth": {
features: {
actions: false,
waitFor: false,
screenshot: false,
"screenshot@fullScreen": false,
pdf: false,
docx: false,
atsv: true,
location: true,
mobile: false,
skipTlsVerification: false,
useFastMode: true,
stealthProxy: true,
},
quality: -15,
},
fetch: {
features: {
actions: false,
@ -264,7 +344,7 @@ export const engineOptions: {
useFastMode: true,
stealthProxy: true, // kinda...
},
quality: -10,
quality: -20,
},
docx: {
features: {
@ -281,7 +361,7 @@ export const engineOptions: {
useFastMode: true,
stealthProxy: true, // kinda...
},
quality: -10,
quality: -20,
},
};
@ -293,7 +373,7 @@ export function buildFallbackList(meta: Meta): {
...engines,
// enable fire-engine in self-hosted testing environment when mocks are supplied
...((!useFireEngine && meta.mock !== null) ? ["fire-engine;chrome-cdp", "fire-engine(retry);chrome-cdp", "fire-engine;playwright", "fire-engine;tlsclient"] as Engine[] : [])
...((!useFireEngine && meta.mock !== null) ? ["fire-engine;chrome-cdp", "fire-engine(retry);chrome-cdp", "fire-engine;chrome-cdp;stealth", "fire-engine(retry);chrome-cdp;stealth", "fire-engine;playwright", "fire-engine;tlsclient", "fire-engine;playwright;stealth", "fire-engine;tlsclient;stealth"] as Engine[] : [])
];
if (meta.internalOptions.useCache !== true) {

View File

@ -261,16 +261,22 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
(engineResult.statusCode >= 200 && engineResult.statusCode < 300) ||
engineResult.statusCode === 304;
const hasNoPageError = engineResult.error === undefined;
const isLikelyProxyError = [403, 429].includes(engineResult.statusCode);
results[engine] = {
state: "success",
result: engineResult,
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
factors: { isLongEnough, isGoodStatusCode, hasNoPageError, isLikelyProxyError },
unsupportedFeatures,
startedAt,
finishedAt: Date.now(),
};
if (isLikelyProxyError && meta.options.proxy === "auto" && !meta.featureFlags.has("stealthProxy")) {
meta.logger.info("Scrape via " + engine + " deemed unsuccessful due to proxy inadequacy. Adding stealthProxy flag.");
throw new AddFeatureError(["stealthProxy"]);
}
// NOTE: TODO: what to do when status code is bad is tough...
// we cannot just rely on text because error messages can be brief and not hit the limit
// should we just use all the fallbacks and pick the one with the longest text? - mogery
@ -368,6 +374,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
url: result.result.url,
statusCode: result.result.statusCode,
error: result.result.error,
proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic",
},
};

View File

@ -1384,7 +1384,7 @@ async function processJob(job: Job & { id: string }, token: string) {
}
}
if (job.data.scrapeOptions.proxy === "stealth") {
if (doc.metadata?.proxyUsed === "stealth") {
creditsToBeBilled += 4;
}