mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-06-01 12:03:20 +08:00
Nick: (#1492)
This commit is contained in:
parent
feda4dede7
commit
1c421f2d74
1
.github/archive/js-sdk.yml
vendored
1
.github/archive/js-sdk.yml
vendored
@ -15,7 +15,6 @@ env:
|
||||
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
|
||||
PORT: ${{ secrets.PORT }}
|
||||
REDIS_URL: ${{ secrets.REDIS_URL }}
|
||||
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
|
||||
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
|
||||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||
|
1
.github/archive/python-sdk.yml
vendored
1
.github/archive/python-sdk.yml
vendored
@ -15,7 +15,6 @@ env:
|
||||
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
|
||||
PORT: ${{ secrets.PORT }}
|
||||
REDIS_URL: ${{ secrets.REDIS_URL }}
|
||||
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
|
||||
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
|
||||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||
|
1
.github/archive/rust-sdk.yml
vendored
1
.github/archive/rust-sdk.yml
vendored
@ -15,7 +15,6 @@ env:
|
||||
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
|
||||
PORT: ${{ secrets.PORT }}
|
||||
REDIS_URL: ${{ secrets.REDIS_URL }}
|
||||
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
|
||||
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
|
||||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||
|
1
.github/workflows/test-server.yml
vendored
1
.github/workflows/test-server.yml
vendored
@ -17,7 +17,6 @@ env:
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
PORT: ${{ secrets.PORT }}
|
||||
REDIS_URL: ${{ secrets.REDIS_URL }}
|
||||
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
|
||||
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
|
||||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||
|
@ -38,7 +38,6 @@ SUPABASE_SERVICE_TOKEN=
|
||||
|
||||
# Other Optionals
|
||||
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
|
||||
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Bee to handle JS blocking
|
||||
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
|
||||
BULL_AUTH_KEY= @
|
||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||
|
@ -86,9 +86,6 @@ USE_DB_AUTHENTICATION=false
|
||||
# Use if you've set up authentication and want to test with a real API key
|
||||
# TEST_API_KEY=
|
||||
|
||||
# You can add this to enable ScrapingBee as a fallback scraping engine.
|
||||
# SCRAPING_BEE_API_KEY=
|
||||
|
||||
# This key lets you access the queue admin panel. Change this if your deployment is publicly accessible.
|
||||
BULL_AUTH_KEY=CHANGEME
|
||||
|
||||
|
@ -128,7 +128,6 @@
|
||||
"redlock": "5.0.0-beta.2",
|
||||
"resend": "^3.4.0",
|
||||
"robots-parser": "^3.0.1",
|
||||
"scrapingbee": "^1.7.4",
|
||||
"stripe": "^16.1.0",
|
||||
"supabase": "^1.77.9",
|
||||
"systeminformation": "^5.22.11",
|
||||
|
51
apps/api/pnpm-lock.yaml
generated
51
apps/api/pnpm-lock.yaml
generated
@ -236,9 +236,6 @@ importers:
|
||||
robots-parser:
|
||||
specifier: ^3.0.1
|
||||
version: 3.0.1
|
||||
scrapingbee:
|
||||
specifier: ^1.7.4
|
||||
version: 1.7.4
|
||||
stripe:
|
||||
specifier: ^16.1.0
|
||||
version: 16.1.0
|
||||
@ -2030,9 +2027,6 @@ packages:
|
||||
asynckit@0.4.0:
|
||||
resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==}
|
||||
|
||||
axios-retry@3.9.1:
|
||||
resolution: {integrity: sha512-8PJDLJv7qTTMMwdnbMvrLYuvB47M81wRtxQmEdV5w4rgbTXTt+vtPkXwajOfOdSyv/wZICJOC+/UhXH4aQ/R+w==}
|
||||
|
||||
axios-retry@4.5.0:
|
||||
resolution: {integrity: sha512-aR99oXhpEDGo0UuAlYcn2iGRds30k366Zfa05XWScR9QaQD4JYiP3/1Qt1u7YlefUOK+cn0CcwoL1oefavQUlQ==}
|
||||
peerDependencies:
|
||||
@ -2442,10 +2436,6 @@ packages:
|
||||
supports-color:
|
||||
optional: true
|
||||
|
||||
decamelize@4.0.0:
|
||||
resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==}
|
||||
engines: {node: '>=10'}
|
||||
|
||||
decimal.js@10.5.0:
|
||||
resolution: {integrity: sha512-8vDa8Qxvr/+d94hSh5P3IJwI5t8/c0KsMp+g8bNw9cY2icONa5aPfvKeieW1WlG0WQYwwhJ7mjui2xtiePQSXw==}
|
||||
|
||||
@ -2767,10 +2757,6 @@ packages:
|
||||
resolution: {integrity: sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==}
|
||||
engines: {node: '>=8'}
|
||||
|
||||
flat@5.0.2:
|
||||
resolution: {integrity: sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==}
|
||||
hasBin: true
|
||||
|
||||
fn.name@1.1.0:
|
||||
resolution: {integrity: sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==}
|
||||
|
||||
@ -3145,10 +3131,6 @@ packages:
|
||||
resolution: {integrity: sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==}
|
||||
engines: {node: '>=0.12.0'}
|
||||
|
||||
is-plain-obj@2.1.0:
|
||||
resolution: {integrity: sha512-YWnfyRwxL/+SsrWYfOpUtz5b3YD+nyfkHvjbcanzk8zgyO4ASD67uVMRt8k5bM4lLMDnXfriRhOpemw+NfT1eA==}
|
||||
engines: {node: '>=8'}
|
||||
|
||||
is-potential-custom-element-name@1.0.1:
|
||||
resolution: {integrity: sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==}
|
||||
|
||||
@ -4233,9 +4215,6 @@ packages:
|
||||
scheduler@0.23.2:
|
||||
resolution: {integrity: sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==}
|
||||
|
||||
scrapingbee@1.7.4:
|
||||
resolution: {integrity: sha512-cTo+mfLi+T3mSeCHIefVZpjWEX2O70SkmCoWj9ypsnIFqBI2GmljdHYXt8yoT6D/YKjI0rHE7YH9iVRdhyoMmQ==}
|
||||
|
||||
secure-compare@3.0.1:
|
||||
resolution: {integrity: sha512-AckIIV90rPDcBcglUwXPF3kg0P0qmPsPXAj6BBEENQE1p5yA1xfmDJzfi1Tappj37Pv2mVbKpL3Z1T+Nn7k1Qw==}
|
||||
|
||||
@ -4874,10 +4853,6 @@ packages:
|
||||
resolution: {integrity: sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==}
|
||||
engines: {node: '>=12'}
|
||||
|
||||
yargs-unparser@2.0.0:
|
||||
resolution: {integrity: sha512-7pRTIA9Qc1caZ0bZ6RYRGbHJthJWuakf+WmHK0rVeLkNrrGhfoabBNdue6kdINI6r4if7ocq9aD/n7xwKOdzOA==}
|
||||
engines: {node: '>=10'}
|
||||
|
||||
yargs@17.7.2:
|
||||
resolution: {integrity: sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==}
|
||||
engines: {node: '>=12'}
|
||||
@ -7339,11 +7314,6 @@ snapshots:
|
||||
|
||||
asynckit@0.4.0: {}
|
||||
|
||||
axios-retry@3.9.1:
|
||||
dependencies:
|
||||
'@babel/runtime': 7.24.6
|
||||
is-retry-allowed: 2.2.0
|
||||
|
||||
axios-retry@4.5.0(axios@1.7.2):
|
||||
dependencies:
|
||||
axios: 1.7.2
|
||||
@ -7822,8 +7792,6 @@ snapshots:
|
||||
dependencies:
|
||||
ms: 2.1.2
|
||||
|
||||
decamelize@4.0.0: {}
|
||||
|
||||
decimal.js@10.5.0: {}
|
||||
|
||||
dedent@1.5.3: {}
|
||||
@ -8153,8 +8121,6 @@ snapshots:
|
||||
locate-path: 5.0.0
|
||||
path-exists: 4.0.0
|
||||
|
||||
flat@5.0.2: {}
|
||||
|
||||
fn.name@1.1.0: {}
|
||||
|
||||
follow-redirects@1.15.6: {}
|
||||
@ -8596,8 +8562,6 @@ snapshots:
|
||||
|
||||
is-number@7.0.0: {}
|
||||
|
||||
is-plain-obj@2.1.0: {}
|
||||
|
||||
is-potential-custom-element-name@1.0.1: {}
|
||||
|
||||
is-retry-allowed@2.2.0: {}
|
||||
@ -9926,14 +9890,6 @@ snapshots:
|
||||
dependencies:
|
||||
loose-envify: 1.4.0
|
||||
|
||||
scrapingbee@1.7.4:
|
||||
dependencies:
|
||||
axios: 1.7.2
|
||||
axios-retry: 3.9.1
|
||||
yargs-unparser: 2.0.0
|
||||
transitivePeerDependencies:
|
||||
- debug
|
||||
|
||||
secure-compare@3.0.1: {}
|
||||
|
||||
secure-json-parse@2.7.0: {}
|
||||
@ -10540,13 +10496,6 @@ snapshots:
|
||||
|
||||
yargs-parser@21.1.1: {}
|
||||
|
||||
yargs-unparser@2.0.0:
|
||||
dependencies:
|
||||
camelcase: 6.3.0
|
||||
decamelize: 4.0.0
|
||||
flat: 5.0.2
|
||||
is-plain-obj: 2.1.0
|
||||
|
||||
yargs@17.7.2:
|
||||
dependencies:
|
||||
cliui: 8.0.1
|
||||
|
@ -18,7 +18,6 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
||||
process.env.SUPABASE_ANON_TOKEN = "";
|
||||
process.env.SUPABASE_URL = "";
|
||||
process.env.SUPABASE_SERVICE_TOKEN = "";
|
||||
process.env.SCRAPING_BEE_API_KEY = "";
|
||||
process.env.OPENAI_API_KEY = "";
|
||||
process.env.BULL_AUTH_KEY = "";
|
||||
process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
|
||||
|
@ -7,7 +7,6 @@ import {
|
||||
scrapeURLWithFireEngineTLSClient,
|
||||
} from "./fire-engine";
|
||||
import { scrapePDF } from "./pdf";
|
||||
import { scrapeURLWithScrapingBee } from "./scrapingbee";
|
||||
import { scrapeURLWithFetch } from "./fetch";
|
||||
import { scrapeURLWithPlaywright } from "./playwright";
|
||||
import { scrapeCache } from "./cache";
|
||||
@ -16,17 +15,12 @@ export type Engine =
|
||||
| "fire-engine;chrome-cdp"
|
||||
| "fire-engine;playwright"
|
||||
| "fire-engine;tlsclient"
|
||||
| "scrapingbee"
|
||||
| "scrapingbeeLoad"
|
||||
| "playwright"
|
||||
| "fetch"
|
||||
| "pdf"
|
||||
| "docx"
|
||||
| "cache";
|
||||
|
||||
const useScrapingBee =
|
||||
process.env.SCRAPING_BEE_API_KEY !== "" &&
|
||||
process.env.SCRAPING_BEE_API_KEY !== undefined;
|
||||
const useFireEngine =
|
||||
process.env.FIRE_ENGINE_BETA_URL !== "" &&
|
||||
process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
||||
@ -46,9 +40,6 @@ export const engines: Engine[] = [
|
||||
"fire-engine;tlsclient" as const,
|
||||
]
|
||||
: []),
|
||||
...(useScrapingBee
|
||||
? ["scrapingbee" as const, "scrapingbeeLoad" as const]
|
||||
: []),
|
||||
...(usePlaywright ? ["playwright" as const] : []),
|
||||
"fetch",
|
||||
"pdf",
|
||||
@ -120,8 +111,6 @@ const engineHandlers: {
|
||||
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
|
||||
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
|
||||
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
|
||||
scrapingbee: scrapeURLWithScrapingBee("domcontentloaded"),
|
||||
scrapingbeeLoad: scrapeURLWithScrapingBee("networkidle2"),
|
||||
playwright: scrapeURLWithPlaywright,
|
||||
fetch: scrapeURLWithFetch,
|
||||
pdf: scrapePDF,
|
||||
@ -189,40 +178,6 @@ export const engineOptions: {
|
||||
},
|
||||
quality: 40,
|
||||
},
|
||||
scrapingbee: {
|
||||
features: {
|
||||
actions: false,
|
||||
waitFor: true,
|
||||
screenshot: true,
|
||||
"screenshot@fullScreen": true,
|
||||
pdf: false,
|
||||
docx: false,
|
||||
atsv: false,
|
||||
location: false,
|
||||
mobile: false,
|
||||
skipTlsVerification: false,
|
||||
useFastMode: false,
|
||||
stealthProxy: false,
|
||||
},
|
||||
quality: 30,
|
||||
},
|
||||
scrapingbeeLoad: {
|
||||
features: {
|
||||
actions: false,
|
||||
waitFor: true,
|
||||
screenshot: true,
|
||||
"screenshot@fullScreen": true,
|
||||
pdf: false,
|
||||
docx: false,
|
||||
atsv: false,
|
||||
location: false,
|
||||
mobile: false,
|
||||
skipTlsVerification: false,
|
||||
useFastMode: false,
|
||||
stealthProxy: false,
|
||||
},
|
||||
quality: 29,
|
||||
},
|
||||
playwright: {
|
||||
features: {
|
||||
actions: false,
|
||||
|
@ -1,95 +0,0 @@
|
||||
import { ScrapingBeeClient } from "scrapingbee";
|
||||
import { Meta } from "../..";
|
||||
import { EngineScrapeResult } from "..";
|
||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||
import { AxiosError, type AxiosResponse } from "axios";
|
||||
import { EngineError, TimeoutError } from "../../error";
|
||||
|
||||
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
|
||||
|
||||
export function scrapeURLWithScrapingBee(
|
||||
wait_browser: "domcontentloaded" | "networkidle2",
|
||||
): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
|
||||
return async (
|
||||
meta: Meta,
|
||||
timeToRun: number | undefined,
|
||||
): Promise<EngineScrapeResult> => {
|
||||
let response: AxiosResponse<any>;
|
||||
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
|
||||
try {
|
||||
response = await Promise.race<AxiosResponse<any>>([
|
||||
client.get({
|
||||
url: meta.url,
|
||||
params: {
|
||||
timeout,
|
||||
wait_browser: wait_browser,
|
||||
wait: meta.options.waitFor,
|
||||
transparent_status_code: true,
|
||||
json_response: true,
|
||||
screenshot: meta.options.formats.includes("screenshot"),
|
||||
screenshot_full_page: meta.options.formats.includes(
|
||||
"screenshot@fullPage",
|
||||
),
|
||||
},
|
||||
headers: {
|
||||
"ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
|
||||
},
|
||||
}),
|
||||
new Promise((_, reject) => setTimeout(() => reject(new TimeoutError("ScrapingBee timed out")), timeout + 5000)),
|
||||
]);
|
||||
} catch (error) {
|
||||
if (error instanceof AxiosError && error.response !== undefined) {
|
||||
response = error.response;
|
||||
} else {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
const data: Buffer = response.data;
|
||||
const body = JSON.parse(new TextDecoder().decode(data));
|
||||
|
||||
const headers = body.headers ?? {};
|
||||
const isHiddenEngineError = !(
|
||||
headers["Date"] ??
|
||||
headers["date"] ??
|
||||
headers["Content-Type"] ??
|
||||
headers["content-type"]
|
||||
);
|
||||
|
||||
if (body.errors || body.body?.error || isHiddenEngineError) {
|
||||
meta.logger.error("ScrapingBee threw an error", {
|
||||
body: body.body?.error ?? body.errors ?? body.body ?? body,
|
||||
});
|
||||
throw new EngineError("Engine error #34", {
|
||||
cause: { body, statusCode: response.status },
|
||||
});
|
||||
}
|
||||
|
||||
if (typeof body.body !== "string") {
|
||||
meta.logger.error("ScrapingBee: Body is not string??", { body });
|
||||
throw new EngineError("Engine error #35", {
|
||||
cause: { body, statusCode: response.status },
|
||||
});
|
||||
}
|
||||
|
||||
await specialtyScrapeCheck(
|
||||
meta.logger.child({
|
||||
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
|
||||
}),
|
||||
body.headers,
|
||||
);
|
||||
|
||||
return {
|
||||
url: body["resolved-url"] ?? meta.url,
|
||||
|
||||
html: body.body,
|
||||
error: response.status >= 300 ? response.statusText : undefined,
|
||||
statusCode: response.status,
|
||||
...(body.screenshot
|
||||
? {
|
||||
screenshot: `data:image/png;base64,${body.screenshot}`,
|
||||
}
|
||||
: {}),
|
||||
};
|
||||
};
|
||||
}
|
@ -12,8 +12,6 @@ const testEngines: (Engine | undefined)[] = [
|
||||
"fire-engine;chrome-cdp",
|
||||
"fire-engine;playwright",
|
||||
"fire-engine;tlsclient",
|
||||
"scrapingbee",
|
||||
"scrapingbeeLoad",
|
||||
"fetch",
|
||||
];
|
||||
|
||||
@ -21,8 +19,6 @@ const testEnginesScreenshot: (Engine | undefined)[] = [
|
||||
undefined,
|
||||
"fire-engine;chrome-cdp",
|
||||
"fire-engine;playwright",
|
||||
"scrapingbee",
|
||||
"scrapingbeeLoad",
|
||||
];
|
||||
|
||||
describe("Standalone scrapeURL tests", () => {
|
||||
|
@ -33,7 +33,6 @@ x-common-env: &common-env
|
||||
SUPABASE_ANON_TOKEN: ${SUPABASE_ANON_TOKEN}
|
||||
SUPABASE_URL: ${SUPABASE_URL}
|
||||
SUPABASE_SERVICE_TOKEN: ${SUPABASE_SERVICE_TOKEN}
|
||||
SCRAPING_BEE_API_KEY: ${SCRAPING_BEE_API_KEY}
|
||||
SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL}
|
||||
SERPER_API_KEY: ${SERPER_API_KEY}
|
||||
SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY}
|
||||
|
@ -11,7 +11,6 @@ data:
|
||||
TEST_API_KEY: ""
|
||||
POSTHOG_API_KEY: ""
|
||||
POSTHOG_HOST: ""
|
||||
SCRAPING_BEE_API_KEY: ""
|
||||
STRIPE_PRICE_ID_STANDARD: ""
|
||||
STRIPE_PRICE_ID_SCALE: ""
|
||||
FIRE_ENGINE_BETA_URL: ""
|
||||
|
Loading…
x
Reference in New Issue
Block a user