mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-06-03 02:44:04 +08:00
Nick: (#1492)
This commit is contained in:
parent
feda4dede7
commit
1c421f2d74
1
.github/archive/js-sdk.yml
vendored
1
.github/archive/js-sdk.yml
vendored
@ -15,7 +15,6 @@ env:
|
|||||||
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
|
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
|
||||||
PORT: ${{ secrets.PORT }}
|
PORT: ${{ secrets.PORT }}
|
||||||
REDIS_URL: ${{ secrets.REDIS_URL }}
|
REDIS_URL: ${{ secrets.REDIS_URL }}
|
||||||
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
|
|
||||||
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
|
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
|
||||||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||||
|
1
.github/archive/python-sdk.yml
vendored
1
.github/archive/python-sdk.yml
vendored
@ -15,7 +15,6 @@ env:
|
|||||||
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
|
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
|
||||||
PORT: ${{ secrets.PORT }}
|
PORT: ${{ secrets.PORT }}
|
||||||
REDIS_URL: ${{ secrets.REDIS_URL }}
|
REDIS_URL: ${{ secrets.REDIS_URL }}
|
||||||
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
|
|
||||||
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
|
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
|
||||||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||||
|
1
.github/archive/rust-sdk.yml
vendored
1
.github/archive/rust-sdk.yml
vendored
@ -15,7 +15,6 @@ env:
|
|||||||
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
|
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
|
||||||
PORT: ${{ secrets.PORT }}
|
PORT: ${{ secrets.PORT }}
|
||||||
REDIS_URL: ${{ secrets.REDIS_URL }}
|
REDIS_URL: ${{ secrets.REDIS_URL }}
|
||||||
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
|
|
||||||
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
|
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
|
||||||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||||
|
1
.github/workflows/test-server.yml
vendored
1
.github/workflows/test-server.yml
vendored
@ -17,7 +17,6 @@ env:
|
|||||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||||
PORT: ${{ secrets.PORT }}
|
PORT: ${{ secrets.PORT }}
|
||||||
REDIS_URL: ${{ secrets.REDIS_URL }}
|
REDIS_URL: ${{ secrets.REDIS_URL }}
|
||||||
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
|
|
||||||
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
|
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
|
||||||
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
|
||||||
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
|
||||||
|
@ -38,7 +38,6 @@ SUPABASE_SERVICE_TOKEN=
|
|||||||
|
|
||||||
# Other Optionals
|
# Other Optionals
|
||||||
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
|
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
|
||||||
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Bee to handle JS blocking
|
|
||||||
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
|
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
|
||||||
BULL_AUTH_KEY= @
|
BULL_AUTH_KEY= @
|
||||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||||
|
@ -86,9 +86,6 @@ USE_DB_AUTHENTICATION=false
|
|||||||
# Use if you've set up authentication and want to test with a real API key
|
# Use if you've set up authentication and want to test with a real API key
|
||||||
# TEST_API_KEY=
|
# TEST_API_KEY=
|
||||||
|
|
||||||
# You can add this to enable ScrapingBee as a fallback scraping engine.
|
|
||||||
# SCRAPING_BEE_API_KEY=
|
|
||||||
|
|
||||||
# This key lets you access the queue admin panel. Change this if your deployment is publicly accessible.
|
# This key lets you access the queue admin panel. Change this if your deployment is publicly accessible.
|
||||||
BULL_AUTH_KEY=CHANGEME
|
BULL_AUTH_KEY=CHANGEME
|
||||||
|
|
||||||
|
@ -128,7 +128,6 @@
|
|||||||
"redlock": "5.0.0-beta.2",
|
"redlock": "5.0.0-beta.2",
|
||||||
"resend": "^3.4.0",
|
"resend": "^3.4.0",
|
||||||
"robots-parser": "^3.0.1",
|
"robots-parser": "^3.0.1",
|
||||||
"scrapingbee": "^1.7.4",
|
|
||||||
"stripe": "^16.1.0",
|
"stripe": "^16.1.0",
|
||||||
"supabase": "^1.77.9",
|
"supabase": "^1.77.9",
|
||||||
"systeminformation": "^5.22.11",
|
"systeminformation": "^5.22.11",
|
||||||
|
51
apps/api/pnpm-lock.yaml
generated
51
apps/api/pnpm-lock.yaml
generated
@ -236,9 +236,6 @@ importers:
|
|||||||
robots-parser:
|
robots-parser:
|
||||||
specifier: ^3.0.1
|
specifier: ^3.0.1
|
||||||
version: 3.0.1
|
version: 3.0.1
|
||||||
scrapingbee:
|
|
||||||
specifier: ^1.7.4
|
|
||||||
version: 1.7.4
|
|
||||||
stripe:
|
stripe:
|
||||||
specifier: ^16.1.0
|
specifier: ^16.1.0
|
||||||
version: 16.1.0
|
version: 16.1.0
|
||||||
@ -2030,9 +2027,6 @@ packages:
|
|||||||
asynckit@0.4.0:
|
asynckit@0.4.0:
|
||||||
resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==}
|
resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==}
|
||||||
|
|
||||||
axios-retry@3.9.1:
|
|
||||||
resolution: {integrity: sha512-8PJDLJv7qTTMMwdnbMvrLYuvB47M81wRtxQmEdV5w4rgbTXTt+vtPkXwajOfOdSyv/wZICJOC+/UhXH4aQ/R+w==}
|
|
||||||
|
|
||||||
axios-retry@4.5.0:
|
axios-retry@4.5.0:
|
||||||
resolution: {integrity: sha512-aR99oXhpEDGo0UuAlYcn2iGRds30k366Zfa05XWScR9QaQD4JYiP3/1Qt1u7YlefUOK+cn0CcwoL1oefavQUlQ==}
|
resolution: {integrity: sha512-aR99oXhpEDGo0UuAlYcn2iGRds30k366Zfa05XWScR9QaQD4JYiP3/1Qt1u7YlefUOK+cn0CcwoL1oefavQUlQ==}
|
||||||
peerDependencies:
|
peerDependencies:
|
||||||
@ -2442,10 +2436,6 @@ packages:
|
|||||||
supports-color:
|
supports-color:
|
||||||
optional: true
|
optional: true
|
||||||
|
|
||||||
decamelize@4.0.0:
|
|
||||||
resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==}
|
|
||||||
engines: {node: '>=10'}
|
|
||||||
|
|
||||||
decimal.js@10.5.0:
|
decimal.js@10.5.0:
|
||||||
resolution: {integrity: sha512-8vDa8Qxvr/+d94hSh5P3IJwI5t8/c0KsMp+g8bNw9cY2icONa5aPfvKeieW1WlG0WQYwwhJ7mjui2xtiePQSXw==}
|
resolution: {integrity: sha512-8vDa8Qxvr/+d94hSh5P3IJwI5t8/c0KsMp+g8bNw9cY2icONa5aPfvKeieW1WlG0WQYwwhJ7mjui2xtiePQSXw==}
|
||||||
|
|
||||||
@ -2767,10 +2757,6 @@ packages:
|
|||||||
resolution: {integrity: sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==}
|
resolution: {integrity: sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==}
|
||||||
engines: {node: '>=8'}
|
engines: {node: '>=8'}
|
||||||
|
|
||||||
flat@5.0.2:
|
|
||||||
resolution: {integrity: sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==}
|
|
||||||
hasBin: true
|
|
||||||
|
|
||||||
fn.name@1.1.0:
|
fn.name@1.1.0:
|
||||||
resolution: {integrity: sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==}
|
resolution: {integrity: sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==}
|
||||||
|
|
||||||
@ -3145,10 +3131,6 @@ packages:
|
|||||||
resolution: {integrity: sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==}
|
resolution: {integrity: sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==}
|
||||||
engines: {node: '>=0.12.0'}
|
engines: {node: '>=0.12.0'}
|
||||||
|
|
||||||
is-plain-obj@2.1.0:
|
|
||||||
resolution: {integrity: sha512-YWnfyRwxL/+SsrWYfOpUtz5b3YD+nyfkHvjbcanzk8zgyO4ASD67uVMRt8k5bM4lLMDnXfriRhOpemw+NfT1eA==}
|
|
||||||
engines: {node: '>=8'}
|
|
||||||
|
|
||||||
is-potential-custom-element-name@1.0.1:
|
is-potential-custom-element-name@1.0.1:
|
||||||
resolution: {integrity: sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==}
|
resolution: {integrity: sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==}
|
||||||
|
|
||||||
@ -4233,9 +4215,6 @@ packages:
|
|||||||
scheduler@0.23.2:
|
scheduler@0.23.2:
|
||||||
resolution: {integrity: sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==}
|
resolution: {integrity: sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==}
|
||||||
|
|
||||||
scrapingbee@1.7.4:
|
|
||||||
resolution: {integrity: sha512-cTo+mfLi+T3mSeCHIefVZpjWEX2O70SkmCoWj9ypsnIFqBI2GmljdHYXt8yoT6D/YKjI0rHE7YH9iVRdhyoMmQ==}
|
|
||||||
|
|
||||||
secure-compare@3.0.1:
|
secure-compare@3.0.1:
|
||||||
resolution: {integrity: sha512-AckIIV90rPDcBcglUwXPF3kg0P0qmPsPXAj6BBEENQE1p5yA1xfmDJzfi1Tappj37Pv2mVbKpL3Z1T+Nn7k1Qw==}
|
resolution: {integrity: sha512-AckIIV90rPDcBcglUwXPF3kg0P0qmPsPXAj6BBEENQE1p5yA1xfmDJzfi1Tappj37Pv2mVbKpL3Z1T+Nn7k1Qw==}
|
||||||
|
|
||||||
@ -4874,10 +4853,6 @@ packages:
|
|||||||
resolution: {integrity: sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==}
|
resolution: {integrity: sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==}
|
||||||
engines: {node: '>=12'}
|
engines: {node: '>=12'}
|
||||||
|
|
||||||
yargs-unparser@2.0.0:
|
|
||||||
resolution: {integrity: sha512-7pRTIA9Qc1caZ0bZ6RYRGbHJthJWuakf+WmHK0rVeLkNrrGhfoabBNdue6kdINI6r4if7ocq9aD/n7xwKOdzOA==}
|
|
||||||
engines: {node: '>=10'}
|
|
||||||
|
|
||||||
yargs@17.7.2:
|
yargs@17.7.2:
|
||||||
resolution: {integrity: sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==}
|
resolution: {integrity: sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==}
|
||||||
engines: {node: '>=12'}
|
engines: {node: '>=12'}
|
||||||
@ -7339,11 +7314,6 @@ snapshots:
|
|||||||
|
|
||||||
asynckit@0.4.0: {}
|
asynckit@0.4.0: {}
|
||||||
|
|
||||||
axios-retry@3.9.1:
|
|
||||||
dependencies:
|
|
||||||
'@babel/runtime': 7.24.6
|
|
||||||
is-retry-allowed: 2.2.0
|
|
||||||
|
|
||||||
axios-retry@4.5.0(axios@1.7.2):
|
axios-retry@4.5.0(axios@1.7.2):
|
||||||
dependencies:
|
dependencies:
|
||||||
axios: 1.7.2
|
axios: 1.7.2
|
||||||
@ -7822,8 +7792,6 @@ snapshots:
|
|||||||
dependencies:
|
dependencies:
|
||||||
ms: 2.1.2
|
ms: 2.1.2
|
||||||
|
|
||||||
decamelize@4.0.0: {}
|
|
||||||
|
|
||||||
decimal.js@10.5.0: {}
|
decimal.js@10.5.0: {}
|
||||||
|
|
||||||
dedent@1.5.3: {}
|
dedent@1.5.3: {}
|
||||||
@ -8153,8 +8121,6 @@ snapshots:
|
|||||||
locate-path: 5.0.0
|
locate-path: 5.0.0
|
||||||
path-exists: 4.0.0
|
path-exists: 4.0.0
|
||||||
|
|
||||||
flat@5.0.2: {}
|
|
||||||
|
|
||||||
fn.name@1.1.0: {}
|
fn.name@1.1.0: {}
|
||||||
|
|
||||||
follow-redirects@1.15.6: {}
|
follow-redirects@1.15.6: {}
|
||||||
@ -8596,8 +8562,6 @@ snapshots:
|
|||||||
|
|
||||||
is-number@7.0.0: {}
|
is-number@7.0.0: {}
|
||||||
|
|
||||||
is-plain-obj@2.1.0: {}
|
|
||||||
|
|
||||||
is-potential-custom-element-name@1.0.1: {}
|
is-potential-custom-element-name@1.0.1: {}
|
||||||
|
|
||||||
is-retry-allowed@2.2.0: {}
|
is-retry-allowed@2.2.0: {}
|
||||||
@ -9926,14 +9890,6 @@ snapshots:
|
|||||||
dependencies:
|
dependencies:
|
||||||
loose-envify: 1.4.0
|
loose-envify: 1.4.0
|
||||||
|
|
||||||
scrapingbee@1.7.4:
|
|
||||||
dependencies:
|
|
||||||
axios: 1.7.2
|
|
||||||
axios-retry: 3.9.1
|
|
||||||
yargs-unparser: 2.0.0
|
|
||||||
transitivePeerDependencies:
|
|
||||||
- debug
|
|
||||||
|
|
||||||
secure-compare@3.0.1: {}
|
secure-compare@3.0.1: {}
|
||||||
|
|
||||||
secure-json-parse@2.7.0: {}
|
secure-json-parse@2.7.0: {}
|
||||||
@ -10540,13 +10496,6 @@ snapshots:
|
|||||||
|
|
||||||
yargs-parser@21.1.1: {}
|
yargs-parser@21.1.1: {}
|
||||||
|
|
||||||
yargs-unparser@2.0.0:
|
|
||||||
dependencies:
|
|
||||||
camelcase: 6.3.0
|
|
||||||
decamelize: 4.0.0
|
|
||||||
flat: 5.0.2
|
|
||||||
is-plain-obj: 2.1.0
|
|
||||||
|
|
||||||
yargs@17.7.2:
|
yargs@17.7.2:
|
||||||
dependencies:
|
dependencies:
|
||||||
cliui: 8.0.1
|
cliui: 8.0.1
|
||||||
|
@ -18,7 +18,6 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
process.env.SUPABASE_ANON_TOKEN = "";
|
process.env.SUPABASE_ANON_TOKEN = "";
|
||||||
process.env.SUPABASE_URL = "";
|
process.env.SUPABASE_URL = "";
|
||||||
process.env.SUPABASE_SERVICE_TOKEN = "";
|
process.env.SUPABASE_SERVICE_TOKEN = "";
|
||||||
process.env.SCRAPING_BEE_API_KEY = "";
|
|
||||||
process.env.OPENAI_API_KEY = "";
|
process.env.OPENAI_API_KEY = "";
|
||||||
process.env.BULL_AUTH_KEY = "";
|
process.env.BULL_AUTH_KEY = "";
|
||||||
process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
|
process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
|
||||||
|
@ -7,7 +7,6 @@ import {
|
|||||||
scrapeURLWithFireEngineTLSClient,
|
scrapeURLWithFireEngineTLSClient,
|
||||||
} from "./fire-engine";
|
} from "./fire-engine";
|
||||||
import { scrapePDF } from "./pdf";
|
import { scrapePDF } from "./pdf";
|
||||||
import { scrapeURLWithScrapingBee } from "./scrapingbee";
|
|
||||||
import { scrapeURLWithFetch } from "./fetch";
|
import { scrapeURLWithFetch } from "./fetch";
|
||||||
import { scrapeURLWithPlaywright } from "./playwright";
|
import { scrapeURLWithPlaywright } from "./playwright";
|
||||||
import { scrapeCache } from "./cache";
|
import { scrapeCache } from "./cache";
|
||||||
@ -16,17 +15,12 @@ export type Engine =
|
|||||||
| "fire-engine;chrome-cdp"
|
| "fire-engine;chrome-cdp"
|
||||||
| "fire-engine;playwright"
|
| "fire-engine;playwright"
|
||||||
| "fire-engine;tlsclient"
|
| "fire-engine;tlsclient"
|
||||||
| "scrapingbee"
|
|
||||||
| "scrapingbeeLoad"
|
|
||||||
| "playwright"
|
| "playwright"
|
||||||
| "fetch"
|
| "fetch"
|
||||||
| "pdf"
|
| "pdf"
|
||||||
| "docx"
|
| "docx"
|
||||||
| "cache";
|
| "cache";
|
||||||
|
|
||||||
const useScrapingBee =
|
|
||||||
process.env.SCRAPING_BEE_API_KEY !== "" &&
|
|
||||||
process.env.SCRAPING_BEE_API_KEY !== undefined;
|
|
||||||
const useFireEngine =
|
const useFireEngine =
|
||||||
process.env.FIRE_ENGINE_BETA_URL !== "" &&
|
process.env.FIRE_ENGINE_BETA_URL !== "" &&
|
||||||
process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
||||||
@ -46,9 +40,6 @@ export const engines: Engine[] = [
|
|||||||
"fire-engine;tlsclient" as const,
|
"fire-engine;tlsclient" as const,
|
||||||
]
|
]
|
||||||
: []),
|
: []),
|
||||||
...(useScrapingBee
|
|
||||||
? ["scrapingbee" as const, "scrapingbeeLoad" as const]
|
|
||||||
: []),
|
|
||||||
...(usePlaywright ? ["playwright" as const] : []),
|
...(usePlaywright ? ["playwright" as const] : []),
|
||||||
"fetch",
|
"fetch",
|
||||||
"pdf",
|
"pdf",
|
||||||
@ -120,8 +111,6 @@ const engineHandlers: {
|
|||||||
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
|
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
|
||||||
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
|
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
|
||||||
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
|
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
|
||||||
scrapingbee: scrapeURLWithScrapingBee("domcontentloaded"),
|
|
||||||
scrapingbeeLoad: scrapeURLWithScrapingBee("networkidle2"),
|
|
||||||
playwright: scrapeURLWithPlaywright,
|
playwright: scrapeURLWithPlaywright,
|
||||||
fetch: scrapeURLWithFetch,
|
fetch: scrapeURLWithFetch,
|
||||||
pdf: scrapePDF,
|
pdf: scrapePDF,
|
||||||
@ -189,40 +178,6 @@ export const engineOptions: {
|
|||||||
},
|
},
|
||||||
quality: 40,
|
quality: 40,
|
||||||
},
|
},
|
||||||
scrapingbee: {
|
|
||||||
features: {
|
|
||||||
actions: false,
|
|
||||||
waitFor: true,
|
|
||||||
screenshot: true,
|
|
||||||
"screenshot@fullScreen": true,
|
|
||||||
pdf: false,
|
|
||||||
docx: false,
|
|
||||||
atsv: false,
|
|
||||||
location: false,
|
|
||||||
mobile: false,
|
|
||||||
skipTlsVerification: false,
|
|
||||||
useFastMode: false,
|
|
||||||
stealthProxy: false,
|
|
||||||
},
|
|
||||||
quality: 30,
|
|
||||||
},
|
|
||||||
scrapingbeeLoad: {
|
|
||||||
features: {
|
|
||||||
actions: false,
|
|
||||||
waitFor: true,
|
|
||||||
screenshot: true,
|
|
||||||
"screenshot@fullScreen": true,
|
|
||||||
pdf: false,
|
|
||||||
docx: false,
|
|
||||||
atsv: false,
|
|
||||||
location: false,
|
|
||||||
mobile: false,
|
|
||||||
skipTlsVerification: false,
|
|
||||||
useFastMode: false,
|
|
||||||
stealthProxy: false,
|
|
||||||
},
|
|
||||||
quality: 29,
|
|
||||||
},
|
|
||||||
playwright: {
|
playwright: {
|
||||||
features: {
|
features: {
|
||||||
actions: false,
|
actions: false,
|
||||||
|
@ -1,95 +0,0 @@
|
|||||||
import { ScrapingBeeClient } from "scrapingbee";
|
|
||||||
import { Meta } from "../..";
|
|
||||||
import { EngineScrapeResult } from "..";
|
|
||||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
|
||||||
import { AxiosError, type AxiosResponse } from "axios";
|
|
||||||
import { EngineError, TimeoutError } from "../../error";
|
|
||||||
|
|
||||||
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
|
|
||||||
|
|
||||||
export function scrapeURLWithScrapingBee(
|
|
||||||
wait_browser: "domcontentloaded" | "networkidle2",
|
|
||||||
): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
|
|
||||||
return async (
|
|
||||||
meta: Meta,
|
|
||||||
timeToRun: number | undefined,
|
|
||||||
): Promise<EngineScrapeResult> => {
|
|
||||||
let response: AxiosResponse<any>;
|
|
||||||
const timeout = (timeToRun ?? 300000) + meta.options.waitFor;
|
|
||||||
try {
|
|
||||||
response = await Promise.race<AxiosResponse<any>>([
|
|
||||||
client.get({
|
|
||||||
url: meta.url,
|
|
||||||
params: {
|
|
||||||
timeout,
|
|
||||||
wait_browser: wait_browser,
|
|
||||||
wait: meta.options.waitFor,
|
|
||||||
transparent_status_code: true,
|
|
||||||
json_response: true,
|
|
||||||
screenshot: meta.options.formats.includes("screenshot"),
|
|
||||||
screenshot_full_page: meta.options.formats.includes(
|
|
||||||
"screenshot@fullPage",
|
|
||||||
),
|
|
||||||
},
|
|
||||||
headers: {
|
|
||||||
"ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
|
|
||||||
},
|
|
||||||
}),
|
|
||||||
new Promise((_, reject) => setTimeout(() => reject(new TimeoutError("ScrapingBee timed out")), timeout + 5000)),
|
|
||||||
]);
|
|
||||||
} catch (error) {
|
|
||||||
if (error instanceof AxiosError && error.response !== undefined) {
|
|
||||||
response = error.response;
|
|
||||||
} else {
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const data: Buffer = response.data;
|
|
||||||
const body = JSON.parse(new TextDecoder().decode(data));
|
|
||||||
|
|
||||||
const headers = body.headers ?? {};
|
|
||||||
const isHiddenEngineError = !(
|
|
||||||
headers["Date"] ??
|
|
||||||
headers["date"] ??
|
|
||||||
headers["Content-Type"] ??
|
|
||||||
headers["content-type"]
|
|
||||||
);
|
|
||||||
|
|
||||||
if (body.errors || body.body?.error || isHiddenEngineError) {
|
|
||||||
meta.logger.error("ScrapingBee threw an error", {
|
|
||||||
body: body.body?.error ?? body.errors ?? body.body ?? body,
|
|
||||||
});
|
|
||||||
throw new EngineError("Engine error #34", {
|
|
||||||
cause: { body, statusCode: response.status },
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if (typeof body.body !== "string") {
|
|
||||||
meta.logger.error("ScrapingBee: Body is not string??", { body });
|
|
||||||
throw new EngineError("Engine error #35", {
|
|
||||||
cause: { body, statusCode: response.status },
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
await specialtyScrapeCheck(
|
|
||||||
meta.logger.child({
|
|
||||||
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
|
|
||||||
}),
|
|
||||||
body.headers,
|
|
||||||
);
|
|
||||||
|
|
||||||
return {
|
|
||||||
url: body["resolved-url"] ?? meta.url,
|
|
||||||
|
|
||||||
html: body.body,
|
|
||||||
error: response.status >= 300 ? response.statusText : undefined,
|
|
||||||
statusCode: response.status,
|
|
||||||
...(body.screenshot
|
|
||||||
? {
|
|
||||||
screenshot: `data:image/png;base64,${body.screenshot}`,
|
|
||||||
}
|
|
||||||
: {}),
|
|
||||||
};
|
|
||||||
};
|
|
||||||
}
|
|
@ -12,8 +12,6 @@ const testEngines: (Engine | undefined)[] = [
|
|||||||
"fire-engine;chrome-cdp",
|
"fire-engine;chrome-cdp",
|
||||||
"fire-engine;playwright",
|
"fire-engine;playwright",
|
||||||
"fire-engine;tlsclient",
|
"fire-engine;tlsclient",
|
||||||
"scrapingbee",
|
|
||||||
"scrapingbeeLoad",
|
|
||||||
"fetch",
|
"fetch",
|
||||||
];
|
];
|
||||||
|
|
||||||
@ -21,8 +19,6 @@ const testEnginesScreenshot: (Engine | undefined)[] = [
|
|||||||
undefined,
|
undefined,
|
||||||
"fire-engine;chrome-cdp",
|
"fire-engine;chrome-cdp",
|
||||||
"fire-engine;playwright",
|
"fire-engine;playwright",
|
||||||
"scrapingbee",
|
|
||||||
"scrapingbeeLoad",
|
|
||||||
];
|
];
|
||||||
|
|
||||||
describe("Standalone scrapeURL tests", () => {
|
describe("Standalone scrapeURL tests", () => {
|
||||||
|
@ -33,7 +33,6 @@ x-common-env: &common-env
|
|||||||
SUPABASE_ANON_TOKEN: ${SUPABASE_ANON_TOKEN}
|
SUPABASE_ANON_TOKEN: ${SUPABASE_ANON_TOKEN}
|
||||||
SUPABASE_URL: ${SUPABASE_URL}
|
SUPABASE_URL: ${SUPABASE_URL}
|
||||||
SUPABASE_SERVICE_TOKEN: ${SUPABASE_SERVICE_TOKEN}
|
SUPABASE_SERVICE_TOKEN: ${SUPABASE_SERVICE_TOKEN}
|
||||||
SCRAPING_BEE_API_KEY: ${SCRAPING_BEE_API_KEY}
|
|
||||||
SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL}
|
SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL}
|
||||||
SERPER_API_KEY: ${SERPER_API_KEY}
|
SERPER_API_KEY: ${SERPER_API_KEY}
|
||||||
SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY}
|
SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY}
|
||||||
|
@ -11,7 +11,6 @@ data:
|
|||||||
TEST_API_KEY: ""
|
TEST_API_KEY: ""
|
||||||
POSTHOG_API_KEY: ""
|
POSTHOG_API_KEY: ""
|
||||||
POSTHOG_HOST: ""
|
POSTHOG_HOST: ""
|
||||||
SCRAPING_BEE_API_KEY: ""
|
|
||||||
STRIPE_PRICE_ID_STANDARD: ""
|
STRIPE_PRICE_ID_STANDARD: ""
|
||||||
STRIPE_PRICE_ID_SCALE: ""
|
STRIPE_PRICE_ID_SCALE: ""
|
||||||
FIRE_ENGINE_BETA_URL: ""
|
FIRE_ENGINE_BETA_URL: ""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user