From 1c421f2d74b09c8af7b13b4f81eab06887c6c1d5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 22 Apr 2025 21:42:37 -0400 Subject: [PATCH] Nick: (#1492) --- .github/archive/js-sdk.yml | 1 - .github/archive/python-sdk.yml | 1 - .github/archive/rust-sdk.yml | 1 - .github/workflows/test-server.yml | 1 - CONTRIBUTING.md | 1 - SELF_HOST.md | 3 - apps/api/package.json | 1 - apps/api/pnpm-lock.yaml | 51 ---------- .../src/__tests__/e2e_noAuth/index.test.ts | 1 - .../src/scraper/scrapeURL/engines/index.ts | 45 --------- .../scrapeURL/engines/scrapingbee/index.ts | 95 ------------------- .../src/scraper/scrapeURL/scrapeURL.test.ts | 4 - docker-compose.yaml | 1 - .../kubernetes/cluster-install/secret.yaml | 1 - 14 files changed, 207 deletions(-) delete mode 100644 apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts diff --git a/.github/archive/js-sdk.yml b/.github/archive/js-sdk.yml index 7ef096d4..e996f7d3 100644 --- a/.github/archive/js-sdk.yml +++ b/.github/archive/js-sdk.yml @@ -15,7 +15,6 @@ env: PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }} PORT: ${{ secrets.PORT }} REDIS_URL: ${{ secrets.REDIS_URL }} - SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }} SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }} SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }} diff --git a/.github/archive/python-sdk.yml b/.github/archive/python-sdk.yml index bdefeab6..4bd9efbb 100644 --- a/.github/archive/python-sdk.yml +++ b/.github/archive/python-sdk.yml @@ -15,7 +15,6 @@ env: PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }} PORT: ${{ secrets.PORT }} REDIS_URL: ${{ secrets.REDIS_URL }} - SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }} SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }} SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }} diff --git a/.github/archive/rust-sdk.yml b/.github/archive/rust-sdk.yml index 792e06c2..fa84726f 100644 --- a/.github/archive/rust-sdk.yml +++ b/.github/archive/rust-sdk.yml @@ -15,7 +15,6 @@ env: PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }} PORT: ${{ secrets.PORT }} REDIS_URL: ${{ secrets.REDIS_URL }} - SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }} SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }} SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }} diff --git a/.github/workflows/test-server.yml b/.github/workflows/test-server.yml index bf37baa3..febad61a 100644 --- a/.github/workflows/test-server.yml +++ b/.github/workflows/test-server.yml @@ -17,7 +17,6 @@ env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} PORT: ${{ secrets.PORT }} REDIS_URL: ${{ secrets.REDIS_URL }} - SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }} SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }} SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ac3bd24a..83f81411 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -38,7 +38,6 @@ SUPABASE_SERVICE_TOKEN= # Other Optionals TEST_API_KEY= # use if you've set up authentication and want to test with a real API key -SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Bee to handle JS blocking OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) BULL_AUTH_KEY= @ PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback diff --git a/SELF_HOST.md b/SELF_HOST.md index 098afc77..acccddbc 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -86,9 +86,6 @@ USE_DB_AUTHENTICATION=false # Use if you've set up authentication and want to test with a real API key # TEST_API_KEY= -# You can add this to enable ScrapingBee as a fallback scraping engine. -# SCRAPING_BEE_API_KEY= - # This key lets you access the queue admin panel. Change this if your deployment is publicly accessible. BULL_AUTH_KEY=CHANGEME diff --git a/apps/api/package.json b/apps/api/package.json index c3363c45..2461df18 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -128,7 +128,6 @@ "redlock": "5.0.0-beta.2", "resend": "^3.4.0", "robots-parser": "^3.0.1", - "scrapingbee": "^1.7.4", "stripe": "^16.1.0", "supabase": "^1.77.9", "systeminformation": "^5.22.11", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 6d3aa164..333cd85c 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -236,9 +236,6 @@ importers: robots-parser: specifier: ^3.0.1 version: 3.0.1 - scrapingbee: - specifier: ^1.7.4 - version: 1.7.4 stripe: specifier: ^16.1.0 version: 16.1.0 @@ -2030,9 +2027,6 @@ packages: asynckit@0.4.0: resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==} - axios-retry@3.9.1: - resolution: {integrity: sha512-8PJDLJv7qTTMMwdnbMvrLYuvB47M81wRtxQmEdV5w4rgbTXTt+vtPkXwajOfOdSyv/wZICJOC+/UhXH4aQ/R+w==} - axios-retry@4.5.0: resolution: {integrity: sha512-aR99oXhpEDGo0UuAlYcn2iGRds30k366Zfa05XWScR9QaQD4JYiP3/1Qt1u7YlefUOK+cn0CcwoL1oefavQUlQ==} peerDependencies: @@ -2442,10 +2436,6 @@ packages: supports-color: optional: true - decamelize@4.0.0: - resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==} - engines: {node: '>=10'} - decimal.js@10.5.0: resolution: {integrity: sha512-8vDa8Qxvr/+d94hSh5P3IJwI5t8/c0KsMp+g8bNw9cY2icONa5aPfvKeieW1WlG0WQYwwhJ7mjui2xtiePQSXw==} @@ -2767,10 +2757,6 @@ packages: resolution: {integrity: sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==} engines: {node: '>=8'} - flat@5.0.2: - resolution: {integrity: sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==} - hasBin: true - fn.name@1.1.0: resolution: {integrity: sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==} @@ -3145,10 +3131,6 @@ packages: resolution: {integrity: sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==} engines: {node: '>=0.12.0'} - is-plain-obj@2.1.0: - resolution: {integrity: sha512-YWnfyRwxL/+SsrWYfOpUtz5b3YD+nyfkHvjbcanzk8zgyO4ASD67uVMRt8k5bM4lLMDnXfriRhOpemw+NfT1eA==} - engines: {node: '>=8'} - is-potential-custom-element-name@1.0.1: resolution: {integrity: sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==} @@ -4233,9 +4215,6 @@ packages: scheduler@0.23.2: resolution: {integrity: sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==} - scrapingbee@1.7.4: - resolution: {integrity: sha512-cTo+mfLi+T3mSeCHIefVZpjWEX2O70SkmCoWj9ypsnIFqBI2GmljdHYXt8yoT6D/YKjI0rHE7YH9iVRdhyoMmQ==} - secure-compare@3.0.1: resolution: {integrity: sha512-AckIIV90rPDcBcglUwXPF3kg0P0qmPsPXAj6BBEENQE1p5yA1xfmDJzfi1Tappj37Pv2mVbKpL3Z1T+Nn7k1Qw==} @@ -4874,10 +4853,6 @@ packages: resolution: {integrity: sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==} engines: {node: '>=12'} - yargs-unparser@2.0.0: - resolution: {integrity: sha512-7pRTIA9Qc1caZ0bZ6RYRGbHJthJWuakf+WmHK0rVeLkNrrGhfoabBNdue6kdINI6r4if7ocq9aD/n7xwKOdzOA==} - engines: {node: '>=10'} - yargs@17.7.2: resolution: {integrity: sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==} engines: {node: '>=12'} @@ -7339,11 +7314,6 @@ snapshots: asynckit@0.4.0: {} - axios-retry@3.9.1: - dependencies: - '@babel/runtime': 7.24.6 - is-retry-allowed: 2.2.0 - axios-retry@4.5.0(axios@1.7.2): dependencies: axios: 1.7.2 @@ -7822,8 +7792,6 @@ snapshots: dependencies: ms: 2.1.2 - decamelize@4.0.0: {} - decimal.js@10.5.0: {} dedent@1.5.3: {} @@ -8153,8 +8121,6 @@ snapshots: locate-path: 5.0.0 path-exists: 4.0.0 - flat@5.0.2: {} - fn.name@1.1.0: {} follow-redirects@1.15.6: {} @@ -8596,8 +8562,6 @@ snapshots: is-number@7.0.0: {} - is-plain-obj@2.1.0: {} - is-potential-custom-element-name@1.0.1: {} is-retry-allowed@2.2.0: {} @@ -9926,14 +9890,6 @@ snapshots: dependencies: loose-envify: 1.4.0 - scrapingbee@1.7.4: - dependencies: - axios: 1.7.2 - axios-retry: 3.9.1 - yargs-unparser: 2.0.0 - transitivePeerDependencies: - - debug - secure-compare@3.0.1: {} secure-json-parse@2.7.0: {} @@ -10540,13 +10496,6 @@ snapshots: yargs-parser@21.1.1: {} - yargs-unparser@2.0.0: - dependencies: - camelcase: 6.3.0 - decamelize: 4.0.0 - flat: 5.0.2 - is-plain-obj: 2.1.0 - yargs@17.7.2: dependencies: cliui: 8.0.1 diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index 9d5dc554..587f3e86 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -18,7 +18,6 @@ describe("E2E Tests for API Routes with No Authentication", () => { process.env.SUPABASE_ANON_TOKEN = ""; process.env.SUPABASE_URL = ""; process.env.SUPABASE_SERVICE_TOKEN = ""; - process.env.SCRAPING_BEE_API_KEY = ""; process.env.OPENAI_API_KEY = ""; process.env.BULL_AUTH_KEY = ""; process.env.PLAYWRIGHT_MICROSERVICE_URL = ""; diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index ab2fe79b..c7c84621 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -7,7 +7,6 @@ import { scrapeURLWithFireEngineTLSClient, } from "./fire-engine"; import { scrapePDF } from "./pdf"; -import { scrapeURLWithScrapingBee } from "./scrapingbee"; import { scrapeURLWithFetch } from "./fetch"; import { scrapeURLWithPlaywright } from "./playwright"; import { scrapeCache } from "./cache"; @@ -16,17 +15,12 @@ export type Engine = | "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" - | "scrapingbee" - | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx" | "cache"; -const useScrapingBee = - process.env.SCRAPING_BEE_API_KEY !== "" && - process.env.SCRAPING_BEE_API_KEY !== undefined; const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== "" && process.env.FIRE_ENGINE_BETA_URL !== undefined; @@ -46,9 +40,6 @@ export const engines: Engine[] = [ "fire-engine;tlsclient" as const, ] : []), - ...(useScrapingBee - ? ["scrapingbee" as const, "scrapingbeeLoad" as const] - : []), ...(usePlaywright ? ["playwright" as const] : []), "fetch", "pdf", @@ -120,8 +111,6 @@ const engineHandlers: { "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP, "fire-engine;playwright": scrapeURLWithFireEnginePlaywright, "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient, - scrapingbee: scrapeURLWithScrapingBee("domcontentloaded"), - scrapingbeeLoad: scrapeURLWithScrapingBee("networkidle2"), playwright: scrapeURLWithPlaywright, fetch: scrapeURLWithFetch, pdf: scrapePDF, @@ -189,40 +178,6 @@ export const engineOptions: { }, quality: 40, }, - scrapingbee: { - features: { - actions: false, - waitFor: true, - screenshot: true, - "screenshot@fullScreen": true, - pdf: false, - docx: false, - atsv: false, - location: false, - mobile: false, - skipTlsVerification: false, - useFastMode: false, - stealthProxy: false, - }, - quality: 30, - }, - scrapingbeeLoad: { - features: { - actions: false, - waitFor: true, - screenshot: true, - "screenshot@fullScreen": true, - pdf: false, - docx: false, - atsv: false, - location: false, - mobile: false, - skipTlsVerification: false, - useFastMode: false, - stealthProxy: false, - }, - quality: 29, - }, playwright: { features: { actions: false, diff --git a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts deleted file mode 100644 index 6840c142..00000000 --- a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts +++ /dev/null @@ -1,95 +0,0 @@ -import { ScrapingBeeClient } from "scrapingbee"; -import { Meta } from "../.."; -import { EngineScrapeResult } from ".."; -import { specialtyScrapeCheck } from "../utils/specialtyHandler"; -import { AxiosError, type AxiosResponse } from "axios"; -import { EngineError, TimeoutError } from "../../error"; - -const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!); - -export function scrapeURLWithScrapingBee( - wait_browser: "domcontentloaded" | "networkidle2", -): (meta: Meta, timeToRun: number | undefined) => Promise { - return async ( - meta: Meta, - timeToRun: number | undefined, - ): Promise => { - let response: AxiosResponse; - const timeout = (timeToRun ?? 300000) + meta.options.waitFor; - try { - response = await Promise.race>([ - client.get({ - url: meta.url, - params: { - timeout, - wait_browser: wait_browser, - wait: meta.options.waitFor, - transparent_status_code: true, - json_response: true, - screenshot: meta.options.formats.includes("screenshot"), - screenshot_full_page: meta.options.formats.includes( - "screenshot@fullPage", - ), - }, - headers: { - "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery - }, - }), - new Promise((_, reject) => setTimeout(() => reject(new TimeoutError("ScrapingBee timed out")), timeout + 5000)), - ]); - } catch (error) { - if (error instanceof AxiosError && error.response !== undefined) { - response = error.response; - } else { - throw error; - } - } - - const data: Buffer = response.data; - const body = JSON.parse(new TextDecoder().decode(data)); - - const headers = body.headers ?? {}; - const isHiddenEngineError = !( - headers["Date"] ?? - headers["date"] ?? - headers["Content-Type"] ?? - headers["content-type"] - ); - - if (body.errors || body.body?.error || isHiddenEngineError) { - meta.logger.error("ScrapingBee threw an error", { - body: body.body?.error ?? body.errors ?? body.body ?? body, - }); - throw new EngineError("Engine error #34", { - cause: { body, statusCode: response.status }, - }); - } - - if (typeof body.body !== "string") { - meta.logger.error("ScrapingBee: Body is not string??", { body }); - throw new EngineError("Engine error #35", { - cause: { body, statusCode: response.status }, - }); - } - - await specialtyScrapeCheck( - meta.logger.child({ - method: "scrapeURLWithScrapingBee/specialtyScrapeCheck", - }), - body.headers, - ); - - return { - url: body["resolved-url"] ?? meta.url, - - html: body.body, - error: response.status >= 300 ? response.statusText : undefined, - statusCode: response.status, - ...(body.screenshot - ? { - screenshot: `data:image/png;base64,${body.screenshot}`, - } - : {}), - }; - }; -} diff --git a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts index 2f11d945..531cf1a2 100644 --- a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts +++ b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts @@ -12,8 +12,6 @@ const testEngines: (Engine | undefined)[] = [ "fire-engine;chrome-cdp", "fire-engine;playwright", "fire-engine;tlsclient", - "scrapingbee", - "scrapingbeeLoad", "fetch", ]; @@ -21,8 +19,6 @@ const testEnginesScreenshot: (Engine | undefined)[] = [ undefined, "fire-engine;chrome-cdp", "fire-engine;playwright", - "scrapingbee", - "scrapingbeeLoad", ]; describe("Standalone scrapeURL tests", () => { diff --git a/docker-compose.yaml b/docker-compose.yaml index 61819a15..0b0cfb3e 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -33,7 +33,6 @@ x-common-env: &common-env SUPABASE_ANON_TOKEN: ${SUPABASE_ANON_TOKEN} SUPABASE_URL: ${SUPABASE_URL} SUPABASE_SERVICE_TOKEN: ${SUPABASE_SERVICE_TOKEN} - SCRAPING_BEE_API_KEY: ${SCRAPING_BEE_API_KEY} SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL} SERPER_API_KEY: ${SERPER_API_KEY} SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY} diff --git a/examples/kubernetes/cluster-install/secret.yaml b/examples/kubernetes/cluster-install/secret.yaml index 861770c0..78fb800c 100644 --- a/examples/kubernetes/cluster-install/secret.yaml +++ b/examples/kubernetes/cluster-install/secret.yaml @@ -11,7 +11,6 @@ data: TEST_API_KEY: "" POSTHOG_API_KEY: "" POSTHOG_HOST: "" - SCRAPING_BEE_API_KEY: "" STRIPE_PRICE_ID_STANDARD: "" STRIPE_PRICE_ID_SCALE: "" FIRE_ENGINE_BETA_URL: ""