This commit is contained in:
Nicolas 2025-04-22 21:42:37 -04:00 committed by GitHub
parent feda4dede7
commit 1c421f2d74
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 0 additions and 207 deletions

View File

@ -15,7 +15,6 @@ env:
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
PORT: ${{ secrets.PORT }}
REDIS_URL: ${{ secrets.REDIS_URL }}
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}

View File

@ -15,7 +15,6 @@ env:
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
PORT: ${{ secrets.PORT }}
REDIS_URL: ${{ secrets.REDIS_URL }}
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}

View File

@ -15,7 +15,6 @@ env:
PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
PORT: ${{ secrets.PORT }}
REDIS_URL: ${{ secrets.REDIS_URL }}
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}

View File

@ -17,7 +17,6 @@ env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
PORT: ${{ secrets.PORT }}
REDIS_URL: ${{ secrets.REDIS_URL }}
SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }}
SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}

View File

@ -38,7 +38,6 @@ SUPABASE_SERVICE_TOKEN=
# Other Optionals
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
SCRAPING_BEE_API_KEY= # Set if you'd like to use ScrapingBee to handle JS blocking
OPENAI_API_KEY= # add for LLM-dependent features (image alt generation, etc.)
BULL_AUTH_KEY= @
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback

View File

@ -86,9 +86,6 @@ USE_DB_AUTHENTICATION=false
# Use if you've set up authentication and want to test with a real API key
# TEST_API_KEY=
# You can add this to enable ScrapingBee as a fallback scraping engine.
# SCRAPING_BEE_API_KEY=
# This key lets you access the queue admin panel. Change this if your deployment is publicly accessible.
BULL_AUTH_KEY=CHANGEME

View File

@ -128,7 +128,6 @@
"redlock": "5.0.0-beta.2",
"resend": "^3.4.0",
"robots-parser": "^3.0.1",
"scrapingbee": "^1.7.4",
"stripe": "^16.1.0",
"supabase": "^1.77.9",
"systeminformation": "^5.22.11",

View File

@ -236,9 +236,6 @@ importers:
robots-parser:
specifier: ^3.0.1
version: 3.0.1
scrapingbee:
specifier: ^1.7.4
version: 1.7.4
stripe:
specifier: ^16.1.0
version: 16.1.0
@ -2030,9 +2027,6 @@ packages:
asynckit@0.4.0:
resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==}
axios-retry@3.9.1:
resolution: {integrity: sha512-8PJDLJv7qTTMMwdnbMvrLYuvB47M81wRtxQmEdV5w4rgbTXTt+vtPkXwajOfOdSyv/wZICJOC+/UhXH4aQ/R+w==}
axios-retry@4.5.0:
resolution: {integrity: sha512-aR99oXhpEDGo0UuAlYcn2iGRds30k366Zfa05XWScR9QaQD4JYiP3/1Qt1u7YlefUOK+cn0CcwoL1oefavQUlQ==}
peerDependencies:
@ -2442,10 +2436,6 @@ packages:
supports-color:
optional: true
decamelize@4.0.0:
resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==}
engines: {node: '>=10'}
decimal.js@10.5.0:
resolution: {integrity: sha512-8vDa8Qxvr/+d94hSh5P3IJwI5t8/c0KsMp+g8bNw9cY2icONa5aPfvKeieW1WlG0WQYwwhJ7mjui2xtiePQSXw==}
@ -2767,10 +2757,6 @@ packages:
resolution: {integrity: sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==}
engines: {node: '>=8'}
flat@5.0.2:
resolution: {integrity: sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==}
hasBin: true
fn.name@1.1.0:
resolution: {integrity: sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==}
@ -3145,10 +3131,6 @@ packages:
resolution: {integrity: sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==}
engines: {node: '>=0.12.0'}
is-plain-obj@2.1.0:
resolution: {integrity: sha512-YWnfyRwxL/+SsrWYfOpUtz5b3YD+nyfkHvjbcanzk8zgyO4ASD67uVMRt8k5bM4lLMDnXfriRhOpemw+NfT1eA==}
engines: {node: '>=8'}
is-potential-custom-element-name@1.0.1:
resolution: {integrity: sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==}
@ -4233,9 +4215,6 @@ packages:
scheduler@0.23.2:
resolution: {integrity: sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==}
scrapingbee@1.7.4:
resolution: {integrity: sha512-cTo+mfLi+T3mSeCHIefVZpjWEX2O70SkmCoWj9ypsnIFqBI2GmljdHYXt8yoT6D/YKjI0rHE7YH9iVRdhyoMmQ==}
secure-compare@3.0.1:
resolution: {integrity: sha512-AckIIV90rPDcBcglUwXPF3kg0P0qmPsPXAj6BBEENQE1p5yA1xfmDJzfi1Tappj37Pv2mVbKpL3Z1T+Nn7k1Qw==}
@ -4874,10 +4853,6 @@ packages:
resolution: {integrity: sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==}
engines: {node: '>=12'}
yargs-unparser@2.0.0:
resolution: {integrity: sha512-7pRTIA9Qc1caZ0bZ6RYRGbHJthJWuakf+WmHK0rVeLkNrrGhfoabBNdue6kdINI6r4if7ocq9aD/n7xwKOdzOA==}
engines: {node: '>=10'}
yargs@17.7.2:
resolution: {integrity: sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==}
engines: {node: '>=12'}
@ -7339,11 +7314,6 @@ snapshots:
asynckit@0.4.0: {}
axios-retry@3.9.1:
dependencies:
'@babel/runtime': 7.24.6
is-retry-allowed: 2.2.0
axios-retry@4.5.0(axios@1.7.2):
dependencies:
axios: 1.7.2
@ -7822,8 +7792,6 @@ snapshots:
dependencies:
ms: 2.1.2
decamelize@4.0.0: {}
decimal.js@10.5.0: {}
dedent@1.5.3: {}
@ -8153,8 +8121,6 @@ snapshots:
locate-path: 5.0.0
path-exists: 4.0.0
flat@5.0.2: {}
fn.name@1.1.0: {}
follow-redirects@1.15.6: {}
@ -8596,8 +8562,6 @@ snapshots:
is-number@7.0.0: {}
is-plain-obj@2.1.0: {}
is-potential-custom-element-name@1.0.1: {}
is-retry-allowed@2.2.0: {}
@ -9926,14 +9890,6 @@ snapshots:
dependencies:
loose-envify: 1.4.0
scrapingbee@1.7.4:
dependencies:
axios: 1.7.2
axios-retry: 3.9.1
yargs-unparser: 2.0.0
transitivePeerDependencies:
- debug
secure-compare@3.0.1: {}
secure-json-parse@2.7.0: {}
@ -10540,13 +10496,6 @@ snapshots:
yargs-parser@21.1.1: {}
yargs-unparser@2.0.0:
dependencies:
camelcase: 6.3.0
decamelize: 4.0.0
flat: 5.0.2
is-plain-obj: 2.1.0
yargs@17.7.2:
dependencies:
cliui: 8.0.1

View File

@ -18,7 +18,6 @@ describe("E2E Tests for API Routes with No Authentication", () => {
process.env.SUPABASE_ANON_TOKEN = "";
process.env.SUPABASE_URL = "";
process.env.SUPABASE_SERVICE_TOKEN = "";
process.env.SCRAPING_BEE_API_KEY = "";
process.env.OPENAI_API_KEY = "";
process.env.BULL_AUTH_KEY = "";
process.env.PLAYWRIGHT_MICROSERVICE_URL = "";

View File

@ -7,7 +7,6 @@ import {
scrapeURLWithFireEngineTLSClient,
} from "./fire-engine";
import { scrapePDF } from "./pdf";
import { scrapeURLWithScrapingBee } from "./scrapingbee";
import { scrapeURLWithFetch } from "./fetch";
import { scrapeURLWithPlaywright } from "./playwright";
import { scrapeCache } from "./cache";
@ -16,17 +15,12 @@ export type Engine =
| "fire-engine;chrome-cdp"
| "fire-engine;playwright"
| "fire-engine;tlsclient"
| "scrapingbee"
| "scrapingbeeLoad"
| "playwright"
| "fetch"
| "pdf"
| "docx"
| "cache";
const useScrapingBee =
process.env.SCRAPING_BEE_API_KEY !== "" &&
process.env.SCRAPING_BEE_API_KEY !== undefined;
const useFireEngine =
process.env.FIRE_ENGINE_BETA_URL !== "" &&
process.env.FIRE_ENGINE_BETA_URL !== undefined;
@ -46,9 +40,6 @@ export const engines: Engine[] = [
"fire-engine;tlsclient" as const,
]
: []),
...(useScrapingBee
? ["scrapingbee" as const, "scrapingbeeLoad" as const]
: []),
...(usePlaywright ? ["playwright" as const] : []),
"fetch",
"pdf",
@ -120,8 +111,6 @@ const engineHandlers: {
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
scrapingbee: scrapeURLWithScrapingBee("domcontentloaded"),
scrapingbeeLoad: scrapeURLWithScrapingBee("networkidle2"),
playwright: scrapeURLWithPlaywright,
fetch: scrapeURLWithFetch,
pdf: scrapePDF,
@ -189,40 +178,6 @@ export const engineOptions: {
},
quality: 40,
},
scrapingbee: {
features: {
actions: false,
waitFor: true,
screenshot: true,
"screenshot@fullScreen": true,
pdf: false,
docx: false,
atsv: false,
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: false,
stealthProxy: false,
},
quality: 30,
},
scrapingbeeLoad: {
features: {
actions: false,
waitFor: true,
screenshot: true,
"screenshot@fullScreen": true,
pdf: false,
docx: false,
atsv: false,
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: false,
stealthProxy: false,
},
quality: 29,
},
playwright: {
features: {
actions: false,

View File

@ -1,95 +0,0 @@
import { ScrapingBeeClient } from "scrapingbee";
import { Meta } from "../..";
import { EngineScrapeResult } from "..";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
import { AxiosError, type AxiosResponse } from "axios";
import { EngineError, TimeoutError } from "../../error";
// Module-level ScrapingBee client, built once at import time from
// SCRAPING_BEE_API_KEY. The non-null assertion only satisfies the type
// checker; it does not verify that the variable is actually set.
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);

/**
 * Creates a scrape handler that fetches a page through ScrapingBee.
 *
 * @param wait_browser - Value forwarded as ScrapingBee's `wait_browser`
 *   request parameter.
 * @returns An async handler taking the scrape `meta` and an optional time
 *   budget in milliseconds (`timeToRun`, defaulting to 300000) and resolving
 *   to the scraped HTML, resolved URL, status code and, when ScrapingBee
 *   returned one, a base64 PNG screenshot as a data URL.
 * @throws TimeoutError when ScrapingBee has not answered within the request
 *   timeout plus a 5 second grace period.
 * @throws EngineError when the decoded payload reports an error, carries no
 *   recognisable response headers, or its `body` is not a string.
 */
export function scrapeURLWithScrapingBee(
  wait_browser: "domcontentloaded" | "networkidle2",
): (meta: Meta, timeToRun: number | undefined) => Promise<EngineScrapeResult> {
  return async (
    meta: Meta,
    timeToRun: number | undefined,
  ): Promise<EngineScrapeResult> => {
    let response: AxiosResponse<any>;
    const timeout = (timeToRun ?? 300000) + meta.options.waitFor;

    // Handle of the local guard timer so it can be cancelled once the race
    // settles; otherwise every call would leave a pending timer behind.
    let guardTimer: ReturnType<typeof setTimeout> | undefined;

    try {
      response = await Promise.race<AxiosResponse<any>>([
        client.get({
          url: meta.url,
          params: {
            timeout,
            wait_browser: wait_browser,
            wait: meta.options.waitFor,
            transparent_status_code: true,
            json_response: true,
            screenshot: meta.options.formats.includes("screenshot"),
            screenshot_full_page: meta.options.formats.includes(
              "screenshot@fullPage",
            ),
          },
          headers: {
            "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
          },
        }),
        // Local safety net: reject if the request outlives its own timeout
        // by more than 5 seconds.
        new Promise<never>((_, reject) => {
          guardTimer = setTimeout(
            () => reject(new TimeoutError("ScrapingBee timed out")),
            timeout + 5000,
          );
        }),
      ]);
    } catch (error) {
      // An HTTP error that still carries a response is processed like a
      // normal response below; anything else is rethrown.
      if (error instanceof AxiosError && error.response !== undefined) {
        response = error.response;
      } else {
        throw error;
      }
    } finally {
      if (guardTimer !== undefined) {
        clearTimeout(guardTimer);
      }
    }

    // The response payload is decoded as text and parsed as JSON.
    const data: Buffer = response.data;
    const body = JSON.parse(new TextDecoder().decode(data));

    // A payload exposing neither a Date nor a Content-Type header (in either
    // casing) is treated as an engine-side failure.
    const headers = body.headers ?? {};
    const isHiddenEngineError = !(
      headers["Date"] ??
      headers["date"] ??
      headers["Content-Type"] ??
      headers["content-type"]
    );

    if (body.errors || body.body?.error || isHiddenEngineError) {
      meta.logger.error("ScrapingBee threw an error", {
        body: body.body?.error ?? body.errors ?? body.body ?? body,
      });
      throw new EngineError("Engine error #34", {
        cause: { body, statusCode: response.status },
      });
    }

    if (typeof body.body !== "string") {
      meta.logger.error("ScrapingBee: Body is not string??", { body });
      throw new EngineError("Engine error #35", {
        cause: { body, statusCode: response.status },
      });
    }

    await specialtyScrapeCheck(
      meta.logger.child({
        method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
      }),
      body.headers,
    );

    return {
      url: body["resolved-url"] ?? meta.url,
      html: body.body,
      error: response.status >= 300 ? response.statusText : undefined,
      statusCode: response.status,
      ...(body.screenshot
        ? {
            screenshot: `data:image/png;base64,${body.screenshot}`,
          }
        : {}),
    };
  };
}

View File

@ -12,8 +12,6 @@ const testEngines: (Engine | undefined)[] = [
"fire-engine;chrome-cdp",
"fire-engine;playwright",
"fire-engine;tlsclient",
"scrapingbee",
"scrapingbeeLoad",
"fetch",
];
@ -21,8 +19,6 @@ const testEnginesScreenshot: (Engine | undefined)[] = [
undefined,
"fire-engine;chrome-cdp",
"fire-engine;playwright",
"scrapingbee",
"scrapingbeeLoad",
];
describe("Standalone scrapeURL tests", () => {

View File

@ -33,7 +33,6 @@ x-common-env: &common-env
SUPABASE_ANON_TOKEN: ${SUPABASE_ANON_TOKEN}
SUPABASE_URL: ${SUPABASE_URL}
SUPABASE_SERVICE_TOKEN: ${SUPABASE_SERVICE_TOKEN}
SCRAPING_BEE_API_KEY: ${SCRAPING_BEE_API_KEY}
SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL}
SERPER_API_KEY: ${SERPER_API_KEY}
SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY}

View File

@ -11,7 +11,6 @@ data:
TEST_API_KEY: ""
POSTHOG_API_KEY: ""
POSTHOG_HOST: ""
SCRAPING_BEE_API_KEY: ""
STRIPE_PRICE_ID_STANDARD: ""
STRIPE_PRICE_ID_SCALE: ""
FIRE_ENGINE_BETA_URL: ""