feat: new snips test framework (FIR-414) (#1033)

* feat: new snips test framework

* Update mock.ts

---------

Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
Gergő Móricz 2025-01-13 20:50:47 +01:00 committed by GitHub
parent 9a13c1dede
commit 5c62bb1195
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 393 additions and 86 deletions

View File

@ -15,6 +15,7 @@
"test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
"test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'", "test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
"test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'", "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'",
"test:snips": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false src/__tests__/snips/*.test.ts",
"workers": "nodemon --exec ts-node src/services/queue-worker.ts", "workers": "nodemon --exec ts-node src/services/queue-worker.ts",
"worker:production": "node dist/src/services/queue-worker.js", "worker:production": "node dist/src/services/queue-worker.js",
"mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest", "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
@ -37,6 +38,7 @@
"@types/jest": "^29.5.12", "@types/jest": "^29.5.12",
"@types/node": "^20.14.1", "@types/node": "^20.14.1",
"@types/pdf-parse": "^1.1.4", "@types/pdf-parse": "^1.1.4",
"@types/supertest": "^6.0.2",
"body-parser": "^1.20.1", "body-parser": "^1.20.1",
"express": "^4.18.2", "express": "^4.18.2",
"jest": "^29.6.3", "jest": "^29.6.3",

View File

@ -267,6 +267,9 @@ importers:
'@types/pdf-parse': '@types/pdf-parse':
specifier: ^1.1.4 specifier: ^1.1.4
version: 1.1.4 version: 1.1.4
'@types/supertest':
specifier: ^6.0.2
version: 6.0.2
body-parser: body-parser:
specifier: ^1.20.1 specifier: ^1.20.1
version: 1.20.2 version: 1.20.2
@ -1516,6 +1519,9 @@ packages:
'@types/connect@3.4.38': '@types/connect@3.4.38':
resolution: {integrity: sha512-K6uROf1LD88uDQqJCktA4yzL1YYAK6NgfsI0v/mTgyPKWsX1CnJ0XPSDhViejru1GcRkLWb8RlzFYJRqGUbaug==} resolution: {integrity: sha512-K6uROf1LD88uDQqJCktA4yzL1YYAK6NgfsI0v/mTgyPKWsX1CnJ0XPSDhViejru1GcRkLWb8RlzFYJRqGUbaug==}
'@types/cookiejar@2.1.5':
resolution: {integrity: sha512-he+DHOWReW0nghN24E1WUqM0efK4kI9oTqDm6XmK8ZPe2djZ90BSNdGnIyCLzCPw7/pogPlGbzI2wHGGmi4O/Q==}
'@types/cors@2.8.17': '@types/cors@2.8.17':
resolution: {integrity: sha512-8CGDvrBj1zgo2qE+oS3pOCyYNqCPryMWY2bGfwA0dcfopWGgxs+78df0Rs3rc9THP4JkOhLsAa+15VdpAqkcUA==} resolution: {integrity: sha512-8CGDvrBj1zgo2qE+oS3pOCyYNqCPryMWY2bGfwA0dcfopWGgxs+78df0Rs3rc9THP4JkOhLsAa+15VdpAqkcUA==}
@ -1549,6 +1555,9 @@ packages:
'@types/jest@29.5.12': '@types/jest@29.5.12':
resolution: {integrity: sha512-eDC8bTvT/QhYdxJAulQikueigY5AsdBRH2yDKW3yveW7svY3+DzN84/2NUgkw10RTiJbWqZrTtoGVdYlvFJdLw==} resolution: {integrity: sha512-eDC8bTvT/QhYdxJAulQikueigY5AsdBRH2yDKW3yveW7svY3+DzN84/2NUgkw10RTiJbWqZrTtoGVdYlvFJdLw==}
'@types/methods@1.1.4':
resolution: {integrity: sha512-ymXWVrDiCxTBE3+RIrrP533E70eA+9qu7zdWoHuOmGujkYtzf4HQF96b8nwHLqhuf4ykX61IGRIB38CC6/sImQ==}
'@types/mime@1.3.5': '@types/mime@1.3.5':
resolution: {integrity: sha512-/pyBZWSLD2n0dcHE3hq8s8ZvcETHtEuF+3E7XVt0Ig2nvsVQXdghHVcEkIWjy9A0wKfTn97a/PSDYohKIlnP/w==} resolution: {integrity: sha512-/pyBZWSLD2n0dcHE3hq8s8ZvcETHtEuF+3E7XVt0Ig2nvsVQXdghHVcEkIWjy9A0wKfTn97a/PSDYohKIlnP/w==}
@ -1597,6 +1606,12 @@ packages:
'@types/stack-utils@2.0.3': '@types/stack-utils@2.0.3':
resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==} resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==}
'@types/superagent@8.1.9':
resolution: {integrity: sha512-pTVjI73witn+9ILmoJdajHGW2jkSaOzhiFYF1Rd3EQ94kymLqB9PjD9ISg7WaALC7+dCHT0FGe9T2LktLq/3GQ==}
'@types/supertest@6.0.2':
resolution: {integrity: sha512-137ypx2lk/wTQbW6An6safu9hXmajAifU/s7szAHLN/FeIm5w7yR0Wkl9fdJMRSHwOn4HLAI0DaB2TOORuhPDg==}
'@types/triple-beam@1.3.5': '@types/triple-beam@1.3.5':
resolution: {integrity: sha512-6WaYesThRMCl19iryMYP7/x2OVgCtbIVflDGFpWnb9irXI3UjYE4AzmYuiUKY1AJstGijoY+MgUszMgRxIYTYw==} resolution: {integrity: sha512-6WaYesThRMCl19iryMYP7/x2OVgCtbIVflDGFpWnb9irXI3UjYE4AzmYuiUKY1AJstGijoY+MgUszMgRxIYTYw==}
@ -6566,6 +6581,8 @@ snapshots:
dependencies: dependencies:
'@types/node': 20.14.1 '@types/node': 20.14.1
'@types/cookiejar@2.1.5': {}
'@types/cors@2.8.17': '@types/cors@2.8.17':
dependencies: dependencies:
'@types/node': 20.14.1 '@types/node': 20.14.1
@ -6613,6 +6630,8 @@ snapshots:
expect: 29.7.0 expect: 29.7.0
pretty-format: 29.7.0 pretty-format: 29.7.0
'@types/methods@1.1.4': {}
'@types/mime@1.3.5': {} '@types/mime@1.3.5': {}
'@types/mysql@2.15.22': '@types/mysql@2.15.22':
@ -6667,6 +6686,18 @@ snapshots:
'@types/stack-utils@2.0.3': {} '@types/stack-utils@2.0.3': {}
'@types/superagent@8.1.9':
dependencies:
'@types/cookiejar': 2.1.5
'@types/methods': 1.1.4
'@types/node': 20.14.1
form-data: 4.0.0
'@types/supertest@6.0.2':
dependencies:
'@types/methods': 1.1.4
'@types/superagent': 8.1.9
'@types/triple-beam@1.3.5': {} '@types/triple-beam@1.3.5': {}
'@types/uuid@9.0.8': {} '@types/uuid@9.0.8': {}

View File

@ -0,0 +1,107 @@
[
{
"time": 1735911273239,
"options": {
"url": "http://default-fire-engine-api-service:8080/scrape",
"method": "POST",
"body": {
"url": "http://firecrawl.dev",
"engine": "chrome-cdp",
"instantReturn": true,
"skipTlsVerification": false,
"priority": 10,
"mobile": false,
"timeout": 15000
},
"headers": {},
"ignoreResponse": false,
"ignoreFailure": false,
"tryCount": 3
},
"result": {
"status": 200,
"headers": {},
"body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp\",\"processing\":true}"
}
},
{
"time": 1735911273354,
"options": {
"url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
"method": "GET",
"headers": {},
"ignoreResponse": false,
"ignoreFailure": false,
"tryCount": 1
},
"result": {
"status": 200,
"headers": {},
"body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"prioritized\",\"processing\":true}"
}
},
{
"time": 1735911273720,
"options": {
"url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
"method": "GET",
"headers": {},
"ignoreResponse": false,
"ignoreFailure": false,
"tryCount": 1
},
"result": {
"status": 200,
"headers": {},
"body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}"
}
},
{
"time": 1735911274092,
"options": {
"url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
"method": "GET",
"headers": {},
"ignoreResponse": false,
"ignoreFailure": false,
"tryCount": 1
},
"result": {
"status": 200,
"headers": {},
"body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}"
}
},
{
"time": 1735911274467,
"options": {
"url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
"method": "GET",
"headers": {},
"ignoreResponse": false,
"ignoreFailure": false,
"tryCount": 1
},
"result": {
"status": 200,
"headers": {},
"body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}"
}
},
{
"time": 1735911274947,
"options": {
"url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
"method": "GET",
"headers": {},
"ignoreResponse": false,
"ignoreFailure": false,
"tryCount": 1
},
"result": {
"status": 200,
"headers": {},
"body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"completed\",\"processing\":false,\"timeTaken\":1.204,\"content\":\"<!DOCTYPE html><html lang=\\\"en\\\"><body><p>this is fake data coming from the mocking system!</p></body></html>\",\"url\":\"https://www.firecrawl.dev/\",\"screenshots\":[],\"actionContent\":[],\"pageStatusCode\":200,\"responseHeaders\":{\"X-DNS-Prefetch-Control\":\"off\",\"age\":\"0\",\"cache-control\":\"private, no-cache, no-store, max-age=0, must-revalidate\",\"content-encoding\":\"br\",\"content-type\":\"text/html; charset=utf-8\",\"date\":\"Fri, 03 Jan 2025 13:34:34 GMT\",\"link\":\"</_next/static/media/171883e03d2067b6-s.p.woff2>; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\", </_next/static/media/a34f9d1faa5f3315-s.p.woff2>; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\", </_next/static/media/c4c7b0ec92b72e30-s.p.woff2>; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\"\",\"permissions-policy\":\"keyboard-map=(), attribution-reporting=(), run-ad-auction=(), private-state-token-redemption=(), private-state-token-issuance=(), join-ad-interest-group=(), idle-detection=(), compute-pressure=(), browsing-topics=()\",\"server\":\"Vercel\",\"strict-transport-security\":\"max-age=63072000\",\"vary\":\"RSC, Next-Router-State-Tree, Next-Router-Prefetch\",\"x-matched-path\":\"/\",\"x-powered-by\":\"Next.js\",\"x-vercel-cache\":\"MISS\",\"x-vercel-id\":\"iad1::iad1::bs88l-1735911273932-1f7bba7a8b45\"},\"invalidTlsCert\":false,\"file\":null}"
}
}
]

View File

@ -0,0 +1,36 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import { ScrapeRequestInput } from "../../controllers/v1/types";
// Load variables from the local .env file into process.env before any test runs
// (the tests below read process.env.TEST_API_KEY).
configDotenv();
// Base URL that every request in this file is sent to.
// NOTE(review): assumes the API server is already listening locally on port 3002 -- confirm against the test setup.
const TEST_URL = "http://127.0.0.1:3002";
/**
 * Sends a POST to the /v1/scrape endpoint at TEST_URL with the given JSON body,
 * authenticating with the TEST_API_KEY environment variable as a bearer token.
 *
 * @param body - Scrape request payload.
 * @returns The resolved supertest response.
 */
async function scrape(body: ScrapeRequestInput) {
  const authorization = `Bearer ${process.env.TEST_API_KEY}`;

  const pendingRequest = request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", authorization)
    .set("Content-Type", "application/json")
    .send(body);

  const response = await pendingRequest;
  return response;
}
/**
 * Asserts the baseline shape of a successful scrape response:
 * HTTP 200, `success: true`, and a `data` value of type "object".
 *
 * @param response - Response returned by `scrape`.
 */
function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
  const { statusCode, body } = response;

  expect(statusCode).toBe(200);
  expect(body.success).toBe(true);
  expect(typeof body.data).toBe("object");
}
describe("Scrape tests", () => {
  it("mocking works properly", async () => {
    // This test replays the deliberately falsified "mocking-works-properly" recording.
    // The fake markdown below can never be firecrawl.dev's real output, so the final
    // assertion fails if the mock layer is bypassed and real data is returned.
    const mockName = "mocking-works-properly";
    const fakeMarkdown = "this is fake data coming from the mocking system!";

    const response = await scrape({
      url: "http://firecrawl.dev",
      useMock: mockName,
    });

    expectScrapeToSucceed(response);
    expect(response.body.data.markdown).toBe(fakeMarkdown);
  });
});

View File

@ -0,0 +1,12 @@
// Bundles every recorded mock file found in the scraper's mocks directory into one
// named JSON fixture inside this test suite's ../mocks directory.
//
// Usage: node <this-script> <mock-name>
const path = require("path");
const fs = require("fs");

const mockName = process.argv[2];
if (!mockName) {
  // Without this guard the bundle would silently be written to "undefined.json".
  console.error("Usage: node " + path.basename(__filename) + " <mock-name>");
  process.exit(1);
}

const mocksDirPath = path.join(__dirname, "../../../scraper/scrapeURL/mocks");

// Only .json entries are parsed; stray files such as .DS_Store would make
// JSON.parse throw and abort the whole bundle.
const files = fs.readdirSync(mocksDirPath).filter((x) => x.endsWith(".json"));
const contents = files.map((x) =>
  JSON.parse(fs.readFileSync(path.join(mocksDirPath, x), "utf8")),
);

fs.writeFileSync(
  path.join(__dirname, "../mocks/" + mockName + ".json"),
  JSON.stringify(contents, undefined, 4),
);

View File

@ -181,6 +181,7 @@ export const scrapeOptions = z
skipTlsVerification: z.boolean().default(false), skipTlsVerification: z.boolean().default(false),
removeBase64Images: z.boolean().default(true), removeBase64Images: z.boolean().default(true),
fastMode: z.boolean().default(false), fastMode: z.boolean().default(false),
useMock: z.string().optional(),
}) })
.strict(strictMessage); .strict(strictMessage);

View File

@ -0,0 +1 @@
/mocks

View File

@ -3,12 +3,8 @@ import * as Sentry from "@sentry/node";
import { z } from "zod"; import { z } from "zod";
import { robustFetch } from "../../lib/fetch"; import { robustFetch } from "../../lib/fetch";
import { import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
ActionError, import { MockState } from "../../lib/mock";
EngineError,
SiteError,
UnsupportedFileError,
} from "../../error";
const successSchema = z.object({ const successSchema = z.object({
jobId: z.string(), jobId: z.string(),
@ -82,6 +78,7 @@ export class StillProcessingError extends Error {
export async function fireEngineCheckStatus( export async function fireEngineCheckStatus(
logger: Logger, logger: Logger,
jobId: string, jobId: string,
mock: MockState | null,
): Promise<FireEngineCheckStatusSuccess> { ): Promise<FireEngineCheckStatusSuccess> {
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
@ -105,6 +102,7 @@ export async function fireEngineCheckStatus(
} }
: {}), : {}),
}, },
mock,
}); });
}, },
); );

View File

@ -2,8 +2,9 @@ import { Logger } from "winston";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { robustFetch } from "../../lib/fetch"; import { robustFetch } from "../../lib/fetch";
import { MockState } from "../../lib/mock";
export async function fireEngineDelete(logger: Logger, jobId: string) { export async function fireEngineDelete(logger: Logger, jobId: string, mock: MockState | null) {
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
await Sentry.startSpan( await Sentry.startSpan(
@ -28,6 +29,7 @@ export async function fireEngineDelete(logger: Logger, jobId: string) {
ignoreResponse: true, ignoreResponse: true,
ignoreFailure: true, ignoreFailure: true,
logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }), logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }),
mock,
}); });
}, },
); );

View File

@ -24,6 +24,7 @@ import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities"; import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler"; import { specialtyScrapeCheck } from "../utils/specialtyHandler";
import { fireEngineDelete } from "./delete"; import { fireEngineDelete } from "./delete";
import { MockState, saveMock } from "../../lib/mock";
// This function does not take `Meta` on purpose. It may not access any // This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the // meta values to construct the request -- that must be done by the
@ -37,10 +38,12 @@ async function performFireEngineScrape<
logger: Logger, logger: Logger,
request: FireEngineScrapeRequestCommon & Engine, request: FireEngineScrapeRequestCommon & Engine,
timeout: number, timeout: number,
mock: MockState | null,
): Promise<FireEngineCheckStatusSuccess> { ): Promise<FireEngineCheckStatusSuccess> {
const scrape = await fireEngineScrape( const scrape = await fireEngineScrape(
logger.child({ method: "fireEngineScrape" }), logger.child({ method: "fireEngineScrape" }),
request, request,
mock,
); );
const startTime = Date.now(); const startTime = Date.now();
@ -57,6 +60,7 @@ async function performFireEngineScrape<
afterErrors: errors, afterErrors: errors,
}), }),
scrape.jobId, scrape.jobId,
mock,
); );
throw new Error("Error limit hit. See e.cause.errors for errors.", { throw new Error("Error limit hit. See e.cause.errors for errors.", {
cause: { errors }, cause: { errors },
@ -78,6 +82,7 @@ async function performFireEngineScrape<
status = await fireEngineCheckStatus( status = await fireEngineCheckStatus(
logger.child({ method: "fireEngineCheckStatus" }), logger.child({ method: "fireEngineCheckStatus" }),
scrape.jobId, scrape.jobId,
mock,
); );
} catch (error) { } catch (error) {
if (error instanceof StillProcessingError) { if (error instanceof StillProcessingError) {
@ -94,6 +99,7 @@ async function performFireEngineScrape<
afterError: error, afterError: error,
}), }),
scrape.jobId, scrape.jobId,
mock,
); );
logger.debug("Fire-engine scrape job failed.", { logger.debug("Fire-engine scrape job failed.", {
error, error,
@ -131,6 +137,7 @@ async function performFireEngineScrape<
method: "performFireEngineScrape/fireEngineDelete", method: "performFireEngineScrape/fireEngineDelete",
}), }),
scrape.jobId, scrape.jobId,
mock,
); );
return status; return status;
@ -200,6 +207,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
}), }),
request, request,
timeout, timeout,
meta.mock,
); );
if ( if (
@ -274,6 +282,7 @@ export async function scrapeURLWithFireEnginePlaywright(
}), }),
request, request,
timeout, timeout,
meta.mock,
); );
if (!response.url) { if (!response.url) {
@ -327,6 +336,7 @@ export async function scrapeURLWithFireEngineTLSClient(
}), }),
request, request,
timeout, timeout,
meta.mock,
); );
if (!response.url) { if (!response.url) {

View File

@ -4,6 +4,7 @@ import { z } from "zod";
import { Action } from "../../../../lib/entities"; import { Action } from "../../../../lib/entities";
import { robustFetch } from "../../lib/fetch"; import { robustFetch } from "../../lib/fetch";
import { MockState } from "../../lib/mock";
export type FireEngineScrapeRequestCommon = { export type FireEngineScrapeRequestCommon = {
url: string; url: string;
@ -69,6 +70,7 @@ export async function fireEngineScrape<
>( >(
logger: Logger, logger: Logger,
request: FireEngineScrapeRequestCommon & Engine, request: FireEngineScrapeRequestCommon & Engine,
mock: MockState | null,
): Promise<z.infer<typeof schema>> { ): Promise<z.infer<typeof schema>> {
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
@ -97,6 +99,7 @@ export async function fireEngineScrape<
logger: logger.child({ method: "fireEngineScrape/robustFetch" }), logger: logger.child({ method: "fireEngineScrape/robustFetch" }),
schema, schema,
tryCount: 3, tryCount: 3,
mock,
}); });
}, },
); );

View File

@ -46,6 +46,7 @@ async function scrapePDFWithRunPodMU(
markdown: z.string(), markdown: z.string(),
}), }),
}), }),
mock: meta.mock,
}); });
return { return {

View File

@ -29,6 +29,7 @@ export async function scrapeURLWithPlaywright(
pageStatusCode: z.number(), pageStatusCode: z.number(),
pageError: z.string().optional(), pageError: z.string().optional(),
}), }),
mock: meta.mock,
}), }),
(async () => { (async () => {
await new Promise((resolve) => setTimeout(() => resolve(null), timeout)); await new Promise((resolve) => setTimeout(() => resolve(null), timeout));

View File

@ -2,7 +2,7 @@ import { Logger } from "winston";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { Document, ScrapeOptions } from "../../controllers/v1/types"; import { Document, ScrapeOptions } from "../../controllers/v1/types";
import { logger } from "../../lib/logger"; import { logger as _logger } from "../../lib/logger";
import { import {
buildFallbackList, buildFallbackList,
Engine, Engine,
@ -24,6 +24,7 @@ import {
import { executeTransformers } from "./transformers"; import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract"; import { LLMRefusalError } from "./transformers/llmExtract";
import { urlSpecificParams } from "./lib/urlSpecificParams"; import { urlSpecificParams } from "./lib/urlSpecificParams";
import { loadMock, MockState } from "./lib/mock";
export type ScrapeUrlResponse = ( export type ScrapeUrlResponse = (
| { | {
@ -47,6 +48,7 @@ export type Meta = {
logger: Logger; logger: Logger;
logs: any[]; logs: any[];
featureFlags: Set<FeatureFlag>; featureFlags: Set<FeatureFlag>;
mock: MockState | null;
}; };
function buildFeatureFlags( function buildFeatureFlags(
@ -110,12 +112,12 @@ function buildFeatureFlags(
// The meta object is usually immutable, except for the logs array, and in edge cases (e.g. a new feature is suddenly required) // The meta object is usually immutable, except for the logs array, and in edge cases (e.g. a new feature is suddenly required)
// Having a meta object that is treated as immutable helps the code stay clean and easily tracable, // Having a meta object that is treated as immutable helps the code stay clean and easily tracable,
// while also retaining the benefits that WebScraper had from its OOP design. // while also retaining the benefits that WebScraper had from its OOP design.
function buildMetaObject( async function buildMetaObject(
id: string, id: string,
url: string, url: string,
options: ScrapeOptions, options: ScrapeOptions,
internalOptions: InternalOptions, internalOptions: InternalOptions,
): Meta { ): Promise<Meta> {
const specParams = const specParams =
urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")]; urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
if (specParams !== undefined) { if (specParams !== undefined) {
@ -126,7 +128,7 @@ function buildMetaObject(
); );
} }
const _logger = logger.child({ const logger = _logger.child({
module: "ScrapeURL", module: "ScrapeURL",
scrapeId: id, scrapeId: id,
scrapeURL: url, scrapeURL: url,
@ -138,9 +140,10 @@ function buildMetaObject(
url, url,
options, options,
internalOptions, internalOptions,
logger: _logger, logger,
logs, logs,
featureFlags: buildFeatureFlags(url, options, internalOptions), featureFlags: buildFeatureFlags(url, options, internalOptions),
mock: options.useMock !== undefined ? await loadMock(options.useMock, _logger) : null,
}; };
} }
@ -299,7 +302,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
throw error; throw error;
} else { } else {
Sentry.captureException(error); Sentry.captureException(error);
meta.logger.info( meta.logger.warn(
"An unexpected error happened while scraping with " + engine + ".", "An unexpected error happened while scraping with " + engine + ".",
{ error }, { error },
); );
@ -362,7 +365,7 @@ export async function scrapeURL(
options: ScrapeOptions, options: ScrapeOptions,
internalOptions: InternalOptions = {}, internalOptions: InternalOptions = {},
): Promise<ScrapeUrlResponse> { ): Promise<ScrapeUrlResponse> {
const meta = buildMetaObject(id, url, options, internalOptions); const meta = await buildMetaObject(id, url, options, internalOptions);
try { try {
while (true) { while (true) {
try { try {

View File

@ -1,7 +1,7 @@
import { Logger } from "winston"; import { Logger } from "winston";
import { z, ZodError } from "zod"; import { z, ZodError } from "zod";
import { v4 as uuid } from "uuid";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { MockState, saveMock } from "./mock";
export type RobustFetchParams<Schema extends z.Schema<any>> = { export type RobustFetchParams<Schema extends z.Schema<any>> = {
url: string; url: string;
@ -16,6 +16,7 @@ export type RobustFetchParams<Schema extends z.Schema<any>> = {
requestId?: string; requestId?: string;
tryCount?: number; tryCount?: number;
tryCooldown?: number; tryCooldown?: number;
mock: MockState | null;
}; };
export async function robustFetch< export async function robustFetch<
@ -30,9 +31,10 @@ export async function robustFetch<
schema, schema,
ignoreResponse = false, ignoreResponse = false,
ignoreFailure = false, ignoreFailure = false,
requestId = uuid(), requestId = crypto.randomUUID(),
tryCount = 1, tryCount = 1,
tryCooldown, tryCooldown,
mock
}: RobustFetchParams<Schema>): Promise<Output> { }: RobustFetchParams<Schema>): Promise<Output> {
const params = { const params = {
url, url,
@ -47,6 +49,13 @@ export async function robustFetch<
tryCooldown, tryCooldown,
}; };
let response: {
status: number;
headers: Headers,
body: string,
};
if (mock === null) {
let request: Response; let request: Response;
try { try {
request = await fetch(url, { request = await fetch(url, {
@ -83,6 +92,7 @@ export async function robustFetch<
...params, ...params,
requestId, requestId,
tryCount: tryCount - 1, tryCount: tryCount - 1,
mock,
}); });
} else { } else {
logger.debug("Request failed", { params, error, requestId }); logger.debug("Request failed", { params, error, requestId });
@ -103,17 +113,44 @@ export async function robustFetch<
return null as Output; return null as Output;
} }
const response = { response = {
status: request.status, status: request.status,
headers: request.headers, headers: request.headers,
body: await request.text(), // NOTE: can this throw an exception? body: await request.text(), // NOTE: can this throw an exception?
}; };
} else {
if (ignoreResponse === true) {
return null as Output;
}
if (request.status >= 300) { const makeRequestTypeId = (request: typeof mock["requests"][number]["options"]) => {
let out = request.url + ";" + request.method;
if (process.env.FIRE_ENGINE_BETA_URL && url.startsWith(process.env.FIRE_ENGINE_BETA_URL) && request.method === "POST") {
out += "f-e;" + request.body?.engine + ";" + request.body?.url;
}
return out;
}
const thisId = makeRequestTypeId(params);
const matchingMocks = mock.requests.filter(x => makeRequestTypeId(x.options) === thisId).sort((a,b) => a.time - b.time);
const nextI = mock.tracker[thisId] ?? 0;
mock.tracker[thisId] = nextI + 1;
if (!matchingMocks[nextI]) {
throw new Error("Failed to mock request -- no mock targets found.");
}
response = {
...(matchingMocks[nextI].result),
headers: new Headers(matchingMocks[nextI].result.headers),
};
}
if (response.status >= 300) {
if (tryCount > 1) { if (tryCount > 1) {
logger.debug( logger.debug(
"Request sent failure status, trying " + (tryCount - 1) + " more times", "Request sent failure status, trying " + (tryCount - 1) + " more times",
{ params, request, response, requestId }, { params, response, requestId },
); );
if (tryCooldown !== undefined) { if (tryCooldown !== undefined) {
await new Promise((resolve) => await new Promise((resolve) =>
@ -124,18 +161,17 @@ export async function robustFetch<
...params, ...params,
requestId, requestId,
tryCount: tryCount - 1, tryCount: tryCount - 1,
mock,
}); });
} else { } else {
logger.debug("Request sent failure status", { logger.debug("Request sent failure status", {
params, params,
request,
response, response,
requestId, requestId,
}); });
throw new Error("Request sent failure status", { throw new Error("Request sent failure status", {
cause: { cause: {
params, params,
request,
response, response,
requestId, requestId,
}, },
@ -143,20 +179,27 @@ export async function robustFetch<
} }
} }
if (mock === null) {
await saveMock({
...params,
logger: undefined,
schema: undefined,
headers: undefined,
}, response);
}
let data: Output; let data: Output;
try { try {
data = JSON.parse(response.body); data = JSON.parse(response.body);
} catch (error) { } catch (error) {
logger.debug("Request sent malformed JSON", { logger.debug("Request sent malformed JSON", {
params, params,
request,
response, response,
requestId, requestId,
}); });
throw new Error("Request sent malformed JSON", { throw new Error("Request sent malformed JSON", {
cause: { cause: {
params, params,
request,
response, response,
requestId, requestId,
}, },
@ -170,7 +213,6 @@ export async function robustFetch<
if (error instanceof ZodError) { if (error instanceof ZodError) {
logger.debug("Response does not match provided schema", { logger.debug("Response does not match provided schema", {
params, params,
request,
response, response,
requestId, requestId,
error, error,
@ -179,7 +221,6 @@ export async function robustFetch<
throw new Error("Response does not match provided schema", { throw new Error("Response does not match provided schema", {
cause: { cause: {
params, params,
request,
response, response,
requestId, requestId,
error, error,
@ -189,7 +230,6 @@ export async function robustFetch<
} else { } else {
logger.debug("Parsing response with provided schema failed", { logger.debug("Parsing response with provided schema failed", {
params, params,
request,
response, response,
requestId, requestId,
error, error,
@ -198,7 +238,6 @@ export async function robustFetch<
throw new Error("Parsing response with provided schema failed", { throw new Error("Parsing response with provided schema failed", {
cause: { cause: {
params, params,
request,
response, response,
requestId, requestId,
error, error,

View File

@ -0,0 +1,60 @@
import * as fs from "fs/promises";
import * as path from "path";
import { logger as _logger } from "../../../lib/logger";
import { Logger } from "winston";
// Directory that saveMock() writes newly recorded request/response files into.
// The "dist/" segment is stripped, presumably so that recordings made from a compiled
// build land in the source tree instead of the build output -- TODO confirm.
// NOTE(review): String.prototype.replace with a string pattern removes only the first
// "dist/" occurrence, wherever it appears in the absolute path.
const saveMocksDirPath = path.join(__dirname, "../mocks/").replace("dist/", "");
// Directory that loadMock() reads named mock files (<name>.json) from.
const loadMocksDirPath = path.join(__dirname, "../../../__tests__/snips/mocks");
/**
 * Records one request/response exchange as a pretty-printed JSON file so it can
 * later be replayed as a mock.
 *
 * Does nothing unless the FIRECRAWL_SAVE_MOCKS environment variable is set.
 *
 * @param options - The request parameters to record.
 * @param result - The response to record.
 */
export async function saveMock(options: unknown, result: unknown): Promise<void> {
  if (!process.env.FIRECRAWL_SAVE_MOCKS) return;

  await fs.mkdir(saveMocksDirPath, { recursive: true });

  // Capture the timestamp once so the file name and the stored `time` field always agree.
  const time = Date.now();
  const fileName = time + "-" + crypto.randomUUID() + ".json";
  const filePath = path.join(saveMocksDirPath, fileName);
  console.log(filePath);

  await fs.writeFile(
    filePath,
    JSON.stringify({ time, options, result }, undefined, 4),
  );
}
/** Request parameters stored alongside a recorded exchange. */
type MockRequestOptions = {
  url: string;
  method: string;
  body?: any;
  ignoreResponse: boolean;
  ignoreFailure: boolean;
  tryCount: number;
  tryCooldown?: number;
};

/** One recorded request/response exchange. */
type MockRequest = {
  /** Millisecond timestamp written when the exchange was recorded. */
  time: number;
  /** The request that was made. */
  options: MockRequestOptions;
  /** The recorded response. */
  result: any;
};

/** In-memory state of a loaded mock: the recorded exchanges plus replay counters. */
export type MockState = {
  /** Every recorded exchange contained in the mock file. */
  requests: MockRequest[];
  /** Numeric counters keyed by string. */
  tracker: Record<string, number>;
};
/**
 * Loads the named mock file (<name>.json) from the mocks directory.
 *
 * Any failure is logged as a warning and reported as `null` rather than thrown:
 * a name that resolves outside the mocks directory, a missing or unreadable file,
 * invalid JSON, or JSON that is not an array.
 *
 * @param name - Mock name without the ".json" extension.
 * @param logger - Logger that receives load-failure warnings.
 * @returns A fresh mock state with an empty tracker, or `null` if loading failed.
 */
export async function loadMock(name: string, logger: Logger = _logger): Promise<MockState | null> {
  try {
    const mockPath = path.join(loadMocksDirPath, name + ".json");
    const relative = path.relative(loadMocksDirPath, mockPath);

    // Path-traversal guard: the resolved file must stay inside the mocks directory.
    if (!relative || relative.startsWith("..") || path.isAbsolute(relative)) {
      throw new Error("Mock name resolves outside of the mocks directory.");
    }

    const load: unknown = JSON.parse(await fs.readFile(mockPath, "utf8"));

    // A mock file must be an array of recorded requests; reject any other JSON here
    // instead of handing callers a state whose `requests` is not an array.
    if (!Array.isArray(load)) {
      throw new Error("Mock file does not contain an array of requests.");
    }

    return {
      requests: load,
      tracker: {},
    };
  } catch (error) {
    logger.warn("Failed to load mock file!", { name, module: "scrapeURL:mock", method: "loadMock", error });
    return null;
  }
}