mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-04-18 12:09:42 +08:00
feat: new snips test framework (FIR-414) (#1033)
* feat: new snips test framework * Update mock.ts --------- Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
parent
9a13c1dede
commit
5c62bb1195
@ -15,6 +15,7 @@
|
||||
"test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
|
||||
"test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
|
||||
"test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'",
|
||||
"test:snips": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false src/__tests__/snips/*.test.ts",
|
||||
"workers": "nodemon --exec ts-node src/services/queue-worker.ts",
|
||||
"worker:production": "node dist/src/services/queue-worker.js",
|
||||
"mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest",
|
||||
@ -37,6 +38,7 @@
|
||||
"@types/jest": "^29.5.12",
|
||||
"@types/node": "^20.14.1",
|
||||
"@types/pdf-parse": "^1.1.4",
|
||||
"@types/supertest": "^6.0.2",
|
||||
"body-parser": "^1.20.1",
|
||||
"express": "^4.18.2",
|
||||
"jest": "^29.6.3",
|
||||
|
31
apps/api/pnpm-lock.yaml
generated
31
apps/api/pnpm-lock.yaml
generated
@ -267,6 +267,9 @@ importers:
|
||||
'@types/pdf-parse':
|
||||
specifier: ^1.1.4
|
||||
version: 1.1.4
|
||||
'@types/supertest':
|
||||
specifier: ^6.0.2
|
||||
version: 6.0.2
|
||||
body-parser:
|
||||
specifier: ^1.20.1
|
||||
version: 1.20.2
|
||||
@ -1516,6 +1519,9 @@ packages:
|
||||
'@types/connect@3.4.38':
|
||||
resolution: {integrity: sha512-K6uROf1LD88uDQqJCktA4yzL1YYAK6NgfsI0v/mTgyPKWsX1CnJ0XPSDhViejru1GcRkLWb8RlzFYJRqGUbaug==}
|
||||
|
||||
'@types/cookiejar@2.1.5':
|
||||
resolution: {integrity: sha512-he+DHOWReW0nghN24E1WUqM0efK4kI9oTqDm6XmK8ZPe2djZ90BSNdGnIyCLzCPw7/pogPlGbzI2wHGGmi4O/Q==}
|
||||
|
||||
'@types/cors@2.8.17':
|
||||
resolution: {integrity: sha512-8CGDvrBj1zgo2qE+oS3pOCyYNqCPryMWY2bGfwA0dcfopWGgxs+78df0Rs3rc9THP4JkOhLsAa+15VdpAqkcUA==}
|
||||
|
||||
@ -1549,6 +1555,9 @@ packages:
|
||||
'@types/jest@29.5.12':
|
||||
resolution: {integrity: sha512-eDC8bTvT/QhYdxJAulQikueigY5AsdBRH2yDKW3yveW7svY3+DzN84/2NUgkw10RTiJbWqZrTtoGVdYlvFJdLw==}
|
||||
|
||||
'@types/methods@1.1.4':
|
||||
resolution: {integrity: sha512-ymXWVrDiCxTBE3+RIrrP533E70eA+9qu7zdWoHuOmGujkYtzf4HQF96b8nwHLqhuf4ykX61IGRIB38CC6/sImQ==}
|
||||
|
||||
'@types/mime@1.3.5':
|
||||
resolution: {integrity: sha512-/pyBZWSLD2n0dcHE3hq8s8ZvcETHtEuF+3E7XVt0Ig2nvsVQXdghHVcEkIWjy9A0wKfTn97a/PSDYohKIlnP/w==}
|
||||
|
||||
@ -1597,6 +1606,12 @@ packages:
|
||||
'@types/stack-utils@2.0.3':
|
||||
resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==}
|
||||
|
||||
'@types/superagent@8.1.9':
|
||||
resolution: {integrity: sha512-pTVjI73witn+9ILmoJdajHGW2jkSaOzhiFYF1Rd3EQ94kymLqB9PjD9ISg7WaALC7+dCHT0FGe9T2LktLq/3GQ==}
|
||||
|
||||
'@types/supertest@6.0.2':
|
||||
resolution: {integrity: sha512-137ypx2lk/wTQbW6An6safu9hXmajAifU/s7szAHLN/FeIm5w7yR0Wkl9fdJMRSHwOn4HLAI0DaB2TOORuhPDg==}
|
||||
|
||||
'@types/triple-beam@1.3.5':
|
||||
resolution: {integrity: sha512-6WaYesThRMCl19iryMYP7/x2OVgCtbIVflDGFpWnb9irXI3UjYE4AzmYuiUKY1AJstGijoY+MgUszMgRxIYTYw==}
|
||||
|
||||
@ -6566,6 +6581,8 @@ snapshots:
|
||||
dependencies:
|
||||
'@types/node': 20.14.1
|
||||
|
||||
'@types/cookiejar@2.1.5': {}
|
||||
|
||||
'@types/cors@2.8.17':
|
||||
dependencies:
|
||||
'@types/node': 20.14.1
|
||||
@ -6613,6 +6630,8 @@ snapshots:
|
||||
expect: 29.7.0
|
||||
pretty-format: 29.7.0
|
||||
|
||||
'@types/methods@1.1.4': {}
|
||||
|
||||
'@types/mime@1.3.5': {}
|
||||
|
||||
'@types/mysql@2.15.22':
|
||||
@ -6667,6 +6686,18 @@ snapshots:
|
||||
|
||||
'@types/stack-utils@2.0.3': {}
|
||||
|
||||
'@types/superagent@8.1.9':
|
||||
dependencies:
|
||||
'@types/cookiejar': 2.1.5
|
||||
'@types/methods': 1.1.4
|
||||
'@types/node': 20.14.1
|
||||
form-data: 4.0.0
|
||||
|
||||
'@types/supertest@6.0.2':
|
||||
dependencies:
|
||||
'@types/methods': 1.1.4
|
||||
'@types/superagent': 8.1.9
|
||||
|
||||
'@types/triple-beam@1.3.5': {}
|
||||
|
||||
'@types/uuid@9.0.8': {}
|
||||
|
107
apps/api/src/__tests__/snips/mocks/mocking-works-properly.json
Normal file
107
apps/api/src/__tests__/snips/mocks/mocking-works-properly.json
Normal file
@ -0,0 +1,107 @@
|
||||
[
|
||||
{
|
||||
"time": 1735911273239,
|
||||
"options": {
|
||||
"url": "http://default-fire-engine-api-service:8080/scrape",
|
||||
"method": "POST",
|
||||
"body": {
|
||||
"url": "http://firecrawl.dev",
|
||||
"engine": "chrome-cdp",
|
||||
"instantReturn": true,
|
||||
"skipTlsVerification": false,
|
||||
"priority": 10,
|
||||
"mobile": false,
|
||||
"timeout": 15000
|
||||
},
|
||||
"headers": {},
|
||||
"ignoreResponse": false,
|
||||
"ignoreFailure": false,
|
||||
"tryCount": 3
|
||||
},
|
||||
"result": {
|
||||
"status": 200,
|
||||
"headers": {},
|
||||
"body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp\",\"processing\":true}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"time": 1735911273354,
|
||||
"options": {
|
||||
"url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
|
||||
"method": "GET",
|
||||
"headers": {},
|
||||
"ignoreResponse": false,
|
||||
"ignoreFailure": false,
|
||||
"tryCount": 1
|
||||
},
|
||||
"result": {
|
||||
"status": 200,
|
||||
"headers": {},
|
||||
"body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"prioritized\",\"processing\":true}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"time": 1735911273720,
|
||||
"options": {
|
||||
"url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
|
||||
"method": "GET",
|
||||
"headers": {},
|
||||
"ignoreResponse": false,
|
||||
"ignoreFailure": false,
|
||||
"tryCount": 1
|
||||
},
|
||||
"result": {
|
||||
"status": 200,
|
||||
"headers": {},
|
||||
"body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"time": 1735911274092,
|
||||
"options": {
|
||||
"url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
|
||||
"method": "GET",
|
||||
"headers": {},
|
||||
"ignoreResponse": false,
|
||||
"ignoreFailure": false,
|
||||
"tryCount": 1
|
||||
},
|
||||
"result": {
|
||||
"status": 200,
|
||||
"headers": {},
|
||||
"body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"time": 1735911274467,
|
||||
"options": {
|
||||
"url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
|
||||
"method": "GET",
|
||||
"headers": {},
|
||||
"ignoreResponse": false,
|
||||
"ignoreFailure": false,
|
||||
"tryCount": 1
|
||||
},
|
||||
"result": {
|
||||
"status": 200,
|
||||
"headers": {},
|
||||
"body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"time": 1735911274947,
|
||||
"options": {
|
||||
"url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp",
|
||||
"method": "GET",
|
||||
"headers": {},
|
||||
"ignoreResponse": false,
|
||||
"ignoreFailure": false,
|
||||
"tryCount": 1
|
||||
},
|
||||
"result": {
|
||||
"status": 200,
|
||||
"headers": {},
|
||||
"body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"completed\",\"processing\":false,\"timeTaken\":1.204,\"content\":\"<!DOCTYPE html><html lang=\\\"en\\\"><body><p>this is fake data coming from the mocking system!</p></body></html>\",\"url\":\"https://www.firecrawl.dev/\",\"screenshots\":[],\"actionContent\":[],\"pageStatusCode\":200,\"responseHeaders\":{\"X-DNS-Prefetch-Control\":\"off\",\"age\":\"0\",\"cache-control\":\"private, no-cache, no-store, max-age=0, must-revalidate\",\"content-encoding\":\"br\",\"content-type\":\"text/html; charset=utf-8\",\"date\":\"Fri, 03 Jan 2025 13:34:34 GMT\",\"link\":\"</_next/static/media/171883e03d2067b6-s.p.woff2>; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\", </_next/static/media/a34f9d1faa5f3315-s.p.woff2>; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\", </_next/static/media/c4c7b0ec92b72e30-s.p.woff2>; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\"\",\"permissions-policy\":\"keyboard-map=(), attribution-reporting=(), run-ad-auction=(), private-state-token-redemption=(), private-state-token-issuance=(), join-ad-interest-group=(), idle-detection=(), compute-pressure=(), browsing-topics=()\",\"server\":\"Vercel\",\"strict-transport-security\":\"max-age=63072000\",\"vary\":\"RSC, Next-Router-State-Tree, Next-Router-Prefetch\",\"x-matched-path\":\"/\",\"x-powered-by\":\"Next.js\",\"x-vercel-cache\":\"MISS\",\"x-vercel-id\":\"iad1::iad1::bs88l-1735911273932-1f7bba7a8b45\"},\"invalidTlsCert\":false,\"file\":null}"
|
||||
}
|
||||
}
|
||||
]
|
36
apps/api/src/__tests__/snips/scrape.test.ts
Normal file
36
apps/api/src/__tests__/snips/scrape.test.ts
Normal file
@ -0,0 +1,36 @@
|
||||
import request from "supertest";
|
||||
import { configDotenv } from "dotenv";
|
||||
import { ScrapeRequestInput } from "../../controllers/v1/types";
|
||||
|
||||
// Load environment variables (TEST_API_KEY is read below) via dotenv.
configDotenv();

// Base URL of the API server that the snips tests send requests to.
const TEST_URL = "http://127.0.0.1:3002";

/**
 * Posts a scrape request to the `/v1/scrape` endpoint of the server at TEST_URL.
 *
 * The request carries a bearer token taken from the TEST_API_KEY environment
 * variable and is sent as JSON.
 *
 * @param body - The scrape request payload.
 * @returns The supertest response for the request.
 */
async function scrape(body: ScrapeRequestInput) {
  const authorization = `Bearer ${process.env.TEST_API_KEY}`;

  const response = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", authorization)
    .set("Content-Type", "application/json")
    .send(body);

  return response;
}
|
||||
|
||||
/**
 * Asserts that a response returned by `scrape` describes a successful scrape.
 *
 * Checks that the HTTP status code is 200, that the body's `success` flag is
 * true, and that the body's `data` field is a non-null object.
 *
 * @param response - The response returned by `scrape`.
 */
function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
  expect(response.statusCode).toBe(200);
  expect(response.body.success).toBe(true);
  // `typeof null` is also "object", so a null `data` must be ruled out explicitly.
  expect(response.body.data).not.toBeNull();
  expect(typeof response.body.data).toBe("object");
}
|
||||
|
||||
describe("Scrape tests", () => {
  it("mocking works properly", async () => {
    // Relies on the deliberately falsified mock file "mocking-works-properly".
    // If the mock were bypassed and real data used instead, this test would
    // fail: firecrawl.dev will never have this text as its actual markdown output.
    const expectedMarkdown = "this is fake data coming from the mocking system!";

    const response = await scrape({
      url: "http://firecrawl.dev",
      useMock: "mocking-works-properly",
    });

    expectScrapeToSucceed(response);
    expect(response.body.data.markdown).toBe(expectedMarkdown);
  });
});
|
12
apps/api/src/__tests__/snips/utils/collect-mocks.js
Normal file
12
apps/api/src/__tests__/snips/utils/collect-mocks.js
Normal file
@ -0,0 +1,12 @@
|
||||
// Bundles every JSON mock found in src/scraper/scrapeURL/mocks into a single
// array and writes it to src/__tests__/snips/mocks/<name>.json.
//
// Usage: node collect-mocks.js <name>
const path = require("path");
const fs = require("fs");

const mockName = process.argv[2];
if (!mockName) {
  // Without this guard the bundle would be written to "undefined.json".
  console.error("Usage: node collect-mocks.js <name>");
  process.exit(1);
}

const mocksDirPath = path.join(__dirname, "../../../scraper/scrapeURL/mocks");

// Skip anything that is not a .json file so a stray file in the directory
// does not make JSON.parse throw.
const files = fs.readdirSync(mocksDirPath).filter((x) => x.endsWith(".json"));

const contents = files.map((x) =>
  JSON.parse(fs.readFileSync(path.join(mocksDirPath, x), "utf8")),
);

fs.writeFileSync(
  path.join(__dirname, "../mocks/" + mockName + ".json"),
  JSON.stringify(contents, undefined, 4),
);
|
@ -181,6 +181,7 @@ export const scrapeOptions = z
|
||||
skipTlsVerification: z.boolean().default(false),
|
||||
removeBase64Images: z.boolean().default(true),
|
||||
fastMode: z.boolean().default(false),
|
||||
useMock: z.string().optional(),
|
||||
})
|
||||
.strict(strictMessage);
|
||||
|
||||
|
1
apps/api/src/scraper/scrapeURL/.gitignore
vendored
Normal file
1
apps/api/src/scraper/scrapeURL/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
/mocks
|
@ -3,12 +3,8 @@ import * as Sentry from "@sentry/node";
|
||||
import { z } from "zod";
|
||||
|
||||
import { robustFetch } from "../../lib/fetch";
|
||||
import {
|
||||
ActionError,
|
||||
EngineError,
|
||||
SiteError,
|
||||
UnsupportedFileError,
|
||||
} from "../../error";
|
||||
import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
|
||||
import { MockState } from "../../lib/mock";
|
||||
|
||||
const successSchema = z.object({
|
||||
jobId: z.string(),
|
||||
@ -82,6 +78,7 @@ export class StillProcessingError extends Error {
|
||||
export async function fireEngineCheckStatus(
|
||||
logger: Logger,
|
||||
jobId: string,
|
||||
mock: MockState | null,
|
||||
): Promise<FireEngineCheckStatusSuccess> {
|
||||
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
||||
|
||||
@ -105,6 +102,7 @@ export async function fireEngineCheckStatus(
|
||||
}
|
||||
: {}),
|
||||
},
|
||||
mock,
|
||||
});
|
||||
},
|
||||
);
|
||||
|
@ -2,8 +2,9 @@ import { Logger } from "winston";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
import { robustFetch } from "../../lib/fetch";
|
||||
import { MockState } from "../../lib/mock";
|
||||
|
||||
export async function fireEngineDelete(logger: Logger, jobId: string) {
|
||||
export async function fireEngineDelete(logger: Logger, jobId: string, mock: MockState | null) {
|
||||
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
||||
|
||||
await Sentry.startSpan(
|
||||
@ -28,6 +29,7 @@ export async function fireEngineDelete(logger: Logger, jobId: string) {
|
||||
ignoreResponse: true,
|
||||
ignoreFailure: true,
|
||||
logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }),
|
||||
mock,
|
||||
});
|
||||
},
|
||||
);
|
||||
|
@ -24,6 +24,7 @@ import * as Sentry from "@sentry/node";
|
||||
import { Action } from "../../../../lib/entities";
|
||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||
import { fireEngineDelete } from "./delete";
|
||||
import { MockState, saveMock } from "../../lib/mock";
|
||||
|
||||
// This function does not take `Meta` on purpose. It may not access any
|
||||
// meta values to construct the request -- that must be done by the
|
||||
@ -37,10 +38,12 @@ async function performFireEngineScrape<
|
||||
logger: Logger,
|
||||
request: FireEngineScrapeRequestCommon & Engine,
|
||||
timeout: number,
|
||||
mock: MockState | null,
|
||||
): Promise<FireEngineCheckStatusSuccess> {
|
||||
const scrape = await fireEngineScrape(
|
||||
logger.child({ method: "fireEngineScrape" }),
|
||||
request,
|
||||
mock,
|
||||
);
|
||||
|
||||
const startTime = Date.now();
|
||||
@ -57,6 +60,7 @@ async function performFireEngineScrape<
|
||||
afterErrors: errors,
|
||||
}),
|
||||
scrape.jobId,
|
||||
mock,
|
||||
);
|
||||
throw new Error("Error limit hit. See e.cause.errors for errors.", {
|
||||
cause: { errors },
|
||||
@ -78,6 +82,7 @@ async function performFireEngineScrape<
|
||||
status = await fireEngineCheckStatus(
|
||||
logger.child({ method: "fireEngineCheckStatus" }),
|
||||
scrape.jobId,
|
||||
mock,
|
||||
);
|
||||
} catch (error) {
|
||||
if (error instanceof StillProcessingError) {
|
||||
@ -94,6 +99,7 @@ async function performFireEngineScrape<
|
||||
afterError: error,
|
||||
}),
|
||||
scrape.jobId,
|
||||
mock,
|
||||
);
|
||||
logger.debug("Fire-engine scrape job failed.", {
|
||||
error,
|
||||
@ -131,6 +137,7 @@ async function performFireEngineScrape<
|
||||
method: "performFireEngineScrape/fireEngineDelete",
|
||||
}),
|
||||
scrape.jobId,
|
||||
mock,
|
||||
);
|
||||
|
||||
return status;
|
||||
@ -200,6 +207,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
||||
}),
|
||||
request,
|
||||
timeout,
|
||||
meta.mock,
|
||||
);
|
||||
|
||||
if (
|
||||
@ -274,6 +282,7 @@ export async function scrapeURLWithFireEnginePlaywright(
|
||||
}),
|
||||
request,
|
||||
timeout,
|
||||
meta.mock,
|
||||
);
|
||||
|
||||
if (!response.url) {
|
||||
@ -327,6 +336,7 @@ export async function scrapeURLWithFireEngineTLSClient(
|
||||
}),
|
||||
request,
|
||||
timeout,
|
||||
meta.mock,
|
||||
);
|
||||
|
||||
if (!response.url) {
|
||||
|
@ -4,6 +4,7 @@ import { z } from "zod";
|
||||
|
||||
import { Action } from "../../../../lib/entities";
|
||||
import { robustFetch } from "../../lib/fetch";
|
||||
import { MockState } from "../../lib/mock";
|
||||
|
||||
export type FireEngineScrapeRequestCommon = {
|
||||
url: string;
|
||||
@ -69,6 +70,7 @@ export async function fireEngineScrape<
|
||||
>(
|
||||
logger: Logger,
|
||||
request: FireEngineScrapeRequestCommon & Engine,
|
||||
mock: MockState | null,
|
||||
): Promise<z.infer<typeof schema>> {
|
||||
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
||||
|
||||
@ -97,6 +99,7 @@ export async function fireEngineScrape<
|
||||
logger: logger.child({ method: "fireEngineScrape/robustFetch" }),
|
||||
schema,
|
||||
tryCount: 3,
|
||||
mock,
|
||||
});
|
||||
},
|
||||
);
|
||||
|
@ -46,6 +46,7 @@ async function scrapePDFWithRunPodMU(
|
||||
markdown: z.string(),
|
||||
}),
|
||||
}),
|
||||
mock: meta.mock,
|
||||
});
|
||||
|
||||
return {
|
||||
|
@ -29,6 +29,7 @@ export async function scrapeURLWithPlaywright(
|
||||
pageStatusCode: z.number(),
|
||||
pageError: z.string().optional(),
|
||||
}),
|
||||
mock: meta.mock,
|
||||
}),
|
||||
(async () => {
|
||||
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
|
||||
|
@ -2,7 +2,7 @@ import { Logger } from "winston";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
import { Document, ScrapeOptions } from "../../controllers/v1/types";
|
||||
import { logger } from "../../lib/logger";
|
||||
import { logger as _logger } from "../../lib/logger";
|
||||
import {
|
||||
buildFallbackList,
|
||||
Engine,
|
||||
@ -24,6 +24,7 @@ import {
|
||||
import { executeTransformers } from "./transformers";
|
||||
import { LLMRefusalError } from "./transformers/llmExtract";
|
||||
import { urlSpecificParams } from "./lib/urlSpecificParams";
|
||||
import { loadMock, MockState } from "./lib/mock";
|
||||
|
||||
export type ScrapeUrlResponse = (
|
||||
| {
|
||||
@ -47,6 +48,7 @@ export type Meta = {
|
||||
logger: Logger;
|
||||
logs: any[];
|
||||
featureFlags: Set<FeatureFlag>;
|
||||
mock: MockState | null;
|
||||
};
|
||||
|
||||
function buildFeatureFlags(
|
||||
@ -110,12 +112,12 @@ function buildFeatureFlags(
|
||||
// The meta object is usually immutable, except for the logs array, and in edge cases (e.g. a new feature is suddenly required)
|
||||
// Having a meta object that is treated as immutable helps the code stay clean and easily tracable,
|
||||
// while also retaining the benefits that WebScraper had from its OOP design.
|
||||
function buildMetaObject(
|
||||
async function buildMetaObject(
|
||||
id: string,
|
||||
url: string,
|
||||
options: ScrapeOptions,
|
||||
internalOptions: InternalOptions,
|
||||
): Meta {
|
||||
): Promise<Meta> {
|
||||
const specParams =
|
||||
urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
|
||||
if (specParams !== undefined) {
|
||||
@ -126,7 +128,7 @@ function buildMetaObject(
|
||||
);
|
||||
}
|
||||
|
||||
const _logger = logger.child({
|
||||
const logger = _logger.child({
|
||||
module: "ScrapeURL",
|
||||
scrapeId: id,
|
||||
scrapeURL: url,
|
||||
@ -138,9 +140,10 @@ function buildMetaObject(
|
||||
url,
|
||||
options,
|
||||
internalOptions,
|
||||
logger: _logger,
|
||||
logger,
|
||||
logs,
|
||||
featureFlags: buildFeatureFlags(url, options, internalOptions),
|
||||
mock: options.useMock !== undefined ? await loadMock(options.useMock, _logger) : null,
|
||||
};
|
||||
}
|
||||
|
||||
@ -299,7 +302,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
||||
throw error;
|
||||
} else {
|
||||
Sentry.captureException(error);
|
||||
meta.logger.info(
|
||||
meta.logger.warn(
|
||||
"An unexpected error happened while scraping with " + engine + ".",
|
||||
{ error },
|
||||
);
|
||||
@ -362,7 +365,7 @@ export async function scrapeURL(
|
||||
options: ScrapeOptions,
|
||||
internalOptions: InternalOptions = {},
|
||||
): Promise<ScrapeUrlResponse> {
|
||||
const meta = buildMetaObject(id, url, options, internalOptions);
|
||||
const meta = await buildMetaObject(id, url, options, internalOptions);
|
||||
try {
|
||||
while (true) {
|
||||
try {
|
||||
|
@ -1,7 +1,7 @@
|
||||
import { Logger } from "winston";
|
||||
import { z, ZodError } from "zod";
|
||||
import { v4 as uuid } from "uuid";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { MockState, saveMock } from "./mock";
|
||||
|
||||
export type RobustFetchParams<Schema extends z.Schema<any>> = {
|
||||
url: string;
|
||||
@ -16,6 +16,7 @@ export type RobustFetchParams<Schema extends z.Schema<any>> = {
|
||||
requestId?: string;
|
||||
tryCount?: number;
|
||||
tryCooldown?: number;
|
||||
mock: MockState | null;
|
||||
};
|
||||
|
||||
export async function robustFetch<
|
||||
@ -30,9 +31,10 @@ export async function robustFetch<
|
||||
schema,
|
||||
ignoreResponse = false,
|
||||
ignoreFailure = false,
|
||||
requestId = uuid(),
|
||||
requestId = crypto.randomUUID(),
|
||||
tryCount = 1,
|
||||
tryCooldown,
|
||||
mock
|
||||
}: RobustFetchParams<Schema>): Promise<Output> {
|
||||
const params = {
|
||||
url,
|
||||
@ -47,73 +49,108 @@ export async function robustFetch<
|
||||
tryCooldown,
|
||||
};
|
||||
|
||||
let request: Response;
|
||||
try {
|
||||
request = await fetch(url, {
|
||||
method,
|
||||
headers: {
|
||||
...(body instanceof FormData
|
||||
? {}
|
||||
: body !== undefined
|
||||
? {
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
: {}),
|
||||
...(headers !== undefined ? headers : {}),
|
||||
},
|
||||
...(body instanceof FormData
|
||||
? {
|
||||
body,
|
||||
}
|
||||
: body !== undefined
|
||||
? {
|
||||
body: JSON.stringify(body),
|
||||
}
|
||||
: {}),
|
||||
});
|
||||
} catch (error) {
|
||||
if (!ignoreFailure) {
|
||||
Sentry.captureException(error);
|
||||
if (tryCount > 1) {
|
||||
logger.debug(
|
||||
"Request failed, trying " + (tryCount - 1) + " more times",
|
||||
{ params, error, requestId },
|
||||
);
|
||||
return await robustFetch({
|
||||
...params,
|
||||
requestId,
|
||||
tryCount: tryCount - 1,
|
||||
});
|
||||
} else {
|
||||
logger.debug("Request failed", { params, error, requestId });
|
||||
throw new Error("Request failed", {
|
||||
cause: {
|
||||
params,
|
||||
requestId,
|
||||
error,
|
||||
},
|
||||
});
|
||||
}
|
||||
} else {
|
||||
return null as Output;
|
||||
}
|
||||
}
|
||||
|
||||
if (ignoreResponse === true) {
|
||||
return null as Output;
|
||||
}
|
||||
|
||||
const response = {
|
||||
status: request.status,
|
||||
headers: request.headers,
|
||||
body: await request.text(), // NOTE: can this throw an exception?
|
||||
let response: {
|
||||
status: number;
|
||||
headers: Headers,
|
||||
body: string,
|
||||
};
|
||||
|
||||
if (request.status >= 300) {
|
||||
if (mock === null) {
|
||||
let request: Response;
|
||||
try {
|
||||
request = await fetch(url, {
|
||||
method,
|
||||
headers: {
|
||||
...(body instanceof FormData
|
||||
? {}
|
||||
: body !== undefined
|
||||
? {
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
: {}),
|
||||
...(headers !== undefined ? headers : {}),
|
||||
},
|
||||
...(body instanceof FormData
|
||||
? {
|
||||
body,
|
||||
}
|
||||
: body !== undefined
|
||||
? {
|
||||
body: JSON.stringify(body),
|
||||
}
|
||||
: {}),
|
||||
});
|
||||
} catch (error) {
|
||||
if (!ignoreFailure) {
|
||||
Sentry.captureException(error);
|
||||
if (tryCount > 1) {
|
||||
logger.debug(
|
||||
"Request failed, trying " + (tryCount - 1) + " more times",
|
||||
{ params, error, requestId },
|
||||
);
|
||||
return await robustFetch({
|
||||
...params,
|
||||
requestId,
|
||||
tryCount: tryCount - 1,
|
||||
mock,
|
||||
});
|
||||
} else {
|
||||
logger.debug("Request failed", { params, error, requestId });
|
||||
throw new Error("Request failed", {
|
||||
cause: {
|
||||
params,
|
||||
requestId,
|
||||
error,
|
||||
},
|
||||
});
|
||||
}
|
||||
} else {
|
||||
return null as Output;
|
||||
}
|
||||
}
|
||||
|
||||
if (ignoreResponse === true) {
|
||||
return null as Output;
|
||||
}
|
||||
|
||||
response = {
|
||||
status: request.status,
|
||||
headers: request.headers,
|
||||
body: await request.text(), // NOTE: can this throw an exception?
|
||||
};
|
||||
} else {
|
||||
if (ignoreResponse === true) {
|
||||
return null as Output;
|
||||
}
|
||||
|
||||
const makeRequestTypeId = (request: typeof mock["requests"][number]["options"]) => {
|
||||
let out = request.url + ";" + request.method;
|
||||
if (process.env.FIRE_ENGINE_BETA_URL && url.startsWith(process.env.FIRE_ENGINE_BETA_URL) && request.method === "POST") {
|
||||
out += "f-e;" + request.body?.engine + ";" + request.body?.url;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
const thisId = makeRequestTypeId(params);
|
||||
const matchingMocks = mock.requests.filter(x => makeRequestTypeId(x.options) === thisId).sort((a,b) => a.time - b.time);
|
||||
const nextI = mock.tracker[thisId] ?? 0;
|
||||
mock.tracker[thisId] = nextI + 1;
|
||||
|
||||
if (!matchingMocks[nextI]) {
|
||||
throw new Error("Failed to mock request -- no mock targets found.");
|
||||
}
|
||||
|
||||
response = {
|
||||
...(matchingMocks[nextI].result),
|
||||
headers: new Headers(matchingMocks[nextI].result.headers),
|
||||
};
|
||||
}
|
||||
|
||||
if (response.status >= 300) {
|
||||
if (tryCount > 1) {
|
||||
logger.debug(
|
||||
"Request sent failure status, trying " + (tryCount - 1) + " more times",
|
||||
{ params, request, response, requestId },
|
||||
{ params, response, requestId },
|
||||
);
|
||||
if (tryCooldown !== undefined) {
|
||||
await new Promise((resolve) =>
|
||||
@ -124,18 +161,17 @@ export async function robustFetch<
|
||||
...params,
|
||||
requestId,
|
||||
tryCount: tryCount - 1,
|
||||
mock,
|
||||
});
|
||||
} else {
|
||||
logger.debug("Request sent failure status", {
|
||||
params,
|
||||
request,
|
||||
response,
|
||||
requestId,
|
||||
});
|
||||
throw new Error("Request sent failure status", {
|
||||
cause: {
|
||||
params,
|
||||
request,
|
||||
response,
|
||||
requestId,
|
||||
},
|
||||
@ -143,20 +179,27 @@ export async function robustFetch<
|
||||
}
|
||||
}
|
||||
|
||||
if (mock === null) {
|
||||
await saveMock({
|
||||
...params,
|
||||
logger: undefined,
|
||||
schema: undefined,
|
||||
headers: undefined,
|
||||
}, response);
|
||||
}
|
||||
|
||||
let data: Output;
|
||||
try {
|
||||
data = JSON.parse(response.body);
|
||||
} catch (error) {
|
||||
logger.debug("Request sent malformed JSON", {
|
||||
params,
|
||||
request,
|
||||
response,
|
||||
requestId,
|
||||
});
|
||||
throw new Error("Request sent malformed JSON", {
|
||||
cause: {
|
||||
params,
|
||||
request,
|
||||
response,
|
||||
requestId,
|
||||
},
|
||||
@ -170,7 +213,6 @@ export async function robustFetch<
|
||||
if (error instanceof ZodError) {
|
||||
logger.debug("Response does not match provided schema", {
|
||||
params,
|
||||
request,
|
||||
response,
|
||||
requestId,
|
||||
error,
|
||||
@ -179,7 +221,6 @@ export async function robustFetch<
|
||||
throw new Error("Response does not match provided schema", {
|
||||
cause: {
|
||||
params,
|
||||
request,
|
||||
response,
|
||||
requestId,
|
||||
error,
|
||||
@ -189,7 +230,6 @@ export async function robustFetch<
|
||||
} else {
|
||||
logger.debug("Parsing response with provided schema failed", {
|
||||
params,
|
||||
request,
|
||||
response,
|
||||
requestId,
|
||||
error,
|
||||
@ -198,7 +238,6 @@ export async function robustFetch<
|
||||
throw new Error("Parsing response with provided schema failed", {
|
||||
cause: {
|
||||
params,
|
||||
request,
|
||||
response,
|
||||
requestId,
|
||||
error,
|
||||
|
60
apps/api/src/scraper/scrapeURL/lib/mock.ts
Normal file
60
apps/api/src/scraper/scrapeURL/lib/mock.ts
Normal file
@ -0,0 +1,60 @@
|
||||
import * as fs from "fs/promises";
|
||||
import * as path from "path";
|
||||
import { logger as _logger } from "../../../lib/logger";
|
||||
import { Logger } from "winston";
|
||||
// Directory that saveMock writes recorded mocks into: the sibling "mocks"
// directory of this module's parent, with the first "dist/" substring removed.
// NOTE(review): the replace only matches a forward-slash "dist/" and only its
// first occurrence anywhere in the path -- confirm this is sufficient for the
// environments mocks are recorded in.
const saveMocksDirPath = path
  .join(__dirname, "..", "mocks/")
  .replace("dist/", "");

// Directory that loadMock reads named mock files from.
const loadMocksDirPath = path.join(
  __dirname,
  "..",
  "..",
  "..",
  "__tests__",
  "snips",
  "mocks",
);
|
||||
|
||||
/**
 * Records one request/response pair as a JSON file in the save-mocks directory.
 *
 * Does nothing unless the FIRECRAWL_SAVE_MOCKS environment variable is set.
 * The directory is created if it does not exist yet. Each call writes a new
 * file named `<timestamp>-<uuid>.json` containing `time`, `options` and
 * `result`.
 *
 * @param options - Request parameters to store; serialized with JSON.stringify.
 * @param result - Response to store; serialized with JSON.stringify.
 */
export async function saveMock(options: unknown, result: unknown) {
  if (!process.env.FIRECRAWL_SAVE_MOCKS) return;

  await fs.mkdir(saveMocksDirPath, { recursive: true });

  // Take the timestamp once so the file name and the stored `time` agree.
  const time = Date.now();
  const fileName = time + "-" + crypto.randomUUID() + ".json";
  const filePath = path.join(saveMocksDirPath, fileName);

  // Report the destination through the module logger instead of stdout.
  _logger.debug("Saving mock", {
    filePath,
    module: "scrapeURL:mock",
    method: "saveMock",
  });

  await fs.writeFile(
    filePath,
    JSON.stringify({ time, options, result }, undefined, 4),
  );
}
|
||||
|
||||
/** Request parameters stored alongside a recorded mock result. */
type MockRequestOptions = {
  url: string;
  method: string;
  body?: any;
  ignoreResponse: boolean;
  ignoreFailure: boolean;
  tryCount: number;
  tryCooldown?: number;
};

/** One recorded request together with the result it produced. */
type MockRequestRecord = {
  // Numeric timestamp stored with the record.
  time: number;
  options: MockRequestOptions;
  result: any;
};

/**
 * In-memory state of a loaded mock: the recorded requests plus a tracker of
 * numeric counters keyed by string.
 */
export type MockState = {
  requests: MockRequestRecord[];
  tracker: Record<string, number>;
};
|
||||
|
||||
/**
 * Loads the mock file `<name>.json` from the load-mocks directory.
 *
 * @param name - Mock file name without the ".json" extension.
 * @param logger - Logger used to report why loading failed; defaults to the
 *   module logger.
 * @returns A MockState holding the recorded requests and an empty tracker, or
 *   null when the name resolves outside the mocks directory, the file cannot
 *   be read or parsed, or its content is not an array.
 */
export async function loadMock(
  name: string,
  logger: Logger = _logger,
): Promise<MockState | null> {
  const logMeta = { name, module: "scrapeURL:mock", method: "loadMock" };

  try {
    const mockPath = path.join(loadMocksDirPath, name + ".json");

    // Reject names that escape the mocks directory (e.g. via "../").
    const relative = path.relative(loadMocksDirPath, mockPath);
    if (!relative || relative.startsWith("..") || path.isAbsolute(relative)) {
      logger.warn(
        "Refusing to load mock file outside of the mocks directory!",
        logMeta,
      );
      return null;
    }

    const load: unknown = JSON.parse(await fs.readFile(mockPath, "utf8"));

    // MockState.requests must be an array; anything else would only fail
    // later, when the requests are filtered during a mocked fetch.
    if (!Array.isArray(load)) {
      logger.warn("Mock file does not contain an array of requests!", logMeta);
      return null;
    }

    return {
      requests: load,
      tracker: {},
    };
  } catch (error) {
    logger.warn("Failed to load mock file!", { ...logMeta, error });
    return null;
  }
}
|
Loading…
x
Reference in New Issue
Block a user