diff --git a/apps/api/package.json b/apps/api/package.json index 670dfc7a..059659c0 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -15,6 +15,7 @@ "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", "test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'", "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'", + "test:snips": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false src/__tests__/snips/*.test.ts", "workers": "nodemon --exec ts-node src/services/queue-worker.ts", "worker:production": "node dist/src/services/queue-worker.js", "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest", @@ -37,6 +38,7 @@ "@types/jest": "^29.5.12", "@types/node": "^20.14.1", "@types/pdf-parse": "^1.1.4", + "@types/supertest": "^6.0.2", "body-parser": "^1.20.1", "express": "^4.18.2", "jest": "^29.6.3", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 082a200f..dbf4f35a 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -267,6 +267,9 @@ importers: '@types/pdf-parse': specifier: ^1.1.4 version: 1.1.4 + '@types/supertest': + specifier: ^6.0.2 + version: 6.0.2 body-parser: specifier: ^1.20.1 version: 1.20.2 @@ -1516,6 +1519,9 @@ packages: '@types/connect@3.4.38': resolution: {integrity: sha512-K6uROf1LD88uDQqJCktA4yzL1YYAK6NgfsI0v/mTgyPKWsX1CnJ0XPSDhViejru1GcRkLWb8RlzFYJRqGUbaug==} + '@types/cookiejar@2.1.5': + resolution: {integrity: sha512-he+DHOWReW0nghN24E1WUqM0efK4kI9oTqDm6XmK8ZPe2djZ90BSNdGnIyCLzCPw7/pogPlGbzI2wHGGmi4O/Q==} + '@types/cors@2.8.17': resolution: {integrity: sha512-8CGDvrBj1zgo2qE+oS3pOCyYNqCPryMWY2bGfwA0dcfopWGgxs+78df0Rs3rc9THP4JkOhLsAa+15VdpAqkcUA==} @@ -1549,6 +1555,9 @@ packages: '@types/jest@29.5.12': resolution: {integrity: sha512-eDC8bTvT/QhYdxJAulQikueigY5AsdBRH2yDKW3yveW7svY3+DzN84/2NUgkw10RTiJbWqZrTtoGVdYlvFJdLw==} + '@types/methods@1.1.4': + resolution: {integrity: sha512-ymXWVrDiCxTBE3+RIrrP533E70eA+9qu7zdWoHuOmGujkYtzf4HQF96b8nwHLqhuf4ykX61IGRIB38CC6/sImQ==} + '@types/mime@1.3.5': resolution: {integrity: sha512-/pyBZWSLD2n0dcHE3hq8s8ZvcETHtEuF+3E7XVt0Ig2nvsVQXdghHVcEkIWjy9A0wKfTn97a/PSDYohKIlnP/w==} @@ -1597,6 +1606,12 @@ packages: '@types/stack-utils@2.0.3': resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==} + '@types/superagent@8.1.9': + resolution: {integrity: sha512-pTVjI73witn+9ILmoJdajHGW2jkSaOzhiFYF1Rd3EQ94kymLqB9PjD9ISg7WaALC7+dCHT0FGe9T2LktLq/3GQ==} + + '@types/supertest@6.0.2': + resolution: {integrity: sha512-137ypx2lk/wTQbW6An6safu9hXmajAifU/s7szAHLN/FeIm5w7yR0Wkl9fdJMRSHwOn4HLAI0DaB2TOORuhPDg==} + '@types/triple-beam@1.3.5': resolution: {integrity: sha512-6WaYesThRMCl19iryMYP7/x2OVgCtbIVflDGFpWnb9irXI3UjYE4AzmYuiUKY1AJstGijoY+MgUszMgRxIYTYw==} @@ -6566,6 +6581,8 @@ snapshots: dependencies: '@types/node': 20.14.1 + '@types/cookiejar@2.1.5': {} + '@types/cors@2.8.17': dependencies: '@types/node': 20.14.1 @@ -6613,6 +6630,8 @@ snapshots: expect: 29.7.0 pretty-format: 29.7.0 + '@types/methods@1.1.4': {} + '@types/mime@1.3.5': {} '@types/mysql@2.15.22': @@ -6667,6 +6686,18 @@ snapshots: '@types/stack-utils@2.0.3': {} + '@types/superagent@8.1.9': + dependencies: + '@types/cookiejar': 2.1.5 + '@types/methods': 1.1.4 + '@types/node': 20.14.1 + form-data: 4.0.0 + + '@types/supertest@6.0.2': + dependencies: + '@types/methods': 1.1.4 + '@types/superagent': 8.1.9 + '@types/triple-beam@1.3.5': {} '@types/uuid@9.0.8': {} diff --git a/apps/api/src/__tests__/snips/mocks/mocking-works-properly.json b/apps/api/src/__tests__/snips/mocks/mocking-works-properly.json new file mode 100644 index 00000000..5609e6c2 --- /dev/null +++ b/apps/api/src/__tests__/snips/mocks/mocking-works-properly.json @@ -0,0 +1,107 @@ +[ + { + "time": 1735911273239, + "options": { + "url": "http://default-fire-engine-api-service:8080/scrape", + "method": "POST", + "body": { + "url": "http://firecrawl.dev", + "engine": "chrome-cdp", + "instantReturn": true, + "skipTlsVerification": false, + "priority": 10, + "mobile": false, + "timeout": 15000 + }, + "headers": {}, + "ignoreResponse": false, + "ignoreFailure": false, + "tryCount": 3 + }, + "result": { + "status": 200, + "headers": {}, + "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp\",\"processing\":true}" + } + }, + { + "time": 1735911273354, + "options": { + "url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", + "method": "GET", + "headers": {}, + "ignoreResponse": false, + "ignoreFailure": false, + "tryCount": 1 + }, + "result": { + "status": 200, + "headers": {}, + "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"prioritized\",\"processing\":true}" + } + }, + { + "time": 1735911273720, + "options": { + "url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", + "method": "GET", + "headers": {}, + "ignoreResponse": false, + "ignoreFailure": false, + "tryCount": 1 + }, + "result": { + "status": 200, + "headers": {}, + "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}" + } + }, + { + "time": 1735911274092, + "options": { + "url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", + "method": "GET", + "headers": {}, + "ignoreResponse": false, + "ignoreFailure": false, + "tryCount": 1 + }, + "result": { + "status": 200, + "headers": {}, + "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}" + } + }, + { + "time": 1735911274467, + "options": { + "url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", + "method": "GET", + "headers": {}, + "ignoreResponse": false, + "ignoreFailure": false, + "tryCount": 1 + }, + "result": { + "status": 200, + "headers": {}, + "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"active\",\"processing\":true}" + } + }, + { + "time": 1735911274947, + "options": { + "url": "http://default-fire-engine-api-service:8080/scrape/ede37286-90db-4f60-8efb-76217dfcfa35!chrome-cdp", + "method": "GET", + "headers": {}, + "ignoreResponse": false, + "ignoreFailure": false, + "tryCount": 1 + }, + "result": { + "status": 200, + "headers": {}, + "body": "{\"jobId\":\"ede37286-90db-4f60-8efb-76217dfcfa35\",\"state\":\"completed\",\"processing\":false,\"timeTaken\":1.204,\"content\":\"

this is fake data coming from the mocking system!

\",\"url\":\"https://www.firecrawl.dev/\",\"screenshots\":[],\"actionContent\":[],\"pageStatusCode\":200,\"responseHeaders\":{\"X-DNS-Prefetch-Control\":\"off\",\"age\":\"0\",\"cache-control\":\"private, no-cache, no-store, max-age=0, must-revalidate\",\"content-encoding\":\"br\",\"content-type\":\"text/html; charset=utf-8\",\"date\":\"Fri, 03 Jan 2025 13:34:34 GMT\",\"link\":\"; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\", ; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\", ; rel=preload; as=\\\"font\\\"; crossorigin=\\\"\\\"; type=\\\"font/woff2\\\"\",\"permissions-policy\":\"keyboard-map=(), attribution-reporting=(), run-ad-auction=(), private-state-token-redemption=(), private-state-token-issuance=(), join-ad-interest-group=(), idle-detection=(), compute-pressure=(), browsing-topics=()\",\"server\":\"Vercel\",\"strict-transport-security\":\"max-age=63072000\",\"vary\":\"RSC, Next-Router-State-Tree, Next-Router-Prefetch\",\"x-matched-path\":\"/\",\"x-powered-by\":\"Next.js\",\"x-vercel-cache\":\"MISS\",\"x-vercel-id\":\"iad1::iad1::bs88l-1735911273932-1f7bba7a8b45\"},\"invalidTlsCert\":false,\"file\":null}" + } + } +] \ No newline at end of file diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts new file mode 100644 index 00000000..c337f4f8 --- /dev/null +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -0,0 +1,36 @@ +import request from "supertest"; +import { configDotenv } from "dotenv"; +import { ScrapeRequestInput } from "../../controllers/v1/types"; + +configDotenv(); +const TEST_URL = "http://127.0.0.1:3002"; + +async function scrape(body: ScrapeRequestInput) { + return await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(body); +} + +function expectScrapeToSucceed(response: Awaited>) { + expect(response.statusCode).toBe(200); + expect(response.body.success).toBe(true); + expect(typeof response.body.data).toBe("object"); +} + +describe("Scrape tests", () => { + it("mocking works properly", async () => { + // depends on falsified mock mocking-works-properly + // this test will fail if mock is bypassed with real data -- firecrawl.dev will never have + // that as its actual markdown output + + const response = await scrape({ + url: "http://firecrawl.dev", + useMock: "mocking-works-properly", + }); + + expectScrapeToSucceed(response); + expect(response.body.data.markdown).toBe("this is fake data coming from the mocking system!"); + }); +}); \ No newline at end of file diff --git a/apps/api/src/__tests__/snips/utils/collect-mocks.js b/apps/api/src/__tests__/snips/utils/collect-mocks.js new file mode 100644 index 00000000..3b879136 --- /dev/null +++ b/apps/api/src/__tests__/snips/utils/collect-mocks.js @@ -0,0 +1,12 @@ +const path = require("path"); +const fs = require("fs"); + +const mocksDirPath = path.join(__dirname, "../../../scraper/scrapeURL/mocks"); +const files = fs.readdirSync(mocksDirPath); + +const contents = files.map(x => JSON.parse(fs.readFileSync(path.join(mocksDirPath, x), "utf8"))); + +fs.writeFileSync( + path.join(__dirname, "../mocks/" + process.argv[2] + ".json"), + JSON.stringify(contents, undefined, 4), +); \ No newline at end of file diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 7aebd560..1160d871 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -181,6 +181,7 @@ export const scrapeOptions = z skipTlsVerification: z.boolean().default(false), removeBase64Images: z.boolean().default(true), fastMode: z.boolean().default(false), + useMock: z.string().optional(), }) .strict(strictMessage); diff --git a/apps/api/src/scraper/scrapeURL/.gitignore b/apps/api/src/scraper/scrapeURL/.gitignore new file mode 100644 index 00000000..3307228f --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/.gitignore @@ -0,0 +1 @@ +/mocks \ No newline at end of file diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index 8b7b86fb..b3af6103 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -3,12 +3,8 @@ import * as Sentry from "@sentry/node"; import { z } from "zod"; import { robustFetch } from "../../lib/fetch"; -import { - ActionError, - EngineError, - SiteError, - UnsupportedFileError, -} from "../../error"; +import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error"; +import { MockState } from "../../lib/mock"; const successSchema = z.object({ jobId: z.string(), @@ -82,6 +78,7 @@ export class StillProcessingError extends Error { export async function fireEngineCheckStatus( logger: Logger, jobId: string, + mock: MockState | null, ): Promise { const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; @@ -105,6 +102,7 @@ export async function fireEngineCheckStatus( } : {}), }, + mock, }); }, ); diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts index d5fe58cb..1d4464d9 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts @@ -2,8 +2,9 @@ import { Logger } from "winston"; import * as Sentry from "@sentry/node"; import { robustFetch } from "../../lib/fetch"; +import { MockState } from "../../lib/mock"; -export async function fireEngineDelete(logger: Logger, jobId: string) { +export async function fireEngineDelete(logger: Logger, jobId: string, mock: MockState | null) { const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; await Sentry.startSpan( @@ -28,6 +29,7 @@ export async function fireEngineDelete(logger: Logger, jobId: string) { ignoreResponse: true, ignoreFailure: true, logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }), + mock, }); }, ); diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 43da361c..62a50f60 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -24,6 +24,7 @@ import * as Sentry from "@sentry/node"; import { Action } from "../../../../lib/entities"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; import { fireEngineDelete } from "./delete"; +import { MockState, saveMock } from "../../lib/mock"; // This function does not take `Meta` on purpose. It may not access any // meta values to construct the request -- that must be done by the @@ -37,10 +38,12 @@ async function performFireEngineScrape< logger: Logger, request: FireEngineScrapeRequestCommon & Engine, timeout: number, + mock: MockState | null, ): Promise { const scrape = await fireEngineScrape( logger.child({ method: "fireEngineScrape" }), request, + mock, ); const startTime = Date.now(); @@ -57,6 +60,7 @@ async function performFireEngineScrape< afterErrors: errors, }), scrape.jobId, + mock, ); throw new Error("Error limit hit. See e.cause.errors for errors.", { cause: { errors }, @@ -78,6 +82,7 @@ async function performFireEngineScrape< status = await fireEngineCheckStatus( logger.child({ method: "fireEngineCheckStatus" }), scrape.jobId, + mock, ); } catch (error) { if (error instanceof StillProcessingError) { @@ -94,6 +99,7 @@ async function performFireEngineScrape< afterError: error, }), scrape.jobId, + mock, ); logger.debug("Fire-engine scrape job failed.", { error, @@ -131,6 +137,7 @@ async function performFireEngineScrape< method: "performFireEngineScrape/fireEngineDelete", }), scrape.jobId, + mock, ); return status; @@ -200,6 +207,7 @@ export async function scrapeURLWithFireEngineChromeCDP( }), request, timeout, + meta.mock, ); if ( @@ -274,6 +282,7 @@ export async function scrapeURLWithFireEnginePlaywright( }), request, timeout, + meta.mock, ); if (!response.url) { @@ -327,6 +336,7 @@ export async function scrapeURLWithFireEngineTLSClient( }), request, timeout, + meta.mock, ); if (!response.url) { diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts index de6ac3f4..4248024a 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts @@ -4,6 +4,7 @@ import { z } from "zod"; import { Action } from "../../../../lib/entities"; import { robustFetch } from "../../lib/fetch"; +import { MockState } from "../../lib/mock"; export type FireEngineScrapeRequestCommon = { url: string; @@ -69,6 +70,7 @@ export async function fireEngineScrape< >( logger: Logger, request: FireEngineScrapeRequestCommon & Engine, + mock: MockState | null, ): Promise> { const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; @@ -97,6 +99,7 @@ export async function fireEngineScrape< logger: logger.child({ method: "fireEngineScrape/robustFetch" }), schema, tryCount: 3, + mock, }); }, ); diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 101c9a53..a3678615 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -46,6 +46,7 @@ async function scrapePDFWithRunPodMU( markdown: z.string(), }), }), + mock: meta.mock, }); return { diff --git a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts index edcd50c0..123a1c68 100644 --- a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts @@ -29,6 +29,7 @@ export async function scrapeURLWithPlaywright( pageStatusCode: z.number(), pageError: z.string().optional(), }), + mock: meta.mock, }), (async () => { await new Promise((resolve) => setTimeout(() => resolve(null), timeout)); diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 26577675..3df5020d 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -2,7 +2,7 @@ import { Logger } from "winston"; import * as Sentry from "@sentry/node"; import { Document, ScrapeOptions } from "../../controllers/v1/types"; -import { logger } from "../../lib/logger"; +import { logger as _logger } from "../../lib/logger"; import { buildFallbackList, Engine, @@ -24,6 +24,7 @@ import { import { executeTransformers } from "./transformers"; import { LLMRefusalError } from "./transformers/llmExtract"; import { urlSpecificParams } from "./lib/urlSpecificParams"; +import { loadMock, MockState } from "./lib/mock"; export type ScrapeUrlResponse = ( | { @@ -47,6 +48,7 @@ export type Meta = { logger: Logger; logs: any[]; featureFlags: Set; + mock: MockState | null; }; function buildFeatureFlags( @@ -110,12 +112,12 @@ function buildFeatureFlags( // The meta object is usually immutable, except for the logs array, and in edge cases (e.g. a new feature is suddenly required) // Having a meta object that is treated as immutable helps the code stay clean and easily tracable, // while also retaining the benefits that WebScraper had from its OOP design. -function buildMetaObject( +async function buildMetaObject( id: string, url: string, options: ScrapeOptions, internalOptions: InternalOptions, -): Meta { +): Promise { const specParams = urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")]; if (specParams !== undefined) { @@ -126,7 +128,7 @@ function buildMetaObject( ); } - const _logger = logger.child({ + const logger = _logger.child({ module: "ScrapeURL", scrapeId: id, scrapeURL: url, @@ -138,9 +140,10 @@ function buildMetaObject( url, options, internalOptions, - logger: _logger, + logger, logs, featureFlags: buildFeatureFlags(url, options, internalOptions), + mock: options.useMock !== undefined ? await loadMock(options.useMock, _logger) : null, }; } @@ -299,7 +302,7 @@ async function scrapeURLLoop(meta: Meta): Promise { throw error; } else { Sentry.captureException(error); - meta.logger.info( + meta.logger.warn( "An unexpected error happened while scraping with " + engine + ".", { error }, ); @@ -362,7 +365,7 @@ export async function scrapeURL( options: ScrapeOptions, internalOptions: InternalOptions = {}, ): Promise { - const meta = buildMetaObject(id, url, options, internalOptions); + const meta = await buildMetaObject(id, url, options, internalOptions); try { while (true) { try { diff --git a/apps/api/src/scraper/scrapeURL/lib/fetch.ts b/apps/api/src/scraper/scrapeURL/lib/fetch.ts index 897587a9..56b91687 100644 --- a/apps/api/src/scraper/scrapeURL/lib/fetch.ts +++ b/apps/api/src/scraper/scrapeURL/lib/fetch.ts @@ -1,7 +1,7 @@ import { Logger } from "winston"; import { z, ZodError } from "zod"; -import { v4 as uuid } from "uuid"; import * as Sentry from "@sentry/node"; +import { MockState, saveMock } from "./mock"; export type RobustFetchParams> = { url: string; @@ -16,6 +16,7 @@ export type RobustFetchParams> = { requestId?: string; tryCount?: number; tryCooldown?: number; + mock: MockState | null; }; export async function robustFetch< @@ -30,9 +31,10 @@ export async function robustFetch< schema, ignoreResponse = false, ignoreFailure = false, - requestId = uuid(), + requestId = crypto.randomUUID(), tryCount = 1, tryCooldown, + mock }: RobustFetchParams): Promise { const params = { url, @@ -47,73 +49,108 @@ export async function robustFetch< tryCooldown, }; - let request: Response; - try { - request = await fetch(url, { - method, - headers: { - ...(body instanceof FormData - ? {} - : body !== undefined - ? { - "Content-Type": "application/json", - } - : {}), - ...(headers !== undefined ? headers : {}), - }, - ...(body instanceof FormData - ? { - body, - } - : body !== undefined - ? { - body: JSON.stringify(body), - } - : {}), - }); - } catch (error) { - if (!ignoreFailure) { - Sentry.captureException(error); - if (tryCount > 1) { - logger.debug( - "Request failed, trying " + (tryCount - 1) + " more times", - { params, error, requestId }, - ); - return await robustFetch({ - ...params, - requestId, - tryCount: tryCount - 1, - }); - } else { - logger.debug("Request failed", { params, error, requestId }); - throw new Error("Request failed", { - cause: { - params, - requestId, - error, - }, - }); - } - } else { - return null as Output; - } - } - - if (ignoreResponse === true) { - return null as Output; - } - - const response = { - status: request.status, - headers: request.headers, - body: await request.text(), // NOTE: can this throw an exception? + let response: { + status: number; + headers: Headers, + body: string, }; - if (request.status >= 300) { + if (mock === null) { + let request: Response; + try { + request = await fetch(url, { + method, + headers: { + ...(body instanceof FormData + ? {} + : body !== undefined + ? { + "Content-Type": "application/json", + } + : {}), + ...(headers !== undefined ? headers : {}), + }, + ...(body instanceof FormData + ? { + body, + } + : body !== undefined + ? { + body: JSON.stringify(body), + } + : {}), + }); + } catch (error) { + if (!ignoreFailure) { + Sentry.captureException(error); + if (tryCount > 1) { + logger.debug( + "Request failed, trying " + (tryCount - 1) + " more times", + { params, error, requestId }, + ); + return await robustFetch({ + ...params, + requestId, + tryCount: tryCount - 1, + mock, + }); + } else { + logger.debug("Request failed", { params, error, requestId }); + throw new Error("Request failed", { + cause: { + params, + requestId, + error, + }, + }); + } + } else { + return null as Output; + } + } + + if (ignoreResponse === true) { + return null as Output; + } + + response = { + status: request.status, + headers: request.headers, + body: await request.text(), // NOTE: can this throw an exception? + }; + } else { + if (ignoreResponse === true) { + return null as Output; + } + + const makeRequestTypeId = (request: typeof mock["requests"][number]["options"]) => { + let out = request.url + ";" + request.method; + if (process.env.FIRE_ENGINE_BETA_URL && url.startsWith(process.env.FIRE_ENGINE_BETA_URL) && request.method === "POST") { + out += "f-e;" + request.body?.engine + ";" + request.body?.url; + } + return out; + } + + const thisId = makeRequestTypeId(params); + const matchingMocks = mock.requests.filter(x => makeRequestTypeId(x.options) === thisId).sort((a,b) => a.time - b.time); + const nextI = mock.tracker[thisId] ?? 0; + mock.tracker[thisId] = nextI + 1; + + if (!matchingMocks[nextI]) { + throw new Error("Failed to mock request -- no mock targets found."); + } + + response = { + ...(matchingMocks[nextI].result), + headers: new Headers(matchingMocks[nextI].result.headers), + }; + } + + if (response.status >= 300) { if (tryCount > 1) { logger.debug( "Request sent failure status, trying " + (tryCount - 1) + " more times", - { params, request, response, requestId }, + { params, response, requestId }, ); if (tryCooldown !== undefined) { await new Promise((resolve) => @@ -124,18 +161,17 @@ export async function robustFetch< ...params, requestId, tryCount: tryCount - 1, + mock, }); } else { logger.debug("Request sent failure status", { params, - request, response, requestId, }); throw new Error("Request sent failure status", { cause: { params, - request, response, requestId, }, @@ -143,20 +179,27 @@ export async function robustFetch< } } + if (mock === null) { + await saveMock({ + ...params, + logger: undefined, + schema: undefined, + headers: undefined, + }, response); + } + let data: Output; try { data = JSON.parse(response.body); } catch (error) { logger.debug("Request sent malformed JSON", { params, - request, response, requestId, }); throw new Error("Request sent malformed JSON", { cause: { params, - request, response, requestId, }, @@ -170,7 +213,6 @@ export async function robustFetch< if (error instanceof ZodError) { logger.debug("Response does not match provided schema", { params, - request, response, requestId, error, @@ -179,7 +221,6 @@ export async function robustFetch< throw new Error("Response does not match provided schema", { cause: { params, - request, response, requestId, error, @@ -189,7 +230,6 @@ export async function robustFetch< } else { logger.debug("Parsing response with provided schema failed", { params, - request, response, requestId, error, @@ -198,7 +238,6 @@ export async function robustFetch< throw new Error("Parsing response with provided schema failed", { cause: { params, - request, response, requestId, error, diff --git a/apps/api/src/scraper/scrapeURL/lib/mock.ts b/apps/api/src/scraper/scrapeURL/lib/mock.ts new file mode 100644 index 00000000..666f4d9f --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/lib/mock.ts @@ -0,0 +1,60 @@ +import * as fs from "fs/promises"; +import * as path from "path"; +import { logger as _logger } from "../../../lib/logger"; +import { Logger } from "winston"; +const saveMocksDirPath = path.join(__dirname, "../mocks/").replace("dist/", ""); +const loadMocksDirPath = path.join(__dirname, "../../../__tests__/snips/mocks"); + +export async function saveMock(options: unknown, result: unknown) { + if (!process.env.FIRECRAWL_SAVE_MOCKS) return; + + await fs.mkdir(saveMocksDirPath, { recursive: true }); + + const fileName = Date.now() + "-" + crypto.randomUUID() + ".json"; + const filePath = path.join(saveMocksDirPath, fileName); + console.log(filePath); + + await fs.writeFile(filePath, JSON.stringify({ + time: Date.now(), + options, + result, + }, undefined, 4)); +} + +export type MockState = { + requests: { + time: number, + options: { + url: string, + method: string, + body?: any, + ignoreResponse: boolean, + ignoreFailure: boolean, + tryCount: number, + tryCooldown?: number, + }, + result: any, + }[], + tracker: Record, +} + +export async function loadMock(name: string, logger: Logger = _logger): Promise { + try { + const mockPath = path.join(loadMocksDirPath, name + ".json"); + + const relative = path.relative(loadMocksDirPath, mockPath); + if (!relative || relative.startsWith("..") || path.isAbsolute(relative)) { + // directory moving + return null; + } + + const load = JSON.parse(await fs.readFile(mockPath, "utf8")); + return { + requests: load, + tracker: {}, + }; + } catch (error) { + logger.warn("Failed to load mock file!", { name, module: "scrapeURL:mock", method: "loadMock", error }); + return null; + } +}