From fc64f436ed5814c39a3bbf17e955d438db9fff54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Wed, 19 Feb 2025 16:07:55 +0100
Subject: [PATCH] fix(v1/types): fix extract -> json rename, ROUND II
 (FIR-1072) (#1199)

* Revert "Revert "fix(v1/types): fix extract -> json rename (FIR-1072) (#1195)""

This reverts commit e28a44463ae49ffc195507204492cc7c15c438c4.

* fix(v1/types): fix bad transform

* feat(v1): proxy option / stealthProxy flag (FIR-1050) (#1196)

* feat(v1): proxy option / stealthProxy flag

* feat(js-sdk): add proxy option

* fix

* fix extract tests
---
 .github/workflows/ci.yml                      |   1 +
 .../src/__tests__/snips/batch-scrape.test.ts  |  92 +++++++++++
 apps/api/src/__tests__/snips/crawl.test.ts    |  62 ++++++++
 apps/api/src/__tests__/snips/extract.test.ts  |  81 ++++++++++
 apps/api/src/__tests__/snips/scrape.test.ts   |  72 ++++++---
 apps/api/src/__tests__/snips/search.test.ts   |  36 +++++
 apps/api/src/controllers/v1/types.ts          | 150 ++++++++----------
 7 files changed, 394 insertions(+), 100 deletions(-)
 create mode 100644 apps/api/src/__tests__/snips/batch-scrape.test.ts
 create mode 100644 apps/api/src/__tests__/snips/crawl.test.ts
 create mode 100644 apps/api/src/__tests__/snips/extract.test.ts
 create mode 100644 apps/api/src/__tests__/snips/search.test.ts
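A note on what the rename involves before the diffs: "json" is the new public name for the old "extract" format, so v1 request parsing has to accept both spellings and fold one into the other in a single place, the extractRefine/extractTransform helpers added to types.ts below. The following is an illustrative sketch rather than code from this patch; miniScrapeSchema is a trimmed-down stand-in for the real scrapeRequestSchema, and its transform mirrors what extractTransform does.

    import { z } from "zod";

    // Illustrative stand-in for scrapeRequestSchema: only the fields the
    // normalization touches.
    const miniScrapeSchema = z
      .object({
        formats: z
          .array(z.enum(["markdown", "json", "extract"]))
          .default(["markdown"]),
        jsonOptions: z
          .object({
            prompt: z.string().optional(),
            systemPrompt: z.string().optional(),
            schema: z.any().optional(),
          })
          .optional(),
        extract: z.any().optional(),
      })
      .transform((obj) => {
        // "json" is an alias of "extract": requesting it turns the legacy format on
        if (obj.formats.includes("json")) {
          obj.formats.push("extract");
        }
        // carry the json options over to the legacy extract options
        if (obj.jsonOptions && !obj.extract) {
          obj = { ...obj, extract: { ...obj.jsonOptions, mode: "llm" } };
        }
        return obj;
      });

    const parsed = miniScrapeSchema.parse({
      formats: ["json"],
      jsonOptions: { prompt: "Find the company mission." },
    });
    // parsed.formats -> ["json", "extract"]
    // parsed.extract -> { prompt: "Find the company mission.", mode: "llm" }

Centralizing the rule in one refine/transform pair is what lets scrape, batch scrape, crawl, search, and extract share it below, instead of each schema carrying its own inline copy; that duplication is exactly what the types.ts diff deletes.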
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b9b24bf4..e9519ef1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -27,6 +27,7 @@ env:
   HDX_NODE_BETA_MODE: 1
   FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
   USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
+  SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }}
   ENV: ${{ secrets.ENV }}
 
 jobs:
diff --git a/apps/api/src/__tests__/snips/batch-scrape.test.ts b/apps/api/src/__tests__/snips/batch-scrape.test.ts
new file mode 100644
index 00000000..59c9da2e
--- /dev/null
+++ b/apps/api/src/__tests__/snips/batch-scrape.test.ts
@@ -0,0 +1,92 @@
+import request from "supertest";
+import { configDotenv } from "dotenv";
+import { BatchScrapeRequestInput } from "../../controllers/v1/types";
+
+configDotenv();
+const TEST_URL = "http://127.0.0.1:3002";
+
+async function batchScrapeStart(body: BatchScrapeRequestInput) {
+  return await request(TEST_URL)
+    .post("/v1/batch/scrape")
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .set("Content-Type", "application/json")
+    .send(body);
+}
+
+async function batchScrapeStatus(id: string) {
+  return await request(TEST_URL)
+    .get("/v1/batch/scrape/" + encodeURIComponent(id))
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .send();
+}
+
+async function batchScrape(body: BatchScrapeRequestInput): ReturnType<typeof batchScrapeStatus> {
+  const bss = await batchScrapeStart(body);
+  expectBatchScrapeStartToSucceed(bss);
+
+  let x;
+
+  do {
+    x = await batchScrapeStatus(bss.body.id);
+    expect(x.statusCode).toBe(200);
+    expect(typeof x.body.status).toBe("string");
+  } while (x.body.status !== "completed");
+
+  expectBatchScrapeToSucceed(x);
+  return x;
+}
+
+function expectBatchScrapeStartToSucceed(response: Awaited<ReturnType<typeof batchScrapeStart>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.id).toBe("string");
+}
+
+function expectBatchScrapeToSucceed(response: Awaited<ReturnType<typeof batchScrapeStatus>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.status).toBe("string");
+  expect(response.body.status).toBe("completed");
+  expect(response.body).toHaveProperty("data");
+  expect(Array.isArray(response.body.data)).toBe(true);
+  expect(response.body.data.length).toBeGreaterThan(0);
+}
+
+describe("Batch scrape tests", () => {
+  describe("JSON format", () => {
+    it.concurrent("works", async () => {
+      const response = await batchScrape({
+        urls: ["http://firecrawl.dev"],
+        formats: ["json"],
+        jsonOptions: {
+          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
+          schema: {
+            type: "object",
+            properties: {
+              company_mission: {
+                type: "string",
+              },
+              supports_sso: {
+                type: "boolean",
+              },
+              is_open_source: {
+                type: "boolean",
+              },
+            },
+            required: ["company_mission", "supports_sso", "is_open_source"],
+          },
+        },
+      });
+
+      expect(response.body.data[0]).toHaveProperty("json");
+      expect(response.body.data[0].json).toHaveProperty("company_mission");
+      expect(typeof response.body.data[0].json.company_mission).toBe("string");
+      expect(response.body.data[0].json).toHaveProperty("supports_sso");
+      expect(response.body.data[0].json.supports_sso).toBe(false);
+      expect(typeof response.body.data[0].json.supports_sso).toBe("boolean");
+      expect(response.body.data[0].json).toHaveProperty("is_open_source");
+      expect(response.body.data[0].json.is_open_source).toBe(true);
+      expect(typeof response.body.data[0].json.is_open_source).toBe("boolean");
+    }, 30000);
+  });
+});
diff --git a/apps/api/src/__tests__/snips/crawl.test.ts b/apps/api/src/__tests__/snips/crawl.test.ts
new file mode 100644
index 00000000..6aa47732
--- /dev/null
+++ b/apps/api/src/__tests__/snips/crawl.test.ts
@@ -0,0 +1,62 @@
+import request from "supertest";
+import { configDotenv } from "dotenv";
+import { CrawlRequestInput } from "../../controllers/v1/types";
+
+configDotenv();
+const TEST_URL = "http://127.0.0.1:3002";
+
+async function crawlStart(body: CrawlRequestInput) {
+  return await request(TEST_URL)
+    .post("/v1/crawl")
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .set("Content-Type", "application/json")
+    .send(body);
+}
+
+async function crawlStatus(id: string) {
+  return await request(TEST_URL)
+    .get("/v1/crawl/" + encodeURIComponent(id))
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .send();
+}
+
+async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
+  const cs = await crawlStart(body);
+  expectCrawlStartToSucceed(cs);
+
+  let x;
+
+  do {
+    x = await crawlStatus(cs.body.id);
+    expect(x.statusCode).toBe(200);
+    expect(typeof x.body.status).toBe("string");
+  } while (x.body.status !== "completed");
+
+  expectCrawlToSucceed(x);
+  return x;
+}
+
+function expectCrawlStartToSucceed(response: Awaited<ReturnType<typeof crawlStart>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.id).toBe("string");
+}
+
+function expectCrawlToSucceed(response: Awaited<ReturnType<typeof crawlStatus>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.status).toBe("string");
+  expect(response.body.status).toBe("completed");
+  expect(response.body).toHaveProperty("data");
+  expect(Array.isArray(response.body.data)).toBe(true);
+  expect(response.body.data.length).toBeGreaterThan(0);
+}
+
+describe("Crawl tests", () => {
+  it.concurrent("works", async () => {
+    await crawl({
+      url: "https://firecrawl.dev",
+      limit: 10,
+    });
+  }, 120000);
+});
diff --git a/apps/api/src/__tests__/snips/extract.test.ts b/apps/api/src/__tests__/snips/extract.test.ts
new file mode 100644
index 00000000..7375d85a
--- /dev/null
+++ b/apps/api/src/__tests__/snips/extract.test.ts
@@ -0,0 +1,81 @@
+import request from "supertest";
+import { configDotenv } from "dotenv";
+import { ExtractRequestInput, ExtractResponse } from "../../controllers/v1/types";
+
+configDotenv();
+const TEST_URL = "http://127.0.0.1:3002";
+
+async function extractStart(body: ExtractRequestInput) {
+  return await request(TEST_URL)
+    .post("/v1/extract")
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .set("Content-Type", "application/json")
+    .send(body);
+}
+
+async function extractStatus(id: string) {
+  return await request(TEST_URL)
+    .get("/v1/extract/" + encodeURIComponent(id))
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .send();
+}
+
+async function extract(body: ExtractRequestInput): Promise<ExtractResponse> {
+  const es = await extractStart(body);
+  expectExtractStartToSucceed(es);
+
+  let x;
+
+  do {
+    x = await extractStatus(es.body.id);
+    expect(x.statusCode).toBe(200);
+    expect(typeof x.body.status).toBe("string");
+  } while (x.body.status !== "completed");
+
+  expectExtractToSucceed(x);
+  return x.body;
+}
+
+function expectExtractStartToSucceed(response: Awaited<ReturnType<typeof extractStart>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.id).toBe("string");
+}
+
+function expectExtractToSucceed(response: Awaited<ReturnType<typeof extractStatus>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.status).toBe("string");
+  expect(response.body.status).toBe("completed");
+  expect(response.body).toHaveProperty("data");
+}
+
+describe("Extract tests", () => {
+  it.concurrent("works", async () => {
+    const res = await extract({
+      urls: ["https://firecrawl.dev"],
+      schema: {
+        "type": "object",
+        "properties": {
+          "company_mission": {
+            "type": "string"
+          },
+          "is_open_source": {
+            "type": "boolean"
+          }
+        },
+        "required": [
+          "company_mission",
+          "is_open_source"
+        ]
+      },
+      origin: "api-sdk",
+    });
+
+    expect(res.data).toHaveProperty("company_mission");
+    expect(typeof res.data.company_mission).toBe("string");
+    expect(res.data).toHaveProperty("is_open_source");
+    expect(typeof res.data.is_open_source).toBe("boolean");
+    expect(res.data.is_open_source).toBe(true);
+  }, 60000);
+});
diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts
index 754ef8e0..2ab5df9e 100644
--- a/apps/api/src/__tests__/snips/scrape.test.ts
+++ b/apps/api/src/__tests__/snips/scrape.test.ts
@@ -1,11 +1,11 @@
 import request from "supertest";
 import { configDotenv } from "dotenv";
-import { ScrapeRequestInput } from "../../controllers/v1/types";
+import { Document, ScrapeRequestInput } from "../../controllers/v1/types";
 
 configDotenv();
 const TEST_URL = "http://127.0.0.1:3002";
 
-async function scrape(body: ScrapeRequestInput) {
+async function scrapeRaw(body: ScrapeRequestInput) {
   return await request(TEST_URL)
     .post("/v1/scrape")
     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@@ -13,12 +13,18 @@ async function scrape(body: ScrapeRequestInput) {
     .send(body);
 }
 
-function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
+function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>) {
   expect(response.statusCode).toBe(200);
   expect(response.body.success).toBe(true);
   expect(typeof response.body.data).toBe("object");
 }
 
+async function scrape(body: ScrapeRequestInput): Promise<Document> {
+  const raw = await scrapeRaw(body);
+  expectScrapeToSucceed(raw);
+  return raw.body.data;
+}
+
 describe("Scrape tests", () => {
   it("mocking works properly", async () => {
     // depends on falsified mock mocking-works-properly
@@ -30,8 +36,7 @@ describe("Scrape tests", () => {
       useMock: "mocking-works-properly",
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toBe(
+    expect(response.markdown).toBe(
       "this is fake data coming from the mocking system!",
     );
   }, 10000);
@@ -42,8 +47,7 @@ describe("Scrape tests", () => {
       url: "https://canyoublockit.com/testing/",
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).not.toContain(".g.doubleclick.net/");
+    expect(response.markdown).not.toContain(".g.doubleclick.net/");
   }, 10000);
 
   it.concurrent("doesn't block ads if explicitly disabled", async () => {
@@ -52,8 +56,7 @@ describe("Scrape tests", () => {
       blockAds: false,
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toContain(".g.doubleclick.net/");
+    expect(response.markdown).toContain(".g.doubleclick.net/");
   }, 10000);
 });
 
@@ -62,8 +65,6 @@ describe("Scrape tests", () => {
     const response = await scrape({
       url: "https://iplocation.com",
     });
-
-    expectScrapeToSucceed(response);
   }, 10000);
 
   it.concurrent("works with country US", async () => {
@@ -72,8 +73,7 @@ describe("Scrape tests", () => {
       location: { country: "US" },
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toContain("| Country | United States |");
+    expect(response.markdown).toContain("| Country | United States |");
   }, 10000);
 });
 
@@ -84,8 +84,7 @@ describe("Scrape tests", () => {
       formats: ["rawHtml"],
     });
 
-    expectScrapeToSucceed(response);
-    const obj = JSON.parse(response.body.data.rawHtml);
+    const obj = JSON.parse(response.rawHtml!);
     expect(obj.id).toBe(1);
   }, 25000); // TODO: mock and shorten
 });
@@ -97,8 +96,7 @@ describe("Scrape tests", () => {
       formats: ["screenshot"]
     });
 
-    expectScrapeToSucceed(response);
-    expect(typeof response.body.data.screenshot).toBe("string");
+    expect(typeof response.screenshot).toBe("string");
   }, 15000);
 
   it.concurrent("screenshot@fullPage format works", async () => {
@@ -107,11 +105,47 @@ describe("Scrape tests", () => {
       formats: ["screenshot@fullPage"]
     });
 
-    expectScrapeToSucceed(response);
-    expect(typeof response.body.data.screenshot).toBe("string");
+    expect(typeof response.screenshot).toBe("string");
   }, 15000);
 });
 
+  describe("JSON format", () => {
+    it.concurrent("works", async () => {
+      const response = await scrape({
+        url: "http://firecrawl.dev",
+        formats: ["json"],
+        jsonOptions: {
+          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
+          schema: {
+            type: "object",
+            properties: {
+              company_mission: {
+                type: "string",
+              },
+              supports_sso: {
+                type: "boolean",
+              },
+              is_open_source: {
+                type: "boolean",
+              },
+            },
+            required: ["company_mission", "supports_sso", "is_open_source"],
+          },
+        },
+      });
+
+      expect(response).toHaveProperty("json");
+      expect(response.json).toHaveProperty("company_mission");
+      expect(typeof response.json.company_mission).toBe("string");
+      expect(response.json).toHaveProperty("supports_sso");
+      expect(response.json.supports_sso).toBe(false);
+      expect(typeof response.json.supports_sso).toBe("boolean");
+      expect(response.json).toHaveProperty("is_open_source");
+      expect(response.json.is_open_source).toBe(true);
+      expect(typeof response.json.is_open_source).toBe("boolean");
+    }, 30000);
+  });
+
+  describe("Proxy API (f-e dependant)", () => {
+    it.concurrent("undefined works", async () => {
+      await scrape({
diff --git a/apps/api/src/__tests__/snips/search.test.ts b/apps/api/src/__tests__/snips/search.test.ts
new file mode 100644
index 00000000..5cb5323d
--- /dev/null
+++ b/apps/api/src/__tests__/snips/search.test.ts
@@ -0,0 +1,36 @@
+import request from "supertest";
+import { configDotenv } from "dotenv";
+import { Document, SearchRequestInput } from "../../controllers/v1/types";
+
+configDotenv();
+const TEST_URL = "http://127.0.0.1:3002";
+
+async function searchRaw(body: SearchRequestInput) {
+  return await request(TEST_URL)
+    .post("/v1/search")
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .set("Content-Type", "application/json")
+    .send(body);
+}
+
+function expectSearchToSucceed(response: Awaited<ReturnType<typeof searchRaw>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.data).toBe("object");
+  expect(Array.isArray(response.body.data)).toBe(true);
+  expect(response.body.data.length).toBeGreaterThan(0);
+}
+
+async function search(body: SearchRequestInput): Promise<Document[]> {
+  const raw = await searchRaw(body);
+  expectSearchToSucceed(raw);
+  return raw.body.data;
+}
+
+describe("Search tests", () => {
+  it("works", async () => {
+    await search({
+      query: "firecrawl"
+    });
+  }, 15000);
+});
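The types.ts diff below introduces the shared validation helper these schemas now use. extractRefine encodes one rule: a format and its options must travel together. Reduced to a standalone predicate, it looks like this sketch (Req and isConsistent are illustrative names, not part of the patch). Note that the helper ORs the two format/option pairs, so a request passes whenever at least one pair is internally consistent.

    // Illustration only: the consistency rule extractRefine encodes.
    // A pair is "consistent" when the format and its options are
    // both present or both absent.
    type Req = { formats?: string[]; extract?: unknown; jsonOptions?: unknown };

    const isConsistent = (req: Req): boolean => {
      const extractPair =
        Boolean(req.formats?.includes("extract")) === (req.extract !== undefined);
      const jsonPair =
        Boolean(req.formats?.includes("json")) === (req.jsonOptions !== undefined);
      // extractRefine ORs the two pairs, so one consistent pair is enough to pass
      return extractPair || jsonPair;
    };

    // true: the "json" format arrives with its options
    console.log(isConsistent({ formats: ["json"], jsonOptions: {} }));
    // false: both pairs are broken, so zod rejects with extractRefineOpts.message
    console.log(isConsistent({ formats: ["json", "extract"] }));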
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index b8bf4363..9f0150b6 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -221,6 +221,54 @@ const baseScrapeOptions = z
   })
   .strict(strictMessage);
 
+const extractRefine = (obj: any) => {
+  const hasExtractFormat = obj.formats?.includes("extract");
+  const hasExtractOptions = obj.extract !== undefined;
+  const hasJsonFormat = obj.formats?.includes("json");
+  const hasJsonOptions = obj.jsonOptions !== undefined;
+  return (
+    (hasExtractFormat && hasExtractOptions) ||
+    (!hasExtractFormat && !hasExtractOptions) ||
+    (hasJsonFormat && hasJsonOptions) ||
+    (!hasJsonFormat && !hasJsonOptions)
+  );
+};
+const extractRefineOpts = {
+  message:
+    "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
+};
+const extractTransform = (obj: any) => {
+  // Handle timeout
+  if (
+    (obj.formats?.includes("extract") ||
+      obj.extract ||
+      obj.formats?.includes("json") ||
+      obj.jsonOptions) &&
+    !obj.timeout
+  ) {
+    obj = { ...obj, timeout: 60000 };
+  }
+
+  if (obj.formats?.includes("json")) {
+    obj.formats.push("extract");
+  }
+
+  // Convert JSON options to extract options if needed
+  if (obj.jsonOptions && !obj.extract) {
+    obj = {
+      ...obj,
+      extract: {
+        prompt: obj.jsonOptions.prompt,
+        systemPrompt: obj.jsonOptions.systemPrompt,
+        schema: obj.jsonOptions.schema,
+        mode: "llm",
+      },
+    };
+  }
+
+  return obj;
+};
+
 export const scrapeOptions = baseScrapeOptions.refine(
   (obj) => {
     if (!obj.actions) return true;
@@ -229,7 +277,8 @@ export const scrapeOptions = baseScrapeOptions.refine(
   {
     message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
   }
-);
+).refine(extractRefine, extractRefineOpts)
+.transform(extractTransform);
 
 export type ScrapeOptions = z.infer<typeof scrapeOptions>;
 
@@ -281,11 +330,14 @@ export const extractV1Options = z
   .transform((obj) => ({
     ...obj,
     allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
-  }));
+  }))
+  .refine(x => x.scrapeOptions ? extractRefine(x.scrapeOptions) : true, extractRefineOpts)
+  .transform(x => ({ ...x, scrapeOptions: x.scrapeOptions ? extractTransform(x.scrapeOptions) : x.scrapeOptions }));
 
 export type ExtractV1Options = z.infer<typeof extractV1Options>;
 export const extractRequestSchema = extractV1Options;
 export type ExtractRequest = z.infer<typeof extractRequestSchema>;
+export type ExtractRequestInput = z.input<typeof extractRequestSchema>;
 
 export const scrapeRequestSchema = baseScrapeOptions
   .omit({ timeout: true })
@@ -295,55 +347,8 @@ export const scrapeRequestSchema = baseScrapeOptions
     timeout: z.number().int().positive().finite().safe().default(30000),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      const hasJsonFormat = obj.formats?.includes("json");
-      const hasJsonOptions = obj.jsonOptions !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions) ||
-        (hasJsonFormat && hasJsonOptions) ||
-        (!hasJsonFormat && !hasJsonOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
-    },
-  )
-  .transform((obj) => {
-    // Handle timeout
-    if (
-      (obj.formats?.includes("extract") ||
-        obj.extract ||
-        obj.formats?.includes("json") ||
-        obj.jsonOptions) &&
-      !obj.timeout
-    ) {
-      obj = { ...obj, timeout: 60000 };
-    }
-
-    if (obj.formats?.includes("json")) {
-      obj.formats.push("extract");
-    }
-
-    // Convert JSON options to extract options if needed
-    if (obj.jsonOptions && !obj.extract) {
-      obj = {
-        ...obj,
-        extract: {
-          prompt: obj.jsonOptions.prompt,
-          systemPrompt: obj.jsonOptions.systemPrompt,
-          schema: obj.jsonOptions.schema,
-          mode: "llm",
-        },
-      };
-    }
-
-    return obj;
-  });
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
@@ -375,20 +380,8 @@ export const batchScrapeRequestSchema = baseScrapeOptions
     ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
-    },
-  );
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
   .extend({
@@ -399,22 +392,11 @@ export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
     ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
-    },
-  );
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
+export type BatchScrapeRequestInput = z.input<typeof batchScrapeRequestSchema>;
 
 const crawlerOptions = z
   .object({
@@ -452,7 +434,9 @@ export const crawlRequestSchema = crawlerOptions
     webhook: webhookSchema.optional(),
     limit: z.number().default(10000),
   })
-  .strict(strictMessage);
+  .strict(strictMessage)
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
 
 // export type CrawlRequest = {
 //   url: string;
@@ -467,6 +451,7 @@ export const crawlRequestSchema = crawlerOptions
 // }
 
 export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
+export type CrawlRequestInput = z.input<typeof crawlRequestSchema>;
 
 export const mapRequestSchema = crawlerOptions
   .extend({
@@ -936,9 +921,12 @@ export const searchRequestSchema = z
   })
   .strict(
     "Unrecognized key in body -- please review the v1 API documentation for request body changes",
-  );
+  )
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
 
 export type SearchRequest = z.infer<typeof searchRequestSchema>;
+export type SearchRequestInput = z.input<typeof searchRequestSchema>;
 
 export type SearchResponse =
   | ErrorResponse
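A closing note on why the patch exports z.input aliases (ExtractRequestInput, BatchScrapeRequestInput, CrawlRequestInput, SearchRequestInput) next to the existing z.infer types: once a schema ends in .transform(...), the shape a client may send no longer matches the parsed shape the handlers receive, and the snips tests above type their request bodies with the former. A minimal illustration of the distinction, with schema, Input, and Output as hypothetical stand-ins rather than names from the patch:

    import { z } from "zod";

    // A transforming schema: the accepted wire shape and the parsed shape differ.
    const schema = z
      .object({ formats: z.array(z.string()).default(["markdown"]) })
      .transform((obj) => ({ ...obj, normalized: true }));

    type Input = z.input<typeof schema>;  // { formats?: string[] } - what clients may send
    type Output = z.infer<typeof schema>; // { formats: string[]; normalized: boolean }

    const body: Input = {};                    // a valid request body for a test helper
    const parsed: Output = schema.parse(body); // what the controller sees after parsing
    console.log(parsed);                       // { formats: ["markdown"], normalized: true }

The test helpers send Input-shaped bodies over HTTP, while the controllers work with the Output shape after parse, which is why both aliases are worth exporting.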