fix(v1/types): fix extract -> json rename (FIR-1072) (#1195)
* fix(v1/types): fix extract -> json rename
* fix(types/v1): bad transform
parent 5ac6eb7440
commit 586a10f40d
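
The rename's effect, in short: v1 request bodies may now use the "json" format name, while the scraper pipeline still runs on the legacy "extract" format. A minimal sketch of the normalization this commit centralizes (the request body is illustrative; the behavior follows extractTransform in the types.ts hunks below):

    // New-style request:
    const body = {
      url: "http://firecrawl.dev",
      formats: ["json"],
      jsonOptions: { prompt: "...", schema: { type: "object", properties: {} } },
    };
    // After schema parsing, "extract" is pushed into formats and jsonOptions is
    // mirrored into the legacy extract options:
    // { formats: ["json", "extract"],
    //   extract: { prompt, systemPrompt, schema, mode: "llm" }, ... }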
92  apps/api/src/__tests__/snips/batch-scrape.test.ts  (new file)
@@ -0,0 +1,92 @@
+import request from "supertest";
+import { configDotenv } from "dotenv";
+import { BatchScrapeRequestInput } from "../../controllers/v1/types";
+
+configDotenv();
+const TEST_URL = "http://127.0.0.1:3002";
+
+async function batchScrapeStart(body: BatchScrapeRequestInput) {
+  return await request(TEST_URL)
+    .post("/v1/batch/scrape")
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .set("Content-Type", "application/json")
+    .send(body);
+}
+
+async function batchScrapeStatus(id: string) {
+  return await request(TEST_URL)
+    .get("/v1/batch/scrape/" + encodeURIComponent(id))
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .send();
+}
+
+async function batchScrape(body: BatchScrapeRequestInput): ReturnType<typeof batchScrapeStatus> {
+  const bss = await batchScrapeStart(body);
+  expectBatchScrapeStartToSucceed(bss);
+
+  let x;
+
+  do {
+    x = await batchScrapeStatus(bss.body.id);
+    expect(x.statusCode).toBe(200);
+    expect(typeof x.body.status).toBe("string");
+  } while (x.body.status !== "completed")
+
+  expectBatchScrapeToSucceed(x);
+  return x;
+}
+
+function expectBatchScrapeStartToSucceed(response: Awaited<ReturnType<typeof batchScrape>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.id).toBe("string");
+}
+
+function expectBatchScrapeToSucceed(response: Awaited<ReturnType<typeof batchScrapeStatus>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.status).toBe("string");
+  expect(response.body.status).toBe("completed");
+  expect(response.body).toHaveProperty("data");
+  expect(Array.isArray(response.body.data)).toBe(true);
+  expect(response.body.data.length).toBeGreaterThan(0);
+}
+
+describe("Batch scrape tests", () => {
+  describe("JSON format", () => {
+    it.concurrent("works", async () => {
+      const response = await batchScrape({
+        urls: ["http://firecrawl.dev"],
+        formats: ["json"],
+        jsonOptions: {
+          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
+          schema: {
+            type: "object",
+            properties: {
+              company_mission: {
+                type: "string",
+              },
+              supports_sso: {
+                type: "boolean",
+              },
+              is_open_source: {
+                type: "boolean",
+              },
+            },
+            required: ["company_mission", "supports_sso", "is_open_source"],
+          },
+        },
+      });
+
+      expect(response.body.data[0]).toHaveProperty("json");
+      expect(response.body.data[0].json).toHaveProperty("company_mission");
+      expect(typeof response.body.data[0].json.company_mission).toBe("string");
+      expect(response.body.data[0].json).toHaveProperty("supports_sso");
+      expect(response.body.data[0].json.supports_sso).toBe(false);
+      expect(typeof response.body.data[0].json.supports_sso).toBe("boolean");
+      expect(response.body.data[0].json).toHaveProperty("is_open_source");
+      expect(response.body.data[0].json.is_open_source).toBe(true);
+      expect(typeof response.body.data[0].json.is_open_source).toBe("boolean");
+    }, 30000);
+  });
+});
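One thing to note about the batchScrape helper above: it polls the status endpoint in a tight do/while loop with no pause between requests. A variant with a fixed delay (a sketch, not part of this commit; the 500 ms interval is an arbitrary choice) would look like:

    async function batchScrapePolled(body: BatchScrapeRequestInput) {
      const bss = await batchScrapeStart(body);
      expectBatchScrapeStartToSucceed(bss);

      let x;
      do {
        // Pause between polls to avoid hammering the status endpoint.
        await new Promise((resolve) => setTimeout(resolve, 500));
        x = await batchScrapeStatus(bss.body.id);
        expect(x.statusCode).toBe(200);
      } while (x.body.status !== "completed");

      expectBatchScrapeToSucceed(x);
      return x;
    }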
apps/api/src/__tests__/snips/scrape.test.ts

@@ -1,11 +1,11 @@
 import request from "supertest";
 import { configDotenv } from "dotenv";
-import { ScrapeRequestInput } from "../../controllers/v1/types";
+import { Document, ScrapeRequestInput } from "../../controllers/v1/types";
 
 configDotenv();
 const TEST_URL = "http://127.0.0.1:3002";
 
-async function scrape(body: ScrapeRequestInput) {
+async function scrapeRaw(body: ScrapeRequestInput) {
   return await request(TEST_URL)
     .post("/v1/scrape")
     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@@ -13,12 +13,18 @@ async function scrape(body: ScrapeRequestInput) {
     .send(body);
 }
 
-function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
+function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>) {
   expect(response.statusCode).toBe(200);
   expect(response.body.success).toBe(true);
   expect(typeof response.body.data).toBe("object");
 }
 
+async function scrape(body: ScrapeRequestInput): Promise<Document> {
+  const raw = await scrapeRaw(body);
+  expectScrapeToSucceed(raw);
+  return raw.body.data;
+}
+
 describe("Scrape tests", () => {
   it("mocking works properly", async () => {
     // depends on falsified mock mocking-works-properly
@@ -30,8 +36,7 @@ describe("Scrape tests", () => {
       useMock: "mocking-works-properly",
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toBe(
+    expect(response.markdown).toBe(
       "this is fake data coming from the mocking system!",
     );
   }, 10000);
@@ -42,8 +47,7 @@ describe("Scrape tests", () => {
       url: "https://canyoublockit.com/testing/",
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).not.toContain(".g.doubleclick.net/");
+    expect(response.markdown).not.toContain(".g.doubleclick.net/");
   }, 10000);
 
   it.concurrent("doesn't block ads if explicitly disabled", async () => {
@@ -52,8 +56,7 @@ describe("Scrape tests", () => {
       blockAds: false,
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toContain(".g.doubleclick.net/");
+    expect(response.markdown).toContain(".g.doubleclick.net/");
   }, 10000);
 });
 
@@ -62,8 +65,6 @@ describe("Scrape tests", () => {
     const response = await scrape({
      url: "https://iplocation.com",
     });
-
-    expectScrapeToSucceed(response);
   }, 10000);
 
   it.concurrent("works with country US", async () => {
@@ -72,8 +73,7 @@ describe("Scrape tests", () => {
       location: { country: "US" },
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toContain("| Country | United States |");
+    expect(response.markdown).toContain("| Country | United States |");
   }, 10000);
 });
 
@@ -84,8 +84,7 @@ describe("Scrape tests", () => {
       formats: ["rawHtml"],
     });
 
-    expectScrapeToSucceed(response);
-    const obj = JSON.parse(response.body.data.rawHtml);
+    const obj = JSON.parse(response.rawHtml!);
     expect(obj.id).toBe(1);
   }, 25000); // TODO: mock and shorten
 });
@@ -97,8 +96,7 @@ describe("Scrape tests", () => {
       formats: ["screenshot"]
     });
 
-    expectScrapeToSucceed(response);
-    expect(typeof response.body.data.screenshot).toBe("string");
+    expect(typeof response.screenshot).toBe("string");
   }, 15000);
 
   it.concurrent("screenshot@fullPage format works", async () => {
@@ -107,8 +105,44 @@ describe("Scrape tests", () => {
       formats: ["screenshot@fullPage"]
     });
 
-    expectScrapeToSucceed(response);
-    expect(typeof response.body.data.screenshot).toBe("string");
+    expect(typeof response.screenshot).toBe("string");
   }, 15000);
 })
+
+  describe("JSON format", () => {
+    it.concurrent("works", async () => {
+      const response = await scrape({
+        url: "http://firecrawl.dev",
+        formats: ["json"],
+        jsonOptions: {
+          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
+          schema: {
+            type: "object",
+            properties: {
+              company_mission: {
+                type: "string",
+              },
+              supports_sso: {
+                type: "boolean",
+              },
+              is_open_source: {
+                type: "boolean",
+              },
+            },
+            required: ["company_mission", "supports_sso", "is_open_source"],
+          },
+        },
+      });
+
+      expect(response).toHaveProperty("json");
+      expect(response.json).toHaveProperty("company_mission");
+      expect(typeof response.json.company_mission).toBe("string");
+      expect(response.json).toHaveProperty("supports_sso");
+      expect(response.json.supports_sso).toBe(false);
+      expect(typeof response.json.supports_sso).toBe("boolean");
+      expect(response.json).toHaveProperty("is_open_source");
+      expect(response.json.is_open_source).toBe(true);
+      expect(typeof response.json.is_open_source).toBe("boolean");
+    }, 30000);
+  });
 });
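The scrapeRaw/scrape split above is what lets the tests drop their per-test expectScrapeToSucceed calls: scrape asserts success internally and returns the Document, so assertions read off the document directly. Usage after the change (URL and expected content are illustrative only):

    const doc = await scrape({ url: "https://example.com" }); // fails via expect() on error
    expect(doc.markdown).toContain("Example Domain");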
apps/api/src/controllers/v1/types.ts

@@ -220,6 +220,54 @@ const baseScrapeOptions = z
   })
   .strict(strictMessage);
 
+const extractRefine = (obj) => {
+  const hasExtractFormat = obj.formats?.includes("extract");
+  const hasExtractOptions = obj.extract !== undefined;
+  const hasJsonFormat = obj.formats?.includes("json");
+  const hasJsonOptions = obj.jsonOptions !== undefined;
+  return (
+    (hasExtractFormat && hasExtractOptions) ||
+    (!hasExtractFormat && !hasExtractOptions) ||
+    (hasJsonFormat && hasJsonOptions) ||
+    (!hasJsonFormat && !hasJsonOptions)
+  );
+};
+const extractRefineOpts = {
+  message:
+    "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
+};
+const extractTransform = (obj) => {
+  // Handle timeout
+  if (
+    (obj.formats?.includes("extract") ||
+      obj.extract ||
+      obj.formats?.includes("json") ||
+      obj.jsonOptions) &&
+    !obj.timeout
+  ) {
+    obj = { ...obj, timeout: 60000 };
+  }
+
+  if (obj.formats?.includes("json")) {
+    obj.formats.push("extract");
+  }
+
+  // Convert JSON options to extract options if needed
+  if (obj.jsonOptions && !obj.extract) {
+    obj = {
+      ...obj,
+      extract: {
+        prompt: obj.jsonOptions.prompt,
+        systemPrompt: obj.jsonOptions.systemPrompt,
+        schema: obj.jsonOptions.schema,
+        mode: "llm",
+      },
+    };
+  }
+
+  return obj;
+};
+
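A quick sanity check of the helpers above (illustrative calls, not part of the commit). Note that, as written, extractRefine only rejects when a format/options pair is mismatched on both the extract side and the json side:

    extractRefine({ formats: ["json"], jsonOptions: { prompt: "p" } }); // true
    extractRefine({ formats: ["json", "extract"] });                    // false: both formats, no options
    // extractTransform pushes "extract" alongside "json", mirrors jsonOptions into the
    // legacy extract options, and defaults timeout to 60000 when none is set:
    extractTransform({ formats: ["json"], jsonOptions: { prompt: "p" } });
    // => { formats: ["json", "extract"], timeout: 60000,
    //      extract: { prompt: "p", systemPrompt: undefined, schema: undefined, mode: "llm" }, ... }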
 export const scrapeOptions = baseScrapeOptions.refine(
   (obj) => {
     if (!obj.actions) return true;
@@ -228,7 +276,8 @@ export const scrapeOptions = baseScrapeOptions.refine(
   {
     message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
   }
-);
+).refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type ScrapeOptions = z.infer<typeof baseScrapeOptions>;
 
@@ -280,7 +329,9 @@ export const extractV1Options = z
   .transform((obj) => ({
     ...obj,
     allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
-  }));
+  }))
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => extractTransform(x.scrapeOptions));
 
 export type ExtractV1Options = z.infer<typeof extractV1Options>;
 export const extractRequestSchema = extractV1Options;
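For schemas that wrap scrape options in a larger request object, the helpers are applied to the nested scrapeOptions, as in the extractV1Options chain above and the crawl/search schemas below. One zod subtlety worth keeping in mind: .transform replaces the parsed value with whatever the callback returns, so a wrapper-preserving form of this reuse pattern (a sketch with an illustrative schema name, not code from this diff) spreads the result back into the wrapper:

    const exampleRequestSchema = z
      .object({
        urls: z.string().array(),
        scrapeOptions: baseScrapeOptions.default({}),
      })
      .refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
      // Keep the wrapper object; only replace its scrapeOptions field.
      .transform((x) => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));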
@@ -294,55 +345,8 @@ export const scrapeRequestSchema = baseScrapeOptions
     timeout: z.number().int().positive().finite().safe().default(30000),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      const hasJsonFormat = obj.formats?.includes("json");
-      const hasJsonOptions = obj.jsonOptions !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions) ||
-        (hasJsonFormat && hasJsonOptions) ||
-        (!hasJsonFormat && !hasJsonOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
-    },
-  )
-  .transform((obj) => {
-    // Handle timeout
-    if (
-      (obj.formats?.includes("extract") ||
-        obj.extract ||
-        obj.formats?.includes("json") ||
-        obj.jsonOptions) &&
-      !obj.timeout
-    ) {
-      obj = { ...obj, timeout: 60000 };
-    }
-
-    if (obj.formats?.includes("json")) {
-      obj.formats.push("extract");
-    }
-
-    // Convert JSON options to extract options if needed
-    if (obj.jsonOptions && !obj.extract) {
-      obj = {
-        ...obj,
-        extract: {
-          prompt: obj.jsonOptions.prompt,
-          systemPrompt: obj.jsonOptions.systemPrompt,
-          schema: obj.jsonOptions.schema,
-          mode: "llm",
-        },
-      };
-    }
-
-    return obj;
-  });
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
 
@@ -374,20 +378,8 @@ export const batchScrapeRequestSchema = baseScrapeOptions
     ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
-    },
-  );
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
   .extend({
@@ -398,22 +390,11 @@ export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
     ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
-    },
-  );
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
 export type BatchScrapeRequestInput = z.input<typeof batchScrapeRequestSchema>;
 
 const crawlerOptions = z
   .object({
@@ -451,7 +432,9 @@ export const crawlRequestSchema = crawlerOptions
     webhook: webhookSchema.optional(),
     limit: z.number().default(10000),
   })
-  .strict(strictMessage);
+  .strict(strictMessage)
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => extractTransform(x.scrapeOptions));
 
 // export type CrawlRequest = {
 //   url: string;
@@ -935,7 +918,9 @@ export const searchRequestSchema = z
   })
   .strict(
     "Unrecognized key in body -- please review the v1 API documentation for request body changes",
-  );
+  )
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => extractTransform(x.scrapeOptions));
 
 export type SearchRequest = z.infer<typeof searchRequestSchema>;
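Taken together, scrape, batch scrape, crawl, extract, and search all funnel through the same extractRefine/extractTransform pair instead of five diverging inline copies. A quick REPL-style check of the end result for the scrape schema (illustrative; uses the exports above):

    const parsed = scrapeRequestSchema.parse({
      url: "http://firecrawl.dev",
      formats: ["json"],
      jsonOptions: { prompt: "What is the company's mission?" },
    });
    // parsed.formats       => ["json", "extract"]
    // parsed.extract?.mode => "llm"
    // parsed.timeout       => 30000 (the schema default applies before the transform
    //                         runs, so the 60000 fallback in extractTransform is skipped)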