fix(v1/types): fix extract -> json rename, ROUND II (FIR-1072) (#1199)

* Revert "Revert "fix(v1/types): fix extract -> json rename (FIR-1072) (#1195)""

This reverts commit e28a44463ae49ffc195507204492cc7c15c438c4.

* fix(v1/types): fix bad transform

* feat(v1): proxy option / stealthProxy flag (FIR-1050) (#1196)

* feat(v1): proxy option / stealthProxy flag

* feat(js-sdk): add proxy option

* fix

* fix extract tests
This commit is contained in:
Gergő Móricz 2025-02-19 16:07:55 +01:00 committed by GitHub
parent 42050d3d6e
commit fc64f436ed
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 394 additions and 100 deletions

View File

@ -27,6 +27,7 @@ env:
HDX_NODE_BETA_MODE: 1
FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }}
ENV: ${{ secrets.ENV }}
jobs:

View File

@ -0,0 +1,92 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import { BatchScrapeRequestInput } from "../../controllers/v1/types";
configDotenv();
const TEST_URL = "http://127.0.0.1:3002";
// Kick off a batch scrape job via the v1 API and return the raw response.
async function batchScrapeStart(body: BatchScrapeRequestInput) {
  const req = request(TEST_URL)
    .post("/v1/batch/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json");
  return await req.send(body);
}
// Fetch the current status of a batch scrape job by its id.
async function batchScrapeStatus(id: string) {
  const statusPath = "/v1/batch/scrape/" + encodeURIComponent(id);
  return await request(TEST_URL)
    .get(statusPath)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .send();
}
// Start a batch scrape and poll its status until it reports "completed",
// asserting success along the way. Returns the final status response.
async function batchScrape(body: BatchScrapeRequestInput): ReturnType<typeof batchScrapeStatus> {
  const bss = await batchScrapeStart(body);
  expectBatchScrapeStartToSucceed(bss);

  let x: Awaited<ReturnType<typeof batchScrapeStatus>>;
  do {
    // Throttle polling — the original busy-looped with no delay, hammering
    // the status endpoint as fast as requests could complete.
    await new Promise((resolve) => setTimeout(resolve, 1000));
    x = await batchScrapeStatus(bss.body.id);
    expect(x.statusCode).toBe(200);
    expect(typeof x.body.status).toBe("string");
  } while (x.body.status !== "completed");

  expectBatchScrapeToSucceed(x);
  return x;
}
function expectBatchScrapeStartToSucceed(response: Awaited<ReturnType<typeof batchScrape>>) {
expect(response.statusCode).toBe(200);
expect(response.body.success).toBe(true);
expect(typeof response.body.id).toBe("string");
}
// Assert that a finished batch scrape status response reports completion
// and carries a non-empty data array.
function expectBatchScrapeToSucceed(response: Awaited<ReturnType<typeof batchScrapeStatus>>) {
  const { body } = response;
  expect(response.statusCode).toBe(200);
  expect(body.success).toBe(true);
  expect(typeof body.status).toBe("string");
  expect(body.status).toBe("completed");
  expect(body).toHaveProperty("data");
  expect(Array.isArray(body.data)).toBe(true);
  expect(body.data.length).toBeGreaterThan(0);
}
describe("Batch scrape tests", () => {
  describe("JSON format", () => {
    it.concurrent("works", async () => {
      // JSON-schema describing the fields the LLM extraction must return.
      const schema = {
        type: "object",
        properties: {
          company_mission: { type: "string" },
          supports_sso: { type: "boolean" },
          is_open_source: { type: "boolean" },
        },
        required: ["company_mission", "supports_sso", "is_open_source"],
      };

      const response = await batchScrape({
        urls: ["http://firecrawl.dev"],
        formats: ["json"],
        jsonOptions: {
          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
          schema,
        },
      });

      const doc = response.body.data[0];
      expect(doc).toHaveProperty("json");
      expect(doc.json).toHaveProperty("company_mission");
      expect(typeof doc.json.company_mission).toBe("string");
      expect(doc.json).toHaveProperty("supports_sso");
      expect(doc.json.supports_sso).toBe(false);
      expect(typeof doc.json.supports_sso).toBe("boolean");
      expect(doc.json).toHaveProperty("is_open_source");
      expect(doc.json.is_open_source).toBe(true);
      expect(typeof doc.json.is_open_source).toBe("boolean");
    }, 30000);
  });
});

View File

@ -0,0 +1,62 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import { CrawlRequestInput } from "../../controllers/v1/types";
configDotenv();
const TEST_URL = "http://127.0.0.1:3002";
// Kick off a crawl job via the v1 API and return the raw response.
async function crawlStart(body: CrawlRequestInput) {
  const req = request(TEST_URL)
    .post("/v1/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json");
  return await req.send(body);
}
// Fetch the current status of a crawl job by its id.
async function crawlStatus(id: string) {
  const statusPath = "/v1/crawl/" + encodeURIComponent(id);
  return await request(TEST_URL)
    .get(statusPath)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .send();
}
// Start a crawl and poll its status until it reports "completed",
// asserting success along the way. Returns the final status response.
async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
  const cs = await crawlStart(body);
  expectCrawlStartToSucceed(cs);

  let x: Awaited<ReturnType<typeof crawlStatus>>;
  do {
    // Throttle polling — the original busy-looped with no delay, hammering
    // the status endpoint as fast as requests could complete.
    await new Promise((resolve) => setTimeout(resolve, 1000));
    x = await crawlStatus(cs.body.id);
    expect(x.statusCode).toBe(200);
    expect(typeof x.body.status).toBe("string");
  } while (x.body.status !== "completed");

  expectCrawlToSucceed(x);
  return x;
}
// Assert that kicking off a crawl succeeded and returned a job id.
function expectCrawlStartToSucceed(response: Awaited<ReturnType<typeof crawlStart>>) {
  const { statusCode, body } = response;
  expect(statusCode).toBe(200);
  expect(body.success).toBe(true);
  expect(typeof body.id).toBe("string");
}
// Assert that a finished crawl status response reports completion and
// carries a non-empty data array.
function expectCrawlToSucceed(response: Awaited<ReturnType<typeof crawlStatus>>) {
  const { body } = response;
  expect(response.statusCode).toBe(200);
  expect(body.success).toBe(true);
  expect(typeof body.status).toBe("string");
  expect(body.status).toBe("completed");
  expect(body).toHaveProperty("data");
  expect(Array.isArray(body.data)).toBe(true);
  expect(body.data.length).toBeGreaterThan(0);
}
describe("Crawl tests", () => {
it.concurrent("works", async () => {
await crawl({
url: "https://firecrawl.dev",
limit: 10,
});
}, 120000);
});

View File

@ -0,0 +1,81 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import { ExtractRequestInput, ExtractResponse } from "../../controllers/v1/types";
configDotenv();
const TEST_URL = "http://127.0.0.1:3002";
// Kick off an extract job via the v1 API and return the raw response.
async function extractStart(body: ExtractRequestInput) {
  const req = request(TEST_URL)
    .post("/v1/extract")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json");
  return await req.send(body);
}
// Fetch the current status of an extract job by its id.
async function extractStatus(id: string) {
  const statusPath = "/v1/extract/" + encodeURIComponent(id);
  return await request(TEST_URL)
    .get(statusPath)
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .send();
}
// Start an extract job and poll its status until it reports "completed",
// asserting success along the way. Returns the final response body.
async function extract(body: ExtractRequestInput): Promise<ExtractResponse> {
  const es = await extractStart(body);
  expectExtractStartToSucceed(es);

  let x: Awaited<ReturnType<typeof extractStatus>>;
  do {
    // Throttle polling — the original busy-looped with no delay, hammering
    // the status endpoint as fast as requests could complete.
    await new Promise((resolve) => setTimeout(resolve, 1000));
    x = await extractStatus(es.body.id);
    expect(x.statusCode).toBe(200);
    expect(typeof x.body.status).toBe("string");
  } while (x.body.status !== "completed");

  expectExtractToSucceed(x);
  return x.body;
}
// Assert that kicking off an extract job succeeded and returned a job id.
function expectExtractStartToSucceed(response: Awaited<ReturnType<typeof extractStart>>) {
  const { statusCode, body } = response;
  expect(statusCode).toBe(200);
  expect(body.success).toBe(true);
  expect(typeof body.id).toBe("string");
}
// Assert that a finished extract status response reports completion and
// includes a data payload.
function expectExtractToSucceed(response: Awaited<ReturnType<typeof extractStatus>>) {
  const { body } = response;
  expect(response.statusCode).toBe(200);
  expect(body.success).toBe(true);
  expect(typeof body.status).toBe("string");
  expect(body.status).toBe("completed");
  expect(body).toHaveProperty("data");
}
describe("Extract tests", () => {
  it.concurrent("works", async () => {
    // JSON-schema describing the fields the extraction must return.
    const schema = {
      type: "object",
      properties: {
        company_mission: { type: "string" },
        is_open_source: { type: "boolean" },
      },
      required: ["company_mission", "is_open_source"],
    };

    const res = await extract({
      urls: ["https://firecrawl.dev"],
      schema,
      origin: "api-sdk",
    });

    expect(res.data).toHaveProperty("company_mission");
    expect(typeof res.data.company_mission).toBe("string");
    expect(res.data).toHaveProperty("is_open_source");
    expect(typeof res.data.is_open_source).toBe("boolean");
    expect(res.data.is_open_source).toBe(true);
  }, 60000);
});

View File

@ -1,11 +1,11 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import { ScrapeRequestInput } from "../../controllers/v1/types";
import { Document, ScrapeRequestInput } from "../../controllers/v1/types";
configDotenv();
const TEST_URL = "http://127.0.0.1:3002";
async function scrape(body: ScrapeRequestInput) {
async function scrapeRaw(body: ScrapeRequestInput) {
return await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -13,12 +13,18 @@ async function scrape(body: ScrapeRequestInput) {
.send(body);
}
function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>) {
expect(response.statusCode).toBe(200);
expect(response.body.success).toBe(true);
expect(typeof response.body.data).toBe("object");
}
async function scrape(body: ScrapeRequestInput): Promise<Document> {
const raw = await scrapeRaw(body);
expectScrapeToSucceed(raw);
return raw.body.data;
}
describe("Scrape tests", () => {
it("mocking works properly", async () => {
// depends on falsified mock mocking-works-properly
@ -30,8 +36,7 @@ describe("Scrape tests", () => {
useMock: "mocking-works-properly",
});
expectScrapeToSucceed(response);
expect(response.body.data.markdown).toBe(
expect(response.markdown).toBe(
"this is fake data coming from the mocking system!",
);
}, 10000);
@ -42,8 +47,7 @@ describe("Scrape tests", () => {
url: "https://canyoublockit.com/testing/",
});
expectScrapeToSucceed(response);
expect(response.body.data.markdown).not.toContain(".g.doubleclick.net/");
expect(response.markdown).not.toContain(".g.doubleclick.net/");
}, 10000);
it.concurrent("doesn't block ads if explicitly disabled", async () => {
@ -52,8 +56,7 @@ describe("Scrape tests", () => {
blockAds: false,
});
expectScrapeToSucceed(response);
expect(response.body.data.markdown).toContain(".g.doubleclick.net/");
expect(response.markdown).toContain(".g.doubleclick.net/");
}, 10000);
});
@ -62,8 +65,6 @@ describe("Scrape tests", () => {
const response = await scrape({
url: "https://iplocation.com",
});
expectScrapeToSucceed(response);
}, 10000);
it.concurrent("works with country US", async () => {
@ -72,8 +73,7 @@ describe("Scrape tests", () => {
location: { country: "US" },
});
expectScrapeToSucceed(response);
expect(response.body.data.markdown).toContain("| Country | United States |");
expect(response.markdown).toContain("| Country | United States |");
}, 10000);
});
@ -84,8 +84,7 @@ describe("Scrape tests", () => {
formats: ["rawHtml"],
});
expectScrapeToSucceed(response);
const obj = JSON.parse(response.body.data.rawHtml);
const obj = JSON.parse(response.rawHtml!);
expect(obj.id).toBe(1);
}, 25000); // TODO: mock and shorten
});
@ -97,8 +96,7 @@ describe("Scrape tests", () => {
formats: ["screenshot"]
});
expectScrapeToSucceed(response);
expect(typeof response.body.data.screenshot).toBe("string");
expect(typeof response.screenshot).toBe("string");
}, 15000);
it.concurrent("screenshot@fullPage format works", async () => {
@ -107,11 +105,47 @@ describe("Scrape tests", () => {
formats: ["screenshot@fullPage"]
});
expectScrapeToSucceed(response);
expect(typeof response.body.data.screenshot).toBe("string");
expect(typeof response.screenshot).toBe("string");
}, 15000);
});
describe("JSON format", () => {
it.concurrent("works", async () => {
const response = await scrape({
url: "http://firecrawl.dev",
formats: ["json"],
jsonOptions: {
prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
schema: {
type: "object",
properties: {
company_mission: {
type: "string",
},
supports_sso: {
type: "boolean",
},
is_open_source: {
type: "boolean",
},
},
required: ["company_mission", "supports_sso", "is_open_source"],
},
},
});
expect(response).toHaveProperty("json");
expect(response.json).toHaveProperty("company_mission");
expect(typeof response.json.company_mission).toBe("string");
expect(response.json).toHaveProperty("supports_sso");
expect(response.json.supports_sso).toBe(false);
expect(typeof response.json.supports_sso).toBe("boolean");
expect(response.json).toHaveProperty("is_open_source");
expect(response.json.is_open_source).toBe(true);
expect(typeof response.json.is_open_source).toBe("boolean");
}, 30000);
});
describe("Proxy API (f-e dependant)", () => {
it.concurrent("undefined works", async () => {
await scrape({

View File

@ -0,0 +1,36 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import { Document, SearchRequestInput } from "../../controllers/v1/types";
configDotenv();
const TEST_URL = "http://127.0.0.1:3002";
// POST a raw search request to the v1 API without asserting on the result.
async function searchRaw(body: SearchRequestInput) {
  const req = request(TEST_URL)
    .post("/v1/search")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json");
  return await req.send(body);
}
// Assert a successful search response: 200, success flag, and a non-empty
// result array. NOTE(review): the name looks copy-pasted from the scrape
// suite — consider renaming to expectSearchToSucceed (with its caller).
function expectScrapeToSucceed(response: Awaited<ReturnType<typeof searchRaw>>) {
  const { statusCode, body } = response;
  expect(statusCode).toBe(200);
  expect(body.success).toBe(true);
  expect(typeof body.data).toBe("object");
  expect(Array.isArray(body.data)).toBe(true);
  expect(body.data.length).toBeGreaterThan(0);
}
// Run a search and return its result documents.
// Fixed the return type: /v1/search returns an array of documents (the
// success assertion itself checks Array.isArray(body.data)), so the
// previous Promise<Document> annotation was wrong.
async function search(body: SearchRequestInput): Promise<Document[]> {
  const raw = await searchRaw(body);
  expectScrapeToSucceed(raw);
  return raw.body.data;
}
// Smoke test for /v1/search. Fixed the suite label: it was copy-pasted as
// "Scrape tests", which mislabels this suite in test reports.
describe("Search tests", () => {
  it("works", async () => {
    await search({
      query: "firecrawl"
    });
  }, 15000);
});

View File

@ -221,6 +221,54 @@ const baseScrapeOptions = z
})
.strict(strictMessage);
// Validate that each LLM-extraction format is paired with its options and
// vice versa: "extract" <-> `extract`, "json" <-> `jsonOptions`.
// Fixed: the previous OR-chain was vacuously true whenever either pair was
// fully absent — e.g. formats: ["extract"] with no `extract` options passed
// because the json pair (`!hasJsonFormat && !hasJsonOptions`) held, defeating
// the validation described by the error message.
const extractRefine = (obj) => {
  const hasExtractFormat = Boolean(obj.formats?.includes("extract"));
  const hasExtractOptions = obj.extract !== undefined;
  const hasJsonFormat = Boolean(obj.formats?.includes("json"));
  const hasJsonOptions = obj.jsonOptions !== undefined;
  return (
    hasExtractFormat === hasExtractOptions &&
    hasJsonFormat === hasJsonOptions
  );
};
// Shared zod `.refine()` error options for every schema that applies
// `extractRefine` (scrape, batch scrape, crawl, search, extract).
const extractRefineOpts = {
  message:
    "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
};
// Normalize LLM-extraction options on a parsed scrape-options object:
// bumps the default timeout, aliases the "json" format onto "extract",
// and mirrors `jsonOptions` into `extract`. Returns a new object; the
// input is never mutated.
const extractTransform = (obj) => {
  // Extraction involves an LLM step, so give it a longer default timeout.
  if (
    (obj.formats?.includes("extract") ||
      obj.extract ||
      obj.formats?.includes("json") ||
      obj.jsonOptions) &&
    !obj.timeout
  ) {
    obj = { ...obj, timeout: 60000 };
  }

  // "json" rides on the "extract" pipeline, so ensure "extract" is present.
  // Fixed: previously this pushed into the caller's `formats` array in
  // place (mutating the input) and could append a duplicate "extract".
  if (obj.formats?.includes("json") && !obj.formats.includes("extract")) {
    obj = { ...obj, formats: [...obj.formats, "extract"] };
  }

  // Convert JSON options to extract options if only the former were given.
  if (obj.jsonOptions && !obj.extract) {
    obj = {
      ...obj,
      extract: {
        prompt: obj.jsonOptions.prompt,
        systemPrompt: obj.jsonOptions.systemPrompt,
        schema: obj.jsonOptions.schema,
        mode: "llm",
      },
    };
  }

  return obj;
};
export const scrapeOptions = baseScrapeOptions.refine(
(obj) => {
if (!obj.actions) return true;
@ -229,7 +277,8 @@ export const scrapeOptions = baseScrapeOptions.refine(
{
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
}
);
).refine(extractRefine, extractRefineOpts)
.transform(extractTransform);
export type ScrapeOptions = z.infer<typeof baseScrapeOptions>;
@ -281,11 +330,14 @@ export const extractV1Options = z
.transform((obj) => ({
...obj,
allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
}));
}))
.refine(x => x.scrapeOptions ? extractRefine(x.scrapeOptions) : true, extractRefineOpts)
.transform(x => ({ ...x, scrapeOptions: x.scrapeOptions ? extractTransform(x.scrapeOptions) : x.scrapeOptions }));
export type ExtractV1Options = z.infer<typeof extractV1Options>;
export const extractRequestSchema = extractV1Options;
export type ExtractRequest = z.infer<typeof extractRequestSchema>;
export type ExtractRequestInput = z.input<typeof extractRequestSchema>;
export const scrapeRequestSchema = baseScrapeOptions
.omit({ timeout: true })
@ -295,55 +347,8 @@ export const scrapeRequestSchema = baseScrapeOptions
timeout: z.number().int().positive().finite().safe().default(30000),
})
.strict(strictMessage)
.refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
const hasExtractOptions = obj.extract !== undefined;
const hasJsonFormat = obj.formats?.includes("json");
const hasJsonOptions = obj.jsonOptions !== undefined;
return (
(hasExtractFormat && hasExtractOptions) ||
(!hasExtractFormat && !hasExtractOptions) ||
(hasJsonFormat && hasJsonOptions) ||
(!hasJsonFormat && !hasJsonOptions)
);
},
{
message:
"When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
},
)
.transform((obj) => {
// Handle timeout
if (
(obj.formats?.includes("extract") ||
obj.extract ||
obj.formats?.includes("json") ||
obj.jsonOptions) &&
!obj.timeout
) {
obj = { ...obj, timeout: 60000 };
}
if (obj.formats?.includes("json")) {
obj.formats.push("extract");
}
// Convert JSON options to extract options if needed
if (obj.jsonOptions && !obj.extract) {
obj = {
...obj,
extract: {
prompt: obj.jsonOptions.prompt,
systemPrompt: obj.jsonOptions.systemPrompt,
schema: obj.jsonOptions.schema,
mode: "llm",
},
};
}
return obj;
});
.refine(extractRefine, extractRefineOpts)
.transform(extractTransform);
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
@ -375,20 +380,8 @@ export const batchScrapeRequestSchema = baseScrapeOptions
ignoreInvalidURLs: z.boolean().default(false),
})
.strict(strictMessage)
.refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
const hasExtractOptions = obj.extract !== undefined;
return (
(hasExtractFormat && hasExtractOptions) ||
(!hasExtractFormat && !hasExtractOptions)
);
},
{
message:
"When 'extract' format is specified, 'extract' options must be provided, and vice versa",
},
);
.refine(extractRefine, extractRefineOpts)
.transform(extractTransform);
export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
.extend({
@ -399,22 +392,11 @@ export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
ignoreInvalidURLs: z.boolean().default(false),
})
.strict(strictMessage)
.refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
const hasExtractOptions = obj.extract !== undefined;
return (
(hasExtractFormat && hasExtractOptions) ||
(!hasExtractFormat && !hasExtractOptions)
);
},
{
message:
"When 'extract' format is specified, 'extract' options must be provided, and vice versa",
},
);
.refine(extractRefine, extractRefineOpts)
.transform(extractTransform);
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
export type BatchScrapeRequestInput = z.input<typeof batchScrapeRequestSchema>;
const crawlerOptions = z
.object({
@ -452,7 +434,9 @@ export const crawlRequestSchema = crawlerOptions
webhook: webhookSchema.optional(),
limit: z.number().default(10000),
})
.strict(strictMessage);
.strict(strictMessage)
.refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
.transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
// export type CrawlRequest = {
// url: string;
@ -467,6 +451,7 @@ export const crawlRequestSchema = crawlerOptions
// }
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
export type CrawlRequestInput = z.input<typeof crawlRequestSchema>;
export const mapRequestSchema = crawlerOptions
.extend({
@ -936,9 +921,12 @@ export const searchRequestSchema = z
})
.strict(
"Unrecognized key in body -- please review the v1 API documentation for request body changes",
);
)
.refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
.transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
export type SearchRequest = z.infer<typeof searchRequestSchema>;
export type SearchRequestInput = z.input<typeof searchRequestSchema>;
export type SearchResponse =
| ErrorResponse