fix(v1/types): fix extract -> json rename (FIR-1072) (#1195)
* fix(v1/types): fix extract -> json rename
* fix(types/v1): bad transform
parent 5ac6eb7440
commit 586a10f40d
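
The rename's effect, in short: v1 request bodies may now use the "json" format name, while the scraper pipeline still runs on the legacy "extract" format. A minimal sketch of the normalization this commit centralizes (the request body is illustrative; the behavior follows extractTransform in the types.ts hunks below):

    // New-style request:
    const body = {
      url: "http://firecrawl.dev",
      formats: ["json"],
      jsonOptions: { prompt: "...", schema: { type: "object", properties: {} } },
    };
    // After schema parsing, "extract" is pushed into formats and jsonOptions is
    // mirrored into the legacy extract options:
    // { formats: ["json", "extract"],
    //   extract: { prompt, systemPrompt, schema, mode: "llm" }, ... }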
92  apps/api/src/__tests__/snips/batch-scrape.test.ts  (new file)
@@ -0,0 +1,92 @@
+import request from "supertest";
+import { configDotenv } from "dotenv";
+import { BatchScrapeRequestInput } from "../../controllers/v1/types";
+
+configDotenv();
+const TEST_URL = "http://127.0.0.1:3002";
+
+async function batchScrapeStart(body: BatchScrapeRequestInput) {
+  return await request(TEST_URL)
+    .post("/v1/batch/scrape")
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .set("Content-Type", "application/json")
+    .send(body);
+}
+
+async function batchScrapeStatus(id: string) {
+  return await request(TEST_URL)
+    .get("/v1/batch/scrape/" + encodeURIComponent(id))
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .send();
+}
+
+async function batchScrape(body: BatchScrapeRequestInput): ReturnType<typeof batchScrapeStatus> {
+  const bss = await batchScrapeStart(body);
+  expectBatchScrapeStartToSucceed(bss);
+
+  let x;
+
+  do {
+    x = await batchScrapeStatus(bss.body.id);
+    expect(x.statusCode).toBe(200);
+    expect(typeof x.body.status).toBe("string");
+  } while (x.body.status !== "completed")
+
+  expectBatchScrapeToSucceed(x);
+  return x;
+}
+
+function expectBatchScrapeStartToSucceed(response: Awaited<ReturnType<typeof batchScrape>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.id).toBe("string");
+}
+
+function expectBatchScrapeToSucceed(response: Awaited<ReturnType<typeof batchScrapeStatus>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.status).toBe("string");
+  expect(response.body.status).toBe("completed");
+  expect(response.body).toHaveProperty("data");
+  expect(Array.isArray(response.body.data)).toBe(true);
+  expect(response.body.data.length).toBeGreaterThan(0);
+}
+
+describe("Batch scrape tests", () => {
+  describe("JSON format", () => {
+    it.concurrent("works", async () => {
+      const response = await batchScrape({
+        urls: ["http://firecrawl.dev"],
+        formats: ["json"],
+        jsonOptions: {
+          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
+          schema: {
+            type: "object",
+            properties: {
+              company_mission: {
+                type: "string",
+              },
+              supports_sso: {
+                type: "boolean",
+              },
+              is_open_source: {
+                type: "boolean",
+              },
+            },
+            required: ["company_mission", "supports_sso", "is_open_source"],
+          },
+        },
+      });
+
+      expect(response.body.data[0]).toHaveProperty("json");
+      expect(response.body.data[0].json).toHaveProperty("company_mission");
+      expect(typeof response.body.data[0].json.company_mission).toBe("string");
+      expect(response.body.data[0].json).toHaveProperty("supports_sso");
+      expect(response.body.data[0].json.supports_sso).toBe(false);
+      expect(typeof response.body.data[0].json.supports_sso).toBe("boolean");
+      expect(response.body.data[0].json).toHaveProperty("is_open_source");
+      expect(response.body.data[0].json.is_open_source).toBe(true);
+      expect(typeof response.body.data[0].json.is_open_source).toBe("boolean");
+    }, 30000);
+  });
+});
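One thing to note about the batchScrape helper above: it polls the status endpoint in a tight do/while loop with no pause between requests. A variant with a fixed delay (a sketch, not part of this commit; the 500 ms interval is an arbitrary choice) would look like:

    async function batchScrapePolled(body: BatchScrapeRequestInput) {
      const bss = await batchScrapeStart(body);
      expectBatchScrapeStartToSucceed(bss);

      let x;
      do {
        // Pause between polls to avoid hammering the status endpoint.
        await new Promise((resolve) => setTimeout(resolve, 500));
        x = await batchScrapeStatus(bss.body.id);
        expect(x.statusCode).toBe(200);
      } while (x.body.status !== "completed");

      expectBatchScrapeToSucceed(x);
      return x;
    }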
apps/api/src/__tests__/snips/scrape.test.ts

@@ -1,11 +1,11 @@
 import request from "supertest";
 import { configDotenv } from "dotenv";
-import { ScrapeRequestInput } from "../../controllers/v1/types";
+import { Document, ScrapeRequestInput } from "../../controllers/v1/types";
 
 configDotenv();
 const TEST_URL = "http://127.0.0.1:3002";
 
-async function scrape(body: ScrapeRequestInput) {
+async function scrapeRaw(body: ScrapeRequestInput) {
   return await request(TEST_URL)
     .post("/v1/scrape")
     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@@ -13,12 +13,18 @@ async function scrape(body: ScrapeRequestInput) {
     .send(body);
 }
 
-function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
+function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>) {
   expect(response.statusCode).toBe(200);
   expect(response.body.success).toBe(true);
   expect(typeof response.body.data).toBe("object");
 }
 
+async function scrape(body: ScrapeRequestInput): Promise<Document> {
+  const raw = await scrapeRaw(body);
+  expectScrapeToSucceed(raw);
+  return raw.body.data;
+}
+
 describe("Scrape tests", () => {
   it("mocking works properly", async () => {
     // depends on falsified mock mocking-works-properly
@@ -30,8 +36,7 @@ describe("Scrape tests", () => {
       useMock: "mocking-works-properly",
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toBe(
+    expect(response.markdown).toBe(
       "this is fake data coming from the mocking system!",
     );
   }, 10000);
@@ -42,8 +47,7 @@ describe("Scrape tests", () => {
       url: "https://canyoublockit.com/testing/",
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).not.toContain(".g.doubleclick.net/");
+    expect(response.markdown).not.toContain(".g.doubleclick.net/");
   }, 10000);
 
   it.concurrent("doesn't block ads if explicitly disabled", async () => {
@@ -52,8 +56,7 @@ describe("Scrape tests", () => {
       blockAds: false,
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toContain(".g.doubleclick.net/");
+    expect(response.markdown).toContain(".g.doubleclick.net/");
   }, 10000);
 });
 
@@ -62,8 +65,6 @@ describe("Scrape tests", () => {
     const response = await scrape({
      url: "https://iplocation.com",
     });
-
-    expectScrapeToSucceed(response);
   }, 10000);
 
   it.concurrent("works with country US", async () => {
@@ -72,8 +73,7 @@ describe("Scrape tests", () => {
       location: { country: "US" },
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toContain("| Country | United States |");
+    expect(response.markdown).toContain("| Country | United States |");
   }, 10000);
 });
 
@@ -84,8 +84,7 @@ describe("Scrape tests", () => {
       formats: ["rawHtml"],
     });
 
-    expectScrapeToSucceed(response);
-    const obj = JSON.parse(response.body.data.rawHtml);
+    const obj = JSON.parse(response.rawHtml!);
     expect(obj.id).toBe(1);
   }, 25000); // TODO: mock and shorten
 });
@@ -97,8 +96,7 @@ describe("Scrape tests", () => {
       formats: ["screenshot"]
     });
 
-    expectScrapeToSucceed(response);
-    expect(typeof response.body.data.screenshot).toBe("string");
+    expect(typeof response.screenshot).toBe("string");
   }, 15000);
 
   it.concurrent("screenshot@fullPage format works", async () => {
@@ -107,8 +105,44 @@ describe("Scrape tests", () => {
       formats: ["screenshot@fullPage"]
     });
 
-    expectScrapeToSucceed(response);
-    expect(typeof response.body.data.screenshot).toBe("string");
+    expect(typeof response.screenshot).toBe("string");
   }, 15000);
 })
+
+  describe("JSON format", () => {
+    it.concurrent("works", async () => {
+      const response = await scrape({
+        url: "http://firecrawl.dev",
+        formats: ["json"],
+        jsonOptions: {
+          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
+          schema: {
+            type: "object",
+            properties: {
+              company_mission: {
+                type: "string",
+              },
+              supports_sso: {
+                type: "boolean",
+              },
+              is_open_source: {
+                type: "boolean",
+              },
+            },
+            required: ["company_mission", "supports_sso", "is_open_source"],
+          },
+        },
+      });
+
+      expect(response).toHaveProperty("json");
+      expect(response.json).toHaveProperty("company_mission");
+      expect(typeof response.json.company_mission).toBe("string");
+      expect(response.json).toHaveProperty("supports_sso");
+      expect(response.json.supports_sso).toBe(false);
+      expect(typeof response.json.supports_sso).toBe("boolean");
+      expect(response.json).toHaveProperty("is_open_source");
+      expect(response.json.is_open_source).toBe(true);
+      expect(typeof response.json.is_open_source).toBe("boolean");
+    }, 30000);
+  });
 });
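The scrapeRaw/scrape split above is what lets the tests drop their per-test expectScrapeToSucceed calls: scrape asserts success internally and returns the Document, so assertions read off the document directly. Usage after the change (URL and expected content are illustrative only):

    const doc = await scrape({ url: "https://example.com" }); // fails via expect() on error
    expect(doc.markdown).toContain("Example Domain");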
apps/api/src/controllers/v1/types.ts

@@ -220,6 +220,54 @@ const baseScrapeOptions = z
   })
   .strict(strictMessage);
 
+const extractRefine = (obj) => {
+  const hasExtractFormat = obj.formats?.includes("extract");
+  const hasExtractOptions = obj.extract !== undefined;
+  const hasJsonFormat = obj.formats?.includes("json");
+  const hasJsonOptions = obj.jsonOptions !== undefined;
+  return (
+    (hasExtractFormat && hasExtractOptions) ||
+    (!hasExtractFormat && !hasExtractOptions) ||
+    (hasJsonFormat && hasJsonOptions) ||
+    (!hasJsonFormat && !hasJsonOptions)
+  );
+};
+const extractRefineOpts = {
+  message:
+    "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
+};
+const extractTransform = (obj) => {
+  // Handle timeout
+  if (
+    (obj.formats?.includes("extract") ||
+      obj.extract ||
+      obj.formats?.includes("json") ||
+      obj.jsonOptions) &&
+    !obj.timeout
+  ) {
+    obj = { ...obj, timeout: 60000 };
+  }
+
+  if (obj.formats?.includes("json")) {
+    obj.formats.push("extract");
+  }
+
+  // Convert JSON options to extract options if needed
+  if (obj.jsonOptions && !obj.extract) {
+    obj = {
+      ...obj,
+      extract: {
+        prompt: obj.jsonOptions.prompt,
+        systemPrompt: obj.jsonOptions.systemPrompt,
+        schema: obj.jsonOptions.schema,
+        mode: "llm",
+      },
+    };
+  }
+
+  return obj;
+};
+
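A quick sanity check of the helpers above (illustrative calls, not part of the commit). Note that, as written, extractRefine only rejects when a format/options pair is mismatched on both the extract side and the json side:

    extractRefine({ formats: ["json"], jsonOptions: { prompt: "p" } }); // true
    extractRefine({ formats: ["json", "extract"] });                    // false: both formats, no options
    // extractTransform pushes "extract" alongside "json", mirrors jsonOptions into the
    // legacy extract options, and defaults timeout to 60000 when none is set:
    extractTransform({ formats: ["json"], jsonOptions: { prompt: "p" } });
    // => { formats: ["json", "extract"], timeout: 60000,
    //      extract: { prompt: "p", systemPrompt: undefined, schema: undefined, mode: "llm" }, ... }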
 export const scrapeOptions = baseScrapeOptions.refine(
   (obj) => {
     if (!obj.actions) return true;
@@ -228,7 +276,8 @@ export const scrapeOptions = baseScrapeOptions.refine(
   {
     message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
   }
-);
+).refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type ScrapeOptions = z.infer<typeof baseScrapeOptions>;
 
@@ -280,7 +329,9 @@ export const extractV1Options = z
   .transform((obj) => ({
     ...obj,
     allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
-  }));
+  }))
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => extractTransform(x.scrapeOptions));
 
 export type ExtractV1Options = z.infer<typeof extractV1Options>;
 export const extractRequestSchema = extractV1Options;
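For schemas that wrap scrape options in a larger request object, the helpers are applied to the nested scrapeOptions, as in the extractV1Options chain above and the crawl/search schemas below. One zod subtlety worth keeping in mind: .transform replaces the parsed value with whatever the callback returns, so a wrapper-preserving form of this reuse pattern (a sketch with an illustrative schema name, not code from this diff) spreads the result back into the wrapper:

    const exampleRequestSchema = z
      .object({
        urls: z.string().array(),
        scrapeOptions: baseScrapeOptions.default({}),
      })
      .refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
      // Keep the wrapper object; only replace its scrapeOptions field.
      .transform((x) => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));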
@@ -294,55 +345,8 @@ export const scrapeRequestSchema = baseScrapeOptions
     timeout: z.number().int().positive().finite().safe().default(30000),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      const hasJsonFormat = obj.formats?.includes("json");
-      const hasJsonOptions = obj.jsonOptions !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions) ||
-        (hasJsonFormat && hasJsonOptions) ||
-        (!hasJsonFormat && !hasJsonOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
-    },
-  )
-  .transform((obj) => {
-    // Handle timeout
-    if (
-      (obj.formats?.includes("extract") ||
-        obj.extract ||
-        obj.formats?.includes("json") ||
-        obj.jsonOptions) &&
-      !obj.timeout
-    ) {
-      obj = { ...obj, timeout: 60000 };
-    }
-
-    if (obj.formats?.includes("json")) {
-      obj.formats.push("extract");
-    }
-
-    // Convert JSON options to extract options if needed
-    if (obj.jsonOptions && !obj.extract) {
-      obj = {
-        ...obj,
-        extract: {
-          prompt: obj.jsonOptions.prompt,
-          systemPrompt: obj.jsonOptions.systemPrompt,
-          schema: obj.jsonOptions.schema,
-          mode: "llm",
-        },
-      };
-    }
-
-    return obj;
-  });
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
 
@@ -374,20 +378,8 @@ export const batchScrapeRequestSchema = baseScrapeOptions
     ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
-    },
-  );
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
   .extend({
@@ -398,22 +390,11 @@ export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
     ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
-    },
-  );
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
 export type BatchScrapeRequestInput = z.input<typeof batchScrapeRequestSchema>;
 
 const crawlerOptions = z
   .object({
@@ -451,7 +432,9 @@ export const crawlRequestSchema = crawlerOptions
     webhook: webhookSchema.optional(),
     limit: z.number().default(10000),
   })
-  .strict(strictMessage);
+  .strict(strictMessage)
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => extractTransform(x.scrapeOptions));
 
 // export type CrawlRequest = {
 //   url: string;
@@ -935,7 +918,9 @@ export const searchRequestSchema = z
   })
   .strict(
     "Unrecognized key in body -- please review the v1 API documentation for request body changes",
-  );
+  )
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => extractTransform(x.scrapeOptions));
 
 export type SearchRequest = z.infer<typeof searchRequestSchema>;
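Taken together, scrape, batch scrape, crawl, extract, and search all funnel through the same extractRefine/extractTransform pair instead of five diverging inline copies. A quick REPL-style check of the end result for the scrape schema (illustrative; uses the exports above):

    const parsed = scrapeRequestSchema.parse({
      url: "http://firecrawl.dev",
      formats: ["json"],
      jsonOptions: { prompt: "What is the company's mission?" },
    });
    // parsed.formats       => ["json", "extract"]
    // parsed.extract?.mode => "llm"
    // parsed.timeout       => 30000 (the schema default applies before the transform
    //                         runs, so the 60000 fallback in extractTransform is skipped)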