mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 07:39:00 +08:00
fix(v1/types): fix extract -> json rename, ROUND II (FIR-1072) (#1199)
* Revert "Revert "fix(v1/types): fix extract -> json rename (FIR-1072) (#1195)""

  This reverts commit e28a44463ae49ffc195507204492cc7c15c438c4.

* fix(v1/types): fix bad transform
* feat(v1): proxy option / stealthProxy flag (FIR-1050) (#1196)
* feat(js-sdk): add proxy option
* fix
* fix extract tests
This commit is contained in:
parent
42050d3d6e
commit
fc64f436ed
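For API consumers, the practical effect of the rename is that v1 scrape-family requests accept a "json" format plus jsonOptions in place of the older "extract" format and extract options, and the schemas below alias the new names back onto the old ones internally. A minimal request sketch (the request shape mirrors the tests in this diff; the FIRECRAWL_URL and FIRECRAWL_KEY env var names are placeholders, not part of the commit):

    // Hypothetical client-side sketch of the renamed "json" format.
    // formats / jsonOptions / prompt / schema come from the tests below;
    // FIRECRAWL_URL and FIRECRAWL_KEY are placeholder env vars.
    const res = await fetch(`${process.env.FIRECRAWL_URL}/v1/scrape`, {
      method: "POST",
      headers: {
        "Authorization": `Bearer ${process.env.FIRECRAWL_KEY}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        url: "https://firecrawl.dev",
        formats: ["json"],
        jsonOptions: {
          prompt: "What is the company's mission?",
          schema: {
            type: "object",
            properties: { company_mission: { type: "string" } },
            required: ["company_mission"],
          },
        },
      }),
    });
    const { data } = await res.json();
    console.log(data.json?.company_mission); // structured output lands under data.json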
.github/workflows/ci.yml (vendored, 1 change)

@@ -27,6 +27,7 @@ env:
   HDX_NODE_BETA_MODE: 1
   FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
   USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
+  SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }}
   ENV: ${{ secrets.ENV }}
 
 jobs:
apps/api/src/__tests__/snips/batch-scrape.test.ts (new file, 92 lines)

@@ -0,0 +1,92 @@
+import request from "supertest";
+import { configDotenv } from "dotenv";
+import { BatchScrapeRequestInput } from "../../controllers/v1/types";
+
+configDotenv();
+const TEST_URL = "http://127.0.0.1:3002";
+
+async function batchScrapeStart(body: BatchScrapeRequestInput) {
+  return await request(TEST_URL)
+    .post("/v1/batch/scrape")
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .set("Content-Type", "application/json")
+    .send(body);
+}
+
+async function batchScrapeStatus(id: string) {
+  return await request(TEST_URL)
+    .get("/v1/batch/scrape/" + encodeURIComponent(id))
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .send();
+}
+
+async function batchScrape(body: BatchScrapeRequestInput): ReturnType<typeof batchScrapeStatus> {
+  const bss = await batchScrapeStart(body);
+  expectBatchScrapeStartToSucceed(bss);
+
+  let x;
+
+  do {
+    x = await batchScrapeStatus(bss.body.id);
+    expect(x.statusCode).toBe(200);
+    expect(typeof x.body.status).toBe("string");
+  } while (x.body.status !== "completed")
+
+  expectBatchScrapeToSucceed(x);
+  return x;
+}
+
+function expectBatchScrapeStartToSucceed(response: Awaited<ReturnType<typeof batchScrape>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.id).toBe("string");
+}
+
+function expectBatchScrapeToSucceed(response: Awaited<ReturnType<typeof batchScrapeStatus>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.status).toBe("string");
+  expect(response.body.status).toBe("completed");
+  expect(response.body).toHaveProperty("data");
+  expect(Array.isArray(response.body.data)).toBe(true);
+  expect(response.body.data.length).toBeGreaterThan(0);
+}
+
+describe("Batch scrape tests", () => {
+  describe("JSON format", () => {
+    it.concurrent("works", async () => {
+      const response = await batchScrape({
+        urls: ["http://firecrawl.dev"],
+        formats: ["json"],
+        jsonOptions: {
+          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
+          schema: {
+            type: "object",
+            properties: {
+              company_mission: {
+                type: "string",
+              },
+              supports_sso: {
+                type: "boolean",
+              },
+              is_open_source: {
+                type: "boolean",
+              },
+            },
+            required: ["company_mission", "supports_sso", "is_open_source"],
+          },
+        },
+      });
+
+      expect(response.body.data[0]).toHaveProperty("json");
+      expect(response.body.data[0].json).toHaveProperty("company_mission");
+      expect(typeof response.body.data[0].json.company_mission).toBe("string");
+      expect(response.body.data[0].json).toHaveProperty("supports_sso");
+      expect(response.body.data[0].json.supports_sso).toBe(false);
+      expect(typeof response.body.data[0].json.supports_sso).toBe("boolean");
+      expect(response.body.data[0].json).toHaveProperty("is_open_source");
+      expect(response.body.data[0].json.is_open_source).toBe(true);
+      expect(typeof response.body.data[0].json.is_open_source).toBe("boolean");
+    }, 30000);
+  });
+});
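One thing worth noting about the helper above: batchScrape polls the status endpoint in a tight do/while loop with no delay, which is acceptable against a local test server. A variant that waits between polls (a sketch, not part of this commit; sleep is a hypothetical local helper) could look like:

    // Hypothetical variant of the polling loop with a delay between
    // status checks; sleep() is a local helper, not part of the commit.
    const sleep = (ms: number) => new Promise<void>((r) => setTimeout(r, ms));

    async function batchScrapeWithBackoff(body: BatchScrapeRequestInput) {
      const bss = await batchScrapeStart(body);
      expectBatchScrapeStartToSucceed(bss);

      let x;
      do {
        await sleep(1000); // wait between polls instead of hammering the API
        x = await batchScrapeStatus(bss.body.id);
        expect(x.statusCode).toBe(200);
      } while (x.body.status !== "completed");

      expectBatchScrapeToSucceed(x);
      return x;
    }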
apps/api/src/__tests__/snips/crawl.test.ts (new file, 62 lines)

@@ -0,0 +1,62 @@
+import request from "supertest";
+import { configDotenv } from "dotenv";
+import { CrawlRequestInput } from "../../controllers/v1/types";
+
+configDotenv();
+const TEST_URL = "http://127.0.0.1:3002";
+
+async function crawlStart(body: CrawlRequestInput) {
+  return await request(TEST_URL)
+    .post("/v1/crawl")
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .set("Content-Type", "application/json")
+    .send(body);
+}
+
+async function crawlStatus(id: string) {
+  return await request(TEST_URL)
+    .get("/v1/crawl/" + encodeURIComponent(id))
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .send();
+}
+
+async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
+  const cs = await crawlStart(body);
+  expectCrawlStartToSucceed(cs);
+
+  let x;
+
+  do {
+    x = await crawlStatus(cs.body.id);
+    expect(x.statusCode).toBe(200);
+    expect(typeof x.body.status).toBe("string");
+  } while (x.body.status !== "completed")
+
+  expectCrawlToSucceed(x);
+  return x;
+}
+
+function expectCrawlStartToSucceed(response: Awaited<ReturnType<typeof crawlStart>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.id).toBe("string");
+}
+
+function expectCrawlToSucceed(response: Awaited<ReturnType<typeof crawlStatus>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.status).toBe("string");
+  expect(response.body.status).toBe("completed");
+  expect(response.body).toHaveProperty("data");
+  expect(Array.isArray(response.body.data)).toBe(true);
+  expect(response.body.data.length).toBeGreaterThan(0);
+}
+
+describe("Crawl tests", () => {
+  it.concurrent("works", async () => {
+    await crawl({
+      url: "https://firecrawl.dev",
+      limit: 10,
+    });
+  }, 120000);
+});
apps/api/src/__tests__/snips/extract.test.ts (new file, 81 lines)

@@ -0,0 +1,81 @@
+import request from "supertest";
+import { configDotenv } from "dotenv";
+import { ExtractRequestInput, ExtractResponse } from "../../controllers/v1/types";
+
+configDotenv();
+const TEST_URL = "http://127.0.0.1:3002";
+
+async function extractStart(body: ExtractRequestInput) {
+  return await request(TEST_URL)
+    .post("/v1/extract")
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .set("Content-Type", "application/json")
+    .send(body);
+}
+
+async function extractStatus(id: string) {
+  return await request(TEST_URL)
+    .get("/v1/extract/" + encodeURIComponent(id))
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .send();
+}
+
+async function extract(body: ExtractRequestInput): Promise<ExtractResponse> {
+  const es = await extractStart(body);
+  expectExtractStartToSucceed(es);
+
+  let x;
+
+  do {
+    x = await extractStatus(es.body.id);
+    expect(x.statusCode).toBe(200);
+    expect(typeof x.body.status).toBe("string");
+  } while (x.body.status !== "completed");
+
+  expectExtractToSucceed(x);
+  return x.body;
+}
+
+function expectExtractStartToSucceed(response: Awaited<ReturnType<typeof extractStart>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.id).toBe("string");
+}
+
+function expectExtractToSucceed(response: Awaited<ReturnType<typeof extractStatus>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.status).toBe("string");
+  expect(response.body.status).toBe("completed");
+  expect(response.body).toHaveProperty("data");
+}
+
+describe("Extract tests", () => {
+  it.concurrent("works", async () => {
+    const res = await extract({
+      urls: ["https://firecrawl.dev"],
+      schema: {
+        "type": "object",
+        "properties": {
+          "company_mission": {
+            "type": "string"
+          },
+          "is_open_source": {
+            "type": "boolean"
+          }
+        },
+        "required": [
+          "company_mission",
+          "is_open_source"
+        ]
+      },
+      origin: "api-sdk",
+    });
+
+    expect(res.data).toHaveProperty("company_mission");
+    expect(typeof res.data.company_mission).toBe("string")
+    expect(res.data).toHaveProperty("is_open_source");
+    expect(typeof res.data.is_open_source).toBe("boolean");
+    expect(res.data.is_open_source).toBe(true);
+  }, 60000);
+});
apps/api/src/__tests__/snips/scrape.test.ts

@@ -1,11 +1,11 @@
 import request from "supertest";
 import { configDotenv } from "dotenv";
-import { ScrapeRequestInput } from "../../controllers/v1/types";
+import { Document, ScrapeRequestInput } from "../../controllers/v1/types";
 
 configDotenv();
 const TEST_URL = "http://127.0.0.1:3002";
 
-async function scrape(body: ScrapeRequestInput) {
+async function scrapeRaw(body: ScrapeRequestInput) {
   return await request(TEST_URL)
     .post("/v1/scrape")
     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@@ -13,12 +13,18 @@ async function scrape(body: ScrapeRequestInput) {
     .send(body);
 }
 
-function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
+function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>) {
   expect(response.statusCode).toBe(200);
   expect(response.body.success).toBe(true);
   expect(typeof response.body.data).toBe("object");
 }
 
+async function scrape(body: ScrapeRequestInput): Promise<Document> {
+  const raw = await scrapeRaw(body);
+  expectScrapeToSucceed(raw);
+  return raw.body.data;
+}
+
 describe("Scrape tests", () => {
   it("mocking works properly", async () => {
     // depends on falsified mock mocking-works-properly
@@ -30,8 +36,7 @@ describe("Scrape tests", () => {
       useMock: "mocking-works-properly",
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toBe(
+    expect(response.markdown).toBe(
       "this is fake data coming from the mocking system!",
     );
   }, 10000);
@@ -42,8 +47,7 @@ describe("Scrape tests", () => {
       url: "https://canyoublockit.com/testing/",
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).not.toContain(".g.doubleclick.net/");
+    expect(response.markdown).not.toContain(".g.doubleclick.net/");
   }, 10000);
 
   it.concurrent("doesn't block ads if explicitly disabled", async () => {
@@ -52,8 +56,7 @@ describe("Scrape tests", () => {
       blockAds: false,
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toContain(".g.doubleclick.net/");
+    expect(response.markdown).toContain(".g.doubleclick.net/");
   }, 10000);
 });
 
@@ -62,8 +65,6 @@ describe("Scrape tests", () => {
     const response = await scrape({
       url: "https://iplocation.com",
     });
-
-    expectScrapeToSucceed(response);
   }, 10000);
 
   it.concurrent("works with country US", async () => {
@@ -72,8 +73,7 @@ describe("Scrape tests", () => {
       location: { country: "US" },
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toContain("| Country | United States |");
+    expect(response.markdown).toContain("| Country | United States |");
   }, 10000);
 });
 
@@ -84,8 +84,7 @@ describe("Scrape tests", () => {
       formats: ["rawHtml"],
     });
 
-    expectScrapeToSucceed(response);
-    const obj = JSON.parse(response.body.data.rawHtml);
+    const obj = JSON.parse(response.rawHtml!);
     expect(obj.id).toBe(1);
   }, 25000); // TODO: mock and shorten
 });
@@ -97,8 +96,7 @@ describe("Scrape tests", () => {
      formats: ["screenshot"]
    });
 
-    expectScrapeToSucceed(response);
-    expect(typeof response.body.data.screenshot).toBe("string");
+    expect(typeof response.screenshot).toBe("string");
   }, 15000);
 
   it.concurrent("screenshot@fullPage format works", async () => {
@@ -107,11 +105,47 @@ describe("Scrape tests", () => {
      formats: ["screenshot@fullPage"]
    });
 
-    expectScrapeToSucceed(response);
-    expect(typeof response.body.data.screenshot).toBe("string");
+    expect(typeof response.screenshot).toBe("string");
   }, 15000);
 });
 
+describe("JSON format", () => {
+  it.concurrent("works", async () => {
+    const response = await scrape({
+      url: "http://firecrawl.dev",
+      formats: ["json"],
+      jsonOptions: {
+        prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
+        schema: {
+          type: "object",
+          properties: {
+            company_mission: {
+              type: "string",
+            },
+            supports_sso: {
+              type: "boolean",
+            },
+            is_open_source: {
+              type: "boolean",
+            },
+          },
+          required: ["company_mission", "supports_sso", "is_open_source"],
+        },
+      },
+    });
+
+    expect(response).toHaveProperty("json");
+    expect(response.json).toHaveProperty("company_mission");
+    expect(typeof response.json.company_mission).toBe("string");
+    expect(response.json).toHaveProperty("supports_sso");
+    expect(response.json.supports_sso).toBe(false);
+    expect(typeof response.json.supports_sso).toBe("boolean");
+    expect(response.json).toHaveProperty("is_open_source");
+    expect(response.json.is_open_source).toBe(true);
+    expect(typeof response.json.is_open_source).toBe("boolean");
+  }, 30000);
+});
+
 describe("Proxy API (f-e dependant)", () => {
   it.concurrent("undefined works", async () => {
     await scrape({
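The Proxy API cases exercise the proxy option from the feat(v1) change folded into this PR. A hedged sketch of a request that opts into the stealth proxy (the `proxy` field name and the "stealth" value are inferred from the commit message's "proxy option / stealthProxy flag", not shown in the visible diff):

    // Hypothetical: "proxy" and its "stealth" value are inferred from the
    // commit message, not confirmed by the diff above.
    await scrape({
      url: "https://firecrawl.dev",
      proxy: "stealth",
    });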
36
apps/api/src/__tests__/snips/search.test.ts
Normal file
36
apps/api/src/__tests__/snips/search.test.ts
Normal file
@ -0,0 +1,36 @@
|
+import request from "supertest";
+import { configDotenv } from "dotenv";
+import { Document, SearchRequestInput } from "../../controllers/v1/types";
+
+configDotenv();
+const TEST_URL = "http://127.0.0.1:3002";
+
+async function searchRaw(body: SearchRequestInput) {
+  return await request(TEST_URL)
+    .post("/v1/search")
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .set("Content-Type", "application/json")
+    .send(body);
+}
+
+function expectScrapeToSucceed(response: Awaited<ReturnType<typeof searchRaw>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.data).toBe("object");
+  expect(Array.isArray(response.body.data)).toBe(true);
+  expect(response.body.data.length).toBeGreaterThan(0);
+}
+
+async function search(body: SearchRequestInput): Promise<Document> {
+  const raw = await searchRaw(body);
+  expectScrapeToSucceed(raw);
+  return raw.body.data;
+}
+
+describe("Scrape tests", () => {
+  it("works", async () => {
+    await search({
+      query: "firecrawl"
+    });
+  }, 15000);
+});
apps/api/src/controllers/v1/types.ts

@@ -221,6 +221,54 @@ const baseScrapeOptions = z
   })
   .strict(strictMessage);
 
+const extractRefine = (obj) => {
+  const hasExtractFormat = obj.formats?.includes("extract");
+  const hasExtractOptions = obj.extract !== undefined;
+  const hasJsonFormat = obj.formats?.includes("json");
+  const hasJsonOptions = obj.jsonOptions !== undefined;
+  return (
+    (hasExtractFormat && hasExtractOptions) ||
+    (!hasExtractFormat && !hasExtractOptions) ||
+    (hasJsonFormat && hasJsonOptions) ||
+    (!hasJsonFormat && !hasJsonOptions)
+  );
+};
+const extractRefineOpts = {
+  message:
+    "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
+};
+const extractTransform = (obj) => {
+  // Handle timeout
+  if (
+    (obj.formats?.includes("extract") ||
+      obj.extract ||
+      obj.formats?.includes("json") ||
+      obj.jsonOptions) &&
+    !obj.timeout
+  ) {
+    obj = { ...obj, timeout: 60000 };
+  }
+
+  if (obj.formats?.includes("json")) {
+    obj.formats.push("extract");
+  }
+
+  // Convert JSON options to extract options if needed
+  if (obj.jsonOptions && !obj.extract) {
+    obj = {
+      ...obj,
+      extract: {
+        prompt: obj.jsonOptions.prompt,
+        systemPrompt: obj.jsonOptions.systemPrompt,
+        schema: obj.jsonOptions.schema,
+        mode: "llm",
+      },
+    };
+  }
+
+  return obj;
+}
+
 export const scrapeOptions = baseScrapeOptions.refine(
   (obj) => {
     if (!obj.actions) return true;
@@ -229,7 +277,8 @@ export const scrapeOptions = baseScrapeOptions.refine(
   {
     message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
   }
-);
+).refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type ScrapeOptions = z.infer<typeof baseScrapeOptions>;
 
@@ -281,11 +330,14 @@ export const extractV1Options = z
   .transform((obj) => ({
     ...obj,
     allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
-  }));
+  }))
+  .refine(x => x.scrapeOptions ? extractRefine(x.scrapeOptions) : true, extractRefineOpts)
+  .transform(x => ({ ...x, scrapeOptions: x.scrapeOptions ? extractTransform(x.scrapeOptions) : x.scrapeOptions }));
 
 export type ExtractV1Options = z.infer<typeof extractV1Options>;
 export const extractRequestSchema = extractV1Options;
 export type ExtractRequest = z.infer<typeof extractRequestSchema>;
 export type ExtractRequestInput = z.input<typeof extractRequestSchema>;
 
 export const scrapeRequestSchema = baseScrapeOptions
   .omit({ timeout: true })
@@ -295,55 +347,8 @@ export const scrapeRequestSchema = baseScrapeOptions
     timeout: z.number().int().positive().finite().safe().default(30000),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      const hasJsonFormat = obj.formats?.includes("json");
-      const hasJsonOptions = obj.jsonOptions !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions) ||
-        (hasJsonFormat && hasJsonOptions) ||
-        (!hasJsonFormat && !hasJsonOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
-    },
-  )
-  .transform((obj) => {
-    // Handle timeout
-    if (
-      (obj.formats?.includes("extract") ||
-        obj.extract ||
-        obj.formats?.includes("json") ||
-        obj.jsonOptions) &&
-      !obj.timeout
-    ) {
-      obj = { ...obj, timeout: 60000 };
-    }
-
-    if (obj.formats?.includes("json")) {
-      obj.formats.push("extract");
-    }
-
-    // Convert JSON options to extract options if needed
-    if (obj.jsonOptions && !obj.extract) {
-      obj = {
-        ...obj,
-        extract: {
-          prompt: obj.jsonOptions.prompt,
-          systemPrompt: obj.jsonOptions.systemPrompt,
-          schema: obj.jsonOptions.schema,
-          mode: "llm",
-        },
-      };
-    }
-
-    return obj;
-  });
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
@@ -375,20 +380,8 @@ export const batchScrapeRequestSchema = baseScrapeOptions
     ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
-    },
-  );
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
   .extend({
@@ -399,22 +392,11 @@ export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
     ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
-    },
-  );
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
 export type BatchScrapeRequestInput = z.input<typeof batchScrapeRequestSchema>;
 
 const crawlerOptions = z
   .object({
@@ -452,7 +434,9 @@ export const crawlRequestSchema = crawlerOptions
     webhook: webhookSchema.optional(),
     limit: z.number().default(10000),
   })
-  .strict(strictMessage);
+  .strict(strictMessage)
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
 
 // export type CrawlRequest = {
 //   url: string;
@@ -467,6 +451,7 @@ export const crawlRequestSchema = crawlerOptions
 // }
 
 export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
+export type CrawlRequestInput = z.input<typeof crawlRequestSchema>;
 
 export const mapRequestSchema = crawlerOptions
   .extend({
@@ -936,9 +921,12 @@ export const searchRequestSchema = z
   })
   .strict(
     "Unrecognized key in body -- please review the v1 API documentation for request body changes",
-  );
+  )
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
 
 export type SearchRequest = z.infer<typeof searchRequestSchema>;
+export type SearchRequestInput = z.input<typeof searchRequestSchema>;
 
 export type SearchResponse =
   | ErrorResponse
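The net effect of extractRefine and extractTransform above is that a request using the new json format is validated and then normalized back into the legacy extract shape before it reaches the scraper. A standalone sketch of that normalization (an illustrative reimplementation for this page, not an import from the module; types simplified):

    // Illustrative reimplementation of the json -> extract normalization
    // performed by extractTransform above.
    type Opts = {
      formats?: string[];
      timeout?: number;
      extract?: { prompt?: string; systemPrompt?: string; schema?: unknown; mode: "llm" };
      jsonOptions?: { prompt?: string; systemPrompt?: string; schema?: unknown };
    };

    function normalize(obj: Opts): Opts {
      // json/extract requests get a longer default timeout (60s)
      if ((obj.formats?.includes("extract") || obj.extract ||
           obj.formats?.includes("json") || obj.jsonOptions) && !obj.timeout) {
        obj = { ...obj, timeout: 60000 };
      }
      // the "json" format is aliased onto "extract"
      if (obj.formats?.includes("json")) obj.formats.push("extract");
      // jsonOptions are copied into the legacy extract options
      if (obj.jsonOptions && !obj.extract) {
        obj = { ...obj, extract: { ...obj.jsonOptions, mode: "llm" } };
      }
      return obj;
    }

    console.log(normalize({ formats: ["json"], jsonOptions: { prompt: "mission?" } }));
    // -> formats: ["json", "extract"], timeout: 60000,
    //    extract: { prompt: "mission?", mode: "llm" }, jsonOptions unchanged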