Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl, synced 2025-08-12 14:19:00 +08:00
fix(v1/types): fix extract -> json rename, ROUND II (FIR-1072) (#1199)
* Revert "Revert "fix(v1/types): fix extract -> json rename (FIR-1072) (#1195)""
  This reverts commit e28a44463ae49ffc195507204492cc7c15c438c4.
* fix(v1/types): fix bad transform
* feat(v1): proxy option / stealthProxy flag (FIR-1050) (#1196)
* feat(v1): proxy option / stealthProxy flag
* feat(js-sdk): add proxy option
* fix
* fix extract tests
This commit is contained in: parent 42050d3d6e, commit fc64f436ed
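In short: the v1 API's `extract` format is renamed to `json` (with `jsonOptions`), and the schema layer now normalizes `json` requests back onto the legacy `extract` pipeline instead of mishandling them. A minimal sketch of the equivalence, with illustrative values only (the actual normalization is `extractTransform` in the types.ts diff below):

// Sketch: a new-style v1 scrape body using the "json" format (values are illustrative).
const body = {
  url: "https://firecrawl.dev",
  formats: ["json"],
  jsonOptions: {
    prompt: "What is the company's mission?",
    schema: { type: "object", properties: { company_mission: { type: "string" } } },
  },
};
// After parsing, extractTransform (below) leaves this equivalent to:
//   formats: ["json", "extract"]  ("extract" is pushed alongside "json")
//   extract: { prompt, systemPrompt, schema, mode: "llm" }  (copied from jsonOptions)
//   timeout: 60000  (defaulted when an LLM format is requested without a timeout)
// and the result comes back on the document's `json` field, as the new tests assert.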
.github/workflows/ci.yml (vendored, 1 addition)
@@ -27,6 +27,7 @@ env:
   HDX_NODE_BETA_MODE: 1
   FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
   USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
+  SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }}
   ENV: ${{ secrets.ENV }}
 
 jobs:
apps/api/src/__tests__/snips/batch-scrape.test.ts (new file, 92 lines)
@@ -0,0 +1,92 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import { BatchScrapeRequestInput } from "../../controllers/v1/types";

configDotenv();
const TEST_URL = "http://127.0.0.1:3002";

async function batchScrapeStart(body: BatchScrapeRequestInput) {
  return await request(TEST_URL)
    .post("/v1/batch/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(body);
}

async function batchScrapeStatus(id: string) {
  return await request(TEST_URL)
    .get("/v1/batch/scrape/" + encodeURIComponent(id))
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .send();
}

async function batchScrape(body: BatchScrapeRequestInput): ReturnType<typeof batchScrapeStatus> {
  const bss = await batchScrapeStart(body);
  expectBatchScrapeStartToSucceed(bss);

  let x;

  do {
    x = await batchScrapeStatus(bss.body.id);
    expect(x.statusCode).toBe(200);
    expect(typeof x.body.status).toBe("string");
  } while (x.body.status !== "completed");

  expectBatchScrapeToSucceed(x);
  return x;
}

function expectBatchScrapeStartToSucceed(response: Awaited<ReturnType<typeof batchScrape>>) {
  expect(response.statusCode).toBe(200);
  expect(response.body.success).toBe(true);
  expect(typeof response.body.id).toBe("string");
}

function expectBatchScrapeToSucceed(response: Awaited<ReturnType<typeof batchScrapeStatus>>) {
  expect(response.statusCode).toBe(200);
  expect(response.body.success).toBe(true);
  expect(typeof response.body.status).toBe("string");
  expect(response.body.status).toBe("completed");
  expect(response.body).toHaveProperty("data");
  expect(Array.isArray(response.body.data)).toBe(true);
  expect(response.body.data.length).toBeGreaterThan(0);
}

describe("Batch scrape tests", () => {
  describe("JSON format", () => {
    it.concurrent("works", async () => {
      const response = await batchScrape({
        urls: ["http://firecrawl.dev"],
        formats: ["json"],
        jsonOptions: {
          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
          schema: {
            type: "object",
            properties: {
              company_mission: {
                type: "string",
              },
              supports_sso: {
                type: "boolean",
              },
              is_open_source: {
                type: "boolean",
              },
            },
            required: ["company_mission", "supports_sso", "is_open_source"],
          },
        },
      });

      expect(response.body.data[0]).toHaveProperty("json");
      expect(response.body.data[0].json).toHaveProperty("company_mission");
      expect(typeof response.body.data[0].json.company_mission).toBe("string");
      expect(response.body.data[0].json).toHaveProperty("supports_sso");
      expect(response.body.data[0].json.supports_sso).toBe(false);
      expect(typeof response.body.data[0].json.supports_sso).toBe("boolean");
      expect(response.body.data[0].json).toHaveProperty("is_open_source");
      expect(response.body.data[0].json.is_open_source).toBe(true);
      expect(typeof response.body.data[0].json.is_open_source).toBe("boolean");
    }, 30000);
  });
});
apps/api/src/__tests__/snips/crawl.test.ts (new file, 62 lines)
@@ -0,0 +1,62 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import { CrawlRequestInput } from "../../controllers/v1/types";

configDotenv();
const TEST_URL = "http://127.0.0.1:3002";

async function crawlStart(body: CrawlRequestInput) {
  return await request(TEST_URL)
    .post("/v1/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(body);
}

async function crawlStatus(id: string) {
  return await request(TEST_URL)
    .get("/v1/crawl/" + encodeURIComponent(id))
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .send();
}

async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
  const cs = await crawlStart(body);
  expectCrawlStartToSucceed(cs);

  let x;

  do {
    x = await crawlStatus(cs.body.id);
    expect(x.statusCode).toBe(200);
    expect(typeof x.body.status).toBe("string");
  } while (x.body.status !== "completed");

  expectCrawlToSucceed(x);
  return x;
}

function expectCrawlStartToSucceed(response: Awaited<ReturnType<typeof crawlStart>>) {
  expect(response.statusCode).toBe(200);
  expect(response.body.success).toBe(true);
  expect(typeof response.body.id).toBe("string");
}

function expectCrawlToSucceed(response: Awaited<ReturnType<typeof crawlStatus>>) {
  expect(response.statusCode).toBe(200);
  expect(response.body.success).toBe(true);
  expect(typeof response.body.status).toBe("string");
  expect(response.body.status).toBe("completed");
  expect(response.body).toHaveProperty("data");
  expect(Array.isArray(response.body.data)).toBe(true);
  expect(response.body.data.length).toBeGreaterThan(0);
}

describe("Crawl tests", () => {
  it.concurrent("works", async () => {
    await crawl({
      url: "https://firecrawl.dev",
      limit: 10,
    });
  }, 120000);
});
apps/api/src/__tests__/snips/extract.test.ts (new file, 81 lines)
@@ -0,0 +1,81 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import { ExtractRequestInput, ExtractResponse } from "../../controllers/v1/types";

configDotenv();
const TEST_URL = "http://127.0.0.1:3002";

async function extractStart(body: ExtractRequestInput) {
  return await request(TEST_URL)
    .post("/v1/extract")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(body);
}

async function extractStatus(id: string) {
  return await request(TEST_URL)
    .get("/v1/extract/" + encodeURIComponent(id))
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .send();
}

async function extract(body: ExtractRequestInput): Promise<ExtractResponse> {
  const es = await extractStart(body);
  expectExtractStartToSucceed(es);

  let x;

  do {
    x = await extractStatus(es.body.id);
    expect(x.statusCode).toBe(200);
    expect(typeof x.body.status).toBe("string");
  } while (x.body.status !== "completed");

  expectExtractToSucceed(x);
  return x.body;
}

function expectExtractStartToSucceed(response: Awaited<ReturnType<typeof extractStart>>) {
  expect(response.statusCode).toBe(200);
  expect(response.body.success).toBe(true);
  expect(typeof response.body.id).toBe("string");
}

function expectExtractToSucceed(response: Awaited<ReturnType<typeof extractStatus>>) {
  expect(response.statusCode).toBe(200);
  expect(response.body.success).toBe(true);
  expect(typeof response.body.status).toBe("string");
  expect(response.body.status).toBe("completed");
  expect(response.body).toHaveProperty("data");
}

describe("Extract tests", () => {
  it.concurrent("works", async () => {
    const res = await extract({
      urls: ["https://firecrawl.dev"],
      schema: {
        "type": "object",
        "properties": {
          "company_mission": {
            "type": "string"
          },
          "is_open_source": {
            "type": "boolean"
          }
        },
        "required": [
          "company_mission",
          "is_open_source"
        ]
      },
      origin: "api-sdk",
    });

    expect(res.data).toHaveProperty("company_mission");
    expect(typeof res.data.company_mission).toBe("string");
    expect(res.data).toHaveProperty("is_open_source");
    expect(typeof res.data.is_open_source).toBe("boolean");
    expect(res.data.is_open_source).toBe(true);
  }, 60000);
});
apps/api/src/__tests__/snips/scrape.test.ts

@@ -1,11 +1,11 @@
 import request from "supertest";
 import { configDotenv } from "dotenv";
-import { ScrapeRequestInput } from "../../controllers/v1/types";
+import { Document, ScrapeRequestInput } from "../../controllers/v1/types";
 
 configDotenv();
 const TEST_URL = "http://127.0.0.1:3002";
 
-async function scrape(body: ScrapeRequestInput) {
+async function scrapeRaw(body: ScrapeRequestInput) {
   return await request(TEST_URL)
     .post("/v1/scrape")
     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@@ -13,12 +13,18 @@ async function scrape(body: ScrapeRequestInput) {
     .send(body);
 }
 
-function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
+function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>) {
   expect(response.statusCode).toBe(200);
   expect(response.body.success).toBe(true);
   expect(typeof response.body.data).toBe("object");
 }
 
+async function scrape(body: ScrapeRequestInput): Promise<Document> {
+  const raw = await scrapeRaw(body);
+  expectScrapeToSucceed(raw);
+  return raw.body.data;
+}
+
 describe("Scrape tests", () => {
   it("mocking works properly", async () => {
     // depends on falsified mock mocking-works-properly
@@ -30,8 +36,7 @@ describe("Scrape tests", () => {
       useMock: "mocking-works-properly",
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toBe(
+    expect(response.markdown).toBe(
       "this is fake data coming from the mocking system!",
     );
   }, 10000);
@@ -42,8 +47,7 @@ describe("Scrape tests", () => {
       url: "https://canyoublockit.com/testing/",
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).not.toContain(".g.doubleclick.net/");
+    expect(response.markdown).not.toContain(".g.doubleclick.net/");
   }, 10000);
 
   it.concurrent("doesn't block ads if explicitly disabled", async () => {
@@ -52,8 +56,7 @@ describe("Scrape tests", () => {
       blockAds: false,
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toContain(".g.doubleclick.net/");
+    expect(response.markdown).toContain(".g.doubleclick.net/");
   }, 10000);
 });
 
@@ -62,8 +65,6 @@ describe("Scrape tests", () => {
     const response = await scrape({
       url: "https://iplocation.com",
     });
-
-    expectScrapeToSucceed(response);
   }, 10000);
 
   it.concurrent("works with country US", async () => {
@@ -72,8 +73,7 @@ describe("Scrape tests", () => {
       location: { country: "US" },
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toContain("| Country | United States |");
+    expect(response.markdown).toContain("| Country | United States |");
   }, 10000);
 });
 
@@ -84,8 +84,7 @@ describe("Scrape tests", () => {
       formats: ["rawHtml"],
     });
 
-    expectScrapeToSucceed(response);
-    const obj = JSON.parse(response.body.data.rawHtml);
+    const obj = JSON.parse(response.rawHtml!);
     expect(obj.id).toBe(1);
   }, 25000); // TODO: mock and shorten
 });
@@ -97,8 +96,7 @@ describe("Scrape tests", () => {
       formats: ["screenshot"]
     });
 
-    expectScrapeToSucceed(response);
-    expect(typeof response.body.data.screenshot).toBe("string");
+    expect(typeof response.screenshot).toBe("string");
   }, 15000);
 
   it.concurrent("screenshot@fullPage format works", async () => {
@@ -107,11 +105,47 @@ describe("Scrape tests", () => {
       formats: ["screenshot@fullPage"]
     });
 
-    expectScrapeToSucceed(response);
-    expect(typeof response.body.data.screenshot).toBe("string");
+    expect(typeof response.screenshot).toBe("string");
   }, 15000);
 });
 
+describe("JSON format", () => {
+  it.concurrent("works", async () => {
+    const response = await scrape({
+      url: "http://firecrawl.dev",
+      formats: ["json"],
+      jsonOptions: {
+        prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
+        schema: {
+          type: "object",
+          properties: {
+            company_mission: {
+              type: "string",
+            },
+            supports_sso: {
+              type: "boolean",
+            },
+            is_open_source: {
+              type: "boolean",
+            },
+          },
+          required: ["company_mission", "supports_sso", "is_open_source"],
+        },
+      },
+    });
+
+    expect(response).toHaveProperty("json");
+    expect(response.json).toHaveProperty("company_mission");
+    expect(typeof response.json.company_mission).toBe("string");
+    expect(response.json).toHaveProperty("supports_sso");
+    expect(response.json.supports_sso).toBe(false);
+    expect(typeof response.json.supports_sso).toBe("boolean");
+    expect(response.json).toHaveProperty("is_open_source");
+    expect(response.json.is_open_source).toBe(true);
+    expect(typeof response.json.is_open_source).toBe("boolean");
+  }, 30000);
+});
+
 describe("Proxy API (f-e dependant)", () => {
   it.concurrent("undefined works", async () => {
     await scrape({
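Note the shape change in this file: `scrape` now resolves to the `Document` itself (`raw.body.data`), so tests assert on `response.markdown` rather than `response.body.data.markdown`. A minimal usage sketch with a hypothetical URL and assertion:

// Sketch: the reworked helper returns the Document directly.
const doc = await scrape({ url: "https://example.com" });
expect(typeof doc.markdown).toBe("string");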
apps/api/src/__tests__/snips/search.test.ts (new file, 36 lines)
@@ -0,0 +1,36 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import { Document, SearchRequestInput } from "../../controllers/v1/types";

configDotenv();
const TEST_URL = "http://127.0.0.1:3002";

async function searchRaw(body: SearchRequestInput) {
  return await request(TEST_URL)
    .post("/v1/search")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(body);
}

function expectScrapeToSucceed(response: Awaited<ReturnType<typeof searchRaw>>) {
  expect(response.statusCode).toBe(200);
  expect(response.body.success).toBe(true);
  expect(typeof response.body.data).toBe("object");
  expect(Array.isArray(response.body.data)).toBe(true);
  expect(response.body.data.length).toBeGreaterThan(0);
}

async function search(body: SearchRequestInput): Promise<Document> {
  const raw = await searchRaw(body);
  expectScrapeToSucceed(raw);
  return raw.body.data;
}

describe("Scrape tests", () => {
  it("works", async () => {
    await search({
      query: "firecrawl"
    });
  }, 15000);
});
apps/api/src/controllers/v1/types.ts

@@ -221,6 +221,54 @@ const baseScrapeOptions = z
   })
   .strict(strictMessage);
 
+const extractRefine = (obj) => {
+  const hasExtractFormat = obj.formats?.includes("extract");
+  const hasExtractOptions = obj.extract !== undefined;
+  const hasJsonFormat = obj.formats?.includes("json");
+  const hasJsonOptions = obj.jsonOptions !== undefined;
+  return (
+    (hasExtractFormat && hasExtractOptions) ||
+    (!hasExtractFormat && !hasExtractOptions) ||
+    (hasJsonFormat && hasJsonOptions) ||
+    (!hasJsonFormat && !hasJsonOptions)
+  );
+};
+
+const extractRefineOpts = {
+  message:
+    "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
+};
+
+const extractTransform = (obj) => {
+  // Handle timeout
+  if (
+    (obj.formats?.includes("extract") ||
+      obj.extract ||
+      obj.formats?.includes("json") ||
+      obj.jsonOptions) &&
+    !obj.timeout
+  ) {
+    obj = { ...obj, timeout: 60000 };
+  }
+
+  if (obj.formats?.includes("json")) {
+    obj.formats.push("extract");
+  }
+
+  // Convert JSON options to extract options if needed
+  if (obj.jsonOptions && !obj.extract) {
+    obj = {
+      ...obj,
+      extract: {
+        prompt: obj.jsonOptions.prompt,
+        systemPrompt: obj.jsonOptions.systemPrompt,
+        schema: obj.jsonOptions.schema,
+        mode: "llm",
+      },
+    };
+  }
+
+  return obj;
+}
+
 export const scrapeOptions = baseScrapeOptions.refine(
   (obj) => {
     if (!obj.actions) return true;
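To make the new helpers concrete, here is a small sketch of `extractRefine` and `extractTransform` applied to a `json`-style options object (values are illustrative):

// Sketch: what the shared helpers do to a "json"-style options object.
const opts = {
  formats: ["json"],
  jsonOptions: { prompt: "Find the mission.", schema: { type: "object" } },
};
extractRefine(opts); // true: the "json" format and jsonOptions are both present
const out = extractTransform(opts);
// out.timeout === 60000            (defaulted because an LLM format is requested)
// out.formats: ["json", "extract"] ("extract" pushed so the legacy pipeline runs)
// out.extract: { prompt: "Find the mission.", systemPrompt: undefined,
//                schema: { type: "object" }, mode: "llm" }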
@@ -229,7 +277,8 @@ export const scrapeOptions = baseScrapeOptions.refine(
   {
     message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
   }
-);
+).refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type ScrapeOptions = z.infer<typeof baseScrapeOptions>;
 
@@ -281,11 +330,14 @@ export const extractV1Options = z
   .transform((obj) => ({
     ...obj,
     allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
-  }));
+  }))
+  .refine(x => x.scrapeOptions ? extractRefine(x.scrapeOptions) : true, extractRefineOpts)
+  .transform(x => ({ ...x, scrapeOptions: x.scrapeOptions ? extractTransform(x.scrapeOptions) : x.scrapeOptions }));
 
 export type ExtractV1Options = z.infer<typeof extractV1Options>;
 export const extractRequestSchema = extractV1Options;
 export type ExtractRequest = z.infer<typeof extractRequestSchema>;
+export type ExtractRequestInput = z.input<typeof extractRequestSchema>;
 
 export const scrapeRequestSchema = baseScrapeOptions
   .omit({ timeout: true })
@@ -295,55 +347,8 @@ export const scrapeRequestSchema = baseScrapeOptions
     timeout: z.number().int().positive().finite().safe().default(30000),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      const hasJsonFormat = obj.formats?.includes("json");
-      const hasJsonOptions = obj.jsonOptions !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions) ||
-        (hasJsonFormat && hasJsonOptions) ||
-        (!hasJsonFormat && !hasJsonOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
-    },
-  )
-  .transform((obj) => {
-    // Handle timeout
-    if (
-      (obj.formats?.includes("extract") ||
-        obj.extract ||
-        obj.formats?.includes("json") ||
-        obj.jsonOptions) &&
-      !obj.timeout
-    ) {
-      obj = { ...obj, timeout: 60000 };
-    }
-
-    if (obj.formats?.includes("json")) {
-      obj.formats.push("extract");
-    }
-
-    // Convert JSON options to extract options if needed
-    if (obj.jsonOptions && !obj.extract) {
-      obj = {
-        ...obj,
-        extract: {
-          prompt: obj.jsonOptions.prompt,
-          systemPrompt: obj.jsonOptions.systemPrompt,
-          schema: obj.jsonOptions.schema,
-          mode: "llm",
-        },
-      };
-    }
-
-    return obj;
-  });
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
@@ -375,20 +380,8 @@ export const batchScrapeRequestSchema = baseScrapeOptions
     ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
-    },
-  );
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
   .extend({
@@ -399,22 +392,11 @@ export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
     ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
-    },
-  );
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
+export type BatchScrapeRequestInput = z.input<typeof batchScrapeRequestSchema>;
 
 const crawlerOptions = z
   .object({
@@ -452,7 +434,9 @@ export const crawlRequestSchema = crawlerOptions
     webhook: webhookSchema.optional(),
     limit: z.number().default(10000),
   })
-  .strict(strictMessage);
+  .strict(strictMessage)
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
 
 // export type CrawlRequest = {
 //   url: string;
@@ -467,6 +451,7 @@
 // }
 
 export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
+export type CrawlRequestInput = z.input<typeof crawlRequestSchema>;
 
 export const mapRequestSchema = crawlerOptions
   .extend({
@@ -936,9 +921,12 @@ export const searchRequestSchema = z
   })
   .strict(
     "Unrecognized key in body -- please review the v1 API documentation for request body changes",
-  );
+  )
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => ({ ...x, scrapeOptions: extractTransform(x.scrapeOptions) }));
 
 export type SearchRequest = z.infer<typeof searchRequestSchema>;
+export type SearchRequestInput = z.input<typeof searchRequestSchema>;
 
 export type SearchResponse =
   | ErrorResponse
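A side note on the new `*RequestInput` aliases: once a zod schema has a `.transform(...)`, `z.infer` gives the post-parse output type, while `z.input` gives the shape callers may actually send; the new snips tests type their request bodies with the latter. A standalone sketch of the distinction (assumed example schema, not from this commit):

import { z } from "zod";

// Sketch: a schema whose transform changes the parsed shape.
const schema = z
  .object({ formats: z.array(z.string()).default(["markdown"]) })
  .transform((o) => ({ ...o, timeout: 60000 }));

type In = z.input<typeof schema>;  // { formats?: string[] | undefined } -- what callers send
type Out = z.infer<typeof schema>; // { formats: string[]; timeout: number } -- after parsing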