mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-12 10:39:02 +08:00)

commit 23a033fe61 (parent 37ae9a9043)
Nick: fixes and more e2e tests
@@ -112,6 +112,7 @@ describe("E2E Tests for v1 API Routes", () => {
         .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
         .set("Content-Type", "application/json")
         .send(scrapeRequest);
+
       expect(response.statusCode).toBe(200);
       expect(response.body).toHaveProperty("data");
       if (!("data" in response.body)) {
@@ -127,5 +128,95 @@ describe("E2E Tests for v1 API Routes", () => {
       },
       30000
     );
+    it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
+      const scrapeRequest: ScrapeRequest = {
+        url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
+        // formats: ["markdown", "html"],
+      };
+      const response: ScrapeResponseRequestTest = await request(TEST_URL)
+        .post('/v1/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send(scrapeRequest);
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      if (!("data" in response.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
+      expect(response.body.data.metadata.statusCode).toBe(200);
+      expect(response.body.data.metadata.error).toBeUndefined();
+    }, 60000);
+
+    it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
+      const scrapeRequest: ScrapeRequest = {
+        url: "https://arxiv.org/pdf/astro-ph/9301001"
+      };
+      const response: ScrapeResponseRequestTest = await request(TEST_URL)
+        .post('/v1/scrape')
+        .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
+        .set('Content-Type', 'application/json')
+        .send(scrapeRequest);
+      await new Promise((r) => setTimeout(r, 6000));
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty('data');
+      if (!("data" in response.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+      expect(response.body.data).toHaveProperty('markdown');
+      expect(response.body.data).toHaveProperty('metadata');
+      expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
+      expect(response.body.data.metadata.statusCode).toBe(200);
+      expect(response.body.data.metadata.error).toBeUndefined();
+    }, 60000);
+
+    it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
+      const scrapeRequest: ScrapeRequest = {
+        url: "https://www.scrapethissite.com/",
+        onlyMainContent: false // default is true
+      };
+      const responseWithoutRemoveTags: ScrapeResponseRequestTest = await request(TEST_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(scrapeRequest);
+      expect(responseWithoutRemoveTags.statusCode).toBe(200);
+      expect(responseWithoutRemoveTags.body).toHaveProperty("data");
+
+      if (!("data" in responseWithoutRemoveTags.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
+      expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
+      expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
+      expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer
+
+      const scrapeRequestWithRemoveTags: ScrapeRequest = {
+        url: "https://www.scrapethissite.com/",
+        excludeTags: ['.nav', '#footer', 'strong'],
+        onlyMainContent: false // default is true
+      };
+      const response: ScrapeResponseRequestTest = await request(TEST_URL)
+        .post("/v1/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(scrapeRequestWithRemoveTags);
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      if (!("data" in response.body)) {
+        throw new Error("Expected response body to have 'data' property");
+      }
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
+      expect(response.body.data.markdown).not.toContain("Hartley Brody 2023");
+      expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); // .nav
+    }, 30000);
   });
 });
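Note: all three new tests repeat the same supertest chain (POST /v1/scrape with auth and JSON headers) before their assertions. A minimal sketch of a shared helper that could absorb that boilerplate; `postScrape` is a hypothetical name, not part of this commit, and it assumes the same `TEST_URL` and `TEST_API_KEY` the suite already uses:

import request from "supertest";

// Hypothetical helper, not in this commit: wraps the repeated request chain.
async function postScrape(testUrl: string, body: Record<string, unknown>) {
  return request(testUrl)
    .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(body);
}

Each test would then reduce to `const response = await postScrape(TEST_URL, scrapeRequest);` followed by its assertions (and, for the PDF cases, the 6-second settle wait).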
@@ -28,6 +28,7 @@ export const scrapeOptions = z.object({
   onlyMainContent: z.boolean().default(true),
   timeout: z.number().int().positive().finite().safe().default(30000), // default?
   waitFor: z.number().int().nonnegative().finite().safe().default(0),
+  parsePDF: z.boolean().default(true),
 });

 export type ScrapeOptions = z.infer<typeof scrapeOptions>;
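The new `parsePDF` field defaults to true, so existing v1 requests keep PDF parsing on unless they opt out explicitly. A minimal sketch of the zod behavior, reduced to just the field added in this hunk:

import { z } from "zod";

// Reduced schema: only the field this hunk adds.
const opts = z.object({ parsePDF: z.boolean().default(true) });

opts.parse({});                  // => { parsePDF: true }  (default applied)
opts.parse({ parsePDF: false }); // => { parsePDF: false } (explicit opt-out)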
@@ -207,5 +208,6 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
     includeLinks: x.formats.includes("links"),
     screenshot: x.formats.includes("screenshot"),
     fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
+    parsePDF: x.parsePDF
   };
 }
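With `legacyScrapeOptions` threading the flag into the legacy `PageOptions`, a v1 request body can control it directly. An illustrative request body; the exact effect of `parsePDF: false` (presumably skipping PDF-to-text extraction downstream) is not shown in this diff:

// Illustrative v1 request body, not taken from the tests above.
const scrapeRequest = {
  url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
  parsePDF: false, // assumption: opts out of PDF parsing for this scrape
};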
@@ -346,6 +346,7 @@ export class WebScraperDataProvider {
       });
       return {
         content: content,
+        markdown: content,
         metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
         provider: "web-scraper",
       };
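This is the fix the new PDF tests lean on: the PDF path previously populated only `content`, while the v1 response surfaces `data.markdown`, so assertions like `expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy')` would have hit an empty field. A minimal sketch of the shape change; field names come from the hunk, the surrounding type is an assumption:

// Sketch only: `PdfDoc` is illustrative, not the repo's actual Document type.
type PdfDoc = { content: string; markdown?: string; provider: string };

function toPdfDoc(pdfText: string): PdfDoc {
  return {
    content: pdfText,
    markdown: pdfText, // new in this commit: v1 reads `markdown`
    provider: "web-scraper",
  };
}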