diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 5693d414..82ed5bfe 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -1,6 +1,10 @@
import request from "supertest";
import dotenv from "dotenv";
-import { FirecrawlCrawlResponse, FirecrawlCrawlStatusResponse, FirecrawlScrapeResponse } from "../../types";
+import {
+ FirecrawlCrawlResponse,
+ FirecrawlCrawlStatusResponse,
+ FirecrawlScrapeResponse,
+} from "../../types";
dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";
@@ -24,273 +28,365 @@ describe("E2E Tests for v0 API Routes", () => {
describe("POST /v0/scrape", () => {
it.concurrent("should require authorization", async () => {
- const response: FirecrawlScrapeResponse = await request(TEST_URL).post("/v0/scrape");
+ const response: FirecrawlScrapeResponse = await request(TEST_URL).post(
+ "/v0/scrape"
+ );
expect(response.statusCode).toBe(401);
});
- it.concurrent("should return an error response with an invalid API key", async () => {
- const response: FirecrawlScrapeResponse = await request(TEST_URL)
- .post("/v0/scrape")
- .set("Authorization", `Bearer invalid-api-key`)
- .set("Content-Type", "application/json")
- .send({ url: "https://firecrawl.dev" });
- expect(response.statusCode).toBe(401);
- });
+ it.concurrent(
+ "should return an error response with an invalid API key",
+ async () => {
+ const response: FirecrawlScrapeResponse = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer invalid-api-key`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://firecrawl.dev" });
+ expect(response.statusCode).toBe(401);
+ }
+ );
- it.concurrent("should return a successful response with a valid API key", async () => {
- const response: FirecrawlScrapeResponse = await request(TEST_URL)
- .post("/v0/scrape")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send({ url: "https://roastmywebsite.ai" });
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty("data");
- expect(response.body.data).toHaveProperty("content");
- expect(response.body.data).toHaveProperty("markdown");
- expect(response.body.data).toHaveProperty("metadata");
- expect(response.body.data).not.toHaveProperty("html");
- expect(response.body.data.content).toContain("_Roast_");
- expect(response.body.data.metadata.pageError).toBeUndefined();
- expect(response.body.data.metadata.title).toBe("Roast My Website");
- expect(response.body.data.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
- expect(response.body.data.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
- expect(response.body.data.metadata.robots).toBe("follow, index");
- expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
- expect(response.body.data.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
- expect(response.body.data.metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
- expect(response.body.data.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
- expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
- expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
- expect(response.body.data.metadata.sourceURL).toBe("https://roastmywebsite.ai");
- expect(response.body.data.metadata.pageStatusCode).toBe(200);
- }, 30000); // 30 seconds timeout
+ it.concurrent(
+ "should return a successful response with a valid API key",
+ async () => {
+ const response: FirecrawlScrapeResponse = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://roastmywebsite.ai" });
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ expect(response.body.data).toHaveProperty("content");
+ expect(response.body.data).toHaveProperty("markdown");
+ expect(response.body.data).toHaveProperty("metadata");
+ expect(response.body.data).not.toHaveProperty("html");
+ expect(response.body.data.content).toContain("_Roast_");
+ expect(response.body.data.metadata.pageError).toBeUndefined();
+ expect(response.body.data.metadata.title).toBe("Roast My Website");
+ expect(response.body.data.metadata.description).toBe(
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
+ );
+ expect(response.body.data.metadata.keywords).toBe(
+ "Roast My Website,Roast,Website,GitHub,Firecrawl"
+ );
+ expect(response.body.data.metadata.robots).toBe("follow, index");
+ expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
+ expect(response.body.data.metadata.ogDescription).toBe(
+ "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
+ );
+ expect(response.body.data.metadata.ogUrl).toBe(
+ "https://www.roastmywebsite.ai"
+ );
+ expect(response.body.data.metadata.ogImage).toBe(
+ "https://www.roastmywebsite.ai/og.png"
+ );
+ expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
+ expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
+ expect(response.body.data.metadata.sourceURL).toBe(
+ "https://roastmywebsite.ai"
+ );
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
+ },
+ 30000
+ ); // 30 seconds timeout
+ it.concurrent(
+ "should return a successful response with a valid API key and includeHtml set to true",
+ async () => {
+ const response: FirecrawlScrapeResponse = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({
+ url: "https://roastmywebsite.ai",
+ pageOptions: { includeHtml: true },
+ });
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ expect(response.body.data).toHaveProperty("content");
+ expect(response.body.data).toHaveProperty("markdown");
+ expect(response.body.data).toHaveProperty("html");
+ expect(response.body.data).toHaveProperty("metadata");
+ expect(response.body.data.content).toContain("_Roast_");
+ expect(response.body.data.markdown).toContain("_Roast_");
+ expect(response.body.data.html).toContain("<h1");
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
+ expect(response.body.data.metadata.pageError).toBeUndefined();
+ },
+ 30000
+ ); // 30 seconds timeout
- it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => {
- const response: FirecrawlScrapeResponse = await request(TEST_URL)
- .post("/v0/scrape")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send({
- url: "https://roastmywebsite.ai",
- pageOptions: { includeHtml: true },
- });
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty("data");
- expect(response.body.data).toHaveProperty("content");
- expect(response.body.data).toHaveProperty("markdown");
- expect(response.body.data).toHaveProperty("html");
- expect(response.body.data).toHaveProperty("metadata");
- expect(response.body.data.content).toContain("_Roast_");
- expect(response.body.data.markdown).toContain("_Roast_");
- expect(response.body.data.html).toContain("<h1");
- expect(response.body.data.metadata.pageStatusCode).toBe(200);
- expect(response.body.data.metadata.pageError).toBeUndefined();
- }, 30000); // 30 seconds timeout
-
- it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
- const response: FirecrawlScrapeResponse = await request(TEST_URL)
- .post('/v0/scrape')
- .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
- .set('Content-Type', 'application/json')
- .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' });
- await new Promise((r) => setTimeout(r, 6000));
+ it.concurrent(
+ "should return a successful response for a valid scrape with PDF file",
+ async () => {
+ const response: FirecrawlScrapeResponse = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" });
+ await new Promise((r) => setTimeout(r, 6000));
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty('data');
- expect(response.body.data).toHaveProperty('content');
- expect(response.body.data).toHaveProperty('metadata');
- expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
- expect(response.body.data.metadata.pageStatusCode).toBe(200);
- expect(response.body.data.metadata.pageError).toBeUndefined();
- }, 60000); // 60 seconds
-
- it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
- const response: FirecrawlScrapeResponse = await request(TEST_URL)
- .post('/v0/scrape')
- .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
- .set('Content-Type', 'application/json')
- .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' });
- await new Promise((r) => setTimeout(r, 6000));
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ expect(response.body.data).toHaveProperty("content");
+ expect(response.body.data).toHaveProperty("metadata");
+ expect(response.body.data.content).toContain(
+ "We present spectrophotometric observations of the Broad Line Radio Galaxy"
+ );
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
+ expect(response.body.data.metadata.pageError).toBeUndefined();
+ },
+ 60000
+ ); // 60 seconds
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty('data');
- expect(response.body.data).toHaveProperty('content');
- expect(response.body.data).toHaveProperty('metadata');
- expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
- expect(response.body.data.metadata.pageStatusCode).toBe(200);
- expect(response.body.data.metadata.pageError).toBeUndefined();
- }, 60000); // 60 seconds
+ it.concurrent(
+ "should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
+ async () => {
+ const response: FirecrawlScrapeResponse = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://arxiv.org/pdf/astro-ph/9301001" });
+ await new Promise((r) => setTimeout(r, 6000));
- it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
- const responseWithoutRemoveTags: FirecrawlScrapeResponse = await request(TEST_URL)
- .post("/v0/scrape")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send({ url: "https://www.scrapethissite.com/" });
- expect(responseWithoutRemoveTags.statusCode).toBe(200);
- expect(responseWithoutRemoveTags.body).toHaveProperty("data");
- expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
- expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
- expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
- expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
- expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
- expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
- expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
- expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ expect(response.body.data).toHaveProperty("content");
+ expect(response.body.data).toHaveProperty("metadata");
+ expect(response.body.data.content).toContain(
+ "We present spectrophotometric observations of the Broad Line Radio Galaxy"
+ );
+ expect(response.body.data.metadata.pageStatusCode).toBe(200);
+ expect(response.body.data.metadata.pageError).toBeUndefined();
+ },
+ 60000
+ ); // 60 seconds
- const response: FirecrawlScrapeResponse = await request(TEST_URL)
- .post("/v0/scrape")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty("data");
- expect(response.body.data).toHaveProperty("content");
- expect(response.body.data).toHaveProperty("markdown");
- expect(response.body.data).toHaveProperty("metadata");
- expect(response.body.data).not.toHaveProperty("html");
- expect(response.body.data.content).toContain("Scrape This Site");
- expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
- expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
- expect(response.body.data.content).not.toContain("web scraping"); // strong
- }, 30000); // 30 seconds timeout
+ it.concurrent(
+ "should return a successful response with a valid API key with removeTags option",
+ async () => {
+ const responseWithoutRemoveTags: FirecrawlScrapeResponse =
+ await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://www.scrapethissite.com/" });
+ expect(responseWithoutRemoveTags.statusCode).toBe(200);
+ expect(responseWithoutRemoveTags.body).toHaveProperty("data");
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
+ expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
+ expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
+ "Scrape This Site"
+ );
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
+ "Lessons and Videos"
+ ); // #footer
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
+ "[Sandbox]("
+ ); // .nav
+ expect(responseWithoutRemoveTags.body.data.content).toContain(
+ "web scraping"
+ ); // strong
- it.concurrent('should return a successful response for a scrape with 400 page', async () => {
- const response: FirecrawlScrapeResponse = await request(TEST_URL)
- .post('/v0/scrape')
- .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
- .set('Content-Type', 'application/json')
- .send({ url: 'https://httpstat.us/400' });
- await new Promise((r) => setTimeout(r, 5000));
+ const response: FirecrawlScrapeResponse = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({
+ url: "https://www.scrapethissite.com/",
+ pageOptions: { removeTags: [".nav", "#footer", "strong"] },
+ });
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ expect(response.body.data).toHaveProperty("content");
+ expect(response.body.data).toHaveProperty("markdown");
+ expect(response.body.data).toHaveProperty("metadata");
+ expect(response.body.data).not.toHaveProperty("html");
+ expect(response.body.data.content).toContain("Scrape This Site");
+ expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
+ expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
+ expect(response.body.data.content).not.toContain("web scraping"); // strong
+ },
+ 30000
+ ); // 30 seconds timeout
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty('data');
- expect(response.body.data).toHaveProperty('content');
- expect(response.body.data).toHaveProperty('metadata');
- expect(response.body.data.metadata.pageStatusCode).toBe(400);
- expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request");
- }, 60000); // 60 seconds
+ it.concurrent(
+ "should return a successful response for a scrape with 400 page",
+ async () => {
+ const response: FirecrawlScrapeResponse = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://httpstat.us/400" });
+ await new Promise((r) => setTimeout(r, 5000));
- it.concurrent('should return a successful response for a scrape with 401 page', async () => {
- const response: FirecrawlScrapeResponse = await request(TEST_URL)
- .post('/v0/scrape')
- .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
- .set('Content-Type', 'application/json')
- .send({ url: 'https://httpstat.us/401' });
- await new Promise((r) => setTimeout(r, 5000));
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ expect(response.body.data).toHaveProperty("content");
+ expect(response.body.data).toHaveProperty("metadata");
+ expect(response.body.data.metadata.pageStatusCode).toBe(400);
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
+ "bad request"
+ );
+ },
+ 60000
+ ); // 60 seconds
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty('data');
- expect(response.body.data).toHaveProperty('content');
- expect(response.body.data).toHaveProperty('metadata');
- expect(response.body.data.metadata.pageStatusCode).toBe(401);
- expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized");
- }, 60000); // 60 seconds
+ it.concurrent(
+ "should return a successful response for a scrape with 401 page",
+ async () => {
+ const response: FirecrawlScrapeResponse = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://httpstat.us/401" });
+ await new Promise((r) => setTimeout(r, 5000));
- it.concurrent("should return a successful response for a scrape with 403 page", async () => {
- const response: FirecrawlScrapeResponse = await request(TEST_URL)
- .post('/v0/scrape')
- .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
- .set('Content-Type', 'application/json')
- .send({ url: 'https://httpstat.us/403' });
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ expect(response.body.data).toHaveProperty("content");
+ expect(response.body.data).toHaveProperty("metadata");
+ expect(response.body.data.metadata.pageStatusCode).toBe(401);
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
+ "unauthorized"
+ );
+ },
+ 60000
+ ); // 60 seconds
- await new Promise((r) => setTimeout(r, 5000));
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty('data');
- expect(response.body.data).toHaveProperty('content');
- expect(response.body.data).toHaveProperty('metadata');
- expect(response.body.data.metadata.pageStatusCode).toBe(403);
- expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden");
- }, 60000); // 60 seconds
+ it.concurrent(
+ "should return a successful response for a scrape with 403 page",
+ async () => {
+ const response: FirecrawlScrapeResponse = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://httpstat.us/403" });
- it.concurrent('should return a successful response for a scrape with 404 page', async () => {
- const response: FirecrawlScrapeResponse = await request(TEST_URL)
- .post('/v0/scrape')
- .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
- .set('Content-Type', 'application/json')
- .send({ url: 'https://httpstat.us/404' });
- await new Promise((r) => setTimeout(r, 5000));
+ await new Promise((r) => setTimeout(r, 5000));
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ expect(response.body.data).toHaveProperty("content");
+ expect(response.body.data).toHaveProperty("metadata");
+ expect(response.body.data.metadata.pageStatusCode).toBe(403);
+ expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
+ "forbidden"
+ );
+ },
+ 60000
+ ); // 60 seconds
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty('data');
- expect(response.body.data).toHaveProperty('content');
- expect(response.body.data).toHaveProperty('metadata');
- expect(response.body.data.metadata.pageStatusCode).toBe(404);
- }, 60000); // 60 seconds
+ it.concurrent(
+ "should return a successful response for a scrape with 404 page",
+ async () => {
+ const response: FirecrawlScrapeResponse = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://httpstat.us/404" });
+ await new Promise((r) => setTimeout(r, 5000));
- it.concurrent('should return a successful response for a scrape with 405 page', async () => {
- const response = await request(TEST_URL)
- .post('/v0/scrape')
- .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
- .set('Content-Type', 'application/json')
- .send({ url: 'https://httpstat.us/405' });
- await new Promise((r) => setTimeout(r, 5000));
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ expect(response.body.data).toHaveProperty("content");
+ expect(response.body.data).toHaveProperty("metadata");
+ expect(response.body.data.metadata.pageStatusCode).toBe(404);
+ },
+ 60000
+ ); // 60 seconds
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty('data');
- expect(response.body.data).toHaveProperty('content');
- expect(response.body.data).toHaveProperty('metadata');
- expect(response.body.data.metadata.pageStatusCode).toBe(405);
- }, 60000); // 60 seconds
+ it.concurrent(
+ "should return a successful response for a scrape with 405 page",
+ async () => {
+ const response = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://httpstat.us/405" });
+ await new Promise((r) => setTimeout(r, 5000));
- it.concurrent('should return a successful response for a scrape with 500 page', async () => {
- const response: FirecrawlScrapeResponse = await request(TEST_URL)
- .post('/v0/scrape')
- .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
- .set('Content-Type', 'application/json')
- .send({ url: 'https://httpstat.us/500' });
- await new Promise((r) => setTimeout(r, 5000));
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ expect(response.body.data).toHaveProperty("content");
+ expect(response.body.data).toHaveProperty("metadata");
+ expect(response.body.data.metadata.pageStatusCode).toBe(405);
+ },
+ 60000
+ ); // 60 seconds
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty('data');
- expect(response.body.data).toHaveProperty('content');
- expect(response.body.data).toHaveProperty('metadata');
- expect(response.body.data.metadata.pageStatusCode).toBe(500);
- }, 60000); // 60 seconds
+ it.concurrent(
+ "should return a successful response for a scrape with 500 page",
+ async () => {
+ const response: FirecrawlScrapeResponse = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://httpstat.us/500" });
+ await new Promise((r) => setTimeout(r, 5000));
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("data");
+ expect(response.body.data).toHaveProperty("content");
+ expect(response.body.data).toHaveProperty("metadata");
+ expect(response.body.data.metadata.pageStatusCode).toBe(500);
+ },
+ 60000
+ ); // 60 seconds
});
describe("POST /v0/crawl", () => {
it.concurrent("should require authorization", async () => {
- const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawl");
- expect(response.statusCode).toBe(401);
- });
-
- it.concurrent("should return an error response with an invalid API key", async () => {
- const response: FirecrawlCrawlResponse = await request(TEST_URL)
- .post("/v0/crawl")
- .set("Authorization", `Bearer invalid-api-key`)
- .set("Content-Type", "application/json")
- .send({ url: "https://firecrawl.dev" });
- expect(response.statusCode).toBe(401);
- });
-
- it.concurrent("should return a successful response with a valid API key for crawl", async () => {
- const response: FirecrawlCrawlResponse = await request(TEST_URL)
- .post("/v0/crawl")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send({ url: "https://firecrawl.dev" });
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty("jobId");
- expect(response.body.jobId).toMatch(
- /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
+ const response: FirecrawlCrawlResponse = await request(TEST_URL).post(
+ "/v0/crawl"
);
+ expect(response.statusCode).toBe(401);
});
-
- it.concurrent("should return a successful response with a valid API key and valid includes option", async () => {
- const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
- .post("/v0/crawl")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send({
- url: "https://mendable.ai",
- limit: 10,
- crawlerOptions: {
- includes: ["blog/*"],
- },
- });
-
+
+ it.concurrent(
+ "should return an error response with an invalid API key",
+ async () => {
+ const response: FirecrawlCrawlResponse = await request(TEST_URL)
+ .post("/v0/crawl")
+ .set("Authorization", `Bearer invalid-api-key`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://firecrawl.dev" });
+ expect(response.statusCode).toBe(401);
+ }
+ );
+
+ it.concurrent(
+ "should return a successful response with a valid API key for crawl",
+ async () => {
+ const response: FirecrawlCrawlResponse = await request(TEST_URL)
+ .post("/v0/crawl")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://firecrawl.dev" });
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("jobId");
+ expect(response.body.jobId).toMatch(
+ /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
+ );
+ }
+ );
+
+ it.concurrent(
+ "should return a successful response with a valid API key and valid includes option",
+ async () => {
+ const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
+ .post("/v0/crawl")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({
+ url: "https://mendable.ai",
+ limit: 10,
+ crawlerOptions: {
+ includes: ["blog/*"],
+ },
+ });
+
let response: FirecrawlCrawlStatusResponse;
let isFinished = false;
@@ -310,153 +406,189 @@ describe("E2E Tests for v0 API Routes", () => {
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
const completedResponse = await request(TEST_URL)
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+ const urls = completedResponse.body.data.map(
+ (item: any) => item.metadata?.sourceURL
+ );
+ expect(urls.length).toBeGreaterThan(5);
+ urls.forEach((url: string) => {
+ expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy();
+ });
+
+ expect(completedResponse.statusCode).toBe(200);
+ expect(completedResponse.body).toHaveProperty("status");
+ expect(completedResponse.body.status).toBe("completed");
+ expect(completedResponse.body).toHaveProperty("data");
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+ expect(completedResponse.body.data[0].content).toContain("Mendable");
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
+ 200
+ );
+ expect(
+ completedResponse.body.data[0].metadata.pageError
+ ).toBeUndefined();
+ },
+ 180000
+ ); // 180 seconds
+
+ it.concurrent(
+ "should return a successful response with a valid API key and valid excludes option",
+ async () => {
+ const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
+ .post("/v0/crawl")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({
+ url: "https://mendable.ai",
+ limit: 10,
+ crawlerOptions: {
+ excludes: ["blog/*"],
+ },
+ });
+
+ let isFinished = false;
+ let response: FirecrawlCrawlStatusResponse;
+
+ while (!isFinished) {
+ response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
- const urls = completedResponse.body.data.map(
- (item: any) => item.metadata?.sourceURL
- );
- expect(urls.length).toBeGreaterThan(5);
- urls.forEach((url: string) => {
- expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy();
- });
-
- expect(completedResponse.statusCode).toBe(200);
- expect(completedResponse.body).toHaveProperty("status");
- expect(completedResponse.body.status).toBe("completed");
- expect(completedResponse.body).toHaveProperty("data");
- expect(completedResponse.body.data[0]).toHaveProperty("content");
- expect(completedResponse.body.data[0]).toHaveProperty("markdown");
- expect(completedResponse.body.data[0]).toHaveProperty("metadata");
- expect(completedResponse.body.data[0].content).toContain("Mendable");
- expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
- expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
- }, 180000); // 180 seconds
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("status");
+ isFinished = response.body.status === "completed";
- it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => {
- const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
- .post("/v0/crawl")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send({
- url: "https://mendable.ai",
- limit: 10,
- crawlerOptions: {
- excludes: ["blog/*"],
- },
- });
-
- let isFinished = false;
- let response: FirecrawlCrawlStatusResponse;
+ if (!isFinished) {
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+ }
+ }
- while (!isFinished) {
- response = await request(TEST_URL)
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
+ const completedResponse: FirecrawlCrawlStatusResponse = await request(
+ TEST_URL
+ )
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+ const urls = completedResponse.body.data.map(
+ (item: any) => item.metadata?.sourceURL
+ );
+ expect(urls.length).toBeGreaterThan(5);
+ urls.forEach((url: string) => {
+ expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
+ });
+ },
+ 90000
+ ); // 90 seconds
+
+ it.concurrent(
+ "should return a successful response with max depth option for a valid crawl job",
+ async () => {
+ const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
+ .post("/v0/crawl")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({
+ url: "https://www.scrapethissite.com",
+ crawlerOptions: { maxDepth: 1 },
+ });
+ expect(crawlResponse.statusCode).toBe(200);
+
+ const response: FirecrawlCrawlStatusResponse = await request(TEST_URL)
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
- isFinished = response.body.status === "completed";
-
- if (!isFinished) {
- await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+ expect(["active", "waiting"]).toContain(response.body.status);
+ // wait for 60 seconds
+ let isCompleted = false;
+ while (!isCompleted) {
+ const statusCheckResponse = await request(TEST_URL)
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+ expect(statusCheckResponse.statusCode).toBe(200);
+ isCompleted = statusCheckResponse.body.status === "completed";
+ if (!isCompleted) {
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+ }
}
- }
-
- await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
- const completedResponse: FirecrawlCrawlStatusResponse = await request(TEST_URL)
- .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
-
- const urls = completedResponse.body.data.map(
- (item: any) => item.metadata?.sourceURL
- );
- expect(urls.length).toBeGreaterThan(5);
- urls.forEach((url: string) => {
- expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
- });
- }, 90000); // 90 seconds
-
- it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
- const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
- .post("/v0/crawl")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send({
- url: "https://www.scrapethissite.com",
- crawlerOptions: { maxDepth: 1 },
- });
- expect(crawlResponse.statusCode).toBe(200);
-
- const response: FirecrawlCrawlStatusResponse = await request(TEST_URL)
- .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty("status");
- expect(["active", "waiting"]).toContain(response.body.status);
- // wait for 60 seconds
- let isCompleted = false;
- while (!isCompleted) {
- const statusCheckResponse = await request(TEST_URL)
+ const completedResponse: FirecrawlCrawlStatusResponse = await request(
+ TEST_URL
+ )
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
- expect(statusCheckResponse.statusCode).toBe(200);
- isCompleted = statusCheckResponse.body.status === "completed";
- if (!isCompleted) {
- await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
- }
- }
- const completedResponse: FirecrawlCrawlStatusResponse = await request(TEST_URL)
- .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
- expect(completedResponse.statusCode).toBe(200);
- expect(completedResponse.body).toHaveProperty("status");
- expect(completedResponse.body.status).toBe("completed");
- expect(completedResponse.body).toHaveProperty("data");
- expect(completedResponse.body.data[0]).toHaveProperty("content");
- expect(completedResponse.body.data[0]).toHaveProperty("markdown");
- expect(completedResponse.body.data[0]).toHaveProperty("metadata");
- expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
- expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
- const urls = completedResponse.body.data.map(
- (item: any) => item.metadata?.sourceURL
- );
- expect(urls.length).toBeGreaterThan(1);
+ expect(completedResponse.statusCode).toBe(200);
+ expect(completedResponse.body).toHaveProperty("status");
+ expect(completedResponse.body.status).toBe("completed");
+ expect(completedResponse.body).toHaveProperty("data");
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
+ 200
+ );
+ expect(
+ completedResponse.body.data[0].metadata.pageError
+ ).toBeUndefined();
+ const urls = completedResponse.body.data.map(
+ (item: any) => item.metadata?.sourceURL
+ );
+ expect(urls.length).toBeGreaterThan(1);
- // Check if all URLs have a maximum depth of 1
- urls.forEach((url: string) => {
- const pathSplits = new URL(url).pathname.split('/');
- const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
- expect(depth).toBeLessThanOrEqual(2);
- });
- }, 180000);
+ // Check if all URLs have a maximum depth of 1
+ urls.forEach((url: string) => {
+ const pathSplits = new URL(url).pathname.split("/");
+ const depth =
+ pathSplits.length -
+ (pathSplits[0].length === 0 &&
+ pathSplits[pathSplits.length - 1].length === 0
+ ? 1
+ : 0);
+ expect(depth).toBeLessThanOrEqual(2);
+ });
+ },
+ 180000
+ );
});
describe("POST /v0/crawlWebsitePreview", () => {
it.concurrent("should require authorization", async () => {
- const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawlWebsitePreview");
+ const response: FirecrawlCrawlResponse = await request(TEST_URL).post(
+ "/v0/crawlWebsitePreview"
+ );
expect(response.statusCode).toBe(401);
});
- it.concurrent("should return an error response with an invalid API key", async () => {
- const response: FirecrawlCrawlResponse = await request(TEST_URL)
- .post("/v0/crawlWebsitePreview")
- .set("Authorization", `Bearer invalid-api-key`)
- .set("Content-Type", "application/json")
- .send({ url: "https://firecrawl.dev" });
- expect(response.statusCode).toBe(401);
- });
+ it.concurrent(
+ "should return an error response with an invalid API key",
+ async () => {
+ const response: FirecrawlCrawlResponse = await request(TEST_URL)
+ .post("/v0/crawlWebsitePreview")
+ .set("Authorization", `Bearer invalid-api-key`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://firecrawl.dev" });
+ expect(response.statusCode).toBe(401);
+ }
+ );
- it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => {
- const response: FirecrawlCrawlResponse = await request(TEST_URL)
- .post("/v0/scrape")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send({ url: "https://firecrawl.dev", timeout: 1000 });
+ it.concurrent(
+ "should return a timeout error when scraping takes longer than the specified timeout",
+ async () => {
+ const response: FirecrawlCrawlResponse = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://firecrawl.dev", timeout: 1000 });
- expect(response.statusCode).toBe(408);
- }, 3000);
+ expect(response.statusCode).toBe(408);
+ },
+ 3000
+ );
});
describe("POST /v0/search", () => {
@@ -465,26 +597,33 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.statusCode).toBe(401);
});
- it.concurrent("should return an error response with an invalid API key", async () => {
- const response = await request(TEST_URL)
- .post("/v0/search")
- .set("Authorization", `Bearer invalid-api-key`)
- .set("Content-Type", "application/json")
- .send({ query: "test" });
- expect(response.statusCode).toBe(401);
- });
+ it.concurrent(
+ "should return an error response with an invalid API key",
+ async () => {
+ const response = await request(TEST_URL)
+ .post("/v0/search")
+ .set("Authorization", `Bearer invalid-api-key`)
+ .set("Content-Type", "application/json")
+ .send({ query: "test" });
+ expect(response.statusCode).toBe(401);
+ }
+ );
- it.concurrent("should return a successful response with a valid API key for search", async () => {
- const response = await request(TEST_URL)
- .post("/v0/search")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send({ query: "test" });
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty("success");
- expect(response.body.success).toBe(true);
- expect(response.body).toHaveProperty("data");
- }, 60000); // 60 seconds timeout
+ it.concurrent(
+ "should return a successful response with a valid API key for search",
+ async () => {
+ const response = await request(TEST_URL)
+ .post("/v0/search")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ query: "test" });
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("success");
+ expect(response.body.success).toBe(true);
+ expect(response.body).toHaveProperty("data");
+ },
+ 60000
+ ); // 60 seconds timeout
});
describe("GET /v0/crawl/status/:jobId", () => {
@@ -493,66 +632,83 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.statusCode).toBe(401);
});
- it.concurrent("should return an error response with an invalid API key", async () => {
- const response = await request(TEST_URL)
- .get("/v0/crawl/status/123")
- .set("Authorization", `Bearer invalid-api-key`);
- expect(response.statusCode).toBe(401);
- });
-
- it.concurrent("should return Job not found for invalid job ID", async () => {
- const response = await request(TEST_URL)
- .get("/v0/crawl/status/invalidJobId")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
- expect(response.statusCode).toBe(404);
- });
-
- it.concurrent("should return a successful crawl status response for a valid crawl job", async () => {
- const crawlResponse = await request(TEST_URL)
- .post("/v0/crawl")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send({ url: "https://mendable.ai/blog" });
- expect(crawlResponse.statusCode).toBe(200);
-
- let isCompleted = false;
-
- while (!isCompleted) {
+ it.concurrent(
+ "should return an error response with an invalid API key",
+ async () => {
const response = await request(TEST_URL)
+ .get("/v0/crawl/status/123")
+ .set("Authorization", `Bearer invalid-api-key`);
+ expect(response.statusCode).toBe(401);
+ }
+ );
+
+ it.concurrent(
+ "should return Job not found for invalid job ID",
+ async () => {
+ const response = await request(TEST_URL)
+ .get("/v0/crawl/status/invalidJobId")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+ expect(response.statusCode).toBe(404);
+ }
+ );
+
+ it.concurrent(
+ "should return a successful crawl status response for a valid crawl job",
+ async () => {
+ const crawlResponse = await request(TEST_URL)
+ .post("/v0/crawl")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://mendable.ai/blog" });
+ expect(crawlResponse.statusCode).toBe(200);
+
+ let isCompleted = false;
+
+ while (!isCompleted) {
+ const response = await request(TEST_URL)
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("status");
+
+ if (response.body.status === "completed") {
+ isCompleted = true;
+ } else {
+ await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
+ }
+ }
+
+ await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
+ const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
- expect(response.statusCode).toBe(200);
- expect(response.body).toHaveProperty("status");
- if (response.body.status === "completed") {
- isCompleted = true;
- } else {
- await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
- }
- }
+ expect(completedResponse.body).toHaveProperty("status");
+ expect(completedResponse.body.status).toBe("completed");
+ expect(completedResponse.body).toHaveProperty("data");
+ expect(completedResponse.body.data[0]).toHaveProperty("content");
+ expect(completedResponse.body.data[0]).toHaveProperty("markdown");
+ expect(completedResponse.body.data[0]).toHaveProperty("metadata");
+ expect(completedResponse.body.data[0].content).toContain("Mendable");
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
+ 200
+ );
+ expect(
+ completedResponse.body.data[0].metadata.pageError
+ ).toBeUndefined();
- await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
- const completedResponse = await request(TEST_URL)
- .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+ const childrenLinks = completedResponse.body.data.filter(
+ (doc) =>
+ doc.metadata &&
+ doc.metadata.sourceURL &&
+ doc.metadata.sourceURL.includes("mendable.ai/blog")
+ );
- expect(completedResponse.body).toHaveProperty("status");
- expect(completedResponse.body.status).toBe("completed");
- expect(completedResponse.body).toHaveProperty("data");
- expect(completedResponse.body.data[0]).toHaveProperty("content");
- expect(completedResponse.body.data[0]).toHaveProperty("markdown");
- expect(completedResponse.body.data[0]).toHaveProperty("metadata");
- expect(completedResponse.body.data[0].content).toContain("Mendable");
- expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
- expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
+ expect(childrenLinks.length).toBe(completedResponse.body.data.length);
+ },
+ 180000
+ ); // 180 seconds
- const childrenLinks = completedResponse.body.data.filter(doc =>
- doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
- );
-
- expect(childrenLinks.length).toBe(completedResponse.body.data.length);
- }, 180000); // 120 seconds
-
// TODO: review the test below
// it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => {
// const crawlResponse = await request(TEST_URL)
@@ -599,97 +755,210 @@ describe("E2E Tests for v0 API Routes", () => {
// expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
// }, 180000); // 120 seconds
- it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => {
- const crawlResponse = await request(TEST_URL)
- .post("/v0/crawl")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send({ url: "https://jestjs.io" });
+ it.concurrent(
+ "If someone cancels a crawl job, it should turn into failed status",
+ async () => {
+ const crawlResponse = await request(TEST_URL)
+ .post("/v0/crawl")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({ url: "https://jestjs.io" });
- expect(crawlResponse.statusCode).toBe(200);
+ expect(crawlResponse.statusCode).toBe(200);
- await new Promise((r) => setTimeout(r, 20000));
+ await new Promise((r) => setTimeout(r, 20000));
- const responseCancel = await request(TEST_URL)
- .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
- expect(responseCancel.statusCode).toBe(200);
- expect(responseCancel.body).toHaveProperty("status");
- expect(responseCancel.body.status).toBe("cancelled");
+ const responseCancel = await request(TEST_URL)
+ .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+ expect(responseCancel.statusCode).toBe(200);
+ expect(responseCancel.body).toHaveProperty("status");
+ expect(responseCancel.body.status).toBe("cancelled");
- await new Promise((r) => setTimeout(r, 10000));
- const completedResponse = await request(TEST_URL)
- .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+ await new Promise((r) => setTimeout(r, 10000));
+ const completedResponse = await request(TEST_URL)
+ .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
- expect(completedResponse.statusCode).toBe(200);
- expect(completedResponse.body).toHaveProperty("status");
- expect(completedResponse.body.status).toBe("failed");
- expect(completedResponse.body).toHaveProperty("data");
+ expect(completedResponse.statusCode).toBe(200);
+ expect(completedResponse.body).toHaveProperty("status");
+ expect(completedResponse.body.status).toBe("failed");
+ expect(completedResponse.body).toHaveProperty("data");
- let isNullOrEmptyArray = false;
- if (completedResponse.body.data === null || completedResponse.body.data.length === 0) {
- isNullOrEmptyArray = true;
- }
- expect(isNullOrEmptyArray).toBe(true);
- expect(completedResponse.body.data).toEqual(expect.arrayContaining([]));
- expect(completedResponse.body).toHaveProperty("partial_data");
- expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
- expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
- expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
- expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200);
- expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined();
- }, 60000); // 60 seconds
+ let isNullOrEmptyArray = false;
+ if (
+ completedResponse.body.data === null ||
+ completedResponse.body.data.length === 0
+ ) {
+ isNullOrEmptyArray = true;
+ }
+ expect(isNullOrEmptyArray).toBe(true);
+ expect(completedResponse.body.data).toEqual(expect.arrayContaining([]));
+ expect(completedResponse.body).toHaveProperty("partial_data");
+ expect(completedResponse.body.partial_data[0]).toHaveProperty(
+ "content"
+ );
+ expect(completedResponse.body.partial_data[0]).toHaveProperty(
+ "markdown"
+ );
+ expect(completedResponse.body.partial_data[0]).toHaveProperty(
+ "metadata"
+ );
+ expect(
+ completedResponse.body.partial_data[0].metadata.pageStatusCode
+ ).toBe(200);
+ expect(
+ completedResponse.body.partial_data[0].metadata.pageError
+ ).toBeUndefined();
+ },
+ 60000
+ ); // 60 seconds
});
describe("POST /v0/scrape with LLM Extraction", () => {
- it.concurrent("should extract data using LLM extraction mode", async () => {
- const response = await request(TEST_URL)
- .post("/v0/scrape")
- .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
- .set("Content-Type", "application/json")
- .send({
- url: "https://mendable.ai",
- pageOptions: {
- onlyMainContent: true,
- },
- extractorOptions: {
- mode: "llm-extraction",
- extractionPrompt:
- "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
- extractionSchema: {
- type: "object",
- properties: {
- company_mission: {
- type: "string",
- },
- supports_sso: {
- type: "boolean",
- },
- is_open_source: {
- type: "boolean",
- },
- },
- required: ["company_mission", "supports_sso", "is_open_source"],
+ it.concurrent(
+ "should extract data using LLM extraction mode",
+ async () => {
+ const response = await request(TEST_URL)
+ .post("/v0/scrape")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({
+ url: "https://mendable.ai",
+ pageOptions: {
+ onlyMainContent: true,
},
- },
+ extractorOptions: {
+ mode: "llm-extraction",
+ extractionPrompt:
+ "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+ extractionSchema: {
+ type: "object",
+ properties: {
+ company_mission: {
+ type: "string",
+ },
+ supports_sso: {
+ type: "boolean",
+ },
+ is_open_source: {
+ type: "boolean",
+ },
+ },
+ required: ["company_mission", "supports_sso", "is_open_source"],
+ },
+ },
+ });
+
+ // Ensure that the job was successfully created before proceeding with LLM extraction
+ expect(response.statusCode).toBe(200);
+
+ // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
+ let llmExtraction = response.body.data.llm_extraction;
+
+ // Check if the llm_extraction object has the required properties with correct types and values
+ expect(llmExtraction).toHaveProperty("company_mission");
+ expect(typeof llmExtraction.company_mission).toBe("string");
+ expect(llmExtraction).toHaveProperty("supports_sso");
+ expect(llmExtraction.supports_sso).toBe(true);
+ expect(typeof llmExtraction.supports_sso).toBe("boolean");
+ expect(llmExtraction).toHaveProperty("is_open_source");
+ expect(llmExtraction.is_open_source).toBe(false);
+ expect(typeof llmExtraction.is_open_source).toBe("boolean");
+ },
+ 60000
+ ); // 60 secs
+ });
+
+ describe("POST /v0/map", () => {
+ it.concurrent(
+ "should return a list of links for mendable.ai without subdomains included",
+ async () => {
+ const response = await request(TEST_URL)
+ .post("/v1/map")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({
+ url: "https://mendable.ai",
+ });
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("success", true);
+ expect(response.body).toHaveProperty("links");
+ expect(response.body.links).not.toContain("https://docs.mendable.ai");
+ expect(Array.isArray(response.body.links)).toBe(true);
+ expect(response.body.links.length).toBeGreaterThan(0);
+ },
+ 60000
+ ); // 60 secs
+
+ it.concurrent(
+ "should return a list of links for a given URL with subdomains included",
+ async () => {
+ const response = await request(TEST_URL)
+ .post("/v1/map")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({
+ url: "https://python.langchain.com",
+ includeSubdomains: true,
+ });
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("success", true);
+ expect(response.body).toHaveProperty("links");
+ expect(Array.isArray(response.body.links)).toBe(true);
+ expect(response.body.links.length).toBeGreaterThan(0);
+ },
+ 60000
+ ); // 60 secs
+
+ it.concurrent(
+ "should return a list of links for a given URL with subdomains and search",
+ async () => {
+ const response = await request(TEST_URL)
+ .post("/v1/map")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({
+ url: "https://python.langchain.com",
+ includeSubdomains: true,
+ search: "agents",
+ });
+
+ expect(response.statusCode).toBe(200);
+ expect(response.body).toHaveProperty("success", true);
+ expect(response.body).toHaveProperty("links");
+ expect(response.body.links).toContain(
+ "https://api.python.langchain.com/en/latest/_modules/langchain/agents/openai_functions_agent/base.html"
+ );
+ expect(Array.isArray(response.body.links)).toBe(true);
+ expect(response.body.links.length).toBeGreaterThan(0);
+ response.body.links.forEach((link) => {
+ expect(link).toContain("python.langchain.com");
});
+ },
+ 60000
+ ); // 60 secs
- // Ensure that the job was successfully created before proceeding with LLM extraction
- expect(response.statusCode).toBe(200);
+ it.concurrent(
+ "should handle invalid URL input gracefully",
+ async () => {
+ const response = await request(TEST_URL)
+ .post("/v1/map")
+ .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+ .set("Content-Type", "application/json")
+ .send({
+ url: "invalid-url",
+ includeSubdomains: true,
+ search: "agents",
+ });
- // Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
- let llmExtraction = response.body.data.llm_extraction;
-
- // Check if the llm_extraction object has the required properties with correct types and values
- expect(llmExtraction).toHaveProperty("company_mission");
- expect(typeof llmExtraction.company_mission).toBe("string");
- expect(llmExtraction).toHaveProperty("supports_sso");
- expect(llmExtraction.supports_sso).toBe(true);
- expect(typeof llmExtraction.supports_sso).toBe("boolean");
- expect(llmExtraction).toHaveProperty("is_open_source");
- expect(llmExtraction.is_open_source).toBe(false);
- expect(typeof llmExtraction.is_open_source).toBe("boolean");
- }, 60000); // 60 secs
+ expect(response.statusCode).toBe(400);
+ expect(response.body).toHaveProperty("success", false);
+ expect(response.body).toHaveProperty("details");
+ },
+ 60000
+ ); // 60 secs
});
});
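The new /v1/map tests above exercise the request shape that mapRequestSchema (changed in types.ts below) validates. For reference, a minimal sketch of an accepted body, with the schema's defaults noted in comments; the object name is illustrative:

    // Sketch only: fields per mapRequestSchema as defined below.
    const mapBody = {
      url: "https://python.langchain.com", // required; validated as a URL string
      includeSubdomains: true,             // optional; defaults to false
      search: "agents",                    // optional; renamed from searchEngine in this change
      origin: "api",                       // optional; defaults to "api"
    };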
diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index 190b4c0b..7f71303f 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -41,9 +41,10 @@ export async function mapController(
const crawler = crawlToCrawler(id, sc);
- const sitemap = sc.crawlerOptions.ignoreSitemap
- ? null
- : await crawler.tryGetSitemap();
+ const sitemap =
+ sc.crawlerOptions.ignoreSitemap || req.body.search
+ ? null
+ : await crawler.tryGetSitemap();
if (sitemap !== null) {
sitemap.map((x) => {
@@ -51,13 +52,23 @@ export async function mapController(
});
}
- const mapResults = await fireEngineMap(`site:${req.body.url}`, {
+ let mapUrl = req.body.search
+ ? `"${req.body.search}" site:${req.body.url}`
+ : `site:${req.body.url}`;
+ console.log(mapUrl);
+ // www. seems to exclude subdomains in some cases
+ const mapResults = await fireEngineMap(mapUrl, {
numResults: 50,
});
+ console.log(mapResults);
if (mapResults.length > 0) {
mapResults.map((x) => {
- links.push(x.url);
+ if (req.body.search) {
+ links.unshift(x.url);
+ } else {
+ links.push(x.url);
+ }
});
}
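The controller change above comes down to two behaviors: when a search term is present, the fire-engine query is scoped to pages matching it, and the search hits are placed ahead of the sitemap links instead of after them. A minimal sketch of that logic in isolation, assuming the same request fields; the helper names are hypothetical:

    // Sketch of the query passed to fireEngineMap, per the change above.
    function buildMapQuery(url: string, search?: string): string {
      return search ? `"${search}" site:${url}` : `site:${url}`;
    }

    // Sketch of the ordering: search hits are prepended so they rank ahead
    // of sitemap links; without a search they are appended, as before.
    function mergeMapResults(links: string[], hits: string[], isSearch: boolean): string[] {
      return isSearch ? [...hits, ...links] : [...links, ...hits];
    }

    buildMapQuery("https://mendable.ai");           // site:https://mendable.ai
    buildMapQuery("https://mendable.ai", "agents"); // "agents" site:https://mendable.ai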
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 4411b29f..77a9f2dd 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -3,22 +3,46 @@ import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { PageOptions } from "../../lib/entities";
-export type Format = "markdown" | "html" | "rawHtml" | "links" | "screenshot" | "screenshot@fullPage";
+export type Format =
+ | "markdown"
+ | "html"
+ | "rawHtml"
+ | "links"
+ | "screenshot"
+ | "screenshot@fullPage";
-const url = z.preprocess(x => {
- if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) {
- if (x.startsWith("://")) {
- return "http" + x;
+const url = z.preprocess(
+ (x) => {
+ if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) {
+ if (x.startsWith("://")) {
+ return "http" + x;
+ } else {
+ return "http://" + x;
+ }
} else {
- return "http://" + x;
+ return x;
}
- } else {
- return x;
- }
-}, z.string().url().regex(/^https?:\/\//, "URL uses unsupported protocol").refine(x => !isUrlBlocked(x), "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."));
+ },
+ z
+ .string()
+ .url()
+ .regex(/^https?:\/\//, "URL uses unsupported protocol")
+ .refine(
+ (x) => !isUrlBlocked(x),
+ "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
+ )
+);
export const scrapeOptions = z.object({
- formats: z.enum(["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"])
+ formats: z
+ .enum([
+ "markdown",
+ "html",
+ "rawHtml",
+ "links",
+ "screenshot",
+ "screenshot@fullPage",
+ ])
.array()
.optional()
.default(["markdown"]),
@@ -34,7 +58,7 @@ export const scrapeOptions = z.object({
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
export const scrapeRequestSchema = scrapeOptions.extend({
- url: z.string().url(),
+ url,
origin: z.string().optional().default("api"),
});
@@ -90,10 +114,10 @@ export const crawlRequestSchema = z.object({
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
export const mapRequestSchema = crawlerOptions.extend({
- url,
+ url: z.string().url(),
origin: z.string().optional().default("api"),
includeSubdomains: z.boolean().default(false),
- searchEngine: z.string().optional(),
+ search: z.string().optional(),
});
// export type MapRequest = {
@@ -104,11 +128,11 @@ export const mapRequestSchema = crawlerOptions.extend({
export type MapRequest = z.infer<typeof mapRequestSchema>;
export type Document = {
- markdown?: string,
- html?: string,
- rawHtml?: string,
- links?: string[],
- screenshot?: string,
+ markdown?: string;
+ html?: string;
+ rawHtml?: string;
+ links?: string[];
+ screenshot?: string;
metadata: {
title?: string;
description?: string;
@@ -142,8 +166,8 @@ export type Document = {
sourceURL?: string;
statusCode?: number;
error?: string;
- },
-}
+ };
+};
export type ErrorResponse = {
success: false;
@@ -151,11 +175,13 @@ export type ErrorResponse = {
details?: any;
};
-export type ScrapeResponse = ErrorResponse | {
- success: true;
- warning?: string;
- data: Document;
-};
+export type ScrapeResponse =
+ | ErrorResponse
+ | {
+ success: true;
+ warning?: string;
+ data: Document;
+ };
export interface ScrapeResponseRequestTest {
statusCode: number;
@@ -163,40 +189,54 @@ export interface ScrapeResponseRequestTest {
error?: string;
}
-export type CrawlResponse = ErrorResponse | {
- success: true;
- id: string;
- url: string;
-}
+export type CrawlResponse =
+ | ErrorResponse
+ | {
+ success: true;
+ id: string;
+ url: string;
+ };
-export type MapResponse = ErrorResponse | {
- success: true;
- links: string[];
-}
+export type MapResponse =
+ | ErrorResponse
+ | {
+ success: true;
+ links: string[];
+ };
export type CrawlStatusParams = {
jobId: string;
-}
+};
-export type CrawlStatusResponse = ErrorResponse | {
- status: "scraping" | "completed" | "failed" | "cancelled",
- totalCount: number;
- creditsUsed: number;
- expiresAt: string;
- next?: string;
- data: Document[];
-}
+export type CrawlStatusResponse =
+ | ErrorResponse
+ | {
+ status: "scraping" | "completed" | "failed" | "cancelled";
+ totalCount: number;
+ creditsUsed: number;
+ expiresAt: string;
+ next?: string;
+ data: Document[];
+ };
type AuthObject = {
team_id: string;
plan: string;
-}
+};
-export interface RequestWithMaybeAuth extends Request {
+export interface RequestWithMaybeAuth<
+ ReqParams = {},
+ ReqBody = undefined,
+ ResBody = undefined
+> extends Request<ReqParams, ReqBody, ResBody> {
auth?: AuthObject;
}
-export interface RequestWithAuth extends Request {
+export interface RequestWithAuth<
+ ReqParams = {},
+ ReqBody = undefined,
+ ResBody = undefined
+> extends Request<ReqParams, ReqBody, ResBody> {
auth: AuthObject;
}
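With the widened generics, a controller can declare its route params, parsed body, and response payload in one place. A hedged sketch of a signature using them, following the parameter order declared above; the instantiation is illustrative:

    import { Response } from "express";

    // Illustrative only: req.auth is guaranteed present on RequestWithAuth,
    // and req.body is typed as the parsed MapRequest.
    async function mapController(
      req: RequestWithAuth<{}, MapRequest, MapResponse>,
      res: Response<MapResponse>
    ): Promise<void> {
      // ...
    }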
@@ -225,7 +265,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
includeLinks: x.formats.includes("links"),
screenshot: x.formats.includes("screenshot"),
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
- parsePDF: x.parsePDF
+ parsePDF: x.parsePDF,
};
}
@@ -243,5 +283,5 @@ export function legacyDocumentConverter(doc: any): Document {
error: doc.metadata.pageError,
statusCode: doc.metadata.pageStatusCode,
},
- }
+ };
}