Nick: all tests passing

This commit is contained in:
Nicolas 2024-08-16 19:55:44 -04:00
parent 5205c5f005
commit ba5279eafc
3 changed files with 888 additions and 568 deletions

View File

@ -1,6 +1,10 @@
import request from "supertest";
import dotenv from "dotenv";
import { FirecrawlCrawlResponse, FirecrawlCrawlStatusResponse, FirecrawlScrapeResponse } from "../../types";
import {
FirecrawlCrawlResponse,
FirecrawlCrawlStatusResponse,
FirecrawlScrapeResponse,
} from "../../types";
dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";
@ -24,20 +28,27 @@ describe("E2E Tests for v0 API Routes", () => {
describe("POST /v0/scrape", () => {
it.concurrent("should require authorization", async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL).post("/v0/scrape");
const response: FirecrawlScrapeResponse = await request(TEST_URL).post(
"/v0/scrape"
);
expect(response.statusCode).toBe(401);
});
it.concurrent("should return an error response with an invalid API key", async () => {
it.concurrent(
"should return an error response with an invalid API key",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer invalid-api-key`)
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
});
}
);
it.concurrent("should return a successful response with a valid API key", async () => {
it.concurrent(
"should return a successful response with a valid API key",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -52,21 +63,36 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data.content).toContain("_Roast_");
expect(response.body.data.metadata.pageError).toBeUndefined();
expect(response.body.data.metadata.title).toBe("Roast My Website");
expect(response.body.data.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
expect(response.body.data.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
expect(response.body.data.metadata.description).toBe(
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
);
expect(response.body.data.metadata.keywords).toBe(
"Roast My Website,Roast,Website,GitHub,Firecrawl"
);
expect(response.body.data.metadata.robots).toBe("follow, index");
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
expect(response.body.data.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
expect(response.body.data.metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
expect(response.body.data.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
expect(response.body.data.metadata.ogDescription).toBe(
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
);
expect(response.body.data.metadata.ogUrl).toBe(
"https://www.roastmywebsite.ai"
);
expect(response.body.data.metadata.ogImage).toBe(
"https://www.roastmywebsite.ai/og.png"
);
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
expect(response.body.data.metadata.sourceURL).toBe("https://roastmywebsite.ai");
expect(response.body.data.metadata.sourceURL).toBe(
"https://roastmywebsite.ai"
);
expect(response.body.data.metadata.pageStatusCode).toBe(200);
}, 30000); // 30 seconds timeout
},
30000
); // 30 seconds timeout
it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => {
it.concurrent(
"should return a successful response with a valid API key and includeHtml set to true",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -86,44 +112,61 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data.html).toContain("<h1");
expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined();
}, 30000); // 30 seconds timeout
},
30000
); // 30 seconds timeout
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
it.concurrent(
"should return a successful response for a valid scrape with PDF file",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' });
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" });
await new Promise((r) => setTimeout(r, 6000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.content).toContain(
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
);
expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined();
}, 60000); // 60 seconds
},
60000
); // 60 seconds
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
it.concurrent(
"should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' });
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://arxiv.org/pdf/astro-ph/9301001" });
await new Promise((r) => setTimeout(r, 6000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.content).toContain(
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
);
expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined();
}, 60000); // 60 seconds
},
60000
); // 60 seconds
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
const responseWithoutRemoveTags: FirecrawlScrapeResponse = await request(TEST_URL)
it.concurrent(
"should return a successful response with a valid API key with removeTags option",
async () => {
const responseWithoutRemoveTags: FirecrawlScrapeResponse =
await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
@ -134,16 +177,27 @@ describe("E2E Tests for v0 API Routes", () => {
expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
expect(responseWithoutRemoveTags.body.data.content).toContain(
"Scrape This Site"
);
expect(responseWithoutRemoveTags.body.data.content).toContain(
"Lessons and Videos"
); // #footer
expect(responseWithoutRemoveTags.body.data.content).toContain(
"[Sandbox]("
); // .nav
expect(responseWithoutRemoveTags.body.data.content).toContain(
"web scraping"
); // strong
const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
.send({
url: "https://www.scrapethissite.com/",
pageOptions: { removeTags: [".nav", "#footer", "strong"] },
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("content");
@ -154,118 +208,157 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
expect(response.body.data.content).not.toContain("web scraping"); // strong
}, 30000); // 30 seconds timeout
},
30000
); // 30 seconds timeout
it.concurrent('should return a successful response for a scrape with 400 page', async () => {
it.concurrent(
"should return a successful response for a scrape with 400 page",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/400' });
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://httpstat.us/400" });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(400);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request");
}, 60000); // 60 seconds
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
"bad request"
);
},
60000
); // 60 seconds
it.concurrent('should return a successful response for a scrape with 401 page', async () => {
it.concurrent(
"should return a successful response for a scrape with 401 page",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/401' });
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://httpstat.us/401" });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(401);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized");
}, 60000); // 60 seconds
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
"unauthorized"
);
},
60000
); // 60 seconds
it.concurrent("should return a successful response for a scrape with 403 page", async () => {
it.concurrent(
"should return a successful response for a scrape with 403 page",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/403' });
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://httpstat.us/403" });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(403);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden");
}, 60000); // 60 seconds
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
"forbidden"
);
},
60000
); // 60 seconds
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
it.concurrent(
"should return a successful response for a scrape with 404 page",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/404' });
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://httpstat.us/404" });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(404);
}, 60000); // 60 seconds
},
60000
); // 60 seconds
it.concurrent('should return a successful response for a scrape with 405 page', async () => {
it.concurrent(
"should return a successful response for a scrape with 405 page",
async () => {
const response = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/405' });
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://httpstat.us/405" });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(405);
}, 60000); // 60 seconds
},
60000
); // 60 seconds
it.concurrent('should return a successful response for a scrape with 500 page', async () => {
it.concurrent(
"should return a successful response for a scrape with 500 page",
async () => {
const response: FirecrawlScrapeResponse = await request(TEST_URL)
.post('/v0/scrape')
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
.set('Content-Type', 'application/json')
.send({ url: 'https://httpstat.us/500' });
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://httpstat.us/500" });
await new Promise((r) => setTimeout(r, 5000));
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty('data');
expect(response.body.data).toHaveProperty('content');
expect(response.body.data).toHaveProperty('metadata');
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(500);
}, 60000); // 60 seconds
},
60000
); // 60 seconds
});
describe("POST /v0/crawl", () => {
it.concurrent("should require authorization", async () => {
const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawl");
const response: FirecrawlCrawlResponse = await request(TEST_URL).post(
"/v0/crawl"
);
expect(response.statusCode).toBe(401);
});
it.concurrent("should return an error response with an invalid API key", async () => {
it.concurrent(
"should return an error response with an invalid API key",
async () => {
const response: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer invalid-api-key`)
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
});
}
);
it.concurrent("should return a successful response with a valid API key for crawl", async () => {
it.concurrent(
"should return a successful response with a valid API key for crawl",
async () => {
const response: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -276,9 +369,12 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.jobId).toMatch(
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
);
});
}
);
it.concurrent("should return a successful response with a valid API key and valid includes option", async () => {
it.concurrent(
"should return a successful response with a valid API key and valid includes option",
async () => {
const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -329,11 +425,19 @@ describe("E2E Tests for v0 API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
}, 180000); // 180 seconds
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
);
expect(
completedResponse.body.data[0].metadata.pageError
).toBeUndefined();
},
180000
); // 180 seconds
it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => {
it.concurrent(
"should return a successful response with a valid API key and valid excludes option",
async () => {
const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -364,7 +468,9 @@ describe("E2E Tests for v0 API Routes", () => {
}
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
const completedResponse: FirecrawlCrawlStatusResponse = await request(TEST_URL)
const completedResponse: FirecrawlCrawlStatusResponse = await request(
TEST_URL
)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
@ -375,9 +481,13 @@ describe("E2E Tests for v0 API Routes", () => {
urls.forEach((url: string) => {
expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
});
}, 90000); // 90 seconds
},
90000
); // 90 seconds
it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => {
it.concurrent(
"should return a successful response with max depth option for a valid crawl job",
async () => {
const crawlResponse: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -406,7 +516,9 @@ describe("E2E Tests for v0 API Routes", () => {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
const completedResponse: FirecrawlCrawlStatusResponse = await request(TEST_URL)
const completedResponse: FirecrawlCrawlStatusResponse = await request(
TEST_URL
)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
@ -417,8 +529,12 @@ describe("E2E Tests for v0 API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
);
expect(
completedResponse.body.data[0].metadata.pageError
).toBeUndefined();
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
@ -426,29 +542,43 @@ describe("E2E Tests for v0 API Routes", () => {
// Check if all URLs have a maximum depth of 1
urls.forEach((url: string) => {
const pathSplits = new URL(url).pathname.split('/');
const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
const pathSplits = new URL(url).pathname.split("/");
const depth =
pathSplits.length -
(pathSplits[0].length === 0 &&
pathSplits[pathSplits.length - 1].length === 0
? 1
: 0);
expect(depth).toBeLessThanOrEqual(2);
});
}, 180000);
},
180000
);
});
describe("POST /v0/crawlWebsitePreview", () => {
it.concurrent("should require authorization", async () => {
const response: FirecrawlCrawlResponse = await request(TEST_URL).post("/v0/crawlWebsitePreview");
const response: FirecrawlCrawlResponse = await request(TEST_URL).post(
"/v0/crawlWebsitePreview"
);
expect(response.statusCode).toBe(401);
});
it.concurrent("should return an error response with an invalid API key", async () => {
it.concurrent(
"should return an error response with an invalid API key",
async () => {
const response: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/crawlWebsitePreview")
.set("Authorization", `Bearer invalid-api-key`)
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
});
}
);
it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => {
it.concurrent(
"should return a timeout error when scraping takes longer than the specified timeout",
async () => {
const response: FirecrawlCrawlResponse = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -456,7 +586,9 @@ describe("E2E Tests for v0 API Routes", () => {
.send({ url: "https://firecrawl.dev", timeout: 1000 });
expect(response.statusCode).toBe(408);
}, 3000);
},
3000
);
});
describe("POST /v0/search", () => {
@ -465,16 +597,21 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.statusCode).toBe(401);
});
it.concurrent("should return an error response with an invalid API key", async () => {
it.concurrent(
"should return an error response with an invalid API key",
async () => {
const response = await request(TEST_URL)
.post("/v0/search")
.set("Authorization", `Bearer invalid-api-key`)
.set("Content-Type", "application/json")
.send({ query: "test" });
expect(response.statusCode).toBe(401);
});
}
);
it.concurrent("should return a successful response with a valid API key for search", async () => {
it.concurrent(
"should return a successful response with a valid API key for search",
async () => {
const response = await request(TEST_URL)
.post("/v0/search")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -484,7 +621,9 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body).toHaveProperty("success");
expect(response.body.success).toBe(true);
expect(response.body).toHaveProperty("data");
}, 60000); // 60 seconds timeout
},
60000
); // 60 seconds timeout
});
describe("GET /v0/crawl/status/:jobId", () => {
@ -493,21 +632,29 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.statusCode).toBe(401);
});
it.concurrent("should return an error response with an invalid API key", async () => {
it.concurrent(
"should return an error response with an invalid API key",
async () => {
const response = await request(TEST_URL)
.get("/v0/crawl/status/123")
.set("Authorization", `Bearer invalid-api-key`);
expect(response.statusCode).toBe(401);
});
}
);
it.concurrent("should return Job not found for invalid job ID", async () => {
it.concurrent(
"should return Job not found for invalid job ID",
async () => {
const response = await request(TEST_URL)
.get("/v0/crawl/status/invalidJobId")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(404);
});
}
);
it.concurrent("should return a successful crawl status response for a valid crawl job", async () => {
it.concurrent(
"should return a successful crawl status response for a valid crawl job",
async () => {
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -543,15 +690,24 @@ describe("E2E Tests for v0 API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
);
expect(
completedResponse.body.data[0].metadata.pageError
).toBeUndefined();
const childrenLinks = completedResponse.body.data.filter(doc =>
doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
const childrenLinks = completedResponse.body.data.filter(
(doc) =>
doc.metadata &&
doc.metadata.sourceURL &&
doc.metadata.sourceURL.includes("mendable.ai/blog")
);
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
}, 180000); // 180 seconds
},
180000
); // 180 seconds
// TODO: review the test below
// it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => {
@ -599,7 +755,9 @@ describe("E2E Tests for v0 API Routes", () => {
// expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
// }, 180000); // 180 seconds
it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => {
it.concurrent(
"If someone cancels a crawl job, it should turn into failed status",
async () => {
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -628,22 +786,39 @@ describe("E2E Tests for v0 API Routes", () => {
expect(completedResponse.body).toHaveProperty("data");
let isNullOrEmptyArray = false;
if (completedResponse.body.data === null || completedResponse.body.data.length === 0) {
if (
completedResponse.body.data === null ||
completedResponse.body.data.length === 0
) {
isNullOrEmptyArray = true;
}
expect(isNullOrEmptyArray).toBe(true);
expect(completedResponse.body.data).toEqual(expect.arrayContaining([]));
expect(completedResponse.body).toHaveProperty("partial_data");
expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200);
expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined();
}, 60000); // 60 seconds
expect(completedResponse.body.partial_data[0]).toHaveProperty(
"content"
);
expect(completedResponse.body.partial_data[0]).toHaveProperty(
"markdown"
);
expect(completedResponse.body.partial_data[0]).toHaveProperty(
"metadata"
);
expect(
completedResponse.body.partial_data[0].metadata.pageStatusCode
).toBe(200);
expect(
completedResponse.body.partial_data[0].metadata.pageError
).toBeUndefined();
},
60000
); // 60 seconds
});
describe("POST /v0/scrape with LLM Extraction", () => {
it.concurrent("should extract data using LLM extraction mode", async () => {
it.concurrent(
"should extract data using LLM extraction mode",
async () => {
const response = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -690,6 +865,100 @@ describe("E2E Tests for v0 API Routes", () => {
expect(llmExtraction).toHaveProperty("is_open_source");
expect(llmExtraction.is_open_source).toBe(false);
expect(typeof llmExtraction.is_open_source).toBe("boolean");
}, 60000); // 60 secs
},
60000
); // 60 secs
});
// E2E coverage for the map endpoint. Every request below targets /v1/map,
// so the suite label says /v1/map (the original label incorrectly said /v0/map).
describe("POST /v1/map", () => {
  it.concurrent(
    "should return a list of links for mendable.ai without subdomains included",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "https://mendable.ai",
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("success", true);
      expect(response.body).toHaveProperty("links");
      // includeSubdomains defaults to false, so the docs subdomain must be absent.
      expect(response.body.links).not.toContain("https://docs.mendable.ai");
      expect(Array.isArray(response.body.links)).toBe(true);
      expect(response.body.links.length).toBeGreaterThan(0);
    },
    60000
  ); // 60 secs

  it.concurrent(
    "should return a list of links for a given URL with subdomains included",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "https://python.langchain.com",
          includeSubdomains: true,
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("success", true);
      expect(response.body).toHaveProperty("links");
      expect(Array.isArray(response.body.links)).toBe(true);
      expect(response.body.links.length).toBeGreaterThan(0);
    },
    60000
  ); // 60 secs

  it.concurrent(
    "should return a list of links for a given URL with subdomains and search",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "https://python.langchain.com",
          includeSubdomains: true,
          search: "agents",
        });
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("success", true);
      expect(response.body).toHaveProperty("links");
      // Search results are prioritized, so this known agents page should appear.
      expect(response.body.links).toContain(
        "https://api.python.langchain.com/en/latest/_modules/langchain/agents/openai_functions_agent/base.html"
      );
      expect(Array.isArray(response.body.links)).toBe(true);
      expect(response.body.links.length).toBeGreaterThan(0);
      // All results must stay on the requested domain (subdomains allowed).
      response.body.links.forEach((link: string) => {
        expect(link).toContain("python.langchain.com");
      });
    },
    60000
  ); // 60 secs

  it.concurrent(
    "should handle invalid URL input gracefully",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "invalid-url",
          includeSubdomains: true,
          search: "agents",
        });
      // Schema validation rejects the malformed URL with a 400 and details payload.
      expect(response.statusCode).toBe(400);
      expect(response.body).toHaveProperty("success", false);
      expect(response.body).toHaveProperty("details");
    },
    60000
  ); // 60 secs
});
});

View File

@ -41,7 +41,8 @@ export async function mapController(
const crawler = crawlToCrawler(id, sc);
const sitemap = sc.crawlerOptions.ignoreSitemap
const sitemap =
sc.crawlerOptions.ignoreSitemap || req.body.search
? null
: await crawler.tryGetSitemap();
@ -51,13 +52,23 @@ export async function mapController(
});
}
const mapResults = await fireEngineMap(`site:${req.body.url}`, {
let mapUrl = req.body.search
? `"${req.body.search}" site:${req.body.url}`
: `site:${req.body.url}`;
console.log(mapUrl);
// www. seems to exclude subdomains in some cases
const mapResults = await fireEngineMap(mapUrl, {
numResults: 50,
});
console.log(mapResults);
if (mapResults.length > 0) {
mapResults.map((x) => {
if (req.body.search) {
links.unshift(x.url);
} else {
links.push(x.url);
}
});
}

View File

@ -3,9 +3,16 @@ import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { PageOptions } from "../../lib/entities";
export type Format = "markdown" | "html" | "rawHtml" | "links" | "screenshot" | "screenshot@fullPage";
export type Format =
| "markdown"
| "html"
| "rawHtml"
| "links"
| "screenshot"
| "screenshot@fullPage";
const url = z.preprocess(x => {
const url = z.preprocess(
(x) => {
if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) {
if (x.startsWith("://")) {
return "http" + x;
@ -15,10 +22,27 @@ const url = z.preprocess(x => {
} else {
return x;
}
}, z.string().url().regex(/^https?:\/\//, "URL uses unsupported protocol").refine(x => !isUrlBlocked(x), "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."));
},
z
.string()
.url()
.regex(/^https?:\/\//, "URL uses unsupported protocol")
.refine(
(x) => !isUrlBlocked(x),
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
)
);
export const scrapeOptions = z.object({
formats: z.enum(["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"])
formats: z
.enum([
"markdown",
"html",
"rawHtml",
"links",
"screenshot",
"screenshot@fullPage",
])
.array()
.optional()
.default(["markdown"]),
@ -34,7 +58,7 @@ export const scrapeOptions = z.object({
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
export const scrapeRequestSchema = scrapeOptions.extend({
url: z.string().url(),
url,
origin: z.string().optional().default("api"),
});
@ -90,10 +114,10 @@ export const crawlRequestSchema = z.object({
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
export const mapRequestSchema = crawlerOptions.extend({
url,
url: z.string().url(),
origin: z.string().optional().default("api"),
includeSubdomains: z.boolean().default(false),
searchEngine: z.string().optional(),
search: z.string().optional(),
});
// export type MapRequest = {
@ -104,11 +128,11 @@ export const mapRequestSchema = crawlerOptions.extend({
export type MapRequest = z.infer<typeof mapRequestSchema>;
export type Document = {
markdown?: string,
html?: string,
rawHtml?: string,
links?: string[],
screenshot?: string,
markdown?: string;
html?: string;
rawHtml?: string;
links?: string[];
screenshot?: string;
metadata: {
title?: string;
description?: string;
@ -142,8 +166,8 @@ export type Document = {
sourceURL?: string;
statusCode?: number;
error?: string;
},
}
};
};
export type ErrorResponse = {
success: false;
@ -151,11 +175,13 @@ export type ErrorResponse = {
details?: any;
};
export type ScrapeResponse = ErrorResponse | {
export type ScrapeResponse =
| ErrorResponse
| {
success: true;
warning?: string;
data: Document;
};
};
export interface ScrapeResponseRequestTest {
statusCode: number;
@ -163,40 +189,54 @@ export interface ScrapeResponseRequestTest {
error?: string;
}
export type CrawlResponse = ErrorResponse | {
export type CrawlResponse =
| ErrorResponse
| {
success: true;
id: string;
url: string;
}
};
export type MapResponse = ErrorResponse | {
export type MapResponse =
| ErrorResponse
| {
success: true;
links: string[];
}
};
export type CrawlStatusParams = {
jobId: string;
}
};
export type CrawlStatusResponse = ErrorResponse | {
status: "scraping" | "completed" | "failed" | "cancelled",
export type CrawlStatusResponse =
| ErrorResponse
| {
status: "scraping" | "completed" | "failed" | "cancelled";
totalCount: number;
creditsUsed: number;
expiresAt: string;
next?: string;
data: Document[];
}
};
type AuthObject = {
team_id: string;
plan: string;
}
};
export interface RequestWithMaybeAuth<ReqParams = {}, ReqBody = undefined, ResBody = undefined> extends Request<ReqParams, ReqBody, ResBody> {
export interface RequestWithMaybeAuth<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined
> extends Request<ReqParams, ReqBody, ResBody> {
auth?: AuthObject;
}
export interface RequestWithAuth<ReqParams = {}, ReqBody = undefined, ResBody = undefined> extends Request<ReqParams, ReqBody, ResBody> {
export interface RequestWithAuth<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined
> extends Request<ReqParams, ReqBody, ResBody> {
auth: AuthObject;
}
@ -225,7 +265,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
includeLinks: x.formats.includes("links"),
screenshot: x.formats.includes("screenshot"),
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
parsePDF: x.parsePDF
parsePDF: x.parsePDF,
};
}
@ -243,5 +283,5 @@ export function legacyDocumentConverter(doc: any): Document {
error: doc.metadata.pageError,
statusCode: doc.metadata.pageStatusCode,
},
}
};
}