Nick: revert trailing comma

Nicolas 2024-12-11 19:51:08 -03:00
parent 52f2e733e2
commit 8a1c404918
121 changed files with 1965 additions and 1952 deletions


@@ -1,3 +1,3 @@
 {
-  "trailingComma": "none"
+  "trailingComma": "all"
 }
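This one-line config change is what drives every other hunk in the commit: with "trailingComma": "all", Prettier puts a comma after the last element of any multi-line array, object literal, or argument list, while "none" strips them all. The hunk above is marked the way the rest of this diff should be read: in each pair of near-identical lines below, the first (comma-less) line is the removed version and the second (comma-bearing) line is the added one. As a minimal sketch of the setting's effect (the function and values here are hypothetical, not from this repo):

// With "trailingComma": "none"
fetchPage(
  "https://example.com",
  { timeout: 30000 }
);

// With "trailingComma": "all"
fetchPage(
  "https://example.com",
  { timeout: 30000 },
);

A 121-file sweep like this is presumably the result of re-running the formatter (e.g. npx prettier --write .) after flipping the option.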


@@ -3,7 +3,7 @@ import dotenv from "dotenv";
import {
FirecrawlCrawlResponse,
FirecrawlCrawlStatusResponse,
FirecrawlScrapeResponse
FirecrawlScrapeResponse,
} from "../../types";
dotenv.config();
@@ -23,9 +23,9 @@ describe("E2E Tests for Extract API Routes", () => {
schema: {
type: "object",
properties: {
authors: { type: "array", items: { type: "string" } }
}
}
authors: { type: "array", items: { type: "string" } },
},
},
});
console.log(response.body);
@@ -45,7 +45,7 @@ describe("E2E Tests for Extract API Routes", () => {
expect(gotItRight).toBeGreaterThan(1);
},
60000
60000,
);
it.concurrent(
@@ -62,9 +62,9 @@ describe("E2E Tests for Extract API Routes", () => {
schema: {
type: "object",
properties: {
founders: { type: "array", items: { type: "string" } }
}
}
founders: { type: "array", items: { type: "string" } },
},
},
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
@@ -83,7 +83,7 @@ describe("E2E Tests for Extract API Routes", () => {
expect(gotItRight).toBeGreaterThanOrEqual(2);
},
60000
60000,
);
it.concurrent(
@@ -100,10 +100,10 @@ describe("E2E Tests for Extract API Routes", () => {
schema: {
type: "array",
items: {
type: "string"
type: "string",
},
required: ["items"],
},
required: ["items"]
}
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
@@ -118,7 +118,7 @@ describe("E2E Tests for Extract API Routes", () => {
expect(gotItRight).toBeGreaterThan(2);
},
60000
60000,
);
it.concurrent(
@@ -135,15 +135,15 @@ describe("E2E Tests for Extract API Routes", () => {
schema: {
type: "object",
properties: {
pciDssCompliance: { type: "boolean" }
}
}
pciDssCompliance: { type: "boolean" },
},
},
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data?.pciDssCompliance).toBe(true);
},
60000
60000,
);
it.concurrent(
@@ -163,10 +163,10 @@ describe("E2E Tests for Extract API Routes", () => {
properties: {
connector: { type: "string" },
description: { type: "string" },
supportsCaptureDelete: { type: "boolean" }
}
}
}
supportsCaptureDelete: { type: "boolean" },
},
},
},
});
console.log(response.body);
@@ -174,7 +174,7 @@ describe("E2E Tests for Extract API Routes", () => {
// expect(response.body).toHaveProperty("data");
// expect(response.body.data?.pciDssCompliance).toBe(true);
},
60000
60000,
);
it.concurrent(
@@ -186,17 +186,17 @@ describe("E2E Tests for Extract API Routes", () => {
.set("Content-Type", "application/json")
.send({
urls: [
"https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003"
"https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003",
],
prompt: "what applicant tracking system is this company using?",
schema: {
type: "object",
properties: {
isGreenhouseATS: { type: "boolean" },
answer: { type: "string" }
}
answer: { type: "string" },
},
allowExternalLinks: true
},
allowExternalLinks: true,
});
console.log(response.body);
@@ -204,7 +204,7 @@ describe("E2E Tests for Extract API Routes", () => {
expect(response.body).toHaveProperty("data");
expect(response.body.data?.isGreenhouseATS).toBe(true);
},
60000
60000,
);
it.concurrent(
@@ -222,12 +222,12 @@ describe("E2E Tests for Extract API Routes", () => {
items: {
type: "object",
properties: {
component: { type: "string" }
}
component: { type: "string" },
},
required: ["items"]
},
allowExternalLinks: true
required: ["items"],
},
allowExternalLinks: true,
});
console.log(response.body.data?.items);
@@ -248,7 +248,7 @@ describe("E2E Tests for Extract API Routes", () => {
}
expect(gotItRight).toBeGreaterThan(2);
},
60000
60000,
);
it.concurrent(
@@ -267,11 +267,11 @@ describe("E2E Tests for Extract API Routes", () => {
properties: {
name: { type: "string" },
work: { type: "string" },
education: { type: "string" }
education: { type: "string" },
},
required: ["name", "work", "education"]
required: ["name", "work", "education"],
},
allowExternalLinks: true
allowExternalLinks: true,
});
console.log(response.body.data);
@@ -281,7 +281,7 @@ describe("E2E Tests for Extract API Routes", () => {
expect(response.body.data?.work).toBeDefined();
expect(response.body.data?.education).toBeDefined();
},
60000
60000,
);
it.concurrent(
@@ -293,7 +293,7 @@ describe("E2E Tests for Extract API Routes", () => {
.set("Content-Type", "application/json")
.send({
urls: ["https://docs.firecrawl.dev"],
prompt: "What is the title and description of the page?"
prompt: "What is the title and description of the page?",
});
console.log(response.body.data);
@@ -302,6 +302,6 @@ describe("E2E Tests for Extract API Routes", () => {
expect(typeof response.body.data).toBe("object");
expect(Object.keys(response.body.data).length).toBeGreaterThan(0);
},
60000
60000,
);
});
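One recurring pair in these test hunks deserves a note: the bare 60000 values are Jest's per-test timeout, passed as the third argument to it.concurrent, so under "trailingComma": "all" each one now ends with a comma before the closing parenthesis. Schematically (the test name and body are illustrative):

it.concurrent(
  "some long-running extract request",
  async () => {
    // assertions against the API response go here
  },
  60000, // per-test timeout in ms; the trailing comma is the entire change
);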


@@ -47,7 +47,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
}
},
);
it.concurrent("should return an error for a blocklisted URL", async () => {
@@ -59,7 +59,7 @@ describe("E2E Tests for API Routes", () => {
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
});
@@ -103,30 +103,30 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.metadata.pageError).toBeUndefined();
expect(response.body.data.metadata.title).toBe("Roast My Website");
expect(response.body.data.metadata.description).toBe(
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
);
expect(response.body.data.metadata.keywords).toBe(
"Roast My Website,Roast,Website,GitHub,Firecrawl"
"Roast My Website,Roast,Website,GitHub,Firecrawl",
);
expect(response.body.data.metadata.robots).toBe("follow, index");
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
expect(response.body.data.metadata.ogDescription).toBe(
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
);
expect(response.body.data.metadata.ogUrl).toBe(
"https://www.roastmywebsite.ai"
"https://www.roastmywebsite.ai",
);
expect(response.body.data.metadata.ogImage).toBe(
"https://www.roastmywebsite.ai/og.png"
"https://www.roastmywebsite.ai/og.png",
);
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
expect(response.body.data.metadata.sourceURL).toBe(
"https://roastmywebsite.ai"
"https://roastmywebsite.ai",
);
expect(response.body.data.metadata.pageStatusCode).toBe(200);
},
30000
30000,
); // 30 seconds timeout
it.concurrent(
@@ -138,7 +138,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://roastmywebsite.ai",
pageOptions: { includeHtml: true }
pageOptions: { includeHtml: true },
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
@@ -152,7 +152,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined();
},
30000
30000,
); // 30 seconds timeout
it.concurrent(
@@ -164,7 +164,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://roastmywebsite.ai",
pageOptions: { includeRawHtml: true }
pageOptions: { includeRawHtml: true },
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
@@ -178,7 +178,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined();
},
30000
30000,
); // 30 seconds timeout
it.concurrent(
@@ -196,12 +196,12 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.content).toContain(
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
"We present spectrophotometric observations of the Broad Line Radio Galaxy",
);
expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined();
},
60000
60000,
); // 60 seconds
it.concurrent(
@@ -219,12 +219,12 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.content).toContain(
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
"We present spectrophotometric observations of the Broad Line Radio Galaxy",
);
expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined();
},
60000
60000,
); // 60 seconds
it.concurrent(
@@ -236,7 +236,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
pageOptions: { parsePDF: false }
pageOptions: { parsePDF: false },
});
await new Promise((r) => setTimeout(r, 6000));
@@ -245,10 +245,10 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.content).toContain(
"/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj"
"/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj",
);
},
60000
60000,
); // 60 seconds
it.concurrent(
@@ -266,16 +266,16 @@ describe("E2E Tests for API Routes", () => {
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
expect(responseWithoutRemoveTags.body.data.content).toContain(
"Scrape This Site"
"Scrape This Site",
);
expect(responseWithoutRemoveTags.body.data.content).toContain(
"Lessons and Videos"
"Lessons and Videos",
); // #footer
expect(responseWithoutRemoveTags.body.data.content).toContain(
"[Sandbox]("
"[Sandbox](",
); // .nav
expect(responseWithoutRemoveTags.body.data.content).toContain(
"web scraping"
"web scraping",
); // strong
const response = await request(TEST_URL)
@@ -284,7 +284,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://www.scrapethissite.com/",
pageOptions: { removeTags: [".nav", "#footer", "strong"] }
pageOptions: { removeTags: [".nav", "#footer", "strong"] },
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
@@ -297,7 +297,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
expect(response.body.data.content).not.toContain("web scraping"); // strong
},
30000
30000,
); // 30 seconds timeout
// TODO: add this test back once we nail the waitFor option to be more deterministic
@@ -337,10 +337,10 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(400);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
"bad request"
"bad request",
);
},
60000
60000,
); // 60 seconds
it.concurrent(
@@ -359,10 +359,10 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(401);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
"unauthorized"
"unauthorized",
);
},
60000
60000,
); // 60 seconds
it.concurrent(
@@ -381,10 +381,10 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(403);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
"forbidden"
"forbidden",
);
},
60000
60000,
); // 60 seconds
it.concurrent(
@@ -403,10 +403,10 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(404);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
"not found"
"not found",
);
},
60000
60000,
); // 60 seconds
it.concurrent(
@@ -425,10 +425,10 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(405);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
"method not allowed"
"method not allowed",
);
},
60000
60000,
); // 60 seconds
it.concurrent(
@@ -447,10 +447,10 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(500);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
"internal server error"
"internal server error",
);
},
60000
60000,
); // 60 seconds
});
@@ -469,7 +469,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
}
},
);
it.concurrent("should return an error for a blocklisted URL", async () => {
@@ -481,7 +481,7 @@ describe("E2E Tests for API Routes", () => {
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
});
@@ -496,9 +496,9 @@ describe("E2E Tests for API Routes", () => {
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("jobId");
expect(response.body.jobId).toMatch(
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
);
}
},
);
it.concurrent(
"should prevent duplicate requests using the same idempotency key",
@@ -525,7 +525,7 @@ describe("E2E Tests for API Routes", () => {
expect(secondResponse.statusCode).toBe(409);
expect(secondResponse.body.error).toBe("Idempotency key already used");
}
},
);
it.concurrent(
@@ -539,8 +539,8 @@ describe("E2E Tests for API Routes", () => {
url: "https://mendable.ai",
limit: 10,
crawlerOptions: {
includes: ["blog/*"]
}
includes: ["blog/*"],
},
});
let response;
@@ -563,7 +563,7 @@ describe("E2E Tests for API Routes", () => {
const completedResponse = response;
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
(item: any) => item.metadata?.sourceURL,
);
expect(urls.length).toBeGreaterThan(5);
urls.forEach((url: string) => {
@@ -579,13 +579,13 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
200,
);
expect(
completedResponse.body.data[0].metadata.pageError
completedResponse.body.data[0].metadata.pageError,
).toBeUndefined();
},
60000
60000,
); // 60 seconds
it.concurrent(
@@ -599,8 +599,8 @@ describe("E2E Tests for API Routes", () => {
url: "https://mendable.ai",
limit: 10,
crawlerOptions: {
excludes: ["blog/*"]
}
excludes: ["blog/*"],
},
});
let isFinished = false;
@@ -623,14 +623,14 @@ describe("E2E Tests for API Routes", () => {
const completedResponse = response;
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
(item: any) => item.metadata?.sourceURL,
);
expect(urls.length).toBeGreaterThan(5);
urls.forEach((url: string) => {
expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
});
},
90000
90000,
); // 90 seconds
it.concurrent(
@@ -642,7 +642,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://mendable.ai",
crawlerOptions: { limit: 3 }
crawlerOptions: { limit: 3 },
});
let isFinished = false;
@@ -674,13 +674,13 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
200,
);
expect(
completedResponse.body.data[0].metadata.pageError
completedResponse.body.data[0].metadata.pageError,
).toBeUndefined();
},
60000
60000,
); // 60 seconds
it.concurrent(
@@ -692,7 +692,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://www.scrapethissite.com",
crawlerOptions: { maxDepth: 1 }
crawlerOptions: { maxDepth: 1 },
});
expect(crawlResponse.statusCode).toBe(200);
@@ -726,13 +726,13 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
200,
);
expect(
completedResponse.body.data[0].metadata.pageError
completedResponse.body.data[0].metadata.pageError,
).toBeUndefined();
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
(item: any) => item.metadata?.sourceURL,
);
expect(urls.length).toBeGreaterThan(1);
@@ -748,7 +748,7 @@ describe("E2E Tests for API Routes", () => {
expect(depth).toBeLessThanOrEqual(2);
});
},
180000
180000,
);
it.concurrent(
@@ -760,7 +760,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://www.scrapethissite.com/pages/",
crawlerOptions: { maxDepth: 1 }
crawlerOptions: { maxDepth: 1 },
});
expect(crawlResponse.statusCode).toBe(200);
@@ -794,7 +794,7 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
(item: any) => item.metadata?.sourceURL,
);
expect(urls.length).toBeGreaterThan(1);
@@ -810,7 +810,7 @@ describe("E2E Tests for API Routes", () => {
expect(depth).toBeLessThanOrEqual(3);
});
},
180000
180000,
);
it.concurrent(
@@ -822,7 +822,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://www.mendable.ai",
crawlerOptions: { maxDepth: 0 }
crawlerOptions: { maxDepth: 0 },
});
expect(crawlResponse.statusCode).toBe(200);
@@ -849,7 +849,7 @@ describe("E2E Tests for API Routes", () => {
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
const testurls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
(item: any) => item.metadata?.sourceURL,
);
//console.log(testurls)
@@ -861,7 +861,7 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
(item: any) => item.metadata?.sourceURL,
);
expect(urls.length).toBeGreaterThanOrEqual(1);
@@ -877,7 +877,7 @@ describe("E2E Tests for API Routes", () => {
expect(depth).toBeLessThanOrEqual(1);
});
},
180000
180000,
);
// it.concurrent("should return a successful response with a valid API key and valid limit option", async () => {
@@ -934,7 +934,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://roastmywebsite.ai",
pageOptions: { includeHtml: true }
pageOptions: { includeHtml: true },
});
expect(crawlResponse.statusCode).toBe(200);
@@ -969,10 +969,10 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
200,
);
expect(
completedResponse.body.data[0].metadata.pageError
completedResponse.body.data[0].metadata.pageError,
).toBeUndefined();
// 120 seconds
@@ -983,13 +983,13 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0].html).toContain("<h1");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
200,
);
expect(
completedResponse.body.data[0].metadata.pageError
completedResponse.body.data[0].metadata.pageError,
).toBeUndefined();
},
180000
180000,
);
it.concurrent(
@@ -1005,8 +1005,8 @@ describe("E2E Tests for API Routes", () => {
allowExternalContentLinks: true,
ignoreSitemap: true,
returnOnlyUrls: true,
limit: 50
}
limit: 50,
},
});
expect(crawlInitResponse.statusCode).toBe(200);
@@ -1031,19 +1031,19 @@ describe("E2E Tests for API Routes", () => {
expect.arrayContaining([
expect.objectContaining({
url: expect.stringContaining(
"https://firecrawl.dev/?ref=mendable+banner"
)
"https://firecrawl.dev/?ref=mendable+banner",
),
}),
expect.objectContaining({
url: expect.stringContaining("https://mendable.ai/pricing")
url: expect.stringContaining("https://mendable.ai/pricing"),
}),
expect.objectContaining({
url: expect.stringContaining("https://x.com/CalebPeffer")
})
])
url: expect.stringContaining("https://x.com/CalebPeffer"),
}),
]),
);
},
180000
180000,
); // 3 minutes timeout
});
@@ -1062,7 +1062,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
}
},
);
// it.concurrent("should return an error for a blocklisted URL", async () => {
@@ -1088,7 +1088,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.statusCode).toBe(408);
},
3000
3000,
);
// it.concurrent("should return a successful response with a valid API key for crawlWebsitePreview", async () => {
@@ -1120,7 +1120,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({ query: "test" });
expect(response.statusCode).toBe(401);
}
},
);
it.concurrent(
@@ -1136,7 +1136,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.success).toBe(true);
expect(response.body).toHaveProperty("data");
},
30000
30000,
); // 30 seconds timeout
});
@@ -1153,7 +1153,7 @@ describe("E2E Tests for API Routes", () => {
.get("/v0/crawl/status/123")
.set("Authorization", `Bearer invalid-api-key`);
expect(response.statusCode).toBe(401);
}
},
);
it.concurrent(
@@ -1163,7 +1163,7 @@ describe("E2E Tests for API Routes", () => {
.get("/v0/crawl/status/invalidJobId")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(404);
}
},
);
it.concurrent(
@@ -1201,22 +1201,22 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
200,
);
expect(
completedResponse.body.data[0].metadata.pageError
completedResponse.body.data[0].metadata.pageError,
).toBeUndefined();
const childrenLinks = completedResponse.body.data.filter(
(doc) =>
doc.metadata &&
doc.metadata.sourceURL &&
doc.metadata.sourceURL.includes("mendable.ai/blog")
doc.metadata.sourceURL.includes("mendable.ai/blog"),
);
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
},
180000
180000,
); // 120 seconds
it.concurrent(
@@ -1236,9 +1236,9 @@ describe("E2E Tests for API Routes", () => {
"abs/*",
"static/*",
"about/*",
"archive/*"
]
}
"archive/*",
],
},
});
expect(crawlResponse.statusCode).toBe(200);
@@ -1266,21 +1266,21 @@ describe("E2E Tests for API Routes", () => {
expect.arrayContaining([
expect.objectContaining({
content: expect.stringContaining(
"asymmetries might represent, for instance, preferred source orientations to our line of sight."
)
})
])
"asymmetries might represent, for instance, preferred source orientations to our line of sight.",
),
}),
]),
);
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
200,
);
expect(
completedResponse.body.data[0].metadata.pageError
completedResponse.body.data[0].metadata.pageError,
).toBeUndefined();
},
180000
180000,
); // 120 seconds
it.concurrent(
@@ -1292,7 +1292,7 @@ describe("E2E Tests for API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://roastmywebsite.ai",
pageOptions: { includeHtml: true }
pageOptions: { includeHtml: true },
});
expect(crawlResponse.statusCode).toBe(200);
@@ -1333,13 +1333,13 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
expect(completedResponse.body.data[0].html).toContain("<h1");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
200,
);
expect(
completedResponse.body.data[0].metadata.pageError
completedResponse.body.data[0].metadata.pageError,
).toBeUndefined();
},
60000
60000,
);
}); // 60 seconds
@@ -1353,7 +1353,7 @@ describe("E2E Tests for API Routes", () => {
.send({
url: "https://mendable.ai/blog",
pageOptions: { includeHtml: true },
crawlerOptions: { allowBackwardCrawling: true }
crawlerOptions: { allowBackwardCrawling: true },
});
expect(crawlResponse.statusCode).toBe(200);
@@ -1397,10 +1397,10 @@ describe("E2E Tests for API Routes", () => {
});
expect(completedResponse.body.data.length).toBeGreaterThan(
onlyChildrenLinks.length
onlyChildrenLinks.length,
);
},
60000
60000,
);
it.concurrent(
@@ -1438,13 +1438,13 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
expect(
completedResponse.body.partial_data[0].metadata.pageStatusCode
completedResponse.body.partial_data[0].metadata.pageStatusCode,
).toBe(200);
expect(
completedResponse.body.partial_data[0].metadata.pageError
completedResponse.body.partial_data[0].metadata.pageError,
).toBeUndefined();
},
60000
60000,
); // 60 seconds
describe("POST /v0/scrape with LLM Extraction", () => {
@@ -1458,7 +1458,7 @@ describe("E2E Tests for API Routes", () => {
.send({
url: "https://mendable.ai",
pageOptions: {
onlyMainContent: true
onlyMainContent: true,
},
extractorOptions: {
mode: "llm-extraction",
@@ -1468,18 +1468,18 @@ describe("E2E Tests for API Routes", () => {
type: "object",
properties: {
company_mission: {
type: "string"
type: "string",
},
supports_sso: {
type: "boolean"
type: "boolean",
},
is_open_source: {
type: "boolean"
}
type: "boolean",
},
},
required: ["company_mission", "supports_sso", "is_open_source"],
},
},
required: ["company_mission", "supports_sso", "is_open_source"]
}
}
});
// Ensure that the job was successfully created before proceeding with LLM extraction
@@ -1498,7 +1498,7 @@ describe("E2E Tests for API Routes", () => {
expect(llmExtraction.is_open_source).toBe(false);
expect(typeof llmExtraction.is_open_source).toBe("boolean");
},
60000
60000,
); // 60 secs
it.concurrent(
@@ -1519,15 +1519,15 @@ describe("E2E Tests for API Routes", () => {
type: "object",
properties: {
primary_cta: {
type: "string"
type: "string",
},
secondary_cta: {
type: "string"
}
type: "string",
},
},
required: ["primary_cta", "secondary_cta"],
},
},
required: ["primary_cta", "secondary_cta"]
}
}
});
// Ensure that the job was successfully created before proceeding with LLM extraction
@@ -1542,7 +1542,7 @@ describe("E2E Tests for API Routes", () => {
expect(llmExtraction).toHaveProperty("secondary_cta");
expect(typeof llmExtraction.secondary_cta).toBe("string");
},
60000
60000,
); // 60 secs
});
@@ -1617,8 +1617,8 @@ describe("E2E Tests for API Routes", () => {
.send({
url: "https://flutterbricks.com",
crawlerOptions: {
mode: "fast"
}
mode: "fast",
},
});
expect(crawlResponse.statusCode).toBe(200);
@@ -1660,7 +1660,7 @@ describe("E2E Tests for API Routes", () => {
expect(results.length).toBeGreaterThanOrEqual(10);
expect(results.length).toBeLessThanOrEqual(15);
},
20000
20000,
);
// it.concurrent("should complete the crawl in more than 10 seconds", async () => {
@@ -1741,7 +1741,7 @@ describe("E2E Tests for API Routes", () => {
expect(response.statusCode).toBe(429);
},
90000
90000,
);
});


@@ -15,7 +15,7 @@ describe("E2E Tests for Map API Routes", () => {
.send({
url: "https://firecrawl.dev",
sitemapOnly: false,
search: "smart-crawl"
search: "smart-crawl",
});
console.log(response.body);
@@ -24,7 +24,7 @@ describe("E2E Tests for Map API Routes", () => {
expect(response.body.links.length).toBeGreaterThan(0);
expect(response.body.links[0]).toContain("firecrawl.dev/smart-crawl");
},
60000
60000,
);
it.concurrent(
@@ -37,7 +37,7 @@ describe("E2E Tests for Map API Routes", () => {
.send({
url: "https://firecrawl.dev",
sitemapOnly: false,
includeSubdomains: true
includeSubdomains: true,
});
console.log(response.body);
@@ -45,10 +45,10 @@ describe("E2E Tests for Map API Routes", () => {
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(0);
expect(response.body.links[response.body.links.length - 1]).toContain(
"docs.firecrawl.dev"
"docs.firecrawl.dev",
);
},
60000
60000,
);
it.concurrent(
@@ -60,7 +60,7 @@ describe("E2E Tests for Map API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
sitemapOnly: true
sitemapOnly: true,
});
console.log(response.body);
@@ -68,10 +68,10 @@ describe("E2E Tests for Map API Routes", () => {
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(0);
expect(response.body.links[response.body.links.length - 1]).not.toContain(
"docs.firecrawl.dev"
"docs.firecrawl.dev",
);
},
60000
60000,
);
it.concurrent(
@@ -84,7 +84,7 @@ describe("E2E Tests for Map API Routes", () => {
.send({
url: "https://firecrawl.dev",
sitemapOnly: false,
limit: 10
limit: 10,
});
console.log(response.body);
@@ -92,7 +92,7 @@ describe("E2E Tests for Map API Routes", () => {
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeLessThanOrEqual(10);
},
60000
60000,
);
it.concurrent(
@@ -104,7 +104,7 @@ describe("E2E Tests for Map API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://geekflare.com/sitemap_index.xml",
sitemapOnly: true
sitemapOnly: true,
});
console.log(response.body);
@@ -112,6 +112,6 @@ describe("E2E Tests for Map API Routes", () => {
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(1900);
},
60000
60000,
);
});


@@ -62,7 +62,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
});
@@ -89,7 +89,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
});
@@ -101,7 +101,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("jobId");
expect(response.body.jobId).toMatch(
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
);
});
});
@@ -120,7 +120,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
.send({ url: blocklistedUrl });
expect(response.statusCode).toBe(403);
expect(response.body.error).toContain(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
});
@@ -132,7 +132,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("jobId");
expect(response.body.jobId).toMatch(
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
);
});
});
@@ -172,7 +172,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
it("should return Job not found for invalid job ID", async () => {
const response = await request(TEST_URL).get(
"/v0/crawl/status/invalidJobId"
"/v0/crawl/status/invalidJobId",
);
expect(response.statusCode).toBe(404);
});
@@ -185,7 +185,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
expect(crawlResponse.statusCode).toBe(200);
const response = await request(TEST_URL).get(
`/v0/crawl/status/${crawlResponse.body.jobId}`
`/v0/crawl/status/${crawlResponse.body.jobId}`,
);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
@@ -195,7 +195,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
await new Promise((r) => setTimeout(r, 30000));
const completedResponse = await request(TEST_URL).get(
`/v0/crawl/status/${crawlResponse.body.jobId}`
`/v0/crawl/status/${crawlResponse.body.jobId}`,
);
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");


@@ -2,7 +2,7 @@ import request from "supertest";
import { configDotenv } from "dotenv";
import {
ScrapeRequestInput,
ScrapeResponseRequestTest
ScrapeResponseRequestTest,
} from "../../controllers/v1/types";
configDotenv();
@@ -24,7 +24,7 @@ describe("E2E Tests for v1 API Routes", () => {
console.log(
"process.env.USE_DB_AUTHENTICATION",
process.env.USE_DB_AUTHENTICATION
process.env.USE_DB_AUTHENTICATION,
);
console.log("?", process.env.USE_DB_AUTHENTICATION === "true");
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
@@ -47,7 +47,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent("should throw error for blocklisted URL", async () => {
const scrapeRequest: ScrapeRequestInput = {
url: "https://facebook.com/fake-test"
url: "https://facebook.com/fake-test",
};
const response = await request(TEST_URL)
@@ -58,7 +58,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.statusCode).toBe(403);
expect(response.body.error).toBe(
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.",
);
});
@@ -71,14 +71,14 @@ describe("E2E Tests for v1 API Routes", () => {
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
}
},
);
it.concurrent(
"should return a successful response with a valid API key",
async () => {
const scrapeRequest: ScrapeRequestInput = {
url: "https://roastmywebsite.ai"
url: "https://roastmywebsite.ai",
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -100,37 +100,37 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data.metadata.error).toBeUndefined();
expect(response.body.data.metadata.title).toBe("Roast My Website");
expect(response.body.data.metadata.description).toBe(
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
);
expect(response.body.data.metadata.keywords).toBe(
"Roast My Website,Roast,Website,GitHub,Firecrawl"
"Roast My Website,Roast,Website,GitHub,Firecrawl",
);
expect(response.body.data.metadata.robots).toBe("follow, index");
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
expect(response.body.data.metadata.ogDescription).toBe(
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
);
expect(response.body.data.metadata.ogUrl).toBe(
"https://www.roastmywebsite.ai"
"https://www.roastmywebsite.ai",
);
expect(response.body.data.metadata.ogImage).toBe(
"https://www.roastmywebsite.ai/og.png"
"https://www.roastmywebsite.ai/og.png",
);
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
expect(response.body.data.metadata.sourceURL).toBe(
"https://roastmywebsite.ai"
"https://roastmywebsite.ai",
);
expect(response.body.data.metadata.statusCode).toBe(200);
},
30000
30000,
); // 30 seconds timeout
it.concurrent(
"should return a successful response with a valid API key",
async () => {
const scrapeRequest: ScrapeRequestInput = {
url: "https://arxiv.org/abs/2410.04840"
url: "https://arxiv.org/abs/2410.04840",
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -151,43 +151,43 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data.markdown).toContain("Strong Model Collapse");
expect(response.body.data.metadata.error).toBeUndefined();
expect(response.body.data.metadata.description).toContain(
"Abstract page for arXiv paper 2410.04840: Strong Model Collapse"
"Abstract page for arXiv paper 2410.04840: Strong Model Collapse",
);
expect(response.body.data.metadata.citation_title).toBe(
"Strong Model Collapse"
"Strong Model Collapse",
);
expect(response.body.data.metadata.citation_author).toEqual([
"Dohmatob, Elvis",
"Feng, Yunzhen",
"Subramonian, Arjun",
"Kempe, Julia"
"Kempe, Julia",
]);
expect(response.body.data.metadata.citation_date).toBe("2024/10/07");
expect(response.body.data.metadata.citation_online_date).toBe(
"2024/10/08"
"2024/10/08",
);
expect(response.body.data.metadata.citation_pdf_url).toBe(
"http://arxiv.org/pdf/2410.04840"
"http://arxiv.org/pdf/2410.04840",
);
expect(response.body.data.metadata.citation_arxiv_id).toBe(
"2410.04840"
"2410.04840",
);
expect(response.body.data.metadata.citation_abstract).toContain(
"Within the scaling laws paradigm"
"Within the scaling laws paradigm",
);
expect(response.body.data.metadata.sourceURL).toBe(
"https://arxiv.org/abs/2410.04840"
"https://arxiv.org/abs/2410.04840",
);
expect(response.body.data.metadata.statusCode).toBe(200);
},
30000
30000,
);
it.concurrent(
"should return a successful response with a valid API key and includeHtml set to true",
async () => {
const scrapeRequest: ScrapeRequestInput = {
url: "https://roastmywebsite.ai",
formats: ["markdown", "html"]
formats: ["markdown", "html"],
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -209,13 +209,13 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data.metadata.statusCode).toBe(200);
expect(response.body.data.metadata.error).toBeUndefined();
},
30000
30000,
);
it.concurrent(
"should return a successful response for a valid scrape with PDF file",
async () => {
const scrapeRequest: ScrapeRequestInput = {
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
// formats: ["markdown", "html"],
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -232,19 +232,19 @@ describe("E2E Tests for v1 API Routes", () => {
}
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.markdown).toContain(
"Broad Line Radio Galaxy"
"Broad Line Radio Galaxy",
);
expect(response.body.data.metadata.statusCode).toBe(200);
expect(response.body.data.metadata.error).toBeUndefined();
},
60000
60000,
);
it.concurrent(
"should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
async () => {
const scrapeRequest: ScrapeRequestInput = {
url: "https://arxiv.org/pdf/astro-ph/9301001"
url: "https://arxiv.org/pdf/astro-ph/9301001",
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
@@ -261,12 +261,12 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.markdown).toContain(
"Broad Line Radio Galaxy"
"Broad Line Radio Galaxy",
);
expect(response.body.data.metadata.statusCode).toBe(200);
expect(response.body.data.metadata.error).toBeUndefined();
},
60000
60000,
);
it.concurrent(
@@ -274,7 +274,7 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const scrapeRequest: ScrapeRequestInput = {
url: "https://www.scrapethissite.com/",
onlyMainContent: false // default is true
onlyMainContent: false, // default is true
};
const responseWithoutRemoveTags: ScrapeResponseRequestTest =
await request(TEST_URL)
@@ -292,16 +292,16 @@ describe("E2E Tests for v1 API Routes", () => {
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
expect(responseWithoutRemoveTags.body.data.markdown).toContain(
"[FAQ](/faq/)"
"[FAQ](/faq/)",
); // .nav
expect(responseWithoutRemoveTags.body.data.markdown).toContain(
"Hartley Brody 2023"
"Hartley Brody 2023",
); // #footer
const scrapeRequestWithRemoveTags: ScrapeRequestInput = {
url: "https://www.scrapethissite.com/",
excludeTags: [".nav", "#footer", "strong"],
onlyMainContent: false // default is true
onlyMainContent: false, // default is true
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
@@ -320,7 +320,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data.markdown).not.toContain("Hartley Brody 2023");
expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); //
},
30000
30000,
);
it.concurrent(
@@ -342,7 +342,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.statusCode).toBe(400);
},
60000
60000,
);
it.concurrent(
@@ -364,7 +364,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.statusCode).toBe(401);
},
60000
60000,
);
// Removed it as we want to retry fallback to the next scraper
@@ -405,7 +405,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.statusCode).toBe(404);
},
60000
60000,
);
// it.concurrent('should return a successful response for a scrape with 405 page', async () => {
@@ -455,7 +455,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.statusCode).toBe(408);
},
3000
3000,
);
it.concurrent(
@@ -463,7 +463,7 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const scrapeRequest: ScrapeRequestInput = {
url: "https://roastmywebsite.ai",
formats: ["html", "rawHtml"]
formats: ["html", "rawHtml"],
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -486,7 +486,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data.metadata.statusCode).toBe(200);
expect(response.body.data.metadata.error).toBeUndefined();
},
30000
30000,
);
it.concurrent(
@@ -495,7 +495,7 @@ describe("E2E Tests for v1 API Routes", () => {
const scrapeRequest: ScrapeRequestInput = {
url: "https://ycombinator.com/companies",
formats: ["markdown"],
waitFor: 8000
waitFor: 8000,
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -518,7 +518,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data.metadata.statusCode).toBe(200);
expect(response.body.data.metadata.error).toBeUndefined();
},
30000
30000,
);
it.concurrent(
@@ -526,7 +526,7 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const scrapeRequest: ScrapeRequestInput = {
url: "https://roastmywebsite.ai",
formats: ["links"]
formats: ["links"],
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -548,7 +548,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data.metadata.statusCode).toBe(200);
expect(response.body.data.metadata.error).toBeUndefined();
},
30000
30000,
);
});
@@ -569,14 +569,14 @@ describe("E2E Tests for v1 API Routes", () => {
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
}
},
);
it.concurrent(
"should return a successful response with a valid API key",
async () => {
const mapRequest = {
url: "https://roastmywebsite.ai"
url: "https://roastmywebsite.ai",
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -594,7 +594,7 @@ describe("E2E Tests for v1 API Routes", () => {
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
}
},
);
it.concurrent(
@@ -602,7 +602,7 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const mapRequest = {
url: "https://usemotion.com",
search: "pricing"
search: "pricing",
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@ -621,7 +621,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).toContain("usemotion.com/pricing");
}
},
);
it.concurrent(
@@ -630,7 +630,7 @@ describe("E2E Tests for v1 API Routes", () => {
const mapRequest = {
url: "https://firecrawl.dev",
search: "docs",
includeSubdomains: true
includeSubdomains: true,
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -650,10 +650,10 @@ describe("E2E Tests for v1 API Routes", () => {
expect(links.length).toBeGreaterThan(0);
const containsDocsFirecrawlDev = links.some((link: string) =>
link.includes("docs.firecrawl.dev")
link.includes("docs.firecrawl.dev"),
);
expect(containsDocsFirecrawlDev).toBe(true);
}
},
);
it.concurrent(
@@ -662,7 +662,7 @@ describe("E2E Tests for v1 API Routes", () => {
const mapRequest = {
url: "https://www.firecrawl.dev",
search: "docs",
includeSubdomains: true
includeSubdomains: true,
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -682,11 +682,11 @@ describe("E2E Tests for v1 API Routes", () => {
expect(links.length).toBeGreaterThan(0);
const containsDocsFirecrawlDev = links.some((link: string) =>
link.includes("docs.firecrawl.dev")
link.includes("docs.firecrawl.dev"),
);
expect(containsDocsFirecrawlDev).toBe(true);
},
10000
10000,
);
it.concurrent(
@@ -695,7 +695,7 @@ describe("E2E Tests for v1 API Routes", () => {
const mapRequest = {
url: "https://www.firecrawl.dev",
search: "docs",
includeSubdomains: false
includeSubdomains: false,
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -714,14 +714,14 @@ describe("E2E Tests for v1 API Routes", () => {
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).not.toContain("docs.firecrawl.dev");
}
},
);
it.concurrent("should return an error for invalid URL", async () => {
const mapRequest = {
url: "invalid-url",
includeSubdomains: true,
search: "test"
search: "test",
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -746,7 +746,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent("should throw error for blocklisted URL", async () => {
const scrapeRequest: ScrapeRequestInput = {
url: "https://facebook.com/fake-test"
url: "https://facebook.com/fake-test",
};
const response = await request(TEST_URL)
@@ -757,7 +757,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.statusCode).toBe(403);
expect(response.body.error).toBe(
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.",
);
});
@@ -770,7 +770,7 @@ describe("E2E Tests for v1 API Routes", () => {
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
}
},
);
it.concurrent("should return a successful response", async () => {
@@ -783,7 +783,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("id");
expect(response.body.id).toMatch(
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("url");
@@ -800,7 +800,7 @@ describe("E2E Tests for v1 API Routes", () => {
.send({
url: "https://firecrawl.dev",
limit: 40,
includePaths: ["blog/*"]
includePaths: ["blog/*"],
});
let response;
@@ -826,7 +826,7 @@ describe("E2E Tests for v1 API Routes", () => {
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
(item: any) => item.metadata?.sourceURL,
);
expect(urls.length).toBeGreaterThan(5);
urls.forEach((url: string) => {
@@ -843,7 +843,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
},
180000
180000,
); // 180 seconds
it.concurrent(
@@ -856,7 +856,7 @@ describe("E2E Tests for v1 API Routes", () => {
.send({
url: "https://firecrawl.dev",
limit: 40,
excludePaths: ["blog/*"]
excludePaths: ["blog/*"],
});
let isFinished = false;
@@ -882,14 +882,14 @@ describe("E2E Tests for v1 API Routes", () => {
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
(item: any) => item.metadata?.sourceURL,
);
expect(urls.length).toBeGreaterThan(3);
urls.forEach((url: string) => {
expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy();
});
},
90000
90000,
); // 90 seconds
it.concurrent(
@@ -901,7 +901,7 @@ describe("E2E Tests for v1 API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://www.scrapethissite.com",
maxDepth: 1
maxDepth: 1,
});
expect(crawlResponse.statusCode).toBe(200);
@@ -911,7 +911,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
expect(["active", "waiting", "completed", "scraping"]).toContain(
response.body.status
response.body.status,
);
// wait for 60 seconds
let isCompleted = false;
@@ -939,7 +939,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
(item: any) => item.metadata?.sourceURL,
);
expect(urls.length).toBeGreaterThan(1);
@@ -955,7 +955,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(depth).toBeLessThanOrEqual(2);
});
},
180000
180000,
);
});
@@ -972,7 +972,7 @@ describe("E2E Tests for v1 API Routes", () => {
.get("/v1/crawl/123")
.set("Authorization", `Bearer invalid-api-key`);
expect(response.statusCode).toBe(401);
}
},
);
it.concurrent(
@@ -982,7 +982,7 @@ describe("E2E Tests for v1 API Routes", () => {
.get("/v1/crawl/invalidJobId")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(404);
}
},
);
it.concurrent(
@@ -1026,12 +1026,12 @@ describe("E2E Tests for v1 API Routes", () => {
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
const childrenLinks = completedResponse.body.data.filter(
(doc) => doc.metadata && doc.metadata.sourceURL
(doc) => doc.metadata && doc.metadata.sourceURL,
);
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
},
180000
180000,
); // 120 seconds
it.concurrent(
@@ -1068,7 +1068,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
},
60000
60000,
); // 60 seconds
});
});
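The last file repeats one more consequence of "all" worth spelling out: even a lone argument gets a trailing comma once the call is wrapped across lines, which is why bare identifiers such as FIRECRAWL_API_URL pick one up. A schematic supertest call formatted under the new setting (the endpoint and payload are illustrative):

const response = await request(
  FIRECRAWL_API_URL, // single argument, still followed by a comma when wrapped
)
  .post("/v1/scrape")
  .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
  .set("Content-Type", "application/json")
  .send({ url: "https://example.com" });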


@@ -2,7 +2,7 @@ import request from "supertest";
import { configDotenv } from "dotenv";
import {
ScrapeRequest,
ScrapeResponseRequestTest
ScrapeResponseRequestTest,
} from "../../controllers/v1/types";
configDotenv();
@@ -14,7 +14,7 @@ describe("E2E Tests for v1 API Routes", () => {
"should return a successful response for a scrape with 403 page",
async () => {
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@@ -30,18 +30,18 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.statusCode).toBe(403);
},
30000
30000,
);
it.concurrent(
"should handle 'formats:markdown (default)' parameter correctly",
async () => {
const scrapeRequest = {
url: E2E_TEST_SERVER_URL
url: E2E_TEST_SERVER_URL,
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@@ -57,26 +57,26 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data.markdown).toContain(
"This page is used for end-to-end (e2e) testing with Firecrawl."
"This page is used for end-to-end (e2e) testing with Firecrawl.",
);
expect(response.body.data.markdown).toContain(
"Content with id #content-1"
"Content with id #content-1",
);
// expect(response.body.data.markdown).toContain("Loading...");
expect(response.body.data.markdown).toContain("Click me!");
expect(response.body.data.markdown).toContain(
"Power your AI apps with clean data crawled from any website. It's also open-source."
"Power your AI apps with clean data crawled from any website. It's also open-source.",
); // firecrawl.dev inside an iframe
expect(response.body.data.markdown).toContain(
"This content loads only when you see it. Don't blink! 👼"
"This content loads only when you see it. Don't blink! 👼",
); // the browser always scroll to the bottom
expect(response.body.data.markdown).not.toContain("Header"); // Only main content is returned by default
expect(response.body.data.markdown).not.toContain("footer"); // Only main content is returned by default
expect(response.body.data.markdown).not.toContain(
"This content is only visible on mobile"
"This content is only visible on mobile",
);
},
30000
30000,
);
it.concurrent(
@ -84,11 +84,11 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const scrapeRequest = {
url: E2E_TEST_SERVER_URL,
formats: ["html"]
formats: ["html"],
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -105,13 +105,13 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data).toHaveProperty("html");
expect(response.body.data.html).not.toContain(
'<header class="row-start-1" style="">Header</header>'
'<header class="row-start-1" style="">Header</header>',
);
expect(response.body.data.html).toContain(
'<p style="">This page is used for end-to-end (e2e) testing with Firecrawl.</p>'
'<p style="">This page is used for end-to-end (e2e) testing with Firecrawl.</p>',
);
},
30000
30000,
);
it.concurrent(
@ -119,11 +119,11 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const scrapeRequest = {
url: E2E_TEST_SERVER_URL,
formats: ["rawHtml"]
formats: ["rawHtml"],
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -140,11 +140,11 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.data).toHaveProperty("rawHtml");
expect(response.body.data.rawHtml).toContain(
">This page is used for end-to-end (e2e) testing with Firecrawl.</p>"
">This page is used for end-to-end (e2e) testing with Firecrawl.</p>",
);
expect(response.body.data.rawHtml).toContain(">Header</header>");
},
30000
30000,
);
// - TODO: tests for links
@ -157,11 +157,11 @@ describe("E2E Tests for v1 API Routes", () => {
// @ts-ignore
const scrapeRequest = {
url: E2E_TEST_SERVER_URL,
headers: { "e2e-header-test": "firecrawl" }
headers: { "e2e-header-test": "firecrawl" },
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -175,10 +175,10 @@ describe("E2E Tests for v1 API Routes", () => {
}
expect(response.body.data.markdown).toContain(
"e2e-header-test: firecrawl"
"e2e-header-test: firecrawl",
);
},
30000
30000,
);
it.concurrent(
@ -186,11 +186,11 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const scrapeRequest = {
url: E2E_TEST_SERVER_URL,
includeTags: ["#content-1"]
includeTags: ["#content-1"],
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -204,13 +204,13 @@ describe("E2E Tests for v1 API Routes", () => {
}
expect(response.body.data.markdown).not.toContain(
"<p>This page is used for end-to-end (e2e) testing with Firecrawl.</p>"
"<p>This page is used for end-to-end (e2e) testing with Firecrawl.</p>",
);
expect(response.body.data.markdown).toContain(
"Content with id #content-1"
"Content with id #content-1",
);
},
30000
30000,
);
it.concurrent(
@ -218,11 +218,11 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const scrapeRequest = {
url: E2E_TEST_SERVER_URL,
excludeTags: ["#content-1"]
excludeTags: ["#content-1"],
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -236,13 +236,13 @@ describe("E2E Tests for v1 API Routes", () => {
}
expect(response.body.data.markdown).toContain(
"This page is used for end-to-end (e2e) testing with Firecrawl."
"This page is used for end-to-end (e2e) testing with Firecrawl.",
);
expect(response.body.data.markdown).not.toContain(
"Content with id #content-1"
"Content with id #content-1",
);
},
30000
30000,
);
it.concurrent(
@ -251,11 +251,11 @@ describe("E2E Tests for v1 API Routes", () => {
const scrapeRequest = {
url: E2E_TEST_SERVER_URL,
formats: ["html", "markdown"],
onlyMainContent: false
onlyMainContent: false,
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -269,13 +269,13 @@ describe("E2E Tests for v1 API Routes", () => {
}
expect(response.body.data.markdown).toContain(
"This page is used for end-to-end (e2e) testing with Firecrawl."
"This page is used for end-to-end (e2e) testing with Firecrawl.",
);
expect(response.body.data.html).toContain(
'<header class="row-start-1" style="">Header</header>'
'<header class="row-start-1" style="">Header</header>',
);
},
30000
30000,
);
it.concurrent(
@ -283,11 +283,11 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const scrapeRequest = {
url: E2E_TEST_SERVER_URL,
timeout: 500
timeout: 500,
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -302,7 +302,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(response.body.error).toBe("Request timed out");
expect(response.body.success).toBe(false);
},
30000
30000,
);
it.concurrent(
@ -310,11 +310,11 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const scrapeRequest = {
url: E2E_TEST_SERVER_URL,
mobile: true
mobile: true,
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -327,17 +327,17 @@ describe("E2E Tests for v1 API Routes", () => {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data.markdown).toContain(
"This content is only visible on mobile"
"This content is only visible on mobile",
);
},
30000
30000,
);
it.concurrent(
"should handle 'parsePDF' parameter correctly",
async () => {
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -352,21 +352,21 @@ describe("E2E Tests for v1 API Routes", () => {
}
expect(response.body.data.markdown).toContain(
"arXiv:astro-ph/9301001v1 7 Jan 1993"
"arXiv:astro-ph/9301001v1 7 Jan 1993",
);
expect(response.body.data.markdown).not.toContain(
"h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm"
"h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm",
);
const responseNoParsePDF: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
parsePDF: false
parsePDF: false,
});
await new Promise((r) => setTimeout(r, 6000));
@ -376,10 +376,10 @@ describe("E2E Tests for v1 API Routes", () => {
throw new Error("Expected response body to have 'data' property");
}
expect(responseNoParsePDF.body.data.markdown).toContain(
"h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm"
"h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm",
);
},
30000
30000,
);
// it.concurrent("should handle 'location' parameter correctly",
@ -408,11 +408,11 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const scrapeRequest = {
url: "https://expired.badssl.com/",
timeout: 120000
timeout: 120000,
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -430,7 +430,7 @@ describe("E2E Tests for v1 API Routes", () => {
const scrapeRequestWithSkipTlsVerification = {
url: "https://expired.badssl.com/",
skipTlsVerification: true,
timeout: 120000
timeout: 120000,
} as ScrapeRequest;
const responseWithSkipTlsVerification: ScrapeResponseRequestTest =
@ -448,10 +448,10 @@ describe("E2E Tests for v1 API Routes", () => {
}
// console.log(responseWithSkipTlsVerification.body.data)
expect(responseWithSkipTlsVerification.body.data.markdown).toContain(
"badssl.com"
"badssl.com",
);
},
60000
60000,
);
it.concurrent(
@ -459,11 +459,11 @@ describe("E2E Tests for v1 API Routes", () => {
async () => {
const scrapeRequest = {
url: E2E_TEST_SERVER_URL,
removeBase64Images: true
removeBase64Images: true,
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -478,7 +478,7 @@ describe("E2E Tests for v1 API Routes", () => {
// - TODO: not working for every image
// expect(response.body.data.markdown).toContain("Image-Removed");
},
30000
30000,
);
it.concurrent(
@ -489,13 +489,13 @@ describe("E2E Tests for v1 API Routes", () => {
actions: [
{
type: "wait",
milliseconds: 10000
}
]
milliseconds: 10000,
},
],
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -508,10 +508,10 @@ describe("E2E Tests for v1 API Routes", () => {
}
expect(response.body.data.markdown).not.toContain("Loading...");
expect(response.body.data.markdown).toContain(
"Content loaded after 5 seconds!"
"Content loaded after 5 seconds!",
);
},
30000
30000,
);
// screenshot
@ -522,13 +522,13 @@ describe("E2E Tests for v1 API Routes", () => {
url: E2E_TEST_SERVER_URL,
actions: [
{
type: "screenshot"
}
]
type: "screenshot",
},
],
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -543,15 +543,15 @@ describe("E2E Tests for v1 API Routes", () => {
throw new Error("Expected response body to have screenshots array");
}
expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(
0
0,
);
expect(response.body.data.actions.screenshots[0]).toContain(
"https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-"
"https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-",
);
// TODO compare screenshot with expected screenshot
},
30000
30000,
);
it.concurrent(
@ -562,16 +562,16 @@ describe("E2E Tests for v1 API Routes", () => {
actions: [
{
type: "screenshot",
fullPage: true
fullPage: true,
},
{
type: "scrape"
}
]
type: "scrape",
},
],
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -587,24 +587,24 @@ describe("E2E Tests for v1 API Routes", () => {
throw new Error("Expected response body to have screenshots array");
}
expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(
0
0,
);
expect(response.body.data.actions.screenshots[0]).toContain(
"https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-"
"https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-",
);
if (!response.body.data.actions?.scrapes) {
throw new Error("Expected response body to have scrapes array");
}
expect(response.body.data.actions.scrapes[0].url).toBe(
"https://firecrawl-e2e-test.vercel.app/"
"https://firecrawl-e2e-test.vercel.app/",
);
expect(response.body.data.actions.scrapes[0].html).toContain(
"This page is used for end-to-end (e2e) testing with Firecrawl.</p>"
"This page is used for end-to-end (e2e) testing with Firecrawl.</p>",
);
// TODO compare screenshot with expected full page screenshot
},
30000
30000,
);
it.concurrent(
@ -615,13 +615,13 @@ describe("E2E Tests for v1 API Routes", () => {
actions: [
{
type: "click",
selector: "#click-me"
}
]
selector: "#click-me",
},
],
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -634,10 +634,10 @@ describe("E2E Tests for v1 API Routes", () => {
}
expect(response.body.data.markdown).not.toContain("Click me!");
expect(response.body.data.markdown).toContain(
"Text changed after click!"
"Text changed after click!",
);
},
30000
30000,
);
it.concurrent(
@ -649,17 +649,17 @@ describe("E2E Tests for v1 API Routes", () => {
actions: [
{
type: "click",
selector: "#input-1"
selector: "#input-1",
},
{
type: "write",
text: "Hello, world!"
}
]
text: "Hello, world!",
},
],
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -675,7 +675,7 @@ describe("E2E Tests for v1 API Routes", () => {
// uncomment the following line:
// expect(response.body.data.html).toContain("<input id=\"input-1\" type=\"text\" placeholder=\"Enter text here...\" style=\"padding:8px;margin:10px;border:1px solid #ccc;border-radius:4px;background-color:#000\" value=\"Hello, world!\">");
},
30000
30000,
);
// TODO: fix this test (need to fix fire-engine first)
@ -688,13 +688,13 @@ describe("E2E Tests for v1 API Routes", () => {
actions: [
{
type: "press",
key: "ArrowDown"
}
]
key: "ArrowDown",
},
],
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -709,7 +709,7 @@ describe("E2E Tests for v1 API Routes", () => {
// }
// expect(response.body.data.markdown).toContain("Last Key Clicked: ArrowDown")
},
30000
30000,
);
// TODO: fix this test (need to fix fire-engine first)
@ -722,18 +722,18 @@ describe("E2E Tests for v1 API Routes", () => {
actions: [
{
type: "click",
selector: "#scroll-bottom-loader"
selector: "#scroll-bottom-loader",
},
{
type: "scroll",
direction: "down",
amount: 2000
}
]
amount: 2000,
},
],
} as ScrapeRequest;
const response: ScrapeResponseRequestTest = await request(
FIRECRAWL_API_URL
FIRECRAWL_API_URL,
)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@ -748,7 +748,7 @@ describe("E2E Tests for v1 API Routes", () => {
//
// expect(response.body.data.markdown).toContain("You have reached the bottom!")
},
30000
30000,
);
// TODO: test scrape action
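
The individual action shapes exercised above compose into a single request body; a condensed sketch under the same trailing-comma style (shapes copied from the hunks; combining every action type in one request is an assumption of this sketch, not something these tests assert):

const scrapeRequest = {
  url: "https://firecrawl-e2e-test.vercel.app/",
  actions: [
    { type: "wait", milliseconds: 10000 },
    { type: "click", selector: "#input-1" },
    { type: "write", text: "Hello, world!" },
    { type: "press", key: "ArrowDown" },
    { type: "scroll", direction: "down", amount: 2000 },
    { type: "screenshot", fullPage: true },
    { type: "scrape" },
  ],
} as ScrapeRequest; // ScrapeRequest as imported at the top of this test file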

View File

@ -3,7 +3,7 @@ import dotenv from "dotenv";
import {
FirecrawlCrawlResponse,
FirecrawlCrawlStatusResponse,
FirecrawlScrapeResponse
FirecrawlScrapeResponse,
} from "../../types";
dotenv.config();
@ -42,7 +42,7 @@ describe("E2E Tests for v0 API Routes", () => {
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
}
},
);
it.concurrent(
@ -63,30 +63,30 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data.metadata.pageError).toBeUndefined();
expect(response.body.data.metadata.title).toBe("Roast My Website");
expect(response.body.data.metadata.description).toBe(
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
);
expect(response.body.data.metadata.keywords).toBe(
"Roast My Website,Roast,Website,GitHub,Firecrawl"
"Roast My Website,Roast,Website,GitHub,Firecrawl",
);
expect(response.body.data.metadata.robots).toBe("follow, index");
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
expect(response.body.data.metadata.ogDescription).toBe(
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
);
expect(response.body.data.metadata.ogUrl).toBe(
"https://www.roastmywebsite.ai"
"https://www.roastmywebsite.ai",
);
expect(response.body.data.metadata.ogImage).toBe(
"https://www.roastmywebsite.ai/og.png"
"https://www.roastmywebsite.ai/og.png",
);
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
expect(response.body.data.metadata.sourceURL).toBe(
"https://roastmywebsite.ai"
"https://roastmywebsite.ai",
);
expect(response.body.data.metadata.pageStatusCode).toBe(200);
},
30000
30000,
); // 30 seconds timeout
it.concurrent(
@ -98,7 +98,7 @@ describe("E2E Tests for v0 API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://roastmywebsite.ai",
pageOptions: { includeHtml: true }
pageOptions: { includeHtml: true },
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
@ -112,7 +112,7 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined();
},
30000
30000,
); // 30 seconds timeout
it.concurrent(
@ -130,12 +130,12 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.content).toContain(
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
"We present spectrophotometric observations of the Broad Line Radio Galaxy",
);
expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined();
},
60000
60000,
); // 60 seconds
it.concurrent(
@ -153,12 +153,12 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.content).toContain(
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
"We present spectrophotometric observations of the Broad Line Radio Galaxy",
);
expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined();
},
60000
60000,
); // 60 seconds
it.concurrent(
@ -177,16 +177,16 @@ describe("E2E Tests for v0 API Routes", () => {
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
expect(responseWithoutRemoveTags.body.data.content).toContain(
"Scrape This Site"
"Scrape This Site",
);
expect(responseWithoutRemoveTags.body.data.content).toContain(
"Lessons and Videos"
"Lessons and Videos",
); // #footer
expect(responseWithoutRemoveTags.body.data.content).toContain(
"[Sandbox]("
"[Sandbox](",
); // .nav
expect(responseWithoutRemoveTags.body.data.content).toContain(
"web scraping"
"web scraping",
); // strong
const response: FirecrawlScrapeResponse = await request(TEST_URL)
@ -195,7 +195,7 @@ describe("E2E Tests for v0 API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://www.scrapethissite.com/",
pageOptions: { removeTags: [".nav", "#footer", "strong"] }
pageOptions: { removeTags: [".nav", "#footer", "strong"] },
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
@ -208,7 +208,7 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
expect(response.body.data.content).not.toContain("web scraping"); // strong
},
30000
30000,
); // 30 seconds timeout
it.concurrent(
@ -227,10 +227,10 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(400);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
"bad request"
"bad request",
);
},
60000
60000,
); // 60 seconds
it.concurrent(
@ -249,10 +249,10 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(401);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
"unauthorized"
"unauthorized",
);
},
60000
60000,
); // 60 seconds
it.concurrent(
@ -271,10 +271,10 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(403);
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
"forbidden"
"forbidden",
);
},
60000
60000,
); // 60 seconds
it.concurrent(
@ -293,7 +293,7 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(404);
},
60000
60000,
); // 60 seconds
it.concurrent(
@ -312,7 +312,7 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(405);
},
60000
60000,
); // 60 seconds
it.concurrent(
@ -331,7 +331,7 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.metadata.pageStatusCode).toBe(500);
},
60000
60000,
); // 60 seconds
});
@ -351,7 +351,7 @@ describe("E2E Tests for v0 API Routes", () => {
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
}
},
);
it.concurrent(
@ -365,9 +365,9 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("jobId");
expect(response.body.jobId).toMatch(
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
);
}
},
);
it.concurrent(
@ -381,8 +381,8 @@ describe("E2E Tests for v0 API Routes", () => {
url: "https://mendable.ai",
limit: 10,
crawlerOptions: {
includes: ["blog/*"]
}
includes: ["blog/*"],
},
});
let response: FirecrawlCrawlStatusResponse;
@ -408,7 +408,7 @@ describe("E2E Tests for v0 API Routes", () => {
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
(item: any) => item.metadata?.sourceURL,
);
expect(urls.length).toBeGreaterThan(5);
urls.forEach((url: string) => {
@ -424,13 +424,13 @@ describe("E2E Tests for v0 API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
200,
);
expect(
completedResponse.body.data[0].metadata.pageError
completedResponse.body.data[0].metadata.pageError,
).toBeUndefined();
},
180000
180000,
); // 180 seconds
it.concurrent(
@ -444,8 +444,8 @@ describe("E2E Tests for v0 API Routes", () => {
url: "https://mendable.ai",
limit: 10,
crawlerOptions: {
excludes: ["blog/*"]
}
excludes: ["blog/*"],
},
});
let isFinished = false;
@ -467,20 +467,20 @@ describe("E2E Tests for v0 API Routes", () => {
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved in the database
const completedResponse: FirecrawlCrawlStatusResponse = await request(
TEST_URL
TEST_URL,
)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
(item: any) => item.metadata?.sourceURL,
);
expect(urls.length).toBeGreaterThan(5);
urls.forEach((url: string) => {
expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
});
},
90000
90000,
); // 90 seconds
it.concurrent(
@ -492,7 +492,7 @@ describe("E2E Tests for v0 API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://www.scrapethissite.com",
crawlerOptions: { maxDepth: 1 }
crawlerOptions: { maxDepth: 1 },
});
expect(crawlResponse.statusCode).toBe(200);
@ -515,7 +515,7 @@ describe("E2E Tests for v0 API Routes", () => {
}
}
const completedResponse: FirecrawlCrawlStatusResponse = await request(
TEST_URL
TEST_URL,
)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
@ -528,13 +528,13 @@ describe("E2E Tests for v0 API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
200,
);
expect(
completedResponse.body.data[0].metadata.pageError
completedResponse.body.data[0].metadata.pageError,
).toBeUndefined();
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
(item: any) => item.metadata?.sourceURL,
);
expect(urls.length).toBeGreaterThan(1);
@ -550,14 +550,14 @@ describe("E2E Tests for v0 API Routes", () => {
expect(depth).toBeLessThanOrEqual(2);
});
},
180000
180000,
);
});
describe("POST /v0/crawlWebsitePreview", () => {
it.concurrent("should require authorization", async () => {
const response: FirecrawlCrawlResponse = await request(TEST_URL).post(
"/v0/crawlWebsitePreview"
"/v0/crawlWebsitePreview",
);
expect(response.statusCode).toBe(401);
});
@ -571,7 +571,7 @@ describe("E2E Tests for v0 API Routes", () => {
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
}
},
);
it.concurrent(
@ -585,7 +585,7 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.statusCode).toBe(408);
},
3000
3000,
);
});
@ -604,7 +604,7 @@ describe("E2E Tests for v0 API Routes", () => {
.set("Content-Type", "application/json")
.send({ query: "test" });
expect(response.statusCode).toBe(401);
}
},
);
it.concurrent(
@ -620,7 +620,7 @@ describe("E2E Tests for v0 API Routes", () => {
expect(response.body.success).toBe(true);
expect(response.body).toHaveProperty("data");
},
60000
60000,
); // 60 seconds timeout
});
@ -637,7 +637,7 @@ describe("E2E Tests for v0 API Routes", () => {
.get("/v0/crawl/status/123")
.set("Authorization", `Bearer invalid-api-key`);
expect(response.statusCode).toBe(401);
}
},
);
it.concurrent(
@ -647,7 +647,7 @@ describe("E2E Tests for v0 API Routes", () => {
.get("/v0/crawl/status/invalidJobId")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(404);
}
},
);
it.concurrent(
@ -689,22 +689,22 @@ describe("E2E Tests for v0 API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Firecrawl");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
200,
);
expect(
completedResponse.body.data[0].metadata.pageError
completedResponse.body.data[0].metadata.pageError,
).toBeUndefined();
const childrenLinks = completedResponse.body.data.filter(
(doc) =>
doc.metadata &&
doc.metadata.sourceURL &&
doc.metadata.sourceURL.includes("firecrawl.dev/blog")
doc.metadata.sourceURL.includes("firecrawl.dev/blog"),
);
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
},
180000
180000,
); // 180 seconds
// TODO: review the test below
@ -762,7 +762,7 @@ describe("E2E Tests for v0 API Routes", () => {
.set("Content-Type", "application/json")
.send({
url: "https://docs.tatum.io",
crawlerOptions: { limit: 200 }
crawlerOptions: { limit: 200 },
});
expect(crawlResponse.statusCode).toBe(200);
@ -798,22 +798,22 @@ describe("E2E Tests for v0 API Routes", () => {
expect(completedResponse.body.data).toEqual(expect.arrayContaining([]));
expect(completedResponse.body).toHaveProperty("partial_data");
expect(completedResponse.body.partial_data[0]).toHaveProperty(
"content"
"content",
);
expect(completedResponse.body.partial_data[0]).toHaveProperty(
"markdown"
"markdown",
);
expect(completedResponse.body.partial_data[0]).toHaveProperty(
"metadata"
"metadata",
);
expect(
completedResponse.body.partial_data[0].metadata.pageStatusCode
completedResponse.body.partial_data[0].metadata.pageStatusCode,
).toBe(200);
expect(
completedResponse.body.partial_data[0].metadata.pageError
completedResponse.body.partial_data[0].metadata.pageError,
).toBeUndefined();
},
60000
60000,
); // 60 seconds
});
@ -828,7 +828,7 @@ describe("E2E Tests for v0 API Routes", () => {
.send({
url: "https://mendable.ai",
pageOptions: {
onlyMainContent: true
onlyMainContent: true,
},
extractorOptions: {
mode: "llm-extraction",
@ -838,18 +838,18 @@ describe("E2E Tests for v0 API Routes", () => {
type: "object",
properties: {
company_mission: {
type: "string"
type: "string",
},
supports_sso: {
type: "boolean"
type: "boolean",
},
is_open_source: {
type: "boolean"
}
type: "boolean",
},
},
required: ["company_mission", "supports_sso", "is_open_source"],
},
},
required: ["company_mission", "supports_sso", "is_open_source"]
}
}
});
// Ensure that the job was successfully created before proceeding with LLM extraction
@ -868,7 +868,7 @@ describe("E2E Tests for v0 API Routes", () => {
expect(llmExtraction.is_open_source).toBe(false);
expect(typeof llmExtraction.is_open_source).toBe("boolean");
},
60000
60000,
); // 60 secs
});
});
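
Reassembled from the hunk above, the llm-extraction payload reads as below once "trailingComma": "all" is applied; the extraction prompt field is elided by the diff context and is therefore omitted here:

const body = {
  url: "https://mendable.ai",
  pageOptions: {
    onlyMainContent: true,
  },
  extractorOptions: {
    mode: "llm-extraction",
    extractionSchema: {
      type: "object",
      properties: {
        company_mission: { type: "string" },
        supports_sso: { type: "boolean" },
        is_open_source: { type: "boolean" },
      },
      required: ["company_mission", "supports_sso", "is_open_source"],
    },
  },
};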

View File

@ -10,9 +10,9 @@ jest.mock("../auth", () => ({
success: true,
team_id: "team123",
error: null,
status: 200
status: 200,
}),
reduce: jest.fn()
reduce: jest.fn(),
}));
jest.mock("../../services/idempotency/validate");
@ -21,15 +21,15 @@ describe("crawlController", () => {
const req = {
headers: {
"x-idempotency-key": await uuidv4(),
Authorization: `Bearer ${process.env.TEST_API_KEY}`
Authorization: `Bearer ${process.env.TEST_API_KEY}`,
},
body: {
url: "https://mendable.ai"
}
url: "https://mendable.ai",
},
} as unknown as Request;
const res = {
status: jest.fn().mockReturnThis(),
json: jest.fn()
json: jest.fn(),
} as unknown as Response;
// Mock the idempotency key validation to return false for the second call
@ -45,7 +45,7 @@ describe("crawlController", () => {
await crawlController(req, res);
expect(res.status).toHaveBeenCalledWith(409);
expect(res.json).toHaveBeenCalledWith({
error: "Idempotency key already used"
error: "Idempotency key already used",
});
});
});

View File

@ -4,7 +4,7 @@ import {
AuthResponse,
NotificationType,
PlanType,
RateLimiterMode
RateLimiterMode,
} from "../types";
import { supabase_service } from "../services/supabase";
import { withAuth } from "../lib/withAuth";
@ -41,7 +41,7 @@ export async function setCachedACUC(
acuc:
| AuthCreditUsageChunk
| null
| ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null)
| ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null),
) {
const cacheKeyACUC = `acuc_${api_key}`;
const redLockKey = `lock_${cacheKeyACUC}`;
@ -76,7 +76,7 @@ export async function setCachedACUC(
export async function getACUC(
api_key: string,
cacheOnly = false,
useCache = true
useCache = true,
): Promise<AuthCreditUsageChunk | null> {
const cacheKeyACUC = `acuc_${api_key}`;
@ -97,7 +97,7 @@ export async function getACUC(
({ data, error } = await supabase_service.rpc(
"auth_credit_usage_chunk_test_21_credit_pack",
{ input_key: api_key },
{ get: true }
{ get: true },
));
if (!error) {
@ -105,13 +105,13 @@ export async function getACUC(
}
logger.warn(
`Failed to retrieve authentication and credit usage data after ${retries}, trying again...`
`Failed to retrieve authentication and credit usage data after ${retries}, trying again...`,
);
retries++;
if (retries === maxRetries) {
throw new Error(
"Failed to retrieve authentication and credit usage data after 3 attempts: " +
JSON.stringify(error)
JSON.stringify(error),
);
}
@ -143,19 +143,19 @@ export async function clearACUC(api_key: string): Promise<void> {
export async function authenticateUser(
req,
res,
mode?: RateLimiterMode
mode?: RateLimiterMode,
): Promise<AuthResponse> {
return withAuth(supaAuthenticateUser, {
success: true,
chunk: null,
team_id: "bypass"
team_id: "bypass",
})(req, res, mode);
}
export async function supaAuthenticateUser(
req,
res,
mode?: RateLimiterMode
mode?: RateLimiterMode,
): Promise<AuthResponse> {
const authHeader =
req.headers.authorization ??
@ -170,7 +170,7 @@ export async function supaAuthenticateUser(
return {
success: false,
error: "Unauthorized: Token missing",
status: 401
status: 401,
};
}
@ -199,7 +199,7 @@ export async function supaAuthenticateUser(
return {
success: false,
error: "Unauthorized: Invalid token",
status: 401
status: 401,
};
}
@ -209,7 +209,7 @@ export async function supaAuthenticateUser(
return {
success: false,
error: "Unauthorized: Invalid token",
status: 401
status: 401,
};
}
@ -219,14 +219,14 @@ export async function supaAuthenticateUser(
const plan = getPlanByPriceId(priceId);
subscriptionData = {
team_id: teamId,
plan
plan,
};
switch (mode) {
case RateLimiterMode.Crawl:
rateLimiter = getRateLimiter(
RateLimiterMode.Crawl,
token,
subscriptionData.plan
subscriptionData.plan,
);
break;
case RateLimiterMode.Scrape:
@ -234,21 +234,21 @@ export async function supaAuthenticateUser(
RateLimiterMode.Scrape,
token,
subscriptionData.plan,
teamId
teamId,
);
break;
case RateLimiterMode.Search:
rateLimiter = getRateLimiter(
RateLimiterMode.Search,
token,
subscriptionData.plan
subscriptionData.plan,
);
break;
case RateLimiterMode.Map:
rateLimiter = getRateLimiter(
RateLimiterMode.Map,
token,
subscriptionData.plan
subscriptionData.plan,
);
break;
case RateLimiterMode.CrawlStatus:
@ -278,7 +278,7 @@ export async function supaAuthenticateUser(
priceId,
plan: subscriptionData?.plan,
mode,
rateLimiterRes
rateLimiterRes,
});
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
@ -293,7 +293,7 @@ export async function supaAuthenticateUser(
return {
success: false,
error: `Rate limit exceeded. Consumed (req/min): ${rateLimiterRes.consumedPoints}, Remaining (req/min): ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
status: 429
status: 429,
};
}
@ -323,7 +323,7 @@ export async function supaAuthenticateUser(
success: true,
team_id: teamId ?? undefined,
plan: (subscriptionData?.plan ?? "") as PlanType,
chunk
chunk,
};
}
function getPlanByPriceId(price_id: string | null): PlanType {
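
A minimal usage sketch of getACUC as reformatted above, relying only on the signature shown (defaults cacheOnly = false, useCache = true); the wrapper name is illustrative:

async function requireACUC(apiKey: string) {
  const acuc = await getACUC(apiKey); // uses the defaults from the signature above
  if (acuc === null) {
    // null is the signature's "no credit usage chunk" case
    throw new Error("Unauthorized: Invalid token");
  }
  return acuc;
}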

View File

@ -8,7 +8,7 @@ import { sendSlackWebhook } from "../../../services/alerts/slack";
export async function cleanBefore24hCompleteJobsController(
req: Request,
res: Response
res: Response,
) {
logger.info("🐂 Cleaning jobs older than 24h");
try {
@ -22,8 +22,8 @@ export async function cleanBefore24hCompleteJobsController(
["completed"],
i * batchSize,
i * batchSize + batchSize,
true
)
true,
),
);
}
const completedJobs: Job[] = (
@ -33,7 +33,7 @@ export async function cleanBefore24hCompleteJobsController(
completedJobs.filter(
(job) =>
job.finishedOn !== undefined &&
job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
job.finishedOn < Date.now() - 24 * 60 * 60 * 1000,
) || [];
let count = 0;
@ -73,14 +73,14 @@ export async function queuesController(req: Request, res: Response) {
const scrapeQueue = getScrapeQueue();
const [webScraperActive] = await Promise.all([
scrapeQueue.getActiveCount()
scrapeQueue.getActiveCount(),
]);
const noActiveJobs = webScraperActive === 0;
// 200 if no active jobs, 500 if there are active jobs
return res.status(noActiveJobs ? 200 : 500).json({
webScraperActive,
noActiveJobs
noActiveJobs,
});
} catch (error) {
logger.error(error);
@ -99,7 +99,7 @@ export async function autoscalerController(req: Request, res: Response) {
await Promise.all([
scrapeQueue.getActiveCount(),
scrapeQueue.getWaitingCount(),
scrapeQueue.getPrioritizedCount()
scrapeQueue.getPrioritizedCount(),
]);
let waitingAndPriorityCount = webScraperWaiting + webScraperPriority;
@ -109,9 +109,9 @@ export async function autoscalerController(req: Request, res: Response) {
"https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines",
{
headers: {
Authorization: `Bearer ${process.env.FLY_API_TOKEN}`
}
}
Authorization: `Bearer ${process.env.FLY_API_TOKEN}`,
},
},
);
const machines = await request.json();
@ -121,7 +121,7 @@ export async function autoscalerController(req: Request, res: Response) {
(machine.state === "started" ||
machine.state === "starting" ||
machine.state === "replacing") &&
machine.config.env["FLY_PROCESS_GROUP"] === "worker"
machine.config.env["FLY_PROCESS_GROUP"] === "worker",
).length;
let targetMachineCount = activeMachines;
@ -134,17 +134,17 @@ export async function autoscalerController(req: Request, res: Response) {
if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) {
targetMachineCount = Math.min(
maxNumberOfMachines,
activeMachines + baseScaleUp * 3
activeMachines + baseScaleUp * 3,
);
} else if (webScraperActive > 5000 || waitingAndPriorityCount > 1000) {
targetMachineCount = Math.min(
maxNumberOfMachines,
activeMachines + baseScaleUp * 2
activeMachines + baseScaleUp * 2,
);
} else if (webScraperActive > 1000 || waitingAndPriorityCount > 500) {
targetMachineCount = Math.min(
maxNumberOfMachines,
activeMachines + baseScaleUp
activeMachines + baseScaleUp,
);
}
@ -152,47 +152,47 @@ export async function autoscalerController(req: Request, res: Response) {
if (webScraperActive < 100 && waitingAndPriorityCount < 50) {
targetMachineCount = Math.max(
minNumberOfMachines,
activeMachines - baseScaleDown * 3
activeMachines - baseScaleDown * 3,
);
} else if (webScraperActive < 500 && waitingAndPriorityCount < 200) {
targetMachineCount = Math.max(
minNumberOfMachines,
activeMachines - baseScaleDown * 2
activeMachines - baseScaleDown * 2,
);
} else if (webScraperActive < 1000 && waitingAndPriorityCount < 500) {
targetMachineCount = Math.max(
minNumberOfMachines,
activeMachines - baseScaleDown
activeMachines - baseScaleDown,
);
}
if (targetMachineCount !== activeMachines) {
logger.info(
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`,
);
if (targetMachineCount > activeMachines) {
sendSlackWebhook(
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
false,
process.env.SLACK_AUTOSCALER ?? ""
process.env.SLACK_AUTOSCALER ?? "",
);
} else {
sendSlackWebhook(
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
false,
process.env.SLACK_AUTOSCALER ?? ""
process.env.SLACK_AUTOSCALER ?? "",
);
}
return res.status(200).json({
mode: "scale-descale",
count: targetMachineCount
count: targetMachineCount,
});
}
return res.status(200).json({
mode: "normal",
count: activeMachines
count: activeMachines,
});
} catch (error) {
logger.error(error);
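
Read as a pure function, the scaling thresholds above reduce to the sketch below; the min/max/base constants are defined outside these hunks, so the defaults here are placeholders:

function targetMachines(
  active: number, // webScraperActive
  waiting: number, // waitingAndPriorityCount
  current: number, // activeMachines
  min = 20, // placeholder for minNumberOfMachines
  max = 150, // placeholder for maxNumberOfMachines
  up = 10, // placeholder for baseScaleUp
  down = 10, // placeholder for baseScaleDown
): number {
  // Scale up: mirrors the Math.min(...) branches above.
  if (active > 9000 || waiting > 2000) return Math.min(max, current + up * 3);
  if (active > 5000 || waiting > 1000) return Math.min(max, current + up * 2);
  if (active > 1000 || waiting > 500) return Math.min(max, current + up);
  // Scale down: mirrors the Math.max(...) branches above.
  if (active < 100 && waiting < 50) return Math.max(min, current - down * 3);
  if (active < 500 && waiting < 200) return Math.max(min, current - down * 2);
  if (active < 1000 && waiting < 500) return Math.max(min, current - down);
  return current;
}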

View File

@ -38,7 +38,7 @@ export async function redisHealthController(req: Request, res: Response) {
try {
await retryOperation(() => redisRateLimitClient.set(testKey, testValue));
redisRateLimitHealth = await retryOperation(() =>
redisRateLimitClient.get(testKey)
redisRateLimitClient.get(testKey),
);
await retryOperation(() => redisRateLimitClient.del(testKey));
} catch (error) {
@ -49,7 +49,7 @@ export async function redisHealthController(req: Request, res: Response) {
const healthStatus = {
queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy",
redisRateLimitClient:
redisRateLimitHealth === testValue ? "healthy" : "unhealthy"
redisRateLimitHealth === testValue ? "healthy" : "unhealthy",
};
if (
@ -60,7 +60,7 @@ export async function redisHealthController(req: Request, res: Response) {
return res.status(200).json({ status: "healthy", details: healthStatus });
} else {
logger.info(
`Redis instances health check: ${JSON.stringify(healthStatus)}`
`Redis instances health check: ${JSON.stringify(healthStatus)}`,
);
// await sendSlackWebhook(
// `[REDIS DOWN] Redis instances health check: ${JSON.stringify(

View File

@ -48,7 +48,7 @@ export async function crawlCancelController(req: Request, res: Response) {
}
res.json({
status: "cancelled"
status: "cancelled",
});
} catch (error) {
Sentry.captureException(error);

View File

@ -60,12 +60,12 @@ export async function crawlStatusController(req: Request, res: Response) {
// Combine jobs and jobStatuses into a single array of objects
let jobsWithStatuses = jobs.map((job, index) => ({
job,
status: jobStatuses[index]
status: jobStatuses[index],
}));
// Filter out failed jobs
jobsWithStatuses = jobsWithStatuses.filter(
(x) => x.status !== "failed" && x.status !== "unknown"
(x) => x.status !== "failed" && x.status !== "unknown",
);
// Sort jobs by timestamp
@ -84,10 +84,10 @@ export async function crawlStatusController(req: Request, res: Response) {
const data = jobs
.filter(
(x) =>
x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null
x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null,
)
.map((x) =>
Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue
Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue,
);
if (
@ -117,7 +117,7 @@ export async function crawlStatusController(req: Request, res: Response) {
? []
: data
.filter((x) => x !== null)
.map((x) => toLegacyDocument(x, sc.internalOptions))
.map((x) => toLegacyDocument(x, sc.internalOptions)),
});
} catch (error) {
Sentry.captureException(error);

View File

@ -10,7 +10,7 @@ import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import {
defaultCrawlPageOptions,
defaultCrawlerOptions,
defaultOrigin
defaultOrigin,
} from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { logger } from "../../../src/lib/logger";
@ -21,7 +21,7 @@ import {
lockURL,
lockURLs,
saveCrawl,
StoredCrawl
StoredCrawl,
} from "../../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
@ -54,7 +54,7 @@ export async function crawlController(req: Request, res: Response) {
const crawlerOptions = {
...defaultCrawlerOptions,
...req.body.crawlerOptions
...req.body.crawlerOptions,
};
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
@ -82,13 +82,13 @@ export async function crawlController(req: Request, res: Response) {
const {
success: creditsCheckSuccess,
message: creditsCheckMessage,
remainingCredits
remainingCredits,
} = await checkTeamCredits(chunk, team_id, limitCheck);
if (!creditsCheckSuccess) {
return res.status(402).json({
error:
"Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com"
"Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com",
});
}
@ -113,7 +113,7 @@ export async function crawlController(req: Request, res: Response) {
if (isUrlBlocked(url)) {
return res.status(403).json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
});
}
@ -153,7 +153,7 @@ export async function crawlController(req: Request, res: Response) {
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(
pageOptions,
undefined,
undefined
undefined,
);
internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
@ -166,7 +166,7 @@ export async function crawlController(req: Request, res: Response) {
internalOptions,
team_id,
plan,
createdAt: Date.now()
createdAt: Date.now(),
};
const crawler = crawlToCrawler(id, sc);
@ -204,23 +204,23 @@ export async function crawlController(req: Request, res: Response) {
plan,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
sitemapped: true
sitemapped: true,
},
opts: {
jobId: uuid,
priority: jobPriority
}
priority: jobPriority,
},
};
});
await lockURLs(
id,
sc,
jobs.map((x) => x.data.url)
jobs.map((x) => x.data.url),
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
jobs.map((x) => x.opts.jobId),
);
for (const job of jobs) {
// add with sentry instrumentation
@ -243,12 +243,12 @@ export async function crawlController(req: Request, res: Response) {
team_id,
plan: plan!,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id
crawl_id: id,
},
{
priority: 15 // prioritize request 0 of crawl jobs same as scrape jobs
priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
},
jobId
jobId,
);
await addCrawlJob(id, jobId);
}
@ -258,7 +258,7 @@ export async function crawlController(req: Request, res: Response) {
Sentry.captureException(error);
logger.error(error);
return res.status(500).json({
error: error instanceof ZodError ? "Invalid URL" : error.message
error: error instanceof ZodError ? "Invalid URL" : error.message,
});
}
}

View File

@ -9,7 +9,7 @@ import {
crawlToCrawler,
lockURL,
saveCrawl,
StoredCrawl
StoredCrawl,
} from "../../../src/lib/crawl-redis";
import { addScrapeJob } from "../../../src/services/queue-jobs";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
@ -43,7 +43,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
if (isUrlBlocked(url)) {
return res.status(403).json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
});
}
@ -51,7 +51,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
const pageOptions = req.body.pageOptions ?? {
onlyMainContent: false,
includeHtml: false,
removeTags: []
removeTags: [],
};
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
@ -94,7 +94,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(
pageOptions,
undefined,
undefined
undefined,
);
const sc: StoredCrawl = {
@ -105,7 +105,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
team_id,
plan,
robots,
createdAt: Date.now()
createdAt: Date.now(),
};
await saveCrawl(id, sc);
@ -131,10 +131,10 @@ export async function crawlPreviewController(req: Request, res: Response) {
internalOptions,
origin: "website-preview",
crawl_id: id,
sitemapped: true
sitemapped: true,
},
{},
jobId
jobId,
);
await addCrawlJob(id, jobId);
}
@ -151,10 +151,10 @@ export async function crawlPreviewController(req: Request, res: Response) {
scrapeOptions,
internalOptions,
origin: "website-preview",
crawl_id: id
crawl_id: id,
},
{},
jobId
jobId,
);
await addCrawlJob(id, jobId);
}

View File

@ -2,7 +2,7 @@ import { ExtractorOptions, PageOptions } from "./../../lib/entities";
import { Request, Response } from "express";
import {
billTeam,
checkTeamCredits
checkTeamCredits,
} from "../../services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types";
@ -11,7 +11,7 @@ import {
Document,
fromLegacyCombo,
toLegacyDocument,
url as urlSchema
url as urlSchema,
} from "../v1/types";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
@ -19,7 +19,7 @@ import {
defaultPageOptions,
defaultExtractorOptions,
defaultTimeout,
defaultOrigin
defaultOrigin,
} from "../../lib/default-values";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { getScrapeQueue } from "../../services/queue-service";
@ -38,7 +38,7 @@ export async function scrapeHelper(
pageOptions: PageOptions,
extractorOptions: ExtractorOptions,
timeout: number,
plan?: PlanType
plan?: PlanType,
): Promise<{
success: boolean;
error?: string;
@ -55,7 +55,7 @@ export async function scrapeHelper(
success: false,
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
returnCode: 403
returnCode: 403,
};
}
@ -65,7 +65,7 @@ export async function scrapeHelper(
pageOptions,
extractorOptions,
timeout,
crawlerOptions
crawlerOptions,
);
await addScrapeJob(
@ -77,11 +77,11 @@ export async function scrapeHelper(
internalOptions,
plan: plan!,
origin: req.body.origin ?? defaultOrigin,
is_scrape: true
is_scrape: true,
},
{},
jobId,
jobPriority
jobPriority,
);
let doc;
@ -90,7 +90,7 @@ export async function scrapeHelper(
{
name: "Wait for job to finish",
op: "bullmq.wait",
attributes: { job: jobId }
attributes: { job: jobId },
},
async (span) => {
try {
@ -104,20 +104,20 @@ export async function scrapeHelper(
return {
success: false,
error: "Request timed out",
returnCode: 408
returnCode: 408,
};
} else if (
typeof e === "string" &&
(e.includes("Error generating completions: ") ||
e.includes("Invalid schema for function") ||
e.includes(
"LLM extraction did not match the extraction schema you provided."
"LLM extraction did not match the extraction schema you provided.",
))
) {
return {
success: false,
error: e,
returnCode: 500
returnCode: 500,
};
} else {
throw e;
@ -125,7 +125,7 @@ export async function scrapeHelper(
}
span.setAttribute("result", JSON.stringify(doc));
return null;
}
},
);
if (err !== null) {
@ -140,7 +140,7 @@ export async function scrapeHelper(
success: true,
error: "No page found",
returnCode: 200,
data: doc
data: doc,
};
}
@ -166,7 +166,7 @@ export async function scrapeHelper(
return {
success: true,
data: toLegacyDocument(doc, internalOptions),
returnCode: 200
returnCode: 200,
};
}
@ -185,7 +185,7 @@ export async function scrapeController(req: Request, res: Response) {
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
const extractorOptions = {
...defaultExtractorOptions,
...req.body.extractorOptions
...req.body.extractorOptions,
};
const origin = req.body.origin ?? defaultOrigin;
let timeout = req.body.timeout ?? defaultTimeout;
@ -197,7 +197,7 @@ export async function scrapeController(req: Request, res: Response) {
) {
return res.status(400).json({
error:
"extractorOptions.extractionSchema must be an object if llm-extraction mode is specified"
"extractorOptions.extractionSchema must be an object if llm-extraction mode is specified",
});
}
@ -213,7 +213,7 @@ export async function scrapeController(req: Request, res: Response) {
earlyReturn = true;
return res.status(402).json({
error:
"Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing"
"Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing",
});
}
} catch (error) {
@ -221,7 +221,7 @@ export async function scrapeController(req: Request, res: Response) {
earlyReturn = true;
return res.status(500).json({
error:
"Error checking team credits. Please contact help@firecrawl.com for help."
"Error checking team credits. Please contact help@firecrawl.com for help.",
});
}
@ -236,7 +236,7 @@ export async function scrapeController(req: Request, res: Response) {
pageOptions,
extractorOptions,
timeout,
plan
plan,
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
@ -244,7 +244,7 @@ export async function scrapeController(req: Request, res: Response) {
result.data && (result.data as Document).markdown
? numTokensFromString(
(result.data as Document).markdown!,
"gpt-3.5-turbo"
"gpt-3.5-turbo",
)
: 0;
@ -267,7 +267,7 @@ export async function scrapeController(req: Request, res: Response) {
// billing for doc done on queue end, bill only for llm extraction
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch((error) => {
logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
);
// Optionally, you could notify an admin or add to a retry queue here
});
@ -290,7 +290,7 @@ export async function scrapeController(req: Request, res: Response) {
const { scrapeOptions } = fromLegacyScrapeOptions(
pageOptions,
extractorOptions,
timeout
timeout,
);
logJob({
@ -306,7 +306,7 @@ export async function scrapeController(req: Request, res: Response) {
crawlerOptions: crawlerOptions,
scrapeOptions,
origin: origin,
num_tokens: numTokens
num_tokens: numTokens,
});
return res.status(result.returnCode).json(result);
@ -319,7 +319,7 @@ export async function scrapeController(req: Request, res: Response) {
? "Invalid URL"
: typeof error === "string"
? error
: (error?.message ?? "Internal Server Error")
: (error?.message ?? "Internal Server Error"),
});
}
}

View File

@ -1,7 +1,7 @@
import { Request, Response } from "express";
import {
billTeam,
checkTeamCredits
checkTeamCredits,
} from "../../services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types";
@ -20,7 +20,7 @@ import {
Document,
fromLegacyCombo,
fromLegacyScrapeOptions,
toLegacyDocument
toLegacyDocument,
} from "../v1/types";
export async function searchHelper(
@ -31,7 +31,7 @@ export async function searchHelper(
crawlerOptions: any,
pageOptions: PageOptions,
searchOptions: SearchOptions,
plan: PlanType | undefined
plan: PlanType | undefined,
): Promise<{
success: boolean;
error?: string;
@ -62,7 +62,7 @@ export async function searchHelper(
filter: filter,
lang: searchOptions.lang ?? "en",
country: searchOptions.country ?? "us",
location: searchOptions.location
location: searchOptions.location,
});
let justSearch = pageOptions.fetchPageContent === false;
@ -71,13 +71,13 @@ export async function searchHelper(
pageOptions,
undefined,
60000,
crawlerOptions
crawlerOptions,
);
if (justSearch) {
billTeam(team_id, subscription_id, res.length).catch((error) => {
logger.error(
`Failed to bill team ${team_id} for ${res.length} credits: ${error}`
`Failed to bill team ${team_id} for ${res.length} credits: ${error}`,
);
// Optionally, you could notify an admin or add to a retry queue here
});
@ -107,12 +107,12 @@ export async function searchHelper(
mode: "single_urls",
team_id: team_id,
scrapeOptions,
internalOptions
internalOptions,
},
opts: {
jobId: uuid,
priority: jobPriority
}
priority: jobPriority,
},
};
});
@ -123,7 +123,7 @@ export async function searchHelper(
const docs = (
await Promise.all(
jobDatas.map((x) => waitForJob<Document>(x.opts.jobId, 60000))
jobDatas.map((x) => waitForJob<Document>(x.opts.jobId, 60000)),
)
).map((x) => toLegacyDocument(x, internalOptions));
@ -136,7 +136,7 @@ export async function searchHelper(
// make sure doc.content is not empty
const filteredDocs = docs.filter(
(doc: any) => doc && doc.content && doc.content.trim().length > 0
(doc: any) => doc && doc.content && doc.content.trim().length > 0,
);
if (filteredDocs.length === 0) {
@ -144,14 +144,14 @@ export async function searchHelper(
success: true,
error: "No page found",
returnCode: 200,
data: docs
data: docs,
};
}
return {
success: true,
data: filteredDocs,
returnCode: 200
returnCode: 200,
};
}
@ -169,7 +169,7 @@ export async function searchController(req: Request, res: Response) {
onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
removeTags: req.body.pageOptions?.removeTags ?? [],
fallback: req.body.pageOptions?.fallback ?? false
fallback: req.body.pageOptions?.fallback ?? false,
};
const origin = req.body.origin ?? "api";
@ -197,7 +197,7 @@ export async function searchController(req: Request, res: Response) {
crawlerOptions,
pageOptions,
searchOptions,
plan
plan,
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
@ -212,7 +212,7 @@ export async function searchController(req: Request, res: Response) {
mode: "search",
url: req.body.query,
crawlerOptions: crawlerOptions,
origin: origin
origin: origin,
});
return res.status(result.returnCode).json(result);
} catch (error) {

View File

@ -6,7 +6,7 @@ import * as Sentry from "@sentry/node";
export async function crawlJobStatusPreviewController(
req: Request,
res: Response
res: Response,
) {
try {
const sc = await getCrawl(req.params.jobId);
@ -26,7 +26,7 @@ export async function crawlJobStatusPreviewController(
// }
const jobs = (await getJobs(req.params.jobId, jobIDs)).sort(
(a, b) => a.timestamp - b.timestamp
(a, b) => a.timestamp - b.timestamp,
);
const jobStatuses = await Promise.all(jobs.map((x) => x.getState()));
const jobStatus = sc.cancelled
@ -38,7 +38,7 @@ export async function crawlJobStatusPreviewController(
: "active";
const data = jobs.map((x) =>
Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue
Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue,
);
res.json({
@ -48,7 +48,7 @@ export async function crawlJobStatusPreviewController(
total: jobs.length,
data: jobStatus === "completed" ? data : null,
partial_data:
jobStatus === "completed" ? [] : data.filter((x) => x !== null)
jobStatus === "completed" ? [] : data.filter((x) => x !== null),
});
} catch (error) {
Sentry.captureException(error);

View File

@ -25,13 +25,13 @@ describe("URL Schema Validation", () => {
it("should reject URLs without a valid top-level domain", () => {
expect(() => url.parse("http://example")).toThrow(
"URL must have a valid top-level domain or be a valid path"
"URL must have a valid top-level domain or be a valid path",
);
});
it("should reject blocked URLs", () => {
expect(() => url.parse("https://facebook.com")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
});
@ -47,28 +47,28 @@ describe("URL Schema Validation", () => {
it("should handle URLs with subdomains that are blocked", () => {
expect(() => url.parse("https://sub.facebook.com")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
});
it("should handle URLs with paths that are blocked", () => {
expect(() => url.parse("http://facebook.com/path")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
expect(() => url.parse("https://facebook.com/another/path")).toThrow(
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
);
});
it("should reject malformed URLs starting with 'http://http'", () => {
expect(() => url.parse("http://http://example.com")).toThrow(
"Invalid URL. Invalid protocol."
"Invalid URL. Invalid protocol.",
);
});
it("should reject malformed URLs containing multiple 'http://'", () => {
expect(() =>
url.parse("http://example.com/http://example.com")
url.parse("http://example.com/http://example.com"),
).not.toThrow();
});

View File

@ -5,14 +5,14 @@ import {
batchScrapeRequestSchema,
CrawlResponse,
RequestWithAuth,
ScrapeOptions
ScrapeOptions,
} from "./types";
import {
addCrawlJobs,
getCrawl,
lockURLs,
saveCrawl,
StoredCrawl
StoredCrawl,
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getJobPriority } from "../../lib/job-priority";
@ -22,7 +22,7 @@ import { logger as _logger } from "../../lib/logger";
export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
res: Response<CrawlResponse>
res: Response<CrawlResponse>,
) {
req.body = batchScrapeRequestSchema.parse(req.body);
@ -33,12 +33,12 @@ export async function batchScrapeController(
module: "api/v1",
method: "batchScrapeController",
teamId: req.auth.team_id,
plan: req.auth.plan
plan: req.auth.plan,
});
logger.debug("Batch scrape " + id + " starting", {
urlsLength: req.body.urls.length,
appendToId: req.body.appendToId,
account: req.account
account: req.account,
});
if (!req.body.appendToId) {
@ -59,7 +59,7 @@ export async function batchScrapeController(
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure a contentful scrape; speed does not matter
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan
plan: req.auth.plan,
};
if (!req.body.appendToId) {
@ -75,7 +75,7 @@ export async function batchScrapeController(
jobPriority = await getJobPriority({
plan: req.auth.plan,
team_id: req.auth.team_id,
basePriority: 21
basePriority: 21,
});
}
logger.debug("Using job priority " + jobPriority, { jobPriority });
@ -97,12 +97,12 @@ export async function batchScrapeController(
crawl_id: id,
sitemapped: true,
v1: true,
webhook: req.body.webhook
webhook: req.body.webhook,
},
opts: {
jobId: uuidv4(),
priority: 20
}
priority: 20,
},
};
});
@ -110,19 +110,19 @@ export async function batchScrapeController(
await lockURLs(
id,
sc,
jobs.map((x) => x.data.url)
jobs.map((x) => x.data.url),
);
logger.debug("Adding scrape jobs to Redis...");
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
jobs.map((x) => x.opts.jobId),
);
logger.debug("Adding scrape jobs to BullMQ...");
await addScrapeJobs(jobs);
if (req.body.webhook) {
logger.debug("Calling webhook with batch_scrape.started...", {
webhook: req.body.webhook
webhook: req.body.webhook,
});
await callWebhook(
req.auth.team_id,
@ -130,7 +130,7 @@ export async function batchScrapeController(
null,
req.body.webhook,
true,
"batch_scrape.started"
"batch_scrape.started",
);
}
@ -139,6 +139,6 @@ export async function batchScrapeController(
return res.status(200).json({
success: true,
id,
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
});
}

View File

@ -2,7 +2,7 @@ import { authenticateUser } from "../auth";
import {
ConcurrencyCheckParams,
ConcurrencyCheckResponse,
RequestWithAuth
RequestWithAuth,
} from "./types";
import { RateLimiterMode } from "../../types";
import { Response } from "express";
@ -10,14 +10,14 @@ import { redisConnection } from "../../services/queue-service";
// Basically just middleware and error wrapping
export async function concurrencyCheckController(
req: RequestWithAuth<ConcurrencyCheckParams, undefined, undefined>,
res: Response<ConcurrencyCheckResponse>
res: Response<ConcurrencyCheckResponse>,
) {
const concurrencyLimiterKey = "concurrency-limiter:" + req.auth.team_id;
const now = Date.now();
const activeJobsOfTeam = await redisConnection.zrangebyscore(
concurrencyLimiterKey,
now,
Infinity
Infinity,
);
return res
.status(200)

View File

@ -9,7 +9,7 @@ configDotenv();
export async function crawlCancelController(
req: RequestWithAuth<{ jobId: string }>,
res: Response
res: Response,
) {
try {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
@ -43,7 +43,7 @@ export async function crawlCancelController(
}
res.json({
status: "cancelled"
status: "cancelled",
});
} catch (error) {
Sentry.captureException(error);

View File

@ -6,7 +6,7 @@ import {
CrawlStatusResponse,
Document,
ErrorResponse,
RequestWithAuth
RequestWithAuth,
} from "./types";
import { WebSocket } from "ws";
import { v4 as uuidv4 } from "uuid";
@ -19,7 +19,7 @@ import {
getDoneJobsOrderedLength,
getThrottledJobs,
isCrawlFinished,
isCrawlFinishedLocked
isCrawlFinishedLocked,
} from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { getJob, getJobs } from "./crawl-status";
@ -64,7 +64,7 @@ function close(ws: WebSocket, code: number, msg: Message) {
async function crawlStatusWS(
ws: WebSocket,
req: RequestWithAuth<CrawlStatusParams, undefined, undefined>
req: RequestWithAuth<CrawlStatusParams, undefined, undefined>,
) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
@ -89,7 +89,10 @@ async function crawlStatusWS(
const notDoneJobIDs = jobIDs.filter((x) => !doneJobIDs.includes(x));
const jobStatuses = await Promise.all(
notDoneJobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)])
notDoneJobIDs.map(async (x) => [
x,
await getScrapeQueue().getJobState(x),
]),
);
const newlyDoneJobIDs: string[] = jobStatuses
.filter((x) => x[1] === "completed" || x[1] === "failed")
@ -102,7 +105,7 @@ async function crawlStatusWS(
if (job.returnvalue) {
send(ws, {
type: "document",
data: job.returnvalue
data: job.returnvalue,
});
} else {
return close(ws, 3000, { type: "error", error: job.failedReason });
@ -120,7 +123,9 @@ async function crawlStatusWS(
let jobIDs = await getCrawlJobs(req.params.jobId);
let jobStatuses = await Promise.all(
jobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)] as const)
jobIDs.map(
async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
),
);
const throttledJobs = new Set(await getThrottledJobs(req.auth.team_id));
@ -161,8 +166,8 @@ async function crawlStatusWS(
completed: doneJobIDs.length,
creditsUsed: jobIDs.length,
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
data: data
}
data: data,
},
});
if (status !== "scraping") {
@ -174,7 +179,7 @@ async function crawlStatusWS(
// Basically just middleware and error wrapping
export async function crawlStatusWSController(
ws: WebSocket,
req: RequestWithAuth<CrawlStatusParams, undefined, undefined>
req: RequestWithAuth<CrawlStatusParams, undefined, undefined>,
) {
try {
const auth = await authenticateUser(req, null, RateLimiterMode.CrawlStatus);
@ -182,7 +187,7 @@ export async function crawlStatusWSController(
if (!auth.success) {
return close(ws, 3000, {
type: "error",
error: auth.error
error: auth.error,
});
}
@ -201,7 +206,7 @@ export async function crawlStatusWSController(
verbose = JSON.stringify({
message: err.message,
name: err.name,
stack: err.stack
stack: err.stack,
});
}
}
@ -212,13 +217,13 @@ export async function crawlStatusWSController(
") -- ID " +
id +
" -- " +
verbose
verbose,
);
return close(ws, 1011, {
type: "error",
error:
"An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " +
id
id,
});
}
}

View File

@ -3,7 +3,7 @@ import {
CrawlStatusParams,
CrawlStatusResponse,
ErrorResponse,
RequestWithAuth
RequestWithAuth,
} from "./types";
import {
getCrawl,
@ -11,12 +11,12 @@ import {
getCrawlJobs,
getDoneJobsOrdered,
getDoneJobsOrderedLength,
getThrottledJobs
getThrottledJobs,
} from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import {
supabaseGetJobById,
supabaseGetJobsById
supabaseGetJobsById,
} from "../../lib/supabase-jobs";
import { configDotenv } from "dotenv";
import { Job, JobState } from "bullmq";
@ -70,7 +70,7 @@ export async function getJobs(ids: string[]) {
export async function crawlStatusController(
req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>,
res: Response<CrawlStatusResponse>,
isBatch = false
isBatch = false,
) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
@ -90,7 +90,9 @@ export async function crawlStatusController(
let jobIDs = await getCrawlJobs(req.params.jobId);
let jobStatuses = await Promise.all(
jobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)] as const)
jobIDs.map(
async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
),
);
const throttledJobs = new Set(await getThrottledJobs(req.auth.team_id));
@ -124,7 +126,7 @@ export async function crawlStatusController(
const doneJobsOrder = await getDoneJobsOrdered(
req.params.jobId,
start,
end ?? -1
end ?? -1,
);
let doneJobs: Job[] = [];
@ -158,7 +160,7 @@ export async function crawlStatusController(
if (job.returnvalue === undefined) {
logger.warn(
"Job was considered done, but returnvalue is undefined!",
{ jobId: job.id, state }
{ jobId: job.id, state },
);
continue;
}
@ -175,8 +177,8 @@ export async function crawlStatusController(
doneJobs = (
await Promise.all(
(await getJobs(doneJobsOrder)).map(async (x) =>
(await x.getState()) === "failed" ? null : x
)
(await x.getState()) === "failed" ? null : x,
),
)
).filter((x) => x !== null) as Job[];
}
@ -185,7 +187,7 @@ export async function crawlStatusController(
const protocol = process.env.ENV === "local" ? req.protocol : "https";
const nextURL = new URL(
`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`
`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`,
);
nextURL.searchParams.set("skip", (start + data.length).toString());
@ -215,6 +217,6 @@ export async function crawlStatusController(
status !== "scraping" && start + data.length === doneJobsLength // if there's not gonna be any documents after this
? undefined
: nextURL.href,
data: data
data: data,
});
}

View File

@ -5,7 +5,7 @@ import {
crawlRequestSchema,
CrawlResponse,
RequestWithAuth,
toLegacyCrawlerOptions
toLegacyCrawlerOptions,
} from "./types";
import {
addCrawlJob,
@ -14,7 +14,7 @@ import {
lockURL,
lockURLs,
saveCrawl,
StoredCrawl
StoredCrawl,
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
@ -26,7 +26,7 @@ import { scrapeOptions as scrapeOptionsSchema } from "./types";
export async function crawlController(
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
res: Response<CrawlResponse>
res: Response<CrawlResponse>,
) {
const preNormalizedBody = req.body;
req.body = crawlRequestSchema.parse(req.body);
@ -37,12 +37,12 @@ export async function crawlController(
module: "api/v1",
method: "crawlController",
teamId: req.auth.team_id,
plan: req.auth.plan
plan: req.auth.plan,
});
logger.debug("Crawl " + id + " starting", {
request: req.body,
originalRequest: preNormalizedBody,
account: req.account
account: req.account,
});
await logCrawl(id, req.auth.team_id);
@ -56,7 +56,7 @@ export async function crawlController(
const crawlerOptions = {
...req.body,
url: undefined,
scrapeOptions: undefined
scrapeOptions: undefined,
};
const scrapeOptions = req.body.scrapeOptions;
@ -86,7 +86,7 @@ export async function crawlController(
logger.debug("Determined limit: " + crawlerOptions.limit, {
remainingCredits,
bodyLimit: originalLimit,
originalBodyLimit: preNormalizedBody.limit
originalBodyLimit: preNormalizedBody.limit,
});
const sc: StoredCrawl = {
@ -96,7 +96,7 @@ export async function crawlController(
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure a contentful scrape; speed does not matter
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan
plan: req.auth.plan,
};
const crawler = crawlToCrawler(id, sc);
@ -105,7 +105,7 @@ export async function crawlController(
sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
} catch (e) {
logger.debug("Failed to get robots.txt (this is probably fine!)", {
error: e
error: e,
});
}
@ -117,7 +117,7 @@ export async function crawlController(
if (sitemap !== null && sitemap.length > 0) {
logger.debug("Using sitemap of length " + sitemap.length, {
sitemapLength: sitemap.length
sitemapLength: sitemap.length,
});
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
@ -127,7 +127,7 @@ export async function crawlController(
jobPriority = await getJobPriority({
plan: req.auth.plan,
team_id: req.auth.team_id,
basePriority: 21
basePriority: 21,
});
}
logger.debug("Using job priority " + jobPriority, { jobPriority });
@ -149,12 +149,12 @@ export async function crawlController(
crawl_id: id,
sitemapped: true,
webhook: req.body.webhook,
v1: true
v1: true,
},
opts: {
jobId: uuid,
priority: 20
}
priority: 20,
},
};
});
@ -162,18 +162,18 @@ export async function crawlController(
await lockURLs(
id,
sc,
jobs.map((x) => x.data.url)
jobs.map((x) => x.data.url),
);
logger.debug("Adding scrape jobs to Redis...");
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
jobs.map((x) => x.opts.jobId),
);
logger.debug("Adding scrape jobs to BullMQ...");
await getScrapeQueue().addBulk(jobs);
} else {
logger.debug("Sitemap not found or ignored.", {
ignoreSitemap: sc.crawlerOptions.ignoreSitemap
ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
});
logger.debug("Locking URL...");
@ -192,12 +192,12 @@ export async function crawlController(
origin: "api",
crawl_id: id,
webhook: req.body.webhook,
v1: true
v1: true,
},
{
priority: 15
priority: 15,
},
jobId
jobId,
);
logger.debug("Adding scrape job to BullMQ...", { jobId });
await addCrawlJob(id, jobId);
@ -206,7 +206,7 @@ export async function crawlController(
if (req.body.webhook) {
logger.debug("Calling webhook with crawl.started...", {
webhook: req.body.webhook
webhook: req.body.webhook,
});
await callWebhook(
req.auth.team_id,
@ -214,7 +214,7 @@ export async function crawlController(
null,
req.body.webhook,
true,
"crawl.started"
"crawl.started",
);
}
@ -223,6 +223,6 @@ export async function crawlController(
return res.status(200).json({
success: true,
id,
url: `${protocol}://${req.get("host")}/v1/crawl/${id}`
url: `${protocol}://${req.get("host")}/v1/crawl/${id}`,
});
}

View File

@ -6,7 +6,7 @@ import {
extractRequestSchema,
ExtractResponse,
MapDocument,
scrapeOptions
scrapeOptions,
} from "./types";
import { Document } from "../../lib/entities";
import Redis from "ioredis";
@ -43,7 +43,7 @@ const MIN_REQUIRED_LINKS = 1;
*/
export async function extractController(
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
res: Response<ExtractResponse>
res: Response<ExtractResponse>,
) {
const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true";
@ -81,7 +81,7 @@ export async function extractController(
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
ignoreSitemap: !selfHosted ? true : false,
includeMetadata: true,
includeSubdomains: req.body.includeSubdomains
includeSubdomains: req.body.includeSubdomains,
});
let mappedLinks = mapResults.links as MapDocument[];
@ -89,7 +89,8 @@ export async function extractController(
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
let mappedLinksRerank = mappedLinks.map(
(x) => `url: ${x.url}, title: ${x.title}, description: ${x.description}`
(x) =>
`url: ${x.url}, title: ${x.title}, description: ${x.description}`,
);
// Filter by path prefix if present
@ -103,31 +104,31 @@ export async function extractController(
const linksAndScores = await performRanking(
mappedLinksRerank,
mappedLinks.map((l) => l.url),
mapUrl
mapUrl,
);
// First try with high threshold
let filteredLinks = filterAndProcessLinks(
mappedLinks,
linksAndScores,
INITIAL_SCORE_THRESHOLD
INITIAL_SCORE_THRESHOLD,
);
// If we don't have enough high-quality links, try with lower threshold
if (filteredLinks.length < MIN_REQUIRED_LINKS) {
logger.info(
`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`
`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
);
filteredLinks = filterAndProcessLinks(
mappedLinks,
linksAndScores,
FALLBACK_SCORE_THRESHOLD
FALLBACK_SCORE_THRESHOLD,
);
if (filteredLinks.length === 0) {
// If still no results, take top N results regardless of score
logger.warn(
`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`
`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`,
);
filteredLinks = linksAndScores
.sort((a, b) => b.score - a.score)
@ -135,7 +136,9 @@ export async function extractController(
.map((x) => mappedLinks.find((link) => link.url === x.link))
.filter(
(x): x is MapDocument =>
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url)
x !== undefined &&
x.url !== undefined &&
!isUrlBlocked(x.url),
);
}
}
@ -161,7 +164,7 @@ export async function extractController(
return res.status(400).json({
success: false,
error:
"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs."
"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
});
}
@ -174,7 +177,7 @@ export async function extractController(
const jobPriority = await getJobPriority({
plan: req.auth.plan as PlanType,
team_id: req.auth.team_id,
basePriority: 10
basePriority: 10,
});
await addScrapeJob(
@ -186,11 +189,11 @@ export async function extractController(
internalOptions: {},
plan: req.auth.plan!,
origin,
is_scrape: true
is_scrape: true,
},
{},
jobId,
jobPriority
jobPriority,
);
try {
@ -208,12 +211,12 @@ export async function extractController(
) {
throw {
status: 408,
error: "Request timed out"
error: "Request timed out",
};
} else {
throw {
status: 500,
error: `(Internal server error) - ${e && e.message ? e.message : e}`
error: `(Internal server error) - ${e && e.message ? e.message : e}`,
};
}
}
@ -225,7 +228,7 @@ export async function extractController(
} catch (e) {
return res.status(e.status).json({
success: false,
error: e.error
error: e.error,
});
}
@ -237,11 +240,11 @@ export async function extractController(
"Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided. Here are the urls the user provided of which he wants to extract information from: " +
links.join(", "),
prompt: req.body.prompt,
schema: req.body.schema
schema: req.body.schema,
},
docs.map((x) => buildDocument(x)).join("\n"),
undefined,
true // isExtractEndpoint
true, // isExtractEndpoint
);
// TODO: change this later
@ -249,9 +252,9 @@ export async function extractController(
billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(
(error) => {
logger.error(
`Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}`
`Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}`,
);
}
},
);
let data = completions.extract ?? {};
@ -269,14 +272,14 @@ export async function extractController(
url: req.body.urls.join(", "),
scrapeOptions: req.body,
origin: req.body.origin ?? "api",
num_tokens: completions.numTokens ?? 0
num_tokens: completions.numTokens ?? 0,
});
return res.status(200).json({
success: true,
data: data,
scrape_id: id,
warning: warning
warning: warning,
});
}
@ -295,13 +298,13 @@ function filterAndProcessLinks(
score: number;
originalIndex: number;
}[],
threshold: number
threshold: number,
): MapDocument[] {
return linksAndScores
.filter((x) => x.score > threshold)
.map((x) => mappedLinks.find((link) => link.url === x.link))
.filter(
(x): x is MapDocument =>
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url)
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
);
}

View File

@ -4,7 +4,7 @@ import {
MapDocument,
mapRequestSchema,
RequestWithAuth,
scrapeOptions
scrapeOptions,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
@ -13,7 +13,7 @@ import {
checkAndUpdateURLForMap,
isSameDomain,
isSameSubdomain,
removeDuplicateUrls
removeDuplicateUrls,
} from "../../lib/validateUrl";
import { fireEngineMap } from "../../search/fireEngine";
import { billTeam } from "../../services/billing/credit_billing";
@ -49,7 +49,7 @@ export async function getMapResults({
plan,
origin,
includeMetadata = false,
allowExternalLinks
allowExternalLinks,
}: {
url: string;
search?: string;
@ -72,13 +72,13 @@ export async function getMapResults({
crawlerOptions: {
...crawlerOptions,
limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
scrapeOptions: undefined
scrapeOptions: undefined,
},
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
team_id: teamId,
createdAt: Date.now(),
plan: plan
plan: plan,
};
const crawler = crawlToCrawler(id, sc);
@ -114,7 +114,7 @@ export async function getMapResults({
const resultsPerPage = 100;
const maxPages = Math.ceil(
Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage
Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage,
);
const cacheKey = `fireEngineMap:${mapUrl}`;
@ -129,12 +129,12 @@ export async function getMapResults({
const fetchPage = async (page: number) => {
return fireEngineMap(mapUrl, {
numResults: resultsPerPage,
page: page
page: page,
});
};
pagePromises = Array.from({ length: maxPages }, (_, i) =>
fetchPage(i + 1)
fetchPage(i + 1),
);
allResults = await Promise.all(pagePromises);
@ -144,7 +144,7 @@ export async function getMapResults({
// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
ignoreSitemap ? null : crawler.tryGetSitemap(true),
...(cachedResult ? [] : pagePromises)
...(cachedResult ? [] : pagePromises),
]);
if (!cachedResult) {
@ -172,7 +172,7 @@ export async function getMapResults({
links = [
mapResults[0].url,
...mapResults.slice(1).map((x) => x.url),
...links
...links,
];
} else {
mapResults.map((x) => {
@ -218,13 +218,13 @@ export async function getMapResults({
links: includeMetadata ? mapResults : linksToReturn,
scrape_id: origin?.includes("website") ? id : undefined,
job_id: id,
time_taken: (new Date().getTime() - Date.now()) / 1000
time_taken: (new Date().getTime() - Date.now()) / 1000,
};
}
export async function mapController(
req: RequestWithAuth<{}, MapResponse, MapRequest>,
res: Response<MapResponse>
res: Response<MapResponse>,
) {
req.body = mapRequestSchema.parse(req.body);
@ -237,13 +237,13 @@ export async function mapController(
crawlerOptions: req.body,
origin: req.body.origin,
teamId: req.auth.team_id,
plan: req.auth.plan
plan: req.auth.plan,
});
// Bill the team
billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
logger.error(
`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`,
);
});
@ -261,13 +261,13 @@ export async function mapController(
crawlerOptions: {},
scrapeOptions: {},
origin: req.body.origin ?? "api",
num_tokens: 0
num_tokens: 0,
});
const response = {
success: true as const,
links: result.links,
scrape_id: result.scrape_id
scrape_id: result.scrape_id,
};
return res.status(200).json(response);

View File

@ -13,29 +13,29 @@ export async function scrapeStatusController(req: any, res: any) {
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
const allowedTeams = [
"41bdbfe1-0579-4d9b-b6d5-809f16be12f5",
"511544f2-2fce-4183-9c59-6c29b02c69b5"
"511544f2-2fce-4183-9c59-6c29b02c69b5",
];
if (!allowedTeams.includes(job?.team_id)) {
return res.status(403).json({
success: false,
error: "You are not allowed to access this resource."
error: "You are not allowed to access this resource.",
});
}
return res.status(200).json({
success: true,
data: job?.docs[0]
data: job?.docs[0],
});
} catch (error) {
if (error instanceof Error && error.message == "Too Many Requests") {
return res.status(429).json({
success: false,
error: "Rate limit exceeded. Please try again later."
error: "Rate limit exceeded. Please try again later.",
});
} else {
return res.status(500).json({
success: false,
error: "An unexpected error occurred."
error: "An unexpected error occurred.",
});
}
}

View File

@ -5,7 +5,7 @@ import {
RequestWithAuth,
ScrapeRequest,
scrapeRequestSchema,
ScrapeResponse
ScrapeResponse,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
@ -17,7 +17,7 @@ import { getScrapeQueue } from "../../services/queue-service";
export async function scrapeController(
req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
res: Response<ScrapeResponse>
res: Response<ScrapeResponse>,
) {
req.body = scrapeRequestSchema.parse(req.body);
let earlyReturn = false;
@ -30,7 +30,7 @@ export async function scrapeController(
const jobPriority = await getJobPriority({
plan: req.auth.plan as PlanType,
team_id: req.auth.team_id,
basePriority: 10
basePriority: 10,
});
await addScrapeJob(
@ -42,18 +42,18 @@ export async function scrapeController(
internalOptions: {},
plan: req.auth.plan!,
origin: req.body.origin,
is_scrape: true
is_scrape: true,
},
{},
jobId,
jobPriority
jobPriority,
);
const totalWait =
(req.body.waitFor ?? 0) +
(req.body.actions ?? []).reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 0) : 0) + a,
0
0,
);
let doc: Document;
@ -67,12 +67,12 @@ export async function scrapeController(
) {
return res.status(408).json({
success: false,
error: "Request timed out"
error: "Request timed out",
});
} else {
return res.status(500).json({
success: false,
error: `(Internal server error) - ${e && e.message ? e.message : e}`
error: `(Internal server error) - ${e && e.message ? e.message : e}`,
});
}
}
@ -99,10 +99,10 @@ export async function scrapeController(
billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(
(error) => {
logger.error(
`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`
`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`,
);
// Optionally, you could notify an admin or add to a retry queue here
}
},
);
if (!req.body.formats.includes("rawHtml")) {
@ -123,12 +123,12 @@ export async function scrapeController(
url: req.body.url,
scrapeOptions: req.body,
origin: origin,
num_tokens: numTokens
num_tokens: numTokens,
});
return res.status(200).json({
success: true,
data: doc,
scrape_id: origin?.includes("website") ? jobId : undefined
scrape_id: origin?.includes("website") ? jobId : undefined,
});
}

View File

@ -8,7 +8,7 @@ import {
ExtractorOptions,
PageOptions,
ScrapeActionContent,
Document as V0Document
Document as V0Document,
} from "../../lib/entities";
import { InternalOptions } from "../../scraper/scrapeURL";
@ -34,7 +34,7 @@ export const url = z.preprocess(
.regex(/^https?:\/\//, "URL uses unsupported protocol")
.refine(
(x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x),
"URL must have a valid top-level domain or be a valid path"
"URL must have a valid top-level domain or be a valid path",
)
.refine((x) => {
try {
@ -46,8 +46,8 @@ export const url = z.preprocess(
}, "Invalid URL")
.refine(
(x) => !isUrlBlocked(x as string),
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
)
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
),
);
const strictMessage =
@ -60,9 +60,9 @@ export const extractOptions = z
systemPrompt: z
.string()
.default(
"Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required."
"Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required.",
),
prompt: z.string().optional()
prompt: z.string().optional(),
})
.strict(strictMessage);
@ -74,7 +74,7 @@ export const actionsSchema = z.array(
.object({
type: z.literal("wait"),
milliseconds: z.number().int().positive().finite().optional(),
selector: z.string().optional()
selector: z.string().optional(),
})
.refine(
(data) =>
@ -82,38 +82,38 @@ export const actionsSchema = z.array(
!(data.milliseconds !== undefined && data.selector !== undefined),
{
message:
"Either 'milliseconds' or 'selector' must be provided, but not both."
}
"Either 'milliseconds' or 'selector' must be provided, but not both.",
},
),
z.object({
type: z.literal("click"),
selector: z.string()
selector: z.string(),
}),
z.object({
type: z.literal("screenshot"),
fullPage: z.boolean().default(false)
fullPage: z.boolean().default(false),
}),
z.object({
type: z.literal("write"),
text: z.string()
text: z.string(),
}),
z.object({
type: z.literal("press"),
key: z.string()
key: z.string(),
}),
z.object({
type: z.literal("scroll"),
direction: z.enum(["up", "down"]).optional().default("down"),
selector: z.string().optional()
selector: z.string().optional(),
}),
z.object({
type: z.literal("scrape")
type: z.literal("scrape"),
}),
z.object({
type: z.literal("executeJavascript"),
script: z.string()
})
])
script: z.string(),
}),
]),
);
export const scrapeOptions = z
@ -126,14 +126,14 @@ export const scrapeOptions = z
"links",
"screenshot",
"screenshot@fullPage",
"extract"
"extract",
])
.array()
.optional()
.default(["markdown"])
.refine(
(x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")),
"You may only specify either screenshot or screenshot@fullPage"
"You may only specify either screenshot or screenshot@fullPage",
),
headers: z.record(z.string(), z.string()).optional(),
includeTags: z.string().array().optional(),
@ -155,11 +155,11 @@ export const scrapeOptions = z
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
{
message:
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code."
}
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
},
)
.transform((val) => (val ? val.toUpperCase() : "US")),
languages: z.string().array().optional()
languages: z.string().array().optional(),
})
.optional(),
@ -173,15 +173,15 @@ export const scrapeOptions = z
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
{
message:
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code."
}
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
},
)
.transform((val) => (val ? val.toUpperCase() : "US")),
languages: z.string().array().optional()
languages: z.string().array().optional(),
})
.optional(),
skipTlsVerification: z.boolean().default(false),
removeBase64Images: z.boolean().default(true)
removeBase64Images: z.boolean().default(true),
})
.strict(strictMessage);
@ -199,7 +199,7 @@ export const extractV1Options = z
includeSubdomains: z.boolean().default(true),
allowExternalLinks: z.boolean().default(false),
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000)
timeout: z.number().int().positive().finite().safe().default(60000),
})
.strict(strictMessage);
@ -212,7 +212,7 @@ export const scrapeRequestSchema = scrapeOptions
.extend({
url,
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(30000)
timeout: z.number().int().positive().finite().safe().default(30000),
})
.strict(strictMessage)
.refine(
@ -226,8 +226,8 @@ export const scrapeRequestSchema = scrapeOptions
},
{
message:
"When 'extract' format is specified, 'extract' options must be provided, and vice versa"
}
"When 'extract' format is specified, 'extract' options must be provided, and vice versa",
},
)
.transform((obj) => {
if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
@ -250,9 +250,9 @@ export const webhookSchema = z.preprocess(
z
.object({
url: z.string().url(),
headers: z.record(z.string(), z.string()).default({})
headers: z.record(z.string(), z.string()).default({}),
})
.strict(strictMessage)
.strict(strictMessage),
);
export const batchScrapeRequestSchema = scrapeOptions
@ -260,7 +260,7 @@ export const batchScrapeRequestSchema = scrapeOptions
urls: url.array(),
origin: z.string().optional().default("api"),
webhook: webhookSchema.optional(),
appendToId: z.string().uuid().optional()
appendToId: z.string().uuid().optional(),
})
.strict(strictMessage)
.refine(
@ -274,8 +274,8 @@ export const batchScrapeRequestSchema = scrapeOptions
},
{
message:
"When 'extract' format is specified, 'extract' options must be provided, and vice versa"
}
"When 'extract' format is specified, 'extract' options must be provided, and vice versa",
},
);
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
@ -292,7 +292,7 @@ const crawlerOptions = z
ignoreRobotsTxt: z.boolean().default(false),
ignoreSitemap: z.boolean().default(false),
deduplicateSimilarURLs: z.boolean().default(true),
ignoreQueryParameters: z.boolean().default(false)
ignoreQueryParameters: z.boolean().default(false),
})
.strict(strictMessage);
@ -314,7 +314,7 @@ export const crawlRequestSchema = crawlerOptions
origin: z.string().optional().default("api"),
scrapeOptions: scrapeOptions.default({}),
webhook: webhookSchema.optional(),
limit: z.number().default(10000)
limit: z.number().default(10000),
})
.strict(strictMessage);
@ -340,7 +340,7 @@ export const mapRequestSchema = crawlerOptions
search: z.string().optional(),
ignoreSitemap: z.boolean().default(false),
sitemapOnly: z.boolean().default(false),
limit: z.number().min(1).max(5000).default(5000)
limit: z.number().min(1).max(5000).default(5000),
})
.strict(strictMessage);
@ -510,7 +510,7 @@ export type AuthCreditUsageChunk = {
export interface RequestWithMaybeACUC<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined
ResBody = undefined,
> extends Request<ReqParams, ReqBody, ResBody> {
acuc?: AuthCreditUsageChunk;
}
@ -518,7 +518,7 @@ export interface RequestWithMaybeACUC<
export interface RequestWithACUC<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined
ResBody = undefined,
> extends Request<ReqParams, ReqBody, ResBody> {
acuc: AuthCreditUsageChunk;
}
@ -526,7 +526,7 @@ export interface RequestWithACUC<
export interface RequestWithAuth<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined
ResBody = undefined,
> extends Request<ReqParams, ReqBody, ResBody> {
auth: AuthObject;
account?: Account;
@ -535,7 +535,7 @@ export interface RequestWithAuth<
export interface RequestWithMaybeAuth<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined
ResBody = undefined,
> extends RequestWithMaybeACUC<ReqParams, ReqBody, ResBody> {
auth?: AuthObject;
account?: Account;
@ -544,7 +544,7 @@ export interface RequestWithMaybeAuth<
export interface RequestWithAuth<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined
ResBody = undefined,
> extends RequestWithACUC<ReqParams, ReqBody, ResBody> {
auth: AuthObject;
account?: Account;
@ -569,7 +569,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
ignoreRobotsTxt: x.ignoreRobotsTxt,
ignoreSitemap: x.ignoreSitemap,
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters
ignoreQueryParameters: x.ignoreQueryParameters,
};
}
@ -589,11 +589,11 @@ export function fromLegacyCrawlerOptions(x: any): {
ignoreRobotsTxt: x.ignoreRobotsTxt,
ignoreSitemap: x.ignoreSitemap,
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters
ignoreQueryParameters: x.ignoreQueryParameters,
}),
internalOptions: {
v0CrawlOnlyUrls: x.returnOnlyUrls
}
v0CrawlOnlyUrls: x.returnOnlyUrls,
},
};
}
@ -605,7 +605,7 @@ export interface MapDocument {
export function fromLegacyScrapeOptions(
pageOptions: PageOptions,
extractorOptions: ExtractorOptions | undefined,
timeout: number | undefined
timeout: number | undefined,
): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
return {
scrapeOptions: scrapeOptions.parse({
@ -621,7 +621,7 @@ export function fromLegacyScrapeOptions(
extractorOptions.mode.includes("llm-extraction")
? ("extract" as const)
: null,
"links"
"links",
].filter((x) => x !== null),
waitFor: pageOptions.waitFor,
headers: pageOptions.headers,
@ -646,16 +646,16 @@ export function fromLegacyScrapeOptions(
? {
systemPrompt: extractorOptions.extractionPrompt,
prompt: extractorOptions.userPrompt,
schema: extractorOptions.extractionSchema
schema: extractorOptions.extractionSchema,
}
: undefined,
mobile: pageOptions.mobile
mobile: pageOptions.mobile,
}),
internalOptions: {
atsv: pageOptions.atsv,
v0DisableJsDom: pageOptions.disableJsDom,
v0UseFastMode: pageOptions.useFastMode
}
v0UseFastMode: pageOptions.useFastMode,
},
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
};
}
@ -664,12 +664,12 @@ export function fromLegacyCombo(
pageOptions: PageOptions,
extractorOptions: ExtractorOptions | undefined,
timeout: number | undefined,
crawlerOptions: any
crawlerOptions: any,
): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(
pageOptions,
extractorOptions,
timeout
timeout,
);
const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
@ -677,7 +677,7 @@ export function fromLegacyCombo(
export function toLegacyDocument(
document: Document,
internalOptions: InternalOptions
internalOptions: InternalOptions,
): V0Document | { url: string } {
if (internalOptions.v0CrawlOnlyUrls) {
return { url: document.metadata.sourceURL! };
@ -696,9 +696,9 @@ export function toLegacyDocument(
statusCode: undefined,
pageError: document.metadata.error,
pageStatusCode: document.metadata.statusCode,
screenshot: document.screenshot
screenshot: document.screenshot,
},
actions: document.actions,
warning: document.warning
warning: document.warning,
};
}

View File

@ -46,12 +46,12 @@ serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
queues: [new BullAdapter(getScrapeQueue())],
serverAdapter: serverAdapter
serverAdapter: serverAdapter,
});
app.use(
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
serverAdapter.getRouter()
serverAdapter.getRouter(),
);
app.get("/", (req, res) => {
@ -75,7 +75,7 @@ function startServer(port = DEFAULT_PORT) {
const server = app.listen(Number(port), HOST, () => {
logger.info(`Worker ${process.pid} listening on port ${port}`);
logger.info(
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`,
);
});
@ -103,7 +103,7 @@ app.get(`/serverHealthCheck`, async (req, res) => {
const noWaitingJobs = waitingJobs === 0;
// 200 if no active jobs, 503 if there are active jobs
return res.status(noWaitingJobs ? 200 : 500).json({
waitingJobs
waitingJobs,
});
} catch (error) {
Sentry.captureException(error);
@ -120,7 +120,7 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
const getWaitingJobsCount = async () => {
const scrapeQueue = getScrapeQueue();
const [waitingJobsCount] = await Promise.all([
scrapeQueue.getWaitingCount()
scrapeQueue.getWaitingCount(),
]);
return waitingJobsCount;
@ -140,15 +140,15 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
const message = {
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
timeout / 60000
} minute(s).`
} minute(s).`,
};
const response = await fetch(slackWebhookUrl, {
method: "POST",
headers: {
"Content-Type": "application/json"
"Content-Type": "application/json",
},
body: JSON.stringify(message)
body: JSON.stringify(message),
});
if (!response.ok) {
@ -176,7 +176,7 @@ app.use(
err: unknown,
req: Request<{}, ErrorResponse, undefined>,
res: Response<ErrorResponse>,
next: NextFunction
next: NextFunction,
) => {
if (err instanceof ZodError) {
if (
@ -192,7 +192,7 @@ app.use(
} else {
next(err);
}
}
},
);
Sentry.setupExpressErrorHandler(app);
@ -202,7 +202,7 @@ app.use(
err: unknown,
req: Request<{}, ErrorResponse, undefined>,
res: ResponseWithSentry<ErrorResponse>,
next: NextFunction
next: NextFunction,
) => {
if (
err instanceof SyntaxError &&
@ -222,7 +222,7 @@ app.use(
verbose = JSON.stringify({
message: err.message,
name: err.name,
stack: err.stack
stack: err.stack,
});
}
}
@ -233,15 +233,15 @@ app.use(
") -- ID " +
id +
" -- " +
verbose
verbose,
);
res.status(500).json({
success: false,
error:
"An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " +
id
id,
});
}
},
);
logger.info(`Worker ${process.pid} started`);

View File

@ -10,7 +10,7 @@ import { logger } from "../logger";
export async function generateCompletions(
documents: Document[],
extractionOptions: ExtractorOptions | undefined,
mode: "markdown" | "raw-html"
mode: "markdown" | "raw-html",
): Promise<Document[]> {
// const schema = zodToJsonSchema(options.schema)
@ -32,7 +32,7 @@ export async function generateCompletions(
schema: schema,
prompt: prompt,
systemPrompt: systemPrompt,
mode: mode
mode: mode,
});
// Validate the JSON output against the schema using AJV
if (schema) {
@ -43,8 +43,8 @@ export async function generateCompletions(
`JSON parsing error(s): ${validate.errors
?.map((err) => err.message)
.join(
", "
)}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
", ",
)}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`,
);
}
}
@ -57,7 +57,7 @@ export async function generateCompletions(
default:
throw new Error("Invalid client");
}
})
}),
);
return completions;

View File

@ -14,7 +14,7 @@ const defaultPrompt =
function prepareOpenAIDoc(
document: Document,
mode: "markdown" | "raw-html"
mode: "markdown" | "raw-html",
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
let markdown = document.markdown;
@ -50,7 +50,7 @@ export async function generateOpenAICompletions({
systemPrompt = defaultPrompt,
prompt,
temperature,
mode
mode,
}: {
client: OpenAI;
model?: string;
@ -68,7 +68,7 @@ export async function generateOpenAICompletions({
return {
...document,
warning:
"LLM extraction was not performed since the document's content is empty or missing."
"LLM extraction was not performed since the document's content is empty or missing.",
};
}
const [content, numTokens] = preparedDoc;
@ -81,21 +81,21 @@ export async function generateOpenAICompletions({
messages: [
{
role: "system",
content: systemPrompt
content: systemPrompt,
},
{ role: "user", content },
{
role: "user",
content: `Transform the above content into structured json output based on the following user request: ${prompt}`
}
content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
},
],
response_format: { type: "json_object" },
temperature
temperature,
});
try {
llmExtraction = JSON.parse(
(jsonCompletion.choices[0].message.content ?? "").trim()
(jsonCompletion.choices[0].message.content ?? "").trim(),
);
} catch (e) {
throw new Error("Invalid JSON");
@ -106,9 +106,9 @@ export async function generateOpenAICompletions({
messages: [
{
role: "system",
content: systemPrompt
content: systemPrompt,
},
{ role: "user", content }
{ role: "user", content },
],
tools: [
{
@ -116,12 +116,12 @@ export async function generateOpenAICompletions({
function: {
name: "extract_content",
description: "Extracts the content from the given webpage(s)",
parameters: schema
}
}
parameters: schema,
},
},
],
tool_choice: { type: "function", function: { name: "extract_content" } },
temperature
temperature,
});
const c = completion.choices[0].message.tool_calls[0].function.arguments;
@ -140,6 +140,6 @@ export async function generateOpenAICompletions({
warning:
numTokens > maxTokens
? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attempted: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
: undefined
: undefined,
};
}

View File

@ -31,16 +31,16 @@ describe("parseMarkdown", () => {
{ html: "<html><p>Unclosed tag", expected: "Unclosed tag" },
{
html: "<div><span>Missing closing div",
expected: "Missing closing div"
expected: "Missing closing div",
},
{
html: "<p><strong>Wrong nesting</em></strong></p>",
expected: "**Wrong nesting**"
expected: "**Wrong nesting**",
},
{
html: '<a href="http://example.com">Link without closing tag',
expected: "[Link without closing tag](http://example.com)"
}
expected: "[Link without closing tag](http://example.com)",
},
];
for (const { html, expected } of invalidHtmls) {

View File

@ -1,7 +1,7 @@
import {
getJobPriority,
addJobPriority,
deleteJobPriority
deleteJobPriority,
} from "../job-priority";
import { redisConnection } from "../../services/queue-service";
import { PlanType } from "../../types";
@ -11,8 +11,8 @@ jest.mock("../../services/queue-service", () => ({
sadd: jest.fn(),
srem: jest.fn(),
scard: jest.fn(),
expire: jest.fn()
}
expire: jest.fn(),
},
}));
describe("Job Priority Tests", () => {
@ -26,11 +26,11 @@ describe("Job Priority Tests", () => {
await addJobPriority(team_id, job_id);
expect(redisConnection.sadd).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
job_id
job_id,
);
expect(redisConnection.expire).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
60
60,
);
});
@ -40,7 +40,7 @@ describe("Job Priority Tests", () => {
await deleteJobPriority(team_id, job_id);
expect(redisConnection.srem).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
job_id
job_id,
);
});
@ -89,7 +89,7 @@ describe("Job Priority Tests", () => {
await addJobPriority(team_id, job_id1);
expect(redisConnection.expire).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
60
60,
);
// Clear the mock calls
@ -99,7 +99,7 @@ describe("Job Priority Tests", () => {
await addJobPriority(team_id, job_id2);
expect(redisConnection.expire).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
60
60,
);
});
@ -112,7 +112,7 @@ describe("Job Priority Tests", () => {
await addJobPriority(team_id, job_id);
expect(redisConnection.expire).toHaveBeenCalledWith(
`limit_team_id:${team_id}`,
60
60,
);
// Fast-forward time by 59 seconds

View File

@ -1,7 +1,7 @@
export async function batchProcess<T>(
array: T[],
batchSize: number,
asyncFunction: (item: T, index: number) => Promise<void>
asyncFunction: (item: T, index: number) => Promise<void>,
): Promise<void> {
const batches: T[][] = [];
for (let i = 0; i < array.length; i += batchSize) {

View File

@ -6,14 +6,14 @@ const logger = _logger.child({ module: "cache" });
export const cacheRedis = process.env.CACHE_REDIS_URL
? new IORedis(process.env.CACHE_REDIS_URL, {
maxRetriesPerRequest: null
maxRetriesPerRequest: null,
})
: null;
export function cacheKey(
url: string,
scrapeOptions: ScrapeOptions,
internalOptions: InternalOptions
internalOptions: InternalOptions,
): string | null {
if (!cacheRedis) return null;
@ -49,7 +49,7 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) {
}
export async function getEntryFromCache(
key: string
key: string,
): Promise<CacheEntry | null> {
if (!cacheRedis) return null;

View File

@ -14,37 +14,37 @@ export function getConcurrencyLimitMax(plan: string): number {
export async function cleanOldConcurrencyLimitEntries(
team_id: string,
now: number = Date.now()
now: number = Date.now(),
) {
await redisConnection.zremrangebyscore(constructKey(team_id), -Infinity, now);
}
export async function getConcurrencyLimitActiveJobs(
team_id: string,
now: number = Date.now()
now: number = Date.now(),
): Promise<string[]> {
return await redisConnection.zrangebyscore(
constructKey(team_id),
now,
Infinity
Infinity,
);
}
export async function pushConcurrencyLimitActiveJob(
team_id: string,
id: string,
now: number = Date.now()
now: number = Date.now(),
) {
await redisConnection.zadd(
constructKey(team_id),
now + stalledJobTimeoutMs,
id
id,
);
}
export async function removeConcurrencyLimitActiveJob(
team_id: string,
id: string
id: string,
) {
await redisConnection.zrem(constructKey(team_id), id);
}
@ -57,7 +57,7 @@ export type ConcurrencyLimitedJob = {
};
export async function takeConcurrencyLimitedJob(
team_id: string
team_id: string,
): Promise<ConcurrencyLimitedJob | null> {
const res = await redisConnection.zmpop(1, constructQueueKey(team_id), "MIN");
if (res === null || res === undefined) {
@ -69,11 +69,11 @@ export async function takeConcurrencyLimitedJob(
export async function pushConcurrencyLimitedJob(
team_id: string,
job: ConcurrencyLimitedJob
job: ConcurrencyLimitedJob,
) {
await redisConnection.zadd(
constructQueueKey(team_id),
job.priority ?? 1,
JSON.stringify(job)
JSON.stringify(job),
);
}

View File

@ -3,7 +3,7 @@ import { generateURLPermutations } from "./crawl-redis";
describe("generateURLPermutations", () => {
it("generates permutations correctly", () => {
const bareHttps = generateURLPermutations("https://firecrawl.dev").map(
(x) => x.href
(x) => x.href,
);
expect(bareHttps.length).toBe(4);
expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
@ -12,7 +12,7 @@ describe("generateURLPermutations", () => {
expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);
const bareHttp = generateURLPermutations("http://firecrawl.dev").map(
(x) => x.href
(x) => x.href,
);
expect(bareHttp.length).toBe(4);
expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
@ -21,7 +21,7 @@ describe("generateURLPermutations", () => {
expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);
const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(
(x) => x.href
(x) => x.href,
);
expect(wwwHttps.length).toBe(4);
expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
@ -30,7 +30,7 @@ describe("generateURLPermutations", () => {
expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);
const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(
(x) => x.href
(x) => x.href,
);
expect(wwwHttp.length).toBe(4);
expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);

View File

@ -24,7 +24,7 @@ export async function saveCrawl(id: string, crawl: StoredCrawl) {
method: "saveCrawl",
crawlId: id,
teamId: crawl.team_id,
plan: crawl.plan
plan: crawl.plan,
});
await redisConnection.set("crawl:" + id, JSON.stringify(crawl));
await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX");
@ -53,7 +53,7 @@ export async function addCrawlJob(id: string, job_id: string) {
jobId: job_id,
module: "crawl-redis",
method: "addCrawlJob",
crawlId: id
crawlId: id,
});
await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
@ -64,7 +64,7 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
jobIds: job_ids,
module: "crawl-redis",
method: "addCrawlJobs",
crawlId: id
crawlId: id,
});
await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
@ -73,19 +73,19 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
export async function addCrawlJobDone(
id: string,
job_id: string,
success: boolean
success: boolean,
) {
_logger.debug("Adding done crawl job to Redis...", {
jobId: job_id,
module: "crawl-redis",
method: "addCrawlJobDone",
crawlId: id
crawlId: id,
});
await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
await redisConnection.expire(
"crawl:" + id + ":jobs_done",
24 * 60 * 60,
"NX"
"NX",
);
if (success) {
@ -93,7 +93,7 @@ export async function addCrawlJobDone(
await redisConnection.expire(
"crawl:" + id + ":jobs_done_ordered",
24 * 60 * 60,
"NX"
"NX",
);
}
}
@ -105,12 +105,12 @@ export async function getDoneJobsOrderedLength(id: string): Promise<number> {
export async function getDoneJobsOrdered(
id: string,
start = 0,
end = -1
end = -1,
): Promise<string[]> {
return await redisConnection.lrange(
"crawl:" + id + ":jobs_done_ordered",
start,
end
end,
);
}
@ -130,7 +130,7 @@ export async function finishCrawl(id: string) {
_logger.debug("Marking crawl as finished.", {
module: "crawl-redis",
method: "finishCrawl",
crawlId: id
crawlId: id,
});
const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");
if (set === 1) {
@ -141,7 +141,7 @@ export async function finishCrawl(id: string) {
_logger.debug("Crawl can not be finished yet, not marking as finished.", {
module: "crawl-redis",
method: "finishCrawl",
crawlId: id
crawlId: id,
});
}
}
@ -154,7 +154,7 @@ export async function getThrottledJobs(teamId: string): Promise<string[]> {
return await redisConnection.zrangebyscore(
"concurrency-limiter:" + teamId + ":throttled",
Date.now(),
Infinity
Infinity,
);
}
@ -201,7 +201,7 @@ export function generateURLPermutations(url: string | URL): URL[] {
export async function lockURL(
id: string,
sc: StoredCrawl,
url: string
url: string,
): Promise<boolean> {
let logger = _logger.child({
crawlId: id,
@ -209,7 +209,7 @@ export async function lockURL(
method: "lockURL",
preNormalizedURL: url,
teamId: sc.team_id,
plan: sc.plan
plan: sc.plan,
});
if (typeof sc.crawlerOptions?.limit === "number") {
@ -218,7 +218,7 @@ export async function lockURL(
sc.crawlerOptions.limit
) {
logger.debug(
"Crawl has already hit visited_unique limit, not locking URL."
"Crawl has already hit visited_unique limit, not locking URL.",
);
return false;
}
@ -231,7 +231,7 @@ export async function lockURL(
await redisConnection.expire(
"crawl:" + id + ":visited_unique",
24 * 60 * 60,
"NX"
"NX",
);
let res: boolean;
@ -242,7 +242,7 @@ export async function lockURL(
// logger.debug("Adding URL permutations for URL " + JSON.stringify(url) + "...", { permutations });
const x = await redisConnection.sadd(
"crawl:" + id + ":visited",
...permutations
...permutations,
);
res = x === permutations.length;
}
@ -250,7 +250,7 @@ export async function lockURL(
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, {
res
res,
});
return res;
}
@ -259,7 +259,7 @@ export async function lockURL(
export async function lockURLs(
id: string,
sc: StoredCrawl,
urls: string[]
urls: string[],
): Promise<boolean> {
urls = urls.map((url) => normalizeURL(url, sc));
const logger = _logger.child({
@ -267,7 +267,7 @@ export async function lockURLs(
module: "crawl-redis",
method: "lockURL",
teamId: sc.team_id,
plan: sc.plan
plan: sc.plan,
});
// Add to visited_unique set
@ -276,7 +276,7 @@ export async function lockURLs(
await redisConnection.expire(
"crawl:" + id + ":visited_unique",
24 * 60 * 60,
"NX"
"NX",
);
let res: boolean;
@ -285,12 +285,12 @@ export async function lockURLs(
res = x === urls.length;
} else {
const allPermutations = urls.flatMap((url) =>
generateURLPermutations(url).map((x) => x.href)
generateURLPermutations(url).map((x) => x.href),
);
logger.debug("Adding " + allPermutations.length + " URL permutations...");
const x = await redisConnection.sadd(
"crawl:" + id + ":visited",
...allPermutations
...allPermutations,
);
res = x === allPermutations.length;
}
@ -304,7 +304,7 @@ export async function lockURLs(
export function crawlToCrawler(
id: string,
sc: StoredCrawl,
newBase?: string
newBase?: string,
): WebCrawler {
const crawler = new WebCrawler({
jobId: id,
@ -315,7 +315,7 @@ export function crawlToCrawler(
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
maxCrawledDepth: getAdjustedMaxDepth(
sc.originUrl!,
sc.crawlerOptions?.maxDepth ?? 10
sc.crawlerOptions?.maxDepth ?? 10,
),
limit: sc.crawlerOptions?.limit ?? 10000,
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
@ -323,7 +323,7 @@ export function crawlToCrawler(
allowExternalContentLinks:
sc.crawlerOptions?.allowExternalContentLinks ?? false,
allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false
ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
});
if (sc.robots !== undefined) {

View File

@ -8,7 +8,7 @@ export class CustomError extends Error {
statusCode: number,
status: string,
message: string = "",
dataIngestionJob?: any
dataIngestionJob?: any,
) {
super(message);
this.statusCode = statusCode;

View File

@ -8,21 +8,21 @@ export const defaultPageOptions = {
waitFor: 0,
screenshot: false,
fullPageScreenshot: false,
parsePDF: true
parsePDF: true,
};
export const defaultCrawlerOptions = {
allowBackwardCrawling: false,
limit: 10000
limit: 10000,
};
export const defaultCrawlPageOptions = {
onlyMainContent: false,
includeHtml: false,
removeTags: [],
parsePDF: true
parsePDF: true,
};
export const defaultExtractorOptions = {
mode: "markdown"
mode: "markdown",
};

View File

@ -1,21 +1,21 @@
import { CohereClient } from "cohere-ai";
import { MapDocument } from "../../controllers/v1/types";
const cohere = new CohereClient({
token: process.env.COHERE_API_KEY
token: process.env.COHERE_API_KEY,
});
export async function rerankDocuments(
documents: (string | Record<string, string>)[],
query: string,
topN = 3,
model = "rerank-english-v3.0"
model = "rerank-english-v3.0",
) {
const rerank = await cohere.v2.rerank({
documents,
query,
topN,
model,
returnDocuments: true
returnDocuments: true,
});
return rerank.results
@ -23,6 +23,6 @@ export async function rerankDocuments(
.map((x) => ({
document: x.document,
index: x.index,
relevanceScore: x.relevanceScore
relevanceScore: x.relevanceScore,
}));
}

View File

@ -13,7 +13,7 @@ const goExecutablePath = join(
process.cwd(),
"sharedLibs",
"go-html-to-md",
"html-to-markdown.so"
"html-to-markdown.so",
);
class GoMarkdownConverter {
@ -51,7 +51,7 @@ class GoMarkdownConverter {
}
export async function parseMarkdown(
html: string | null | undefined
html: string | null | undefined,
): Promise<string> {
if (!html) {
return "";
@ -74,12 +74,12 @@ export async function parseMarkdown(
) {
Sentry.captureException(error);
logger.error(
`Error converting HTML to Markdown with Go parser: ${error}`
`Error converting HTML to Markdown with Go parser: ${error}`,
);
} else {
logger.warn(
"Tried to use Go parser, but it doesn't exist in the file system.",
{ goExecutablePath }
{ goExecutablePath },
);
}
}
@ -101,7 +101,7 @@ export async function parseMarkdown(
var href = node.getAttribute("href").trim();
var title = node.title ? ' "' + node.title + '"' : "";
return "[" + content.trim() + "](" + href + title + ")\n";
}
},
});
var gfm = turndownPluginGfm.gfm;
turndownService.use(gfm);
@ -145,7 +145,7 @@ function removeSkipToContentLinks(markdownContent: string): string {
// Remove [Skip to Content](#page) and [Skip to content](#skip)
const newMarkdownContent = markdownContent.replace(
/\[Skip to Content\]\(#[^\)]*\)/gi,
""
"",
);
return newMarkdownContent;
}

View File

@ -31,7 +31,7 @@ export async function deleteJobPriority(team_id, job_id) {
export async function getJobPriority({
plan,
team_id,
basePriority = 10
basePriority = 10,
}: {
plan: PlanType | undefined;
team_id: string;
@ -91,12 +91,12 @@ export async function getJobPriority({
} else {
// If not, we keep base priority + planModifier
return Math.ceil(
basePriority + Math.ceil((setLength - bucketLimit) * planModifier)
basePriority + Math.ceil((setLength - bucketLimit) * planModifier),
);
}
} catch (e) {
logger.error(
`Get job priority failed: ${team_id}, ${plan}, ${basePriority}`
`Get job priority failed: ${team_id}, ${plan}, ${basePriority}`,
);
return basePriority;
}

View File

@ -14,14 +14,14 @@ const logFormat = winston.format.printf(
name: value.name,
message: value.message,
stack: value.stack,
cause: value.cause
cause: value.cause,
};
} else {
return value;
}
})
: ""
}`
}`,
);
export const logger = winston.createLogger({
@ -34,26 +34,26 @@ export const logger = winston.createLogger({
name: value.name,
message: value.message,
stack: value.stack,
cause: value.cause
cause: value.cause,
};
} else {
return value;
}
}
},
}),
transports: [
new winston.transports.Console({
format: winston.format.combine(
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
winston.format.metadata({
fillExcept: ["message", "level", "timestamp"]
fillExcept: ["message", "level", "timestamp"],
}),
...((process.env.ENV === "production" &&
process.env.SENTRY_ENVIRONMENT === "dev") ||
process.env.ENV !== "production"
? [winston.format.colorize(), logFormat]
: [])
)
})
]
: []),
),
}),
],
});

View File

@ -6,10 +6,10 @@ export function performCosineSimilarity(links: string[], searchQuery: string) {
const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
const magnitude1 = Math.sqrt(
vec1.reduce((sum, val) => sum + val * val, 0)
vec1.reduce((sum, val) => sum + val * val, 0),
);
const magnitude2 = Math.sqrt(
vec2.reduce((sum, val) => sum + val * val, 0)
vec2.reduce((sum, val) => sum + val * val, 0),
);
if (magnitude1 === 0 || magnitude2 === 0) return 0;
return dotProduct / (magnitude1 * magnitude2);

View File

@ -5,13 +5,13 @@ describe("performRanking", () => {
const linksWithContext = [
"url: https://example.com/dogs, title: All about dogs, description: Learn about different dog breeds",
"url: https://example.com/cats, title: Cat care guide, description: Everything about cats",
"url: https://example.com/pets, title: General pet care, description: Care for all types of pets"
"url: https://example.com/pets, title: General pet care, description: Care for all types of pets",
];
const links = [
"https://example.com/dogs",
"https://example.com/cats",
"https://example.com/pets"
"https://example.com/pets",
];
const searchQuery = "cats training";
@ -50,7 +50,7 @@ describe("performRanking", () => {
it("should maintain original order for equal scores", async () => {
const linksWithContext = [
"url: https://example.com/1, title: Similar content A, description: test",
"url: https://example.com/2, title: Similar content B, description: test"
"url: https://example.com/2, title: Similar content B, description: test",
];
const links = ["https://example.com/1", "https://example.com/2"];

View File

@ -5,14 +5,14 @@ import OpenAI from "openai";
configDotenv();
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY
apiKey: process.env.OPENAI_API_KEY,
});
async function getEmbedding(text: string) {
const embedding = await openai.embeddings.create({
model: "text-embedding-ada-002",
input: text,
encoding_format: "float"
encoding_format: "float",
});
return embedding.data[0].embedding;
@ -39,7 +39,7 @@ const textToVector = (searchQuery: string, text: string): number[] => {
async function performRanking(
linksWithContext: string[],
links: string[],
searchQuery: string
searchQuery: string,
) {
try {
// Handle invalid inputs
@ -64,7 +64,7 @@ async function performRanking(
link: links[index],
linkWithContext,
score,
originalIndex: index
originalIndex: index,
};
} catch (err) {
// If embedding fails for a link, return with score 0
@ -72,10 +72,10 @@ async function performRanking(
link: links[index],
linkWithContext,
score: 0,
originalIndex: index
originalIndex: index,
};
}
})
}),
);
// Sort links based on similarity scores while preserving original order for equal scores

View File

@ -56,7 +56,7 @@ export class ScrapeEvents {
.insert({
job_id: jobId,
type: content.type,
content: content
content: content,
// created_at
})
.select()
@ -73,7 +73,7 @@ export class ScrapeEvents {
static async updateScrapeResult(
logId: number | null,
result: ScrapeScrapeEvent["result"]
result: ScrapeScrapeEvent["result"],
) {
if (logId === null) return;
@ -86,8 +86,8 @@ export class ScrapeEvents {
.update({
content: {
...previousLog.content,
result
}
result,
},
})
.eq("id", logId);
} catch (error) {
@ -100,7 +100,7 @@ export class ScrapeEvents {
await this.insert(((job as any).id ? (job as any).id : job) as string, {
type: "queue",
event,
worker: process.env.FLY_MACHINE_ID
worker: process.env.FLY_MACHINE_ID,
});
} catch (error) {
logger.error(`Error logging job event: ${error}`);

File diff suppressed because it is too large

View File

@ -20,7 +20,7 @@ describe("isSameDomain", () => {
it("should return true for a subdomain with different protocols", () => {
const result = isSameDomain(
"https://sub.example.com",
"http://example.com"
"http://example.com",
);
expect(result).toBe(true);
});
@ -35,7 +35,7 @@ describe("isSameDomain", () => {
it("should return true for a subdomain with www prefix", () => {
const result = isSameDomain(
"http://www.sub.example.com",
"http://example.com"
"http://example.com",
);
expect(result).toBe(true);
});
@ -43,7 +43,7 @@ describe("isSameDomain", () => {
it("should return true for the same domain with www prefix", () => {
const result = isSameDomain(
"http://docs.s.s.example.com",
"http://example.com"
"http://example.com",
);
expect(result).toBe(true);
});
@ -53,7 +53,7 @@ describe("isSameSubdomain", () => {
it("should return false for a subdomain", () => {
const result = isSameSubdomain(
"http://example.com",
"http://docs.example.com"
"http://docs.example.com",
);
expect(result).toBe(false);
});
@ -61,7 +61,7 @@ describe("isSameSubdomain", () => {
it("should return true for the same subdomain", () => {
const result = isSameSubdomain(
"http://docs.example.com",
"http://docs.example.com"
"http://docs.example.com",
);
expect(result).toBe(true);
});
@ -69,7 +69,7 @@ describe("isSameSubdomain", () => {
it("should return false for different subdomains", () => {
const result = isSameSubdomain(
"http://docs.example.com",
"http://blog.example.com"
"http://blog.example.com",
);
expect(result).toBe(false);
});
@ -89,7 +89,7 @@ describe("isSameSubdomain", () => {
it("should return true for the same subdomain with different protocols", () => {
const result = isSameSubdomain(
"https://docs.example.com",
"http://docs.example.com"
"http://docs.example.com",
);
expect(result).toBe(true);
});
@ -97,7 +97,7 @@ describe("isSameSubdomain", () => {
it("should return true for the same subdomain with www prefix", () => {
const result = isSameSubdomain(
"http://www.docs.example.com",
"http://docs.example.com"
"http://docs.example.com",
);
expect(result).toBe(true);
});
@ -105,7 +105,7 @@ describe("isSameSubdomain", () => {
it("should return false for a subdomain with www prefix and different subdomain", () => {
const result = isSameSubdomain(
"http://www.docs.example.com",
"http://blog.example.com"
"http://blog.example.com",
);
expect(result).toBe(false);
});
@ -117,7 +117,7 @@ describe("removeDuplicateUrls", () => {
"http://example.com",
"https://example.com",
"http://www.example.com",
"https://www.example.com"
"https://www.example.com",
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual(["https://example.com"]);
@ -128,14 +128,14 @@ describe("removeDuplicateUrls", () => {
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page1?param=1",
"https://example.com/page1#section1"
"https://example.com/page1#section1",
];
const result = removeDuplicateUrls(urls);
expect(result).toEqual([
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page1?param=1",
"https://example.com/page1#section1"
"https://example.com/page1#section1",
]);
});

View File

@ -8,7 +8,7 @@ let warningCount = 0;
export function withAuth<T, U extends any[]>(
originalFunction: (...args: U) => Promise<T>,
mockSuccess: T
mockSuccess: T,
) {
return async function (...args: U): Promise<T> {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";

View File

@ -2,7 +2,7 @@ import { Job } from "bullmq";
import {
WebScraperOptions,
RunWebScraperParams,
RunWebScraperResult
RunWebScraperResult,
} from "../types";
import { billTeam } from "../services/billing/credit_billing";
import { Document } from "../controllers/v1/types";
@ -13,14 +13,14 @@ import { configDotenv } from "dotenv";
import {
EngineResultsTracker,
scrapeURL,
ScrapeUrlResponse
ScrapeUrlResponse,
} from "../scraper/scrapeURL";
import { Engine } from "../scraper/scrapeURL/engines";
configDotenv();
export async function startWebScraperPipeline({
job,
token
token,
}: {
job: Job<WebScraperOptions> & { id: string };
token: string;
@ -32,9 +32,9 @@ export async function startWebScraperPipeline({
...job.data.scrapeOptions,
...(job.data.crawl_id
? {
formats: job.data.scrapeOptions.formats.concat(["rawHtml"])
formats: job.data.scrapeOptions.formats.concat(["rawHtml"]),
}
: {})
: {}),
},
internalOptions: job.data.internalOptions,
// onSuccess: (result, mode) => {
@ -48,7 +48,7 @@ export async function startWebScraperPipeline({
team_id: job.data.team_id,
bull_job_id: job.id.toString(),
priority: job.opts.priority,
is_scrape: job.data.is_scrape ?? false
is_scrape: job.data.is_scrape ?? false,
});
}
@ -62,14 +62,14 @@ export async function runWebScraper({
team_id,
bull_job_id,
priority,
is_scrape = false
is_scrape = false,
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
let response: ScrapeUrlResponse | undefined = undefined;
let engines: EngineResultsTracker = {};
try {
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
priority,
...internalOptions
...internalOptions,
});
if (!response.success) {
if (response.error instanceof Error) {
@ -81,7 +81,7 @@ export async function runWebScraper({
? JSON.stringify(response.error)
: typeof response.error === "object"
? JSON.stringify({ ...response.error })
: response.error)
: response.error),
);
}
}
@ -94,7 +94,7 @@ export async function runWebScraper({
billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
);
// Optionally, you could notify an admin or add to a retry queue here
});
@ -117,14 +117,14 @@ export async function runWebScraper({
return {
...response,
success: false,
error
error,
};
} else {
return {
success: false,
error,
logs: ["no logs -- error coming from runWebScraper"],
engines
engines,
};
}
// onError(error);
@ -154,8 +154,8 @@ export async function runWebScraper({
: result.state === "timeout"
? "Timed out"
: undefined,
time_taken: result.finishedAt - result.startedAt
}
time_taken: result.finishedAt - result.startedAt,
},
});
}
}
@ -166,7 +166,7 @@ const saveJob = async (
result: any,
token: string,
mode: string,
engines?: EngineResultsTracker
engines?: EngineResultsTracker,
) => {
try {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";

View File

@ -4,7 +4,7 @@ import {
autoscalerController,
checkQueuesController,
cleanBefore24hCompleteJobsController,
queuesController
queuesController,
} from "../controllers/v0/admin/queue";
import { wrap } from "./v1";
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
@ -13,27 +13,27 @@ export const adminRouter = express.Router();
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/redis-health`,
redisHealthController
redisHealthController,
);
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
cleanBefore24hCompleteJobsController
cleanBefore24hCompleteJobsController,
);
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/check-queues`,
checkQueuesController
checkQueuesController,
);
adminRouter.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, queuesController);
adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
autoscalerController
autoscalerController,
);
adminRouter.post(
`/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
wrap(acucCacheClearController)
wrap(acucCacheClearController),
);

View File

@ -8,7 +8,7 @@ import {
ErrorResponse,
RequestWithACUC,
RequestWithAuth,
RequestWithMaybeAuth
RequestWithMaybeAuth,
} from "../controllers/v1/types";
import { RateLimiterMode } from "../types";
import { authenticateUser } from "../controllers/auth";
@ -33,7 +33,7 @@ import { extractController } from "../controllers/v1/extract";
// import { readinessController } from "../controllers/v1/readiness";
function checkCreditsMiddleware(
minimum?: number
minimum?: number,
): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
return (req, res, next) => {
(async () => {
@ -44,20 +44,20 @@ function checkCreditsMiddleware(
const { success, remainingCredits, chunk } = await checkTeamCredits(
req.acuc,
req.auth.team_id,
minimum ?? 1
minimum ?? 1,
);
if (chunk) {
req.acuc = chunk;
}
if (!success) {
logger.error(
`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`
`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`,
);
if (!res.headersSent) {
return res.status(402).json({
success: false,
error:
"Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value."
"Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value.",
});
}
}
@ -68,7 +68,7 @@ function checkCreditsMiddleware(
}
export function authMiddleware(
rateLimiterMode: RateLimiterMode
rateLimiterMode: RateLimiterMode,
): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
return (req, res, next) => {
(async () => {
@ -99,7 +99,7 @@ export function authMiddleware(
function idempotencyMiddleware(
req: Request,
res: Response,
next: NextFunction
next: NextFunction,
) {
(async () => {
if (req.headers["x-idempotency-key"]) {
@ -123,7 +123,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
return res.status(403).json({
success: false,
error:
"URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions."
"URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions.",
});
}
}
@ -131,7 +131,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
}
export function wrap(
controller: (req: Request, res: Response) => Promise<any>
controller: (req: Request, res: Response) => Promise<any>,
): (req: Request, res: Response, next: NextFunction) => any {
return (req, res, next) => {
controller(req, res).catch((err) => next(err));
@ -147,7 +147,7 @@ v1Router.post(
authMiddleware(RateLimiterMode.Scrape),
checkCreditsMiddleware(1),
blocklistMiddleware,
wrap(scrapeController)
wrap(scrapeController),
);
v1Router.post(
@ -156,7 +156,7 @@ v1Router.post(
checkCreditsMiddleware(),
blocklistMiddleware,
idempotencyMiddleware,
wrap(crawlController)
wrap(crawlController),
);
v1Router.post(
@ -165,7 +165,7 @@ v1Router.post(
checkCreditsMiddleware(),
blocklistMiddleware,
idempotencyMiddleware,
wrap(batchScrapeController)
wrap(batchScrapeController),
);
v1Router.post(
@ -173,20 +173,20 @@ v1Router.post(
authMiddleware(RateLimiterMode.Map),
checkCreditsMiddleware(1),
blocklistMiddleware,
wrap(mapController)
wrap(mapController),
);
v1Router.get(
"/crawl/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlStatusController)
wrap(crawlStatusController),
);
v1Router.get(
"/batch/scrape/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
// Yes, it uses the same controller as the normal crawl status controller
wrap((req: any, res): any => crawlStatusController(req, res, true))
wrap((req: any, res): any => crawlStatusController(req, res, true)),
);
v1Router.get("/scrape/:jobId", wrap(scrapeStatusController));
@ -194,7 +194,7 @@ v1Router.get("/scrape/:jobId", wrap(scrapeStatusController));
v1Router.get(
"/concurrency-check",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(concurrencyCheckController)
wrap(concurrencyCheckController),
);
v1Router.ws("/crawl/:jobId", crawlStatusWSController);
@ -203,7 +203,7 @@ v1Router.post(
"/extract",
authMiddleware(RateLimiterMode.Scrape),
checkCreditsMiddleware(1),
wrap(extractController)
wrap(extractController),
);
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
@ -211,7 +211,7 @@ v1Router.post(
v1Router.delete(
"/crawl/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
crawlCancelController
crawlCancelController,
);
// v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController);

View File

@ -18,20 +18,20 @@ async function sendCrawl(result: Result): Promise<string | undefined> {
{
url: url,
crawlerOptions: {
limit: 75
limit: 75,
},
pageOptions: {
includeHtml: true,
replaceAllPathsWithAbsolutePaths: true,
waitFor: 1000
}
waitFor: 1000,
},
},
{
headers: {
"Content-Type": "application/json",
Authorization: `Bearer `
}
}
Authorization: `Bearer `,
},
},
);
result.idempotency_key = idempotencyKey;
return response.data.jobId;
@ -51,9 +51,9 @@ async function getContent(result: Result): Promise<boolean> {
{
headers: {
"Content-Type": "application/json",
Authorization: `Bearer `
}
}
Authorization: `Bearer `,
},
},
);
if (response.data.status === "completed") {
result.result_data_jsonb = response.data.data;
@ -97,11 +97,11 @@ async function processResults(results: Result[]): Promise<void> {
// Save job id along with the start_url
const resultWithJobId = results.map((r) => ({
start_url: r.start_url,
job_id: r.job_id
job_id: r.job_id,
}));
await fs.writeFile(
"results_with_job_id_4000_6000.json",
JSON.stringify(resultWithJobId, null, 4)
JSON.stringify(resultWithJobId, null, 4),
);
} catch (error) {
console.error("Error writing to results_with_content.json:", error);

View File

@ -32,7 +32,7 @@ describe("WebCrawler", () => {
getMatchingLineNumber: jest.fn().mockReturnValue(0),
getCrawlDelay: jest.fn().mockReturnValue(0),
getSitemaps: jest.fn().mockReturnValue([]),
getPreferredHost: jest.fn().mockReturnValue("example.com")
getPreferredHost: jest.fn().mockReturnValue("example.com"),
});
});
@ -46,7 +46,7 @@ describe("WebCrawler", () => {
includes: [],
excludes: [],
limit: limit, // Apply the limit
maxCrawledDepth: 10
maxCrawledDepth: 10,
});
// Mock sitemap fetching function to return more links than the limit
@ -56,7 +56,7 @@ describe("WebCrawler", () => {
initialUrl,
initialUrl + "/page1",
initialUrl + "/page2",
initialUrl + "/page3"
initialUrl + "/page3",
]);
const filteredLinks = crawler["filterLinks"](
@ -64,10 +64,10 @@ describe("WebCrawler", () => {
initialUrl,
initialUrl + "/page1",
initialUrl + "/page2",
initialUrl + "/page3"
initialUrl + "/page3",
],
limit,
10
10,
);
expect(filteredLinks.length).toBe(limit); // Check if the number of results respects the limit

View File

@ -40,7 +40,7 @@ export class WebCrawler {
allowBackwardCrawling = false,
allowExternalContentLinks = false,
allowSubdomains = false,
ignoreRobotsTxt = false
ignoreRobotsTxt = false,
}: {
jobId: string;
initialUrl: string;
@ -79,7 +79,7 @@ export class WebCrawler {
sitemapLinks: string[],
limit: number,
maxDepth: number,
fromMap: boolean = false
fromMap: boolean = false,
): string[] {
// If the initial URL is a sitemap.xml, skip filtering
if (this.initialUrl.endsWith("sitemap.xml") && fromMap) {
@ -95,7 +95,7 @@ export class WebCrawler {
this.logger.debug(`Error processing link: ${link}`, {
link,
error,
method: "filterLinks"
method: "filterLinks",
});
return false;
}
@ -112,7 +112,7 @@ export class WebCrawler {
if (this.excludes.length > 0 && this.excludes[0] !== "") {
if (
this.excludes.some((excludePattern) =>
new RegExp(excludePattern).test(path)
new RegExp(excludePattern).test(path),
)
) {
return false;
@ -123,7 +123,7 @@ export class WebCrawler {
if (this.includes.length > 0 && this.includes[0] !== "") {
if (
!this.includes.some((includePattern) =>
new RegExp(includePattern).test(path)
new RegExp(includePattern).test(path),
)
) {
return false;
@ -140,7 +140,7 @@ export class WebCrawler {
}
const initialHostname = normalizedInitialUrl.hostname.replace(
/^www\./,
""
"",
);
const linkHostname = normalizedLink.hostname.replace(/^www\./, "");
@ -165,7 +165,7 @@ export class WebCrawler {
if (!isAllowed) {
this.logger.debug(`Link disallowed by robots.txt: ${link}`, {
method: "filterLinks",
link
link,
});
return false;
}
@ -183,12 +183,12 @@ export class WebCrawler {
let extraArgs = {};
if (skipTlsVerification) {
extraArgs["httpsAgent"] = new https.Agent({
rejectUnauthorized: false
rejectUnauthorized: false,
});
}
const response = await axios.get(this.robotsTxtUrl, {
timeout: axiosTimeout,
...extraArgs
...extraArgs,
});
return response.data;
}
@ -199,10 +199,10 @@ export class WebCrawler {
public async tryGetSitemap(
fromMap: boolean = false,
onlySitemap: boolean = false
onlySitemap: boolean = false,
): Promise<{ url: string; html: string }[] | null> {
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
method: "tryGetSitemap"
method: "tryGetSitemap",
});
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (fromMap && onlySitemap) {
@ -213,7 +213,7 @@ export class WebCrawler {
sitemapLinks,
this.limit,
this.maxCrawledDepth,
fromMap
fromMap,
);
return filteredLinks.map((link) => ({ url: link, html: "" }));
}
@ -303,7 +303,7 @@ export class WebCrawler {
private isRobotsAllowed(
url: string,
ignoreRobotsTxt: boolean = false
ignoreRobotsTxt: boolean = false,
): boolean {
return ignoreRobotsTxt
? true
@ -352,7 +352,7 @@ export class WebCrawler {
url
.split("/")
.slice(3)
.filter((subArray) => subArray.length > 0).length
.filter((subArray) => subArray.length > 0).length,
);
}
@ -373,7 +373,7 @@ export class WebCrawler {
private isSubdomain(link: string): boolean {
return new URL(link, this.baseUrl).hostname.endsWith(
"." + new URL(this.baseUrl).hostname.split(".").slice(-2).join(".")
"." + new URL(this.baseUrl).hostname.split(".").slice(-2).join("."),
);
}
@ -405,7 +405,7 @@ export class WebCrawler {
".ttf",
".woff2",
".webp",
".inc"
".inc",
];
try {
@ -414,7 +414,7 @@ export class WebCrawler {
} catch (error) {
this.logger.error(`Error processing URL in isFile`, {
method: "isFile",
error
error,
});
return false;
}
@ -431,7 +431,7 @@ export class WebCrawler {
"github.com",
"calendly.com",
"discord.gg",
"discord.com"
"discord.com",
];
return socialMediaOrEmail.some((ext) => url.includes(ext));
}
@ -457,14 +457,14 @@ export class WebCrawler {
} catch (error) {
this.logger.debug(
`Failed to fetch sitemap with axios from ${sitemapUrl}`,
{ method: "tryFetchSitemapLinks", sitemapUrl, error }
{ method: "tryFetchSitemapLinks", sitemapUrl, error },
);
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
const response = await getLinksFromSitemap(
{ sitemapUrl, mode: "fire-engine" },
this.logger
this.logger,
);
if (response) {
sitemapLinks = response;
@ -476,26 +476,26 @@ export class WebCrawler {
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
try {
const response = await axios.get(baseUrlSitemap, {
timeout: axiosTimeout
timeout: axiosTimeout,
});
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
this.logger
this.logger,
);
}
} catch (error) {
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
method: "tryFetchSitemapLinks",
sitemapUrl: baseUrlSitemap,
error
error,
});
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
sitemapLinks = await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
this.logger
this.logger,
);
}
}
@ -503,7 +503,7 @@ export class WebCrawler {
const normalizedUrl = normalizeUrl(url);
const normalizedSitemapLinks = sitemapLinks.map((link) =>
normalizeUrl(link)
normalizeUrl(link),
);
// has to be greater than 0 to avoid adding the initial URL to the sitemap links and preventing the crawler from crawling
if (

View File

@ -2,7 +2,7 @@ import { logger } from "../../../lib/logger";
export async function handleCustomScraping(
text: string,
url: string
url: string,
): Promise<{
scraper: string;
url: string;
@ -15,7 +15,7 @@ export async function handleCustomScraping(
!url.includes("developers.notion.com")
) {
logger.debug(
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`,
);
return {
scraper: "fire-engine",
@ -23,21 +23,21 @@ export async function handleCustomScraping(
waitAfterLoad: 1000,
pageOptions: {
scrollXPaths: [
'//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]'
]
}
'//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]',
],
},
};
}
// Check for Vanta security portals
if (text.includes('<link href="https://static.vanta.com')) {
logger.debug(
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`,
);
return {
scraper: "fire-engine",
url: url,
waitAfterLoad: 3000
waitAfterLoad: 3000,
};
}
@ -50,7 +50,7 @@ export async function handleCustomScraping(
logger.debug(`Google Drive PDF link detected: ${url}`);
const fileIdMatch = url.match(
/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/
/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/,
);
if (fileIdMatch) {
const fileId = fileIdMatch[1];
@ -58,7 +58,7 @@ export async function handleCustomScraping(
return {
scraper: "pdf",
url: pdfUrl
url: pdfUrl,
};
}
}

View File

@ -10,13 +10,13 @@ export async function getLinksFromSitemap(
{
sitemapUrl,
allUrls = [],
mode = "axios"
mode = "axios",
}: {
sitemapUrl: string;
allUrls?: string[];
mode?: "axios" | "fire-engine";
},
logger: Logger
logger: Logger,
): Promise<string[]> {
try {
let content: string = "";
@ -29,7 +29,7 @@ export async function getLinksFromSitemap(
"sitemap",
sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
);
if (!response.success) {
throw response.error;
@ -41,7 +41,7 @@ export async function getLinksFromSitemap(
method: "getLinksFromSitemap",
mode,
sitemapUrl,
error
error,
});
return allUrls;
@ -56,8 +56,8 @@ export async function getLinksFromSitemap(
.map((sitemap) =>
getLinksFromSitemap(
{ sitemapUrl: sitemap.loc[0], allUrls, mode },
logger
)
logger,
),
);
await Promise.all(sitemapPromises);
} else if (root && root.url) {
@ -66,7 +66,7 @@ export async function getLinksFromSitemap(
(url) =>
url.loc &&
url.loc.length > 0 &&
!WebCrawler.prototype.isFile(url.loc[0])
!WebCrawler.prototype.isFile(url.loc[0]),
)
.map((url) => url.loc[0]);
allUrls.push(...validUrls);
@ -76,7 +76,7 @@ export async function getLinksFromSitemap(
method: "getLinksFromSitemap",
mode,
sitemapUrl,
error
error,
});
}
@ -85,12 +85,12 @@ export async function getLinksFromSitemap(
export const fetchSitemapData = async (
url: string,
timeout?: number
timeout?: number,
): Promise<SitemapEntry[] | null> => {
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
try {
const response = await axios.get(sitemapUrl, {
timeout: timeout || axiosTimeout
timeout: timeout || axiosTimeout,
});
if (response.status === 200) {
const xml = response.data;

View File

@ -15,7 +15,7 @@ describe("Blocklist Functionality", () => {
"https://flickr.com/photos/johndoe",
"https://whatsapp.com/download",
"https://wechat.com/features",
"https://telegram.org/apps"
"https://telegram.org/apps",
])("should return true for blocklisted URL %s", (url) => {
expect(isUrlBlocked(url)).toBe(true);
});
@ -33,7 +33,7 @@ describe("Blocklist Functionality", () => {
"https://flickr.com/help/terms",
"https://whatsapp.com/legal",
"https://wechat.com/en/privacy-policy",
"https://telegram.org/tos"
"https://telegram.org/tos",
])("should return false for allowed URLs with keywords %s", (url) => {
expect(isUrlBlocked(url)).toBe(false);
});
@ -54,35 +54,35 @@ describe("Blocklist Functionality", () => {
"https://facebook.com.someotherdomain.com",
"https://www.facebook.com/profile",
"https://api.twitter.com/info",
"https://instagram.com/accounts/login"
"https://instagram.com/accounts/login",
])(
"should return true for URLs with blocklisted domains in subdomains or paths %s",
(url) => {
expect(isUrlBlocked(url)).toBe(true);
}
},
);
test.each([
"https://example.com/facebook.com",
"https://example.com/redirect?url=https://twitter.com",
"https://facebook.com.policy.example.com"
"https://facebook.com.policy.example.com",
])(
"should return false for URLs where blocklisted domain is part of another domain or path %s",
(url) => {
expect(isUrlBlocked(url)).toBe(false);
}
},
);
test.each(["https://FACEBOOK.com", "https://INSTAGRAM.com/@something"])(
"should handle case variations %s",
(url) => {
expect(isUrlBlocked(url)).toBe(true);
}
},
);
test.each([
"https://facebook.com?redirect=https://example.com",
"https://twitter.com?query=something"
"https://twitter.com?query=something",
])("should handle query parameters %s", (url) => {
expect(isUrlBlocked(url)).toBe(true);
});

View File

@ -18,7 +18,7 @@ const socialMediaBlocklist = [
"youtube.com",
"corterix.com",
"southwest.com",
"ryanair.com"
"ryanair.com",
];
const allowedKeywords = [
@ -41,7 +41,7 @@ const allowedKeywords = [
"://library.tiktok.com",
"://ads.tiktok.com",
"://tiktok.com/business",
"://developers.facebook.com"
"://developers.facebook.com",
];
export function isUrlBlocked(url: string): boolean {
@ -50,7 +50,7 @@ export function isUrlBlocked(url: string): boolean {
// Check if the URL contains any allowed keywords as whole words
if (
allowedKeywords.some((keyword) =>
new RegExp(`\\b${keyword}\\b`, "i").test(lowerCaseUrl)
new RegExp(`\\b${keyword}\\b`, "i").test(lowerCaseUrl),
)
) {
return false;
@ -68,7 +68,7 @@ export function isUrlBlocked(url: string): boolean {
const isBlocked = socialMediaBlocklist.some((domain) => {
const domainPattern = new RegExp(
`(^|\\.)${domain.replace(".", "\\.")}(\\.|$)`,
"i"
"i",
);
return domainPattern.test(hostname);
});

View File

@ -1,6 +1,6 @@
export function getAdjustedMaxDepth(
url: string,
maxCrawlDepth: number
maxCrawlDepth: number,
): number {
const baseURLDepth = getURLDepth(url);
const adjustedMaxDepth = maxCrawlDepth + baseURLDepth;

View File

@ -14,6 +14,6 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
url: entry.url,
html: entry.html,
statusCode: entry.statusCode,
error: entry.error
error: entry.error,
};
}

View File

@ -10,6 +10,6 @@ export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
url: response.url,
statusCode: response.status,
html: (await mammoth.convertToHtml({ path: tempFilePath })).value
html: (await mammoth.convertToHtml({ path: tempFilePath })).value,
};
}

View File

@ -4,33 +4,33 @@ import { TimeoutError } from "../../error";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
export async function scrapeURLWithFetch(
meta: Meta
meta: Meta,
): Promise<EngineScrapeResult> {
const timeout = 20000;
const response = await Promise.race([
fetch(meta.url, {
redirect: "follow",
headers: meta.options.headers
headers: meta.options.headers,
}),
(async () => {
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
throw new TimeoutError(
"Fetch was unable to scrape the page before timing out",
{ cause: { timeout } }
{ cause: { timeout } },
);
})()
})(),
]);
specialtyScrapeCheck(
meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
Object.fromEntries(response.headers as any)
Object.fromEntries(response.headers as any),
);
return {
url: response.url,
html: await response.text(),
statusCode: response.status
statusCode: response.status,
// TODO: error?
};
}

View File

@ -31,10 +31,10 @@ const successSchema = z.object({
actionContent: z
.object({
url: z.string(),
html: z.string()
html: z.string(),
})
.array()
.optional()
.optional(),
});
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
@ -47,16 +47,16 @@ const processingSchema = z.object({
"waiting",
"waiting-children",
"unknown",
"prioritized"
"prioritized",
]),
processing: z.boolean()
processing: z.boolean(),
});
const failedSchema = z.object({
jobId: z.string(),
state: z.literal("failed"),
processing: z.literal(false),
error: z.string()
error: z.string(),
});
export class StillProcessingError extends Error {
@ -67,7 +67,7 @@ export class StillProcessingError extends Error {
export async function fireEngineCheckStatus(
logger: Logger,
jobId: string
jobId: string,
): Promise<FireEngineCheckStatusSuccess> {
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
@ -75,8 +75,8 @@ export async function fireEngineCheckStatus(
{
name: "fire-engine: Check status",
attributes: {
jobId
}
jobId,
},
},
async (span) => {
return await robustFetch({
@ -87,12 +87,12 @@ export async function fireEngineCheckStatus(
...(Sentry.isInitialized()
? {
"sentry-trace": Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span)
}
: {})
baggage: Sentry.spanToBaggageHeader(span),
}
: {}),
},
});
}
},
);
const successParse = successSchema.safeParse(status);
@ -115,23 +115,23 @@ export async function fireEngineCheckStatus(
throw new EngineError("Scrape job failed", {
cause: {
status,
jobId
}
jobId,
},
});
}
} else {
logger.debug("Check status returned response not matched by any schema", {
status,
jobId
jobId,
});
throw new Error(
"Check status returned response not matched by any schema",
{
cause: {
status,
jobId
}
}
jobId,
},
},
);
}
}

View File

@ -10,8 +10,8 @@ export async function fireEngineDelete(logger: Logger, jobId: string) {
{
name: "fire-engine: Delete scrape",
attributes: {
jobId
}
jobId,
},
},
async (span) => {
await robustFetch({
@ -21,15 +21,15 @@ export async function fireEngineDelete(logger: Logger, jobId: string) {
...(Sentry.isInitialized()
? {
"sentry-trace": Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span)
baggage: Sentry.spanToBaggageHeader(span),
}
: {})
: {}),
},
ignoreResponse: true,
ignoreFailure: true,
logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId })
logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }),
});
}
},
);
// We do not care whether this fails or not.

View File

@ -5,13 +5,13 @@ import {
FireEngineScrapeRequestChromeCDP,
FireEngineScrapeRequestCommon,
FireEngineScrapeRequestPlaywright,
FireEngineScrapeRequestTLSClient
FireEngineScrapeRequestTLSClient,
} from "./scrape";
import { EngineScrapeResult } from "..";
import {
fireEngineCheckStatus,
FireEngineCheckStatusSuccess,
StillProcessingError
StillProcessingError,
} from "./checkStatus";
import { EngineError, SiteError, TimeoutError } from "../../error";
import * as Sentry from "@sentry/node";
@ -27,15 +27,15 @@ async function performFireEngineScrape<
Engine extends
| FireEngineScrapeRequestChromeCDP
| FireEngineScrapeRequestPlaywright
| FireEngineScrapeRequestTLSClient
| FireEngineScrapeRequestTLSClient,
>(
logger: Logger,
request: FireEngineScrapeRequestCommon & Engine,
timeout = defaultTimeout
timeout = defaultTimeout,
): Promise<FireEngineCheckStatusSuccess> {
const scrape = await fireEngineScrape(
logger.child({ method: "fireEngineScrape" }),
request
request,
);
const startTime = Date.now();
@ -47,25 +47,25 @@ async function performFireEngineScrape<
if (errors.length >= errorLimit) {
logger.error("Error limit hit.", { errors });
throw new Error("Error limit hit. See e.cause.errors for errors.", {
cause: { errors }
cause: { errors },
});
}
if (Date.now() - startTime > timeout) {
logger.info(
"Fire-engine was unable to scrape the page before timing out.",
{ errors, timeout }
{ errors, timeout },
);
throw new TimeoutError(
"Fire-engine was unable to scrape the page before timing out",
{ cause: { errors, timeout } }
{ cause: { errors, timeout } },
);
}
try {
status = await fireEngineCheckStatus(
logger.child({ method: "fireEngineCheckStatus" }),
scrape.jobId
scrape.jobId,
);
} catch (error) {
if (error instanceof StillProcessingError) {
@ -73,7 +73,7 @@ async function performFireEngineScrape<
} else if (error instanceof EngineError || error instanceof SiteError) {
logger.debug("Fire-engine scrape job failed.", {
error,
jobId: scrape.jobId
jobId: scrape.jobId,
});
throw error;
} else {
@ -81,7 +81,7 @@ async function performFireEngineScrape<
errors.push(error);
logger.debug(
`An unexpected error occurred while calling checkStatus. Error counter is now at ${errors.length}.`,
{ error, jobId: scrape.jobId }
{ error, jobId: scrape.jobId },
);
}
}
@ -93,7 +93,7 @@ async function performFireEngineScrape<
}
export async function scrapeURLWithFireEngineChromeCDP(
meta: Meta
meta: Meta,
): Promise<EngineScrapeResult> {
const actions: Action[] = [
// Transform waitFor option into an action (unsupported by chrome-cdp)
@ -101,8 +101,8 @@ export async function scrapeURLWithFireEngineChromeCDP(
? [
{
type: "wait" as const,
milliseconds: meta.options.waitFor
}
milliseconds: meta.options.waitFor,
},
]
: []),
@ -112,13 +112,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
? [
{
type: "screenshot" as const,
fullPage: meta.options.formats.includes("screenshot@fullPage")
}
fullPage: meta.options.formats.includes("screenshot@fullPage"),
},
]
: []),
// Include specified actions
...(meta.options.actions ?? [])
...(meta.options.actions ?? []),
];
const request: FireEngineScrapeRequestCommon &
@ -130,36 +130,36 @@ export async function scrapeURLWithFireEngineChromeCDP(
headers: meta.options.headers,
...(actions.length > 0
? {
actions
actions,
}
: {}),
priority: meta.internalOptions.priority,
geolocation: meta.options.geolocation,
mobile: meta.options.mobile,
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
// TODO: scrollXPaths
};
const totalWait = actions.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
0
0,
);
let response = await performFireEngineScrape(
meta.logger.child({
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
request
request,
}),
request,
meta.options.timeout !== undefined ? defaultTimeout + totalWait : Infinity // TODO: better timeout handling
meta.options.timeout !== undefined ? defaultTimeout + totalWait : Infinity, // TODO: better timeout handling
);
specialtyScrapeCheck(
meta.logger.child({
method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck"
method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck",
}),
response.responseHeaders
response.responseHeaders,
);
if (
@ -168,20 +168,20 @@ export async function scrapeURLWithFireEngineChromeCDP(
) {
meta.logger.debug(
"Transforming screenshots from actions into screenshot field",
{ screenshots: response.screenshots }
{ screenshots: response.screenshots },
);
response.screenshot = (response.screenshots ?? [])[0];
(response.screenshots ?? []).splice(0, 1);
meta.logger.debug("Screenshot transformation done", {
screenshots: response.screenshots,
screenshot: response.screenshot
screenshot: response.screenshot,
});
}
if (!response.url) {
meta.logger.warn("Fire-engine did not return the response's URL", {
response,
sourceURL: meta.url
sourceURL: meta.url,
});
}
@ -197,15 +197,15 @@ export async function scrapeURLWithFireEngineChromeCDP(
? {
actions: {
screenshots: response.screenshots ?? [],
scrapes: response.actionContent ?? []
scrapes: response.actionContent ?? [],
},
}
}
: {})
: {}),
};
}
export async function scrapeURLWithFireEnginePlaywright(
meta: Meta
meta: Meta,
): Promise<EngineScrapeResult> {
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestPlaywright = {
@ -220,31 +220,31 @@ export async function scrapeURLWithFireEnginePlaywright(
wait: meta.options.waitFor,
geolocation: meta.options.geolocation,
timeout: meta.options.timeout === undefined ? 300000 : undefined // TODO: better timeout logic
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
};
let response = await performFireEngineScrape(
meta.logger.child({
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
request
request,
}),
request,
meta.options.timeout !== undefined
? defaultTimeout + meta.options.waitFor
: Infinity // TODO: better timeout handling
: Infinity, // TODO: better timeout handling
);
specialtyScrapeCheck(
meta.logger.child({
method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck"
method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck",
}),
response.responseHeaders
response.responseHeaders,
);
if (!response.url) {
meta.logger.warn("Fire-engine did not return the response's URL", {
response,
sourceURL: meta.url
sourceURL: meta.url,
});
}
@ -257,14 +257,14 @@ export async function scrapeURLWithFireEnginePlaywright(
...(response.screenshots !== undefined && response.screenshots.length > 0
? {
screenshot: response.screenshots[0]
screenshot: response.screenshots[0],
}
: {})
: {}),
};
}
export async function scrapeURLWithFireEngineTLSClient(
meta: Meta
meta: Meta,
): Promise<EngineScrapeResult> {
const request: FireEngineScrapeRequestCommon &
FireEngineScrapeRequestTLSClient = {
@ -279,29 +279,29 @@ export async function scrapeURLWithFireEngineTLSClient(
geolocation: meta.options.geolocation,
disableJsDom: meta.internalOptions.v0DisableJsDom,
timeout: meta.options.timeout === undefined ? 300000 : undefined // TODO: better timeout logic
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
};
let response = await performFireEngineScrape(
meta.logger.child({
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
request
request,
}),
request,
meta.options.timeout !== undefined ? defaultTimeout : Infinity // TODO: better timeout handling
meta.options.timeout !== undefined ? defaultTimeout : Infinity, // TODO: better timeout handling
);
specialtyScrapeCheck(
meta.logger.child({
method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck"
method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck",
}),
response.responseHeaders
response.responseHeaders,
);
if (!response.url) {
meta.logger.warn("Fire-engine did not return the response's URL", {
response,
sourceURL: meta.url
sourceURL: meta.url,
});
}
@ -310,6 +310,6 @@ export async function scrapeURLWithFireEngineTLSClient(
html: response.content,
error: response.pageError,
statusCode: response.pageStatusCode
statusCode: response.pageStatusCode,
};
}

View File

@ -58,17 +58,17 @@ export type FireEngineScrapeRequestTLSClient = {
const schema = z.object({
jobId: z.string(),
processing: z.boolean()
processing: z.boolean(),
});
export async function fireEngineScrape<
Engine extends
| FireEngineScrapeRequestChromeCDP
| FireEngineScrapeRequestPlaywright
| FireEngineScrapeRequestTLSClient
| FireEngineScrapeRequestTLSClient,
>(
logger: Logger,
request: FireEngineScrapeRequestCommon & Engine
request: FireEngineScrapeRequestCommon & Engine,
): Promise<z.infer<typeof schema>> {
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
@ -78,8 +78,8 @@ export async function fireEngineScrape<
{
name: "fire-engine: Scrape",
attributes: {
url: request.url
}
url: request.url,
},
},
async (span) => {
return await robustFetch({
@ -89,16 +89,16 @@ export async function fireEngineScrape<
...(Sentry.isInitialized()
? {
"sentry-trace": Sentry.spanToTraceHeader(span),
baggage: Sentry.spanToBaggageHeader(span)
baggage: Sentry.spanToBaggageHeader(span),
}
: {})
: {}),
},
body: request,
logger: logger.child({ method: "fireEngineScrape/robustFetch" }),
schema,
tryCount: 3
tryCount: 3,
});
}
},
);
return scrapeRequest;

View File

@ -4,7 +4,7 @@ import { scrapeDOCX } from "./docx";
import {
scrapeURLWithFireEngineChromeCDP,
scrapeURLWithFireEnginePlaywright,
scrapeURLWithFireEngineTLSClient
scrapeURLWithFireEngineTLSClient,
} from "./fire-engine";
import { scrapePDF } from "./pdf";
import { scrapeURLWithScrapingBee } from "./scrapingbee";
@ -43,7 +43,7 @@ export const engines: Engine[] = [
? [
"fire-engine;chrome-cdp" as const,
"fire-engine;playwright" as const,
"fire-engine;tlsclient" as const
"fire-engine;tlsclient" as const,
]
: []),
...(useScrapingBee
@ -52,7 +52,7 @@ export const engines: Engine[] = [
...(usePlaywright ? ["playwright" as const] : []),
"fetch",
"pdf",
"docx"
"docx",
];
export const featureFlags = [
@ -66,7 +66,7 @@ export const featureFlags = [
"location",
"mobile",
"skipTlsVerification",
"useFastMode"
"useFastMode",
] as const;
export type FeatureFlag = (typeof featureFlags)[number];
@ -86,7 +86,7 @@ export const featureFlagOptions: {
useFastMode: { priority: 90 },
location: { priority: 10 },
mobile: { priority: 10 },
skipTlsVerification: { priority: 10 }
skipTlsVerification: { priority: 10 },
} as const;
export type EngineScrapeResult = {
@ -116,7 +116,7 @@ const engineHandlers: {
playwright: scrapeURLWithPlaywright,
fetch: scrapeURLWithFetch,
pdf: scrapePDF,
docx: scrapeDOCX
docx: scrapeDOCX,
};
export const engineOptions: {
@ -141,9 +141,9 @@ export const engineOptions: {
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: false
useFastMode: false,
},
quality: 1000 // cache should always be tried first
quality: 1000, // cache should always be tried first
},
"fire-engine;chrome-cdp": {
features: {
@ -157,9 +157,9 @@ export const engineOptions: {
location: true,
mobile: true,
skipTlsVerification: true,
useFastMode: false
useFastMode: false,
},
quality: 50
quality: 50,
},
"fire-engine;playwright": {
features: {
@ -173,9 +173,9 @@ export const engineOptions: {
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: false
useFastMode: false,
},
quality: 40
quality: 40,
},
scrapingbee: {
features: {
@ -189,9 +189,9 @@ export const engineOptions: {
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: false
useFastMode: false,
},
quality: 30
quality: 30,
},
scrapingbeeLoad: {
features: {
@ -205,9 +205,9 @@ export const engineOptions: {
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: false
useFastMode: false,
},
quality: 29
quality: 29,
},
playwright: {
features: {
@ -221,9 +221,9 @@ export const engineOptions: {
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: false
useFastMode: false,
},
quality: 20
quality: 20,
},
"fire-engine;tlsclient": {
features: {
@ -237,9 +237,9 @@ export const engineOptions: {
location: true,
mobile: false,
skipTlsVerification: false,
useFastMode: true
useFastMode: true,
},
quality: 10
quality: 10,
},
fetch: {
features: {
@ -253,9 +253,9 @@ export const engineOptions: {
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: true
useFastMode: true,
},
quality: 5
quality: 5,
},
pdf: {
features: {
@ -269,9 +269,9 @@ export const engineOptions: {
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: true
useFastMode: true,
},
quality: -10
quality: -10,
},
docx: {
features: {
@ -285,10 +285,10 @@ export const engineOptions: {
location: false,
mobile: false,
skipTlsVerification: false,
useFastMode: true
useFastMode: true,
},
quality: -10,
},
quality: -10
}
};
export function buildFallbackList(meta: Meta): {
@ -297,7 +297,7 @@ export function buildFallbackList(meta: Meta): {
}[] {
const prioritySum = [...meta.featureFlags].reduce(
(a, x) => a + featureFlagOptions[x].priority,
0
0,
);
const priorityThreshold = Math.floor(prioritySum / 2);
let selectedEngines: {
@ -315,13 +315,13 @@ export function buildFallbackList(meta: Meta): {
const supportedFlags = new Set([
...Object.entries(engineOptions[engine].features)
.filter(
([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true
([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true,
)
.map(([k, _]) => k)
.map(([k, _]) => k),
]);
const supportScore = [...supportedFlags].reduce(
(a, x) => a + featureFlagOptions[x].priority,
0
0,
);
const unsupportedFeatures = new Set([...meta.featureFlags]);
@ -338,7 +338,7 @@ export function buildFallbackList(meta: Meta): {
prioritySum,
priorityThreshold,
featureFlags: [...meta.featureFlags],
unsupportedFeatures
unsupportedFeatures,
});
} else {
meta.logger.debug(
@ -348,22 +348,22 @@ export function buildFallbackList(meta: Meta): {
prioritySum,
priorityThreshold,
featureFlags: [...meta.featureFlags],
unsupportedFeatures
}
unsupportedFeatures,
},
);
}
}
if (selectedEngines.some((x) => engineOptions[x.engine].quality > 0)) {
selectedEngines = selectedEngines.filter(
(x) => engineOptions[x.engine].quality > 0
(x) => engineOptions[x.engine].quality > 0,
);
}
selectedEngines.sort(
(a, b) =>
b.supportScore - a.supportScore ||
engineOptions[b.engine].quality - engineOptions[a.engine].quality
engineOptions[b.engine].quality - engineOptions[a.engine].quality,
);
return selectedEngines;
@ -371,16 +371,16 @@ export function buildFallbackList(meta: Meta): {
export async function scrapeURLWithEngine(
meta: Meta,
engine: Engine
engine: Engine,
): Promise<EngineScrapeResult> {
const fn = engineHandlers[engine];
const logger = meta.logger.child({
method: fn.name ?? "scrapeURLWithEngine",
engine
engine,
});
const _meta = {
...meta,
logger
logger,
};
return await fn(_meta);

View File

@ -14,10 +14,10 @@ type PDFProcessorResult = { html: string; markdown?: string };
async function scrapePDFWithLlamaParse(
meta: Meta,
tempFilePath: string
tempFilePath: string,
): Promise<PDFProcessorResult> {
meta.logger.debug("Processing PDF document with LlamaIndex", {
tempFilePath
tempFilePath,
});
const uploadForm = new FormData();
@ -28,7 +28,7 @@ async function scrapePDFWithLlamaParse(
name: tempFilePath,
stream() {
return createReadStream(
tempFilePath
tempFilePath,
) as unknown as ReadableStream<Uint8Array>;
},
arrayBuffer() {
@ -41,22 +41,22 @@ async function scrapePDFWithLlamaParse(
slice(start, end, contentType) {
throw Error("Unimplemented in mock Blob: slice");
},
type: "application/pdf"
type: "application/pdf",
} as Blob);
const upload = await robustFetch({
url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
method: "POST",
headers: {
Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`
Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
},
body: uploadForm,
logger: meta.logger.child({
method: "scrapePDFWithLlamaParse/upload/robustFetch"
method: "scrapePDFWithLlamaParse/upload/robustFetch",
}),
schema: z.object({
id: z.string()
})
id: z.string(),
}),
});
const jobId = upload.id;
@ -70,18 +70,18 @@ async function scrapePDFWithLlamaParse(
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
method: "GET",
headers: {
Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`
Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
},
logger: meta.logger.child({
method: "scrapePDFWithLlamaParse/result/robustFetch"
method: "scrapePDFWithLlamaParse/result/robustFetch",
}),
schema: z.object({
markdown: z.string()
})
markdown: z.string(),
}),
});
return {
markdown: result.markdown,
html: await marked.parse(result.markdown, { async: true })
html: await marked.parse(result.markdown, { async: true }),
};
} catch (e) {
if (e instanceof Error && e.message === "Request sent failure status") {
@ -93,7 +93,7 @@ async function scrapePDFWithLlamaParse(
throw new RemoveFeatureError(["pdf"]);
} else {
throw new Error("LlamaParse threw an error", {
cause: e.cause
cause: e.cause,
});
}
} else {
@ -109,7 +109,7 @@ async function scrapePDFWithLlamaParse(
async function scrapePDFWithParsePDF(
meta: Meta,
tempFilePath: string
tempFilePath: string,
): Promise<PDFProcessorResult> {
meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });
@ -118,7 +118,7 @@ async function scrapePDFWithParsePDF(
return {
markdown: escaped,
html: escaped
html: escaped,
};
}
@ -131,7 +131,7 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
statusCode: file.response.status,
html: content,
markdown: content
markdown: content,
};
}
@ -144,22 +144,22 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithLlamaParse"
})
method: "scrapePDF/scrapePDFWithLlamaParse",
}),
},
tempFilePath
tempFilePath,
);
} catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") {
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
error
error,
});
} else if (error instanceof RemoveFeatureError) {
throw error;
} else {
meta.logger.warn(
"LlamaParse failed to parse PDF -- falling back to parse-pdf",
{ error }
{ error },
);
Sentry.captureException(error);
}
@ -170,9 +170,11 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" })
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath
tempFilePath,
);
}
@ -183,6 +185,6 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
statusCode: response.status,
html: result.html,
markdown: result.markdown
markdown: result.markdown,
};
}
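
The PDF path above is a remote-first fallback: try the hosted parser, let RemoveFeatureError propagate so the engine loop can re-plan, and degrade to the local parser on anything else. A skeleton under those assumptions, with hostedParse/localParse as stand-ins for the real LlamaParse and parse-pdf calls:

class RemoveFeatureError extends Error {}

async function hostedParse(path: string): Promise<{ markdown: string; html: string }> {
  // stand-in for the LlamaParse upload/poll flow above
  throw new Error("upstream unavailable");
}

async function localParse(path: string): Promise<{ markdown: string; html: string }> {
  // stand-in for parse-pdf
  return { markdown: "fallback text", html: "<p>fallback text</p>" };
}

async function parsePdf(path: string): Promise<{ markdown: string; html: string }> {
  try {
    return await hostedParse(path);
  } catch (error) {
    if (error instanceof RemoveFeatureError) throw error; // re-plan upstream, don't fall back
    console.warn("hosted parser failed -- falling back to local parser", { error });
    return await localParse(path);
  }
}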

View File

@ -5,7 +5,7 @@ import { TimeoutError } from "../../error";
import { robustFetch } from "../../lib/fetch";
export async function scrapeURLWithPlaywright(
meta: Meta
meta: Meta,
): Promise<EngineScrapeResult> {
const timeout = 20000 + meta.options.waitFor;
@ -13,35 +13,35 @@ export async function scrapeURLWithPlaywright(
await robustFetch({
url: process.env.PLAYWRIGHT_MICROSERVICE_URL!,
headers: {
"Content-Type": "application/json"
"Content-Type": "application/json",
},
body: {
url: meta.url,
wait_after_load: meta.options.waitFor,
timeout,
headers: meta.options.headers
headers: meta.options.headers,
},
method: "POST",
logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
schema: z.object({
content: z.string(),
pageStatusCode: z.number(),
pageError: z.string().optional()
})
pageError: z.string().optional(),
}),
}),
(async () => {
await new Promise((resolve) => setTimeout(() => resolve(null), 20000));
throw new TimeoutError(
"Playwright was unable to scrape the page before timing out",
{ cause: { timeout } }
{ cause: { timeout } },
);
})()
})(),
]);
return {
url: meta.url, // TODO: improve redirect following
html: response.content,
statusCode: response.pageStatusCode,
error: response.pageError
error: response.pageError,
};
}
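
The Playwright engine races the microservice call against a sleeper via Promise.race. Note that timeout is computed as 20000 + waitFor while the sleeper waits a flat 20000 ms, so the reported deadline and the actual one can differ -- presumably deliberate headroom, but worth flagging. The pattern in isolation:

class TimeoutError extends Error {}

async function withTimeout<T>(work: Promise<T>, ms: number): Promise<T> {
  return await Promise.race([
    work,
    (async (): Promise<T> => {
      await new Promise((resolve) => setTimeout(() => resolve(null), ms));
      throw new TimeoutError(`timed out after ${ms}ms`, { cause: { timeout: ms } });
    })(),
  ]);
}

// const response = await withTimeout(fetch(url), 20_000);
// Note: the losing branch keeps running after the race settles; the timer also
// keeps the event loop alive unless unref'd -- acceptable here, as in the original.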

View File

@ -8,7 +8,7 @@ import { EngineError } from "../../error";
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
export function scrapeURLWithScrapingBee(
wait_browser: "domcontentloaded" | "networkidle2"
wait_browser: "domcontentloaded" | "networkidle2",
): (meta: Meta) => Promise<EngineScrapeResult> {
return async (meta: Meta): Promise<EngineScrapeResult> => {
let response: AxiosResponse<any>;
@ -23,12 +23,12 @@ export function scrapeURLWithScrapingBee(
json_response: true,
screenshot: meta.options.formats.includes("screenshot"),
screenshot_full_page: meta.options.formats.includes(
"screenshot@fullPage"
)
"screenshot@fullPage",
),
},
headers: {
"ScrapingService-Request": "TRUE" // this is sent to the page, not to ScrapingBee - mogery
}
"ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
},
});
} catch (error) {
if (error instanceof AxiosError && error.response !== undefined) {
@ -51,25 +51,25 @@ export function scrapeURLWithScrapingBee(
if (body.errors || body.body?.error || isHiddenEngineError) {
meta.logger.error("ScrapingBee threw an error", {
body: body.body?.error ?? body.errors ?? body.body ?? body
body: body.body?.error ?? body.errors ?? body.body ?? body,
});
throw new EngineError("Engine error #34", {
cause: { body, statusCode: response.status }
cause: { body, statusCode: response.status },
});
}
if (typeof body.body !== "string") {
meta.logger.error("ScrapingBee: Body is not string??", { body });
throw new EngineError("Engine error #35", {
cause: { body, statusCode: response.status }
cause: { body, statusCode: response.status },
});
}
specialtyScrapeCheck(
meta.logger.child({
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck"
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
}),
body.headers
body.headers,
);
return {
@ -80,9 +80,9 @@ export function scrapeURLWithScrapingBee(
statusCode: response.status,
...(body.screenshot
? {
screenshot: `data:image/png;base64,${body.screenshot}`
screenshot: `data:image/png;base64,${body.screenshot}`,
}
: {})
: {}),
};
};
}

View File

@ -13,13 +13,13 @@ export async function fetchFileToBuffer(url: string): Promise<{
const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
return {
response,
buffer: Buffer.from(await response.arrayBuffer())
buffer: Buffer.from(await response.arrayBuffer()),
};
}
export async function downloadFile(
id: string,
url: string
url: string,
): Promise<{
response: undici.Response;
tempFilePath: string;
@ -32,9 +32,9 @@ export async function downloadFile(
const response = await undici.fetch(url, {
dispatcher: new undici.Agent({
connect: {
rejectUnauthorized: false
}
})
rejectUnauthorized: false,
},
}),
});
// This should never happen in the current state of JS (2024), but let's check anyways.
@ -47,13 +47,13 @@ export async function downloadFile(
tempFileWrite.on("finish", () => resolve(null));
tempFileWrite.on("error", (error) => {
reject(
new EngineError("Failed to write to temp file", { cause: { error } })
new EngineError("Failed to write to temp file", { cause: { error } }),
);
});
});
return {
response,
tempFilePath
tempFilePath,
};
}
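
downloadFile resolves only once the write stream emits finish, so callers never see a half-written temp file. A compact sketch of the same guarantee using stream.pipeline instead of manual event wiring (the undici Agent that relaxes TLS verification in the diff is omitted):

import { createWriteStream } from "node:fs";
import { Readable } from "node:stream";
import { pipeline } from "node:stream/promises";

async function saveToFile(url: string, tempFilePath: string): Promise<void> {
  const response = await fetch(url);
  if (response.body === null) {
    throw new Error("Response body was null", { cause: { url } });
  }
  // pipeline resolves on finish and rejects if either side errors,
  // replacing the manual .on("finish") / .on("error") wiring above.
  await pipeline(
    Readable.fromWeb(response.body as any), // web stream -> Node stream (Node >= 18)
    createWriteStream(tempFilePath),
  );
}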

View File

@ -3,15 +3,15 @@ import { AddFeatureError } from "../../error";
export function specialtyScrapeCheck(
logger: Logger,
headers: Record<string, string> | undefined
headers: Record<string, string> | undefined,
) {
const contentType = (Object.entries(headers ?? {}).find(
(x) => x[0].toLowerCase() === "content-type"
(x) => x[0].toLowerCase() === "content-type",
) ?? [])[1];
if (contentType === undefined) {
logger.warn("Failed to check contentType -- was not present in headers", {
headers
headers,
});
} else if (
contentType === "application/pdf" ||
@ -23,7 +23,7 @@ export function specialtyScrapeCheck(
contentType ===
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||
contentType.startsWith(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document;"
"application/vnd.openxmlformats-officedocument.wordprocessingml.document;",
)
) {
// .docx
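
The header lookup above leans on a terse idiom: find the content-type entry case-insensitively and default to an empty tuple so indexing yields undefined instead of throwing. In isolation:

function getContentType(headers: Record<string, string> | undefined): string | undefined {
  return (Object.entries(headers ?? {}).find(
    ([key]) => key.toLowerCase() === "content-type",
  ) ?? [])[1];
}

// getContentType({ "Content-Type": "application/pdf" }) -> "application/pdf"
// getContentType(undefined)                              -> undefined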

View File

@ -19,7 +19,7 @@ export class NoEnginesLeftError extends Error {
constructor(fallbackList: Engine[], results: EngineResultsTracker) {
super(
"All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com."
"All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com.",
);
this.fallbackList = fallbackList;
this.results = results;
@ -40,7 +40,8 @@ export class RemoveFeatureError extends Error {
constructor(featureFlags: FeatureFlag[]) {
super(
"Incorrect feature flags have been discovered: " + featureFlags.join(", ")
"Incorrect feature flags have been discovered: " +
featureFlags.join(", "),
);
this.featureFlags = featureFlags;
}
@ -50,7 +51,7 @@ export class SiteError extends Error {
public code: string;
constructor(code: string) {
super(
"Specified URL is failing to load in the browser. Error code: " + code
"Specified URL is failing to load in the browser. Error code: " + code,
);
this.code = code;
}

View File

@ -8,7 +8,7 @@ import {
Engine,
EngineScrapeResult,
FeatureFlag,
scrapeURLWithEngine
scrapeURLWithEngine,
} from "./engines";
import { parseMarkdown } from "../../lib/html-to-markdown";
import {
@ -17,7 +17,7 @@ import {
NoEnginesLeftError,
RemoveFeatureError,
SiteError,
TimeoutError
TimeoutError,
} from "./error";
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
@ -50,7 +50,7 @@ export type Meta = {
function buildFeatureFlags(
url: string,
options: ScrapeOptions,
internalOptions: InternalOptions
internalOptions: InternalOptions,
): Set<FeatureFlag> {
const flags: Set<FeatureFlag> = new Set();
@ -112,7 +112,7 @@ function buildMetaObject(
id: string,
url: string,
options: ScrapeOptions,
internalOptions: InternalOptions
internalOptions: InternalOptions,
): Meta {
const specParams =
urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
@ -120,14 +120,14 @@ function buildMetaObject(
options = Object.assign(options, specParams.scrapeOptions);
internalOptions = Object.assign(
internalOptions,
specParams.internalOptions
specParams.internalOptions,
);
}
const _logger = logger.child({
module: "ScrapeURL",
scrapeId: id,
scrapeURL: url
scrapeURL: url,
});
const logs: any[] = [];
@ -138,7 +138,7 @@ function buildMetaObject(
internalOptions,
logger: _logger,
logs,
featureFlags: buildFeatureFlags(url, options, internalOptions)
featureFlags: buildFeatureFlags(url, options, internalOptions),
};
}
@ -229,7 +229,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
unsupportedFeatures,
startedAt,
finishedAt: Date.now()
finishedAt: Date.now(),
};
// NOTE: TODO: what to do when status code is bad is tough...
@ -237,35 +237,35 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
// should we just use all the fallbacks and pick the one with the longest text? - mogery
if (isLongEnough || !isGoodStatusCode) {
meta.logger.info("Scrape via " + engine + " deemed successful.", {
factors: { isLongEnough, isGoodStatusCode, hasNoPageError }
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
});
result = {
engine,
unsupportedFeatures,
result: engineResult as EngineScrapeResult & { markdown: string }
result: engineResult as EngineScrapeResult & { markdown: string },
};
break;
}
} catch (error) {
if (error instanceof EngineError) {
meta.logger.info("Engine " + engine + " could not scrape the page.", {
error
error,
});
results[engine] = {
state: "error",
error: safeguardCircularError(error),
unexpected: false,
startedAt,
finishedAt: Date.now()
finishedAt: Date.now(),
};
} else if (error instanceof TimeoutError) {
meta.logger.info("Engine " + engine + " timed out while scraping.", {
error
error,
});
results[engine] = {
state: "timeout",
startedAt,
finishedAt: Date.now()
finishedAt: Date.now(),
};
} else if (
error instanceof AddFeatureError ||
@ -278,7 +278,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
error: safeguardCircularError(error),
unexpected: true,
startedAt,
finishedAt: Date.now()
finishedAt: Date.now(),
};
error.results = results;
meta.logger.warn("LLM refusal encountered", { error });
@ -289,14 +289,14 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
Sentry.captureException(error);
meta.logger.info(
"An unexpected error happened while scraping with " + engine + ".",
{ error }
{ error },
);
results[engine] = {
state: "error",
error: safeguardCircularError(error),
unexpected: true,
startedAt,
finishedAt: Date.now()
finishedAt: Date.now(),
};
}
}
@ -305,7 +305,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
if (result === null) {
throw new NoEnginesLeftError(
fallbackList.map((x) => x.engine),
results
results,
);
}
@ -318,15 +318,15 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
sourceURL: meta.url,
url: result.result.url,
statusCode: result.result.statusCode,
error: result.result.error
}
error: result.result.error,
},
};
if (result.unsupportedFeatures.size > 0) {
const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`;
meta.logger.warn(warning, {
engine: result.engine,
unsupportedFeatures: result.unsupportedFeatures
unsupportedFeatures: result.unsupportedFeatures,
});
document.warning =
document.warning !== undefined
@ -340,7 +340,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
success: true,
document,
logs: meta.logs,
engines: results
engines: results,
};
}
@ -348,7 +348,7 @@ export async function scrapeURL(
id: string,
url: string,
options: ScrapeOptions,
internalOptions: InternalOptions = {}
internalOptions: InternalOptions = {},
): Promise<ScrapeUrlResponse> {
const meta = buildMetaObject(id, url, options, internalOptions);
try {
@ -363,10 +363,10 @@ export async function scrapeURL(
meta.logger.debug(
"More feature flags requested by scraper: adding " +
error.featureFlags.join(", "),
{ error, existingFlags: meta.featureFlags }
{ error, existingFlags: meta.featureFlags },
);
meta.featureFlags = new Set(
[...meta.featureFlags].concat(error.featureFlags)
[...meta.featureFlags].concat(error.featureFlags),
);
} else if (
error instanceof RemoveFeatureError &&
@ -375,12 +375,12 @@ export async function scrapeURL(
meta.logger.debug(
"Incorrect feature flags reported by scraper: removing " +
error.featureFlags.join(","),
{ error, existingFlags: meta.featureFlags }
{ error, existingFlags: meta.featureFlags },
);
meta.featureFlags = new Set(
[...meta.featureFlags].filter(
(x) => !error.featureFlags.includes(x)
)
(x) => !error.featureFlags.includes(x),
),
);
} else {
throw error;
@ -415,7 +415,7 @@ export async function scrapeURL(
success: false,
error,
logs: meta.logs,
engines: results
engines: results,
};
}
}
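
The interesting control flow in scrapeURL is the replanning loop: an engine can abort with AddFeatureError or RemoveFeatureError, and the whole pipeline re-runs with an adjusted flag set. A reduced sketch (the real remove branch carries an extra guard that these hunks elide):

class AddFeatureError extends Error {
  constructor(public featureFlags: string[]) {
    super("More feature flags requested: " + featureFlags.join(", "));
  }
}

class RemoveFeatureError extends Error {
  constructor(public featureFlags: string[]) {
    super("Incorrect feature flags discovered: " + featureFlags.join(", "));
  }
}

async function scrapeWithReplanning<T>(
  attempt: (flags: Set<string>) => Promise<T>,
  initialFlags: Set<string>,
): Promise<T> {
  let featureFlags = new Set(initialFlags);
  while (true) {
    try {
      return await attempt(featureFlags);
    } catch (error) {
      if (error instanceof AddFeatureError) {
        // retry with the requested flags added
        featureFlags = new Set([...featureFlags, ...error.featureFlags]);
      } else if (error instanceof RemoveFeatureError) {
        // retry with the offending flags removed
        featureFlags = new Set(
          [...featureFlags].filter((x) => !error.featureFlags.includes(x)),
        );
      } else {
        throw error;
      }
    }
  }
}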

View File

@ -27,7 +27,7 @@ export function extractLinks(html: string, baseUrl: string): string[] {
} catch (error) {
logger.error(
`Failed to construct URL for href: ${href} with base: ${baseUrl}`,
{ error }
{ error },
);
}
}

View File

@ -4,7 +4,7 @@ import { Meta } from "..";
export function extractMetadata(
meta: Meta,
html: string
html: string,
): Document["metadata"] {
let title: string | undefined = undefined;
let description: string | undefined = undefined;
@ -148,6 +148,6 @@ export function extractMetadata(
publishedTime,
articleTag,
articleSection,
...customMetadata
...customMetadata,
};
}

View File

@ -20,7 +20,7 @@ export type RobustFetchParams<Schema extends z.Schema<any>> = {
export async function robustFetch<
Schema extends z.Schema<any>,
Output = z.infer<Schema>
Output = z.infer<Schema>,
>({
url,
logger,
@ -32,7 +32,7 @@ export async function robustFetch<
ignoreFailure = false,
requestId = uuid(),
tryCount = 1,
tryCooldown
tryCooldown,
}: RobustFetchParams<Schema>): Promise<Output> {
const params = {
url,
@ -44,7 +44,7 @@ export async function robustFetch<
ignoreResponse,
ignoreFailure,
tryCount,
tryCooldown
tryCooldown,
};
let request: Response;
@ -56,20 +56,20 @@ export async function robustFetch<
? {}
: body !== undefined
? {
"Content-Type": "application/json"
"Content-Type": "application/json",
}
: {}),
...(headers !== undefined ? headers : {})
...(headers !== undefined ? headers : {}),
},
...(body instanceof FormData
? {
body
body,
}
: body !== undefined
? {
body: JSON.stringify(body)
body: JSON.stringify(body),
}
: {})
: {}),
});
} catch (error) {
if (!ignoreFailure) {
@ -77,12 +77,12 @@ export async function robustFetch<
if (tryCount > 1) {
logger.debug(
"Request failed, trying " + (tryCount - 1) + " more times",
{ params, error, requestId }
{ params, error, requestId },
);
return await robustFetch({
...params,
requestId,
tryCount: tryCount - 1
tryCount: tryCount - 1,
});
} else {
logger.debug("Request failed", { params, error, requestId });
@ -90,8 +90,8 @@ export async function robustFetch<
cause: {
params,
requestId,
error
}
error,
},
});
}
} else {
@ -106,39 +106,39 @@ export async function robustFetch<
const response = {
status: request.status,
headers: request.headers,
body: await request.text() // NOTE: can this throw an exception?
body: await request.text(), // NOTE: can this throw an exception?
};
if (request.status >= 300) {
if (tryCount > 1) {
logger.debug(
"Request sent failure status, trying " + (tryCount - 1) + " more times",
{ params, request, response, requestId }
{ params, request, response, requestId },
);
if (tryCooldown !== undefined) {
await new Promise((resolve) =>
setTimeout(() => resolve(null), tryCooldown)
setTimeout(() => resolve(null), tryCooldown),
);
}
return await robustFetch({
...params,
requestId,
tryCount: tryCount - 1
tryCount: tryCount - 1,
});
} else {
logger.debug("Request sent failure status", {
params,
request,
response,
requestId
requestId,
});
throw new Error("Request sent failure status", {
cause: {
params,
request,
response,
requestId
}
requestId,
},
});
}
}
@ -151,15 +151,15 @@ export async function robustFetch<
params,
request,
response,
requestId
requestId,
});
throw new Error("Request sent malformed JSON", {
cause: {
params,
request,
response,
requestId
}
requestId,
},
});
}
@ -174,7 +174,7 @@ export async function robustFetch<
response,
requestId,
error,
schema
schema,
});
throw new Error("Response does not match provided schema", {
cause: {
@ -183,8 +183,8 @@ export async function robustFetch<
response,
requestId,
error,
schema
}
schema,
},
});
} else {
logger.debug("Parsing response with provided schema failed", {
@ -193,7 +193,7 @@ export async function robustFetch<
response,
requestId,
error,
schema
schema,
});
throw new Error("Parsing response with provided schema failed", {
cause: {
@ -202,8 +202,8 @@ export async function robustFetch<
response,
requestId,
error,
schema
}
schema,
},
});
}
}
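
robustFetch layers bounded retries with an optional cooldown, distinct failure modes for bad statuses, malformed JSON, and schema mismatches, and zod validation of the happy path. A self-contained miniature of that shape -- not the helper's real surface, which also threads request IDs and loggers through every branch:

import { z } from "zod";

async function fetchValidated<S extends z.ZodTypeAny>(
  url: string,
  schema: S,
  tryCount = 3,
  tryCooldown = 500,
): Promise<z.infer<S>> {
  for (let attempt = tryCount; attempt >= 1; attempt--) {
    const response = await fetch(url);
    if (response.status >= 300) {
      if (attempt > 1) {
        // failure status: cool down, then retry, as in the hunks above
        await new Promise((resolve) => setTimeout(() => resolve(null), tryCooldown));
        continue;
      }
      throw new Error("Request sent failure status", {
        cause: { status: response.status },
      });
    }
    let body: unknown;
    try {
      body = JSON.parse(await response.text());
    } catch {
      throw new Error("Request sent malformed JSON");
    }
    return schema.parse(body); // throws a ZodError on schema mismatch
  }
  throw new Error("unreachable");
}

// const job = await fetchValidated("https://api.example.com/job/1", z.object({ id: z.string() }));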

View File

@ -47,14 +47,14 @@ const excludeNonMainTags = [
".widget",
"#widget",
".cookie",
"#cookie"
"#cookie",
];
const forceIncludeMainTags = ["#main"];
export const removeUnwantedElements = (
html: string,
scrapeOptions: ScrapeOptions
scrapeOptions: ScrapeOptions,
) => {
const soup = load(html);
@ -89,11 +89,11 @@ export const removeUnwantedElements = (
const attributes = element.attribs;
const tagNameMatches = regexPattern.test(element.name);
const attributesMatch = Object.keys(attributes).some((attr) =>
regexPattern.test(`${attr}="${attributes[attr]}"`)
regexPattern.test(`${attr}="${attributes[attr]}"`),
);
if (tag.startsWith("*.")) {
classMatch = Object.keys(attributes).some((attr) =>
regexPattern.test(`class="${attributes[attr]}"`)
regexPattern.test(`class="${attributes[attr]}"`),
);
}
return tagNameMatches || attributesMatch || classMatch;
@ -110,7 +110,7 @@ export const removeUnwantedElements = (
if (scrapeOptions.onlyMainContent) {
excludeNonMainTags.forEach((tag) => {
const elementsToRemove = soup(tag).filter(
forceIncludeMainTags.map((x) => ":not(:has(" + x + "))").join("")
forceIncludeMainTags.map((x) => ":not(:has(" + x + "))").join(""),
);
elementsToRemove.remove();
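
The onlyMainContent pass builds a compound :not(:has(...)) selector so boilerplate removal never deletes an element that still wraps a force-included tag like #main. Just that trick, sketched:

import { load } from "cheerio";

const excludeNonMainTags = [".cookie", ".widget", "#footer"]; // abridged list
const forceIncludeMainTags = ["#main"];

function stripNonMainContent(html: string): string {
  const soup = load(html);
  // ":not(:has(#main))" -- spare anything that still wraps the main content
  const guard = forceIncludeMainTags.map((x) => ":not(:has(" + x + "))").join("");
  for (const tag of excludeNonMainTags) {
    soup(tag).filter(guard).remove();
  }
  return soup.html();
}

// stripNonMainContent('<div class="widget"><div id="main">keep</div></div><div class="widget">drop</div>')
// keeps the first widget (it contains #main) and removes the second.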

View File

@ -42,10 +42,10 @@ export const urlSpecificParams: Record<string, UrlSpecificParams> = {
// },
"digikey.com": {
scrapeOptions: {},
internalOptions: { forceEngine: "fire-engine;tlsclient" }
internalOptions: { forceEngine: "fire-engine;tlsclient" },
},
"lorealparis.hu": {
scrapeOptions: {},
internalOptions: { forceEngine: "fire-engine;tlsclient" }
}
internalOptions: { forceEngine: "fire-engine;tlsclient" },
},
};

View File

@ -13,7 +13,7 @@ const testEngines: (Engine | undefined)[] = [
"fire-engine;tlsclient",
"scrapingbee",
"scrapingbeeLoad",
"fetch"
"fetch",
];
const testEnginesScreenshot: (Engine | undefined)[] = [
@ -21,7 +21,7 @@ const testEnginesScreenshot: (Engine | undefined)[] = [
"fire-engine;chrome-cdp",
"fire-engine;playwright",
"scrapingbee",
"scrapingbeeLoad"
"scrapingbeeLoad",
];
describe("Standalone scrapeURL tests", () => {
@ -31,7 +31,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-basic",
"https://www.roastmywebsite.ai/",
scrapeOptions.parse({}),
{ forceEngine }
{ forceEngine },
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -46,26 +46,26 @@ describe("Standalone scrapeURL tests", () => {
expect(out.document.metadata.error).toBeUndefined();
expect(out.document.metadata.title).toBe("Roast My Website");
expect(out.document.metadata.description).toBe(
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
);
expect(out.document.metadata.keywords).toBe(
"Roast My Website,Roast,Website,GitHub,Firecrawl"
"Roast My Website,Roast,Website,GitHub,Firecrawl",
);
expect(out.document.metadata.robots).toBe("follow, index");
expect(out.document.metadata.ogTitle).toBe("Roast My Website");
expect(out.document.metadata.ogDescription).toBe(
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
);
expect(out.document.metadata.ogUrl).toBe(
"https://www.roastmywebsite.ai"
"https://www.roastmywebsite.ai",
);
expect(out.document.metadata.ogImage).toBe(
"https://www.roastmywebsite.ai/og.png"
"https://www.roastmywebsite.ai/og.png",
);
expect(out.document.metadata.ogLocaleAlternate).toStrictEqual([]);
expect(out.document.metadata.ogSiteName).toBe("Roast My Website");
expect(out.document.metadata.sourceURL).toBe(
"https://www.roastmywebsite.ai/"
"https://www.roastmywebsite.ai/",
);
expect(out.document.metadata.statusCode).toBe(200);
}
@ -76,9 +76,9 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-formats-markdown-html",
"https://roastmywebsite.ai",
scrapeOptions.parse({
formats: ["markdown", "html"]
formats: ["markdown", "html"],
}),
{ forceEngine }
{ forceEngine },
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -100,9 +100,9 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-onlyMainContent-false",
"https://www.scrapethissite.com/",
scrapeOptions.parse({
onlyMainContent: false
onlyMainContent: false,
}),
{ forceEngine }
{ forceEngine },
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -123,9 +123,9 @@ describe("Standalone scrapeURL tests", () => {
"https://www.scrapethissite.com/",
scrapeOptions.parse({
onlyMainContent: false,
excludeTags: [".nav", "#footer", "strong"]
excludeTags: [".nav", "#footer", "strong"],
}),
{ forceEngine }
{ forceEngine },
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -145,7 +145,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-400",
"https://httpstat.us/400",
scrapeOptions.parse({}),
{ forceEngine }
{ forceEngine },
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -163,7 +163,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-401",
"https://httpstat.us/401",
scrapeOptions.parse({}),
{ forceEngine }
{ forceEngine },
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -181,7 +181,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-403",
"https://httpstat.us/403",
scrapeOptions.parse({}),
{ forceEngine }
{ forceEngine },
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -199,7 +199,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-404",
"https://httpstat.us/404",
scrapeOptions.parse({}),
{ forceEngine }
{ forceEngine },
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -217,7 +217,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-405",
"https://httpstat.us/405",
scrapeOptions.parse({}),
{ forceEngine }
{ forceEngine },
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -235,7 +235,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-500",
"https://httpstat.us/500",
scrapeOptions.parse({}),
{ forceEngine }
{ forceEngine },
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -253,7 +253,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-redirect",
"https://scrapethissite.com/",
scrapeOptions.parse({}),
{ forceEngine }
{ forceEngine },
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -264,10 +264,10 @@ describe("Standalone scrapeURL tests", () => {
expect(out.document.markdown).toContain("Explore Sandbox");
expect(out.document).toHaveProperty("metadata");
expect(out.document.metadata.sourceURL).toBe(
"https://scrapethissite.com/"
"https://scrapethissite.com/",
);
expect(out.document.metadata.url).toBe(
"https://www.scrapethissite.com/"
"https://www.scrapethissite.com/",
);
expect(out.document.metadata.statusCode).toBe(200);
expect(out.document.metadata.error).toBeUndefined();
@ -283,9 +283,9 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-screenshot",
"https://www.scrapethissite.com/",
scrapeOptions.parse({
formats: ["screenshot"]
formats: ["screenshot"],
}),
{ forceEngine }
{ forceEngine },
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -296,8 +296,8 @@ describe("Standalone scrapeURL tests", () => {
expect(typeof out.document.screenshot).toBe("string");
expect(
out.document.screenshot!.startsWith(
"https://service.firecrawl.dev/storage/v1/object/public/media/"
)
"https://service.firecrawl.dev/storage/v1/object/public/media/",
),
).toBe(true);
// TODO: attempt to fetch screenshot
expect(out.document).toHaveProperty("metadata");
@ -311,9 +311,9 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-screenshot-fullPage",
"https://www.scrapethissite.com/",
scrapeOptions.parse({
formats: ["screenshot@fullPage"]
formats: ["screenshot@fullPage"],
}),
{ forceEngine }
{ forceEngine },
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -324,8 +324,8 @@ describe("Standalone scrapeURL tests", () => {
expect(typeof out.document.screenshot).toBe("string");
expect(
out.document.screenshot!.startsWith(
"https://service.firecrawl.dev/storage/v1/object/public/media/"
)
"https://service.firecrawl.dev/storage/v1/object/public/media/",
),
).toBe(true);
// TODO: attempt to fetch screenshot
expect(out.document).toHaveProperty("metadata");
@ -333,14 +333,14 @@ describe("Standalone scrapeURL tests", () => {
expect(out.document.metadata.error).toBeUndefined();
}
}, 30000);
}
},
);
it("Scrape of a PDF file", async () => {
const out = await scrapeURL(
"test:scrape-pdf",
"https://arxiv.org/pdf/astro-ph/9301001.pdf",
scrapeOptions.parse({})
scrapeOptions.parse({}),
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -358,7 +358,7 @@ describe("Standalone scrapeURL tests", () => {
const out = await scrapeURL(
"test:scrape-docx",
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx",
scrapeOptions.parse({})
scrapeOptions.parse({}),
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -367,7 +367,7 @@ describe("Standalone scrapeURL tests", () => {
expect(out.document.warning).toBeUndefined();
expect(out.document).toHaveProperty("metadata");
expect(out.document.markdown).toContain(
"SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
"SERIES A PREFERRED STOCK PURCHASE AGREEMENT",
);
expect(out.document.metadata.statusCode).toBe(200);
expect(out.document.metadata.error).toBeUndefined();
@ -388,13 +388,13 @@ describe("Standalone scrapeURL tests", () => {
properties: {
company_mission: { type: "string" },
supports_sso: { type: "boolean" },
is_open_source: { type: "boolean" }
is_open_source: { type: "boolean" },
},
required: ["company_mission", "supports_sso", "is_open_source"],
additionalProperties: false
}
}
})
additionalProperties: false,
},
},
}),
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -423,13 +423,13 @@ describe("Standalone scrapeURL tests", () => {
properties: {
company_mission: { type: "string" },
supports_sso: { type: "boolean" },
is_open_source: { type: "boolean" }
is_open_source: { type: "boolean" },
},
required: ["company_mission", "supports_sso", "is_open_source"],
additionalProperties: false
}
}
})
additionalProperties: false,
},
},
}),
);
// expect(out.logs.length).toBeGreaterThan(0);
@ -460,7 +460,7 @@ describe("Standalone scrapeURL tests", () => {
message: value.message,
name: value.name,
cause: value.cause,
stack: value.stack
stack: value.stack,
};
} else {
return value;
@ -486,6 +486,6 @@ describe("Standalone scrapeURL tests", () => {
expect(out.document.metadata.statusCode).toBe(200);
}
},
30000
30000,
);
});

View File

@ -11,7 +11,7 @@ export function saveToCache(meta: Meta, document: Document): Document {
if (document.rawHtml === undefined) {
throw new Error(
"rawHtml is undefined -- this transformer is being called out of order"
"rawHtml is undefined -- this transformer is being called out of order",
);
}
@ -22,7 +22,7 @@ export function saveToCache(meta: Meta, document: Document): Document {
html: document.rawHtml!,
statusCode: document.metadata.statusCode!,
url: document.metadata.url ?? document.metadata.sourceURL!,
error: document.metadata.error ?? undefined
error: document.metadata.error ?? undefined,
};
saveEntryToCache(key, entry);

View File

@ -11,33 +11,33 @@ import { saveToCache } from "./cache";
export type Transformer = (
meta: Meta,
document: Document
document: Document,
) => Document | Promise<Document>;
export function deriveMetadataFromRawHTML(
meta: Meta,
document: Document
document: Document,
): Document {
if (document.rawHtml === undefined) {
throw new Error(
"rawHtml is undefined -- this transformer is being called out of order"
"rawHtml is undefined -- this transformer is being called out of order",
);
}
document.metadata = {
...extractMetadata(meta, document.rawHtml),
...document.metadata
...document.metadata,
};
return document;
}
export function deriveHTMLFromRawHTML(
meta: Meta,
document: Document
document: Document,
): Document {
if (document.rawHtml === undefined) {
throw new Error(
"rawHtml is undefined -- this transformer is being called out of order"
"rawHtml is undefined -- this transformer is being called out of order",
);
}
@ -47,11 +47,11 @@ export function deriveHTMLFromRawHTML(
export async function deriveMarkdownFromHTML(
_meta: Meta,
document: Document
document: Document,
): Promise<Document> {
if (document.html === undefined) {
throw new Error(
"html is undefined -- this transformer is being called out of order"
"html is undefined -- this transformer is being called out of order",
);
}
@ -64,7 +64,7 @@ export function deriveLinksFromHTML(meta: Meta, document: Document): Document {
if (meta.options.formats.includes("links")) {
if (document.html === undefined) {
throw new Error(
"html is undefined -- this transformer is being called out of order"
"html is undefined -- this transformer is being called out of order",
);
}
@ -76,7 +76,7 @@ export function deriveLinksFromHTML(meta: Meta, document: Document): Document {
export function coerceFieldsToFormats(
meta: Meta,
document: Document
document: Document,
): Document {
const formats = new Set(meta.options.formats);
@ -84,7 +84,7 @@ export function coerceFieldsToFormats(
delete document.markdown;
} else if (formats.has("markdown") && document.markdown === undefined) {
meta.logger.warn(
"Request had format: markdown, but there was no markdown field in the result."
"Request had format: markdown, but there was no markdown field in the result.",
);
}
@ -92,7 +92,7 @@ export function coerceFieldsToFormats(
delete document.rawHtml;
} else if (formats.has("rawHtml") && document.rawHtml === undefined) {
meta.logger.warn(
"Request had format: rawHtml, but there was no rawHtml field in the result."
"Request had format: rawHtml, but there was no rawHtml field in the result.",
);
}
@ -100,7 +100,7 @@ export function coerceFieldsToFormats(
delete document.html;
} else if (formats.has("html") && document.html === undefined) {
meta.logger.warn(
"Request had format: html, but there was no html field in the result."
"Request had format: html, but there was no html field in the result.",
);
}
@ -110,7 +110,7 @@ export function coerceFieldsToFormats(
document.screenshot !== undefined
) {
meta.logger.warn(
"Removed screenshot from Document because it wasn't in formats -- this is very wasteful and indicates a bug."
"Removed screenshot from Document because it wasn't in formats -- this is very wasteful and indicates a bug.",
);
delete document.screenshot;
} else if (
@ -118,29 +118,29 @@ export function coerceFieldsToFormats(
document.screenshot === undefined
) {
meta.logger.warn(
"Request had format: screenshot / screenshot@fullPage, but there was no screenshot field in the result."
"Request had format: screenshot / screenshot@fullPage, but there was no screenshot field in the result.",
);
}
if (!formats.has("links") && document.links !== undefined) {
meta.logger.warn(
"Removed links from Document because it wasn't in formats -- this is wasteful and indicates a bug."
"Removed links from Document because it wasn't in formats -- this is wasteful and indicates a bug.",
);
delete document.links;
} else if (formats.has("links") && document.links === undefined) {
meta.logger.warn(
"Request had format: links, but there was no links field in the result."
"Request had format: links, but there was no links field in the result.",
);
}
if (!formats.has("extract") && document.extract !== undefined) {
meta.logger.warn(
"Removed extract from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug."
"Removed extract from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.",
);
delete document.extract;
} else if (formats.has("extract") && document.extract === undefined) {
meta.logger.warn(
"Request had format: extract, but there was no extract field in the result."
"Request had format: extract, but there was no extract field in the result.",
);
}
@ -161,12 +161,12 @@ export const transformerStack: Transformer[] = [
uploadScreenshot,
performLLMExtract,
coerceFieldsToFormats,
removeBase64Images
removeBase64Images,
];
export async function executeTransformers(
meta: Meta,
document: Document
document: Document,
): Promise<Document> {
const executions: [string, number][] = [];
@ -174,8 +174,8 @@ export async function executeTransformers(
const _meta = {
...meta,
logger: meta.logger.child({
method: "executeTransformers/" + transformer.name
})
method: "executeTransformers/" + transformer.name,
}),
};
const start = Date.now();
document = await transformer(_meta, document);
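
executeTransformers is a fold of the document through the transformer stack, timing each step and giving it a child logger. The skeleton, minus the logging plumbing:

type Doc = { markdown?: string; html?: string; rawHtml?: string };
type Transformer = (doc: Doc) => Doc | Promise<Doc>;

async function runTransformers(doc: Doc, stack: Transformer[]): Promise<Doc> {
  const executions: [string, number][] = [];
  for (const transformer of stack) {
    const start = Date.now();
    doc = await transformer(doc); // each step sees the previous step's output
    executions.push([transformer.name || "anonymous", Date.now() - start]);
  }
  console.debug("transformer timings (ms):", executions);
  return doc;
}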

View File

@ -25,8 +25,8 @@ function normalizeSchema(x: any): any {
x["$defs"] = Object.fromEntries(
Object.entries(x["$defs"]).map(([name, schema]) => [
name,
normalizeSchema(schema)
])
normalizeSchema(schema),
]),
);
}
@ -50,15 +50,15 @@ function normalizeSchema(x: any): any {
return {
...x,
properties: Object.fromEntries(
Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)])
Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)]),
),
required: Object.keys(x.properties),
additionalProperties: false
additionalProperties: false,
};
} else if (x && x.type === "array") {
return {
...x,
items: normalizeSchema(x.items)
items: normalizeSchema(x.items),
};
} else {
return x;
@ -70,7 +70,7 @@ export async function generateOpenAICompletions(
options: ExtractOptions,
markdown?: string,
previousWarning?: string,
isExtractEndpoint?: boolean
isExtractEndpoint?: boolean,
): Promise<{ extract: any; numTokens: number; warning: string | undefined }> {
let extract: any;
let warning: string | undefined;
@ -125,19 +125,19 @@ export async function generateOpenAICompletions(
schema = {
type: "object",
properties: {
items: options.schema
items: options.schema,
},
required: ["items"],
additionalProperties: false
additionalProperties: false,
};
} else if (schema && typeof schema === "object" && !schema.type) {
schema = {
type: "object",
properties: Object.fromEntries(
Object.entries(schema).map(([key, value]) => [key, { type: value }])
Object.entries(schema).map(([key, value]) => [key, { type: value }]),
),
required: Object.keys(schema),
additionalProperties: false
additionalProperties: false,
};
}
@ -149,19 +149,19 @@ export async function generateOpenAICompletions(
messages: [
{
role: "system",
content: options.systemPrompt
content: options.systemPrompt,
},
{
role: "user",
content: [{ type: "text", text: markdown }]
content: [{ type: "text", text: markdown }],
},
{
role: "user",
content:
options.prompt !== undefined
? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}`
: "Transform the above content into structured JSON output."
}
: "Transform the above content into structured JSON output.",
},
],
response_format: options.schema
? {
@ -169,10 +169,10 @@ export async function generateOpenAICompletions(
json_schema: {
name: "websiteContent",
schema: schema,
strict: true
strict: true,
},
}
}
: { type: "json_object" }
: { type: "json_object" },
});
if (jsonCompletion.choices[0].message.refusal !== null) {
@ -187,16 +187,16 @@ export async function generateOpenAICompletions(
extract = JSON.parse(jsonCompletion.choices[0].message.content);
} else {
const extractData = JSON.parse(
jsonCompletion.choices[0].message.content
jsonCompletion.choices[0].message.content,
);
extract = options.schema ? extractData.data.extract : extractData;
}
} catch (e) {
logger.error("Failed to parse returned JSON, no schema specified.", {
error: e
error: e,
});
throw new LLMRefusalError(
"Failed to parse returned JSON. Please specify a schema in the extract object."
"Failed to parse returned JSON. Please specify a schema in the extract object.",
);
}
}
@ -215,16 +215,16 @@ export async function generateOpenAICompletions(
export async function performLLMExtract(
meta: Meta,
document: Document
document: Document,
): Promise<Document> {
if (meta.options.formats.includes("extract")) {
const { extract, warning } = await generateOpenAICompletions(
meta.logger.child({
method: "performLLMExtract/generateOpenAICompletions"
method: "performLLMExtract/generateOpenAICompletions",
}),
meta.options.extract!,
document.markdown,
document.warning
document.warning,
);
document.extract = extract;
document.warning = warning;
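
normalizeSchema exists because OpenAI's strict json_schema mode requires every object to list all keys as required with additionalProperties: false; bare non-object schemas are first wrapped under an items property. The recursion reduced ($defs handling omitted):

function normalize(schema: any): any {
  if (schema && schema.type === "object" && schema.properties) {
    return {
      ...schema,
      properties: Object.fromEntries(
        Object.entries(schema.properties).map(([k, v]) => [k, normalize(v)]),
      ),
      required: Object.keys(schema.properties), // strict mode: every key required
      additionalProperties: false,
    };
  } else if (schema && schema.type === "array") {
    return { ...schema, items: normalize(schema.items) };
  }
  return schema;
}

// A bare array schema, wrapped the way the diff does before normalizing:
const wrapped = normalize({
  type: "object",
  properties: { items: { type: "array", items: { type: "string" } } },
});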

View File

@ -7,7 +7,7 @@ export function removeBase64Images(meta: Meta, document: Document): Document {
if (meta.options.removeBase64Images && document.markdown !== undefined) {
document.markdown = document.markdown.replace(
regex,
"$1(<Base64-Image-Removed>)"
"$1(<Base64-Image-Removed>)",
);
}
return document;

View File

@ -23,8 +23,8 @@ export function uploadScreenshot(meta: Meta, document: Document): Document {
{
cacheControl: "3600",
upsert: false,
contentType: document.screenshot.split(":")[1].split(";")[0]
}
contentType: document.screenshot.split(":")[1].split(";")[0],
},
);
document.screenshot = `https://service.firecrawl.dev/storage/v1/object/public/media/${encodeURIComponent(fileName)}`;

View File

@ -15,7 +15,7 @@ export async function fireEngineMap(
location?: string;
numResults: number;
page?: number;
}
},
): Promise<SearchResult[]> {
try {
let data = JSON.stringify({
@ -25,12 +25,12 @@ export async function fireEngineMap(
location: options.location,
tbs: options.tbs,
numResults: options.numResults,
page: options.page ?? 1
page: options.page ?? 1,
});
if (!process.env.FIRE_ENGINE_BETA_URL) {
console.warn(
"(v1/map Beta) Results might differ from cloud offering currently."
"(v1/map Beta) Results might differ from cloud offering currently.",
);
return [];
}
@ -39,9 +39,9 @@ export async function fireEngineMap(
method: "POST",
headers: {
"Content-Type": "application/json",
"X-Disable-Cache": "true"
"X-Disable-Cache": "true",
},
body: data
body: data,
});
if (response.ok) {

View File

@ -11,7 +11,7 @@ const _useragent_list = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
];
function get_useragent(): string {
@ -27,14 +27,14 @@ async function _req(
proxies: any,
timeout: number,
tbs: string | undefined = undefined,
filter: string | undefined = undefined
filter: string | undefined = undefined,
) {
const params = {
q: term,
num: results, // Number of results to return
hl: lang,
gl: country,
start: start
start: start,
};
if (tbs) {
params["tbs"] = tbs;
@ -45,11 +45,11 @@ async function _req(
try {
const resp = await axios.get("https://www.google.com/search", {
headers: {
"User-Agent": get_useragent()
"User-Agent": get_useragent(),
},
params: params,
proxy: proxies,
timeout: timeout
timeout: timeout,
});
return resp;
} catch (error) {
@ -70,7 +70,7 @@ export async function googleSearch(
country = "us",
proxy = undefined as string | undefined,
sleep_interval = 0,
timeout = 5000
timeout = 5000,
): Promise<SearchResult[]> {
let proxies: any = null;
if (proxy) {
@ -98,7 +98,7 @@ export async function googleSearch(
proxies,
timeout,
tbs,
filter
filter,
);
const $ = cheerio.load(resp.data);
const result_block = $("div.g");
@ -117,7 +117,7 @@ export async function googleSearch(
const title = $(element).find("h3");
const ogImage = $(element).find("img").eq(1).attr("src");
const description_box = $(element).find(
"div[style='-webkit-line-clamp:2']"
"div[style='-webkit-line-clamp:2']",
);
const answerBox = $(element).find(".mod").text();
if (description_box) {
@ -129,7 +129,7 @@ export async function googleSearch(
}
});
await new Promise((resolve) =>
setTimeout(resolve, sleep_interval * 1000)
setTimeout(resolve, sleep_interval * 1000),
);
} catch (error) {
if (error.message === "Too many requests") {

View File

@ -16,7 +16,7 @@ export async function search({
location = undefined,
proxy = undefined,
sleep_interval = 0,
timeout = 5000
timeout = 5000,
}: {
query: string;
advanced?: boolean;
@ -38,7 +38,7 @@ export async function search({
filter,
lang,
country,
location
location,
});
}
if (process.env.SEARCHAPI_API_KEY) {
@ -48,7 +48,7 @@ export async function search({
filter,
lang,
country,
location
location,
});
}
return await googleSearch(
@ -61,7 +61,7 @@ export async function search({
country,
proxy,
sleep_interval,
timeout
timeout,
);
} catch (error) {
logger.error(`Error in search function: ${error}`);
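
search() picks a provider by configured API key and only scrapes Google directly as a last resort. The first branch's condition is cut off by the hunk, so SERPER_API_KEY below is an assumption based on the serper client later in this diff:

type Result = { url: string; title: string; description: string };

async function dispatchSearch(query: string): Promise<Result[]> {
  if (process.env.SERPER_API_KEY) return serperSearch(query); // assumed branch
  if (process.env.SEARCHAPI_API_KEY) return searchapiSearch(query);
  return googleScrapeSearch(query); // no keys: scrape the results page directly
}

// Stubs standing in for the real clients in this diff.
async function serperSearch(q: string): Promise<Result[]> { return []; }
async function searchapiSearch(q: string): Promise<Result[]> { return []; }
async function googleScrapeSearch(q: string): Promise<Result[]> { return []; }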

View File

@ -16,7 +16,7 @@ interface SearchOptions {
export async function searchapi_search(
q: string,
options: SearchOptions
options: SearchOptions,
): Promise<SearchResult[]> {
const params = {
q: q,
@ -25,7 +25,7 @@ export async function searchapi_search(
location: options.location,
num: options.num_results,
page: options.page ?? 1,
engine: process.env.SEARCHAPI_ENGINE || "google"
engine: process.env.SEARCHAPI_ENGINE || "google",
};
const url = `https://www.searchapi.io/api/v1/search`;
@ -35,9 +35,9 @@ export async function searchapi_search(
headers: {
Authorization: `Bearer ${process.env.SEARCHAPI_API_KEY}`,
"Content-Type": "application/json",
"X-SearchApi-Source": "Firecrawl"
"X-SearchApi-Source": "Firecrawl",
},
params: params
params: params,
});
if (response.status === 401) {
@ -50,7 +50,7 @@ export async function searchapi_search(
return data.organic_results.map((a: any) => ({
url: a.link,
title: a.title,
description: a.snippet
description: a.snippet,
}));
} else {
return [];

View File

@ -14,7 +14,7 @@ export async function serper_search(
location?: string;
num_results: number;
page?: number;
}
},
): Promise<SearchResult[]> {
let data = JSON.stringify({
q: q,
@ -23,7 +23,7 @@ export async function serper_search(
location: options.location,
tbs: options.tbs,
num: options.num_results,
page: options.page ?? 1
page: options.page ?? 1,
});
let config = {
@ -31,16 +31,16 @@ export async function serper_search(
url: "https://google.serper.dev/search",
headers: {
"X-API-KEY": process.env.SERPER_API_KEY,
"Content-Type": "application/json"
"Content-Type": "application/json",
},
data: data
data: data,
};
const response = await axios(config);
if (response && response.data && Array.isArray(response.data.organic)) {
return response.data.organic.map((a) => ({
url: a.link,
title: a.title,
description: a.snippet
description: a.snippet,
}));
} else {
return [];

View File

@ -17,15 +17,15 @@ export async function checkAlerts() {
const activeJobs = await scrapeQueue.getActiveCount();
if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) {
logger.warn(
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`,
);
sendSlackWebhook(
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`,
true
true,
);
} else {
logger.info(
`Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`
`Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`,
);
}
} catch (error) {
@ -39,11 +39,11 @@ export async function checkAlerts() {
if (waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
logger.warn(
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.`
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.`,
);
sendSlackWebhook(
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}. Scale up the number of workers with fly scale count worker=20`,
true
true,
);
}
};

View File

@ -4,18 +4,18 @@ import { logger } from "../../../src/lib/logger";
export async function sendSlackWebhook(
message: string,
alertEveryone: boolean = false,
webhookUrl: string = process.env.SLACK_WEBHOOK_URL ?? ""
webhookUrl: string = process.env.SLACK_WEBHOOK_URL ?? "",
) {
const messagePrefix = alertEveryone ? "<!channel> " : "";
const payload = {
text: `${messagePrefix} ${message}`
text: `${messagePrefix} ${message}`,
};
try {
const response = await axios.post(webhookUrl, payload, {
headers: {
"Content-Type": "application/json"
}
"Content-Type": "application/json",
},
});
logger.info("Webhook sent successfully:", response.data);
} catch (error) {

View File

@ -22,7 +22,7 @@ const AUTO_RECHARGE_COOLDOWN = 300; // 5 minutes in seconds
*/
export async function autoCharge(
chunk: AuthCreditUsageChunk,
autoRechargeThreshold: number
autoRechargeThreshold: number,
): Promise<{
success: boolean;
message: string;
@ -38,13 +38,13 @@ export async function autoCharge(
const cooldownValue = await getValue(cooldownKey);
if (cooldownValue) {
logger.info(
`Auto-recharge for team ${chunk.team_id} is in cooldown period`
`Auto-recharge for team ${chunk.team_id} is in cooldown period`,
);
return {
success: false,
message: "Auto-recharge is in cooldown period",
remainingCredits: chunk.remaining_credits,
chunk
chunk,
};
}
@ -53,7 +53,7 @@ export async function autoCharge(
[resource],
5000,
async (
signal
signal,
): Promise<{
success: boolean;
message: string;
@ -81,7 +81,7 @@ export async function autoCharge(
success: false,
message: "Error fetching customer data",
remainingCredits: chunk.remaining_credits,
chunk
chunk,
};
}
@ -90,7 +90,7 @@ export async function autoCharge(
// Attempt to create a payment intent
const paymentStatus = await createPaymentIntent(
chunk.team_id,
customer.stripe_customer_id
customer.stripe_customer_id,
);
// If payment is successful or requires further action, issue credits
@ -100,7 +100,7 @@ export async function autoCharge(
) {
issueCreditsSuccess = await issueCredits(
chunk.team_id,
AUTO_RECHARGE_CREDITS
AUTO_RECHARGE_CREDITS,
);
}
@ -109,7 +109,7 @@ export async function autoCharge(
team_id: chunk.team_id,
initial_payment_status: paymentStatus.return_status,
credits_issued: issueCreditsSuccess ? AUTO_RECHARGE_CREDITS : 0,
stripe_charge_id: paymentStatus.charge_id
stripe_charge_id: paymentStatus.charge_id,
});
// Send a notification if credits were successfully issued
@ -120,7 +120,7 @@ export async function autoCharge(
chunk.sub_current_period_start,
chunk.sub_current_period_end,
chunk,
true
true,
);
// Set cooldown period
@ -139,7 +139,7 @@ export async function autoCharge(
sendSlackWebhook(
`Auto-recharge: Team ${chunk.team_id}. ${AUTO_RECHARGE_CREDITS} credits added. Payment status: ${paymentStatus.return_status}.`,
false,
process.env.SLACK_ADMIN_WEBHOOK_URL
process.env.SLACK_ADMIN_WEBHOOK_URL,
).catch((error) => {
logger.debug(`Error sending slack notification: ${error}`);
});
@ -156,8 +156,8 @@ export async function autoCharge(
chunk: {
...chunk,
remaining_credits:
chunk.remaining_credits + AUTO_RECHARGE_CREDITS
}
chunk.remaining_credits + AUTO_RECHARGE_CREDITS,
},
};
} else {
logger.error("No Stripe customer ID found for user");
@ -165,7 +165,7 @@ export async function autoCharge(
success: false,
message: "No Stripe customer ID found for user",
remainingCredits: chunk.remaining_credits,
chunk
chunk,
};
}
} else {
@ -174,7 +174,7 @@ export async function autoCharge(
success: false,
message: "No sub_user_id found in chunk",
remainingCredits: chunk.remaining_credits,
chunk
chunk,
};
}
}
@ -182,9 +182,9 @@ export async function autoCharge(
success: false,
message: "No need to auto-recharge",
remainingCredits: chunk.remaining_credits,
chunk
chunk,
};
}
},
);
} catch (error) {
logger.error(`Failed to acquire lock for auto-recharge: ${error}`);
@ -192,7 +192,7 @@ export async function autoCharge(
success: false,
message: "Failed to acquire lock for auto-recharge",
remainingCredits: chunk.remaining_credits,
chunk
chunk,
};
}
}
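
autoCharge guards the charge twice: a lock on the team resource and a Redis cooldown key, so a burst of low-credit requests cannot double-charge. The cooldown half in isolation -- getValue/setValue are in-memory stand-ins, and the diff shows the get while eliding the set:

const AUTO_RECHARGE_COOLDOWN = 300; // seconds, as in the diff

// In-memory stand-ins for the Redis helpers.
const store = new Map<string, { value: string; expiresAt: number }>();
async function getValue(key: string): Promise<string | null> {
  const hit = store.get(key);
  return hit !== undefined && hit.expiresAt > Date.now() ? hit.value : null;
}
async function setValue(key: string, value: string, ttlSeconds: number): Promise<void> {
  store.set(key, { value, expiresAt: Date.now() + ttlSeconds * 1000 });
}

async function maybeAutoCharge(teamId: string, charge: () => Promise<void>) {
  const cooldownKey = `auto-recharge-cooldown:${teamId}`;
  if (await getValue(cooldownKey)) {
    return { success: false, message: "Auto-recharge is in cooldown period" };
  }
  await charge();
  await setValue(cooldownKey, "1", AUTO_RECHARGE_COOLDOWN); // open the cooldown window
  return { success: true, message: "Auto-recharge triggered" };
}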

View File

@ -19,18 +19,18 @@ const FREE_CREDITS = 500;
export async function billTeam(
team_id: string,
subscription_id: string | null | undefined,
credits: number
credits: number,
) {
return withAuth(supaBillTeam, { success: true, message: "No DB, bypassed." })(
team_id,
subscription_id,
credits
credits,
);
}
export async function supaBillTeam(
team_id: string,
subscription_id: string | null | undefined,
credits: number
credits: number,
) {
if (team_id === "preview") {
return { success: true, message: "Preview team, no credits used" };
@ -41,7 +41,7 @@ export async function supaBillTeam(
_team_id: team_id,
sub_id: subscription_id ?? null,
fetch_subscription: subscription_id === undefined,
credits
credits,
});
if (error) {
@ -58,9 +58,9 @@ export async function supaBillTeam(
...acuc,
credits_used: acuc.credits_used + credits,
adjusted_credits_used: acuc.adjusted_credits_used + credits,
remaining_credits: acuc.remaining_credits - credits
remaining_credits: acuc.remaining_credits - credits,
}
: null
: null,
);
}
})();
@ -76,12 +76,12 @@ export type CheckTeamCreditsResponse = {
export async function checkTeamCredits(
chunk: AuthCreditUsageChunk | null,
team_id: string,
credits: number
credits: number,
): Promise<CheckTeamCreditsResponse> {
return withAuth(supaCheckTeamCredits, {
success: true,
message: "No DB, bypassed",
remainingCredits: Infinity
remainingCredits: Infinity,
})(chunk, team_id, credits);
}
@ -89,14 +89,14 @@ export async function checkTeamCredits(
export async function supaCheckTeamCredits(
chunk: AuthCreditUsageChunk | null,
team_id: string,
credits: number
credits: number,
): Promise<CheckTeamCreditsResponse> {
// WARNING: chunk will be null if team_id is preview -- do not perform operations on it under ANY circumstances - mogery
if (team_id === "preview") {
return {
success: true,
message: "Preview team, no credits used",
remainingCredits: Infinity
remainingCredits: Infinity,
};
} else if (chunk === null) {
throw new Error("NULL ACUC passed to supaCheckTeamCredits");
@ -141,7 +141,7 @@ export async function supaCheckTeamCredits(
success: true,
message: autoChargeResult.message,
remainingCredits: autoChargeResult.remainingCredits,
chunk: autoChargeResult.chunk
chunk: autoChargeResult.chunk,
};
}
}
@ -155,7 +155,7 @@ export async function supaCheckTeamCredits(
NotificationType.LIMIT_REACHED,
chunk.sub_current_period_start,
chunk.sub_current_period_end,
chunk
chunk,
);
}
return {
@ -163,7 +163,7 @@ export async function supaCheckTeamCredits(
message:
"Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.",
remainingCredits: chunk.remaining_credits,
chunk
chunk,
};
} else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
// Send email notification for approaching credit limit
@ -172,7 +172,7 @@ export async function supaCheckTeamCredits(
NotificationType.APPROACHING_LIMIT,
chunk.sub_current_period_start,
chunk.sub_current_period_end,
chunk
chunk,
);
}
@ -180,13 +180,13 @@ export async function supaCheckTeamCredits(
success: true,
message: "Sufficient credits available",
remainingCredits: chunk.remaining_credits,
chunk
chunk,
};
}
// Count the total credits used by a team within the current billing period and return the remaining credits.
export async function countCreditsAndRemainingForCurrentBillingPeriod(
team_id: string
team_id: string,
) {
// 1. Retrieve the team's active subscription based on the team_id.
const { data: subscription, error: subscriptionError } =
@ -206,7 +206,7 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod(
if (coupons && coupons.length > 0) {
couponCredits = coupons.reduce(
(total, coupon) => total + coupon.credits,
0
0,
);
}
@ -221,20 +221,20 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod(
if (creditUsageError || !creditUsages) {
throw new Error(
`Failed to retrieve credit usage for team_id: ${team_id}`
`Failed to retrieve credit usage for team_id: ${team_id}`,
);
}
const totalCreditsUsed = creditUsages.reduce(
(acc, usage) => acc + usage.credits_used,
0
0,
);
const remainingCredits = FREE_CREDITS + couponCredits - totalCreditsUsed;
return {
totalCreditsUsed: totalCreditsUsed,
remainingCredits,
totalCredits: FREE_CREDITS + couponCredits
totalCredits: FREE_CREDITS + couponCredits,
};
}
@ -247,13 +247,13 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod(
if (creditUsageError || !creditUsages) {
throw new Error(
`Failed to retrieve credit usage for subscription_id: ${subscription.id}`
`Failed to retrieve credit usage for subscription_id: ${subscription.id}`,
);
}
const totalCreditsUsed = creditUsages.reduce(
(acc, usage) => acc + usage.credits_used,
0
0,
);
const { data: price, error: priceError } = await supabase_service
@ -264,7 +264,7 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod(
if (priceError || !price) {
throw new Error(
`Failed to retrieve price for price_id: ${subscription.price_id}`
`Failed to retrieve price for price_id: ${subscription.price_id}`,
);
}
@ -273,6 +273,6 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod(
return {
totalCreditsUsed,
remainingCredits,
totalCredits: price.credits
totalCredits: price.credits,
};
}
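
For teams without a subscription, the remaining-credit arithmetic above is the free allowance plus coupon credits minus usage this period. Worked with small numbers:

const FREE_CREDITS = 500;
const coupons = [{ credits: 100 }];
const creditUsages = [{ credits_used: 120 }, { credits_used: 30 }];

const couponCredits = coupons.reduce((total, coupon) => total + coupon.credits, 0);
const totalCreditsUsed = creditUsages.reduce((acc, usage) => acc + usage.credits_used, 0);
const remainingCredits = FREE_CREDITS + couponCredits - totalCreditsUsed;

console.log({ totalCreditsUsed, remainingCredits }); // { totalCreditsUsed: 150, remainingCredits: 450 }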

Some files were not shown because too many files have changed in this diff.