From 00373228fa1147e96e718c312d39c825be98e13c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 4 May 2024 11:53:16 -0700 Subject: [PATCH 01/17] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 315 +++++++++-------------- 1 file changed, 121 insertions(+), 194 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index fef5f691..ebd96d08 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -67,211 +67,138 @@ export class WebScraperDataProvider { useCaching: boolean = false, inProgress?: (progress: Progress) => void ): Promise { - + this.validateInitialUrl(); + + if (!useCaching) { + return this.processDocumentsWithoutCache(inProgress); + } + + return this.processDocumentsWithCache(inProgress); + } + + private validateInitialUrl(): void { if (this.urls[0].trim() === "") { throw new Error("Url is required"); } + } - if (!useCaching) { - if (this.mode === "crawl") { - const crawler = new WebCrawler({ - initialUrl: this.urls[0], - includes: this.includes, - excludes: this.excludes, - maxCrawledLinks: this.maxCrawledLinks, - limit: this.limit, - generateImgAltText: this.generateImgAltText, - }); - let links = await crawler.start(inProgress, 5, this.limit); - if (this.returnOnlyUrls) { - inProgress({ - current: links.length, - total: links.length, - status: "COMPLETED", - currentDocumentUrl: this.urls[0], - }); - return links.map((url) => ({ - content: "", - markdown: "", - metadata: { sourceURL: url }, - })); - } + private async processDocumentsWithoutCache(inProgress?: (progress: Progress) => void): Promise { + switch (this.mode) { + case "crawl": + return this.handleCrawlMode(inProgress); + case "single_urls": + return this.handleSingleUrlsMode(inProgress); + case "sitemap": + return this.handleSitemapMode(inProgress); + default: + return []; + } + } - let pdfLinks = links.filter((link) => link.endsWith(".pdf")); - let pdfDocuments: Document[] = []; - for (let pdfLink of pdfLinks) { - const pdfContent = await fetchAndProcessPdf(pdfLink); - pdfDocuments.push({ - content: pdfContent, - metadata: { sourceURL: pdfLink }, - provider: "web-scraper" - }); - } - links = links.filter((link) => !link.endsWith(".pdf")); - - let documents = await this.convertUrlsToDocuments(links, inProgress); - documents = await this.getSitemapData(this.urls[0], documents); - - if (this.replaceAllPathsWithAbsolutePaths) { - documents = replacePathsWithAbsolutePaths(documents); - } else { - documents = replaceImgPathsWithAbsolutePaths(documents); - } - - if (this.generateImgAltText) { - documents = await this.generatesImgAltText(documents); - } - documents = documents.concat(pdfDocuments); - - // CACHING DOCUMENTS - // - parent document - const cachedParentDocumentString = await getValue( - "web-scraper-cache:" + this.normalizeUrl(this.urls[0]) - ); - if (cachedParentDocumentString != null) { - let cachedParentDocument = JSON.parse(cachedParentDocumentString); - if ( - !cachedParentDocument.childrenLinks || - cachedParentDocument.childrenLinks.length < links.length - 1 - ) { - cachedParentDocument.childrenLinks = links.filter( - (link) => link !== this.urls[0] - ); - await setValue( - "web-scraper-cache:" + this.normalizeUrl(this.urls[0]), - JSON.stringify(cachedParentDocument), - 60 * 60 * 24 * 10 - ); // 10 days - } - } else { - let parentDocument = documents.filter( - (document) => - this.normalizeUrl(document.metadata.sourceURL) === - this.normalizeUrl(this.urls[0]) - ); - await 
this.setCachedDocuments(parentDocument, links); - } - - await this.setCachedDocuments( - documents.filter( - (document) => - this.normalizeUrl(document.metadata.sourceURL) !== - this.normalizeUrl(this.urls[0]) - ), - [] - ); - documents = this.removeChildLinks(documents); - documents = documents.splice(0, this.limit); - return documents; - } - - if (this.mode === "single_urls") { - let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf")); - let pdfDocuments: Document[] = []; - for (let pdfLink of pdfLinks) { - const pdfContent = await fetchAndProcessPdf(pdfLink); - pdfDocuments.push({ - content: pdfContent, - metadata: { sourceURL: pdfLink }, - provider: "web-scraper" - }); - } - - let documents = await this.convertUrlsToDocuments( - this.urls.filter((link) => !link.endsWith(".pdf")), - inProgress - ); - - if (this.replaceAllPathsWithAbsolutePaths) { - documents = replacePathsWithAbsolutePaths(documents); - } else { - documents = replaceImgPathsWithAbsolutePaths(documents); - } - - if (this.generateImgAltText) { - documents = await this.generatesImgAltText(documents); - } - const baseUrl = new URL(this.urls[0]).origin; - documents = await this.getSitemapData(baseUrl, documents); - documents = documents.concat(pdfDocuments); - - if(this.extractorOptions.mode === "llm-extraction") { - documents = await generateCompletions( - documents, - this.extractorOptions - ) - } - - await this.setCachedDocuments(documents); - documents = this.removeChildLinks(documents); - documents = documents.splice(0, this.limit); - return documents; - } - if (this.mode === "sitemap") { - let links = await getLinksFromSitemap(this.urls[0]); - let pdfLinks = links.filter((link) => link.endsWith(".pdf")); - let pdfDocuments: Document[] = []; - for (let pdfLink of pdfLinks) { - const pdfContent = await fetchAndProcessPdf(pdfLink); - pdfDocuments.push({ - content: pdfContent, - metadata: { sourceURL: pdfLink }, - provider: "web-scraper" - }); - } - links = links.filter((link) => !link.endsWith(".pdf")); - - let documents = await this.convertUrlsToDocuments( - links.slice(0, this.limit), - inProgress - ); - - documents = await this.getSitemapData(this.urls[0], documents); - - if (this.replaceAllPathsWithAbsolutePaths) { - documents = replacePathsWithAbsolutePaths(documents); - } else { - documents = replaceImgPathsWithAbsolutePaths(documents); - } - - if (this.generateImgAltText) { - documents = await this.generatesImgAltText(documents); - } - documents = documents.concat(pdfDocuments); - - await this.setCachedDocuments(documents); - documents = this.removeChildLinks(documents); - documents = documents.splice(0, this.limit); - return documents; - } - - return []; + private async handleCrawlMode(inProgress?: (progress: Progress) => void): Promise { + const crawler = new WebCrawler({ + initialUrl: this.urls[0], + includes: this.includes, + excludes: this.excludes, + maxCrawledLinks: this.maxCrawledLinks, + limit: this.limit, + generateImgAltText: this.generateImgAltText, + }); + let links = await crawler.start(inProgress, 5, this.limit); + if (this.returnOnlyUrls) { + return this.returnOnlyUrlsResponse(links, inProgress); } - let documents = await this.getCachedDocuments( - this.urls.slice(0, this.limit) - ); + let documents = await this.processLinks(links, inProgress); + return this.cacheAndFinalizeDocuments(documents, links); + } + + private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise { + let documents = await this.convertUrlsToDocuments(this.urls, inProgress); + documents = 
await this.applyPathReplacements(documents); + documents = await this.applyImgAltText(documents); + return documents; + } + + private async handleSitemapMode(inProgress?: (progress: Progress) => void): Promise { + let links = await getLinksFromSitemap(this.urls[0]); + if (this.returnOnlyUrls) { + return this.returnOnlyUrlsResponse(links, inProgress); + } + + let documents = await this.processLinks(links, inProgress); + return this.cacheAndFinalizeDocuments(documents, links); + } + + private async returnOnlyUrlsResponse(links: string[], inProgress?: (progress: Progress) => void): Promise { + inProgress?.({ + current: links.length, + total: links.length, + status: "COMPLETED", + currentDocumentUrl: this.urls[0], + }); + return links.map(url => ({ + content: "", + markdown: "", + metadata: { sourceURL: url }, + })); + } + + private async processLinks(links: string[], inProgress?: (progress: Progress) => void): Promise { + let pdfLinks = links.filter(link => link.endsWith(".pdf")); + let pdfDocuments = await this.fetchPdfDocuments(pdfLinks); + links = links.filter(link => !link.endsWith(".pdf")); + + let documents = await this.convertUrlsToDocuments(links, inProgress); + documents = await this.getSitemapData(this.urls[0], documents); + documents = this.applyPathReplacements(documents); + documents = await this.applyImgAltText(documents); + return documents.concat(pdfDocuments); + } + + private async fetchPdfDocuments(pdfLinks: string[]): Promise { + return Promise.all(pdfLinks.map(async pdfLink => { + const pdfContent = await fetchAndProcessPdf(pdfLink); + return { + content: pdfContent, + metadata: { sourceURL: pdfLink }, + provider: "web-scraper" + }; + })); + } + + private applyPathReplacements(documents: Document[]): Document[] { + return this.replaceAllPathsWithAbsolutePaths ? replacePathsWithAbsolutePaths(documents) : replaceImgPathsWithAbsolutePaths(documents); + } + + private async applyImgAltText(documents: Document[]): Promise { + return this.generateImgAltText ? 
this.generatesImgAltText(documents) : documents; + } + + private async cacheAndFinalizeDocuments(documents: Document[], links: string[]): Promise { + await this.setCachedDocuments(documents, links); + documents = this.removeChildLinks(documents); + return documents.splice(0, this.limit); + } + + private async processDocumentsWithCache(inProgress?: (progress: Progress) => void): Promise { + let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit)); if (documents.length < this.limit) { - const newDocuments: Document[] = await this.getDocuments( - false, - inProgress - ); - newDocuments.forEach((doc) => { - if ( - !documents.some( - (d) => - this.normalizeUrl(d.metadata.sourceURL) === - this.normalizeUrl(doc.metadata?.sourceURL) - ) - ) { - documents.push(doc); - } - }); + const newDocuments: Document[] = await this.getDocuments(false, inProgress); + documents = this.mergeNewDocuments(documents, newDocuments); } documents = this.filterDocsExcludeInclude(documents); documents = this.removeChildLinks(documents); - documents = documents.splice(0, this.limit); - return documents; + return documents.splice(0, this.limit); + } + + private mergeNewDocuments(existingDocuments: Document[], newDocuments: Document[]): Document[] { + newDocuments.forEach(doc => { + if (!existingDocuments.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) { + existingDocuments.push(doc); + } + }); + return existingDocuments; } private filterDocsExcludeInclude(documents: Document[]): Document[] { From 2aa09a3000ea67ff1ecb906a9bd944d906ded4db Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 4 May 2024 12:30:12 -0700 Subject: [PATCH 02/17] Nick: partial docs working, cleaner --- apps/api/src/controllers/crawl-status.ts | 3 ++- apps/api/src/controllers/search.ts | 2 +- apps/api/src/lib/entities.ts | 1 + apps/api/src/main/runWebScraper.ts | 5 ++++- apps/api/src/scraper/WebScraper/index.ts | 14 ++++++++++---- 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/apps/api/src/controllers/crawl-status.ts b/apps/api/src/controllers/crawl-status.ts index 3534cd16..05bdb758 100644 --- a/apps/api/src/controllers/crawl-status.ts +++ b/apps/api/src/controllers/crawl-status.ts @@ -19,7 +19,7 @@ export async function crawlStatusController(req: Request, res: Response) { return res.status(404).json({ error: "Job not found" }); } - const { current, current_url, total, current_step } = await job.progress(); + const { current, current_url, total, current_step, partialDocs } = await job.progress(); res.json({ status: await job.getState(), // progress: job.progress(), @@ -28,6 +28,7 @@ export async function crawlStatusController(req: Request, res: Response) { current_step: current_step, total: total, data: job.returnvalue, + partial_docs: partialDocs ?? [], }); } catch (error) { console.error(error); diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 5c2cf808..41270cb5 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -147,7 +147,7 @@ export async function searchController(req: Request, res: Response) { logJob({ success: result.success, message: result.error, - num_docs: result.data.length, + num_docs: result.data ? 
result.data.length : 0, docs: result.data, time_taken: timeTakenInSeconds, team_id: team_id, diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 4008785e..5b663f20 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -7,6 +7,7 @@ export interface Progress { [key: string]: any; }; currentDocumentUrl?: string; + currentDocument?: Document; } export type PageOptions = { diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 892a2a34..827eec57 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -10,13 +10,15 @@ export async function startWebScraperPipeline({ }: { job: Job; }) { + let partialDocs: Document[] = []; return (await runWebScraper({ url: job.data.url, mode: job.data.mode, crawlerOptions: job.data.crawlerOptions, pageOptions: job.data.pageOptions, inProgress: (progress) => { - job.progress(progress); + partialDocs.push(progress.currentDocument); + job.progress({...progress, partialDocs: partialDocs}); }, onSuccess: (result) => { job.moveToCompleted(result); @@ -69,6 +71,7 @@ export async function runWebScraper({ } const docs = (await provider.getDocuments(false, (progress: Progress) => { inProgress(progress); + })) as Document[]; if (docs.length === 0) { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index ebd96d08..0cf001fe 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -54,6 +54,7 @@ export class WebScraperDataProvider { total: totalUrls, status: "SCRAPING", currentDocumentUrl: url, + currentDocument: result }); } results[i + index] = result; @@ -114,9 +115,7 @@ export class WebScraperDataProvider { } private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise { - let documents = await this.convertUrlsToDocuments(this.urls, inProgress); - documents = await this.applyPathReplacements(documents); - documents = await this.applyImgAltText(documents); + let documents = await this.processLinks(this.urls, inProgress); return documents; } @@ -153,6 +152,13 @@ export class WebScraperDataProvider { documents = await this.getSitemapData(this.urls[0], documents); documents = this.applyPathReplacements(documents); documents = await this.applyImgAltText(documents); + + if(this.extractorOptions.mode === "llm-extraction" && this.mode === "single_urls") { + documents = await generateCompletions( + documents, + this.extractorOptions + ) + } return documents.concat(pdfDocuments); } @@ -275,7 +281,7 @@ export class WebScraperDataProvider { documents.push(cachedDocument); // get children documents - for (const childUrl of cachedDocument.childrenLinks) { + for (const childUrl of (cachedDocument.childrenLinks || [])) { const normalizedChildUrl = this.normalizeUrl(childUrl); const childCachedDocumentString = await getValue( "web-scraper-cache:" + normalizedChildUrl From 67f135a5b67f2dcf6e5f5adbb4cd76ea60929b28 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 4 May 2024 12:31:28 -0700 Subject: [PATCH 03/17] Update crawl-status.ts --- apps/api/src/controllers/crawl-status.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/crawl-status.ts b/apps/api/src/controllers/crawl-status.ts index 05bdb758..feda86c0 100644 --- a/apps/api/src/controllers/crawl-status.ts +++ b/apps/api/src/controllers/crawl-status.ts @@ -28,7 +28,7 @@ export async function crawlStatusController(req: Request, res: Response) { 
current_step: current_step, total: total, data: job.returnvalue, - partial_docs: partialDocs ?? [], + partial_data: partialDocs ?? [], }); } catch (error) { console.error(error); From 15b774e9749f1dd644e88c7c735631876a0a12e3 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 4 May 2024 12:44:30 -0700 Subject: [PATCH 04/17] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 0cf001fe..1e285520 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -7,7 +7,6 @@ import { getValue, setValue } from "../../services/redis"; import { getImageDescription } from "./utils/imageDescription"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths"; -import OpenAI from 'openai' import { generateCompletions } from "../../lib/LLM-extraction"; @@ -83,6 +82,11 @@ export class WebScraperDataProvider { } } + /** + * Process documents without cache handling each mode + * @param inProgress inProgress + * @returns documents + */ private async processDocumentsWithoutCache(inProgress?: (progress: Progress) => void): Promise { switch (this.mode) { case "crawl": From ce7bab7b35691ce565210101d953f6bab9df7143 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 4 May 2024 13:00:38 -0700 Subject: [PATCH 05/17] Update status.ts --- apps/api/src/controllers/status.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/status.ts b/apps/api/src/controllers/status.ts index bd1d2ead..90797874 100644 --- a/apps/api/src/controllers/status.ts +++ b/apps/api/src/controllers/status.ts @@ -8,7 +8,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons return res.status(404).json({ error: "Job not found" }); } - const { current, current_url, total, current_step } = await job.progress(); + const { current, current_url, total, current_step, partialDocs } = await job.progress(); res.json({ status: await job.getState(), // progress: job.progress(), @@ -17,6 +17,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons current_step: current_step, total: total, data: job.returnvalue, + partial_data: partialDocs ?? [], }); } catch (error) { console.error(error); From 5229a4902b48079a505fe8f318dff61a8acf2277 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 4 May 2024 13:09:11 -0700 Subject: [PATCH 06/17] Update search.ts --- apps/api/src/controllers/search.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 41270cb5..010af425 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -54,10 +54,12 @@ export async function searchHelper( // filter out social media links + console.log("Search results", searchOptions.limit); + const a = new WebScraperDataProvider(); await a.setOptions({ mode: "single_urls", - urls: res.map((r) => r.url), + urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 
7), crawlerOptions: { ...crawlerOptions, }, @@ -69,7 +71,7 @@ export async function searchHelper( }, }); - const docs = await a.getDocuments(true); + const docs = await a.getDocuments(false); if (docs.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; } From cd9a0840b5aa8eecc23d22332d6957efa8ae460b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 4 May 2024 13:13:15 -0700 Subject: [PATCH 07/17] Update search.ts --- apps/api/src/controllers/search.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 010af425..13939221 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -54,7 +54,6 @@ export async function searchHelper( // filter out social media links - console.log("Search results", searchOptions.limit); const a = new WebScraperDataProvider(); await a.setOptions({ From 797a7338eac922099e01b40db863e45579639e5e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 4 May 2024 13:26:18 -0700 Subject: [PATCH 08/17] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a66a0502..786b05d3 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Crawl and convert any website into LLM-ready markdown. Built by [Mendable.ai](https://mendable.ai?ref=gfirecrawl) and the firecrawl community. -_This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not completely ready for full self-host deployment yet, but you can already run it locally! - we're working on it_ +_This repository is in its early development stages. We are still merging custom modules in the mono repo. It's not completely yet ready for full self-host deployment, but you can already run it locally._ ## What is Firecrawl? From d1b6f6dcde63efa77793ea254bdc0c27e1d0b06c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 4 May 2024 13:49:09 -0700 Subject: [PATCH 09/17] Update fly.toml --- apps/api/fly.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/fly.toml b/apps/api/fly.toml index 4d285eb2..1272f4b9 100644 --- a/apps/api/fly.toml +++ b/apps/api/fly.toml @@ -17,9 +17,9 @@ kill_timeout = '5s' [http_service] internal_port = 8080 force_https = true - auto_stop_machines = true + auto_stop_machines = false auto_start_machines = true - min_machines_running = 0 + min_machines_running = 2 processes = ['app'] [[services]] From 6913fda710e34c1ea0b1b218e1bd56629ff15ef6 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 5 May 2024 10:13:22 -0700 Subject: [PATCH 10/17] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 786b05d3..9ac5636b 100644 --- a/README.md +++ b/README.md @@ -261,5 +261,4 @@ search_result = app.search(query) We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request. - *It is the sole responsibility of the end users to respect websites' policies when scraping, searching and crawling with Firecrawl. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling. 
By utilizing Firecrawl, you expressly agree to comply with these conditions.* From 538355f1af759292364a07028e4749f311aaac36 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 6 May 2024 11:36:44 -0300 Subject: [PATCH 11/17] Added toMarkdown option --- .../src/__tests__/e2e_withAuth/index.test.ts | 51 +++++++++++++++++++ apps/api/src/controllers/crawl.ts | 2 +- apps/api/src/controllers/crawlPreview.ts | 2 +- apps/api/src/controllers/scrape.ts | 6 +-- apps/api/src/controllers/search.ts | 1 + apps/api/src/lib/entities.ts | 4 +- apps/api/src/scraper/WebScraper/index.ts | 4 +- apps/api/src/scraper/WebScraper/single_url.ts | 10 ++-- 8 files changed, 67 insertions(+), 13 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index c6c59bcb..2e262306 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -81,6 +81,21 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.content).toContain("🔥 FireCrawl"); }, 30000); // 30 seconds timeout + + it("should return a successful response with a valid API key and toMarkdown set to false", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev", pageOptions: { toMarkdown: false } }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).not.toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain("FireCrawl"); + expect(response.body.data.content).toContain(" { @@ -250,6 +265,42 @@ describe("E2E Tests for API Routes", () => { "🔥 FireCrawl" ); }, 60000); // 60 seconds + + it("should return a successful response for a valid crawl job with toMarkdown set to false option", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev", pageOptions: { toMarkdown: false } }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(response.body.status).toBe("active"); + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 30000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).not.toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain( + "FireCrawl" + ); + 
expect(completedResponse.body.data[0].content).toContain( + " { diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 3d64f7f4..d5877aba 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -35,7 +35,7 @@ export async function crawlController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true }; if (mode === "single_urls" && !url.includes(",")) { try { diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 569be333..0b4a08ce 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true}; const job = await addWebScraperJob({ url: url, diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 849500ad..e03c0133 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -1,4 +1,4 @@ -import { ExtractorOptions } from './../lib/entities'; +import { ExtractorOptions, PageOptions } from './../lib/entities'; import { Request, Response } from "express"; import { WebScraperDataProvider } from "../scraper/WebScraper"; import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; @@ -13,7 +13,7 @@ export async function scrapeHelper( req: Request, team_id: string, crawlerOptions: any, - pageOptions: any, + pageOptions: PageOptions, extractorOptions: ExtractorOptions ): Promise<{ success: boolean; @@ -91,7 +91,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(status).json({ error }); } const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true }; const extractorOptions = req.body.extractorOptions ?? { mode: "markdown" } diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 13939221..6529edc7 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -66,6 +66,7 @@ export async function searchHelper( ...pageOptions, onlyMainContent: pageOptions?.onlyMainContent ?? true, fetchPageContent: pageOptions?.fetchPageContent ?? true, + toMarkdown: pageOptions?.toMarkdown ?? 
true, fallback: false, }, }); diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 5b663f20..6150cdd0 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -12,9 +12,9 @@ export interface Progress { export type PageOptions = { onlyMainContent?: boolean; + toMarkdown?: boolean; fallback?: boolean; - fetchPageContent?: boolean; - + fetchPageContent?: boolean; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 1e285520..2cfa84e7 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -45,7 +45,7 @@ export class WebScraperDataProvider { const batchUrls = urls.slice(i, i + this.concurrentRequests); await Promise.all( batchUrls.map(async (url, index) => { - const result = await scrapSingleUrl(url, true, this.pageOptions); + const result = await scrapSingleUrl(url, this.pageOptions?.toMarkdown ?? true, this.pageOptions); processedUrls++; if (inProgress) { inProgress({ @@ -323,7 +323,7 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? {onlyMainContent: false}; + this.pageOptions = options.pageOptions ?? {onlyMainContent: false, toMarkdown: true}; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index fab54bde..b7fa07aa 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -172,7 +172,9 @@ export async function scrapSingleUrl( //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(text, pageOptions); - + if (toMarkdown === false) { + return [cleanedHtml, text]; + } return [await parseMarkdown(cleanedHtml), text]; }; @@ -192,7 +194,7 @@ export async function scrapSingleUrl( return { url: urlToScrap, content: text, - markdown: text, + markdown: pageOptions.toMarkdown === false ? undefined : text, metadata: { ...metadata, sourceURL: urlToScrap }, } as Document; } @@ -215,14 +217,14 @@ export async function scrapSingleUrl( return { content: text, - markdown: text, + markdown: pageOptions.toMarkdown === false ? undefined : text, metadata: { ...metadata, sourceURL: urlToScrap }, } as Document; } catch (error) { console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); return { content: "", - markdown: "", + markdown: pageOptions.toMarkdown === false ? 
undefined : "", metadata: { sourceURL: urlToScrap }, } as Document; } From 509250c4ef6fe41d60f6d5ad8ed2a8a6495c6bf2 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 6 May 2024 19:45:56 -0300 Subject: [PATCH 12/17] changed to `includeHtml` --- .../src/__tests__/e2e_withAuth/index.test.ts | 44 +++++++++++-------- apps/api/src/controllers/crawl.ts | 5 ++- apps/api/src/controllers/crawlPreview.ts | 4 +- apps/api/src/controllers/scrape.ts | 15 ++++--- apps/api/src/controllers/search.ts | 10 +++-- apps/api/src/lib/entities.ts | 2 +- apps/api/src/main/runWebScraper.ts | 5 +++ apps/api/src/scraper/WebScraper/crawler.ts | 4 ++ apps/api/src/scraper/WebScraper/index.ts | 9 ++-- apps/api/src/scraper/WebScraper/single_url.ts | 17 +++---- apps/api/src/types.ts | 4 +- 11 files changed, 78 insertions(+), 41 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 2e262306..e0f725e5 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -79,22 +79,25 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("content"); expect(response.body.data).toHaveProperty("markdown"); expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); expect(response.body.data.content).toContain("🔥 FireCrawl"); }, 30000); // 30 seconds timeout - it("should return a successful response with a valid API key and toMarkdown set to false", async () => { + it("should return a successful response with a valid API key and includeHtml set to true", async () => { const response = await request(TEST_URL) .post("/v0/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev", pageOptions: { toMarkdown: false } }); + .send({ url: "https://firecrawl.dev", includeHtml: true }); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).not.toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("html"); expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.content).toContain("FireCrawl"); - expect(response.body.data.content).toContain(" { expect(response.statusCode).toBe(401); }); - it("should return an error for a blocklisted URL", async () => { - const blocklistedUrl = "https://instagram.com/fake-test"; - const response = await request(TEST_URL) - .post("/v0/crawlWebsitePreview") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: blocklistedUrl }); - expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it."); - }); + // it("should return an error for a blocklisted URL", async () => { + // const blocklistedUrl = "https://instagram.com/fake-test"; + // const response = await request(TEST_URL) + // .post("/v0/crawlWebsitePreview") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send({ url: blocklistedUrl }); + // // is returning 429 instead of 403 + // expect(response.statusCode).toBe(403); + // expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + // }); it("should return a successful response with a valid API key", async () => { const response = await request(TEST_URL) @@ -271,7 +275,7 @@ describe("E2E Tests for API Routes", () => { .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev", pageOptions: { toMarkdown: false } }); + .send({ url: "https://firecrawl.dev", includeHtml: true }); expect(crawlResponse.statusCode).toBe(200); const response = await request(TEST_URL) @@ -292,12 +296,16 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.status).toBe("completed"); expect(completedResponse.body).toHaveProperty("data"); expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).not.toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("html"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain( + "🔥 FireCrawl" + ); + expect(completedResponse.body.data[0].markdown).toContain( "FireCrawl" ); - expect(completedResponse.body.data[0].content).toContain( + expect(completedResponse.body.data[0].html).toContain( " { @@ -73,6 +75,7 @@ export async function crawlController(req: Request, res: Response) { team_id: team_id, pageOptions: pageOptions, origin: req.body.origin ?? "api", + includeHtml: includeHtml, }); res.json({ jobId: job.id }); diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 0b4a08ce..2b1b6767 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -26,7 +26,8 @@ export async function crawlPreviewController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true}; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const includeHtml = req.body.includeHtml ?? 
false; const job = await addWebScraperJob({ url: url, @@ -35,6 +36,7 @@ export async function crawlPreviewController(req: Request, res: Response) { team_id: "preview", pageOptions: pageOptions, origin: "website-preview", + includeHtml: includeHtml, }); res.json({ jobId: job.id }); diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index e03c0133..5bd61a5f 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -14,7 +14,8 @@ export async function scrapeHelper( team_id: string, crawlerOptions: any, pageOptions: PageOptions, - extractorOptions: ExtractorOptions + extractorOptions: ExtractorOptions, + includeHtml: boolean = false ): Promise<{ success: boolean; error?: string; @@ -39,7 +40,8 @@ export async function scrapeHelper( ...crawlerOptions, }, pageOptions: pageOptions, - extractorOptions: extractorOptions + extractorOptions: extractorOptions, + includeHtml: includeHtml }); const docs = await a.getDocuments(false); @@ -91,11 +93,12 @@ export async function scrapeController(req: Request, res: Response) { return res.status(status).json({ error }); } const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, toMarkdown: true }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; const extractorOptions = req.body.extractorOptions ?? { mode: "markdown" } const origin = req.body.origin ?? "api"; + const includeHtml = req.body.includeHtml ?? false; try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = @@ -113,7 +116,8 @@ export async function scrapeController(req: Request, res: Response) { team_id, crawlerOptions, pageOptions, - extractorOptions + extractorOptions, + includeHtml ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; @@ -132,7 +136,8 @@ export async function scrapeController(req: Request, res: Response) { pageOptions: pageOptions, origin: origin, extractor_options: extractorOptions, - num_tokens: numTokens + num_tokens: numTokens, + includeHtml: includeHtml }); return res.status(result.returnCode).json(result); } catch (error) { diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 6529edc7..314e475f 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -13,7 +13,8 @@ export async function searchHelper( team_id: string, crawlerOptions: any, pageOptions: PageOptions, - searchOptions: SearchOptions + searchOptions: SearchOptions, + includeHtml: boolean = false ): Promise<{ success: boolean; error?: string; @@ -59,6 +60,7 @@ export async function searchHelper( await a.setOptions({ mode: "single_urls", urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7), + includeHtml, crawlerOptions: { ...crawlerOptions, }, @@ -66,7 +68,6 @@ export async function searchHelper( ...pageOptions, onlyMainContent: pageOptions?.onlyMainContent ?? true, fetchPageContent: pageOptions?.fetchPageContent ?? true, - toMarkdown: pageOptions?.toMarkdown ?? true, fallback: false, }, }); @@ -125,6 +126,7 @@ export async function searchController(req: Request, res: Response) { const origin = req.body.origin ?? "api"; const searchOptions = req.body.searchOptions ?? { limit: 7 }; + const includeHtml = req.body.includeHtml ?? 
false; try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = @@ -142,7 +144,8 @@ export async function searchController(req: Request, res: Response) { team_id, crawlerOptions, pageOptions, - searchOptions + searchOptions, + includeHtml ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; @@ -158,6 +161,7 @@ export async function searchController(req: Request, res: Response) { crawlerOptions: crawlerOptions, pageOptions: pageOptions, origin: origin, + includeHtml, }); return res.status(result.returnCode).json(result); } catch (error) { diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 6150cdd0..b6340d87 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -12,7 +12,6 @@ export interface Progress { export type PageOptions = { onlyMainContent?: boolean; - toMarkdown?: boolean; fallback?: boolean; fetchPageContent?: boolean; }; @@ -47,6 +46,7 @@ export type WebScraperOptions = { pageOptions?: PageOptions; extractorOptions?: ExtractorOptions; concurrentRequests?: number; + includeHtml?: boolean; }; export interface DocumentUrl { diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 827eec57..798bb654 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -27,6 +27,7 @@ export async function startWebScraperPipeline({ job.moveToFailed(error); }, team_id: job.data.team_id, + includeHtml: job.data.includeHtml, })) as { success: boolean; message: string; docs: Document[] }; } export async function runWebScraper({ @@ -38,6 +39,7 @@ export async function runWebScraper({ onSuccess, onError, team_id, + includeHtml = false, }: { url: string; mode: "crawl" | "single_urls" | "sitemap"; @@ -47,6 +49,7 @@ export async function runWebScraper({ onSuccess: (result: any) => void; onError: (error: any) => void; team_id: string; + includeHtml?: boolean; }): Promise<{ success: boolean; message: string; @@ -60,6 +63,7 @@ export async function runWebScraper({ urls: [url], crawlerOptions: crawlerOptions, pageOptions: pageOptions, + includeHtml: includeHtml, }); } else { await provider.setOptions({ @@ -67,6 +71,7 @@ export async function runWebScraper({ urls: url.split(","), crawlerOptions: crawlerOptions, pageOptions: pageOptions, + includeHtml: includeHtml, }); } const docs = (await provider.getDocuments(false, (progress: Progress) => { diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 23cb6293..d3877b3f 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -19,6 +19,7 @@ export class WebCrawler { private robotsTxtUrl: string; private robots: any; private generateImgAltText: boolean; + private includeHtml: boolean; constructor({ initialUrl, @@ -27,6 +28,7 @@ export class WebCrawler { maxCrawledLinks, limit = 10000, generateImgAltText = false, + includeHtml = false, }: { initialUrl: string; includes?: string[]; @@ -34,6 +36,7 @@ export class WebCrawler { maxCrawledLinks?: number; limit?: number; generateImgAltText?: boolean; + includeHtml?: boolean; }) { this.initialUrl = initialUrl; this.baseUrl = new URL(initialUrl).origin; @@ -45,6 +48,7 @@ export class WebCrawler { // Deprecated, use limit instead this.maxCrawledLinks = maxCrawledLinks ?? limit; this.generateImgAltText = generateImgAltText ?? false; + this.includeHtml = includeHtml ?? 
false; } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 2cfa84e7..2a3916b6 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -24,6 +24,7 @@ export class WebScraperDataProvider { private extractorOptions?: ExtractorOptions; private replaceAllPathsWithAbsolutePaths?: boolean = false; private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo"; + private includeHtml: boolean = false; authorize(): void { throw new Error("Method not implemented."); @@ -45,7 +46,7 @@ export class WebScraperDataProvider { const batchUrls = urls.slice(i, i + this.concurrentRequests); await Promise.all( batchUrls.map(async (url, index) => { - const result = await scrapSingleUrl(url, this.pageOptions?.toMarkdown ?? true, this.pageOptions); + const result = await scrapSingleUrl(url, this.pageOptions, this.includeHtml); processedUrls++; if (inProgress) { inProgress({ @@ -108,6 +109,7 @@ export class WebScraperDataProvider { maxCrawledLinks: this.maxCrawledLinks, limit: this.limit, generateImgAltText: this.generateImgAltText, + includeHtml: this.includeHtml, }); let links = await crawler.start(inProgress, 5, this.limit); if (this.returnOnlyUrls) { @@ -142,6 +144,7 @@ export class WebScraperDataProvider { }); return links.map(url => ({ content: "", + html: this.includeHtml ? "" : undefined, markdown: "", metadata: { sourceURL: url }, })); @@ -323,10 +326,10 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? {onlyMainContent: false, toMarkdown: true}; + this.pageOptions = options.pageOptions ?? {onlyMainContent: false }; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; - + this.includeHtml = options?.includeHtml ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index b7fa07aa..4d071db3 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -103,8 +103,8 @@ export async function scrapWithPlaywright(url: string): Promise { export async function scrapSingleUrl( urlToScrap: string, - toMarkdown: boolean = true, - pageOptions: PageOptions = { onlyMainContent: true } + pageOptions: PageOptions = { onlyMainContent: true }, + includeHtml: boolean = false ): Promise { urlToScrap = urlToScrap.trim(); @@ -172,9 +172,7 @@ export async function scrapSingleUrl( //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(text, pageOptions); - if (toMarkdown === false) { - return [cleanedHtml, text]; - } + return [await parseMarkdown(cleanedHtml), text]; }; @@ -194,7 +192,8 @@ export async function scrapSingleUrl( return { url: urlToScrap, content: text, - markdown: pageOptions.toMarkdown === false ? undefined : text, + markdown: text, + html: includeHtml ? 
html : undefined, metadata: { ...metadata, sourceURL: urlToScrap }, } as Document; } @@ -217,14 +216,16 @@ export async function scrapSingleUrl( return { content: text, - markdown: pageOptions.toMarkdown === false ? undefined : text, + markdown: text, + html: includeHtml ? html : undefined, metadata: { ...metadata, sourceURL: urlToScrap }, } as Document; } catch (error) { console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); return { content: "", - markdown: pageOptions.toMarkdown === false ? undefined : "", + markdown: "", + html: "", metadata: { sourceURL: urlToScrap }, } as Document; } diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index c1858f1b..3fbdcdd5 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -25,6 +25,7 @@ export interface WebScraperOptions { pageOptions: any; team_id: string; origin?: string; + includeHtml?: boolean; } export interface FirecrawlJob { @@ -40,7 +41,8 @@ export interface FirecrawlJob { pageOptions?: any; origin: string; extractor_options?: ExtractorOptions, - num_tokens?: number + num_tokens?: number, + includeHtml?: boolean; } export enum RateLimiterMode { From 6d5da358cca6f6ef3b9d047fec7c1eea63997664 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 6 May 2024 17:16:43 -0700 Subject: [PATCH 13/17] Nick: cancel job --- apps/api/fly.toml | 15 +- .../src/__tests__/e2e_withAuth/index.test.ts | 35 +++++ apps/api/src/controllers/crawl-cancel.ts | 50 ++++++ apps/api/src/controllers/crawl.ts | 13 +- apps/api/src/lib/entities.ts | 1 + apps/api/src/main/runWebScraper.ts | 4 + apps/api/src/routes/v0.ts | 2 + apps/api/src/scraper/WebScraper/index.ts | 148 ++++++++++++------ apps/api/src/services/logging/crawl_log.ts | 17 ++ 9 files changed, 236 insertions(+), 49 deletions(-) create mode 100644 apps/api/src/controllers/crawl-cancel.ts create mode 100644 apps/api/src/services/logging/crawl_log.ts diff --git a/apps/api/fly.toml b/apps/api/fly.toml index 1272f4b9..ca619d16 100644 --- a/apps/api/fly.toml +++ b/apps/api/fly.toml @@ -22,6 +22,11 @@ kill_timeout = '5s' min_machines_running = 2 processes = ['app'] +[http_service.concurrency] + type = "requests" + hard_limit = 200 + soft_limit = 100 + [[services]] protocol = 'tcp' internal_port = 8080 @@ -38,10 +43,14 @@ kill_timeout = '5s' [services.concurrency] type = 'connections' - hard_limit = 45 - soft_limit = 20 + hard_limit = 75 + soft_limit = 30 [[vm]] - size = 'performance-1x' + size = 'performance-4x' + processes = ['app'] + + + diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index c6c59bcb..78d20e44 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -252,6 +252,41 @@ describe("E2E Tests for API Routes", () => { }, 60000); // 60 seconds }); + it("If someone cancels a crawl job, it should turn into failed status", async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://jestjs.io" }); + expect(crawlResponse.statusCode).toBe(200); + + + + // wait for 30 seconds + await new Promise((r) => setTimeout(r, 10000)); + + const response = await request(TEST_URL) + .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + 
expect(response.body.status).toBe("cancelled"); + + await new Promise((r) => setTimeout(r, 20000)); + + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("failed"); + expect(completedResponse.body.partial_data?.length ?? 0).toBeLessThanOrEqual(completedResponse.body.data?.length ?? 0); + + + }, 60000); // 60 seconds + + + describe("POST /v0/scrape with LLM Extraction", () => { it("should extract data using LLM extraction mode", async () => { const response = await request(TEST_URL) diff --git a/apps/api/src/controllers/crawl-cancel.ts b/apps/api/src/controllers/crawl-cancel.ts new file mode 100644 index 00000000..7523b785 --- /dev/null +++ b/apps/api/src/controllers/crawl-cancel.ts @@ -0,0 +1,50 @@ +import { Request, Response } from "express"; +import { authenticateUser } from "./auth"; +import { RateLimiterMode } from "../../src/types"; +import { addWebScraperJob } from "../../src/services/queue-jobs"; +import { getWebScraperQueue } from "../../src/services/queue-service"; +import { supabase_service } from "../../src/services/supabase"; + +export async function crawlCancelController(req: Request, res: Response) { + try { + const { success, team_id, error, status } = await authenticateUser( + req, + res, + RateLimiterMode.CrawlStatus + ); + if (!success) { + return res.status(status).json({ error }); + } + const job = await getWebScraperQueue().getJob(req.params.jobId); + if (!job) { + return res.status(404).json({ error: "Job not found" }); + } + + // check if the job belongs to the team + const {data, error: supaError}= await supabase_service.from("bulljobs_teams").select("*").eq("job_id", req.params.jobId).eq("team_id", team_id); + if (supaError) { + return res.status(500).json({ error: supaError.message }); + } + + if (data.length === 0) { + return res.status(403).json({ error: "Unauthorized" }); + } + + try { + await job.moveToFailed(Error("Job cancelled by user"), true); + + } catch (error) { + console.error(error); + + } + + const jobState = await job.getState(); + + res.json({ + status: jobState === "failed" ? "cancelled" : "Cancelling...", + }); + } catch (error) { + console.error(error); + return res.status(500).json({ error: error.message }); + } +} diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 3d64f7f4..8b5249bb 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -6,6 +6,7 @@ import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; +import { logCrawl } from "../../src/services/logging/crawl_log"; export async function crawlController(req: Request, res: Response) { try { @@ -30,9 +31,14 @@ export async function crawlController(req: Request, res: Response) { } if (isUrlBlocked(url)) { - return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." }); + return res + .status(403) + .json({ + error: + "Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it.", + }); } - + const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; @@ -66,6 +72,7 @@ export async function crawlController(req: Request, res: Response) { return res.status(500).json({ error: error.message }); } } + const job = await addWebScraperJob({ url: url, mode: mode ?? "crawl", // fix for single urls not working @@ -75,6 +82,8 @@ export async function crawlController(req: Request, res: Response) { origin: req.body.origin ?? "api", }); + await logCrawl(job.id.toString(), team_id); + res.json({ jobId: job.id }); } catch (error) { console.error(error); diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 5b663f20..1bb94294 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -47,6 +47,7 @@ export type WebScraperOptions = { pageOptions?: PageOptions; extractorOptions?: ExtractorOptions; concurrentRequests?: number; + bullJobId?: string; }; export interface DocumentUrl { diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 827eec57..252f2e46 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -27,6 +27,7 @@ export async function startWebScraperPipeline({ job.moveToFailed(error); }, team_id: job.data.team_id, + bull_job_id: job.id.toString(), })) as { success: boolean; message: string; docs: Document[] }; } export async function runWebScraper({ @@ -38,6 +39,7 @@ export async function runWebScraper({ onSuccess, onError, team_id, + bull_job_id, }: { url: string; mode: "crawl" | "single_urls" | "sitemap"; @@ -47,6 +49,7 @@ export async function runWebScraper({ onSuccess: (result: any) => void; onError: (error: any) => void; team_id: string; + bull_job_id: string; }): Promise<{ success: boolean; message: string; @@ -60,6 +63,7 @@ export async function runWebScraper({ urls: [url], crawlerOptions: crawlerOptions, pageOptions: pageOptions, + bullJobId: bull_job_id, }); } else { await provider.setOptions({ diff --git a/apps/api/src/routes/v0.ts b/apps/api/src/routes/v0.ts index f84b974b..42b8814d 100644 --- a/apps/api/src/routes/v0.ts +++ b/apps/api/src/routes/v0.ts @@ -5,6 +5,7 @@ import { scrapeController } from "../../src/controllers/scrape"; import { crawlPreviewController } from "../../src/controllers/crawlPreview"; import { crawlJobStatusPreviewController } from "../../src/controllers/status"; import { searchController } from "../../src/controllers/search"; +import { crawlCancelController } from "../../src/controllers/crawl-cancel"; export const v0Router = express.Router(); @@ -12,6 +13,7 @@ v0Router.post("/v0/scrape", scrapeController); v0Router.post("/v0/crawl", crawlController); v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController); v0Router.get("/v0/crawl/status/:jobId", crawlStatusController); +v0Router.delete("/v0/crawl/cancel/:jobId", crawlCancelController); v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController); // Search routes diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 1e285520..18624e1a 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -1,4 +1,9 @@ -import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities"; +import { + Document, + ExtractorOptions, + PageOptions, + WebScraperOptions, +} from "../../lib/entities"; 
import { Progress } from "../../lib/entities"; import { scrapSingleUrl } from "./single_url"; import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap"; @@ -6,11 +11,15 @@ import { WebCrawler } from "./crawler"; import { getValue, setValue } from "../../services/redis"; import { getImageDescription } from "./utils/imageDescription"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; -import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths"; +import { + replaceImgPathsWithAbsolutePaths, + replacePathsWithAbsolutePaths, +} from "./utils/replacePaths"; import { generateCompletions } from "../../lib/LLM-extraction"; - +import { getWebScraperQueue } from "../../../src/services/queue-service"; export class WebScraperDataProvider { + private bullJobId: string; private urls: string[] = [""]; private mode: "single_urls" | "sitemap" | "crawl" = "single_urls"; private includes: string[]; @@ -23,7 +32,8 @@ export class WebScraperDataProvider { private pageOptions?: PageOptions; private extractorOptions?: ExtractorOptions; private replaceAllPathsWithAbsolutePaths?: boolean = false; - private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo"; + private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = + "gpt-4-turbo"; authorize(): void { throw new Error("Method not implemented."); @@ -39,7 +49,7 @@ export class WebScraperDataProvider { ): Promise { const totalUrls = urls.length; let processedUrls = 0; - + const results: (Document | null)[] = new Array(urls.length).fill(null); for (let i = 0; i < urls.length; i += this.concurrentRequests) { const batchUrls = urls.slice(i, i + this.concurrentRequests); @@ -53,12 +63,20 @@ export class WebScraperDataProvider { total: totalUrls, status: "SCRAPING", currentDocumentUrl: url, - currentDocument: result + currentDocument: result, }); } + results[i + index] = result; }) ); + const job = await getWebScraperQueue().getJob(this.bullJobId); + const jobStatus = await job.getState(); + if (jobStatus === "failed") { + throw new Error( + "Job has failed or has been cancelled by the user. Stopping the job..." 
+ ); + } } return results.filter((result) => result !== null) as Document[]; } @@ -87,7 +105,9 @@ export class WebScraperDataProvider { * @param inProgress inProgress * @returns documents */ - private async processDocumentsWithoutCache(inProgress?: (progress: Progress) => void): Promise { + private async processDocumentsWithoutCache( + inProgress?: (progress: Progress) => void + ): Promise { switch (this.mode) { case "crawl": return this.handleCrawlMode(inProgress); @@ -100,7 +120,9 @@ export class WebScraperDataProvider { } } - private async handleCrawlMode(inProgress?: (progress: Progress) => void): Promise { + private async handleCrawlMode( + inProgress?: (progress: Progress) => void + ): Promise { const crawler = new WebCrawler({ initialUrl: this.urls[0], includes: this.includes, @@ -118,12 +140,16 @@ export class WebScraperDataProvider { return this.cacheAndFinalizeDocuments(documents, links); } - private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise { + private async handleSingleUrlsMode( + inProgress?: (progress: Progress) => void + ): Promise { let documents = await this.processLinks(this.urls, inProgress); return documents; } - private async handleSitemapMode(inProgress?: (progress: Progress) => void): Promise { + private async handleSitemapMode( + inProgress?: (progress: Progress) => void + ): Promise { let links = await getLinksFromSitemap(this.urls[0]); if (this.returnOnlyUrls) { return this.returnOnlyUrlsResponse(links, inProgress); @@ -133,68 +159,90 @@ export class WebScraperDataProvider { return this.cacheAndFinalizeDocuments(documents, links); } - private async returnOnlyUrlsResponse(links: string[], inProgress?: (progress: Progress) => void): Promise { + private async returnOnlyUrlsResponse( + links: string[], + inProgress?: (progress: Progress) => void + ): Promise { inProgress?.({ current: links.length, total: links.length, status: "COMPLETED", currentDocumentUrl: this.urls[0], }); - return links.map(url => ({ + return links.map((url) => ({ content: "", markdown: "", metadata: { sourceURL: url }, })); } - private async processLinks(links: string[], inProgress?: (progress: Progress) => void): Promise { - let pdfLinks = links.filter(link => link.endsWith(".pdf")); + private async processLinks( + links: string[], + inProgress?: (progress: Progress) => void + ): Promise { + let pdfLinks = links.filter((link) => link.endsWith(".pdf")); let pdfDocuments = await this.fetchPdfDocuments(pdfLinks); - links = links.filter(link => !link.endsWith(".pdf")); + links = links.filter((link) => !link.endsWith(".pdf")); let documents = await this.convertUrlsToDocuments(links, inProgress); documents = await this.getSitemapData(this.urls[0], documents); documents = this.applyPathReplacements(documents); documents = await this.applyImgAltText(documents); - - if(this.extractorOptions.mode === "llm-extraction" && this.mode === "single_urls") { - documents = await generateCompletions( - documents, - this.extractorOptions - ) + + if ( + this.extractorOptions.mode === "llm-extraction" && + this.mode === "single_urls" + ) { + documents = await generateCompletions(documents, this.extractorOptions); } return documents.concat(pdfDocuments); } private async fetchPdfDocuments(pdfLinks: string[]): Promise { - return Promise.all(pdfLinks.map(async pdfLink => { - const pdfContent = await fetchAndProcessPdf(pdfLink); - return { - content: pdfContent, - metadata: { sourceURL: pdfLink }, - provider: "web-scraper" - }; - })); + return Promise.all( + pdfLinks.map(async (pdfLink) 
=> { + const pdfContent = await fetchAndProcessPdf(pdfLink); + return { + content: pdfContent, + metadata: { sourceURL: pdfLink }, + provider: "web-scraper", + }; + }) + ); } private applyPathReplacements(documents: Document[]): Document[] { - return this.replaceAllPathsWithAbsolutePaths ? replacePathsWithAbsolutePaths(documents) : replaceImgPathsWithAbsolutePaths(documents); + return this.replaceAllPathsWithAbsolutePaths + ? replacePathsWithAbsolutePaths(documents) + : replaceImgPathsWithAbsolutePaths(documents); } private async applyImgAltText(documents: Document[]): Promise { - return this.generateImgAltText ? this.generatesImgAltText(documents) : documents; + return this.generateImgAltText + ? this.generatesImgAltText(documents) + : documents; } - private async cacheAndFinalizeDocuments(documents: Document[], links: string[]): Promise { + private async cacheAndFinalizeDocuments( + documents: Document[], + links: string[] + ): Promise { await this.setCachedDocuments(documents, links); documents = this.removeChildLinks(documents); return documents.splice(0, this.limit); } - private async processDocumentsWithCache(inProgress?: (progress: Progress) => void): Promise { - let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit)); + private async processDocumentsWithCache( + inProgress?: (progress: Progress) => void + ): Promise { + let documents = await this.getCachedDocuments( + this.urls.slice(0, this.limit) + ); if (documents.length < this.limit) { - const newDocuments: Document[] = await this.getDocuments(false, inProgress); + const newDocuments: Document[] = await this.getDocuments( + false, + inProgress + ); documents = this.mergeNewDocuments(documents, newDocuments); } documents = this.filterDocsExcludeInclude(documents); @@ -202,9 +250,18 @@ export class WebScraperDataProvider { return documents.splice(0, this.limit); } - private mergeNewDocuments(existingDocuments: Document[], newDocuments: Document[]): Document[] { - newDocuments.forEach(doc => { - if (!existingDocuments.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) { + private mergeNewDocuments( + existingDocuments: Document[], + newDocuments: Document[] + ): Document[] { + newDocuments.forEach((doc) => { + if ( + !existingDocuments.some( + (d) => + this.normalizeUrl(d.metadata.sourceURL) === + this.normalizeUrl(doc.metadata?.sourceURL) + ) + ) { existingDocuments.push(doc); } }); @@ -285,7 +342,7 @@ export class WebScraperDataProvider { documents.push(cachedDocument); // get children documents - for (const childUrl of (cachedDocument.childrenLinks || [])) { + for (const childUrl of cachedDocument.childrenLinks || []) { const normalizedChildUrl = this.normalizeUrl(childUrl); const childCachedDocumentString = await getValue( "web-scraper-cache:" + normalizedChildUrl @@ -313,6 +370,7 @@ export class WebScraperDataProvider { throw new Error("Urls are required"); } + this.bullJobId = options.bullJobId; this.urls = options.urls; this.mode = options.mode; this.concurrentRequests = options.concurrentRequests ?? 20; @@ -323,9 +381,10 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? {onlyMainContent: false}; - this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} - this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? 
false; + this.pageOptions = options.pageOptions ?? { onlyMainContent: false }; + this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; + this.replaceAllPathsWithAbsolutePaths = + options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); @@ -396,8 +455,9 @@ export class WebScraperDataProvider { altText = await getImageDescription( imageUrl, backText, - frontText - , this.generateImgAltTextModel); + frontText, + this.generateImgAltTextModel + ); } document.content = document.content.replace( diff --git a/apps/api/src/services/logging/crawl_log.ts b/apps/api/src/services/logging/crawl_log.ts new file mode 100644 index 00000000..76a06072 --- /dev/null +++ b/apps/api/src/services/logging/crawl_log.ts @@ -0,0 +1,17 @@ +import { supabase_service } from "../supabase"; +import "dotenv/config"; + +export async function logCrawl(job_id: string, team_id: string) { + try { + const { data, error } = await supabase_service + .from("bulljobs_teams") + .insert([ + { + job_id: job_id, + team_id: team_id, + }, + ]); + } catch (error) { + console.error("Error logging crawl job:\n", error); + } +} From 2e3ff855092881d6371baae92251213654df6d53 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 6 May 2024 17:22:16 -0700 Subject: [PATCH 14/17] Update crawl-cancel.ts --- apps/api/src/controllers/crawl-cancel.ts | 30 +++++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/apps/api/src/controllers/crawl-cancel.ts b/apps/api/src/controllers/crawl-cancel.ts index 7523b785..8e8ba313 100644 --- a/apps/api/src/controllers/crawl-cancel.ts +++ b/apps/api/src/controllers/crawl-cancel.ts @@ -4,6 +4,7 @@ import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; import { getWebScraperQueue } from "../../src/services/queue-service"; import { supabase_service } from "../../src/services/supabase"; +import { billTeam } from "../../src/services/billing/credit_billing"; export async function crawlCancelController(req: Request, res: Response) { try { @@ -21,7 +22,11 @@ export async function crawlCancelController(req: Request, res: Response) { } // check if the job belongs to the team - const {data, error: supaError}= await supabase_service.from("bulljobs_teams").select("*").eq("job_id", req.params.jobId).eq("team_id", team_id); + const { data, error: supaError } = await supabase_service + .from("bulljobs_teams") + .select("*") + .eq("job_id", req.params.jobId) + .eq("team_id", team_id); if (supaError) { return res.status(500).json({ error: supaError.message }); } @@ -29,19 +34,26 @@ export async function crawlCancelController(req: Request, res: Response) { if (data.length === 0) { return res.status(403).json({ error: "Unauthorized" }); } + const jobState = await job.getState(); + const { partialDocs } = await job.progress(); - try { - await job.moveToFailed(Error("Job cancelled by user"), true); - - } catch (error) { - console.error(error); - + if (partialDocs && partialDocs.length > 0 && jobState === "active") { + console.log("Billing team for partial docs..."); + // Note: the credits that we will bill them here might be lower than the actual + // due to promises that are not yet resolved + await billTeam(team_id, partialDocs.length); } - const jobState = await job.getState(); + try { + await 
job.moveToFailed(Error("Job cancelled by user"), true); + } catch (error) { + console.error(error); + } + + const newJobState = await job.getState(); res.json({ - status: jobState === "failed" ? "cancelled" : "Cancelling...", + status: newJobState === "failed" ? "cancelled" : "Cancelling...", }); } catch (error) { console.error(error); From f46bf19fa53dc8a2dbe8fb8b8804f3ebd853f5fe Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 7 May 2024 09:26:52 -0700 Subject: [PATCH 15/17] Nick: --- .../src/__tests__/e2e_withAuth/index.test.ts | 1 - apps/api/src/scraper/WebScraper/index.ts | 18 +++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 78d20e44..9a16073c 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -280,7 +280,6 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body.status).toBe("failed"); - expect(completedResponse.body.partial_data?.length ?? 0).toBeLessThanOrEqual(completedResponse.body.data?.length ?? 0); }, 60000); // 60 seconds diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 18624e1a..3a778437 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -70,13 +70,17 @@ export class WebScraperDataProvider { results[i + index] = result; }) ); - const job = await getWebScraperQueue().getJob(this.bullJobId); - const jobStatus = await job.getState(); - if (jobStatus === "failed") { - throw new Error( - "Job has failed or has been cancelled by the user. Stopping the job..." - ); - } + try { + if (this.mode === "crawl" && this.bullJobId) { + const job = await getWebScraperQueue().getJob(this.bullJobId); + const jobStatus = await job.getState(); + if (jobStatus === "failed") { + throw new Error( + "Job has failed or has been cancelled by the user. Stopping the job..." 
+ ); + } + } + } catch (error) {} } return results.filter((result) => result !== null) as Document[]; } From e1f52c538fd8852fe977303fc929077b77faf77b Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 7 May 2024 13:40:24 -0300 Subject: [PATCH 16/17] nested includeHtml inside pageOptions --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 6 +++--- apps/api/src/controllers/crawl.ts | 5 +---- apps/api/src/controllers/crawlPreview.ts | 4 +--- apps/api/src/controllers/scrape.ts | 7 +------ apps/api/src/controllers/search.ts | 7 ++----- apps/api/src/lib/entities.ts | 4 ++-- apps/api/src/main/runWebScraper.ts | 11 +++-------- apps/api/src/scraper/WebScraper/crawler.ts | 4 ---- apps/api/src/scraper/WebScraper/index.ts | 9 +++------ apps/api/src/scraper/WebScraper/single_url.ts | 7 +++---- apps/api/src/types.ts | 2 -- 11 files changed, 19 insertions(+), 47 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index e0f725e5..644ad362 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -88,7 +88,7 @@ describe("E2E Tests for API Routes", () => { .post("/v0/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev", includeHtml: true }); + .send({ url: "https://firecrawl.dev", pageOptions: { includeHtml: true }}); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); expect(response.body.data).toHaveProperty("content"); @@ -270,12 +270,12 @@ describe("E2E Tests for API Routes", () => { ); }, 60000); // 60 seconds - it("should return a successful response for a valid crawl job with toMarkdown set to false option", async () => { + it("should return a successful response for a valid crawl job with includeHtml set to true option", async () => { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev", includeHtml: true }); + .send({ url: "https://firecrawl.dev", pageOptions: { includeHtml: true } }); expect(crawlResponse.statusCode).toBe(200); const response = await request(TEST_URL) diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index d4320922..3ba92139 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -35,8 +35,7 @@ export async function crawlController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; - const includeHtml = req.body.includeHtml || false; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; if (mode === "single_urls" && !url.includes(",")) { try { @@ -48,7 +47,6 @@ export async function crawlController(req: Request, res: Response) { returnOnlyUrls: true, }, pageOptions: pageOptions, - includeHtml: includeHtml, }); const docs = await a.getDocuments(false, (progress) => { @@ -75,7 +73,6 @@ export async function crawlController(req: Request, res: Response) { team_id: team_id, pageOptions: pageOptions, origin: req.body.origin ?? 
"api", - includeHtml: includeHtml, }); res.json({ jobId: job.id }); diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 2b1b6767..d3e9afea 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -26,8 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) { const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; - const includeHtml = req.body.includeHtml ?? false; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; const job = await addWebScraperJob({ url: url, @@ -36,7 +35,6 @@ export async function crawlPreviewController(req: Request, res: Response) { team_id: "preview", pageOptions: pageOptions, origin: "website-preview", - includeHtml: includeHtml, }); res.json({ jobId: job.id }); diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 5bd61a5f..021a9d05 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -15,7 +15,6 @@ export async function scrapeHelper( crawlerOptions: any, pageOptions: PageOptions, extractorOptions: ExtractorOptions, - includeHtml: boolean = false ): Promise<{ success: boolean; error?: string; @@ -41,7 +40,6 @@ export async function scrapeHelper( }, pageOptions: pageOptions, extractorOptions: extractorOptions, - includeHtml: includeHtml }); const docs = await a.getDocuments(false); @@ -93,12 +91,11 @@ export async function scrapeController(req: Request, res: Response) { return res.status(status).json({ error }); } const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false }; const extractorOptions = req.body.extractorOptions ?? { mode: "markdown" } const origin = req.body.origin ?? "api"; - const includeHtml = req.body.includeHtml ?? false; try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = @@ -117,7 +114,6 @@ export async function scrapeController(req: Request, res: Response) { crawlerOptions, pageOptions, extractorOptions, - includeHtml ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; @@ -137,7 +133,6 @@ export async function scrapeController(req: Request, res: Response) { origin: origin, extractor_options: extractorOptions, num_tokens: numTokens, - includeHtml: includeHtml }); return res.status(result.returnCode).json(result); } catch (error) { diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 314e475f..d98c08d5 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -14,7 +14,6 @@ export async function searchHelper( crawlerOptions: any, pageOptions: PageOptions, searchOptions: SearchOptions, - includeHtml: boolean = false ): Promise<{ success: boolean; error?: string; @@ -60,7 +59,6 @@ export async function searchHelper( await a.setOptions({ mode: "single_urls", urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7), - includeHtml, crawlerOptions: { ...crawlerOptions, }, @@ -68,6 +66,7 @@ export async function searchHelper( ...pageOptions, onlyMainContent: pageOptions?.onlyMainContent ?? true, fetchPageContent: pageOptions?.fetchPageContent ?? true, + includeHtml: pageOptions?.includeHtml ?? 
false, fallback: false, }, }); @@ -119,6 +118,7 @@ export async function searchController(req: Request, res: Response) { } const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { + includeHtml: false, onlyMainContent: true, fetchPageContent: true, fallback: false, @@ -126,7 +126,6 @@ export async function searchController(req: Request, res: Response) { const origin = req.body.origin ?? "api"; const searchOptions = req.body.searchOptions ?? { limit: 7 }; - const includeHtml = req.body.includeHtml ?? false; try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = @@ -145,7 +144,6 @@ export async function searchController(req: Request, res: Response) { crawlerOptions, pageOptions, searchOptions, - includeHtml ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; @@ -161,7 +159,6 @@ export async function searchController(req: Request, res: Response) { crawlerOptions: crawlerOptions, pageOptions: pageOptions, origin: origin, - includeHtml, }); return res.status(result.returnCode).json(result); } catch (error) { diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index b6340d87..0a6a90eb 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -12,8 +12,9 @@ export interface Progress { export type PageOptions = { onlyMainContent?: boolean; + includeHtml?: boolean; fallback?: boolean; - fetchPageContent?: boolean; + fetchPageContent?: boolean; }; export type ExtractorOptions = { @@ -46,7 +47,6 @@ export type WebScraperOptions = { pageOptions?: PageOptions; extractorOptions?: ExtractorOptions; concurrentRequests?: number; - includeHtml?: boolean; }; export interface DocumentUrl { diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 798bb654..189d5005 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -26,8 +26,7 @@ export async function startWebScraperPipeline({ onError: (error) => { job.moveToFailed(error); }, - team_id: job.data.team_id, - includeHtml: job.data.includeHtml, + team_id: job.data.team_id })) as { success: boolean; message: string; docs: Document[] }; } export async function runWebScraper({ @@ -39,7 +38,6 @@ export async function runWebScraper({ onSuccess, onError, team_id, - includeHtml = false, }: { url: string; mode: "crawl" | "single_urls" | "sitemap"; @@ -49,7 +47,6 @@ export async function runWebScraper({ onSuccess: (result: any) => void; onError: (error: any) => void; team_id: string; - includeHtml?: boolean; }): Promise<{ success: boolean; message: string; @@ -62,16 +59,14 @@ export async function runWebScraper({ mode: mode, urls: [url], crawlerOptions: crawlerOptions, - pageOptions: pageOptions, - includeHtml: includeHtml, + pageOptions: pageOptions }); } else { await provider.setOptions({ mode: mode, urls: url.split(","), crawlerOptions: crawlerOptions, - pageOptions: pageOptions, - includeHtml: includeHtml, + pageOptions: pageOptions }); } const docs = (await provider.getDocuments(false, (progress: Progress) => { diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index d3877b3f..23cb6293 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -19,7 +19,6 @@ export class WebCrawler { private robotsTxtUrl: string; private robots: any; private generateImgAltText: boolean; - private includeHtml: boolean; constructor({ initialUrl, @@ -28,7 +27,6 @@ 
export class WebCrawler { maxCrawledLinks, limit = 10000, generateImgAltText = false, - includeHtml = false, }: { initialUrl: string; includes?: string[]; @@ -36,7 +34,6 @@ export class WebCrawler { maxCrawledLinks?: number; limit?: number; generateImgAltText?: boolean; - includeHtml?: boolean; }) { this.initialUrl = initialUrl; this.baseUrl = new URL(initialUrl).origin; @@ -48,7 +45,6 @@ export class WebCrawler { // Deprecated, use limit instead this.maxCrawledLinks = maxCrawledLinks ?? limit; this.generateImgAltText = generateImgAltText ?? false; - this.includeHtml = includeHtml ?? false; } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 2a3916b6..ed49f1da 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -24,7 +24,6 @@ export class WebScraperDataProvider { private extractorOptions?: ExtractorOptions; private replaceAllPathsWithAbsolutePaths?: boolean = false; private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo"; - private includeHtml: boolean = false; authorize(): void { throw new Error("Method not implemented."); @@ -46,7 +45,7 @@ export class WebScraperDataProvider { const batchUrls = urls.slice(i, i + this.concurrentRequests); await Promise.all( batchUrls.map(async (url, index) => { - const result = await scrapSingleUrl(url, this.pageOptions, this.includeHtml); + const result = await scrapSingleUrl(url, this.pageOptions); processedUrls++; if (inProgress) { inProgress({ @@ -109,7 +108,6 @@ export class WebScraperDataProvider { maxCrawledLinks: this.maxCrawledLinks, limit: this.limit, generateImgAltText: this.generateImgAltText, - includeHtml: this.includeHtml, }); let links = await crawler.start(inProgress, 5, this.limit); if (this.returnOnlyUrls) { @@ -144,7 +142,7 @@ export class WebScraperDataProvider { }); return links.map(url => ({ content: "", - html: this.includeHtml ? "" : undefined, + html: this.pageOptions?.includeHtml ? "" : undefined, markdown: "", metadata: { sourceURL: url }, })); @@ -326,10 +324,9 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? {onlyMainContent: false }; + this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false }; this.extractorOptions = options.extractorOptions ?? {mode: "markdown"} this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; - this.includeHtml = options?.includeHtml ?? false; //! @nicolas, for some reason this was being injected and breakign everything. 
Don't have time to find source of the issue so adding this check this.excludes = this.excludes.filter((item) => item !== ""); diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 4d071db3..a67ce310 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -103,8 +103,7 @@ export async function scrapWithPlaywright(url: string): Promise { export async function scrapSingleUrl( urlToScrap: string, - pageOptions: PageOptions = { onlyMainContent: true }, - includeHtml: boolean = false + pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }, ): Promise { urlToScrap = urlToScrap.trim(); @@ -193,7 +192,7 @@ export async function scrapSingleUrl( url: urlToScrap, content: text, markdown: text, - html: includeHtml ? html : undefined, + html: pageOptions.includeHtml ? html : undefined, metadata: { ...metadata, sourceURL: urlToScrap }, } as Document; } @@ -217,7 +216,7 @@ export async function scrapSingleUrl( return { content: text, markdown: text, - html: includeHtml ? html : undefined, + html: pageOptions.includeHtml ? html : undefined, metadata: { ...metadata, sourceURL: urlToScrap }, } as Document; } catch (error) { diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 3fbdcdd5..b9b5463d 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -25,7 +25,6 @@ export interface WebScraperOptions { pageOptions: any; team_id: string; origin?: string; - includeHtml?: boolean; } export interface FirecrawlJob { @@ -42,7 +41,6 @@ export interface FirecrawlJob { origin: string; extractor_options?: ExtractorOptions, num_tokens?: number, - includeHtml?: boolean; } export enum RateLimiterMode { From 61d615c04b2aa629e33affd0afc6e400b863300d Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 7 May 2024 14:03:00 -0300 Subject: [PATCH 17/17] Added tests --- apps/api/src/__tests__/e2e_withAuth/index.test.ts | 9 +++++++-- apps/api/src/scraper/WebScraper/index.ts | 4 +++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 9a16073c..3e82fb41 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -280,8 +280,13 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body.status).toBe("failed"); - - + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data).toEqual(null); + expect(completedResponse.body).toHaveProperty("partial_data"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); + }, 60000); // 60 seconds diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 3a778437..96112f8d 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -80,7 +80,9 @@ export class WebScraperDataProvider { ); } } - } catch (error) {} + } catch (error) { + console.error(error); + } } return results.filter((result) => result !== null) as Document[]; }
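
For context on how the routes and options introduced in this patch series fit together, below is a rough client-side sketch — not part of the patches — showing a crawl started with the now-nested `pageOptions.includeHtml` flag, cancelled via the new `DELETE /v0/crawl/cancel/:jobId` route, and its partially scraped documents read back as `partial_data` from the status endpoint (as asserted in the PATCH 17 tests). The base URL and API key are placeholders; error handling is omitted.

```ts
// Illustrative sketch only — assumes the API is reachable at BASE_URL and that
// FIRECRAWL_API_KEY holds a valid bearer token. Endpoint paths and response
// shapes are taken from apps/api/src/routes/v0.ts and the e2e tests above.
const BASE_URL = process.env.FIRECRAWL_BASE_URL ?? "http://localhost:3002";
const headers = {
  "Content-Type": "application/json",
  Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
};

async function crawlThenCancel(url: string): Promise<void> {
  // Start a crawl; includeHtml is now nested inside pageOptions (PATCH 16).
  const crawlRes = await fetch(`${BASE_URL}/v0/crawl`, {
    method: "POST",
    headers,
    body: JSON.stringify({ url, pageOptions: { includeHtml: true } }),
  });
  const { jobId } = await crawlRes.json();

  // Cancel it through the new DELETE route; the controller bills the team for
  // any partial docs and moves the Bull job to "failed".
  const cancelRes = await fetch(`${BASE_URL}/v0/crawl/cancel/${jobId}`, {
    method: "DELETE",
    headers,
  });
  console.log(await cancelRes.json()); // { status: "cancelled" } or { status: "Cancelling..." }

  // Documents scraped before cancellation surface as partial_data on the
  // status endpoint, while data stays null (per the PATCH 17 test assertions).
  const statusRes = await fetch(`${BASE_URL}/v0/crawl/status/${jobId}`, { headers });
  const { status, data, partial_data } = await statusRes.json();
  console.log(status, data, partial_data?.length ?? 0);
}
```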