diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..262f9005 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,57 @@ +version: 2 +updates: + # playwright-service + - package-ecosystem: "pip" + directory: "/apps/playwright-service" + schedule: + interval: "weekly" + groups: + prod-deps: + dependency-type: "production" + dev-deps: + dependency-type: "development" + commit-message: + prefix: "apps/playwright-service" + include: "scope" + + # python-sdk + - package-ecosystem: "pip" + directory: "/apps/python-sdk" + schedule: + interval: "weekly" + groups: + prod-deps: + dependency-type: "production" + dev-deps: + dependency-type: "development" + commit-message: + prefix: "apps/python-sdk" + include: "scope" + + # api + - package-ecosystem: "npm" + directory: "/apps/api" + schedule: + interval: "weekly" + groups: + prod-deps: + dependency-type: "production" + dev-deps: + dependency-type: "development" + commit-message: + prefix: "apps/api" + include: "scope" + + # test-suite + - package-ecosystem: "npm" + directory: "/apps/test-suite" + schedule: + interval: "weekly" + groups: + prod-deps: + dependency-type: "production" + dev-deps: + dependency-type: "development" + commit-message: + prefix: "apps/test-suite" + include: "scope" diff --git a/.gitignore b/.gitignore index a070a88f..71ba6dd2 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,7 @@ apps/test-suite/node_modules/ apps/test-suite/.env apps/test-suite/logs -apps/test-suite/load-test-results/test-run-report.json \ No newline at end of file +apps/test-suite/load-test-results/test-run-report.json + +apps/playwright-service-ts/node_modules/ +apps/playwright-service-ts/package-lock.json \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bf779860..f90010bf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,7 +12,7 @@ First, start by installing dependencies 2. pnpm [instructions](https://pnpm.io/installation) 3. redis [instructions](https://redis.io/docs/latest/operate/oss_and_stack/install/install-redis/) -Set environment variables in a .env in the /apps/api/ directoryyou can copy over the template in .env.example. +Set environment variables in a .env in the /apps/api/ directory you can copy over the template in .env.example. To start, we wont set up authentication, or any optional sub services (pdf parsing, JS blocking support, AI features ) diff --git a/SELF_HOST.md b/SELF_HOST.md index 4ef6d4ca..f3a4a4b1 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -1,34 +1,64 @@ -# Self-hosting Firecrawl -*We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version.* +## Self-hosting Firecrawl + +_We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version._ Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally. ## Getting Started -First, clone this repository and copy the example env file from api folder `.env.example` to `.env`. -```bash -git clone https://github.com/mendableai/firecrawl.git -cd firecrawl -cp ./apps/api/.env.example ./.env -``` +First, clone this repository and copy the example env file from the API folder `.env.example` to `.env`. -For running the simplest version of FireCrawl, edit the `USE_DB_AUTHENTICATION` on `.env` to not use the database authentication. 
-```yml -USE_DB_AUTHENTICATION=false -``` - -Update the Redis URL in the .env file to align with the Docker configuration: -```yml -REDIS_URL=redis://redis:6379 -``` - -Once that's complete, you can simply run the following commands to get started: -```bash -docker compose up -``` +### Steps +1. Clone the repository: + + ```bash + git clone https://github.com/mendableai/firecrawl.git + cd firecrawl + cp ./apps/api/.env.example ./.env + ``` + +2. For running the simplest version of FireCrawl, edit the `USE_DB_AUTHENTICATION` in `.env` to not use the database authentication: + + ```plaintext + USE_DB_AUTHENTICATION=false + ``` + +3. Update the Redis URL in the .env file to align with the Docker configuration: + + ```plaintext + REDIS_URL=redis://redis:6379 + ``` + +4. #### Option: Running with TypeScript Playwright Service + + * Update the `docker-compose.yml` file to change the Playwright service: + + ```plaintext + build: apps/playwright-service + ``` + TO + ```plaintext + build: apps/playwright-service-ts + ``` + + * Set the `PLAYWRIGHT_MICROSERVICE_URL` in your `.env` file: + + ```plaintext + PLAYWRIGHT_MICROSERVICE_URL=http://localhost:3000/scrape + ``` + + * Don't forget to set the proxy server in your `.env` file as needed. +5. Build and run the Docker containers: + + ```bash + docker compose build + docker compose up + ``` + This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`. -# Install Firecrawl on a Kubernetes Cluster (Simple Version) -Read the [examples/kubernetes-cluster-install/README.md](examples/kubernetes-cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster. +## Install Firecrawl on a Kubernetes Cluster (Simple Version) + +Read the [examples/kubernetes-cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes-cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster. diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 17b36777..bb271976 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -68,9 +68,21 @@ }, "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" }, + "onlyIncludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" + }, "headers": { "type": "object", "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." + }, + "replaceAllPathsWithAbsolutePaths": { + "type": "boolean", + "description": "Replace all relative paths with absolute paths for images and links", + "default": false } } }, @@ -80,8 +92,8 @@ "properties": { "mode": { "type": "string", - "enum": ["llm-extraction"], - "description": "The extraction mode to use, currently supports 'llm-extraction'" + "enum": ["llm-extraction", "llm-extraction-from-raw-html"], + "description": "The extraction mode to use. llm-extraction: Extracts information from the cleaned and parsed content. llm-extraction-from-raw-html: Extracts information directly from the raw HTML." }, "extractionPrompt": { "type": "string", @@ -184,7 +196,7 @@ }, "maxDepth": { "type": "integer", - "description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on." + "description": "Maximum depth to crawl relative to the entered URL. 
A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern." }, "mode": { "type": "string", @@ -511,7 +523,7 @@ "html": { "type": "string", "nullable": true, - "description": "Raw HTML content of the page if `includeHtml` is true" + "description": "Raw HTML content of the page if `includeHtml` is true" }, "metadata": { "type": "object", @@ -526,118 +538,13 @@ "type": "string", "nullable": true }, - "keywords": { - "type": "string", - "nullable": true - }, - "robots": { - "type": "string", - "nullable": true - }, - "ogTitle": { - "type": "string", - "nullable": true - }, - "ogDescription": { - "type": "string", - "nullable": true - }, - "ogUrl": { - "type": "string", - "format": "uri", - "nullable": true - }, - "ogImage": { - "type": "string", - "nullable": true - }, - "ogAudio": { - "type": "string", - "nullable": true - }, - "ogDeterminer": { - "type": "string", - "nullable": true - }, - "ogLocale": { - "type": "string", - "nullable": true - }, - "ogLocaleAlternate": { - "type": "array", - "items": { - "type": "string" - }, - "nullable": true - }, - "ogSiteName": { - "type": "string", - "nullable": true - }, - "ogVideo": { - "type": "string", - "nullable": true - }, - "dctermsCreated": { - "type": "string", - "nullable": true - }, - "dcDateCreated": { - "type": "string", - "nullable": true - }, - "dcDate": { - "type": "string", - "nullable": true - }, - "dctermsType": { - "type": "string", - "nullable": true - }, - "dcType": { - "type": "string", - "nullable": true - }, - "dctermsAudience": { - "type": "string", - "nullable": true - }, - "dctermsSubject": { - "type": "string", - "nullable": true - }, - "dcSubject": { - "type": "string", - "nullable": true - }, - "dcDescription": { - "type": "string", - "nullable": true - }, - "dctermsKeywords": { - "type": "string", - "nullable": true - }, - "modifiedTime": { - "type": "string", - "nullable": true - }, - "publishedTime": { - "type": "string", - "nullable": true - }, - "articleTag": { - "type": "string", - "nullable": true - }, - "articleSection": { - "type": "string", - "nullable": true - }, "sourceURL": { "type": "string", "format": "uri" }, + " ": { + "type": "string" + }, "pageStatusCode": { "type": "integer", "description": "The status code of the page" @@ -647,6 +554,7 @@ "nullable": true, "description": "The error message of the page" } + } }, "llm_extraction": { @@ -694,118 +602,13 @@ "type": "string", "nullable": true }, - "keywords": { - "type": "string", - "nullable": true - }, - "robots": { - "type": "string", - "nullable": true - }, - "ogTitle": { - "type": "string", - "nullable": true - }, - "ogDescription": { - "type": "string", - "nullable": true - }, - "ogUrl": { - "type": "string", - "format": "uri", - "nullable": true - }, - "ogImage": { - "type": "string", - "nullable": true - }, - "ogAudio": { - "type": "string", - "nullable": true - }, - "ogDeterminer": { - "type": "string", - "nullable": true - }, - "ogLocale": { - "type": "string", - "nullable": true - }, - "ogLocaleAlternate": { - "type": "array", - "items": { - "type": "string" - }, - "nullable": true - }, - "ogSiteName": { - "type": "string", - "nullable": true - }, - "ogVideo": { - "type": "string", - "nullable": true - }, - "dctermsCreated": { - "type": "string", - "nullable": true - }, - "dcDateCreated": { - "type": "string", - "nullable": true - }, - "dcDate": { - "type": "string", - 
"nullable": true - }, - "dctermsType": { - "type": "string", - "nullable": true - }, - "dcType": { - "type": "string", - "nullable": true - }, - "dctermsAudience": { - "type": "string", - "nullable": true - }, - "dctermsSubject": { - "type": "string", - "nullable": true - }, - "dcSubject": { - "type": "string", - "nullable": true - }, - "dcDescription": { - "type": "string", - "nullable": true - }, - "dctermsKeywords": { - "type": "string", - "nullable": true - }, - "modifiedTime": { - "type": "string", - "nullable": true - }, - "publishedTime": { - "type": "string", - "nullable": true - }, - "articleTag": { - "type": "string", - "nullable": true - }, - "articleSection": { - "type": "string", - "nullable": true - }, "sourceURL": { "type": "string", "format": "uri" }, + " ": { + "type": "string" + }, "pageStatusCode": { "type": "integer", "description": "The status code of the page" @@ -878,4 +681,4 @@ "bearerAuth": [] } ] -} +} \ No newline at end of file diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 32a92745..ff485f28 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -6801,7 +6801,7 @@ packages: handlebars: 4.7.8 openai: 3.3.0 sbd: 1.0.19 - typescript: 5.4.5 + typescript: 5.5.3 uuid: 9.0.1 zod: 3.23.8 transitivePeerDependencies: @@ -7767,6 +7767,12 @@ packages: engines: {node: '>=14.17'} hasBin: true + /typescript@5.5.3: + resolution: {integrity: sha512-/hreyEujaB0w76zKo6717l3L0o/qEUtRgdvUBvlkhoWeOVMjMuHNHk0BRBzikzuGDqNmPQbg5ifMEqsHLiIUcQ==} + engines: {node: '>=14.17'} + hasBin: true + dev: false + /typesense@1.8.2(@babel/runtime@7.24.6): resolution: {integrity: sha512-aBpePjA99Qvo+OP2pJwMpvga4Jrm1Y2oV5NsrWXBxlqUDNEUCPZBIksPv2Hq0jxQxHhLLyJVbjXjByXsvpCDVA==} engines: {node: '>=18'} diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts index c8281edd..3e324d39 100644 --- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts @@ -131,6 +131,28 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.metadata.pageStatusCode).toBe(200); expect(response.body.data.metadata.pageError).toBeUndefined(); }, 30000); // 30 seconds timeout + + it.concurrent("should return a successful response with a valid API key and includeRawHtml set to true", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://roastmywebsite.ai", + pageOptions: { includeRawHtml: true }, + }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain("_Roast_"); + expect(response.body.data.markdown).toContain("_Roast_"); + expect(response.body.data.rawHtml).toContain(" { const response = await request(TEST_URL) @@ -804,6 +826,46 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); }, 180000); + it.concurrent("should crawl external content links when allowed", async () => { + const crawlInitResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + 
.set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + crawlerOptions: { + allowExternalContentLinks: true, + ignoreSitemap: true, + returnOnlyUrls: true, + limit: 50 + } + }); + + expect(crawlInitResponse.statusCode).toBe(200); + expect(crawlInitResponse.body).toHaveProperty("jobId"); + + let crawlStatus: string; + let crawlData = []; + while (crawlStatus !== "completed") { + const statusResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlInitResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + crawlStatus = statusResponse.body.status; + if (statusResponse.body.data) { + crawlData = statusResponse.body.data; + } + if (crawlStatus !== "completed") { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + console.log(crawlData) + expect(crawlData.length).toBeGreaterThan(0); + expect(crawlData).toEqual(expect.arrayContaining([ + expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }), + expect.objectContaining({ url: expect.stringContaining("https://mendable.ai/pricing") }), + expect.objectContaining({ url: expect.stringContaining("https://x.com/CalebPeffer") }) + ])); + }, 180000); // 3 minutes timeout }); describe("POST /v0/crawlWebsitePreview", () => { @@ -1177,6 +1239,47 @@ describe("E2E Tests for API Routes", () => { expect(llmExtraction.is_open_source).toBe(false); expect(typeof llmExtraction.is_open_source).toBe("boolean"); }, 60000); // 60 secs + + it.concurrent("should extract data using LLM extraction mode with RawHtml", async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + + extractorOptions: { + mode: "llm-extraction-from-raw-html", + extractionPrompt: + "Based on the information on the page, what are the primary and secondary CTA buttons?", + extractionSchema: { + type: "object", + properties: { + primary_cta: { + type: "string", + }, + secondary_cta: { + type: "string", + }, + }, + required: ["primary_cta", "secondary_cta"], + }, + }, + }); + + // Ensure that the job was successfully created before proceeding with LLM extraction + expect(response.statusCode).toBe(200); + + // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` + let llmExtraction = response.body.data.llm_extraction; + + // Check if the llm_extraction object has the required properties with correct types and values + expect(llmExtraction).toHaveProperty("primary_cta"); + expect(typeof llmExtraction.primary_cta).toBe("string"); + expect(llmExtraction).toHaveProperty("secondary_cta"); + expect(typeof llmExtraction.secondary_cta).toBe("string"); + + }, 60000); // 60 secs }); // describe("POST /v0/scrape for Top 100 Companies", () => { diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index d2977cea..b5bc54a5 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -546,46 +546,51 @@ describe("E2E Tests for API Routes", () => { expect(childrenLinks.length).toBe(completedResponse.body.data.length); }, 180000); // 120 seconds - it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => { - const crawlResponse = await 
request(TEST_URL) - .post('/v0/crawl') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }}); - expect(crawlResponse.statusCode).toBe(200); + // TODO: review the test below + // it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => { + // const crawlResponse = await request(TEST_URL) + // .post('/v0/crawl') + // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + // .set('Content-Type', 'application/json') + // .send({ url: 'https://arxiv.org/list/astro-ph/1993-01', + // crawlerOptions: { + // limit: 10, + // returnOnlyUrls: true + // }}); + // expect(crawlResponse.statusCode).toBe(200); - let isCompleted = false; - let completedResponse; + // let isCompleted = false; + // let completedResponse; - while (!isCompleted) { - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('status'); + // while (!isCompleted) { + // const response = await request(TEST_URL) + // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty('status'); - if (response.body.status === 'completed') { - isCompleted = true; - completedResponse = response; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again - } - } - expect(completedResponse.body.status).toBe('completed'); - expect(completedResponse.body).toHaveProperty('data'); - expect(completedResponse.body.data.length).toEqual(1); - expect(completedResponse.body.data).toEqual( - expect.arrayContaining([ - expect.objectContaining({ - content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.') - }) - ]) - ); + // if (response.body.status === 'completed') { + // isCompleted = true; + // completedResponse = response; + // } else { + // await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + // } + // } + // expect(completedResponse.body.status).toBe('completed'); + // expect(completedResponse.body).toHaveProperty('data'); + // expect(completedResponse.body.data.length).toEqual(1); + // expect(completedResponse.body.data).toEqual( + // expect.arrayContaining([ + // expect.objectContaining({ + // content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.') + // }) + // ]) + // ); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 180000); // 120 seconds + // expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + // expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + // expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + // }, 180000); // 120 seconds it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { const crawlResponse = await 
request(TEST_URL) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index d394efe8..f5e2c322 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -58,19 +58,27 @@ export async function scrapeHelper( } // make sure doc.content is not empty - const filteredDocs = docs.filter( + let filteredDocs = docs.filter( (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 ); if (filteredDocs.length === 0) { return { success: true, error: "No page found", returnCode: 200, data: docs[0] }; } + + // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html + if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") { + filteredDocs.forEach(doc => { + delete doc.rawHtml; + }); + } + let creditsToBeBilled = filteredDocs.length; const creditsPerLLMExtract = 50; - if (extractorOptions.mode === "llm-extraction") { + if (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "llm-extraction-from-raw-html" || extractorOptions.mode === "llm-extraction-from-markdown") { creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length); } diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index 6614dbdf..2156fb3c 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -8,7 +8,8 @@ import { Document, ExtractorOptions } from "../entities"; // Generate completion using OpenAI export async function generateCompletions( documents: Document[], - extractionOptions: ExtractorOptions + extractionOptions: ExtractorOptions, + mode: "markdown" | "raw-html" ): Promise { // const schema = zodToJsonSchema(options.schema) @@ -28,6 +29,7 @@ export async function generateCompletions( document: document, schema: schema, prompt: prompt, + mode: mode, }); // Validate the JSON output against the schema using AJV const validate = ajv.compile(schema); diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index 1434e35e..8de8ee4b 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -13,26 +13,37 @@ const defaultPrompt = "You are a professional web scraper. Extract the contents of the webpage"; function prepareOpenAIDoc( - document: Document + document: Document, + mode: "markdown" | "raw-html" ): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] { + let markdown = document.markdown; -// Check if the markdown content exists in the document - if (!markdown) { + let extractionTarget = document.markdown; + + if (mode === "raw-html") { + extractionTarget = document.rawHtml; + } + + // Check if the markdown content exists in the document + if (!extractionTarget) { throw new Error( - "Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai" + `${mode} content is missing in the document. This is likely due to an error in the scraping process. 
Please try again or reach out to help@mendable.ai` ); } + + + // count number of tokens - const numTokens = numTokensFromString(document.markdown, "gpt-4"); + const numTokens = numTokensFromString(extractionTarget, "gpt-4"); if (numTokens > maxTokens) { // trim the document to the maximum number of tokens, tokens != characters - markdown = markdown.slice(0, (maxTokens * modifier)); + extractionTarget = extractionTarget.slice(0, (maxTokens * modifier)); } - return [[{ type: "text", text: markdown }], numTokens]; + return [[{ type: "text", text: extractionTarget }], numTokens]; } export async function generateOpenAICompletions({ @@ -42,6 +53,7 @@ export async function generateOpenAICompletions({ schema, //TODO - add zod dynamic type checking prompt = defaultPrompt, temperature, + mode }: { client: OpenAI; model?: string; @@ -49,9 +61,10 @@ export async function generateOpenAICompletions({ schema: any; // This should be replaced with a proper Zod schema type when available prompt?: string; temperature?: number; + mode: "markdown" | "raw-html"; }): Promise { const openai = client as OpenAI; - const [content, numTokens] = prepareOpenAIDoc(document); + const [content, numTokens] = prepareOpenAIDoc(document, mode); const completion = await openai.chat.completions.create({ model, diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 12d8c366..3cd59b6c 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -13,6 +13,7 @@ export interface Progress { export type PageOptions = { onlyMainContent?: boolean; includeHtml?: boolean; + includeRawHtml?: boolean; fallback?: boolean; fetchPageContent?: boolean; waitFor?: number; @@ -21,10 +22,11 @@ export type PageOptions = { replaceAllPathsWithAbsolutePaths?: boolean; parsePDF?: boolean; removeTags?: string | string[]; + onlyIncludeTags?: string | string[]; }; export type ExtractorOptions = { - mode: "markdown" | "llm-extraction"; + mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html"; extractionPrompt?: string; extractionSchema?: Record; } @@ -50,6 +52,7 @@ export type CrawlerOptions = { ignoreSitemap?: boolean; mode?: "default" | "fast"; // have a mode of some sort allowBackwardCrawling?: boolean; + allowExternalContentLinks?: boolean; } export type WebScraperOptions = { @@ -72,6 +75,7 @@ export class Document { content: string; markdown?: string; html?: string; + rawHtml?: string; llm_extraction?: Record; createdAt?: Date; updatedAt?: Date; diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts index 6d383708..32c8b0a0 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts @@ -188,5 +188,38 @@ describe('WebCrawler', () => { // Check that the backward link is included if allowBackwardCrawling is true expect(results.some(r => r.url === 'https://mendable.ai')).toBe(true); }); + + it('should respect the limit parameter by not returning more links than specified', async () => { + const initialUrl = 'http://example.com'; + const limit = 2; // Set a limit for the number of links + + crawler = new WebCrawler({ + initialUrl: initialUrl, + includes: [], + excludes: [], + limit: limit, // Apply the limit + maxCrawledDepth: 10 + }); + + // Mock sitemap fetching function to return more links than the limit + crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ + initialUrl, + initialUrl + '/page1', + 
initialUrl + '/page2', + initialUrl + '/page3' + ]); + + const filteredLinks = crawler['filterLinks']( + [initialUrl, initialUrl + '/page1', initialUrl + '/page2', initialUrl + '/page3'], + limit, + 10 + ); + + expect(filteredLinks.length).toBe(limit); // Check if the number of results respects the limit + expect(filteredLinks).toEqual([ + initialUrl, + initialUrl + '/page1' + ]); + }); }); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 5003845e..99fff9e4 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -4,7 +4,7 @@ import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; import async from "async"; import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities"; -import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; +import { scrapSingleUrl } from "./single_url"; import robotsParser from "robots-parser"; import { getURLDepth } from "./utils/maxDepthUtils"; import { axiosTimeout } from "../../../src/lib/timeout"; @@ -23,6 +23,7 @@ export class WebCrawler { private robots: any; private generateImgAltText: boolean; private allowBackwardCrawling: boolean; + private allowExternalContentLinks: boolean; constructor({ initialUrl, @@ -32,7 +33,8 @@ export class WebCrawler { limit = 10000, generateImgAltText = false, maxCrawledDepth = 10, - allowBackwardCrawling = false + allowBackwardCrawling = false, + allowExternalContentLinks = false }: { initialUrl: string; includes?: string[]; @@ -42,6 +44,7 @@ export class WebCrawler { generateImgAltText?: boolean; maxCrawledDepth?: number; allowBackwardCrawling?: boolean; + allowExternalContentLinks?: boolean; }) { this.initialUrl = initialUrl; this.baseUrl = new URL(initialUrl).origin; @@ -55,6 +58,7 @@ export class WebCrawler { this.maxCrawledDepth = maxCrawledDepth ?? 10; this.generateImgAltText = generateImgAltText ?? false; this.allowBackwardCrawling = allowBackwardCrawling ?? false; + this.allowExternalContentLinks = allowExternalContentLinks ?? 
false; } private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { @@ -98,9 +102,10 @@ export class WebCrawler { const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); // Ensure the protocol and hostname match, and the path starts with the initial URL's path - if (linkHostname !== initialHostname) { - return false; - } + // commented to able to handling external link on allowExternalContentLinks + // if (linkHostname !== initialHostname) { + // return false; + // } if (!this.allowBackwardCrawling) { if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { @@ -278,15 +283,24 @@ export class WebCrawler { const path = urlObj.pathname; - if ( - this.isInternalLink(fullUrl) && - this.noSections(fullUrl) && - // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards - // this.matchesIncludes(path) && - !this.matchesExcludes(path) && - this.isRobotsAllowed(fullUrl) - ) { - links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); + if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS + if (this.isInternalLink(fullUrl) && + this.noSections(fullUrl) && + !this.matchesExcludes(path) && + this.isRobotsAllowed(fullUrl) + ) { + links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); + } + } else { // EXTERNAL LINKS + if ( + this.isInternalLink(url) && + this.allowExternalContentLinks && + !this.isSocialMediaOrEmail(fullUrl) && + !this.matchesExcludes(fullUrl, true) && + !this.isExternalMainPage(fullUrl) + ) { + links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); + } } } }); @@ -320,9 +334,41 @@ export class WebCrawler { return this.includes.some((pattern) => new RegExp(pattern).test(url)); } - private matchesExcludes(url: string): boolean { - if (this.excludes.length === 0 || this.excludes[0] == "") return false; - return this.excludes.some((pattern) => new RegExp(pattern).test(url)); + private matchesExcludes(url: string, onlyDomains: boolean = false): boolean { + return this.excludes.some((pattern) => { + if (onlyDomains) + return this.matchesExcludesExternalDomains(url); + + return this.excludes.some((pattern) => new RegExp(pattern).test(url)); + }); + } + + // supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com" + private matchesExcludesExternalDomains(url: string) { + try { + const urlObj = new URL(url); + const hostname = urlObj.hostname; + const pathname = urlObj.pathname; + + for (let domain of this.excludes) { + let domainObj = new URL('http://' + domain.replace(/^https?:\/\//, '')); + let domainHostname = domainObj.hostname; + let domainPathname = domainObj.pathname; + + if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) { + if (pathname.startsWith(domainPathname)) { + return true; + } + } + } + return false; + } catch (e) { + return false; + } + } + + private isExternalMainPage(url:string):boolean { + return !Boolean(url.split("/").slice(3).filter(subArray => subArray.length > 0).length) } private noSections(link: string): boolean { @@ -375,6 +421,10 @@ export class WebCrawler { "instagram.com", "pinterest.com", "mailto:", + "github.com", + "calendly.com", + "discord.gg", + "discord.com", ]; return socialMediaOrEmail.some((ext) => url.includes(ext)); } diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts index d78d8158..f8b2503e 100644 --- 
a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts +++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts @@ -29,22 +29,23 @@ export async function handleCustomScraping( }; } - // Check for Google Drive PDF links in the raw HTML - const googleDrivePdfPattern = - /https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/; - const googleDrivePdfLink = url.match(googleDrivePdfPattern); - if (googleDrivePdfLink) { - console.log( - `Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}` - ); + // Check for Google Drive PDF links in meta tags + const googleDriveMetaPattern = / { diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts new file mode 100644 index 00000000..4c31438c --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts @@ -0,0 +1,77 @@ +import axios from "axios"; +import { logScrape } from "../../../services/logging/scrape_log"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; + +/** + * Scrapes a URL with Axios + * @param url The URL to scrape + * @param pageOptions The options for the page + * @returns The scraped content + */ +export async function scrapWithFetch( + url: string, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } +): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: "fetch", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + + try { + const response = await axios.get(url, { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout, + transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically + }); + + if (response.status !== 200) { + console.error( + `[Axios] Error fetching url: ${url} with status: ${response.status}` + ); + logParams.error_message = response.statusText; + logParams.response_code = response.status; + return { + content: "", + pageStatusCode: response.status, + pageError: response.statusText, + }; + } + + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); + logParams.response_code = pageStatusCode; + logParams.error_message = pageError; + return { content, pageStatusCode, pageError }; + } else { + const text = response.data; + logParams.success = true; + logParams.html = text; + logParams.response_code = response.status; + return { content: text, pageStatusCode: response.status, pageError: null }; + } + } catch (error) { + if (error.code === "ECONNABORTED") { + logParams.error_message = "Request timed out"; + console.log(`[Axios] Request timed out for ${url}`); + } else { + logParams.error_message = error.message || error; + console.error(`[Axios] Error fetching url: ${url} -> ${error}`); + } + return { content: "", pageStatusCode: null, pageError: logParams.error_message }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); + } +} diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts new file mode 100644 index 00000000..2e971139 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -0,0 
+1,124 @@ +import axios from "axios"; +import { FireEngineResponse } from "../../../lib/entities"; +import { logScrape } from "../../../services/logging/scrape_log"; +import { generateRequestParams } from "../single_url"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; + +/** + * Scrapes a URL with Fire-Engine + * @param url The URL to scrape + * @param waitFor The time to wait for the page to load + * @param screenshot Whether to take a screenshot + * @param pageOptions The options for the page + * @param headers The headers to send with the request + * @param options The options for the request + * @returns The scraped content + */ +export async function scrapWithFireEngine({ + url, + waitFor = 0, + screenshot = false, + pageOptions = { parsePDF: true }, + headers, + options, +}: { + url: string; + waitFor?: number; + screenshot?: boolean; + pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; + headers?: Record; + options?: any; +}): Promise { + const logParams = { + url, + scraper: "fire-engine", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + + try { + const reqParams = await generateRequestParams(url); + const waitParam = reqParams["params"]?.wait ?? waitFor; + const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; + console.log( + `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}` + ); + + const response = await axios.post( + process.env.FIRE_ENGINE_BETA_URL + "/scrape", + { + url: url, + wait: waitParam, + screenshot: screenshotParam, + headers: headers, + pageOptions: pageOptions, + }, + { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout + waitParam, + } + ); + + if (response.status !== 200) { + console.error( + `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` + ); + logParams.error_message = response.data?.pageError; + logParams.response_code = response.data?.pageStatusCode; + return { + html: "", + screenshot: "", + pageStatusCode: response.data?.pageStatusCode, + pageError: response.data?.pageError, + }; + } + + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf( + url, + pageOptions?.parsePDF + ); + logParams.success = true; + logParams.response_code = pageStatusCode; + logParams.error_message = pageError; + return { html: content, screenshot: "", pageStatusCode, pageError }; + } else { + const data = response.data; + logParams.success = + (data.pageStatusCode >= 200 && data.pageStatusCode < 300) || + data.pageStatusCode === 404; + logParams.html = data.content ?? ""; + logParams.response_code = data.pageStatusCode; + logParams.error_message = data.pageError; + return { + html: data.content ?? "", + screenshot: data.screenshot ?? 
"", + pageStatusCode: data.pageStatusCode, + pageError: data.pageError, + }; + } + } catch (error) { + if (error.code === "ECONNABORTED") { + console.log(`[Fire-Engine] Request timed out for ${url}`); + logParams.error_message = "Request timed out"; + } else { + console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + logParams.error_message = error.message || error; + } + return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams, pageOptions); + } +} + + diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts new file mode 100644 index 00000000..11c3c5ad --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts @@ -0,0 +1,109 @@ +import axios from "axios"; +import { logScrape } from "../../../services/logging/scrape_log"; +import { generateRequestParams } from "../single_url"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; + +/** + * Scrapes a URL with Playwright + * @param url The URL to scrape + * @param waitFor The time to wait for the page to load + * @param headers The headers to send with the request + * @param pageOptions The options for the page + * @returns The scraped content + */ +export async function scrapWithPlaywright( + url: string, + waitFor: number = 0, + headers?: Record, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } +): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: "playwright", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + + try { + const reqParams = await generateRequestParams(url); + // If the user has passed a wait parameter in the request, use that + const waitParam = reqParams["params"]?.wait ?? waitFor; + + const response = await axios.post( + process.env.PLAYWRIGHT_MICROSERVICE_URL, + { + url: url, + wait_after_load: waitParam, + headers: headers, + }, + { + headers: { + "Content-Type": "application/json", + }, + timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time + transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically + } + ); + + if (response.status !== 200) { + console.error( + `[Playwright] Error fetching url: ${url} with status: ${response.status}` + ); + logParams.error_message = response.data?.pageError; + logParams.response_code = response.data?.pageStatusCode; + return { + content: "", + pageStatusCode: response.data?.pageStatusCode, + pageError: response.data?.pageError, + }; + } + + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); + logParams.response_code = pageStatusCode; + logParams.error_message = pageError; + return { content, pageStatusCode, pageError }; + } else { + const textData = response.data; + try { + const data = JSON.parse(textData); + const html = data.content; + logParams.success = true; + logParams.html = html; + logParams.response_code = data.pageStatusCode; + logParams.error_message = data.pageError; + return { + content: html ?? 
"", + pageStatusCode: data.pageStatusCode, + pageError: data.pageError, + }; + } catch (jsonError) { + logParams.error_message = jsonError.message || jsonError; + console.error( + `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` + ); + return { content: "", pageStatusCode: null, pageError: logParams.error_message }; + } + } + } catch (error) { + if (error.code === "ECONNABORTED") { + logParams.error_message = "Request timed out"; + console.log(`[Playwright] Request timed out for ${url}`); + } else { + logParams.error_message = error.message || error; + console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); + } + return { content: "", pageStatusCode: null, pageError: logParams.error_message }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); + } +} diff --git a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts new file mode 100644 index 00000000..9a1f0b35 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts @@ -0,0 +1,88 @@ +import { logScrape } from "../../../services/logging/scrape_log"; +import { generateRequestParams } from "../single_url"; +import { fetchAndProcessPdf } from "../utils/pdfProcessor"; +import { universalTimeout } from "../global"; +import { ScrapingBeeClient } from "scrapingbee"; + +/** + * Scrapes a URL with ScrapingBee + * @param url The URL to scrape + * @param wait_browser The browser event to wait for + * @param timeout The timeout for the scrape + * @param pageOptions The options for the page + * @returns The scraped content + */ +export async function scrapWithScrapingBee( + url: string, + wait_browser: string = "domcontentloaded", + timeout: number = universalTimeout, + pageOptions: { parsePDF?: boolean } = { parsePDF: true } + ): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { + const logParams = { + url, + scraper: wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee", + success: false, + response_code: null, + time_taken_seconds: null, + error_message: null, + html: "", + startTime: Date.now(), + }; + try { + const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); + const clientParams = await generateRequestParams( + url, + wait_browser, + timeout + ); + const response = await client.get({ + ...clientParams, + params: { + ...clientParams.params, + transparent_status_code: "True", + }, + }); + const contentType = response.headers["content-type"]; + if (contentType && contentType.includes("application/pdf")) { + logParams.success = true; + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); + logParams.response_code = pageStatusCode; + logParams.error_message = pageError; + return { content, pageStatusCode, pageError }; + } else { + let text = ""; + try { + const decoder = new TextDecoder(); + text = decoder.decode(response.data); + logParams.success = true; + } catch (decodeError) { + console.error( + `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}` + ); + logParams.error_message = decodeError.message || decodeError; + } + logParams.response_code = response.status; + logParams.html = text; + logParams.success = response.status >= 200 && response.status < 300 || response.status === 404; + logParams.error_message = response.statusText !== "OK" ? 
response.statusText : undefined; + return { + content: text, + pageStatusCode: response.status, + pageError: response.statusText !== "OK" ? response.statusText : undefined, + }; + } + } catch (error) { + console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); + logParams.error_message = error.message || error; + logParams.response_code = error.response?.status; + return { + content: "", + pageStatusCode: error.response?.status, + pageError: error.response?.statusText, + }; + } finally { + const endTime = Date.now(); + logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; + await logScrape(logParams); + } + } \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 1a59b27b..d24e5c2e 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -1,14 +1,21 @@ import * as cheerio from "cheerio"; -import { ScrapingBeeClient } from "scrapingbee"; import { extractMetadata } from "./utils/metadata"; import dotenv from "dotenv"; -import { Document, PageOptions, FireEngineResponse } from "../../lib/entities"; +import { + Document, + PageOptions, + FireEngineResponse, + ExtractorOptions, +} from "../../lib/entities"; import { parseMarkdown } from "../../lib/html-to-markdown"; import { urlSpecificParams } from "./utils/custom/website_params"; import { fetchAndProcessPdf } from "./utils/pdfProcessor"; import { handleCustomScraping } from "./custom/handleCustomScraping"; import { removeUnwantedElements } from "./utils/removeUnwantedElements"; -import axios from "axios"; +import { scrapWithFetch } from "./scrapers/fetch"; +import { scrapWithFireEngine } from "./scrapers/fireEngine"; +import { scrapWithPlaywright } from "./scrapers/playwright"; +import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; dotenv.config(); @@ -20,8 +27,6 @@ const baseScrapers = [ "fetch", ] as const; -const universalTimeout = 15000; - export async function generateRequestParams( url: string, wait_browser: string = "domcontentloaded", @@ -45,197 +50,6 @@ export async function generateRequestParams( return defaultParams; } } -export async function scrapWithFireEngine( - url: string, - waitFor: number = 0, - screenshot: boolean = false, - pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true }, - headers?: Record, - options?: any -): Promise { - try { - const reqParams = await generateRequestParams(url); - // If the user has passed a wait parameter in the request, use that - const waitParam = reqParams["params"]?.wait ?? waitFor; - const screenshotParam = reqParams["params"]?.screenshot ?? 
screenshot; - console.log( - `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}` - ); - - const response = await axios.post( - process.env.FIRE_ENGINE_BETA_URL + "/scrape", - { - url: url, - wait: waitParam, - screenshot: screenshotParam, - headers: headers, - pageOptions: pageOptions, - }, - { - headers: { - "Content-Type": "application/json", - }, - timeout: universalTimeout + waitParam - } - ); - - if (response.status !== 200) { - console.error( - `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` - ); - return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError }; - } - - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); - return { html: content, screenshot: "", pageStatusCode, pageError }; - } else { - const data = response.data; - const html = data.content; - const screenshot = data.screenshot; - return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError }; - } - } catch (error) { - if (error.code === 'ECONNABORTED') { - console.log(`[Fire-Engine] Request timed out for ${url}`); - } else { - console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); - } - return { html: "", screenshot: "" }; - } -} - -export async function scrapWithScrapingBee( - url: string, - wait_browser: string = "domcontentloaded", - timeout: number = universalTimeout, - pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { - try { - const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); - const clientParams = await generateRequestParams( - url, - wait_browser, - timeout, - ); - const response = await client.get({ - ...clientParams, - params: { - ...clientParams.params, - 'transparent_status_code': 'True' - } - }); - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - return await fetchAndProcessPdf(url, pageOptions?.parsePDF); - - } else { - let text = ""; - try { - const decoder = new TextDecoder(); - text = decoder.decode(response.data); - } catch (decodeError) { - console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`); - } - return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }; - } - } catch (error) { - console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); - return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText }; - } -} - -export async function scrapWithPlaywright( - url: string, - waitFor: number = 0, - headers?: Record, - pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { - try { - const reqParams = await generateRequestParams(url); - // If the user has passed a wait parameter in the request, use that - const waitParam = reqParams["params"]?.wait ?? 
waitFor; - - const response = await axios.post(process.env.PLAYWRIGHT_MICROSERVICE_URL, { - url: url, - wait_after_load: waitParam, - headers: headers, - }, { - headers: { - "Content-Type": "application/json", - }, - timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time - transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically - }); - - if (response.status !== 200) { - console.error( - `[Playwright] Error fetching url: ${url} with status: ${response.status}` - ); - return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError }; - } - - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - return await fetchAndProcessPdf(url, pageOptions?.parsePDF); - } else { - const textData = response.data; - try { - const data = JSON.parse(textData); - const html = data.content; - return { content: html ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError }; - } catch (jsonError) { - console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`); - return { content: "" }; - } - } - } catch (error) { - if (error.code === 'ECONNABORTED') { - console.log(`[Playwright] Request timed out for ${url}`); - } else { - console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); - } - return { content: "" }; - } -} - -export async function scrapWithFetch( - url: string, - pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { - try { - const response = await axios.get(url, { - headers: { - "Content-Type": "application/json", - }, - timeout: universalTimeout, - transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically - }); - - if (response.status !== 200) { - console.error( - `[Axios] Error fetching url: ${url} with status: ${response.status}` - ); - return { content: "", pageStatusCode: response.status, pageError: response.statusText }; - } - - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - return await fetchAndProcessPdf(url, pageOptions?.parsePDF); - } else { - const text = response.data; - return { content: text, pageStatusCode: 200 }; - } - } catch (error) { - if (error.code === 'ECONNABORTED') { - console.log(`[Axios] Request timed out for ${url}`); - } else { - console.error(`[Axios] Error fetching url: ${url} -> ${error}`); - } - return { content: "" }; - } -} /** * Get the order of scrapers to be used for scraping a URL @@ -295,17 +109,18 @@ function getScrapingFallbackOrder( return scrapersInOrder as (typeof baseScrapers)[number][]; } - - - export async function scrapSingleUrl( urlToScrap: string, pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, + includeRawHtml: false, waitFor: 0, screenshot: false, - headers: undefined + headers: undefined, + }, + extractorOptions: ExtractorOptions = { + mode: "llm-extraction-from-markdown", }, existingHtml: string = "" ): Promise { @@ -314,19 +129,24 @@ export async function scrapSingleUrl( const attemptScraping = async ( url: string, method: (typeof baseScrapers)[number] - ) => { - let scraperResponse: { text: string, screenshot: string, metadata: { pageStatusCode?: number, pageError?: string | null } } = { text: "", screenshot: "", metadata: {} }; + ) => { + let scraperResponse: { + text: string; + screenshot: string; + metadata: { 
pageStatusCode?: number; pageError?: string | null }; + } = { text: "", screenshot: "", metadata: {} }; let screenshot = ""; switch (method) { case "fire-engine": if (process.env.FIRE_ENGINE_BETA_URL) { console.log(`Scraping ${url} with Fire Engine`); - const response = await scrapWithFireEngine( + const response = await scrapWithFireEngine({ url, - pageOptions.waitFor, - pageOptions.screenshot, - pageOptions.headers - ); + waitFor: pageOptions.waitFor, + screenshot: pageOptions.screenshot, + pageOptions: pageOptions, + headers: pageOptions.headers, + }); scraperResponse.text = response.html; scraperResponse.screenshot = response.screenshot; scraperResponse.metadata.pageStatusCode = response.pageStatusCode; @@ -347,7 +167,11 @@ export async function scrapSingleUrl( break; case "playwright": if (process.env.PLAYWRIGHT_MICROSERVICE_URL) { - const response = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers); + const response = await scrapWithPlaywright( + url, + pageOptions.waitFor, + pageOptions.headers + ); scraperResponse.text = response.content; scraperResponse.metadata.pageStatusCode = response.pageStatusCode; scraperResponse.metadata.pageError = response.pageError; @@ -369,22 +193,39 @@ export async function scrapSingleUrl( break; } - let customScrapedContent : FireEngineResponse | null = null; + let customScrapedContent: FireEngineResponse | null = null; // Check for custom scraping conditions - const customScraperResult = await handleCustomScraping(scraperResponse.text, url); + const customScraperResult = await handleCustomScraping( + scraperResponse.text, + url + ); - if (customScraperResult){ + if (customScraperResult) { switch (customScraperResult.scraper) { case "fire-engine": - customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.waitAfterLoad, false, customScraperResult.pageOptions) + customScrapedContent = await scrapWithFireEngine({ + url: customScraperResult.url, + waitFor: customScraperResult.waitAfterLoad, + screenshot: false, + pageOptions: customScraperResult.pageOptions, + }); if (screenshot) { customScrapedContent.screenshot = screenshot; } break; case "pdf": - const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF); - customScrapedContent = { html: content, screenshot, pageStatusCode, pageError } + const { content, pageStatusCode, pageError } = + await fetchAndProcessPdf( + customScraperResult.url, + pageOptions?.parsePDF + ); + customScrapedContent = { + html: content, + screenshot, + pageStatusCode, + pageError, + }; break; } } @@ -402,11 +243,18 @@ export async function scrapSingleUrl( rawHtml: scraperResponse.text, screenshot: scraperResponse.screenshot, pageStatusCode: scraperResponse.metadata.pageStatusCode, - pageError: scraperResponse.metadata.pageError || undefined + pageError: scraperResponse.metadata.pageError || undefined, }; }; - let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = { text: "", html: "", rawHtml: "", screenshot: "", pageStatusCode: 200, pageError: undefined }; + let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = { + text: "", + html: "", + rawHtml: "", + screenshot: "", + pageStatusCode: 200, + pageError: undefined, + }; try { let urlKey = urlToScrap; try { @@ -432,18 +280,20 @@ export async function scrapSingleUrl( } const attempt = await attemptScraping(urlToScrap, scraper); - text = attempt.text ?? ''; - html = attempt.html ?? ''; - rawHtml = attempt.rawHtml ?? 
''; - screenshot = attempt.screenshot ?? ''; + text = attempt.text ?? ""; + html = attempt.html ?? ""; + rawHtml = attempt.rawHtml ?? ""; + screenshot = attempt.screenshot ?? ""; + if (attempt.pageStatusCode) { pageStatusCode = attempt.pageStatusCode; } - if (attempt.pageError) { + if (attempt.pageError && attempt.pageStatusCode >= 400) { pageError = attempt.pageError; + } else if (attempt && attempt.pageStatusCode && attempt.pageStatusCode < 400) { + pageError = undefined; } - - + if (text && text.trim().length >= 100) break; if (pageStatusCode && pageStatusCode == 404) break; const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; @@ -465,12 +315,17 @@ export async function scrapSingleUrl( content: text, markdown: text, html: pageOptions.includeHtml ? html : undefined, + rawHtml: + pageOptions.includeRawHtml || + extractorOptions.mode === "llm-extraction-from-raw-html" + ? rawHtml + : undefined, metadata: { ...metadata, screenshot: screenshot, sourceURL: urlToScrap, pageStatusCode: pageStatusCode, - pageError: pageError + pageError: pageError, }, }; } else { @@ -478,11 +333,16 @@ export async function scrapSingleUrl( content: text, markdown: text, html: pageOptions.includeHtml ? html : undefined, + rawHtml: + pageOptions.includeRawHtml || + extractorOptions.mode === "llm-extraction-from-raw-html" + ? rawHtml + : undefined, metadata: { ...metadata, sourceURL: urlToScrap, pageStatusCode: pageStatusCode, - pageError: pageError + pageError: pageError, }, }; } @@ -497,7 +357,7 @@ export async function scrapSingleUrl( metadata: { sourceURL: urlToScrap, pageStatusCode: pageStatusCode, - pageError: pageError + pageError: pageError, }, } as Document; } diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts index 0dc24c89..18438df2 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts @@ -100,4 +100,76 @@ describe('removeUnwantedElements', () => { expect(result).not.toContain('id="remove-this"'); expect(result).toContain('class="keep"'); }); + + it('should only include specified tags', () => { + const html = `
<div><main>Main Content</main><footer>Footer Content</footer></div>`; + const options: PageOptions = { onlyIncludeTags: ['main', 'footer'] }; + const result = removeUnwantedElements(html, options); + expect(result).toContain('<main>Main Content</main>'); + expect(result).toContain('<footer>Footer Content</footer>
'); + expect(result).not.toContain('