Merge branch 'main' into feat/save-docs-on-supabase

This commit is contained in:
Nicolas 2024-07-05 12:27:22 -03:00
commit 914897c9d2
40 changed files with 3547 additions and 2085 deletions

.github/dependabot.yml vendored Normal file

@ -0,0 +1,57 @@
version: 2
updates:
  # playwright-service
  - package-ecosystem: "pip"
    directory: "/apps/playwright-service"
    schedule:
      interval: "weekly"
    groups:
      prod-deps:
        dependency-type: "production"
      dev-deps:
        dependency-type: "development"
    commit-message:
      prefix: "apps/playwright-service"
      include: "scope"

  # python-sdk
  - package-ecosystem: "pip"
    directory: "/apps/python-sdk"
    schedule:
      interval: "weekly"
    groups:
      prod-deps:
        dependency-type: "production"
      dev-deps:
        dependency-type: "development"
    commit-message:
      prefix: "apps/python-sdk"
      include: "scope"

  # api
  - package-ecosystem: "npm"
    directory: "/apps/api"
    schedule:
      interval: "weekly"
    groups:
      prod-deps:
        dependency-type: "production"
      dev-deps:
        dependency-type: "development"
    commit-message:
      prefix: "apps/api"
      include: "scope"

  # test-suite
  - package-ecosystem: "npm"
    directory: "/apps/test-suite"
    schedule:
      interval: "weekly"
    groups:
      prod-deps:
        dependency-type: "production"
      dev-deps:
        dependency-type: "development"
    commit-message:
      prefix: "apps/test-suite"
      include: "scope"

.gitignore vendored

@ -14,4 +14,7 @@ apps/test-suite/node_modules/
apps/test-suite/.env
apps/test-suite/logs
apps/test-suite/load-test-results/test-run-report.json
apps/playwright-service-ts/node_modules/
apps/playwright-service-ts/package-lock.json


@ -12,7 +12,7 @@ First, start by installing dependencies
2. pnpm [instructions](https://pnpm.io/installation)
3. redis [instructions](https://redis.io/docs/latest/operate/oss_and_stack/install/install-redis/)
Set environment variables in a .env file in the /apps/api/ directory; you can copy over the template in .env.example.
To start, we won't set up authentication or any optional sub-services (PDF parsing, JS blocking support, AI features).


@ -1,34 +1,64 @@
## Self-hosting Firecrawl

_We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version._

Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally.

## Getting Started

First, clone this repository and copy the example env file from the API folder `.env.example` to `.env`.

### Steps

1. Clone the repository:

   ```bash
   git clone https://github.com/mendableai/firecrawl.git
   cd firecrawl
   cp ./apps/api/.env.example ./.env
   ```

2. For running the simplest version of FireCrawl, edit `USE_DB_AUTHENTICATION` in `.env` to disable database authentication:

   ```plaintext
   USE_DB_AUTHENTICATION=false
   ```

3. Update the Redis URL in the .env file to align with the Docker configuration:

   ```plaintext
   REDIS_URL=redis://redis:6379
   ```

4. #### Option: Running with TypeScript Playwright Service

   * Update the `docker-compose.yml` file to change the Playwright service from:

     ```plaintext
     build: apps/playwright-service
     ```

     to:

     ```plaintext
     build: apps/playwright-service-ts
     ```

   * Set the `PLAYWRIGHT_MICROSERVICE_URL` in your `.env` file:

     ```plaintext
     PLAYWRIGHT_MICROSERVICE_URL=http://localhost:3000/scrape
     ```

   * Don't forget to set the proxy server in your `.env` file as needed.

5. Build and run the Docker containers:

   ```bash
   docker compose build
   docker compose up
   ```

This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`.

## Install Firecrawl on a Kubernetes Cluster (Simple Version)

Read the [examples/kubernetes-cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes-cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.
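Once the containers are up, a quick way to verify the instance is to hit the scrape endpoint directly. The sketch below assumes the default port `3002` and that `USE_DB_AUTHENTICATION=false`, so a placeholder bearer token is used; adjust if your setup differs.

```ts
// Minimal smoke test against a locally self-hosted Firecrawl instance (assumptions noted above).
async function scrapeLocal(url: string) {
  const res = await fetch("http://localhost:3002/v0/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      // With database authentication disabled, a placeholder token is assumed to be accepted.
      Authorization: "Bearer test",
    },
    body: JSON.stringify({ url, pageOptions: { onlyMainContent: true } }),
  });
  if (!res.ok) throw new Error(`Scrape failed with status ${res.status}`);
  const { data } = await res.json();
  console.log(data?.markdown?.slice(0, 200)); // print the first 200 characters of markdown
}

scrapeLocal("https://example.com").catch(console.error);
```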


@ -68,9 +68,21 @@
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"headers": { "headers": {
"type": "object", "type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
}
}
},
@ -80,8 +92,8 @@
"properties": { "properties": {
"mode": { "mode": {
"type": "string", "type": "string",
"enum": ["llm-extraction"], "enum": ["llm-extraction", "llm-extraction-from-raw-html"],
"description": "The extraction mode to use, currently supports 'llm-extraction'" "description": "The extraction mode to use. llm-extraction: Extracts information from the cleaned and parsed content. llm-extraction-from-raw-html: Extracts information directly from the raw HTML."
}, },
"extractionPrompt": { "extractionPrompt": {
"type": "string", "type": "string",
@ -184,7 +196,7 @@
},
"maxDepth": {
"type": "integer",
-"description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on."
+"description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
},
"mode": {
"type": "string",
@ -511,7 +523,7 @@
"html": { "html": {
"type": "string", "type": "string",
"nullable": true, "nullable": true,
"description": "Raw HTML content of the page if `includeHtml` is true" "description": "Raw HTML content of the page if `includeHtml` is true"
}, },
"metadata": { "metadata": {
"type": "object", "type": "object",
@ -526,118 +538,13 @@
"type": "string", "type": "string",
"nullable": true "nullable": true
}, },
"keywords": {
"type": "string",
"nullable": true
},
"robots": {
"type": "string",
"nullable": true
},
"ogTitle": {
"type": "string",
"nullable": true
},
"ogDescription": {
"type": "string",
"nullable": true
},
"ogUrl": {
"type": "string",
"format": "uri",
"nullable": true
},
"ogImage": {
"type": "string",
"nullable": true
},
"ogAudio": {
"type": "string",
"nullable": true
},
"ogDeterminer": {
"type": "string",
"nullable": true
},
"ogLocale": {
"type": "string",
"nullable": true
},
"ogLocaleAlternate": {
"type": "array",
"items": {
"type": "string"
},
"nullable": true
},
"ogSiteName": {
"type": "string",
"nullable": true
},
"ogVideo": {
"type": "string",
"nullable": true
},
"dctermsCreated": {
"type": "string",
"nullable": true
},
"dcDateCreated": {
"type": "string",
"nullable": true
},
"dcDate": {
"type": "string",
"nullable": true
},
"dctermsType": {
"type": "string",
"nullable": true
},
"dcType": {
"type": "string",
"nullable": true
},
"dctermsAudience": {
"type": "string",
"nullable": true
},
"dctermsSubject": {
"type": "string",
"nullable": true
},
"dcSubject": {
"type": "string",
"nullable": true
},
"dcDescription": {
"type": "string",
"nullable": true
},
"dctermsKeywords": {
"type": "string",
"nullable": true
},
"modifiedTime": {
"type": "string",
"nullable": true
},
"publishedTime": {
"type": "string",
"nullable": true
},
"articleTag": {
"type": "string",
"nullable": true
},
"articleSection": {
"type": "string",
"nullable": true
},
"sourceURL": { "sourceURL": {
"type": "string", "type": "string",
"format": "uri" "format": "uri"
}, },
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": { "pageStatusCode": {
"type": "integer", "type": "integer",
"description": "The status code of the page" "description": "The status code of the page"
@ -647,6 +554,7 @@
"nullable": true, "nullable": true,
"description": "The error message of the page" "description": "The error message of the page"
} }
} }
}, },
"llm_extraction": { "llm_extraction": {
@ -694,118 +602,13 @@
"type": "string", "type": "string",
"nullable": true "nullable": true
}, },
"keywords": {
"type": "string",
"nullable": true
},
"robots": {
"type": "string",
"nullable": true
},
"ogTitle": {
"type": "string",
"nullable": true
},
"ogDescription": {
"type": "string",
"nullable": true
},
"ogUrl": {
"type": "string",
"format": "uri",
"nullable": true
},
"ogImage": {
"type": "string",
"nullable": true
},
"ogAudio": {
"type": "string",
"nullable": true
},
"ogDeterminer": {
"type": "string",
"nullable": true
},
"ogLocale": {
"type": "string",
"nullable": true
},
"ogLocaleAlternate": {
"type": "array",
"items": {
"type": "string"
},
"nullable": true
},
"ogSiteName": {
"type": "string",
"nullable": true
},
"ogVideo": {
"type": "string",
"nullable": true
},
"dctermsCreated": {
"type": "string",
"nullable": true
},
"dcDateCreated": {
"type": "string",
"nullable": true
},
"dcDate": {
"type": "string",
"nullable": true
},
"dctermsType": {
"type": "string",
"nullable": true
},
"dcType": {
"type": "string",
"nullable": true
},
"dctermsAudience": {
"type": "string",
"nullable": true
},
"dctermsSubject": {
"type": "string",
"nullable": true
},
"dcSubject": {
"type": "string",
"nullable": true
},
"dcDescription": {
"type": "string",
"nullable": true
},
"dctermsKeywords": {
"type": "string",
"nullable": true
},
"modifiedTime": {
"type": "string",
"nullable": true
},
"publishedTime": {
"type": "string",
"nullable": true
},
"articleTag": {
"type": "string",
"nullable": true
},
"articleSection": {
"type": "string",
"nullable": true
},
"sourceURL": { "sourceURL": {
"type": "string", "type": "string",
"format": "uri" "format": "uri"
}, },
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": { "pageStatusCode": {
"type": "integer", "type": "integer",
"description": "The status code of the page" "description": "The status code of the page"
@ -878,4 +681,4 @@
"bearerAuth": [] "bearerAuth": []
} }
] ]
} }
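The new spec fields above (`onlyIncludeTags`, `replaceAllPathsWithAbsolutePaths`, and the `llm-extraction-from-raw-html` mode) can be combined in a single scrape request. A rough sketch, with the base URL and API key as placeholders:

```ts
// Illustrative /v0/scrape request exercising the newly documented options.
// FIRECRAWL_URL and FIRECRAWL_API_KEY are placeholders for your own values.
const FIRECRAWL_URL = process.env.FIRECRAWL_URL ?? "https://api.firecrawl.dev";

async function scrapeWithNewOptions(url: string) {
  const res = await fetch(`${FIRECRAWL_URL}/v0/scrape`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    },
    body: JSON.stringify({
      url,
      pageOptions: {
        onlyIncludeTags: ["article", "#content"],   // keep only these selectors in the output
        replaceAllPathsWithAbsolutePaths: true,     // rewrite relative links and image paths
      },
      extractorOptions: {
        mode: "llm-extraction-from-raw-html",       // extract from raw HTML rather than parsed content
        extractionPrompt: "What is this page about?",
        extractionSchema: {
          type: "object",
          properties: { summary: { type: "string" } },
          required: ["summary"],
        },
      },
    }),
  });
  return res.json();
}
```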


@ -6801,7 +6801,7 @@ packages:
handlebars: 4.7.8
openai: 3.3.0
sbd: 1.0.19
-typescript: 5.4.5
+typescript: 5.5.3
uuid: 9.0.1
zod: 3.23.8
transitivePeerDependencies:
@ -7767,6 +7767,12 @@ packages:
engines: {node: '>=14.17'}
hasBin: true
/typescript@5.5.3:
resolution: {integrity: sha512-/hreyEujaB0w76zKo6717l3L0o/qEUtRgdvUBvlkhoWeOVMjMuHNHk0BRBzikzuGDqNmPQbg5ifMEqsHLiIUcQ==}
engines: {node: '>=14.17'}
hasBin: true
dev: false
/typesense@1.8.2(@babel/runtime@7.24.6):
resolution: {integrity: sha512-aBpePjA99Qvo+OP2pJwMpvga4Jrm1Y2oV5NsrWXBxlqUDNEUCPZBIksPv2Hq0jxQxHhLLyJVbjXjByXsvpCDVA==}
engines: {node: '>=18'}


@ -131,6 +131,28 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined();
}, 30000); // 30 seconds timeout
it.concurrent("should return a successful response with a valid API key and includeRawHtml set to true", async () => {
const response = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://roastmywebsite.ai",
pageOptions: { includeRawHtml: true },
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data).toHaveProperty("rawHtml");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data.content).toContain("_Roast_");
expect(response.body.data.markdown).toContain("_Roast_");
expect(response.body.data.rawHtml).toContain("<h1");
expect(response.body.data.metadata.pageStatusCode).toBe(200);
expect(response.body.data.metadata.pageError).toBeUndefined();
}, 30000); // 30 seconds timeout
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
const response = await request(TEST_URL)
@ -804,6 +826,46 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
}, 180000);
it.concurrent("should crawl external content links when allowed", async () => {
const crawlInitResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://mendable.ai",
crawlerOptions: {
allowExternalContentLinks: true,
ignoreSitemap: true,
returnOnlyUrls: true,
limit: 50
}
});
expect(crawlInitResponse.statusCode).toBe(200);
expect(crawlInitResponse.body).toHaveProperty("jobId");
let crawlStatus: string;
let crawlData = [];
while (crawlStatus !== "completed") {
const statusResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlInitResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
crawlStatus = statusResponse.body.status;
if (statusResponse.body.data) {
crawlData = statusResponse.body.data;
}
if (crawlStatus !== "completed") {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
console.log(crawlData)
expect(crawlData.length).toBeGreaterThan(0);
expect(crawlData).toEqual(expect.arrayContaining([
expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }),
expect.objectContaining({ url: expect.stringContaining("https://mendable.ai/pricing") }),
expect.objectContaining({ url: expect.stringContaining("https://x.com/CalebPeffer") })
]));
}, 180000); // 3 minutes timeout
});
describe("POST /v0/crawlWebsitePreview", () => {
@ -1177,6 +1239,47 @@ describe("E2E Tests for API Routes", () => {
expect(llmExtraction.is_open_source).toBe(false);
expect(typeof llmExtraction.is_open_source).toBe("boolean");
}, 60000); // 60 secs
it.concurrent("should extract data using LLM extraction mode with RawHtml", async () => {
const response = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://mendable.ai",
extractorOptions: {
mode: "llm-extraction-from-raw-html",
extractionPrompt:
"Based on the information on the page, what are the primary and secondary CTA buttons?",
extractionSchema: {
type: "object",
properties: {
primary_cta: {
type: "string",
},
secondary_cta: {
type: "string",
},
},
required: ["primary_cta", "secondary_cta"],
},
},
});
// Ensure that the job was successfully created before proceeding with LLM extraction
expect(response.statusCode).toBe(200);
// Assuming the LLM extraction object is available in the response body under `data.llm_extraction`
let llmExtraction = response.body.data.llm_extraction;
// Check if the llm_extraction object has the required properties with correct types and values
expect(llmExtraction).toHaveProperty("primary_cta");
expect(typeof llmExtraction.primary_cta).toBe("string");
expect(llmExtraction).toHaveProperty("secondary_cta");
expect(typeof llmExtraction.secondary_cta).toBe("string");
}, 60000); // 60 secs
});
// describe("POST /v0/scrape for Top 100 Companies", () => {


@ -546,46 +546,51 @@ describe("E2E Tests for API Routes", () => {
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
}, 180000); // 120 seconds
// TODO: review the test below
// it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => {
//   const crawlResponse = await request(TEST_URL)
//     .post('/v0/crawl')
//     .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
//     .set('Content-Type', 'application/json')
//     .send({ url: 'https://arxiv.org/list/astro-ph/1993-01',
//       crawlerOptions: {
//         limit: 10,
//         returnOnlyUrls: true
//       }});
//   expect(crawlResponse.statusCode).toBe(200);

//   let isCompleted = false;
//   let completedResponse;

//   while (!isCompleted) {
//     const response = await request(TEST_URL)
//       .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
//       .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`);
//     expect(response.statusCode).toBe(200);
//     expect(response.body).toHaveProperty('status');
//     if (response.body.status === 'completed') {
//       isCompleted = true;
//       completedResponse = response;
//     } else {
//       await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
//     }
//   }
//   expect(completedResponse.body.status).toBe('completed');
//   expect(completedResponse.body).toHaveProperty('data');
//   expect(completedResponse.body.data.length).toEqual(1);
//   expect(completedResponse.body.data).toEqual(
//     expect.arrayContaining([
//       expect.objectContaining({
//         content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.')
//       })
//     ])
//   );
//   expect(completedResponse.body.data[0]).toHaveProperty("metadata");
//   expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
//   expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
// }, 180000); // 120 seconds
it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => {
const crawlResponse = await request(TEST_URL) const crawlResponse = await request(TEST_URL)


@ -58,19 +58,27 @@ export async function scrapeHelper(
}
// make sure doc.content is not empty
-const filteredDocs = docs.filter(
+let filteredDocs = docs.filter(
(doc: { content?: string }) => doc.content && doc.content.trim().length > 0
);
if (filteredDocs.length === 0) {
return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
}
// Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
filteredDocs.forEach(doc => {
delete doc.rawHtml;
});
}
let creditsToBeBilled = filteredDocs.length;
const creditsPerLLMExtract = 50;
-if (extractorOptions.mode === "llm-extraction") {
+if (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "llm-extraction-from-raw-html" || extractorOptions.mode === "llm-extraction-from-markdown") {
creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
}
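A quick worked example of the billing arithmetic above, assuming three non-empty documents and the 50-credit-per-document LLM extraction surcharge shown in the handler:

```ts
// Sketch of the credit calculation for an LLM-extraction request (constants as in scrapeHelper).
const creditsPerLLMExtract = 50;
const filteredDocsCount = 3; // hypothetical: three documents with non-empty content

let creditsToBeBilled = filteredDocsCount;                     // 1 credit per document
creditsToBeBilled += creditsPerLLMExtract * filteredDocsCount; // plus the LLM extraction surcharge

console.log(creditsToBeBilled); // 153
```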


@ -8,7 +8,8 @@ import { Document, ExtractorOptions } from "../entities";
// Generate completion using OpenAI
export async function generateCompletions(
documents: Document[],
-extractionOptions: ExtractorOptions
+extractionOptions: ExtractorOptions,
mode: "markdown" | "raw-html"
): Promise<Document[]> {
// const schema = zodToJsonSchema(options.schema)
@ -28,6 +29,7 @@ export async function generateCompletions(
document: document,
schema: schema,
prompt: prompt,
mode: mode,
});
// Validate the JSON output against the schema using AJV
const validate = ajv.compile(schema);


@ -13,26 +13,37 @@ const defaultPrompt =
"You are a professional web scraper. Extract the contents of the webpage"; "You are a professional web scraper. Extract the contents of the webpage";
function prepareOpenAIDoc( function prepareOpenAIDoc(
document: Document document: Document,
mode: "markdown" | "raw-html"
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] { ): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
let markdown = document.markdown; let markdown = document.markdown;
// Check if the markdown content exists in the document let extractionTarget = document.markdown;
if (!markdown) {
if (mode === "raw-html") {
extractionTarget = document.rawHtml;
}
// Check if the markdown content exists in the document
if (!extractionTarget) {
throw new Error( throw new Error(
"Markdown content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai" `${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
); );
} }
// count number of tokens // count number of tokens
const numTokens = numTokensFromString(document.markdown, "gpt-4"); const numTokens = numTokensFromString(extractionTarget, "gpt-4");
if (numTokens > maxTokens) { if (numTokens > maxTokens) {
// trim the document to the maximum number of tokens, tokens != characters // trim the document to the maximum number of tokens, tokens != characters
markdown = markdown.slice(0, (maxTokens * modifier)); extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));
} }
return [[{ type: "text", text: markdown }], numTokens]; return [[{ type: "text", text: extractionTarget }], numTokens];
} }
export async function generateOpenAICompletions({ export async function generateOpenAICompletions({
@ -42,6 +53,7 @@ export async function generateOpenAICompletions({
schema, //TODO - add zod dynamic type checking
prompt = defaultPrompt,
temperature,
mode
}: {
client: OpenAI;
model?: string;
@ -49,9 +61,10 @@ export async function generateOpenAICompletions({
schema: any; // This should be replaced with a proper Zod schema type when available
prompt?: string;
temperature?: number;
mode: "markdown" | "raw-html";
}): Promise<Document> {
const openai = client as OpenAI;
-const [content, numTokens] = prepareOpenAIDoc(document);
+const [content, numTokens] = prepareOpenAIDoc(document, mode);
const completion = await openai.chat.completions.create({
model,


@ -13,6 +13,7 @@ export interface Progress {
export type PageOptions = {
onlyMainContent?: boolean;
includeHtml?: boolean;
includeRawHtml?: boolean;
fallback?: boolean;
fetchPageContent?: boolean;
waitFor?: number;
@ -21,10 +22,11 @@ export type PageOptions = {
replaceAllPathsWithAbsolutePaths?: boolean;
parsePDF?: boolean;
removeTags?: string | string[];
onlyIncludeTags?: string | string[];
};

export type ExtractorOptions = {
-mode: "markdown" | "llm-extraction";
+mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
extractionPrompt?: string;
extractionSchema?: Record<string, any>;
}
@ -50,6 +52,7 @@ export type CrawlerOptions = {
ignoreSitemap?: boolean;
mode?: "default" | "fast"; // have a mode of some sort
allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean;
}

export type WebScraperOptions = {
@ -72,6 +75,7 @@ export class Document {
content: string;
markdown?: string;
html?: string;
rawHtml?: string;
llm_extraction?: Record<string, any>;
createdAt?: Date;
updatedAt?: Date;
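As a rough illustration of how the widened types fit together, the sketch below builds options for a raw-HTML LLM extraction; the import path and the concrete values are assumptions, not defaults.

```ts
import { PageOptions, ExtractorOptions, CrawlerOptions } from "./entities"; // path assumed

// Request raw HTML so llm-extraction-from-raw-html has something to work on.
const pageOptions: PageOptions = {
  includeRawHtml: true,
  onlyIncludeTags: ["main", "#content"],
};

const extractorOptions: ExtractorOptions = {
  mode: "llm-extraction-from-raw-html",
  extractionPrompt: "Summarize the page in one sentence.",
  extractionSchema: {
    type: "object",
    properties: { summary: { type: "string" } },
    required: ["summary"],
  },
};

// Allow the crawler to follow links that leave the starting domain.
const crawlerOptions: CrawlerOptions = {
  ignoreSitemap: true,
  allowExternalContentLinks: true,
};
```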


@ -188,5 +188,38 @@ describe('WebCrawler', () => {
// Check that the backward link is included if allowBackwardCrawling is true
expect(results.some(r => r.url === 'https://mendable.ai')).toBe(true);
});
it('should respect the limit parameter by not returning more links than specified', async () => {
const initialUrl = 'http://example.com';
const limit = 2; // Set a limit for the number of links
crawler = new WebCrawler({
initialUrl: initialUrl,
includes: [],
excludes: [],
limit: limit, // Apply the limit
maxCrawledDepth: 10
});
// Mock sitemap fetching function to return more links than the limit
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
initialUrl,
initialUrl + '/page1',
initialUrl + '/page2',
initialUrl + '/page3'
]);
const filteredLinks = crawler['filterLinks'](
[initialUrl, initialUrl + '/page1', initialUrl + '/page2', initialUrl + '/page3'],
limit,
10
);
expect(filteredLinks.length).toBe(limit); // Check if the number of results respects the limit
expect(filteredLinks).toEqual([
initialUrl,
initialUrl + '/page1'
]);
});
});


@ -4,7 +4,7 @@ import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import async from "async";
import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
-import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
+import { scrapSingleUrl } from "./single_url";
import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout";
@ -23,6 +23,7 @@ export class WebCrawler {
private robots: any;
private generateImgAltText: boolean;
private allowBackwardCrawling: boolean;
private allowExternalContentLinks: boolean;

constructor({
initialUrl,
@ -32,7 +33,8 @@ export class WebCrawler {
limit = 10000,
generateImgAltText = false,
maxCrawledDepth = 10,
-allowBackwardCrawling = false
+allowBackwardCrawling = false,
allowExternalContentLinks = false
}: {
initialUrl: string;
includes?: string[];
@ -42,6 +44,7 @@ export class WebCrawler {
generateImgAltText?: boolean;
maxCrawledDepth?: number;
allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean;
}) {
this.initialUrl = initialUrl;
this.baseUrl = new URL(initialUrl).origin;
@ -55,6 +58,7 @@ export class WebCrawler {
this.maxCrawledDepth = maxCrawledDepth ?? 10;
this.generateImgAltText = generateImgAltText ?? false;
this.allowBackwardCrawling = allowBackwardCrawling ?? false;
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
}

private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
@ -98,9 +102,10 @@ export class WebCrawler {
const linkHostname = normalizedLink.hostname.replace(/^www\./, '');

// Ensure the protocol and hostname match, and the path starts with the initial URL's path
-if (linkHostname !== initialHostname) {
-return false;
-}
+// commented out to allow handling of external links when allowExternalContentLinks is enabled
+// if (linkHostname !== initialHostname) {
+//   return false;
+// }

if (!this.allowBackwardCrawling) {
if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
@ -278,15 +283,24 @@ export class WebCrawler {
const path = urlObj.pathname;

-if (
-this.isInternalLink(fullUrl) &&
-this.noSections(fullUrl) &&
-// The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
-// this.matchesIncludes(path) &&
-!this.matchesExcludes(path) &&
-this.isRobotsAllowed(fullUrl)
-) {
-links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
-}
+if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
+if (this.isInternalLink(fullUrl) &&
+this.noSections(fullUrl) &&
+!this.matchesExcludes(path) &&
+this.isRobotsAllowed(fullUrl)
+) {
+links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
+}
+} else { // EXTERNAL LINKS
+if (
+this.isInternalLink(url) &&
+this.allowExternalContentLinks &&
+!this.isSocialMediaOrEmail(fullUrl) &&
+!this.matchesExcludes(fullUrl, true) &&
+!this.isExternalMainPage(fullUrl)
+) {
+links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
+}
+}
}
});
@ -320,9 +334,41 @@ export class WebCrawler {
return this.includes.some((pattern) => new RegExp(pattern).test(url));
}

-private matchesExcludes(url: string): boolean {
-if (this.excludes.length === 0 || this.excludes[0] == "") return false;
-return this.excludes.some((pattern) => new RegExp(pattern).test(url));
-}
+private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
+return this.excludes.some((pattern) => {
+if (onlyDomains)
+return this.matchesExcludesExternalDomains(url);
+return this.excludes.some((pattern) => new RegExp(pattern).test(url));
+});
+}
// supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com"
private matchesExcludesExternalDomains(url: string) {
try {
const urlObj = new URL(url);
const hostname = urlObj.hostname;
const pathname = urlObj.pathname;
for (let domain of this.excludes) {
let domainObj = new URL('http://' + domain.replace(/^https?:\/\//, ''));
let domainHostname = domainObj.hostname;
let domainPathname = domainObj.pathname;
if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) {
if (pathname.startsWith(domainPathname)) {
return true;
}
}
}
return false;
} catch (e) {
return false;
}
}
private isExternalMainPage(url:string):boolean {
return !Boolean(url.split("/").slice(3).filter(subArray => subArray.length > 0).length)
} }
private noSections(link: string): boolean {
@ -375,6 +421,10 @@ export class WebCrawler {
"instagram.com", "instagram.com",
"pinterest.com", "pinterest.com",
"mailto:", "mailto:",
"github.com",
"calendly.com",
"discord.gg",
"discord.com",
]; ];
return socialMediaOrEmail.some((ext) => url.includes(ext)); return socialMediaOrEmail.some((ext) => url.includes(ext));
} }
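To make the new external-link exclude semantics concrete, here is a standalone sketch that mirrors the `matchesExcludesExternalDomains` logic above (it re-implements the check for illustration rather than importing the private method):

```ts
// Mirrors the domain/path matching used for excluding external links (illustrative copy).
function matchesExcludedExternalDomain(url: string, excludes: string[]): boolean {
  try {
    const { hostname, pathname } = new URL(url);
    for (const domain of excludes) {
      const domainObj = new URL("http://" + domain.replace(/^https?:\/\//, ""));
      const sameHost =
        hostname === domainObj.hostname || hostname.endsWith(`.${domainObj.hostname}`);
      if (sameHost && pathname.startsWith(domainObj.pathname)) {
        return true;
      }
    }
    return false;
  } catch {
    return false;
  }
}

console.log(matchesExcludedExternalDomain("https://example.com/blog/post-1", ["example.com/blog"])); // true
console.log(matchesExcludedExternalDomain("https://docs.example.com/intro", ["example.com"]));       // true (subdomain)
console.log(matchesExcludedExternalDomain("https://example.com/pricing", ["example.com/blog"]));     // false
```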


@ -29,22 +29,23 @@ export async function handleCustomScraping(
};
}

-// Check for Google Drive PDF links in the raw HTML
-const googleDrivePdfPattern =
-/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/;
-const googleDrivePdfLink = url.match(googleDrivePdfPattern);
-
-if (googleDrivePdfLink) {
-console.log(
-`Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
-);
-
-const fileId = googleDrivePdfLink[1];
-const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
+// Check for Google Drive PDF links in meta tags
+const googleDriveMetaPattern = /<meta itemprop="url" content="(https:\/\/drive\.google\.com\/file\/d\/[^"]+)"/;
+const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
+
+if (googleDriveMetaMatch) {
+const url = googleDriveMetaMatch[1];
+console.log(`Google Drive PDF link detected: ${url}`);
+
+const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
+if (fileIdMatch) {
+const fileId = fileIdMatch[1];
+const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;

return {
scraper: "pdf",
url: pdfUrl
};
+}
}

return null;
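For reference, a minimal sketch of the markup the new meta-tag detection targets; the file id is a made-up placeholder:

```ts
// Illustration of the meta-tag based Google Drive PDF detection above ("FILE_ID_123" is a placeholder).
const sampleHtml = `<meta itemprop="url" content="https://drive.google.com/file/d/FILE_ID_123/view">`;

const googleDriveMetaPattern = /<meta itemprop="url" content="(https:\/\/drive\.google\.com\/file\/d\/[^"]+)"/;
const metaMatch = sampleHtml.match(googleDriveMetaPattern);

if (metaMatch) {
  const fileId = metaMatch[1].match(/\/file\/d\/([^\/]+)\/view/)?.[1];
  console.log(`https://drive.google.com/uc?export=download&id=${fileId}`);
  // -> https://drive.google.com/uc?export=download&id=FILE_ID_123
}
```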


@ -0,0 +1 @@
export const universalTimeout = 15000;


@ -40,6 +40,7 @@ export class WebScraperDataProvider {
"gpt-4-turbo"; "gpt-4-turbo";
private crawlerMode: string = "default"; private crawlerMode: string = "default";
private allowBackwardCrawling: boolean = false; private allowBackwardCrawling: boolean = false;
private allowExternalContentLinks: boolean = false;
authorize(): void { authorize(): void {
throw new Error("Method not implemented."); throw new Error("Method not implemented.");
@ -66,6 +67,7 @@ export class WebScraperDataProvider {
const result = await scrapSingleUrl(
url,
this.pageOptions,
this.extractorOptions,
existingHTML
);
processedUrls++;
@ -172,6 +174,7 @@ export class WebScraperDataProvider {
limit: this.limit,
generateImgAltText: this.generateImgAltText,
allowBackwardCrawling: this.allowBackwardCrawling,
allowExternalContentLinks: this.allowExternalContentLinks,
});

let links = await crawler.start(
@ -269,10 +272,16 @@ export class WebScraperDataProvider {
// documents = await this.applyImgAltText(documents);

if (
-this.extractorOptions.mode === "llm-extraction" &&
+(this.extractorOptions.mode === "llm-extraction" || this.extractorOptions.mode === "llm-extraction-from-markdown") &&
this.mode === "single_urls"
) {
-documents = await generateCompletions(documents, this.extractorOptions);
+documents = await generateCompletions(documents, this.extractorOptions, "markdown");
}
if (
(this.extractorOptions.mode === "llm-extraction-from-raw-html") &&
this.mode === "single_urls"
) {
documents = await generateCompletions(documents, this.extractorOptions, "raw-html");
}
return documents.concat(pdfDocuments).concat(docxDocuments);
}
@ -489,6 +498,7 @@ export class WebScraperDataProvider {
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
this.allowExternalContentLinks = options.crawlerOptions?.allowExternalContentLinks ?? false;

// make sure all urls start with https://
this.urls = this.urls.map((url) => {


@ -0,0 +1,77 @@
import axios from "axios";
import { logScrape } from "../../../services/logging/scrape_log";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
/**
* Scrapes a URL with Axios
* @param url The URL to scrape
* @param pageOptions The options for the page
* @returns The scraped content
*/
export async function scrapWithFetch(
url: string,
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
const logParams = {
url,
scraper: "fetch",
success: false,
response_code: null,
time_taken_seconds: null,
error_message: null,
html: "",
startTime: Date.now(),
};
try {
const response = await axios.get(url, {
headers: {
"Content-Type": "application/json",
},
timeout: universalTimeout,
transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
});
if (response.status !== 200) {
console.error(
`[Axios] Error fetching url: ${url} with status: ${response.status}`
);
logParams.error_message = response.statusText;
logParams.response_code = response.status;
return {
content: "",
pageStatusCode: response.status,
pageError: response.statusText,
};
}
const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) {
logParams.success = true;
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
logParams.response_code = pageStatusCode;
logParams.error_message = pageError;
return { content, pageStatusCode, pageError };
} else {
const text = response.data;
logParams.success = true;
logParams.html = text;
logParams.response_code = response.status;
return { content: text, pageStatusCode: response.status, pageError: null };
}
} catch (error) {
if (error.code === "ECONNABORTED") {
logParams.error_message = "Request timed out";
console.log(`[Axios] Request timed out for ${url}`);
} else {
logParams.error_message = error.message || error;
console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
}
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
} finally {
const endTime = Date.now();
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
await logScrape(logParams);
}
}
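A small usage sketch for the extracted fetch scraper; the URL is an example and the relative import path is an assumption about where the caller lives:

```ts
import { scrapWithFetch } from "./fetch"; // path assumed

// Example: fetch a page with the plain-HTTP scraper and inspect the result.
async function main() {
  const { content, pageStatusCode, pageError } = await scrapWithFetch("https://example.com", {
    parsePDF: true,
  });
  if (pageError) {
    console.error(`Scrape failed (${pageStatusCode}): ${pageError}`);
    return;
  }
  console.log(`Fetched ${content.length} characters with status ${pageStatusCode}`);
}

main();
```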


@ -0,0 +1,124 @@
import axios from "axios";
import { FireEngineResponse } from "../../../lib/entities";
import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
/**
* Scrapes a URL with Fire-Engine
* @param url The URL to scrape
* @param waitFor The time to wait for the page to load
* @param screenshot Whether to take a screenshot
* @param pageOptions The options for the page
* @param headers The headers to send with the request
* @param options The options for the request
* @returns The scraped content
*/
export async function scrapWithFireEngine({
url,
waitFor = 0,
screenshot = false,
pageOptions = { parsePDF: true },
headers,
options,
}: {
url: string;
waitFor?: number;
screenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
headers?: Record<string, string>;
options?: any;
}): Promise<FireEngineResponse> {
const logParams = {
url,
scraper: "fire-engine",
success: false,
response_code: null,
time_taken_seconds: null,
error_message: null,
html: "",
startTime: Date.now(),
};
try {
const reqParams = await generateRequestParams(url);
const waitParam = reqParams["params"]?.wait ?? waitFor;
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
console.log(
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
);
const response = await axios.post(
process.env.FIRE_ENGINE_BETA_URL + "/scrape",
{
url: url,
wait: waitParam,
screenshot: screenshotParam,
headers: headers,
pageOptions: pageOptions,
},
{
headers: {
"Content-Type": "application/json",
},
timeout: universalTimeout + waitParam,
}
);
if (response.status !== 200) {
console.error(
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
);
logParams.error_message = response.data?.pageError;
logParams.response_code = response.data?.pageStatusCode;
return {
html: "",
screenshot: "",
pageStatusCode: response.data?.pageStatusCode,
pageError: response.data?.pageError,
};
}
const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) {
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
url,
pageOptions?.parsePDF
);
logParams.success = true;
logParams.response_code = pageStatusCode;
logParams.error_message = pageError;
return { html: content, screenshot: "", pageStatusCode, pageError };
} else {
const data = response.data;
logParams.success =
(data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
data.pageStatusCode === 404;
logParams.html = data.content ?? "";
logParams.response_code = data.pageStatusCode;
logParams.error_message = data.pageError;
return {
html: data.content ?? "",
screenshot: data.screenshot ?? "",
pageStatusCode: data.pageStatusCode,
pageError: data.pageError,
};
}
} catch (error) {
if (error.code === "ECONNABORTED") {
console.log(`[Fire-Engine] Request timed out for ${url}`);
logParams.error_message = "Request timed out";
} else {
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
logParams.error_message = error.message || error;
}
return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };
} finally {
const endTime = Date.now();
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
await logScrape(logParams, pageOptions);
}
}
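A corresponding sketch for the Fire-Engine scraper, which now takes a single options object; it assumes `FIRE_ENGINE_BETA_URL` is configured in the environment and that the import path matches your layout:

```ts
import { scrapWithFireEngine } from "./fireEngine"; // path assumed

// Example: wait 2 seconds after load and request a screenshot alongside the HTML.
async function main() {
  const result = await scrapWithFireEngine({
    url: "https://example.com",
    waitFor: 2000,
    screenshot: true,
    headers: { "User-Agent": "firecrawl-example" },
  });
  console.log(
    result.pageStatusCode,
    `${result.html.length} chars`,
    result.screenshot ? "screenshot captured" : "no screenshot"
  );
}

main();
```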


@ -0,0 +1,109 @@
import axios from "axios";
import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
/**
* Scrapes a URL with Playwright
* @param url The URL to scrape
* @param waitFor The time to wait for the page to load
* @param headers The headers to send with the request
* @param pageOptions The options for the page
* @returns The scraped content
*/
export async function scrapWithPlaywright(
url: string,
waitFor: number = 0,
headers?: Record<string, string>,
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
const logParams = {
url,
scraper: "playwright",
success: false,
response_code: null,
time_taken_seconds: null,
error_message: null,
html: "",
startTime: Date.now(),
};
try {
const reqParams = await generateRequestParams(url);
// If the user has passed a wait parameter in the request, use that
const waitParam = reqParams["params"]?.wait ?? waitFor;
const response = await axios.post(
process.env.PLAYWRIGHT_MICROSERVICE_URL,
{
url: url,
wait_after_load: waitParam,
headers: headers,
},
{
headers: {
"Content-Type": "application/json",
},
timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
}
);
if (response.status !== 200) {
console.error(
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
);
logParams.error_message = response.data?.pageError;
logParams.response_code = response.data?.pageStatusCode;
return {
content: "",
pageStatusCode: response.data?.pageStatusCode,
pageError: response.data?.pageError,
};
}
const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) {
logParams.success = true;
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
logParams.response_code = pageStatusCode;
logParams.error_message = pageError;
return { content, pageStatusCode, pageError };
} else {
const textData = response.data;
try {
const data = JSON.parse(textData);
const html = data.content;
logParams.success = true;
logParams.html = html;
logParams.response_code = data.pageStatusCode;
logParams.error_message = data.pageError;
return {
content: html ?? "",
pageStatusCode: data.pageStatusCode,
pageError: data.pageError,
};
} catch (jsonError) {
logParams.error_message = jsonError.message || jsonError;
console.error(
`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
);
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
}
}
} catch (error) {
if (error.code === "ECONNABORTED") {
logParams.error_message = "Request timed out";
console.log(`[Playwright] Request timed out for ${url}`);
} else {
logParams.error_message = error.message || error;
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
}
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
} finally {
const endTime = Date.now();
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
await logScrape(logParams);
}
}
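Since the Playwright scraper parses the microservice response itself, the payload it expects looks roughly like the shape below; this is inferred from the parsing code above, not the service's formal schema:

```ts
// Response shape implied by the JSON.parse handling in scrapWithPlaywright.
interface PlaywrightServiceResponse {
  content: string;        // rendered HTML of the page
  pageStatusCode: number; // status code observed by the browser
  pageError?: string;     // populated when the page failed to load
}

// An example payload the scraper would accept:
const example: PlaywrightServiceResponse = {
  content: "<html><body><h1>Hello</h1></body></html>",
  pageStatusCode: 200,
};

console.log(example.pageStatusCode);
```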


@ -0,0 +1,88 @@
import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { ScrapingBeeClient } from "scrapingbee";
/**
* Scrapes a URL with ScrapingBee
* @param url The URL to scrape
* @param wait_browser The browser event to wait for
* @param timeout The timeout for the scrape
* @param pageOptions The options for the page
* @returns The scraped content
*/
export async function scrapWithScrapingBee(
url: string,
wait_browser: string = "domcontentloaded",
timeout: number = universalTimeout,
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
const logParams = {
url,
scraper: wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee",
success: false,
response_code: null,
time_taken_seconds: null,
error_message: null,
html: "",
startTime: Date.now(),
};
try {
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
const clientParams = await generateRequestParams(
url,
wait_browser,
timeout
);
const response = await client.get({
...clientParams,
params: {
...clientParams.params,
transparent_status_code: "True",
},
});
const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) {
logParams.success = true;
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
logParams.response_code = pageStatusCode;
logParams.error_message = pageError;
return { content, pageStatusCode, pageError };
} else {
let text = "";
try {
const decoder = new TextDecoder();
text = decoder.decode(response.data);
logParams.success = true;
} catch (decodeError) {
console.error(
`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`
);
logParams.error_message = decodeError.message || decodeError;
}
logParams.response_code = response.status;
logParams.html = text;
logParams.success = response.status >= 200 && response.status < 300 || response.status === 404;
logParams.error_message = response.statusText !== "OK" ? response.statusText : undefined;
return {
content: text,
pageStatusCode: response.status,
pageError: response.statusText !== "OK" ? response.statusText : undefined,
};
}
} catch (error) {
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
logParams.error_message = error.message || error;
logParams.response_code = error.response?.status;
return {
content: "",
pageStatusCode: error.response?.status,
pageError: error.response?.statusText,
};
} finally {
const endTime = Date.now();
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
await logScrape(logParams);
}
}


@ -1,14 +1,21 @@
import * as cheerio from "cheerio";
-import { ScrapingBeeClient } from "scrapingbee";
import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv";
-import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
+import {
+  Document,
+  PageOptions,
+  FireEngineResponse,
+  ExtractorOptions,
+} from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { handleCustomScraping } from "./custom/handleCustomScraping";
import { removeUnwantedElements } from "./utils/removeUnwantedElements";
-import axios from "axios";
+import { scrapWithFetch } from "./scrapers/fetch";
+import { scrapWithFireEngine } from "./scrapers/fireEngine";
+import { scrapWithPlaywright } from "./scrapers/playwright";
+import { scrapWithScrapingBee } from "./scrapers/scrapingBee";

dotenv.config();
@ -20,8 +27,6 @@ const baseScrapers = [
"fetch", "fetch",
] as const; ] as const;
const universalTimeout = 15000;
export async function generateRequestParams( export async function generateRequestParams(
url: string, url: string,
wait_browser: string = "domcontentloaded", wait_browser: string = "domcontentloaded",
@ -45,197 +50,6 @@ export async function generateRequestParams(
return defaultParams;
}
}
export async function scrapWithFireEngine(
url: string,
waitFor: number = 0,
screenshot: boolean = false,
pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
headers?: Record<string, string>,
options?: any
): Promise<FireEngineResponse> {
try {
const reqParams = await generateRequestParams(url);
// If the user has passed a wait parameter in the request, use that
const waitParam = reqParams["params"]?.wait ?? waitFor;
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
console.log(
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
);
const response = await axios.post(
process.env.FIRE_ENGINE_BETA_URL + "/scrape",
{
url: url,
wait: waitParam,
screenshot: screenshotParam,
headers: headers,
pageOptions: pageOptions,
},
{
headers: {
"Content-Type": "application/json",
},
timeout: universalTimeout + waitParam
}
);
if (response.status !== 200) {
console.error(
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
);
return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
}
const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) {
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
return { html: content, screenshot: "", pageStatusCode, pageError };
} else {
const data = response.data;
const html = data.content;
const screenshot = data.screenshot;
return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
}
} catch (error) {
if (error.code === 'ECONNABORTED') {
console.log(`[Fire-Engine] Request timed out for ${url}`);
} else {
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
}
return { html: "", screenshot: "" };
}
}
export async function scrapWithScrapingBee(
url: string,
wait_browser: string = "domcontentloaded",
timeout: number = universalTimeout,
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
try {
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
const clientParams = await generateRequestParams(
url,
wait_browser,
timeout,
);
const response = await client.get({
...clientParams,
params: {
...clientParams.params,
'transparent_status_code': 'True'
}
});
const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) {
return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
} else {
let text = "";
try {
const decoder = new TextDecoder();
text = decoder.decode(response.data);
} catch (decodeError) {
console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`);
}
return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined };
}
} catch (error) {
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText };
}
}
export async function scrapWithPlaywright(
url: string,
waitFor: number = 0,
headers?: Record<string, string>,
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
try {
const reqParams = await generateRequestParams(url);
// If the user has passed a wait parameter in the request, use that
const waitParam = reqParams["params"]?.wait ?? waitFor;
const response = await axios.post(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
url: url,
wait_after_load: waitParam,
headers: headers,
}, {
headers: {
"Content-Type": "application/json",
},
timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
});
if (response.status !== 200) {
console.error(
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
);
return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
}
const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) {
return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
} else {
const textData = response.data;
try {
const data = JSON.parse(textData);
const html = data.content;
return { content: html ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
} catch (jsonError) {
console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`);
return { content: "" };
}
}
} catch (error) {
if (error.code === 'ECONNABORTED') {
console.log(`[Playwright] Request timed out for ${url}`);
} else {
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
}
return { content: "" };
}
}
export async function scrapWithFetch(
url: string,
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
try {
const response = await axios.get(url, {
headers: {
"Content-Type": "application/json",
},
timeout: universalTimeout,
transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
});
if (response.status !== 200) {
console.error(
`[Axios] Error fetching url: ${url} with status: ${response.status}`
);
return { content: "", pageStatusCode: response.status, pageError: response.statusText };
}
const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) {
return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
} else {
const text = response.data;
return { content: text, pageStatusCode: 200 };
}
} catch (error) {
if (error.code === 'ECONNABORTED') {
console.log(`[Axios] Request timed out for ${url}`);
} else {
console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
}
return { content: "" };
}
}
/** /**
* Get the order of scrapers to be used for scraping a URL * Get the order of scrapers to be used for scraping a URL
@ -295,17 +109,18 @@ function getScrapingFallbackOrder(
return scrapersInOrder as (typeof baseScrapers)[number][]; return scrapersInOrder as (typeof baseScrapers)[number][];
} }
export async function scrapSingleUrl( export async function scrapSingleUrl(
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { pageOptions: PageOptions = {
onlyMainContent: true, onlyMainContent: true,
includeHtml: false, includeHtml: false,
includeRawHtml: false,
waitFor: 0, waitFor: 0,
screenshot: false, screenshot: false,
headers: undefined headers: undefined,
},
extractorOptions: ExtractorOptions = {
mode: "llm-extraction-from-markdown",
}, },
existingHtml: string = "" existingHtml: string = ""
): Promise<Document> { ): Promise<Document> {
@ -314,19 +129,24 @@ export async function scrapSingleUrl(
const attemptScraping = async ( const attemptScraping = async (
url: string, url: string,
method: (typeof baseScrapers)[number] method: (typeof baseScrapers)[number]
) => { ) => {
let scraperResponse: { text: string, screenshot: string, metadata: { pageStatusCode?: number, pageError?: string | null } } = { text: "", screenshot: "", metadata: {} }; let scraperResponse: {
text: string;
screenshot: string;
metadata: { pageStatusCode?: number; pageError?: string | null };
} = { text: "", screenshot: "", metadata: {} };
let screenshot = ""; let screenshot = "";
switch (method) { switch (method) {
case "fire-engine": case "fire-engine":
if (process.env.FIRE_ENGINE_BETA_URL) { if (process.env.FIRE_ENGINE_BETA_URL) {
console.log(`Scraping ${url} with Fire Engine`); console.log(`Scraping ${url} with Fire Engine`);
const response = await scrapWithFireEngine( const response = await scrapWithFireEngine({
url, url,
pageOptions.waitFor, waitFor: pageOptions.waitFor,
pageOptions.screenshot, screenshot: pageOptions.screenshot,
pageOptions.headers pageOptions: pageOptions,
); headers: pageOptions.headers,
});
scraperResponse.text = response.html; scraperResponse.text = response.html;
scraperResponse.screenshot = response.screenshot; scraperResponse.screenshot = response.screenshot;
scraperResponse.metadata.pageStatusCode = response.pageStatusCode; scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
@ -347,7 +167,11 @@ export async function scrapSingleUrl(
break; break;
case "playwright": case "playwright":
if (process.env.PLAYWRIGHT_MICROSERVICE_URL) { if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
const response = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers); const response = await scrapWithPlaywright(
url,
pageOptions.waitFor,
pageOptions.headers
);
scraperResponse.text = response.content; scraperResponse.text = response.content;
scraperResponse.metadata.pageStatusCode = response.pageStatusCode; scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
scraperResponse.metadata.pageError = response.pageError; scraperResponse.metadata.pageError = response.pageError;
@ -369,22 +193,39 @@ export async function scrapSingleUrl(
break; break;
} }
let customScrapedContent : FireEngineResponse | null = null; let customScrapedContent: FireEngineResponse | null = null;
// Check for custom scraping conditions // Check for custom scraping conditions
const customScraperResult = await handleCustomScraping(scraperResponse.text, url); const customScraperResult = await handleCustomScraping(
scraperResponse.text,
url
);
if (customScraperResult){ if (customScraperResult) {
switch (customScraperResult.scraper) { switch (customScraperResult.scraper) {
case "fire-engine": case "fire-engine":
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.waitAfterLoad, false, customScraperResult.pageOptions) customScrapedContent = await scrapWithFireEngine({
url: customScraperResult.url,
waitFor: customScraperResult.waitAfterLoad,
screenshot: false,
pageOptions: customScraperResult.pageOptions,
});
if (screenshot) { if (screenshot) {
customScrapedContent.screenshot = screenshot; customScrapedContent.screenshot = screenshot;
} }
break; break;
case "pdf": case "pdf":
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF); const { content, pageStatusCode, pageError } =
customScrapedContent = { html: content, screenshot, pageStatusCode, pageError } await fetchAndProcessPdf(
customScraperResult.url,
pageOptions?.parsePDF
);
customScrapedContent = {
html: content,
screenshot,
pageStatusCode,
pageError,
};
break; break;
} }
} }
@ -402,11 +243,18 @@ export async function scrapSingleUrl(
rawHtml: scraperResponse.text, rawHtml: scraperResponse.text,
screenshot: scraperResponse.screenshot, screenshot: scraperResponse.screenshot,
pageStatusCode: scraperResponse.metadata.pageStatusCode, pageStatusCode: scraperResponse.metadata.pageStatusCode,
pageError: scraperResponse.metadata.pageError || undefined pageError: scraperResponse.metadata.pageError || undefined,
}; };
}; };
let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = { text: "", html: "", rawHtml: "", screenshot: "", pageStatusCode: 200, pageError: undefined }; let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = {
text: "",
html: "",
rawHtml: "",
screenshot: "",
pageStatusCode: 200,
pageError: undefined,
};
try { try {
let urlKey = urlToScrap; let urlKey = urlToScrap;
try { try {
@ -432,18 +280,20 @@ export async function scrapSingleUrl(
} }
const attempt = await attemptScraping(urlToScrap, scraper); const attempt = await attemptScraping(urlToScrap, scraper);
text = attempt.text ?? ''; text = attempt.text ?? "";
html = attempt.html ?? ''; html = attempt.html ?? "";
rawHtml = attempt.rawHtml ?? ''; rawHtml = attempt.rawHtml ?? "";
screenshot = attempt.screenshot ?? ''; screenshot = attempt.screenshot ?? "";
if (attempt.pageStatusCode) { if (attempt.pageStatusCode) {
pageStatusCode = attempt.pageStatusCode; pageStatusCode = attempt.pageStatusCode;
} }
if (attempt.pageError) { if (attempt.pageError && attempt.pageStatusCode >= 400) {
pageError = attempt.pageError; pageError = attempt.pageError;
} else if (attempt && attempt.pageStatusCode && attempt.pageStatusCode < 400) {
pageError = undefined;
} }
if (text && text.trim().length >= 100) break; if (text && text.trim().length >= 100) break;
if (pageStatusCode && pageStatusCode == 404) break; if (pageStatusCode && pageStatusCode == 404) break;
const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
@ -465,12 +315,17 @@ export async function scrapSingleUrl(
content: text, content: text,
markdown: text, markdown: text,
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
extractorOptions.mode === "llm-extraction-from-raw-html"
? rawHtml
: undefined,
metadata: { metadata: {
...metadata, ...metadata,
screenshot: screenshot, screenshot: screenshot,
sourceURL: urlToScrap, sourceURL: urlToScrap,
pageStatusCode: pageStatusCode, pageStatusCode: pageStatusCode,
pageError: pageError pageError: pageError,
}, },
}; };
} else { } else {
@ -478,11 +333,16 @@ export async function scrapSingleUrl(
content: text, content: text,
markdown: text, markdown: text,
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
extractorOptions.mode === "llm-extraction-from-raw-html"
? rawHtml
: undefined,
metadata: { metadata: {
...metadata, ...metadata,
sourceURL: urlToScrap, sourceURL: urlToScrap,
pageStatusCode: pageStatusCode, pageStatusCode: pageStatusCode,
pageError: pageError pageError: pageError,
}, },
}; };
} }
@ -497,7 +357,7 @@ export async function scrapSingleUrl(
metadata: { metadata: {
sourceURL: urlToScrap, sourceURL: urlToScrap,
pageStatusCode: pageStatusCode, pageStatusCode: pageStatusCode,
pageError: pageError pageError: pageError,
}, },
} as Document; } as Document;
} }

View File

@ -100,4 +100,76 @@ describe('removeUnwantedElements', () => {
expect(result).not.toContain('id="remove-this"'); expect(result).not.toContain('id="remove-this"');
expect(result).toContain('class="keep"'); expect(result).toContain('class="keep"');
}); });
it('should only include specified tags', () => {
const html = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['main', 'footer'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main>Main Content</main>');
expect(result).toContain('<footer>Footer Content</footer>');
expect(result).not.toContain('<aside>');
});
it('should handle multiple specified tags', () => {
const html = `<div><header>Header Content</header><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['header', 'main', 'footer'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<header>Header Content</header>');
expect(result).toContain('<main>Main Content</main>');
expect(result).toContain('<footer>Footer Content</footer>');
expect(result).not.toContain('<aside>');
});
it('should handle nested specified tags', () => {
const html = `<div><main><section>Main Section</section></main><aside>Remove</aside><footer>Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['main', 'footer'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main><section>Main Section</section></main>');
expect(result).toContain('<footer>Footer Content</footer>');
expect(result).not.toContain('<aside>');
});
it('should return the full content when no tags are specified', () => { it('should return the full content when no tags are specified', () => {
const html = `<html><body><div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div></body></html>`;
const options: PageOptions = { onlyIncludeTags: [] };
const result = removeUnwantedElements(html, options);
expect(result).toBe(html);
});
it('should handle specified tags as a string', () => {
const html = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: 'main' };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main>Main Content</main>');
expect(result).not.toContain('<aside>');
expect(result).not.toContain('<footer>');
});
it('should include specified tags with class', () => {
const html = `<div><main class="main-content">Main Content</main><aside class="remove">Remove</aside><footer class="footer-content">Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['.main-content', '.footer-content'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main class="main-content">Main Content</main>');
expect(result).toContain('<footer class="footer-content">Footer Content</footer>');
expect(result).not.toContain('<aside class="remove">');
});
it('should include specified tags with id', () => {
const html = `<div><main id="main-content">Main Content</main><aside id="remove">Remove</aside><footer id="footer-content">Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['#main-content', '#footer-content'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main id="main-content">Main Content</main>');
expect(result).toContain('<footer id="footer-content">Footer Content</footer>');
expect(result).not.toContain('<aside id="remove">');
});
it('should include specified tags with mixed class and id', () => {
const html = `<div><main class="main-content">Main Content</main><aside id="remove">Remove</aside><footer id="footer-content">Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['.main-content', '#footer-content'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main class="main-content">Main Content</main>');
expect(result).toContain('<footer id="footer-content">Footer Content</footer>');
expect(result).not.toContain('<aside id="remove">');
});
}); });

View File

@ -2,31 +2,51 @@ import cheerio, { AnyNode, Cheerio } from "cheerio";
import { PageOptions } from "../../../lib/entities"; import { PageOptions } from "../../../lib/entities";
import { excludeNonMainTags } from "./excludeTags"; import { excludeNonMainTags } from "./excludeTags";
export const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { export const removeUnwantedElements = (
html: string,
pageOptions: PageOptions
) => {
const soup = cheerio.load(html); const soup = cheerio.load(html);
if (pageOptions.onlyIncludeTags) {
if (typeof pageOptions.onlyIncludeTags === "string") {
pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
}
if (pageOptions.onlyIncludeTags.length !== 0) {
// Create a new root element to hold the tags to keep
const newRoot = cheerio.load("<div></div>")("div");
pageOptions.onlyIncludeTags.forEach((tag) => {
soup(tag).each((index, element) => {
newRoot.append(soup(element).clone());
});
});
return newRoot.html();
}
}
soup("script, style, iframe, noscript, meta, head").remove(); soup("script, style, iframe, noscript, meta, head").remove();
if (pageOptions.removeTags) { if (pageOptions.removeTags) {
if (typeof pageOptions.removeTags === 'string') { if (typeof pageOptions.removeTags === "string") {
pageOptions.removeTags = [pageOptions.removeTags]; pageOptions.removeTags = [pageOptions.removeTags];
} }
if (Array.isArray(pageOptions.removeTags)) { if (Array.isArray(pageOptions.removeTags)) {
pageOptions.removeTags.forEach((tag) => { pageOptions.removeTags.forEach((tag) => {
let elementsToRemove: Cheerio<AnyNode>; let elementsToRemove: Cheerio<AnyNode>;
if (tag.startsWith("*") && tag.endsWith("*")) { if (tag.startsWith("*") && tag.endsWith("*")) {
let classMatch = false; let classMatch = false;
const regexPattern = new RegExp(tag.slice(1, -1), 'i'); const regexPattern = new RegExp(tag.slice(1, -1), "i");
elementsToRemove = soup('*').filter((i, element) => { elementsToRemove = soup("*").filter((i, element) => {
if (element.type === 'tag') { if (element.type === "tag") {
const attributes = element.attribs; const attributes = element.attribs;
const tagNameMatches = regexPattern.test(element.name); const tagNameMatches = regexPattern.test(element.name);
const attributesMatch = Object.keys(attributes).some(attr => const attributesMatch = Object.keys(attributes).some((attr) =>
regexPattern.test(`${attr}="${attributes[attr]}"`) regexPattern.test(`${attr}="${attributes[attr]}"`)
); );
if (tag.startsWith('*.')) { if (tag.startsWith("*.")) {
classMatch = Object.keys(attributes).some(attr => classMatch = Object.keys(attributes).some((attr) =>
regexPattern.test(`class="${attributes[attr]}"`) regexPattern.test(`class="${attributes[attr]}"`)
); );
} }
@ -41,7 +61,7 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
}); });
} }
} }
if (pageOptions.onlyMainContent) { if (pageOptions.onlyMainContent) {
excludeNonMainTags.forEach((tag) => { excludeNonMainTags.forEach((tag) => {
const elementsToRemove = soup(tag); const elementsToRemove = soup(tag);
@ -50,4 +70,4 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
} }
const cleanedHtml = soup.html(); const cleanedHtml = soup.html();
return cleanedHtml; return cleanedHtml;
}; };

View File

@ -1,4 +1,4 @@
import { ExtractorOptions } from './../../lib/entities'; import { ExtractorOptions } from "./../../lib/entities";
import { supabase_service } from "../supabase"; import { supabase_service } from "../supabase";
import { FirecrawlJob } from "../../types"; import { FirecrawlJob } from "../../types";
import { posthog } from "../posthog"; import { posthog } from "../posthog";
@ -10,6 +10,16 @@ export async function logJob(job: FirecrawlJob) {
return; return;
} }
// Redact any pages that have an authorization header
if (
job.pageOptions &&
job.pageOptions.headers &&
job.pageOptions.headers["Authorization"]
) {
job.pageOptions.headers["Authorization"] = "REDACTED";
job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }];
}
const { data, error } = await supabase_service const { data, error } = await supabase_service
.from("firecrawl_jobs") .from("firecrawl_jobs")
.insert([ .insert([
@ -27,35 +37,34 @@ export async function logJob(job: FirecrawlJob) {
page_options: job.pageOptions, page_options: job.pageOptions,
origin: job.origin, origin: job.origin,
extractor_options: job.extractor_options, extractor_options: job.extractor_options,
num_tokens: job.num_tokens num_tokens: job.num_tokens,
}, },
]); ]);
if (process.env.POSTHOG_API_KEY) { if (process.env.POSTHOG_API_KEY) {
let phLog = {
let phLog = { distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user ...(job.team_id !== "preview" && {
...(job.team_id !== "preview" && { groups: { team: job.team_id },
groups: { team: job.team_id } }), //* Identifying event on this team
}), //* Identifying event on this team event: "job-logged",
event: "job-logged", properties: {
properties: { success: job.success,
success: job.success, message: job.message,
message: job.message, num_docs: job.num_docs,
num_docs: job.num_docs, time_taken: job.time_taken,
time_taken: job.time_taken, team_id: job.team_id === "preview" ? null : job.team_id,
team_id: job.team_id === "preview" ? null : job.team_id, mode: job.mode,
mode: job.mode, url: job.url,
url: job.url, crawler_options: job.crawlerOptions,
crawler_options: job.crawlerOptions, page_options: job.pageOptions,
page_options: job.pageOptions, origin: job.origin,
origin: job.origin, extractor_options: job.extractor_options,
extractor_options: job.extractor_options, num_tokens: job.num_tokens,
num_tokens: job.num_tokens },
}, };
} posthog.capture(phLog);
posthog.capture(phLog); }
}
if (error) { if (error) {
console.error("Error logging job:\n", error); console.error("Error logging job:\n", error);
} }

View File

@ -0,0 +1,47 @@
import "dotenv/config";
import { ScrapeLog } from "../../types";
import { supabase_service } from "../supabase";
import { PageOptions } from "../../lib/entities";
export async function logScrape(
scrapeLog: ScrapeLog,
pageOptions?: PageOptions
) {
try {
// Only log jobs in production
// if (process.env.ENV !== "production") {
// return;
// }
// Redact any pages that have an authorization header
if (
pageOptions &&
pageOptions.headers &&
pageOptions.headers["Authorization"]
) {
scrapeLog.html = "REDACTED DUE TO AUTHORIZATION HEADER";
}
const { data, error } = await supabase_service.from("scrape_logs").insert([
{
url: scrapeLog.url,
scraper: scrapeLog.scraper,
success: scrapeLog.success,
response_code: scrapeLog.response_code,
time_taken_seconds: scrapeLog.time_taken_seconds,
proxy: scrapeLog.proxy,
retried: scrapeLog.retried,
error_message: scrapeLog.error_message,
date_added: new Date().toISOString(),
html: scrapeLog.html,
ipv4_support: scrapeLog.ipv4_support,
ipv6_support: scrapeLog.ipv6_support,
},
]);
if (error) {
console.error("Error logging proxy:\n", error);
}
} catch (error) {
console.error("Error logging proxy:\n", error);
}
}

View File

@ -114,4 +114,19 @@ export enum NotificationType {
APPROACHING_LIMIT = "approachingLimit", APPROACHING_LIMIT = "approachingLimit",
LIMIT_REACHED = "limitReached", LIMIT_REACHED = "limitReached",
RATE_LIMIT_REACHED = "rateLimitReached", RATE_LIMIT_REACHED = "rateLimitReached",
} }
export type ScrapeLog = {
url: string;
scraper: string;
success?: boolean;
response_code?: number;
time_taken_seconds?: number;
proxy?: string;
retried?: boolean;
error_message?: string;
date_added?: string; // ISO 8601 format
html?: string;
ipv4_support?: boolean | null;
ipv6_support?: boolean | null;
};

View File

@ -107,7 +107,7 @@ export interface Params {
[key: string]: any; [key: string]: any;
extractorOptions?: { extractorOptions?: {
extractionSchema: z.ZodSchema | any; extractionSchema: z.ZodSchema | any;
mode?: "llm-extraction"; mode?: "llm-extraction" | "llm-extraction-from-raw-html";
extractionPrompt?: string; extractionPrompt?: string;
}; };
} }

View File

@ -0,0 +1,19 @@
FROM node:18-slim
WORKDIR /usr/src/app
COPY package*.json ./
RUN npm install
COPY . .
# Install Playwright dependencies
RUN npx playwright install --with-deps
RUN npm run build
ARG PORT
ENV PORT=${PORT}
EXPOSE ${PORT}
CMD [ "npm", "start" ]

View File

@ -0,0 +1,47 @@
# Playwright Scrape API
This is a simple web scraping service built with Express and Playwright.
## Features
- Scrapes HTML content from specified URLs.
- Blocks requests to known ad-serving domains.
- Blocks media files to reduce bandwidth usage.
- Uses random user-agent strings to avoid detection.
- Falls back from a normal page load to a `networkidle` wait strategy to ensure the page is fully rendered.
## Install
```bash
npm install
npx playwright install
```
## RUN
```bash
npm run build
npm start
```
OR
```bash
npm run dev
```
## USE
```bash
curl -X POST http://localhost:3000/scrape \
-H "Content-Type: application/json" \
-d '{
"url": "https://example.com",
"wait_after_load": 1000,
"timeout": 15000,
"headers": {
"Custom-Header": "value"
},
"check_selector": "#content"
}'
```
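
On success the service replies with the rendered HTML plus status metadata. A representative response with illustrative values — the field names come from the `/scrape` handler in `api.ts` later in this diff, where `pageError` is `false` for a 200 response:

```json
{
  "content": "<html>...</html>",
  "pageStatusCode": 200,
  "pageError": false
}
```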
## USING WITH FIRECRAWL
Add `PLAYWRIGHT_MICROSERVICE_URL=http://localhost:3003/scrape` to `/apps/api/.env` to configure the API to use this Playwright microservice for scraping operations.
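
The service itself is configured through environment variables read in `api.ts` (`PORT`, `PROXY_SERVER`, `PROXY_USERNAME`, `PROXY_PASSWORD`, `BLOCK_MEDIA`). A minimal `.env` sketch for local use, assuming no proxy is needed; the values below are placeholders, not defaults shipped with the repo:

```bash
# Example .env for the Playwright service (sketch)
PORT=3003
# Optional: route traffic through a proxy to avoid IP blocks
# PROXY_SERVER=http://my-proxy:8080
# PROXY_USERNAME=user
# PROXY_PASSWORD=pass
# Optional: skip images/audio/video to save bandwidth
BLOCK_MEDIA=True
```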

View File

@ -0,0 +1,227 @@
import express, { Request, Response } from 'express';
import bodyParser from 'body-parser';
import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest } from 'playwright';
import dotenv from 'dotenv';
import randomUseragent from 'random-useragent';
import { getError } from './helpers/get_error';
dotenv.config();
const app = express();
const port = process.env.PORT || 3003;
app.use(bodyParser.json());
const BLOCK_MEDIA = (process.env.BLOCK_MEDIA || 'False').toUpperCase() === 'TRUE';
const PROXY_SERVER = process.env.PROXY_SERVER || null;
const PROXY_USERNAME = process.env.PROXY_USERNAME || null;
const PROXY_PASSWORD = process.env.PROXY_PASSWORD || null;
const AD_SERVING_DOMAINS = [
'doubleclick.net',
'adservice.google.com',
'googlesyndication.com',
'googletagservices.com',
'googletagmanager.com',
'google-analytics.com',
'adsystem.com',
'adservice.com',
'adnxs.com',
'ads-twitter.com',
'facebook.net',
'fbcdn.net',
'amazon-adsystem.com'
];
interface UrlModel {
url: string;
wait_after_load?: number;
timeout?: number;
headers?: { [key: string]: string };
check_selector?: string;
}
let browser: Browser;
let context: BrowserContext;
const initializeBrowser = async () => {
browser = await chromium.launch({
headless: true,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--no-first-run',
'--no-zygote',
'--single-process',
'--disable-gpu'
]
});
const userAgent = randomUseragent.getRandom();
const viewport = { width: 1280, height: 800 };
const contextOptions: any = {
userAgent,
viewport,
};
if (PROXY_SERVER && PROXY_USERNAME && PROXY_PASSWORD) {
contextOptions.proxy = {
server: PROXY_SERVER,
username: PROXY_USERNAME,
password: PROXY_PASSWORD,
};
} else if (PROXY_SERVER) {
contextOptions.proxy = {
server: PROXY_SERVER,
};
}
context = await browser.newContext(contextOptions);
if (BLOCK_MEDIA) {
await context.route('**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}', async (route: Route, request: PlaywrightRequest) => {
await route.abort();
});
}
// Intercept all requests to avoid loading ads
await context.route('**/*', (route: Route, request: PlaywrightRequest) => {
const requestUrl = new URL(request.url());
const hostname = requestUrl.hostname;
if (AD_SERVING_DOMAINS.some(domain => hostname.includes(domain))) {
console.log(hostname);
return route.abort();
}
return route.continue();
});
};
const shutdownBrowser = async () => {
if (context) {
await context.close();
}
if (browser) {
await browser.close();
}
};
const isValidUrl = (urlString: string): boolean => {
try {
new URL(urlString);
return true;
} catch (_) {
return false;
}
};
const scrapePage = async (page: any, url: string, waitUntil: 'load' | 'networkidle', waitAfterLoad: number, timeout: number, checkSelector: string | undefined) => {
console.log(`Navigating to ${url} with waitUntil: ${waitUntil} and timeout: ${timeout}ms`);
const response = await page.goto(url, { waitUntil, timeout });
if (waitAfterLoad > 0) {
await page.waitForTimeout(waitAfterLoad);
}
if (checkSelector) {
try {
await page.waitForSelector(checkSelector, { timeout });
} catch (error) {
throw new Error('Required selector not found');
}
}
return {
content: await page.content(),
status: response ? response.status() : null,
};
};
app.post('/scrape', async (req: Request, res: Response) => {
const { url, wait_after_load = 0, timeout = 15000, headers, check_selector }: UrlModel = req.body;
console.log(`================= Scrape Request =================`);
console.log(`URL: ${url}`);
console.log(`Wait After Load: ${wait_after_load}`);
console.log(`Timeout: ${timeout}`);
console.log(`Headers: ${headers ? JSON.stringify(headers) : 'None'}`);
console.log(`Check Selector: ${check_selector ? check_selector : 'None'}`);
console.log(`==================================================`);
if (!url) {
return res.status(400).json({ error: 'URL is required' });
}
if (!isValidUrl(url)) {
return res.status(400).json({ error: 'Invalid URL' });
}
if (!PROXY_SERVER) {
console.warn('⚠️ WARNING: No proxy server provided. Your IP address may be blocked.');
}
if (!browser || !context) {
await initializeBrowser();
}
const page = await context.newPage();
// Set headers if provided
if (headers) {
await page.setExtraHTTPHeaders(headers);
}
let pageContent;
let pageStatusCode: number | null = null;
try {
// Strategy 1: Normal
console.log('Attempting strategy 1: Normal load');
const result = await scrapePage(page, url, 'load', wait_after_load, timeout, check_selector);
pageContent = result.content;
pageStatusCode = result.status;
} catch (error) {
console.log('Strategy 1 failed, attempting strategy 2: Wait until networkidle');
try {
// Strategy 2: Wait until networkidle
const result = await scrapePage(page, url, 'networkidle', wait_after_load, timeout, check_selector);
pageContent = result.content;
pageStatusCode = result.status;
} catch (finalError) {
await page.close();
return res.status(500).json({ error: 'An error occurred while fetching the page.' });
}
}
const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : false;
if (!pageError) {
console.log(`✅ Scrape successful!`);
} else {
console.log(`🚨 Scrape failed with status code: ${pageStatusCode} ${pageError}`);
}
await page.close();
res.json({
content: pageContent,
pageStatusCode,
pageError
});
});
app.listen(port, () => {
initializeBrowser().then(() => {
console.log(`Server is running on port ${port}`);
});
});
process.on('SIGINT', () => {
shutdownBrowser().then(() => {
console.log('Browser closed');
process.exit(0);
});
});

View File

@ -0,0 +1,73 @@
// Inspired by the firecrawl repo @rafaelsideguide
export const getError = (statusCode: number | null): string | null => {
if (statusCode === null) {
return 'No response received';
}
const errorMessages: { [key: number]: string } = {
300: "Multiple Choices",
301: "Moved Permanently",
302: "Found",
303: "See Other",
304: "Not Modified",
305: "Use Proxy",
307: "Temporary Redirect",
308: "Permanent Redirect",
309: "Resume Incomplete",
310: "Too Many Redirects",
311: "Unavailable For Legal Reasons",
312: "Previously Used",
313: "I'm Used",
314: "Switch Proxy",
315: "Temporary Redirect",
316: "Resume Incomplete",
317: "Too Many Redirects",
400: "Bad Request",
401: "Unauthorized",
403: "Forbidden",
404: "Not Found",
405: "Method Not Allowed",
406: "Not Acceptable",
407: "Proxy Authentication Required",
408: "Request Timeout",
409: "Conflict",
410: "Gone",
411: "Length Required",
412: "Precondition Failed",
413: "Payload Too Large",
414: "URI Too Long",
415: "Unsupported Media Type",
416: "Range Not Satisfiable",
417: "Expectation Failed",
418: "I'm a teapot",
421: "Misdirected Request",
422: "Unprocessable Entity",
423: "Locked",
424: "Failed Dependency",
425: "Too Early",
426: "Upgrade Required",
428: "Precondition Required",
429: "Too Many Requests",
431: "Request Header Fields Too Large",
451: "Unavailable For Legal Reasons",
500: "Internal Server Error",
501: "Not Implemented",
502: "Bad Gateway",
503: "Service Unavailable",
504: "Gateway Timeout",
505: "HTTP Version Not Supported",
506: "Variant Also Negotiates",
507: "Insufficient Storage",
508: "Loop Detected",
510: "Not Extended",
511: "Network Authentication Required",
599: "Network Connect Timeout Error"
};
if (statusCode < 300) {
return null;
}
return errorMessages[statusCode] || "Unknown Error";
};

View File

@ -0,0 +1,28 @@
{
"name": "playwright-scraper-api",
"version": "1.0.0",
"description": "scraper api with playwright",
"main": "api.ts",
"scripts": {
"start": "node dist/api.js",
"build": "tsc",
"dev": "ts-node api.ts"
},
"keywords": [],
"author": "Jeff Pereira",
"license": "ISC",
"dependencies": {
"body-parser": "^1.20.2",
"dotenv": "^16.4.5",
"express": "^4.19.2",
"playwright": "^1.45.0",
"random-useragent": "^0.5.0"
},
"devDependencies": {
"@types/express": "^4.17.21",
"@types/node": "^20.14.9",
"@types/random-useragent": "^0.3.3",
"ts-node": "^10.9.2",
"typescript": "^5.5.2"
}
}

View File

@ -0,0 +1,110 @@
{
"compilerOptions": {
/* Visit https://aka.ms/tsconfig to read more about this file */
/* Projects */
// "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
// "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
// "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
// "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
// "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
/* Language and Environment */
"target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
// "jsx": "preserve", /* Specify what JSX code is generated. */
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
// "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
// "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
// "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
// "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
// "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
/* Modules */
"module": "commonjs", /* Specify what module code is generated. */
"rootDir": "./", /* Specify the root folder within your source files. */
// "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
// "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */
// "types": [], /* Specify type package names to be included without being referenced in a source file. */
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
// "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
// "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
// "resolveJsonModule": true, /* Enable importing .json files. */
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
/* JavaScript Support */
// "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
// "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
// "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
/* Emit */
// "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
// "declarationMap": true, /* Create sourcemaps for d.ts files. */
// "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
// "sourceMap": true, /* Create source map files for emitted JavaScript files. */
// "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
// "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
"outDir": "./dist", /* Specify an output folder for all emitted files. */
// "removeComments": true, /* Disable emitting comments. */
// "noEmit": true, /* Disable emitting files from a compilation. */
// "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
// "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
// "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
// "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
// "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
// "newLine": "crlf", /* Set the newline character for emitting files. */
// "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
// "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
// "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
// "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
// "declarationDir": "./", /* Specify the output directory for generated declaration files. */
/* Interop Constraints */
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
// "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
// "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
/* Type Checking */
"strict": true, /* Enable all strict type-checking options. */
// "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
// "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
// "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
// "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
// "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
// "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
// "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
// "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
// "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
// "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
// "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
// "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
// "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
// "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
// "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
// "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
// "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
/* Completeness */
// "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
"skipLibCheck": true /* Skip type checking all .d.ts files. */
},
"include": ["**/*"],
"exclude": ["node_modules", "**/*.spec.ts"]
}

View File

@ -1,4 +1,4 @@
hypercorn==0.16.0 hypercorn==0.17.3
fastapi==0.110.0 fastapi==0.111.0
playwright==1.42.0 playwright==1.44.0
uvicorn uvicorn

View File

@ -142,4 +142,4 @@ Contributions to the Firecrawl Python SDK are welcome! If you find any issues or
## License ## License
The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). The Firecrawl Python SDK is open-source and released under the [AGPL License](https://www.gnu.org/licenses/agpl-3.0.en.html).

View File

@ -13,7 +13,7 @@ dependencies = [
] ]
authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}] authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}] maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
license = {text = "GNU General Public License v3 (GPLv3)"} license = {text = "GNU Affero General Public License v3 (AGPLv3)"}
classifiers = [ classifiers = [
"Development Status :: 5 - Production/Stable", "Development Status :: 5 - Production/Stable",
@ -45,4 +45,4 @@ keywords = ["SDK", "API", "firecrawl"]
"Tracker" = "https://github.com/mendableai/firecrawl/issues" "Tracker" = "https://github.com/mendableai/firecrawl/issues"
[tool.setuptools.packages.find] [tool.setuptools.packages.find]
where = ["."] where = ["."]

View File

@ -11,7 +11,7 @@
"author": "", "author": "",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"@anthropic-ai/sdk": "^0.20.8", "@anthropic-ai/sdk": "^0.24.3",
"@dqbd/tiktoken": "^1.0.14", "@dqbd/tiktoken": "^1.0.14",
"@supabase/supabase-js": "^2.43.1", "@supabase/supabase-js": "^2.43.1",
"dotenv": "^16.4.5", "dotenv": "^16.4.5",

File diff suppressed because it is too large

View File

@ -1,5 +1,31 @@
name: firecrawl name: firecrawl
version: '3.9' version: '3.9'
x-common-service: &common-service
build: apps/api
networks:
- backend
environment:
- REDIS_URL=${REDIS_URL:-redis://redis:6379}
- PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
- USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
- PORT=${PORT:-3002}
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
- OPENAI_API_KEY=${OPENAI_API_KEY}
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
- SERPER_API_KEY=${SERPER_API_KEY}
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
- LOGTAIL_KEY=${LOGTAIL_KEY}
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
- TEST_API_KEY=${TEST_API_KEY}
- POSTHOG_API_KEY=${POSTHOG_API_KEY}
- POSTHOG_HOST=${POSTHOG_HOST}
- SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
- SUPABASE_URL=${SUPABASE_URL}
- SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
- HOST=${HOST:-0.0.0.0}
services: services:
playwright-service: playwright-service:
build: apps/playwright-service build: apps/playwright-service
@ -11,66 +37,23 @@ services:
- BLOCK_MEDIA=${BLOCK_MEDIA} - BLOCK_MEDIA=${BLOCK_MEDIA}
networks: networks:
- backend - backend
api: api:
build: apps/api <<: *common-service
environment:
- REDIS_URL=${REDIS_URL:-redis://redis:6379}
- PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
- USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
- PORT=${PORT:-3002}
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
- OPENAI_API_KEY=${OPENAI_API_KEY}
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
- SERPER_API_KEY=${SERPER_API_KEY}
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
- LOGTAIL_KEY=${LOGTAIL_KEY}
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
- TEST_API_KEY=${TEST_API_KEY}
- POSTHOG_API_KEY=${POSTHOG_API_KEY}
- POSTHOG_HOST=${POSTHOG_HOST}
- SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
- SUPABASE_URL=${SUPABASE_URL}
- SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
- HOST=${HOST:-0.0.0.0}
depends_on: depends_on:
- redis - redis
- playwright-service - playwright-service
ports: ports:
- "3002:3002" - "3002:3002"
command: [ "pnpm", "run", "start:production" ] command: [ "pnpm", "run", "start:production" ]
networks:
- backend
worker: worker:
build: apps/api <<: *common-service
environment:
- REDIS_URL=${REDIS_URL:-redis://redis:6379}
- PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
- USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
- PORT=${PORT:-3002}
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
- OPENAI_API_KEY=${OPENAI_API_KEY}
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
- SERPER_API_KEY=${SERPER_API_KEY}
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
- LOGTAIL_KEY=${LOGTAIL_KEY}
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
- TEST_API_KEY=${TEST_API_KEY}
- POSTHOG_API_KEY=${POSTHOG_API_KEY}
- POSTHOG_HOST=${POSTHOG_HOST}
- SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
- SUPABASE_URL=${SUPABASE_URL}
- SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
- HOST=${HOST:-0.0.0.0}
depends_on: depends_on:
- redis - redis
- playwright-service - playwright-service
- api - api
networks:
- backend
redis: redis:
image: redis:alpine image: redis:alpine
networks: networks: