From 5c81ea1803e08185cae2673ea1d7864e79271bbf Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 9 Dec 2024 15:34:50 -0300 Subject: [PATCH 01/16] fixed optional+default bug on llm schema --- .../scrapeURL/transformers/llmExtract.ts | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 1c6adcd1..71a46406 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -99,6 +99,10 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract } let schema = options.schema; + if (schema) { + schema = removeDefaultProperty(schema); + } + if (schema && schema.type === "array") { schema = { type: "object", @@ -112,7 +116,9 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract schema = { type: "object", properties: Object.fromEntries( - Object.entries(schema).map(([key, value]) => [key, { type: value }]) + Object.entries(schema).map(([key, value]) => { + return [key, removeDefaultProperty(value)]; + }) ), required: Object.keys(schema), additionalProperties: false @@ -192,3 +198,19 @@ export async function performLLMExtract(meta: Meta, document: Document): Promise return document; } + +function removeDefaultProperty(schema: any): any { + if (typeof schema !== 'object' || schema === null) return schema; + + const { default: _, ...rest } = schema; + + for (const key in rest) { + if (Array.isArray(rest[key])) { + rest[key] = rest[key].map((item: any) => removeDefaultProperty(item)); + } else if (typeof rest[key] === 'object' && rest[key] !== null) { + rest[key] = removeDefaultProperty(rest[key]); + } + } + + return rest; +} \ No newline at end of file From eab30c474b19b8ffb89f4d64e83919fec99b0f5c Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 16 Dec 2024 09:30:40 -0300 Subject: [PATCH 02/16] added unit tests --- .../scrapeURL/transformers/llmExtract.test.ts | 33 +++++++++++++++++++ .../scrapeURL/transformers/llmExtract.ts | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts new file mode 100644 index 00000000..f23f506f --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts @@ -0,0 +1,33 @@ +import { removeDefaultProperty } from "./llmExtract"; + +describe("removeDefaultProperty", () => { + it("should remove the default property from a simple object", () => { + const input = { default: "test", test: "test" }; + const expectedOutput = { test: "test" }; + expect(removeDefaultProperty(input)).toEqual(expectedOutput); + }); + + it("should remove the default property from a nested object", () => { + const input = { default: "test", nested: { default: "nestedTest", test: "nestedTest" } }; + const expectedOutput = { nested: { test: "nestedTest" } }; + expect(removeDefaultProperty(input)).toEqual(expectedOutput); + }); + + it("should remove the default property from an array of objects", () => { + const input = { array: [{ default: "test1", test: "test1" }, { default: "test2", test: "test2" }] }; + const expectedOutput = { array: [{ test: "test1" }, { test: "test2" 
}] }; + expect(removeDefaultProperty(input)).toEqual(expectedOutput); + }); + + it("should handle objects without a default property", () => { + const input = { test: "test" }; + const expectedOutput = { test: "test" }; + expect(removeDefaultProperty(input)).toEqual(expectedOutput); + }); + + it("should handle null and non-object inputs", () => { + expect(removeDefaultProperty(null)).toBeNull(); + expect(removeDefaultProperty("string")).toBe("string"); + expect(removeDefaultProperty(123)).toBe(123); + }); +}); \ No newline at end of file diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 71a46406..c35e20c1 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -199,7 +199,7 @@ export async function performLLMExtract(meta: Meta, document: Document): Promise return document; } -function removeDefaultProperty(schema: any): any { +export function removeDefaultProperty(schema: any): any { if (typeof schema !== 'object' || schema === null) return schema; const { default: _, ...rest } = schema; From b6802bc443a5d68679af8dfe58737e0d99be26c4 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:41:59 -0300 Subject: [PATCH 03/16] merged with main --- .../scrapeURL/transformers/llmExtract.ts | 148 ++++-------------- 1 file changed, 28 insertions(+), 120 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 7b518300..c189c8f7 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -121,6 +121,10 @@ export async function generateOpenAICompletions( } let schema = options.schema; + if (schema) { + schema = removeDefaultProperty(schema); +} + if (schema && schema.type === "array") { schema = { type: "object", @@ -134,10 +138,12 @@ export async function generateOpenAICompletions( schema = { type: "object", properties: Object.fromEntries( - Object.entries(schema).map(([key, value]) => [key, { type: value }]), + Object.entries(schema).map(([key, value]) => { + return [key, removeDefaultProperty(value)]; + }) ), required: Object.keys(schema), - additionalProperties: false, + additionalProperties: false }; } @@ -183,124 +189,6 @@ export async function generateOpenAICompletions( if (extract === null && jsonCompletion.choices[0].message.content !== null) { try { - // Encode the message into tokens - const tokens = encoder.encode(markdown); - - // Return the number of tokens - numTokens = tokens.length; - } catch (error) { - logger.warn("Calculating num tokens of string failed", { error, markdown }); - - markdown = markdown.slice(0, maxTokens * modifier); - - let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support."; - warning = previousWarning === undefined ? w : w + " " + previousWarning; - } finally { - // Free the encoder resources after use - encoder.free(); - } - - if (numTokens > maxTokens) { - // trim the document to the maximum number of tokens, tokens != characters - markdown = markdown.slice(0, maxTokens * modifier); - - const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). 
-- the input has been automatically trimmed."; - warning = previousWarning === undefined ? w : w + " " + previousWarning; - } - - let schema = options.schema; - if (schema) { - schema = removeDefaultProperty(schema); - } - - if (schema && schema.type === "array") { - schema = { - type: "object", - properties: { - items: options.schema, - }, - required: ["items"], - additionalProperties: false, - }; - } else if (schema && typeof schema === 'object' && !schema.type) { - schema = { - type: "object", - properties: Object.fromEntries( - Object.entries(schema).map(([key, value]) => { - return [key, removeDefaultProperty(value)]; - }) - ), - required: Object.keys(schema), - additionalProperties: false - }; - } - - schema = normalizeSchema(schema); - - const jsonCompletion = await openai.beta.chat.completions.parse({ - model, - temperature: 0, - messages: [ - { - role: "system", - content: options.systemPrompt, - }, - { - role: "user", - content: [{ type: "text", text: markdown }], - }, - { - role: "user", - content: options.prompt !== undefined - ? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}` - : "Transform the above content into structured JSON output.", - }, - ], - response_format: options.schema ? { - type: "json_schema", - json_schema: { - name: "websiteContent", - schema: schema, - strict: true, - } - } : { type: "json_object" }, - }); - - if (jsonCompletion.choices[0].message.refusal !== null) { - throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal); - } - - extract = jsonCompletion.choices[0].message.parsed; - - if (extract === null && jsonCompletion.choices[0].message.content !== null) { - try { - if (!isExtractEndpoint) { - extract = JSON.parse(jsonCompletion.choices[0].message.content); - } else { - const extractData = JSON.parse(jsonCompletion.choices[0].message.content); - extract = options.schema ? extractData.data.extract : extractData; - } - } catch (e) { - logger.error("Failed to parse returned JSON, no schema specified.", { error: e }); - throw new LLMRefusalError("Failed to parse returned JSON. 
Please specify a schema in the extract object."); - } - } - - // If the users actually wants the items object, they can specify it as 'required' in the schema - // otherwise, we just return the items array - if (options.schema && options.schema.type === "array" && !schema?.required?.includes("items")) { - extract = extract?.items; - } - return { extract, warning, numTokens }; -} - -export async function performLLMExtract(meta: Meta, document: Document): Promise { - if (meta.options.formats.includes("extract")) { - const { extract, warning } = await generateOpenAICompletions( - meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), - meta.options.extract!, - document.markdown, - document.warning, if (!isExtractEndpoint) { extract = JSON.parse(jsonCompletion.choices[0].message.content); } else { @@ -331,6 +219,26 @@ export async function performLLMExtract(meta: Meta, document: Document): Promise return { extract, warning, numTokens }; } +export async function performLLMExtract( + meta: Meta, + document: Document, +): Promise { + if (meta.options.formats.includes("extract")) { + const { extract, warning } = await generateOpenAICompletions( + meta.logger.child({ + method: "performLLMExtract/generateOpenAICompletions", + }), + meta.options.extract!, + document.markdown, + document.warning, + ); + document.extract = extract; + document.warning = warning; + } + + return document; +} + export function removeDefaultProperty(schema: any): any { if (typeof schema !== 'object' || schema === null) return schema; From d8150c61714cfd320190c08263ae4722e38b95ad Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:46:56 -0300 Subject: [PATCH 04/16] added type to reqs example --- apps/api/requests.http | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 0e3b9206..5d99bce9 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -19,7 +19,7 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { - "url": "firecrawl.dev" + "url": "v" } ### Check Crawl Status @@ -70,8 +70,8 @@ content-type: application/json "urls": ["firecrawl.dev"], "prompt": "What is the title, description and main product of the page?", "schema": { - "title": "string", - "description": "string", - "mainProduct": "string" + "title": { "type": "string" }, + "description": { "type": "string" }, + "mainProduct": { "type": "string" } } } \ No newline at end of file From 2c233bd3213cab52bdb493313d86a45b86dd90b4 Mon Sep 17 00:00:00 2001 From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:48:48 -0300 Subject: [PATCH 05/16] Update requests.http --- apps/api/requests.http | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 5d99bce9..8aa3788d 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -19,7 +19,7 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { - "url": "v" + "url": "firecrawl.dev" } ### Check Crawl Status @@ -74,4 +74,4 @@ content-type: application/json "description": { "type": "string" }, "mainProduct": { "type": "string" } } -} \ No newline at end of file +} From 29cea4c51d17d9e1ea64b9e0053fa9261bf2fc7c Mon Sep 17 00:00:00 2001 From: RutamBhagat Date: Tue, 17 Dec 2024 13:31:35 -0800 Subject: [PATCH 06/16] feat(python-sdk): improve API key handling for cloud vs 
self-hosted services in FirecrawlApp --- .../__tests__/v1/e2e_withAuth/test.py | 56 +++++++++++++------ apps/python-sdk/firecrawl/firecrawl.py | 27 +++++---- apps/python-sdk/pyproject.toml | 4 +- apps/python-sdk/requirements.txt | 4 +- 4 files changed, 60 insertions(+), 31 deletions(-) diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py index 12fa10ce..1ed53968 100644 --- a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py @@ -8,7 +8,7 @@ from datetime import datetime load_dotenv() -API_URL = "http://127.0.0.1:3002"; +API_URL = os.getenv('API_URL', 'http://127.0.0.1:3002') ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py" TEST_API_KEY = os.getenv('TEST_API_KEY') @@ -20,15 +20,26 @@ spec.loader.exec_module(firecrawl) FirecrawlApp = firecrawl.FirecrawlApp def test_no_api_key(): - with pytest.raises(Exception) as excinfo: - invalid_app = FirecrawlApp(api_url=API_URL) - assert "No API key provided" in str(excinfo.value) + if 'api.firecrawl.dev' in API_URL: + with pytest.raises(Exception) as excinfo: + invalid_app = FirecrawlApp(api_url=API_URL) + assert "No API key provided" in str(excinfo.value) + else: + # Should not raise error for self-hosted + app = FirecrawlApp(api_url=API_URL) + assert app is not None def test_scrape_url_invalid_api_key(): - invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") - with pytest.raises(Exception) as excinfo: - invalid_app.scrape_url('https://firecrawl.dev') - assert "Unauthorized: Invalid token" in str(excinfo.value) + if 'api.firecrawl.dev' in API_URL: + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + with pytest.raises(Exception) as excinfo: + invalid_app.scrape_url('https://firecrawl.dev') + assert "Unauthorized: Invalid token" in str(excinfo.value) + else: + # Should work without API key for self-hosted + app = FirecrawlApp(api_url=API_URL) + response = app.scrape_url('https://firecrawl.dev') + assert response is not None def test_blocklisted_url(): blocklisted_url = "https://facebook.com/fake-test" @@ -131,10 +142,16 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown'] def test_crawl_url_invalid_api_key(): - invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") - with pytest.raises(Exception) as excinfo: - invalid_app.crawl_url('https://firecrawl.dev') - assert "Unauthorized: Invalid token" in str(excinfo.value) + if 'api.firecrawl.dev' in API_URL: + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + with pytest.raises(Exception) as excinfo: + invalid_app.crawl_url('https://firecrawl.dev') + assert "Unauthorized: Invalid token" in str(excinfo.value) + else: + # Should work without API key for self-hosted + app = FirecrawlApp(api_url=API_URL) + response = app.crawl_url('https://firecrawl.dev') + assert response is not None def test_should_return_error_for_blocklisted_url(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) @@ -291,10 +308,16 @@ def test_check_crawl_status_e2e(): assert 'error' not in status_response['data'][0]['metadata'] def test_invalid_api_key_on_map(): - invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL) - with pytest.raises(Exception) as excinfo: - invalid_app.map_url('https://roastmywebsite.ai') - assert "Unauthorized: Invalid 
token" in str(excinfo.value) + if 'api.firecrawl.dev' in API_URL: + invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL) + with pytest.raises(Exception) as excinfo: + invalid_app.map_url('https://roastmywebsite.ai') + assert "Unauthorized: Invalid token" in str(excinfo.value) + else: + # Should work without API key for self-hosted + app = FirecrawlApp(api_url=API_URL) + response = app.map_url('https://roastmywebsite.ai') + assert response is not None def test_blocklisted_url_on_map(): app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) @@ -349,4 +372,3 @@ def test_search_e2e(): # assert isinstance(llm_extraction['is_open_source'], bool) - \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 45ed27d8..4bc6697a 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -40,19 +40,22 @@ class FirecrawlApp: error: Optional[str] = None def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: - """ - Initialize the FirecrawlApp instance with API key, API URL. + """ + Initialize the FirecrawlApp instance with API key, API URL. - Args: - api_key (Optional[str]): API key for authenticating with the Firecrawl API. - api_url (Optional[str]): Base URL for the Firecrawl API. - """ - self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') - self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') - if self.api_key is None: - logger.warning("No API key provided") - raise ValueError('No API key provided') - logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}") + Args: + api_key (Optional[str]): API key for authenticating with the Firecrawl API. + api_url (Optional[str]): Base URL for the Firecrawl API. 
+ """ + self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + + # Only require API key when using cloud service + if 'api.firecrawl.dev' in self.api_url and self.api_key is None: + logger.warning("No API key provided for cloud service") + raise ValueError('No API key provided') + + logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}") def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ diff --git a/apps/python-sdk/pyproject.toml b/apps/python-sdk/pyproject.toml index 87cb91f1..31c16941 100644 --- a/apps/python-sdk/pyproject.toml +++ b/apps/python-sdk/pyproject.toml @@ -12,7 +12,9 @@ dependencies = [ "requests", "python-dotenv", "websockets", - "nest-asyncio" + "nest-asyncio", + "pytest>=8.3.4", + "pydantic>=2.10.3", ] authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}] maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}] diff --git a/apps/python-sdk/requirements.txt b/apps/python-sdk/requirements.txt index db67ceeb..dd4e7a08 100644 --- a/apps/python-sdk/requirements.txt +++ b/apps/python-sdk/requirements.txt @@ -2,4 +2,6 @@ requests pytest python-dotenv websockets -nest-asyncio \ No newline at end of file +nest-asyncio +pytest +pydantic \ No newline at end of file From 2b488cac3d380470e95da10583b48320f7793510 Mon Sep 17 00:00:00 2001 From: RutamBhagat Date: Fri, 20 Dec 2024 01:54:29 -0800 Subject: [PATCH 07/16] chore: remove pytest dependency from pyproject.toml --- apps/python-sdk/pyproject.toml | 1 - apps/python-sdk/requirements.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/apps/python-sdk/pyproject.toml b/apps/python-sdk/pyproject.toml index 31c16941..67082d5e 100644 --- a/apps/python-sdk/pyproject.toml +++ b/apps/python-sdk/pyproject.toml @@ -13,7 +13,6 @@ dependencies = [ "python-dotenv", "websockets", "nest-asyncio", - "pytest>=8.3.4", "pydantic>=2.10.3", ] authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}] diff --git a/apps/python-sdk/requirements.txt b/apps/python-sdk/requirements.txt index dd4e7a08..5dcd8f6c 100644 --- a/apps/python-sdk/requirements.txt +++ b/apps/python-sdk/requirements.txt @@ -3,5 +3,4 @@ pytest python-dotenv websockets nest-asyncio -pytest pydantic \ No newline at end of file From f47e3114d6d08989d00fd527954b22ed7924fee9 Mon Sep 17 00:00:00 2001 From: RutamBhagat Date: Tue, 17 Dec 2024 15:11:23 -0800 Subject: [PATCH 08/16] feat(rust-sdk): improve API key handling for cloud vs self-hosted services in FirecrawlApp --- SELF_HOST.md | 4 ++++ apps/rust-sdk/src/error.rs | 2 +- apps/rust-sdk/src/lib.rs | 19 +++++++++++++++++-- apps/rust-sdk/tests/e2e_with_auth.rs | 28 +++++++++++++++++++++++++++- 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/SELF_HOST.md b/SELF_HOST.md index 46e08db9..e8a3444f 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -116,6 +116,10 @@ If you’d like to test the crawl endpoint, you can run this: This section provides solutions to common issues you might encounter while setting up or running your self-hosted instance of Firecrawl. +### API Keys for SDK Usage + +**Note:** When using Firecrawl SDKs with a self-hosted instance, API keys are optional. API keys are only required when connecting to the cloud service (api.firecrawl.dev). 
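+
+A minimal sketch using the Python SDK (this assumes a self-hosted instance reachable at `http://localhost:3002`; adjust the URL to match your deployment):
+
+```python
+from firecrawl import FirecrawlApp
+
+# No API key required when targeting a self-hosted instance
+# (http://localhost:3002 is an assumed local deployment URL)
+app = FirecrawlApp(api_url="http://localhost:3002")
+result = app.scrape_url("https://firecrawl.dev")
+print(result)
+```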
+ ### Supabase client is not configured **Symptom:** diff --git a/apps/rust-sdk/src/error.rs b/apps/rust-sdk/src/error.rs index f04a286a..33e4edc6 100644 --- a/apps/rust-sdk/src/error.rs +++ b/apps/rust-sdk/src/error.rs @@ -9,7 +9,7 @@ use crate::crawl::CrawlStatus; #[derive(Debug, Deserialize, Serialize, Clone)] pub struct FirecrawlAPIError { /// Always false. - success: bool, + pub success: bool, /// Error message pub error: String, diff --git a/apps/rust-sdk/src/lib.rs b/apps/rust-sdk/src/lib.rs index 38c2dc11..5d95cc7d 100644 --- a/apps/rust-sdk/src/lib.rs +++ b/apps/rust-sdk/src/lib.rs @@ -9,6 +9,7 @@ pub mod map; pub mod scrape; pub use error::FirecrawlError; +use error::FirecrawlAPIError; #[derive(Clone, Debug)] pub struct FirecrawlApp { @@ -18,16 +19,30 @@ pub struct FirecrawlApp { } pub(crate) const API_VERSION: &str = "/v1"; +const CLOUD_API_URL: &str = "https://api.firecrawl.dev"; impl FirecrawlApp { pub fn new(api_key: impl AsRef) -> Result { - FirecrawlApp::new_selfhosted("https://api.firecrawl.dev", Some(api_key)) + FirecrawlApp::new_selfhosted(CLOUD_API_URL, Some(api_key)) } pub fn new_selfhosted(api_url: impl AsRef, api_key: Option>) -> Result { + let url = api_url.as_ref().to_string(); + + if url == CLOUD_API_URL && api_key.is_none() { + return Err(FirecrawlError::APIError( + "Configuration".to_string(), + FirecrawlAPIError { + success: false, + error: "API key is required for cloud service".to_string(), + details: None, + } + )); + } + Ok(FirecrawlApp { api_key: api_key.map(|x| x.as_ref().to_string()), - api_url: api_url.as_ref().to_string(), + api_url: url, client: Client::new(), }) } diff --git a/apps/rust-sdk/tests/e2e_with_auth.rs b/apps/rust-sdk/tests/e2e_with_auth.rs index 75568f92..a9187511 100644 --- a/apps/rust-sdk/tests/e2e_with_auth.rs +++ b/apps/rust-sdk/tests/e2e_with_auth.rs @@ -1,7 +1,7 @@ use assert_matches::assert_matches; use dotenvy::dotenv; use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions}; -use firecrawl::FirecrawlApp; +use firecrawl::{FirecrawlApp, FirecrawlError}; use serde_json::json; use std::env; @@ -154,3 +154,29 @@ async fn test_llm_extraction() { assert!(llm_extraction["supports_sso"].is_boolean()); assert!(llm_extraction["is_open_source"].is_boolean()); } + +#[test] +fn test_api_key_requirements() { + dotenv().ok(); + + let api_url = env::var("API_URL").unwrap_or("http://localhost:3002".to_string()); + let api_key = env::var("TEST_API_KEY").ok(); + + match (api_url.contains("api.firecrawl.dev"), api_key) { + (false, _) => { + let result = FirecrawlApp::new_selfhosted(&api_url, None::); + assert!(result.is_ok(), "Local setup failed: {:?}", result.err().unwrap()); + } + (true, None) => { + let result = FirecrawlApp::new_selfhosted(&api_url, None::); + assert!(matches!( + result, + Err(FirecrawlError::APIError(msg, _)) if msg == "Configuration" + )); + } + (true, Some(key)) => { + let result = FirecrawlApp::new_selfhosted(&api_url, Some(&key)); + assert!(result.is_ok()); + } + } +} From ca2d3dc6d2cf311e45e77d9c48f83c2d14141875 Mon Sep 17 00:00:00 2001 From: RutamBhagat Date: Sat, 21 Dec 2024 06:24:53 -0800 Subject: [PATCH 09/16] docs(credit-usage-api): add new endpoint documentation for credit usage --- apps/api/v1-openapi.json | 82 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/apps/api/v1-openapi.json b/apps/api/v1-openapi.json index 6cd2b3da..5b57e511 100644 --- a/apps/api/v1-openapi.json +++ b/apps/api/v1-openapi.json @@ -719,7 +719,89 @@ } } } + }, + "/credit-usage": { + "get": { + 
"summary": "Get remaining credits for the authenticated team", + "operationId": "getCreditUsage", + "tags": [ + "Billing" + ], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "data": { + "type": "object", + "properties": { + "remaining_credits": { + "type": "number", + "description": "Number of credits remaining for the team", + "example": 1000 + } + } + } + } + } + } + } + }, + "404": { + "description": "Credit usage information not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Could not find credit usage information" + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "error": { + "type": "string", + "example": "Internal server error while fetching credit usage" + } + } + } + } + } + } + } } + } }, "components": { "securitySchemes": { From 2f39bdddd9f76ce9d6a0956620d86a4e070f6c59 Mon Sep 17 00:00:00 2001 From: yujunhui Date: Thu, 26 Dec 2024 17:56:30 +0800 Subject: [PATCH 10/16] fix: merge mock success data --- apps/api/src/lib/withAuth.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index a585fe0a..bec3d4d1 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -17,7 +17,7 @@ export function withAuth( logger.warn("You're bypassing authentication"); warningCount++; } - return { success: true } as T; + return { success: true, ...(mockSuccess || {}) } as T; } else { return await originalFunction(...args); } From 4451c4f67153fae0876f2e7136e2f1a4932374d9 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 26 Dec 2024 13:51:20 -0300 Subject: [PATCH 11/16] Nick: --- apps/api/src/scraper/WebScraper/crawler.ts | 83 +++++++++++++++------- apps/api/src/scraper/WebScraper/sitemap.ts | 59 +++++++++++---- 2 files changed, 104 insertions(+), 38 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 2e47d352..41bee2d6 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -446,44 +446,75 @@ export class WebCrawler { }; const sitemapUrl = url.endsWith(".xml") ? 
url : `${url}/sitemap.xml`; - let sitemapLinks: string[] = []; + // Try to get sitemap from the provided URL first try { - const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); - if (response.status === 200) { - sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger); - } + sitemapLinks = await getLinksFromSitemap( + { sitemapUrl, allUrls: [], mode: "fire-engine" }, + this.logger, + ); } catch (error) { this.logger.debug( - `Failed to fetch sitemap with axios from ${sitemapUrl}`, + `Failed to fetch sitemap from ${sitemapUrl}`, { method: "tryFetchSitemapLinks", sitemapUrl, error }, ); - if (error instanceof AxiosError && error.response?.status === 404) { - // ignore 404 - } else { - const response = await getLinksFromSitemap( - { sitemapUrl, mode: "fire-engine" }, - this.logger, - ); - if (response) { - sitemapLinks = response; - } - } } + // If this is a subdomain, also try to get sitemap from the main domain + try { + const urlObj = new URL(url); + const hostname = urlObj.hostname; + const domainParts = hostname.split('.'); + + // Check if this is a subdomain (has more than 2 parts and not www) + if (domainParts.length > 2 && domainParts[0] !== 'www') { + // Get the main domain by taking the last two parts + const mainDomain = domainParts.slice(-2).join('.'); + const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`; + const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`; + + try { + // Get all links from the main domain's sitemap + const mainDomainLinks = await getLinksFromSitemap( + { sitemapUrl: mainDomainSitemapUrl, allUrls: [], mode: "fire-engine" }, + this.logger, + ); + // Filter links to only include those pointing to the current subdomain + const subdomainLinks = mainDomainLinks.filter(link => { + try { + const linkUrl = new URL(link); + return linkUrl.hostname.endsWith(hostname); + } catch { + return false; + } + }); + sitemapLinks = [...new Set([...sitemapLinks, ...subdomainLinks])]; + } catch (error) { + this.logger.debug( + `Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`, + { method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error }, + ); + } + } + } catch (error) { + this.logger.debug(`Error processing main domain sitemap`, { + method: "tryFetchSitemapLinks", + url, + error, + }); + } + + // If no sitemap found yet, try the baseUrl as a last resort if (sitemapLinks.length === 0) { const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; try { - const response = await axios.get(baseUrlSitemap, { - timeout: axiosTimeout, - }); - if (response.status === 200) { - sitemapLinks = await getLinksFromSitemap( - { sitemapUrl: baseUrlSitemap, mode: "fire-engine" }, - this.logger, - ); - } + const baseLinks = await getLinksFromSitemap( + { sitemapUrl: baseUrlSitemap, allUrls: [], mode: "fire-engine" }, + this.logger, + ); + + sitemapLinks = [...new Set([...sitemapLinks, ...baseLinks])]; } catch (error) { this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, { method: "tryFetchSitemapLinks", diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index c080373e..2529c022 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -5,7 +5,9 @@ import { WebCrawler } from "./crawler"; import { scrapeURL } from "../scrapeURL"; import { scrapeOptions } from "../../controllers/v1/types"; import type { Logger } from "winston"; - +const useFireEngine = + process.env.FIRE_ENGINE_BETA_URL !== "" && + 
process.env.FIRE_ENGINE_BETA_URL !== undefined; export async function getLinksFromSitemap( { sitemapUrl, @@ -21,10 +23,7 @@ export async function getLinksFromSitemap( try { let content: string = ""; try { - if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") { - const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); - content = response.data; - } else if (mode === "fire-engine") { + if (mode === "fire-engine" && useFireEngine) { const response = await scrapeURL( "sitemap", sitemapUrl, @@ -35,6 +34,9 @@ export async function getLinksFromSitemap( throw response.error; } content = response.document.rawHtml!; + } else { + const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); + content = response.data; } } catch (error) { logger.error(`Request failed for ${sitemapUrl}`, { @@ -43,7 +45,6 @@ export async function getLinksFromSitemap( sitemapUrl, error, }); - return allUrls; } @@ -51,21 +52,55 @@ export async function getLinksFromSitemap( const root = parsed.urlset || parsed.sitemapindex; if (root && root.sitemap) { - const sitemapPromises = root.sitemap + // Handle sitemap index files + const sitemapUrls = root.sitemap .filter((sitemap) => sitemap.loc && sitemap.loc.length > 0) - .map((sitemap) => + .map((sitemap) => sitemap.loc[0]); + + const sitemapPromises = sitemapUrls.map((sitemapUrl) => + getLinksFromSitemap( + { sitemapUrl, allUrls: [], mode }, + logger, + ), + ); + + const results = await Promise.all(sitemapPromises); + results.forEach(urls => { + allUrls.push(...urls); + }); + } else if (root && root.url) { + // Check if any URLs point to additional sitemaps + const xmlSitemaps = root.url + .filter( + (url) => + url.loc && + url.loc.length > 0 && + url.loc[0].toLowerCase().endsWith('.xml') + ) + .map((url) => url.loc[0]); + + if (xmlSitemaps.length > 0) { + // Recursively fetch links from additional sitemaps + const sitemapPromises = xmlSitemaps.map((sitemapUrl) => getLinksFromSitemap( - { sitemapUrl: sitemap.loc[0], allUrls, mode }, + { sitemapUrl, allUrls: [], mode }, logger, ), ); - await Promise.all(sitemapPromises); - } else if (root && root.url) { + + const results = await Promise.all(sitemapPromises); + results.forEach(urls => { + allUrls.push(...urls); + }); + } + + // Add regular URLs that aren't sitemaps const validUrls = root.url .filter( (url) => url.loc && url.loc.length > 0 && + !url.loc[0].toLowerCase().endsWith('.xml') && !WebCrawler.prototype.isFile(url.loc[0]), ) .map((url) => url.loc[0]); @@ -80,7 +115,7 @@ export async function getLinksFromSitemap( }); } - return allUrls; + return [...new Set(allUrls)]; } export const fetchSitemapData = async ( From 477295131372011ac9d0d5217925d924871985ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 27 Dec 2024 16:44:41 +0100 Subject: [PATCH 12/16] feat(scrapeURL/fire-engine): explicitly delete job after scrape --- .../scrapeURL/engines/fire-engine/index.ts | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index aa869836..aeafebea 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -17,6 +17,7 @@ import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError import * as Sentry from "@sentry/node"; import { Action } from "../../../../lib/entities"; import { specialtyScrapeCheck } from 
"../utils/specialtyHandler"; +import { fireEngineDelete } from "./delete"; // This function does not take `Meta` on purpose. It may not access any // meta values to construct the request -- that must be done by the @@ -44,6 +45,13 @@ async function performFireEngineScrape< while (status === undefined) { if (errors.length >= errorLimit) { logger.error("Error limit hit.", { errors }); + fireEngineDelete( + logger.child({ + method: "performFireEngineScrape/fireEngineDelete", + afterErrors: errors, + }), + scrape.jobId, + ); throw new Error("Error limit hit. See e.cause.errors for errors.", { cause: { errors }, }); @@ -74,6 +82,13 @@ async function performFireEngineScrape< error instanceof ActionError || error instanceof UnsupportedFileError ) { + fireEngineDelete( + logger.child({ + method: "performFireEngineScrape/fireEngineDelete", + afterError: error, + }), + scrape.jobId, + ); logger.debug("Fire-engine scrape job failed.", { error, jobId: scrape.jobId, @@ -105,6 +120,13 @@ async function performFireEngineScrape< status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag } + fireEngineDelete( + logger.child({ + method: "performFireEngineScrape/fireEngineDelete", + }), + scrape.jobId, + ); + return status; } From e8f0a22ebe6e277e6fbb806cde561545ce54a0a1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 27 Dec 2024 13:59:43 -0300 Subject: [PATCH 13/16] Update v1-openapi.json --- apps/api/v1-openapi.json | 1102 ++++++++++++++++++++++++++++++++++---- 1 file changed, 994 insertions(+), 108 deletions(-) diff --git a/apps/api/v1-openapi.json b/apps/api/v1-openapi.json index 5b57e511..9aab05c9 100644 --- a/apps/api/v1-openapi.json +++ b/apps/api/v1-openapi.json @@ -42,7 +42,15 @@ "type": "array", "items": { "type": "string", - "enum": ["markdown", "html", "rawHtml", "links", "screenshot", "extract", "screenshot@fullPage"] + "enum": [ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "extract", + "screenshot@fullPage" + ] }, "description": "Formats to include in the output.", "default": ["markdown"] @@ -75,6 +83,16 @@ "description": "Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.", "default": 0 }, + "mobile": { + "type": "boolean", + "description": "Set to true if you want to emulate scraping from a mobile device. 
Useful for testing responsive pages and taking mobile screenshots.", + "default": false + }, + "skipTlsVerification": { + "type": "boolean", + "description": "Skip TLS certificate verification when making requests", + "default": false + }, "timeout": { "type": "integer", "description": "Timeout in milliseconds for the request", @@ -116,9 +134,391 @@ "type": "integer", "minimum": 1, "description": "Number of milliseconds to wait" + }, + "selector": { + "type": "string", + "description": "Query selector to find the element by", + "example": "#my-element" } }, - "required": ["type", "milliseconds"] + "required": ["type"] + }, + { + "type": "object", + "title": "Screenshot", + "properties": { + "type": { + "type": "string", + "enum": ["screenshot"], + "description": "Take a screenshot" + }, + "fullPage": { + "type": "boolean", + "description": "Should the screenshot be full-page or viewport sized?", + "default": false + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Click", + "properties": { + "type": { + "type": "string", + "enum": ["click"], + "description": "Click on an element" + }, + "selector": { + "type": "string", + "description": "Query selector to find the element by", + "example": "#load-more-button" + } + }, + "required": ["type", "selector"] + }, + { + "type": "object", + "title": "Write text", + "properties": { + "type": { + "type": "string", + "enum": ["write"], + "description": "Write text into an input field, text area, or contenteditable element. Note: You must first focus the element using a 'click' action before writing. The text will be typed character by character to simulate keyboard input." + }, + "text": { + "type": "string", + "description": "Text to type", + "example": "Hello, world!" + } + }, + "required": ["type", "text"] + }, + { + "type": "object", + "title": "Press a key", + "description": "Press a key on the page. See https://asawicki.info/nosense/doc/devices/keyboard/key_codes.html for key codes.", + "properties": { + "type": { + "type": "string", + "enum": ["press"], + "description": "Press a key on the page" + }, + "key": { + "type": "string", + "description": "Key to press", + "example": "Enter" + } + }, + "required": ["type", "key"] + }, + { + "type": "object", + "title": "Scroll", + "properties": { + "type": { + "type": "string", + "enum": ["scroll"], + "description": "Scroll the page or a specific element" + }, + "direction": { + "type": "string", + "enum": ["up", "down"], + "description": "Direction to scroll", + "default": "down" + }, + "selector": { + "type": "string", + "description": "Query selector for the element to scroll", + "example": "#my-element" + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Scrape", + "properties": { + "type": { + "type": "string", + "enum": ["scrape"], + "description": "Scrape the current page content, returns the url and the html." + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Execute JavaScript", + "properties": { + "type": { + "type": "string", + "enum": ["executeJavascript"], + "description": "Execute JavaScript code on the page" + }, + "script": { + "type": "string", + "description": "JavaScript code to execute", + "example": "document.querySelector('.button').click();" + } + }, + "required": ["type", "script"] + } + ] + } + }, + "location": { + "type": "object", + "description": "Location settings for the request. When specified, this will use an appropriate proxy if available and emulate the corresponding language and timezone settings. 
Defaults to 'US' if not specified.", + "properties": { + "country": { + "type": "string", + "description": "ISO 3166-1 alpha-2 country code (e.g., 'US', 'AU', 'DE', 'JP')", + "pattern": "^[A-Z]{2}$", + "default": "US" + }, + "languages": { + "type": "array", + "description": "Preferred languages and locales for the request in order of priority. Defaults to the language of the specified location. See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language", + "items": { + "type": "string", + "example": "en-US" + } + } + } + }, + "removeBase64Images": { + "type": "boolean", + "description": "Removes all base 64 images from the output, which may be overwhelmingly long. The image's alt text remains in the output, but the URL is replaced with a placeholder." + } + }, + "required": ["url"] + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScrapeResponse" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/batch/scrape": { + "post": { + "summary": "Scrape multiple URLs and optionally extract information using an LLM", + "operationId": "scrapeAndExtractFromUrls", + "tags": ["Scraping"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "urls": { + "type": "array", + "items": { + "type": "string", + "format": "uri", + "description": "The URL to scrape" + } + }, + "webhook": { + "oneOf": [ + { + "type": "string", + "description": "The URL to send the webhook to. This will trigger for batch scrape started (batch_scrape.started), every page scraped (batch_scrape.page) and when the batch scrape is completed (batch_scrape.completed or batch_scrape.failed). The response will be the same as the `/scrape` endpoint." + }, + { + "type": "object", + "description": "A complex webhook specification object.", + "properties": { + "url": { + "type": "string", + "description": "The URL to send the webhook to. This will trigger for batch scrape started (batch_scrape.started), every page scraped (batch_scrape.page) and when the batch scrape is completed (batch_scrape.completed or batch_scrape.failed). The response will be the same as the `/scrape` endpoint." 
+ }, + "headers": { + "type": "object", + "description": "Headers to send to the webhook URL.", + "additionalProperties": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "description": "Custom metadata that will be included in all webhook payloads for this crawl", + "additionalProperties": true + } + }, + "required": ["url"] + } + ] + }, + "formats": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "extract", + "screenshot@fullPage" + ] + }, + "description": "Formats to include in the output.", + "default": ["markdown"] + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": true + }, + "includeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to include in the output." + }, + "excludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to exclude from the output." + }, + "headers": { + "type": "object", + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." + }, + "waitFor": { + "type": "integer", + "description": "Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.", + "default": 0 + }, + "mobile": { + "type": "boolean", + "description": "Set to true if you want to emulate scraping from a mobile device. Useful for testing responsive pages and taking mobile screenshots.", + "default": false + }, + "skipTlsVerification": { + "type": "boolean", + "description": "Skip TLS certificate verification when making requests", + "default": false + }, + "timeout": { + "type": "integer", + "description": "Timeout in milliseconds for the request", + "default": 30000 + }, + "extract": { + "type": "object", + "description": "Extract object", + "properties": { + "schema": { + "type": "object", + "description": "The schema to use for the extraction (Optional)" + }, + "systemPrompt": { + "type": "string", + "description": "The system prompt to use for the extraction (Optional)" + }, + "prompt": { + "type": "string", + "description": "The prompt to use for the extraction without a schema (Optional)" + } + } + }, + "actions": { + "type": "array", + "description": "Actions to perform on the page before grabbing the content", + "items": { + "oneOf": [ + { + "type": "object", + "title": "Wait", + "properties": { + "type": { + "type": "string", + "enum": ["wait"], + "description": "Wait for a specified amount of milliseconds" + }, + "milliseconds": { + "type": "integer", + "minimum": 1, + "description": "Number of milliseconds to wait" + }, + "selector": { + "type": "string", + "description": "Query selector to find the element by", + "example": "#my-element" + } + }, + "required": ["type"] }, { "type": "object", @@ -201,23 +601,82 @@ "type": { "type": "string", "enum": ["scroll"], - "description": "Scroll the page" + "description": "Scroll the page or a specific element" }, "direction": { "type": "string", "enum": ["up", "down"], - "description": "Direction to scroll" + "description": "Direction to scroll", + "default": "down" }, - "amount": { - "type": "integer", - "description": "Amount to scroll in pixels", - "minimum": 1 + "selector": { + "type": "string", + "description": "Query selector for the element to scroll", + "example": "#my-element" } }, - "required": ["type", "direction"] - } + "required": ["type"] + }, + { + "type": "object", 
+ "title": "Scrape", + "properties": { + "type": { + "type": "string", + "enum": ["scrape"], + "description": "Scrape the current page content, returns the url and the html." + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Execute JavaScript", + "properties": { + "type": { + "type": "string", + "enum": ["executeJavascript"], + "description": "Execute JavaScript code on the page" + }, + "script": { + "type": "string", + "description": "JavaScript code to execute", + "example": "document.querySelector('.button').click();" + } + }, + "required": ["type", "script"] + } ] } + }, + "location": { + "type": "object", + "description": "Location settings for the request. When specified, this will use an appropriate proxy if available and emulate the corresponding language and timezone settings. Defaults to 'US' if not specified.", + "properties": { + "country": { + "type": "string", + "description": "ISO 3166-1 alpha-2 country code (e.g., 'US', 'AU', 'DE', 'JP')", + "pattern": "^[A-Z]{2}$", + "default": "US" + }, + "languages": { + "type": "array", + "description": "Preferred languages and locales for the request in order of priority. Defaults to the language of the specified location. See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language", + "items": { + "type": "string", + "example": "en-US" + } + } + } + }, + "removeBase64Images": { + "type": "boolean", + "description": "Removes all base 64 images from the output, which may be overwhelmingly long. The image's alt text remains in the output, but the URL is replaced with a placeholder." + }, + "ignoreInvalidURLs": { + "type": "boolean", + "default": false, + "description": "If invalid URLs are specified in the urls array, they will be ignored. Instead of them failing the entire request, a batch scrape using the remaining valid URLs will be created, and the invalid URLs will be returned in the invalidURLs field of the response." } }, "required": ["url"] @@ -231,7 +690,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ScrapeResponse" + "$ref": "#/components/schemas/BatchScrapeResponseObj" } } } @@ -287,6 +746,154 @@ } } }, + "/batch/scrape/{id}": { + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the batch scrape job", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "get": { + "summary": "Get the status of a batch scrape job", + "operationId": "getBatchScrapeStatus", + "tags": ["Scraping"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BatchScrapeStatusResponseObj" + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." 
+ } + } + } + } + } + } + } + }, + "delete": { + "summary": "Cancel a crawl job", + "operationId": "cancelCrawl", + "tags": ["Crawling"], + "security": [ + { + "bearerAuth": [] + } + ], + "responses": { + "200": { + "description": "Successful cancellation", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": true + }, + "message": { + "type": "string", + "example": "Crawl job successfully cancelled." + } + } + } + } + } + }, + "404": { + "description": "Crawl job not found", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Crawl job not found." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, "/crawl/{id}": { "parameters": [ { @@ -479,12 +1086,12 @@ "ignoreSitemap": { "type": "boolean", "description": "Ignore the website sitemap when crawling", - "default": true + "default": false }, "limit": { "type": "integer", "description": "Maximum number of pages to crawl. Default limit is 10000.", - "default": 10 + "default": 10000 }, "allowBackwardLinks": { "type": "boolean", @@ -497,8 +1104,35 @@ "default": false }, "webhook": { - "type": "string", - "description": "The URL to send the webhook to. This will trigger for crawl started (crawl.started) ,every page crawled (crawl.page) and when the crawl is completed (crawl.completed or crawl.failed). The response will be the same as the `/scrape` endpoint." + "oneOf": [ + { + "type": "string", + "description": "The URL to send the webhook to. This will trigger for crawl started (crawl.started) ,every page crawled (crawl.page) and when the crawl is completed (crawl.completed or crawl.failed). The response will be the same as the `/scrape` endpoint." + }, + { + "type": "object", + "description": "A complex webhook specification object.", + "properties": { + "url": { + "type": "string", + "description": "The URL to send the webhook to. This will trigger for crawl started (crawl.started), every page crawled (crawl.page) and when the crawl is completed (crawl.completed or crawl.failed). The response will be the same as the `/scrape` endpoint." + }, + "headers": { + "type": "object", + "description": "Headers to send to the webhook URL.", + "additionalProperties": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "description": "Custom metadata that will be included in all webhook payloads for this crawl", + "additionalProperties": true + } + }, + "required": ["url"] + } + ] }, "scrapeOptions": { "type": "object", @@ -507,7 +1141,13 @@ "type": "array", "items": { "type": "string", - "enum": ["markdown", "html", "rawHtml", "links", "screenshot"] + "enum": [ + "markdown", + "html", + "rawHtml", + "links", + "screenshot" + ] }, "description": "Formats to include in the output.", "default": ["markdown"] @@ -535,6 +1175,16 @@ "description": "Only return the main content of the page excluding headers, navs, footers, etc.", "default": true }, + "removeBase64Images": { + "type": "boolean", + "description": "Remove base64 encoded images from the output", + "default": true + }, + "mobile": { + "type": "boolean", + "description": "Set to true if you want to emulate scraping from a mobile device. 
Useful for testing responsive pages and taking mobile screenshots.", + "default": false + }, "waitFor": { "type": "integer", "description": "Wait x amount of milliseconds for the page to load to fetch content", @@ -612,106 +1262,110 @@ }, "/map": { "post": { - "summary": "Map multiple URLs based on options", - "operationId": "mapUrls", - "tags": ["Mapping"], - "security": [ - { - "bearerAuth": [] + "summary": "Map multiple URLs based on options", + "operationId": "mapUrls", + "tags": ["Mapping"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "The base URL to start crawling from" + }, + "search": { + "type": "string", + "description": "Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 1000 search results. However, if map finds more results, there is no limit applied." + }, + "ignoreSitemap": { + "type": "boolean", + "description": "Ignore the website sitemap when crawling.", + "default": true + }, + "sitemapOnly": { + "type": "boolean", + "description": "Only return links found in the website sitemap", + "default": false + }, + "includeSubdomains": { + "type": "boolean", + "description": "Include subdomains of the website", + "default": false + }, + "limit": { + "type": "integer", + "description": "Maximum number of links to return", + "default": 5000, + "maximum": 5000 + } + }, + "required": ["url"] + } } - ], - "requestBody": { - "required": true, + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MapResponse" + } + } + } + }, + "402": { + "description": "Payment required", "content": { "application/json": { "schema": { "type": "object", "properties": { - "url": { + "error": { "type": "string", - "format": "uri", - "description": "The base URL to start crawling from" - }, - "search": { - "type": "string", - "description": "Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 1000 search results. However, if map finds more results, there is no limit applied." - }, - "ignoreSitemap": { - "type": "boolean", - "description": "Ignore the website sitemap when crawling", - "default": true - }, - "includeSubdomains": { - "type": "boolean", - "description": "Include subdomains of the website", - "default": false - }, - "limit": { - "type": "integer", - "description": "Maximum number of links to return", - "default": 5000, - "maximum": 5000 + "example": "Payment required to access this resource." } - }, - "required": ["url"] + } } } } }, - "responses": { - "200": { - "description": "Successful response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/MapResponse" - } - } - } - }, - "402": { - "description": "Payment required", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "error": { - "type": "string", - "example": "Payment required to access this resource." - } + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." 
} } } } - }, - "429": { - "description": "Too many requests", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "error": { - "type": "string", - "example": "Request rate limit exceeded. Please wait and try again later." - } - } - } - } - } - }, - "500": { - "description": "Server error", - "content": { - "application/json": { - "schema": { - "type": "object", - "properties": { - "error": { - "type": "string", - "example": "An unexpected error occurred on the server." - } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." } } } @@ -719,14 +1373,109 @@ } } } - }, - "/credit-usage": { + } + }, + "/extract": { + "post": { + "summary": "Extract structured data from pages using LLMs", + "operationId": "extractData", + "tags": ["Extraction"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "urls": { + "type": "array", + "items": { + "type": "string", + "format": "uri", + "description": "The URLs to extract data from. URLs should be in glob format." + } + }, + "prompt": { + "type": "string", + "description": "Prompt to guide the extraction process" + }, + "schema": { + "type": "object", + "description": "Schema to define the structure of the extracted data", + "properties": { + "property1": { + "type": "string", + "description": "Description of property1" + }, + "property2": { + "type": "integer", + "description": "Description of property2" + } + }, + "required": ["property1", "property2"] + } + }, + "required": ["urls", "prompt"] + } + } + } + }, + "responses": { + "200": { + "description": "Successful extraction", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExtractResponse" + } + } + } + }, + "400": { + "description": "Invalid request", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Invalid input data." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, + "/team/credit-usage": { "get": { "summary": "Get remaining credits for the authenticated team", "operationId": "getCreditUsage", - "tags": [ - "Billing" - ], + "tags": ["Billing"], "security": [ { "bearerAuth": [] @@ -859,7 +1608,7 @@ } } } - }, + }, "metadata": { "type": "object", "properties": { @@ -889,7 +1638,6 @@ "nullable": true, "description": "The error message of the page" } - } }, "llm_extraction": { @@ -1002,6 +1750,102 @@ } } }, + "BatchScrapeStatusResponseObj": { + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "The current status of the batch scrape. Can be `scraping`, `completed`, or `failed`." + }, + "total": { + "type": "integer", + "description": "The total number of pages that were attempted to be scraped." + }, + "completed": { + "type": "integer", + "description": "The number of pages that have been successfully scraped." + }, + "creditsUsed": { + "type": "integer", + "description": "The number of credits used for the batch scrape." 
+ }, + "expiresAt": { + "type": "string", + "format": "date-time", + "description": "The date and time when the batch scrape will expire." + }, + "next": { + "type": "string", + "nullable": true, + "description": "The URL to retrieve the next 10MB of data. Returned if the batch scrape is not completed or if the response is larger than 10MB." + }, + "data": { + "type": "array", + "description": "The data of the batch scrape.", + "items": { + "type": "object", + "properties": { + "markdown": { + "type": "string" + }, + "html": { + "type": "string", + "nullable": true, + "description": "HTML version of the content on page if `includeHtml` is true" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeRawHtml` is true" + }, + "links": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of links on the page if `includeLinks` is true" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "Screenshot of the page if `includeScreenshot` is true" + }, + "metadata": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "language": { + "type": "string", + "nullable": true + }, + "sourceURL": { + "type": "string", + "format": "uri" + }, + " ": { + "type": "string" + }, + "statusCode": { + "type": "integer", + "description": "The status code of the page" + }, + "error": { + "type": "string", + "nullable": true, + "description": "The error message of the page" + } + } + } + } + } + } + } + }, "CrawlResponse": { "type": "object", "properties": { @@ -1017,6 +1861,29 @@ } } }, + "BatchScrapeResponseObj": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "id": { + "type": "string" + }, + "url": { + "type": "string", + "format": "uri" + }, + "invalidURLs": { + "type": "array", + "nullable": true, + "items": { + "type": "string" + }, + "description": "If ignoreInvalidURLs is true, this is an array containing the invalid URLs that were specified in the request. If there were no invalid URLs, this will be an empty array. If ignoreInvalidURLs is false, this field will be undefined." 
+ } + } + }, "MapResponse": { "type": "object", "properties": { @@ -1030,6 +1897,25 @@ } } } + }, + "ExtractResponse": { + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "data": { + "type": "object", + "properties": { + "": { + "type": "string" + }, + "": { + "type": "number" + } + } + } + } } } }, @@ -1038,4 +1924,4 @@ "bearerAuth": [] } ] -} \ No newline at end of file +} From 6851281bebc2f14608c8d7062fadc3276f4c9731 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 27 Dec 2024 15:46:00 -0300 Subject: [PATCH 14/16] Update __init__.py --- apps/python-sdk/firecrawl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 5f592c2c..352305a4 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.6.8" +__version__ = "1.7.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From 0421f8102082a0d56ec20234188be995b20d98d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 27 Dec 2024 19:59:26 +0100 Subject: [PATCH 15/16] Sitemap fixes (#1010) * sitemap fixes iter 1 * feat(sitemap): dedupe improvements --------- Co-authored-by: Nicolas --- apps/api/src/controllers/v0/crawl.ts | 91 +++++------ apps/api/src/controllers/v0/crawlPreview.ts | 50 +++--- apps/api/src/controllers/v1/crawl-status.ts | 2 +- apps/api/src/controllers/v1/crawl.ts | 123 ++------------ apps/api/src/controllers/v1/map.ts | 21 ++- apps/api/src/scraper/WebScraper/crawler.ts | 131 ++++++++------- apps/api/src/scraper/WebScraper/sitemap.ts | 43 ++--- apps/api/src/services/queue-jobs.ts | 3 +- apps/api/src/services/queue-worker.ts | 168 ++++++++++++++++++-- 9 files changed, 341 insertions(+), 291 deletions(-) diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index ceeaa436..9659c218 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -177,56 +177,51 @@ export async function crawlController(req: Request, res: Response) { await saveCrawl(id, sc); - const sitemap = sc.crawlerOptions?.ignoreSitemap - ? null - : await crawler.tryGetSitemap(); + const sitemap = sc.crawlerOptions.ignoreSitemap + ? 0 + : await crawler.tryGetSitemap(async urls => { + if (urls.length === 0) return; + + let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 }); + const jobs = urls.map(url => { + const uuid = uuidv4(); + return { + name: uuid, + data: { + url, + mode: "single_urls", + crawlerOptions, + scrapeOptions, + internalOptions, + team_id, + plan, + origin: req.body.origin ?? defaultOrigin, + crawl_id: id, + sitemapped: true, + }, + opts: { + jobId: uuid, + priority: jobPriority, + }, + }; + }); - if (sitemap !== null && sitemap.length > 0) { - let jobPriority = 20; - // If it is over 1000, we need to get the job priority, - // otherwise we can use the default priority of 20 - if (sitemap.length > 1000) { - // set base to 21 - jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 }); - } - const jobs = sitemap.map((x) => { - const url = x.url; - const uuid = uuidv4(); - return { - name: uuid, - data: { - url, - mode: "single_urls", - crawlerOptions, - scrapeOptions, - internalOptions, - team_id, - plan, - origin: req.body.origin ?? 
defaultOrigin, - crawl_id: id, - sitemapped: true, - }, - opts: { - jobId: uuid, - priority: jobPriority, - }, - }; - }); + await lockURLs( + id, + sc, + jobs.map((x) => x.data.url), + ); + await addCrawlJobs( + id, + jobs.map((x) => x.opts.jobId), + ); + for (const job of jobs) { + // add with sentry instrumentation + await addScrapeJob(job.data as any, {}, job.opts.jobId); + } + }); - await lockURLs( - id, - sc, - jobs.map((x) => x.data.url), - ); - await addCrawlJobs( - id, - jobs.map((x) => x.opts.jobId), - ); - for (const job of jobs) { - // add with sentry instrumentation - await addScrapeJob(job.data as any, {}, job.opts.jobId); - } - } else { + if (sitemap === 0) { await lockURL(id, sc, url); // Not needed, first one should be 15. diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index f9462c3d..9ba9bd46 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -113,32 +113,32 @@ export async function crawlPreviewController(req: Request, res: Response) { const crawler = crawlToCrawler(id, sc); const sitemap = sc.crawlerOptions?.ignoreSitemap - ? null - : await crawler.tryGetSitemap(); + ? 0 + : await crawler.tryGetSitemap(async urls => { + for (const url of urls) { + await lockURL(id, sc, url); + const jobId = uuidv4(); + await addScrapeJob( + { + url, + mode: "single_urls", + team_id, + plan: plan!, + crawlerOptions, + scrapeOptions, + internalOptions, + origin: "website-preview", + crawl_id: id, + sitemapped: true, + }, + {}, + jobId, + ); + await addCrawlJob(id, jobId); + } + }); - if (sitemap !== null) { - for (const url of sitemap.map((x) => x.url)) { - await lockURL(id, sc, url); - const jobId = uuidv4(); - await addScrapeJob( - { - url, - mode: "single_urls", - team_id, - plan: plan!, - crawlerOptions, - scrapeOptions, - internalOptions, - origin: "website-preview", - crawl_id: id, - sitemapped: true, - }, - {}, - jobId, - ); - await addCrawlJob(id, jobId); - } - } else { + if (sitemap === 0) { await lockURL(id, sc, url); const jobId = uuidv4(); await addScrapeJob( diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 59db16d8..1aec86c8 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -115,7 +115,7 @@ export async function crawlStatusController( const status: Exclude["status"] = sc.cancelled ? "cancelled" - : validJobStatuses.every((x) => x[1] === "completed") + : (validJobStatuses.every((x) => x[1] === "completed") && validJobStatuses.length > 0) ? "completed" : "scraping"; diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index c2e3369f..a759f448 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -18,7 +18,7 @@ import { } from "../../lib/crawl-redis"; import { logCrawl } from "../../services/logging/crawl_log"; import { getScrapeQueue } from "../../services/queue-service"; -import { addScrapeJob, addScrapeJobs } from "../../services/queue-jobs"; +import { _addScrapeJobToBullMQ, addScrapeJob, addScrapeJobs } from "../../services/queue-jobs"; import { logger as _logger } from "../../lib/logger"; import { getJobPriority } from "../../lib/job-priority"; import { callWebhook } from "../../services/webhook"; @@ -111,113 +111,20 @@ export async function crawlController( await saveCrawl(id, sc); - const sitemap = sc.crawlerOptions.ignoreSitemap - ? 
null - : await crawler.tryGetSitemap(); - - if (sitemap !== null && sitemap.length > 0) { - logger.debug("Using sitemap of length " + sitemap.length, { - sitemapLength: sitemap.length, - }); - let jobPriority = 20; - // If it is over 1000, we need to get the job priority, - // otherwise we can use the default priority of 20 - if (sitemap.length > 1000) { - // set base to 21 - jobPriority = await getJobPriority({ - plan: req.auth.plan, - team_id: req.auth.team_id, - basePriority: 21, - }); - } - logger.debug("Using job priority " + jobPriority, { jobPriority }); - - const jobs = sitemap.map((x) => { - const url = x.url; - const uuid = uuidv4(); - return { - name: uuid, - data: { - url, - mode: "single_urls" as const, - team_id: req.auth.team_id, - plan: req.auth.plan!, - crawlerOptions, - scrapeOptions, - internalOptions: sc.internalOptions, - origin: "api", - crawl_id: id, - sitemapped: true, - webhook: req.body.webhook, - v1: true, - }, - opts: { - jobId: uuid, - priority: 20, - }, - }; - }); - - logger.debug("Locking URLs..."); - await lockURLs( - id, - sc, - jobs.map((x) => x.data.url), - ); - logger.debug("Adding scrape jobs to Redis..."); - await addCrawlJobs( - id, - jobs.map((x) => x.opts.jobId), - ); - logger.debug("Adding scrape jobs to BullMQ..."); - await addScrapeJobs(jobs); - } else { - logger.debug("Sitemap not found or ignored.", { - ignoreSitemap: sc.crawlerOptions.ignoreSitemap, - }); - - logger.debug("Locking URL..."); - await lockURL(id, sc, req.body.url); - const jobId = uuidv4(); - logger.debug("Adding scrape job to Redis...", { jobId }); - await addScrapeJob( - { - url: req.body.url, - mode: "single_urls", - team_id: req.auth.team_id, - crawlerOptions, - scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions), - internalOptions: sc.internalOptions, - plan: req.auth.plan!, - origin: "api", - crawl_id: id, - webhook: req.body.webhook, - v1: true, - }, - { - priority: 15, - }, - jobId, - ); - logger.debug("Adding scrape job to BullMQ...", { jobId }); - await addCrawlJob(id, jobId); - } - logger.debug("Done queueing jobs!"); - - if (req.body.webhook) { - logger.debug("Calling webhook with crawl.started...", { - webhook: req.body.webhook, - }); - await callWebhook( - req.auth.team_id, - id, - null, - req.body.webhook, - true, - "crawl.started", - ); - } - + await _addScrapeJobToBullMQ({ + url: req.body.url, + mode: "kickoff" as const, + team_id: req.auth.team_id, + plan: req.auth.plan, + crawlerOptions, + scrapeOptions: sc.scrapeOptions, + internalOptions: sc.internalOptions, + origin: "api", + crawl_id: id, + webhook: req.body.webhook, + v1: true, + }, {}, crypto.randomUUID(), 10); + const protocol = process.env.ENV === "local" ? 
req.protocol : "https"; return res.status(200).json({ diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 27a926fc..3274dd93 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -86,11 +86,12 @@ export async function getMapResults({ // If sitemapOnly is true, only get links from sitemap if (crawlerOptions.sitemapOnly) { - const sitemap = await crawler.tryGetSitemap(true, true); - if (sitemap !== null) { - sitemap.forEach((x) => { - links.push(x.url); + const sitemap = await crawler.tryGetSitemap(urls => { + urls.forEach((x) => { + links.push(x); }); + }, true, true); + if (sitemap > 0) { links = links .slice(1) .map((x) => { @@ -143,8 +144,10 @@ export async function getMapResults({ } // Parallelize sitemap fetch with serper search - const [sitemap, ...searchResults] = await Promise.all([ - ignoreSitemap ? null : crawler.tryGetSitemap(true), + const [_, ...searchResults] = await Promise.all([ + ignoreSitemap ? null : crawler.tryGetSitemap(urls => { + links.push(...urls); + }, true), ...(cachedResult ? [] : pagePromises), ]); @@ -152,12 +155,6 @@ export async function getMapResults({ allResults = searchResults; } - if (sitemap !== null) { - sitemap.forEach((x) => { - links.push(x.url); - }); - } - mapResults = allResults .flat() .filter((result) => result !== null && result !== undefined); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 41bee2d6..52bea9e5 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -4,9 +4,10 @@ import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; import robotsParser from "robots-parser"; import { getURLDepth } from "./utils/maxDepthUtils"; -import { axiosTimeout } from "../../../src/lib/timeout"; -import { logger as _logger } from "../../../src/lib/logger"; +import { axiosTimeout } from "../../lib/timeout"; +import { logger as _logger } from "../../lib/logger"; import https from "https"; +import { redisConnection } from "../../services/queue-service"; export class WebCrawler { private jobId: string; private initialUrl: string; @@ -198,26 +199,60 @@ export class WebCrawler { } public async tryGetSitemap( + urlsHandler: (urls: string[]) => unknown, fromMap: boolean = false, onlySitemap: boolean = false, - ): Promise<{ url: string; html: string }[] | null> { + ): Promise { this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, { method: "tryGetSitemap", }); - const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); - if (fromMap && onlySitemap) { - return sitemapLinks.map((link) => ({ url: link, html: "" })); + let leftOfLimit = this.limit; + + const normalizeUrl = (url: string) => { + url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); + if (url.endsWith("/")) { + url = url.slice(0, -1); + } + return url; + }; + + const _urlsHandler = async (urls: string[]) => { + let uniqueURLs: string[] = []; + for (const url of urls) { + if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(url))) { + uniqueURLs.push(url); + } + } + + await redisConnection.expire("sitemap:" + this.jobId + ":links", 3600, "NX"); + if (uniqueURLs.length > 0) { + urlsHandler(uniqueURLs); + } + }; + + let count = await this.tryFetchSitemapLinks(this.initialUrl, (urls: string[]) => { + if (fromMap && onlySitemap) { + return urlsHandler(urls); + } else { + let filteredLinks = this.filterLinks( + [...new Set(urls)], + leftOfLimit, 
+ this.maxCrawledDepth, + fromMap, + ); + leftOfLimit -= filteredLinks.length; + return _urlsHandler(filteredLinks); + } + }); + + if (count > 0) { + if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(this.initialUrl))) { + urlsHandler([this.initialUrl]); + } + count++; } - if (sitemapLinks.length > 0) { - let filteredLinks = this.filterLinks( - [...new Set(sitemapLinks)], - this.limit, - this.maxCrawledDepth, - fromMap, - ); - return filteredLinks.map((link) => ({ url: link, html: "" })); - } - return null; + + return count; } public filterURL(href: string, url: string): string | null { @@ -436,22 +471,15 @@ export class WebCrawler { return socialMediaOrEmail.some((ext) => url.includes(ext)); } - private async tryFetchSitemapLinks(url: string): Promise { - const normalizeUrl = (url: string) => { - url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); - if (url.endsWith("/")) { - url = url.slice(0, -1); - } - return url; - }; - + private async tryFetchSitemapLinks(url: string, urlsHandler: (urls: string[]) => unknown): Promise { const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`; - let sitemapLinks: string[] = []; + + let sitemapCount: number = 0; // Try to get sitemap from the provided URL first try { - sitemapLinks = await getLinksFromSitemap( - { sitemapUrl, allUrls: [], mode: "fire-engine" }, + sitemapCount = await getLinksFromSitemap( + { sitemapUrl, urlsHandler, mode: "fire-engine" }, this.logger, ); } catch (error) { @@ -476,20 +504,18 @@ export class WebCrawler { try { // Get all links from the main domain's sitemap - const mainDomainLinks = await getLinksFromSitemap( - { sitemapUrl: mainDomainSitemapUrl, allUrls: [], mode: "fire-engine" }, + sitemapCount += await getLinksFromSitemap( + { sitemapUrl: mainDomainSitemapUrl, urlsHandler(urls) { + urlsHandler(urls.filter(link => { + try { + const linkUrl = new URL(link); + return linkUrl.hostname.endsWith(hostname); + } catch { + } + })) + }, mode: "fire-engine" }, this.logger, ); - // Filter links to only include those pointing to the current subdomain - const subdomainLinks = mainDomainLinks.filter(link => { - try { - const linkUrl = new URL(link); - return linkUrl.hostname.endsWith(hostname); - } catch { - return false; - } - }); - sitemapLinks = [...new Set([...sitemapLinks, ...subdomainLinks])]; } catch (error) { this.logger.debug( `Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`, @@ -506,15 +532,13 @@ export class WebCrawler { } // If no sitemap found yet, try the baseUrl as a last resort - if (sitemapLinks.length === 0) { + if (sitemapCount === 0) { const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; try { - const baseLinks = await getLinksFromSitemap( - { sitemapUrl: baseUrlSitemap, allUrls: [], mode: "fire-engine" }, + sitemapCount += await getLinksFromSitemap( + { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" }, this.logger, ); - - sitemapLinks = [...new Set([...sitemapLinks, ...baseLinks])]; } catch (error) { this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, { method: "tryFetchSitemapLinks", @@ -524,25 +548,14 @@ export class WebCrawler { if (error instanceof AxiosError && error.response?.status === 404) { // ignore 404 } else { - sitemapLinks = await getLinksFromSitemap( - { sitemapUrl: baseUrlSitemap, mode: "fire-engine" }, + sitemapCount += await getLinksFromSitemap( + { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" }, this.logger, ); } } } - const normalizedUrl = normalizeUrl(url); - const 
normalizedSitemapLinks = sitemapLinks.map((link) => - normalizeUrl(link), - ); - // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl - if ( - !normalizedSitemapLinks.includes(normalizedUrl) && - sitemapLinks.length > 0 - ) { - sitemapLinks.push(url); - } - return sitemapLinks; + return sitemapCount; } } diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 2529c022..8028d225 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -11,15 +11,15 @@ const useFireEngine = export async function getLinksFromSitemap( { sitemapUrl, - allUrls = [], + urlsHandler, mode = "axios", }: { sitemapUrl: string; - allUrls?: string[]; + urlsHandler(urls: string[]): unknown, mode?: "axios" | "fire-engine"; }, logger: Logger, -): Promise { +): Promise { try { let content: string = ""; try { @@ -31,9 +31,12 @@ export async function getLinksFromSitemap( { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }, ); if (!response.success) { - throw response.error; + logger.debug("Failed to scrape sitemap via TLSClient, falling back to axios...", { error: response.error }) + const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout }); + content = ar.data; + } else { + content = response.document.rawHtml!; } - content = response.document.rawHtml!; } else { const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); content = response.data; @@ -45,11 +48,13 @@ export async function getLinksFromSitemap( sitemapUrl, error, }); - return allUrls; + + return 0; } const parsed = await parseStringPromise(content); const root = parsed.urlset || parsed.sitemapindex; + let count = 0; if (root && root.sitemap) { // Handle sitemap index files @@ -57,20 +62,18 @@ export async function getLinksFromSitemap( .filter((sitemap) => sitemap.loc && sitemap.loc.length > 0) .map((sitemap) => sitemap.loc[0]); - const sitemapPromises = sitemapUrls.map((sitemapUrl) => + const sitemapPromises: Promise[] = sitemapUrls.map((sitemapUrl) => getLinksFromSitemap( - { sitemapUrl, allUrls: [], mode }, + { sitemapUrl, urlsHandler, mode }, logger, ), ); const results = await Promise.all(sitemapPromises); - results.forEach(urls => { - allUrls.push(...urls); - }); + count = results.reduce((a,x) => a + x) } else if (root && root.url) { // Check if any URLs point to additional sitemaps - const xmlSitemaps = root.url + const xmlSitemaps: string[] = root.url .filter( (url) => url.loc && @@ -83,18 +86,13 @@ export async function getLinksFromSitemap( // Recursively fetch links from additional sitemaps const sitemapPromises = xmlSitemaps.map((sitemapUrl) => getLinksFromSitemap( - { sitemapUrl, allUrls: [], mode }, + { sitemapUrl: sitemapUrl, urlsHandler, mode }, logger, ), ); - - const results = await Promise.all(sitemapPromises); - results.forEach(urls => { - allUrls.push(...urls); - }); + count += (await Promise.all(sitemapPromises)).reduce((a,x) => a + x, 0); } - // Add regular URLs that aren't sitemaps const validUrls = root.url .filter( (url) => @@ -104,8 +102,11 @@ export async function getLinksFromSitemap( !WebCrawler.prototype.isFile(url.loc[0]), ) .map((url) => url.loc[0]); - allUrls.push(...validUrls); + count += validUrls.length; + urlsHandler(validUrls); } + + return count; } catch (error) { logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, { method: "getLinksFromSitemap", @@ -115,7 +116,7 @@ export async function getLinksFromSitemap( }); } - return 
[...new Set(allUrls)]; + return 0; } export const fetchSitemapData = async ( diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 654f6cda..f59babe4 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -29,7 +29,7 @@ async function _addScrapeJobToConcurrencyQueue( }); } -async function _addScrapeJobToBullMQ( +export async function _addScrapeJobToBullMQ( webScraperOptions: any, options: any, jobId: string, @@ -138,7 +138,6 @@ export async function addScrapeJobs( if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) { const now = Date.now(); const limit = await getConcurrencyLimitMax(jobs[0].data.plan); - console.log("CC limit", limit); cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now); countCanBeDirectlyAdded = Math.max( diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 4ef9610d..e8c8bdf3 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -18,16 +18,18 @@ import { v4 as uuidv4 } from "uuid"; import { addCrawlJob, addCrawlJobDone, + addCrawlJobs, crawlToCrawler, finishCrawl, generateURLPermutations, getCrawl, getCrawlJobs, lockURL, + lockURLs, normalizeURL, } from "../lib/crawl-redis"; import { StoredCrawl } from "../lib/crawl-redis"; -import { addScrapeJob } from "./queue-jobs"; +import { addScrapeJob, addScrapeJobs } from "./queue-jobs"; import { addJobPriority, deleteJobPriority, @@ -191,22 +193,34 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => { await addJobPriority(job.data.team_id, job.id); let err = null; try { - const result = await processJob(job, token); - if (result.success) { - try { - if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") { - logger.debug( - "Job succeeded -- has crawl associated, putting null in Redis", - ); + if (job.data?.mode === "kickoff") { + const result = await processKickoffJob(job, token); + if (result.success) { + try { await job.moveToCompleted(null, token, false); - } else { - logger.debug("Job succeeded -- putting result in Redis"); - await job.moveToCompleted(result.document, token, false); - } - } catch (e) {} + } catch (e) {} + } else { + logger.debug("Job failed", { result, mode: job.data.mode }); + await job.moveToFailed((result as any).error, token, false); + } } else { - logger.debug("Job failed", { result }); - await job.moveToFailed((result as any).error, token, false); + const result = await processJob(job, token); + if (result.success) { + try { + if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") { + logger.debug( + "Job succeeded -- has crawl associated, putting null in Redis", + ); + await job.moveToCompleted(null, token, false); + } else { + logger.debug("Job succeeded -- putting result in Redis"); + await job.moveToCompleted(result.document, token, false); + } + } catch (e) {} + } else { + logger.debug("Job failed", { result }); + await job.moveToFailed((result as any).error, token, false); + } } } catch (error) { logger.debug("Job failed", { error }); @@ -379,6 +393,130 @@ const workerFun = async ( workerFun(getScrapeQueue(), processJobInternal); +async function processKickoffJob(job: Job & { id: string }, token: string) { + const logger = _logger.child({ + module: "queue-worker", + method: "processKickoffJob", + jobId: job.id, + scrapeId: job.id, + crawlId: job.data?.crawl_id ?? undefined, + teamId: job.data?.team_id ?? 
undefined, + }); + + try { + const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; + const crawler = crawlToCrawler(job.data.crawl_id, sc); + + const sitemap = sc.crawlerOptions.ignoreSitemap + ? 0 + : await crawler.tryGetSitemap(async urls => { + if (urls.length === 0) return; + + logger.debug("Using sitemap chunk of length " + urls.length, { + sitemapLength: urls.length, + }); + + let jobPriority = await getJobPriority({ + plan: job.data.plan, + team_id: job.data.team_id, + basePriority: 21, + }); + logger.debug("Using job priority " + jobPriority, { jobPriority }); + + const jobs = urls.map(url => { + const uuid = uuidv4(); + return { + name: uuid, + data: { + url, + mode: "single_urls" as const, + team_id: job.data.team_id, + plan: job.data.plan!, + crawlerOptions: job.data.crawlerOptions, + scrapeOptions: job.data.scrapeOptions, + internalOptions: sc.internalOptions, + origin: job.data.origin, + crawl_id: job.data.crawl_id, + sitemapped: true, + webhook: job.data.webhook, + v1: job.data.v1, + }, + opts: { + jobId: uuid, + priority: 20, + }, + }; + }); + + logger.debug("Locking URLs..."); + await lockURLs( + job.data.crawl_id, + sc, + jobs.map((x) => x.data.url), + ); + logger.debug("Adding scrape jobs to Redis..."); + await addCrawlJobs( + job.data.crawl_id, + jobs.map((x) => x.opts.jobId), + ); + logger.debug("Adding scrape jobs to BullMQ..."); + await addScrapeJobs(jobs); + }); + + if (sitemap === 0) { + logger.debug("Sitemap not found or ignored.", { + ignoreSitemap: sc.crawlerOptions.ignoreSitemap, + }); + + logger.debug("Locking URL..."); + await lockURL(job.data.crawl_id, sc, job.data.url); + const jobId = uuidv4(); + logger.debug("Adding scrape job to Redis...", { jobId }); + await addScrapeJob( + { + url: job.data.url, + mode: "single_urls", + team_id: job.data.team_id, + crawlerOptions: job.data.crawlerOptions, + scrapeOptions: scrapeOptions.parse(job.data.scrapeOptions), + internalOptions: sc.internalOptions, + plan: job.data.plan!, + origin: job.data.origin, + crawl_id: job.data.crawl_id, + webhook: job.data.webhook, + v1: job.data.v1, + }, + { + priority: 15, + }, + jobId, + ); + logger.debug("Adding scrape job to BullMQ...", { jobId }); + await addCrawlJob(job.data.crawl_id, jobId); + } + logger.debug("Done queueing jobs!"); + + if (job.data.webhook) { + logger.debug("Calling webhook with crawl.started...", { + webhook: job.data.webhook, + }); + await callWebhook( + job.data.team_id, + job.data.crawl_id, + null, + job.data.webhook, + true, + "crawl.started", + ); + } + + return { success: true } + } catch (error) { + logger.error("An error occurred!", { error }) + return { success: false, error }; + } +} + async function processJob(job: Job & { id: string }, token: string) { const logger = _logger.child({ module: "queue-worker", From a4cf814f70ab56b986645372647980e89252f60b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adem=C3=ADlson=20F=2E=20Tonato?= Date: Fri, 27 Dec 2024 19:18:53 +0000 Subject: [PATCH 16/16] feat: return favicon url when scraping --- apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts index 66cf30cc..1f494893 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts @@ -8,6 +8,7 @@ export function extractMetadata( ): Partial { let title: string | undefined = undefined; let description: string | undefined = 
undefined; + let favicon: string | undefined = undefined; let language: string | undefined = undefined; let keywords: string | undefined = undefined; let robots: string | undefined = undefined; @@ -42,6 +43,12 @@ export function extractMetadata( try { title = soup("title").first().text().trim() || undefined; description = soup('meta[name="description"]').attr("content") || undefined; + + const faviconLink = soup('link[rel="icon"]').attr("href") || soup('link[rel*="icon"]').first().attr("href") || undefined; + if (faviconLink) { + const baseUrl = new URL(meta.url).origin; + favicon = faviconLink.startsWith('http') ? faviconLink : `${baseUrl}${faviconLink}`; + } // Assuming the language is part of the URL as per the regex pattern language = soup("html").attr("lang") || undefined; @@ -121,6 +128,7 @@ export function extractMetadata( return { title, description, + favicon, language, keywords, robots,
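// A minimal, standalone sketch of the favicon-resolution idea introduced in the
// final hunk above — an assumption-laden illustration, not the patch's code. It
// assumes cheerio (the library behind the `soup` helper used in extractMetadata)
// and the WHATWG URL constructor; `resolveFavicon` is an illustrative name, not
// an API defined in this repository.
import * as cheerio from "cheerio";

export function resolveFavicon(html: string, pageUrl: string): string | undefined {
  const $ = cheerio.load(html);
  // Prefer an explicit rel="icon", then fall back to any rel containing "icon"
  // (e.g. "shortcut icon", "apple-touch-icon"), mirroring the selectors in the hunk.
  const href =
    $('link[rel="icon"]').attr("href") ||
    $('link[rel*="icon"]').first().attr("href");
  if (!href) return undefined;
  try {
    // new URL(href, base) resolves absolute, root-relative, path-relative and
    // protocol-relative hrefs alike — a broader net than pairing a plain
    // startsWith("http") check with origin concatenation.
    return new URL(href, pageUrl).toString();
  } catch {
    return undefined;
  }
}

// Usage sketch:
//   resolveFavicon('<link rel="icon" href="/favicon.ico">', "https://example.com/docs/page")
//   // -> "https://example.com/favicon.ico"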