From 19246f6289172ea356bbcc45d40797bbf1838425 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 18 Dec 2024 18:36:04 -0300 Subject: [PATCH 1/7] feat-SDK/added crawl id to ws --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 15 +++++++++++++-- apps/python-sdk/firecrawl/__init__.py | 2 +- apps/python-sdk/firecrawl/firecrawl.py | 8 ++++---- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 74dfcb02..1c7f082f 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.4", + "version": "1.9.5", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 020a2293..44063097 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -934,9 +934,11 @@ export class CrawlWatcher extends TypedEventTarget { private ws: WebSocket; public data: FirecrawlDocument[]; public status: CrawlStatusResponse["status"]; + public id: string; constructor(id: string, app: FirecrawlApp) { super(); + this.id = id; this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey); this.status = "scraping"; this.data = []; @@ -967,6 +969,7 @@ export class CrawlWatcher extends TypedEventTarget { detail: { status: this.status, data: this.data, + id: this.id, }, })); } else if (msg.type === "error") { @@ -976,6 +979,7 @@ export class CrawlWatcher extends TypedEventTarget { status: this.status, data: this.data, error: msg.error, + id: this.id, }, })); } else if (msg.type === "catchup") { @@ -983,12 +987,18 @@ export class CrawlWatcher extends TypedEventTarget { this.data.push(...(msg.data.data ?? [])); for (const doc of this.data) { this.dispatchTypedEvent("document", new CustomEvent("document", { - detail: doc, + detail: { + ...doc, + id: this.id, + }, })); } } else if (msg.type === "document") { this.dispatchTypedEvent("document", new CustomEvent("document", { - detail: msg.data, + detail: { + ...msg.data, + id: this.id, + }, })); } } @@ -1015,6 +1025,7 @@ export class CrawlWatcher extends TypedEventTarget { status: this.status, data: this.data, error: "WebSocket error", + id: this.id, }, })); }).bind(this); diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 31d68095..8c5d1b44 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.6.4" +__version__ = "1.6.5" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 45ed27d8..7ac2d2dc 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -704,15 +704,15 @@ class CrawlWatcher: async def _handle_message(self, msg: Dict[str, Any]): if msg['type'] == 'done': self.status = 'completed' - self.dispatch_event('done', {'status': self.status, 'data': self.data}) + self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id}) elif msg['type'] == 'error': self.status = 'failed' - self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error']}) + self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id}) elif msg['type'] == 'catchup': self.status = msg['data']['status'] self.data.extend(msg['data'].get('data', [])) for doc in self.data: - self.dispatch_event('document', doc) + self.dispatch_event('document', {'data': doc, 'id': self.id}) elif msg['type'] == 'document': self.data.append(msg['data']) - self.dispatch_event('document', msg['data']) + self.dispatch_event('document', {'data': msg['data'], 'id': self.id}) From cf2ec7713166d6f9a7b6c7218e125e82d43aba40 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 19 Dec 2024 08:32:10 -0300 Subject: [PATCH 2/7] fixed title extra info --- apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts index c67f9cbd..66cf30cc 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts @@ -40,7 +40,7 @@ export function extractMetadata( const soup = load(html); try { - title = soup("title").text() || undefined; + title = soup("title").first().text().trim() || undefined; description = soup('meta[name="description"]').attr("content") || undefined; // Assuming the language is part of the URL as per the regex pattern From 071b9a01c35613eab2a3905343199bf9f065d569 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 19 Dec 2024 18:22:54 +0100 Subject: [PATCH 3/7] fix(scrapeURL/fire-engine): pass geolocation --- apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 14abf9a9..d753465d 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -144,7 +144,7 @@ export async function scrapeURLWithFireEngineChromeCDP( } : {}), priority: meta.internalOptions.priority, - geolocation: meta.options.geolocation, + geolocation: meta.options.geolocation ?? meta.options.location, mobile: meta.options.mobile, timeout, // TODO: better timeout logic disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, @@ -227,7 +227,7 @@ export async function scrapeURLWithFireEnginePlaywright( screenshot: meta.options.formats.includes("screenshot"), fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"), wait: meta.options.waitFor, - geolocation: meta.options.geolocation, + geolocation: meta.options.geolocation ?? meta.options.location, timeout, }; @@ -286,7 +286,7 @@ export async function scrapeURLWithFireEngineTLSClient( priority: meta.internalOptions.priority, atsv: meta.internalOptions.atsv, - geolocation: meta.options.geolocation, + geolocation: meta.options.geolocation ?? meta.options.location, disableJsDom: meta.internalOptions.v0DisableJsDom, timeout, From 4fddc86e66143c20adaa5ff6422c24d17c0a08d1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 19 Dec 2024 16:09:08 -0300 Subject: [PATCH 4/7] Update package.json --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 1c7f082f..a6ed595e 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.5", + "version": "1.9.6", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 525a71d789bdfece2a21a40024e19f9cea7fcefb Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 19 Dec 2024 16:10:42 -0300 Subject: [PATCH 5/7] Update __init__.py --- apps/python-sdk/firecrawl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 8c5d1b44..19a33d17 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.6.5" +__version__ = "1.6.7" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From 8063474c85bff7e91ee100cf54b2d809cddecb98 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 19 Dec 2024 16:14:17 -0300 Subject: [PATCH 6/7] Update __init__.py --- apps/python-sdk/firecrawl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 19a33d17..5f592c2c 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.6.7" +__version__ = "1.6.8" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From 3e60f175bbf3fa2f7510d1544d311f44ccadba9c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 19 Dec 2024 16:14:49 -0300 Subject: [PATCH 7/7] Nick: prompt should be optional on /extract sdks --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 2 +- apps/python-sdk/firecrawl/firecrawl.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index a6ed595e..1296aedb 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.6", + "version": "1.9.7", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 08f7b183..2772466c 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -243,7 +243,7 @@ export interface MapResponse { * Defines options for extracting information from URLs. */ export interface ExtractParams { - prompt: string; + prompt?: string; schema?: LLMSchema; systemPrompt?: string; allowExternalLinks?: boolean; diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 7ac2d2dc..e4ac2726 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -26,7 +26,7 @@ class FirecrawlApp: """ Parameters for the extract operation. """ - prompt: str + prompt: Optional[str] = None schema_: Optional[Any] = pydantic.Field(None, alias='schema') system_prompt: Optional[str] = None allow_external_links: Optional[bool] = False