diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 14abf9a9..d753465d 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -144,7 +144,7 @@ export async function scrapeURLWithFireEngineChromeCDP( } : {}), priority: meta.internalOptions.priority, - geolocation: meta.options.geolocation, + geolocation: meta.options.geolocation ?? meta.options.location, mobile: meta.options.mobile, timeout, // TODO: better timeout logic disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, @@ -227,7 +227,7 @@ export async function scrapeURLWithFireEnginePlaywright( screenshot: meta.options.formats.includes("screenshot"), fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"), wait: meta.options.waitFor, - geolocation: meta.options.geolocation, + geolocation: meta.options.geolocation ?? meta.options.location, timeout, }; @@ -286,7 +286,7 @@ export async function scrapeURLWithFireEngineTLSClient( priority: meta.internalOptions.priority, atsv: meta.internalOptions.atsv, - geolocation: meta.options.geolocation, + geolocation: meta.options.geolocation ?? meta.options.location, disableJsDom: meta.internalOptions.v0DisableJsDom, timeout, diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts index c67f9cbd..66cf30cc 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts @@ -40,7 +40,7 @@ export function extractMetadata( const soup = load(html); try { - title = soup("title").text() || undefined; + title = soup("title").first().text().trim() || undefined; description = soup('meta[name="description"]').attr("content") || undefined; // Assuming the language is part of the URL as per the regex pattern diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 1c7f082f..1296aedb 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.5", + "version": "1.9.7", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 9e3a849f..62d0398b 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -259,7 +259,7 @@ export interface MapResponse { * Defines options for extracting information from URLs. */ export interface ExtractParams { - prompt: string; + prompt?: string; schema?: LLMSchema; systemPrompt?: string; allowExternalLinks?: boolean; @@ -951,11 +951,13 @@ export class CrawlWatcher extends TypedEventTarget { private ws: WebSocket; public data: FirecrawlDocument[]; public status: CrawlStatusResponse["status"]; + public id: string; constructor(id: string, app: FirecrawlApp) { super(); if(!WebSocket) throw new FirecrawlError("WebSocket module failed to load. Your system might not support WebSocket.", 500); + this.id = id; this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey); this.status = "scraping"; this.data = []; @@ -986,6 +988,7 @@ export class CrawlWatcher extends TypedEventTarget { detail: { status: this.status, data: this.data, + id: this.id, }, })); } else if (msg.type === "error") { @@ -995,6 +998,7 @@ export class CrawlWatcher extends TypedEventTarget { status: this.status, data: this.data, error: msg.error, + id: this.id, }, })); } else if (msg.type === "catchup") { @@ -1002,12 +1006,18 @@ export class CrawlWatcher extends TypedEventTarget { this.data.push(...(msg.data.data ?? [])); for (const doc of this.data) { this.dispatchTypedEvent("document", new CustomEvent("document", { - detail: doc, + detail: { + ...doc, + id: this.id, + }, })); } } else if (msg.type === "document") { this.dispatchTypedEvent("document", new CustomEvent("document", { - detail: msg.data, + detail: { + ...msg.data, + id: this.id, + }, })); } } @@ -1034,6 +1044,7 @@ export class CrawlWatcher extends TypedEventTarget { status: this.status, data: this.data, error: "WebSocket error", + id: this.id, }, })); }).bind(this); diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 31d68095..5f592c2c 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.6.4" +__version__ = "1.6.8" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 45ed27d8..e4ac2726 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -26,7 +26,7 @@ class FirecrawlApp: """ Parameters for the extract operation. """ - prompt: str + prompt: Optional[str] = None schema_: Optional[Any] = pydantic.Field(None, alias='schema') system_prompt: Optional[str] = None allow_external_links: Optional[bool] = False @@ -704,15 +704,15 @@ class CrawlWatcher: async def _handle_message(self, msg: Dict[str, Any]): if msg['type'] == 'done': self.status = 'completed' - self.dispatch_event('done', {'status': self.status, 'data': self.data}) + self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id}) elif msg['type'] == 'error': self.status = 'failed' - self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error']}) + self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id}) elif msg['type'] == 'catchup': self.status = msg['data']['status'] self.data.extend(msg['data'].get('data', [])) for doc in self.data: - self.dispatch_event('document', doc) + self.dispatch_event('document', {'data': doc, 'id': self.id}) elif msg['type'] == 'document': self.data.append(msg['data']) - self.dispatch_event('document', msg['data']) + self.dispatch_event('document', {'data': msg['data'], 'id': self.id})