Merge branch 'main' into feat/sdk-without-ws

This commit is contained in:
Thomas Kosmas 2024-12-21 02:30:40 +02:00
commit a9d31c8e42
6 changed files with 25 additions and 14 deletions

View File

@@ -144,7 +144,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
} }
: {}), : {}),
priority: meta.internalOptions.priority, priority: meta.internalOptions.priority,
geolocation: meta.options.geolocation, geolocation: meta.options.geolocation ?? meta.options.location,
mobile: meta.options.mobile, mobile: meta.options.mobile,
timeout, // TODO: better timeout logic timeout, // TODO: better timeout logic
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
@@ -227,7 +227,7 @@ export async function scrapeURLWithFireEnginePlaywright(
screenshot: meta.options.formats.includes("screenshot"), screenshot: meta.options.formats.includes("screenshot"),
fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"), fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
wait: meta.options.waitFor, wait: meta.options.waitFor,
geolocation: meta.options.geolocation, geolocation: meta.options.geolocation ?? meta.options.location,
timeout, timeout,
}; };
@@ -286,7 +286,7 @@ export async function scrapeURLWithFireEngineTLSClient(
priority: meta.internalOptions.priority, priority: meta.internalOptions.priority,
atsv: meta.internalOptions.atsv, atsv: meta.internalOptions.atsv,
geolocation: meta.options.geolocation, geolocation: meta.options.geolocation ?? meta.options.location,
disableJsDom: meta.internalOptions.v0DisableJsDom, disableJsDom: meta.internalOptions.v0DisableJsDom,
timeout, timeout,

View File

@@ -40,7 +40,7 @@ export function extractMetadata(
const soup = load(html); const soup = load(html);
try { try {
title = soup("title").text() || undefined; title = soup("title").first().text().trim() || undefined;
description = soup('meta[name="description"]').attr("content") || undefined; description = soup('meta[name="description"]').attr("content") || undefined;
// Assuming the language is part of the URL as per the regex pattern // Assuming the language is part of the URL as per the regex pattern

View File

@@ -1,6 +1,6 @@
{ {
"name": "@mendable/firecrawl-js", "name": "@mendable/firecrawl-js",
"version": "1.9.5", "version": "1.9.7",
"description": "JavaScript SDK for Firecrawl API", "description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js", "main": "dist/index.js",
"types": "dist/index.d.ts", "types": "dist/index.d.ts",

View File

@@ -259,7 +259,7 @@ export interface MapResponse {
* Defines options for extracting information from URLs. * Defines options for extracting information from URLs.
*/ */
export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> { export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
prompt: string; prompt?: string;
schema?: LLMSchema; schema?: LLMSchema;
systemPrompt?: string; systemPrompt?: string;
allowExternalLinks?: boolean; allowExternalLinks?: boolean;
@@ -951,11 +951,13 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
private ws: WebSocket; private ws: WebSocket;
public data: FirecrawlDocument<undefined>[]; public data: FirecrawlDocument<undefined>[];
public status: CrawlStatusResponse["status"]; public status: CrawlStatusResponse["status"];
public id: string;
constructor(id: string, app: FirecrawlApp) { constructor(id: string, app: FirecrawlApp) {
super(); super();
if(!WebSocket) if(!WebSocket)
throw new FirecrawlError("WebSocket module failed to load. Your system might not support WebSocket.", 500); throw new FirecrawlError("WebSocket module failed to load. Your system might not support WebSocket.", 500);
this.id = id;
this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey); this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
this.status = "scraping"; this.status = "scraping";
this.data = []; this.data = [];
@@ -986,6 +988,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
detail: { detail: {
status: this.status, status: this.status,
data: this.data, data: this.data,
id: this.id,
}, },
})); }));
} else if (msg.type === "error") { } else if (msg.type === "error") {
@@ -995,6 +998,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
status: this.status, status: this.status,
data: this.data, data: this.data,
error: msg.error, error: msg.error,
id: this.id,
}, },
})); }));
} else if (msg.type === "catchup") { } else if (msg.type === "catchup") {
@@ -1002,12 +1006,18 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
this.data.push(...(msg.data.data ?? [])); this.data.push(...(msg.data.data ?? []));
for (const doc of this.data) { for (const doc of this.data) {
this.dispatchTypedEvent("document", new CustomEvent("document", { this.dispatchTypedEvent("document", new CustomEvent("document", {
detail: doc, detail: {
...doc,
id: this.id,
},
})); }));
} }
} else if (msg.type === "document") { } else if (msg.type === "document") {
this.dispatchTypedEvent("document", new CustomEvent("document", { this.dispatchTypedEvent("document", new CustomEvent("document", {
detail: msg.data, detail: {
...msg.data,
id: this.id,
},
})); }));
} }
} }
@@ -1034,6 +1044,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
status: this.status, status: this.status,
data: this.data, data: this.data,
error: "WebSocket error", error: "WebSocket error",
id: this.id,
}, },
})); }));
}).bind(this); }).bind(this);

View File

@@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp # noqa from .firecrawl import FirecrawlApp # noqa
__version__ = "1.6.4" __version__ = "1.6.8"
# Define the logger for the Firecrawl project # Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl") logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -26,7 +26,7 @@ class FirecrawlApp:
""" """
Parameters for the extract operation. Parameters for the extract operation.
""" """
prompt: str prompt: Optional[str] = None
schema_: Optional[Any] = pydantic.Field(None, alias='schema') schema_: Optional[Any] = pydantic.Field(None, alias='schema')
system_prompt: Optional[str] = None system_prompt: Optional[str] = None
allow_external_links: Optional[bool] = False allow_external_links: Optional[bool] = False
@@ -704,15 +704,15 @@ class CrawlWatcher:
async def _handle_message(self, msg: Dict[str, Any]): async def _handle_message(self, msg: Dict[str, Any]):
if msg['type'] == 'done': if msg['type'] == 'done':
self.status = 'completed' self.status = 'completed'
self.dispatch_event('done', {'status': self.status, 'data': self.data}) self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
elif msg['type'] == 'error': elif msg['type'] == 'error':
self.status = 'failed' self.status = 'failed'
self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error']}) self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
elif msg['type'] == 'catchup': elif msg['type'] == 'catchup':
self.status = msg['data']['status'] self.status = msg['data']['status']
self.data.extend(msg['data'].get('data', [])) self.data.extend(msg['data'].get('data', []))
for doc in self.data: for doc in self.data:
self.dispatch_event('document', doc) self.dispatch_event('document', {'data': doc, 'id': self.id})
elif msg['type'] == 'document': elif msg['type'] == 'document':
self.data.append(msg['data']) self.data.append(msg['data'])
self.dispatch_event('document', msg['data']) self.dispatch_event('document', {'data': msg['data'], 'id': self.id})