mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 12:46:06 +08:00
Merge branch 'main' into feat/sdk-without-ws
This commit is contained in:
commit
a9d31c8e42
@ -144,7 +144,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
|||||||
}
|
}
|
||||||
: {}),
|
: {}),
|
||||||
priority: meta.internalOptions.priority,
|
priority: meta.internalOptions.priority,
|
||||||
geolocation: meta.options.geolocation,
|
geolocation: meta.options.geolocation ?? meta.options.location,
|
||||||
mobile: meta.options.mobile,
|
mobile: meta.options.mobile,
|
||||||
timeout, // TODO: better timeout logic
|
timeout, // TODO: better timeout logic
|
||||||
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
|
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
|
||||||
@ -227,7 +227,7 @@ export async function scrapeURLWithFireEnginePlaywright(
|
|||||||
screenshot: meta.options.formats.includes("screenshot"),
|
screenshot: meta.options.formats.includes("screenshot"),
|
||||||
fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
|
fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
|
||||||
wait: meta.options.waitFor,
|
wait: meta.options.waitFor,
|
||||||
geolocation: meta.options.geolocation,
|
geolocation: meta.options.geolocation ?? meta.options.location,
|
||||||
|
|
||||||
timeout,
|
timeout,
|
||||||
};
|
};
|
||||||
@ -286,7 +286,7 @@ export async function scrapeURLWithFireEngineTLSClient(
|
|||||||
priority: meta.internalOptions.priority,
|
priority: meta.internalOptions.priority,
|
||||||
|
|
||||||
atsv: meta.internalOptions.atsv,
|
atsv: meta.internalOptions.atsv,
|
||||||
geolocation: meta.options.geolocation,
|
geolocation: meta.options.geolocation ?? meta.options.location,
|
||||||
disableJsDom: meta.internalOptions.v0DisableJsDom,
|
disableJsDom: meta.internalOptions.v0DisableJsDom,
|
||||||
|
|
||||||
timeout,
|
timeout,
|
||||||
|
@ -40,7 +40,7 @@ export function extractMetadata(
|
|||||||
const soup = load(html);
|
const soup = load(html);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
title = soup("title").text() || undefined;
|
title = soup("title").first().text().trim() || undefined;
|
||||||
description = soup('meta[name="description"]').attr("content") || undefined;
|
description = soup('meta[name="description"]').attr("content") || undefined;
|
||||||
|
|
||||||
// Assuming the language is part of the URL as per the regex pattern
|
// Assuming the language is part of the URL as per the regex pattern
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@mendable/firecrawl-js",
|
"name": "@mendable/firecrawl-js",
|
||||||
"version": "1.9.5",
|
"version": "1.9.7",
|
||||||
"description": "JavaScript SDK for Firecrawl API",
|
"description": "JavaScript SDK for Firecrawl API",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"types": "dist/index.d.ts",
|
"types": "dist/index.d.ts",
|
||||||
|
@ -259,7 +259,7 @@ export interface MapResponse {
|
|||||||
* Defines options for extracting information from URLs.
|
* Defines options for extracting information from URLs.
|
||||||
*/
|
*/
|
||||||
export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
|
export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
|
||||||
prompt: string;
|
prompt?: string;
|
||||||
schema?: LLMSchema;
|
schema?: LLMSchema;
|
||||||
systemPrompt?: string;
|
systemPrompt?: string;
|
||||||
allowExternalLinks?: boolean;
|
allowExternalLinks?: boolean;
|
||||||
@ -951,11 +951,13 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
|
|||||||
private ws: WebSocket;
|
private ws: WebSocket;
|
||||||
public data: FirecrawlDocument<undefined>[];
|
public data: FirecrawlDocument<undefined>[];
|
||||||
public status: CrawlStatusResponse["status"];
|
public status: CrawlStatusResponse["status"];
|
||||||
|
public id: string;
|
||||||
|
|
||||||
constructor(id: string, app: FirecrawlApp) {
|
constructor(id: string, app: FirecrawlApp) {
|
||||||
super();
|
super();
|
||||||
if(!WebSocket)
|
if(!WebSocket)
|
||||||
throw new FirecrawlError("WebSocket module failed to load. Your system might not support WebSocket.", 500);
|
throw new FirecrawlError("WebSocket module failed to load. Your system might not support WebSocket.", 500);
|
||||||
|
this.id = id;
|
||||||
this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
|
this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
|
||||||
this.status = "scraping";
|
this.status = "scraping";
|
||||||
this.data = [];
|
this.data = [];
|
||||||
@ -986,6 +988,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
|
|||||||
detail: {
|
detail: {
|
||||||
status: this.status,
|
status: this.status,
|
||||||
data: this.data,
|
data: this.data,
|
||||||
|
id: this.id,
|
||||||
},
|
},
|
||||||
}));
|
}));
|
||||||
} else if (msg.type === "error") {
|
} else if (msg.type === "error") {
|
||||||
@ -995,6 +998,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
|
|||||||
status: this.status,
|
status: this.status,
|
||||||
data: this.data,
|
data: this.data,
|
||||||
error: msg.error,
|
error: msg.error,
|
||||||
|
id: this.id,
|
||||||
},
|
},
|
||||||
}));
|
}));
|
||||||
} else if (msg.type === "catchup") {
|
} else if (msg.type === "catchup") {
|
||||||
@ -1002,12 +1006,18 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
|
|||||||
this.data.push(...(msg.data.data ?? []));
|
this.data.push(...(msg.data.data ?? []));
|
||||||
for (const doc of this.data) {
|
for (const doc of this.data) {
|
||||||
this.dispatchTypedEvent("document", new CustomEvent("document", {
|
this.dispatchTypedEvent("document", new CustomEvent("document", {
|
||||||
detail: doc,
|
detail: {
|
||||||
|
...doc,
|
||||||
|
id: this.id,
|
||||||
|
},
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
} else if (msg.type === "document") {
|
} else if (msg.type === "document") {
|
||||||
this.dispatchTypedEvent("document", new CustomEvent("document", {
|
this.dispatchTypedEvent("document", new CustomEvent("document", {
|
||||||
detail: msg.data,
|
detail: {
|
||||||
|
...msg.data,
|
||||||
|
id: this.id,
|
||||||
|
},
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1034,6 +1044,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
|
|||||||
status: this.status,
|
status: this.status,
|
||||||
data: this.data,
|
data: this.data,
|
||||||
error: "WebSocket error",
|
error: "WebSocket error",
|
||||||
|
id: this.id,
|
||||||
},
|
},
|
||||||
}));
|
}));
|
||||||
}).bind(this);
|
}).bind(this);
|
||||||
|
@ -13,7 +13,7 @@ import os
|
|||||||
|
|
||||||
from .firecrawl import FirecrawlApp # noqa
|
from .firecrawl import FirecrawlApp # noqa
|
||||||
|
|
||||||
__version__ = "1.6.4"
|
__version__ = "1.6.8"
|
||||||
|
|
||||||
# Define the logger for the Firecrawl project
|
# Define the logger for the Firecrawl project
|
||||||
logger: logging.Logger = logging.getLogger("firecrawl")
|
logger: logging.Logger = logging.getLogger("firecrawl")
|
||||||
|
@ -26,7 +26,7 @@ class FirecrawlApp:
|
|||||||
"""
|
"""
|
||||||
Parameters for the extract operation.
|
Parameters for the extract operation.
|
||||||
"""
|
"""
|
||||||
prompt: str
|
prompt: Optional[str] = None
|
||||||
schema_: Optional[Any] = pydantic.Field(None, alias='schema')
|
schema_: Optional[Any] = pydantic.Field(None, alias='schema')
|
||||||
system_prompt: Optional[str] = None
|
system_prompt: Optional[str] = None
|
||||||
allow_external_links: Optional[bool] = False
|
allow_external_links: Optional[bool] = False
|
||||||
@ -704,15 +704,15 @@ class CrawlWatcher:
|
|||||||
async def _handle_message(self, msg: Dict[str, Any]):
|
async def _handle_message(self, msg: Dict[str, Any]):
|
||||||
if msg['type'] == 'done':
|
if msg['type'] == 'done':
|
||||||
self.status = 'completed'
|
self.status = 'completed'
|
||||||
self.dispatch_event('done', {'status': self.status, 'data': self.data})
|
self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
|
||||||
elif msg['type'] == 'error':
|
elif msg['type'] == 'error':
|
||||||
self.status = 'failed'
|
self.status = 'failed'
|
||||||
self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error']})
|
self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
|
||||||
elif msg['type'] == 'catchup':
|
elif msg['type'] == 'catchup':
|
||||||
self.status = msg['data']['status']
|
self.status = msg['data']['status']
|
||||||
self.data.extend(msg['data'].get('data', []))
|
self.data.extend(msg['data'].get('data', []))
|
||||||
for doc in self.data:
|
for doc in self.data:
|
||||||
self.dispatch_event('document', doc)
|
self.dispatch_event('document', {'data': doc, 'id': self.id})
|
||||||
elif msg['type'] == 'document':
|
elif msg['type'] == 'document':
|
||||||
self.data.append(msg['data'])
|
self.data.append(msg['data'])
|
||||||
self.dispatch_event('document', msg['data'])
|
self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
|
||||||
|
Loading…
x
Reference in New Issue
Block a user