From 34b40f6a23abe5fe15b935828f3e344f8e3ec2f4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 18 Jan 2025 17:17:42 -0300 Subject: [PATCH 1/3] Nick: --- apps/api/src/controllers/v1/scrape.ts | 3 +- apps/api/src/controllers/v1/types.ts | 36 ++++++++++++++++--- .../scrapeURL/transformers/llmExtract.ts | 7 +++- 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 1ea28995..f4bc45b5 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -33,6 +33,7 @@ export async function scrapeController( basePriority: 10, }); + await addScrapeJob( { url: req.body.url, @@ -96,7 +97,7 @@ export async function scrapeController( // Don't bill if we're early returning return; } - if (req.body.extract && req.body.formats.includes("extract")) { + if (req.body.extract && req.body.formats.includes("extract") ) { creditsToBeBilled = 5; } diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index ee141625..5573fbaa 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -125,6 +125,7 @@ export const scrapeOptions = z "screenshot", "screenshot@fullPage", "extract", + "json" ]) .array() .optional() @@ -139,7 +140,10 @@ export const scrapeOptions = z onlyMainContent: z.boolean().default(true), timeout: z.number().int().positive().finite().safe().optional(), waitFor: z.number().int().nonnegative().finite().safe().default(0), + // Deprecate this to jsonOptions extract: extractOptions.optional(), + // New + jsonOptions: extractOptions.optional(), mobile: z.boolean().default(false), parsePDF: z.boolean().default(true), actions: actionsSchema.optional(), @@ -242,20 +246,43 @@ export const scrapeRequestSchema = scrapeOptions (obj) => { const hasExtractFormat = obj.formats?.includes("extract"); const hasExtractOptions = obj.extract !== undefined; + const hasJsonFormat = obj.formats?.includes("json"); + const hasJsonOptions = obj.jsonOptions !== undefined; return ( (hasExtractFormat && hasExtractOptions) || - (!hasExtractFormat && !hasExtractOptions) + (!hasExtractFormat && !hasExtractOptions) || + (hasJsonFormat && hasJsonOptions) || + (!hasJsonFormat && !hasJsonOptions) ); }, { message: - "When 'extract' format is specified, 'extract' options must be provided, and vice versa", + "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa", }, ) .transform((obj) => { - if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) { - return { ...obj, timeout: 60000 }; + // Handle timeout + if ((obj.formats?.includes("extract") || obj.extract || obj.formats?.includes("json") || obj.jsonOptions) && !obj.timeout) { + obj = { ...obj, timeout: 60000 }; } + + if(obj.formats?.includes("json")) { + obj.formats.push("extract"); + } + + // Convert JSON options to extract options if needed + if (obj.jsonOptions && !obj.extract) { + obj = { + ...obj, + extract: { + prompt: obj.jsonOptions.prompt, + systemPrompt: obj.jsonOptions.systemPrompt, + schema: obj.jsonOptions.schema, + mode: "llm" + } + }; + } + return obj; }); @@ -410,6 +437,7 @@ export type Document = { links?: string[]; screenshot?: string; extract?: any; + json?: any; warning?: string; actions?: { screenshots?: string[]; diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 5fdc3b75..be97036f 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -233,7 +233,12 @@ export async function performLLMExtract( document.markdown, document.warning, ); - document.extract = extract; + + if (meta.options.formats.includes("json")) { + document.json = extract; + } else { + document.extract = extract; + } document.warning = warning; } From b030a1c5dae313a046ec80a452df783a832b4df7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 18 Jan 2025 17:23:21 -0300 Subject: [PATCH 2/3] Nick: extract to json in the sdks as well --- README.md | 20 ++++++------- apps/js-sdk/firecrawl/src/index.ts | 41 +++++++++++++++++++++++++- apps/python-sdk/firecrawl/firecrawl.py | 12 ++++++++ 3 files changed, 62 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 658fb8be..41c97f82 100644 --- a/README.md +++ b/README.md @@ -250,8 +250,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \ -H 'Authorization: Bearer YOUR_API_KEY' \ -d '{ "url": "https://www.mendable.ai/", - "formats": ["extract"], - "extract": { + "formats": ["json"], + "jsonOptions": { "schema": { "type": "object", "properties": { @@ -296,7 +296,7 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \ "ogSiteName": "Mendable", "sourceURL": "https://mendable.ai/" }, - "llm_extraction": { + "json": { "company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to", "supports_sso": true, "is_open_source": false, @@ -316,8 +316,8 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \ -H 'Authorization: Bearer YOUR_API_KEY' \ -d '{ "url": "https://docs.firecrawl.dev/", - "formats": ["extract"], - "extract": { + "formats": ["json"], + "jsonOptions": { "prompt": "Extract the company mission from the page." } }' @@ -447,12 +447,12 @@ class TopArticlesSchema(BaseModel): top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories") data = app.scrape_url('https://news.ycombinator.com', { - 'formats': ['extract'], - 'extract': { + 'formats': ['json'], + 'jsonOptions': { 'schema': TopArticlesSchema.model_json_schema() } }) -print(data["extract"]) +print(data["json"]) ``` ## Using the Node SDK @@ -526,10 +526,10 @@ const schema = z.object({ }); const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", { - extractorOptions: { extractionSchema: schema }, + jsonOptions: { extractionSchema: schema }, }); -console.log(scrapeResult.data["llm_extraction"]); +console.log(scrapeResult.data["json"]); ``` ## Open Source vs Cloud Offering diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 1d1715ed..8e7b58e9 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -78,7 +78,7 @@ export interface FirecrawlDocument; includeTags?: string[]; excludeTags?: string[]; @@ -127,6 +127,11 @@ export interface ScrapeParams Date: Sat, 18 Jan 2025 17:37:11 -0300 Subject: [PATCH 3/3] Nick: --- apps/js-sdk/firecrawl/package.json | 2 +- apps/python-sdk/firecrawl/__init__.py | 2 +- apps/python-sdk/pyproject.toml | 2 +- apps/python-sdk/setup.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index bd527ac3..3220531a 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.14.1", + "version": "1.15.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index f0d201dc..225b8591 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.9.0" +__version__ = "1.10.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/pyproject.toml b/apps/python-sdk/pyproject.toml index 67082d5e..9d90d4b1 100644 --- a/apps/python-sdk/pyproject.toml +++ b/apps/python-sdk/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] dynamic = ["version"] -name = "firecrawl-py" +name = "firecrawl" description = "Python SDK for Firecrawl API" readme = {file="README.md", content-type = "text/markdown"} requires-python = ">=3.8" diff --git a/apps/python-sdk/setup.py b/apps/python-sdk/setup.py index 8a67d1fd..baaf94bc 100644 --- a/apps/python-sdk/setup.py +++ b/apps/python-sdk/setup.py @@ -17,7 +17,7 @@ def get_version(): setup( - name="firecrawl-py", + name="firecrawl", version=get_version(), url="https://github.com/mendableai/firecrawl", author="Mendable.ai",