diff --git a/README.md b/README.md
index 22a2c12f..89ed0127 100644
--- a/README.md
+++ b/README.md
@@ -229,20 +229,19 @@ Response will be an ordered list from the most relevant to the least relevant.
 }
 ```

-### LLM Extraction (v0) (Beta)
+### LLM Extraction (Beta)

 Used to extract structured data from scraped pages.

 ```bash
-curl -X POST https://api.firecrawl.dev/v0/scrape \
+curl -X POST https://api.firecrawl.dev/v1/scrape \
     -H 'Content-Type: application/json' \
     -H 'Authorization: Bearer YOUR_API_KEY' \
     -d '{
      "url": "https://www.mendable.ai/",
-      "extractorOptions": {
-        "mode": "llm-extraction",
-        "extractionPrompt": "Based on the information on the page, extract the information from the schema. ",
-        "extractionSchema": {
+      "formats": ["extract"],
+      "extract": {
+        "schema": {
          "type": "object",
          "properties": {
            "company_mission": {
@@ -296,6 +295,23 @@ curl -X POST https://api.firecrawl.dev/v0/scrape \
 }
 ```

+### Extracting without a schema (New)
+
+You can now extract without a schema by just passing a `prompt` to the endpoint. The LLM chooses the structure of the data.
+
+```bash
+curl -X POST https://api.firecrawl.dev/v1/scrape \
+    -H 'Content-Type: application/json' \
+    -H 'Authorization: Bearer YOUR_API_KEY' \
+    -d '{
+      "url": "https://docs.firecrawl.dev/",
+      "formats": ["extract"],
+      "extract": {
+        "prompt": "Extract the company mission from the page."
+      }
+    }'
+```
+
 ### Search (v0) (Beta)

diff --git a/apps/api/package.json b/apps/api/package.json
index 0ece960a..bac13e79 100644
--- a/apps/api/package.json
+++ b/apps/api/package.json
@@ -94,7 +94,7 @@
     "moment": "^2.29.4",
     "mongoose": "^8.4.4",
     "natural": "^7.0.7",
-    "openai": "^4.52.2",
+    "openai": "^4.57.0",
     "pdf-parse": "^1.1.1",
     "pos": "^0.4.2",
     "posthog-node": "^4.0.1",
diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml
index 727c4ed4..2762a84c 100644
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@@ -124,7 +124,7 @@ importers:
        version: 0.0.28
      langchain:
        specifier: ^0.2.8
-        version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
+        version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
      languagedetect:
        specifier: ^2.0.0
        version: 2.0.0
@@ -147,8 +147,8 @@ importers:
        specifier: ^7.0.7
        version: 7.0.7(socks@2.8.3)
      openai:
-        specifier: ^4.52.2
-        version: 4.52.2
+        specifier: ^4.57.0
+        version: 4.57.0(zod@3.23.8)
      pdf-parse:
        specifier: ^1.1.1
        version: 1.1.1
@@ -3733,9 +3733,14 @@ packages:
  openai@3.3.0:
    resolution: {integrity: sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ==}

-  openai@4.52.2:
-    resolution: {integrity: sha512-mMc0XgFuVSkcm0lRIi8zaw++otC82ZlfkCur1qguXYWPETr/+ZwL9A/vvp3YahX+shpaT6j03dwsmUyLAfmEfg==}
+  openai@4.57.0:
+    resolution: {integrity: sha512-JnwBSIYqiZ3jYjB5f2in8hQ0PRA092c6m+/6dYB0MzK0BEbn+0dioxZsPLBm5idJbg9xzLNOiGVm2OSuhZ+BdQ==}
    hasBin: true
+    peerDependencies:
+      zod: ^3.23.8
+    peerDependenciesMeta:
+      zod:
+        optional: true

  openapi-types@12.1.3:
    resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==}
@@ -5319,13 +5324,13 @@ snapshots:
  '@js-sdsl/ordered-map@4.4.2': {}

-  '@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)':
+  '@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))':
    dependencies:
      ansi-styles: 5.2.0
      camelcase: 6.3.0
      decamelize: 1.2.0
      js-tiktoken: 1.0.12
-      langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
+      langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
      ml-distance: 4.0.1
      mustache: 4.2.0
      p-queue: 6.6.2
@@ -5337,20 +5342,20 @@ snapshots:
      - langchain
      - openai

-  '@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
+  '@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
    dependencies:
-      '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
+      '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
      js-tiktoken: 1.0.12
-      openai: 4.52.2
+      openai: 4.57.0(zod@3.23.8)
      zod: 3.23.8
      zod-to-json-schema: 3.23.1(zod@3.23.8)
    transitivePeerDependencies:
      - encoding
      - langchain

-  '@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)':
+  '@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))':
    dependencies:
-      '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
+      '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
      js-tiktoken: 1.0.12
    transitivePeerDependencies:
      - langchain
@@ -8487,17 +8492,17 @@ snapshots:
  kleur@3.0.3: {}

-  langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
+  langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
    dependencies:
-      '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
-      '@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
-      '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
+      '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
+      '@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
+      '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
      binary-extensions: 2.3.0
      js-tiktoken: 1.0.12
      js-yaml: 4.1.0
      jsonpointer: 5.0.1
      langchainhub: 0.0.11
-      langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
+      langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
      ml-distance: 4.0.1
      openapi-types: 12.1.3
      p-retry: 4.6.2
@@ -8524,7 +8529,7 @@ snapshots:
  langchainhub@0.0.11: {}

-  langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2):
+  langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)):
    dependencies:
      '@types/uuid': 9.0.8
      commander: 10.0.1
@@ -8533,9 +8538,9 @@
      p-retry: 4.6.2
      uuid: 9.0.1
    optionalDependencies:
-      '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
-      langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
-      openai: 4.52.2
+      '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
+      langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
+      openai: 4.57.0(zod@3.23.8)

  languagedetect@2.0.0: {}

@@ -8928,16 +8933,19 @@ snapshots:
    transitivePeerDependencies:
      - debug

-  openai@4.52.2:
+  openai@4.57.0(zod@3.23.8):
    dependencies:
      '@types/node': 18.19.39
      '@types/node-fetch': 2.6.11
+      '@types/qs': 6.9.15
      abort-controller: 3.0.0
      agentkeepalive: 4.5.0
      form-data-encoder: 1.7.2
      formdata-node: 4.4.1
      node-fetch: 2.7.0
-      web-streams-polyfill: 3.3.3
+      qs: 6.12.2
+    optionalDependencies:
+      zod: 3.23.8
    transitivePeerDependencies:
      - encoding

diff --git a/apps/api/requests.http b/apps/api/requests.http
index 5d55b481..3e7bd2b7 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -1,6 +1,6 @@
 ### Crawl Website
 POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673
+Authorization: Bearer fc-
 content-type: application/json

 {
@@ -9,7 +9,7 @@ content-type: application/json

 ### Check Job Status
 GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
-Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673
+Authorization: Bearer fc-

 ### Check Job Status
diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts
index d2614d4d..40df5021 100644
--- a/apps/api/src/controllers/v0/scrape.ts
+++ b/apps/api/src/controllers/v0/scrape.ts
@@ -254,13 +254,26 @@ export async function scrapeController(req: Request, res: Response) {
      }
    }
  }
+
+  let doc = result.data;
+  if (!pageOptions || !pageOptions.includeRawHtml) {
+    if (doc && doc.rawHtml) {
+      delete doc.rawHtml;
+    }
+  }
+
+  if(pageOptions && pageOptions.includeExtract) {
+    if(!pageOptions.includeMarkdown && doc && doc.markdown) {
+      delete doc.markdown;
+    }
+  }

  logJob({
    job_id: jobId,
    success: result.success,
    message: result.error,
    num_docs: 1,
-    docs: [result.data],
+    docs: [doc],
    time_taken: timeTakenInSeconds,
    team_id: team_id,
    mode: "scrape",
diff --git a/apps/api/src/controllers/v1/scrape-status.ts b/apps/api/src/controllers/v1/scrape-status.ts
new file mode 100644
index 00000000..5e0aecb6
--- /dev/null
+++ b/apps/api/src/controllers/v1/scrape-status.ts
@@ -0,0 +1,38 @@
+import { Response } from "express";
+import { supabaseGetJobByIdOnlyData } from "../../lib/supabase-jobs";
+import { scrapeStatusRateLimiter } from "../../services/rate-limiter";
+
+export async function scrapeStatusController(req: any, res: any) {
+  try {
+    const rateLimiter = scrapeStatusRateLimiter;
+    const incomingIP = (req.headers["x-forwarded-for"] ||
+      req.socket.remoteAddress) as string;
+    const iptoken = incomingIP;
+    await rateLimiter.consume(iptoken);
+
+    const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
+
+    if(job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
+      return res.status(403).json({
+        success: false,
+        error: "You are not allowed to access this resource.",
+      });
+    }
+    return res.status(200).json({
+      success: true,
+      data: job?.docs[0],
+    });
+  } catch (error) {
+    if (error instanceof Error && error.message == "Too Many Requests") {
+      return res.status(429).json({
+        success: false,
+        error: "Rate limit exceeded. Please try again later.",
+      });
+    } else {
+      return res.status(500).json({
+        success: false,
+        error: "An unexpected error occurred.",
+      });
+    }
+  }
+}
diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts
index b68bda2d..c573e100 100644
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@@ -1,36 +1,58 @@
 import { Request, Response } from "express";
-import { Logger } from '../../lib/logger';
-import { Document, legacyDocumentConverter, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
+import { Logger } from "../../lib/logger";
+import {
+  Document,
+  legacyDocumentConverter,
+  legacyExtractorOptions,
+  legacyScrapeOptions,
+  RequestWithAuth,
+  ScrapeRequest,
+  scrapeRequestSchema,
+  ScrapeResponse,
+} from "./types";
 import { billTeam } from "../../services/billing/credit_billing";
-import { v4 as uuidv4 } from 'uuid';
+import { v4 as uuidv4 } from "uuid";
 import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
 import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
 import { logJob } from "../../services/logging/log_job";
 import { getJobPriority } from "../../lib/job-priority";
 import { PlanType } from "../../types";

-export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response) {
+export async function scrapeController(
+  req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
+  res: Response
+) {
  req.body = scrapeRequestSchema.parse(req.body);
  let earlyReturn = false;

  const origin = req.body.origin;
  const timeout = req.body.timeout;
  const pageOptions = legacyScrapeOptions(req.body);
+  const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
  const jobId = uuidv4();

  const startTime = new Date().getTime();
-  const jobPriority = await getJobPriority({plan: req.auth.plan as PlanType, team_id: req.auth.team_id, basePriority: 10})
-
-  const job = await addScrapeJob({
-    url: req.body.url,
-    mode: "single_urls",
-    crawlerOptions: {},
+  const jobPriority = await getJobPriority({
+    plan: req.auth.plan as PlanType,
    team_id: req.auth.team_id,
-    pageOptions,
-    extractorOptions: {},
-    origin: req.body.origin,
-    is_scrape: true,
-  }, {}, jobId, jobPriority);
+    basePriority: 10,
+  });
+
+  const job = await addScrapeJob(
+    {
+      url: req.body.url,
+      mode: "single_urls",
+      crawlerOptions: {},
+      team_id: req.auth.team_id,
+      pageOptions,
+      extractorOptions,
+      origin: req.body.origin,
+      is_scrape: true,
+    },
+    {},
+    jobId,
+    jobPriority
+  );

  let doc: any | undefined;
  try {
@@ -45,7 +67,11 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
    } else {
      return res.status(500).json({
        success: false,
-        error: "Internal server error",
+        error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
+          extractorOptions && extractorOptions.mode !== "markdown"
+            ? " - Could be due to LLM parsing issues"
+            : ""
+        }`,
      });
    }
  }
@@ -57,7 +83,7 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
    return res.status(200).json({
      success: true,
      warning: "No page found",
-      data: doc
+      data: doc,
    });
  }

@@ -66,25 +92,41 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
  const endTime = new Date().getTime();
  const timeTakenInSeconds = (endTime - startTime) / 1000;

-  const numTokens = (doc && doc.markdown) ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") : 0;
+  const numTokens =
+    doc && doc.markdown
+      ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
+      : 0;

  let creditsToBeBilled = 1; // Assuming 1 credit per document
  if (earlyReturn) {
    // Don't bill if we're early returning
    return;
  }
+  if(req.body.extract && req.body.formats.includes("extract")) {
+    creditsToBeBilled = 50;
+  }

-  const billingResult = await billTeam(
-    req.auth.team_id,
-    creditsToBeBilled
-  );
+  const billingResult = await billTeam(req.auth.team_id, creditsToBeBilled);
  if (!billingResult.success) {
    return res.status(402).json({
      success: false,
-      error: "Failed to bill team. Insufficient credits or subscription not found.",
+      error:
+        "Failed to bill team. Insufficient credits or subscription not found.",
    });
  }

+  if (!pageOptions || !pageOptions.includeRawHtml) {
+    if (doc && doc.rawHtml) {
+      delete doc.rawHtml;
+    }
+  }
+
+  if(pageOptions && pageOptions.includeExtract) {
+    if(!pageOptions.includeMarkdown && doc && doc.markdown) {
+      delete doc.markdown;
+    }
+  }
+
  logJob({
    job_id: jobId,
    success: true,
@@ -97,7 +139,7 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
    url: req.body.url,
    crawlerOptions: {},
    pageOptions: pageOptions,
-    origin: origin, 
+    origin: origin,
    extractor_options: { mode: "markdown" },
    num_tokens: numTokens,
  });
@@ -106,4 +148,4 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
    success: true,
    data: legacyDocumentConverter(doc),
  });
-}
\ No newline at end of file
+}
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 12d1c501..4393cd29 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -1,7 +1,7 @@
 import { Request, Response } from "express";
 import { z } from "zod";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
-import { PageOptions } from "../../lib/entities";
+import { ExtractorOptions, PageOptions } from "../../lib/entities";
 import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
 import { PlanType } from "../../types";

@@ -11,7 +11,8 @@ export type Format =
  | "rawHtml"
  | "links"
  | "screenshot"
-  | "screenshot@fullPage";
+  | "screenshot@fullPage"
+  | "extract";

 export const url = z.preprocess(
  (x) => {
@@ -40,6 +41,15 @@ export const url = z.preprocess(
 const strictMessage =
  "Unrecognized key in body -- please review the v1 API documentation for request body changes";

+export const extractOptions = z.object({
+  mode: z.enum(["llm"]).default("llm"),
+  schema: z.any().optional(),
+  systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema. Try to extract all the fields even those that might not be marked as required."),
+  prompt: z.string().optional()
+}).strict(strictMessage);
+
+export type ExtractOptions = z.infer<typeof extractOptions>;
+
 export const scrapeOptions = z.object({
  formats: z
    .enum([
@@ -49,6 +59,7 @@ export const scrapeOptions = z.object({
      "links",
      "screenshot",
      "screenshot@fullPage",
+      "extract"
    ])
    .array()
    .optional()
@@ -57,17 +68,33 @@ export const scrapeOptions = z.object({
  includeTags: z.string().array().optional(),
  excludeTags: z.string().array().optional(),
  onlyMainContent: z.boolean().default(true),
-  timeout: z.number().int().positive().finite().safe().default(30000), // default?
+  timeout: z.number().int().positive().finite().safe().default(30000),
  waitFor: z.number().int().nonnegative().finite().safe().default(0),
+  extract: extractOptions.optional(),
  parsePDF: z.boolean().default(true),
-}).strict(strictMessage);
+}).strict(strictMessage)
+

 export type ScrapeOptions = z.infer<typeof scrapeOptions>;

 export const scrapeRequestSchema = scrapeOptions.extend({
  url,
  origin: z.string().optional().default("api"),
-}).strict(strictMessage);
+}).strict(strictMessage).refine(
+  (obj) => {
+    const hasExtractFormat = obj.formats?.includes("extract");
+    const hasExtractOptions = obj.extract !== undefined;
+    return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
+  },
+  {
+    message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
+  }
+).transform((obj) => {
+  if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
+    return { ...obj, timeout: 60000 };
+  }
+  return obj;
+});

 // export type ScrapeRequest = {
 //   url: string;
@@ -118,6 +145,13 @@ export const crawlRequestSchema = crawlerOptions.extend({
 //   scrapeOptions?: Exclude<ScrapeRequest, "url">;
 // };

+// export type ExtractorOptions = {
+//   mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
+//   extractionPrompt?: string;
+//   extractionSchema?: Record<string, any>;
+// }
+
+
 export type CrawlRequest = z.infer<typeof crawlRequestSchema>;

 export const mapRequestSchema = crawlerOptions.extend({
@@ -126,7 +160,7 @@ export const mapRequestSchema = crawlerOptions.extend({
  includeSubdomains: z.boolean().default(true),
  search: z.string().optional(),
  ignoreSitemap: z.boolean().default(false),
-  limit: z.number().min(1).max(50).default(5000).optional(),
+  limit: z.number().min(1).max(5000).default(5000).optional(),
 }).strict(strictMessage);

 // export type MapRequest = {
@@ -138,6 +172,7 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;

 export type Document = {
  markdown?: string;
+  extract?: string;
  html?: string;
  rawHtml?: string;
  links?: string[];
@@ -280,6 +315,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
    includeMarkdown: x.formats.includes("markdown"),
    includeHtml: x.formats.includes("html"),
    includeRawHtml: x.formats.includes("rawHtml"),
+    includeExtract: x.formats.includes("extract"),
    onlyIncludeTags: x.includeTags,
    removeTags: x.excludeTags,
    onlyMainContent: x.onlyMainContent,
@@ -291,6 +327,15 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
  };
 }

+export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
+  return {
+    mode: x.mode ? "llm-extraction" : "markdown",
+    extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
+    extractionSchema: x.schema,
+    userPrompt: x.prompt ?? "",
+  };
+}
+
 export function legacyDocumentConverter(doc: any): Document {
  if (doc === null || doc === undefined) return doc;

@@ -311,6 +356,7 @@ export function legacyDocumentConverter(doc: any): Document {
    links: doc.linksOnPage,
    rawHtml: doc.rawHtml,
    html: doc.html,
+    extract: doc.llm_extraction,
    screenshot: doc.screenshot ?? doc.fullPageScreenshot,
    metadata: {
      ...doc.metadata,
diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
index af8b0bb1..d05f9bd7 100644
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -15,7 +15,8 @@ export async function generateCompletions(
  // const schema = zodToJsonSchema(options.schema)

  const schema = extractionOptions.extractionSchema;
-  const prompt = extractionOptions.extractionPrompt;
+  const systemPrompt = extractionOptions.extractionPrompt;
+  const prompt = extractionOptions.userPrompt;

  const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider

@@ -24,30 +25,35 @@ export async function generateCompletions(
    switch (switchVariable) {
      case "openAI":
        const llm = new OpenAI();
-        try{
-          const completionResult = await generateOpenAICompletions({
-            client: llm,
-            document: document,
-            schema: schema,
-            prompt: prompt,
-            mode: mode,
-          });
-          // Validate the JSON output against the schema using AJV
-          const validate = ajv.compile(schema);
-          if (!validate(completionResult.llm_extraction)) {
-            //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
-            throw new Error(
-              `JSON parsing error(s): ${validate.errors
-                ?.map((err) => err.message)
-                .join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
-            );
-          }
+        try {
+          const completionResult = await generateOpenAICompletions({
+            client: llm,
+            document: document,
+            schema: schema,
+            prompt: prompt,
+            systemPrompt: systemPrompt,
+            mode: mode,
+          });
+          // Validate the JSON output against the schema using AJV
+          if (schema) {
+            const validate = ajv.compile(schema);
+            if (!validate(completionResult.llm_extraction)) {
+              //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
+              throw new Error(
+                `JSON parsing error(s): ${validate.errors
+                  ?.map((err) => err.message)
+                  .join(
+                    ", "
+                  )}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
+              );
+            }
+          }

-          return completionResult;
-        } catch (error) {
-          Logger.error(`Error generating completions: ${error}`);
-          throw error;
-        }
+          return completionResult;
+        } catch (error) {
+          Logger.error(`Error generating completions: ${error}`);
+          throw error;
+        }
      default:
        throw new Error("Invalid client");
    }
diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index 8ca6bbd4..23147b12 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -16,7 +16,6 @@ function prepareOpenAIDoc(
  document: Document,
  mode: "markdown" | "raw-html"
 ): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
-
  let markdown = document.markdown;

  let extractionTarget = document.markdown;
@@ -33,34 +32,32 @@ function prepareOpenAIDoc(
    // );
  }

-
-
-
  // count number of tokens
  const numTokens = numTokensFromString(extractionTarget, "gpt-4");
  if (numTokens > maxTokens) {
    // trim the document to the maximum number of tokens, tokens != characters
-    extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));
+    extractionTarget = extractionTarget.slice(0, maxTokens * modifier);
  }
-
  return [[{ type: "text", text: extractionTarget }], numTokens];
 }

 export async function generateOpenAICompletions({
  client,
-  model = process.env.MODEL_NAME || "gpt-4o",
+  model = process.env.MODEL_NAME || "gpt-4o-mini",
  document,
  schema, //TODO - add zod dynamic type checking
-  prompt = defaultPrompt,
+  systemPrompt = defaultPrompt,
+  prompt,
  temperature,
-  mode
+  mode,
 }: {
  client: OpenAI;
  model?: string;
  document: Document;
  schema: any; // This should be replaced with a proper Zod schema type when available
  prompt?: string;
+  systemPrompt?: string;
  temperature?: number;
  mode: "markdown" | "raw-html";
 }): Promise<Document> {
@@ -70,45 +67,79 @@ export async function generateOpenAICompletions({
  if (preparedDoc === null) {
    return {
      ...document,
-      warning: "LLM extraction was not performed since the document's content is empty or missing.",
+      warning:
+        "LLM extraction was not performed since the document's content is empty or missing.",
    };
  }

  const [content, numTokens] = preparedDoc;

-  const completion = await openai.chat.completions.create({
-    model,
-    messages: [
-      {
-        role: "system",
-        content: prompt,
-      },
-      { role: "user", content },
-    ],
-    tools: [
-      {
-        type: "function",
-        function: {
-          name: "extract_content",
-          description: "Extracts the content from the given webpage(s)",
-          parameters: schema,
-        },
-      },
-    ],
-    tool_choice: { "type": "function", "function": {"name": "extract_content"}},
-    temperature,
-  });
+  let completion;
+  let llmExtraction;
+  if (prompt && !schema) {
+    const jsonCompletion = await openai.chat.completions.create({
+      model,
+      messages: [
+        {
+          role: "system",
+          content: systemPrompt,
+        },
+        { role: "user", content },
+        {
+          role: "user",
+          content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
+        },
+      ],
+      response_format: { type: "json_object" },
+      temperature,
+    });

-  const c = completion.choices[0].message.tool_calls[0].function.arguments;
+    try {
+      llmExtraction = JSON.parse(
+        jsonCompletion.choices[0].message.content.trim()
+      );
+    } catch (e) {
+      throw new Error("Invalid JSON");
+    }
+  } else {
+    completion = await openai.chat.completions.create({
+      model,
+      messages: [
+        {
+          role: "system",
+          content: systemPrompt,
+        },
+        { role: "user", content },
+      ],
+      tools: [
+        {
+          type: "function",
+          function: {
+            name: "extract_content",
+            description: "Extracts the content from the given webpage(s)",
+            parameters: schema,
+          },
+        },
+      ],
+      tool_choice: { type: "function", function: { name: "extract_content" } },
+      temperature,
+    });
+    const c = completion.choices[0].message.tool_calls[0].function.arguments;

-  // Extract the LLM extraction content from the completion response
-  const llmExtraction = JSON.parse(c);
+    // Extract the LLM extraction content from the completion response
+    try {
+      llmExtraction = JSON.parse(c);
+    } catch (e) {
+      throw new Error("Invalid JSON");
+    }
+  }

  // Return the document with the LLM extraction content added
  return {
    ...document,
    llm_extraction: llmExtraction,
-    warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
+    warning:
+      numTokens > maxTokens
+        ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
+        : undefined,
  };
 }
-
diff --git a/apps/api/src/lib/custom-error.ts b/apps/api/src/lib/custom-error.ts
index 20a01cb6..2ffe52e9 100644
--- a/apps/api/src/lib/custom-error.ts
+++ b/apps/api/src/lib/custom-error.ts
@@ -19,3 +19,4 @@ export class CustomError extends Error {
    Object.setPrototypeOf(this, CustomError.prototype);
  }
 }
+
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index b4b26040..dfd17c63 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -12,6 +12,7 @@ export interface Progress {

 export type PageOptions = {
  includeMarkdown?: boolean;
+  includeExtract?: boolean;
  onlyMainContent?: boolean;
  includeHtml?: boolean;
  includeRawHtml?: boolean;
@@ -35,6 +36,7 @@ export type ExtractorOptions = {
  mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
  extractionPrompt?: string;
  extractionSchema?: Record<string, any>;
+  userPrompt?: string;
 }

 export type SearchOptions = {
diff --git a/apps/api/src/lib/supabase-jobs.ts b/apps/api/src/lib/supabase-jobs.ts
index 8ff46a23..cda6fd46 100644
--- a/apps/api/src/lib/supabase-jobs.ts
+++ b/apps/api/src/lib/supabase-jobs.ts
@@ -37,3 +37,22 @@ export const supabaseGetJobsById = async (jobIds: string[]) => {

  return data;
 };
+
+
+export const supabaseGetJobByIdOnlyData = async (jobId: string) => {
+  const { data, error } = await supabase_service
+    .from("firecrawl_jobs")
+    .select("docs, team_id")
+    .eq("job_id", jobId)
+    .single();
+
+  if (error) {
+    return null;
+  }
+
+  if (!data) {
+    return null;
+  }
+
+  return data;
+};
\ No newline at end of file
diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index 27da0a1a..9dcbf111 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -15,6 +15,7 @@ import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { crawlCancelController } from "../controllers/v1/crawl-cancel";
 import { Logger } from "../lib/logger";
+import { scrapeStatusController } from "../controllers/v1/scrape-status";
 // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
 // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
 // import { searchController } from "../../src/controllers/v1/search";
@@ -124,6 +125,11 @@ v1Router.get(
  wrap(crawlStatusController)
 );

+v1Router.get(
+  "/scrape/:jobId",
+  wrap(scrapeStatusController)
+);
+
 v1Router.ws(
  "/crawl/:jobId",
  crawlStatusWSController
diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
index c4c7de65..02c8a7e0 100644
--- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
+++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
@@ -31,7 +31,6 @@ it('should return a list of links on the firecrawl.ai page', async () => {

  // Check if the result contains a list of links
  expect(result.linksOnPage).toBeDefined();
-  console.log({result});
  expect(Array.isArray(result.linksOnPage)).toBe(true);
  expect(result.linksOnPage.length).toBeGreaterThan(0);
  expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 44a90b85..fc828224 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -305,26 +305,21 @@ export class WebScraperDataProvider {
    }

    // documents = await this.applyImgAltText(documents);
-    if (
-      (this.extractorOptions.mode === "llm-extraction" ||
-        this.extractorOptions.mode === "llm-extraction-from-markdown") &&
-      this.mode === "single_urls"
-    ) {
-      documents = await generateCompletions(
-        documents,
-        this.extractorOptions,
-        "markdown"
-      );
-    }
-    if (
-      this.extractorOptions.mode === "llm-extraction-from-raw-html" &&
-      this.mode === "single_urls"
-    ) {
-      documents = await generateCompletions(
-        documents,
-        this.extractorOptions,
-        "raw-html"
-      );
+    if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
+      const extractionMode = this.extractorOptions?.mode ?? "markdown";
+      const completionMode = extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
+
+      if (
+        extractionMode === "llm-extraction" ||
+        extractionMode === "llm-extraction-from-markdown" ||
+        extractionMode === "llm-extraction-from-raw-html"
+      ) {
+        documents = await generateCompletions(
+          documents,
+          this.extractorOptions,
+          completionMode
+        );
+      }
    }
    return documents.concat(pdfDocuments).concat(docxDocuments);
  }
@@ -588,6 +583,7 @@ export class WebScraperDataProvider {
      removeTags: options.pageOptions?.removeTags ?? [],
      includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
      includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
+      includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false,
      waitFor: options.pageOptions?.waitFor ?? undefined,
      headers: options.pageOptions?.headers ?? undefined,
      includeLinks: options.pageOptions?.includeLinks ?? true,
@@ -617,6 +613,8 @@ export class WebScraperDataProvider {
    this.priority = options.priority;
    this.teamId = options.teamId ?? null;

+
+
    // make sure all urls start with https://
    this.urls = this.urls.map((url) => {
      if (!url.trim().startsWith("http")) {
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index bdcd62cd..adf7e53c 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -130,6 +130,7 @@ export async function scrapSingleUrl(
 ): Promise<Document> {
  pageOptions = {
    includeMarkdown: pageOptions.includeMarkdown ?? true,
+    includeExtract: pageOptions.includeExtract ?? false,
    onlyMainContent: pageOptions.onlyMainContent ?? false,
    includeHtml: pageOptions.includeHtml ?? false,
    includeRawHtml: pageOptions.includeRawHtml ?? false,
@@ -388,11 +389,11 @@ export async function scrapSingleUrl(
    if (screenshot && screenshot.length > 0) {
      document = {
        content: text,
-        markdown: pageOptions.includeMarkdown ? text : undefined,
+        markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
        html: pageOptions.includeHtml ? html : undefined,
        rawHtml:
          pageOptions.includeRawHtml ||
-          extractorOptions?.mode === "llm-extraction-from-raw-html"
+          (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
            ? rawHtml
            : undefined,
        linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
@@ -407,11 +408,11 @@ export async function scrapSingleUrl(
    } else {
      document = {
        content: text,
-        markdown: pageOptions.includeMarkdown ? text : undefined,
+        markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
        html: pageOptions.includeHtml ? html : undefined,
        rawHtml:
          pageOptions.includeRawHtml ||
-          extractorOptions?.mode === "llm-extraction-from-raw-html"
+          (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
            ? rawHtml
            : undefined,
        metadata: {
@@ -434,7 +435,7 @@ export async function scrapSingleUrl(
    });
    return {
      content: "",
-      markdown: pageOptions.includeMarkdown ? "" : undefined,
+      markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
      html: "",
      linksOnPage: pageOptions.includeLinks ? [] : undefined,
      metadata: {
diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts
index 19c17b48..22dc72df 100644
--- a/apps/api/src/services/billing/credit_billing.ts
+++ b/apps/api/src/services/billing/credit_billing.ts
@@ -199,21 +199,44 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
    );
  }

+
+
  // Free credits, no coupons
-  if (subscriptionError || !subscription) {
+  if (!subscription || subscriptionError) {
+    // If there is no active subscription but there are available coupons
    if (couponCredits >= credits) {
      return {
        success: true,
        message: "Sufficient credits available",
        remainingCredits: couponCredits,
      };
    }

-    const { data: creditUsages, error: creditUsageError } =
-      await supabase_service
+    let creditUsages;
+    let creditUsageError;
+    let retries = 0;
+    const maxRetries = 3;
+    const retryInterval = 2000; // 2 seconds
+
+    while (retries < maxRetries) {
+      const result = await supabase_service
        .from("credit_usage")
        .select("credits_used")
        .is("subscription_id", null)
        .eq("team_id", team_id);

+      creditUsages = result.data;
+      creditUsageError = result.error;
+
+      if (!creditUsageError) {
+        break;
+      }
+
+      retries++;
+      if (retries < maxRetries) {
+        await new Promise(resolve => setTimeout(resolve, retryInterval));
+      }
+    }
+
    if (creditUsageError) {
+      Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
      throw new Error(
        `Failed to retrieve credit usage for team_id: ${team_id}`
      );
diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts
index 2b476f52..941b571d 100644
--- a/apps/api/src/services/queue-jobs.ts
+++ b/apps/api/src/services/queue-jobs.ts
@@ -62,6 +62,7 @@ export function waitForJob(jobId: string, timeout: number) {
          clearInterval(int);
          resolve((await getScrapeQueue().getJob(jobId)).returnvalue);
        } else if (state === "failed") {
+          // console.log("failed", (await getScrapeQueue().getJob(jobId)).failedReason);
          clearInterval(int);
          reject((await getScrapeQueue().getJob(jobId)).failedReason);
        }
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 74ba5446..e0bb0df0 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -192,17 +192,16 @@ async function processJob(job: Job, token: string) {
      job,
      token,
    });
+
+    // Better if we throw here so we capture with the correct error
+    if(!success) {
+      throw new Error(message);
+    }
    const end = Date.now();
    const timeTakenInSeconds = (end - start) / 1000;

    const rawHtml = docs[0] ? docs[0].rawHtml : "";

-    if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
-      if (docs[0] && docs[0].rawHtml) {
-        delete docs[0].rawHtml;
-      }
-    }
-
    const data = {
      success,
      result: {
diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts
index 1798b23a..dade8493 100644
--- a/apps/api/src/services/rate-limiter.ts
+++ b/apps/api/src/services/rate-limiter.ts
@@ -104,6 +104,14 @@ export const devBRateLimiter = new RateLimiterRedis({
  duration: 60, // Duration in seconds
 });

+
+export const scrapeStatusRateLimiter = new RateLimiterRedis({
+  storeClient: redisRateLimitClient,
+  keyPrefix: "scrape-status",
+  points: 400,
+  duration: 60, // Duration in seconds
+});
+
 export function getRateLimiter(
  mode: RateLimiterMode,
  token: string,
diff --git a/apps/js-sdk/firecrawl/build/cjs/index.js b/apps/js-sdk/firecrawl/build/cjs/index.js
index 7b0730f5..2908b09d 100644
--- a/apps/js-sdk/firecrawl/build/cjs/index.js
+++ b/apps/js-sdk/firecrawl/build/cjs/index.js
@@ -5,7 +5,6 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.CrawlWatcher = void 0;
 const axios_1 = __importDefault(require("axios"));
-const zod_1 = require("zod");
 const zod_to_json_schema_1 = require("zod-to-json-schema");
 const isows_1 = require("isows");
 const typescript_event_target_1 = require("typescript-event-target");
@@ -34,18 +33,19 @@ class FirecrawlApp {
            Authorization: `Bearer ${this.apiKey}`,
        };
        let jsonData = { url, ...params };
-        if (jsonData?.extractorOptions?.extractionSchema) {
-            let schema = jsonData.extractorOptions.extractionSchema;
-            // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
-            if (schema instanceof zod_1.z.ZodSchema) {
+        if (jsonData?.extract?.schema) {
+            let schema = jsonData.extract.schema;
+            // Try parsing the schema as a Zod schema
+            try {
                schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema);
            }
+            catch (error) {
+            }
            jsonData = {
                ...jsonData,
-                extractorOptions: {
-                    ...jsonData.extractorOptions,
-                    extractionSchema: schema,
-                    mode: jsonData.extractorOptions.mode || "llm-extraction",
+                extract: {
+                    ...jsonData.extract,
+                    schema: schema,
                },
            };
        }
diff --git a/apps/js-sdk/firecrawl/build/esm/index.js b/apps/js-sdk/firecrawl/build/esm/index.js
index cccd1770..4245cc37 100644
--- a/apps/js-sdk/firecrawl/build/esm/index.js
+++ b/apps/js-sdk/firecrawl/build/esm/index.js
@@ -1,5 +1,4 @@
 import axios from "axios";
-import { z } from "zod";
 import { zodToJsonSchema } from "zod-to-json-schema";
 import { WebSocket } from "isows";
 import { TypedEventTarget } from "typescript-event-target";
@@ -28,18 +27,19 @@ export default class FirecrawlApp {
            Authorization: `Bearer ${this.apiKey}`,
        };
        let jsonData = { url, ...params };
-        if (jsonData?.extractorOptions?.extractionSchema) {
-            let schema = jsonData.extractorOptions.extractionSchema;
-            // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
-            if (schema instanceof z.ZodSchema) {
+        if (jsonData?.extract?.schema) {
+            let schema = jsonData.extract.schema;
+            // Try parsing the schema as a Zod schema
+            try {
                schema = zodToJsonSchema(schema);
            }
+            catch (error) {
+            }
            jsonData = {
                ...jsonData,
-                extractorOptions: {
-                    ...jsonData.extractorOptions,
-                    extractionSchema: schema,
-                    mode: jsonData.extractorOptions.mode || "llm-extraction",
+                extract: {
+                    ...jsonData.extract,
+                    schema: schema,
                },
            };
        }
diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json
index a9e36a24..002e10d2 100644
--- a/apps/js-sdk/firecrawl/package.json
+++ b/apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
 {
  "name": "@mendable/firecrawl-js",
-  "version": "1.1.0",
+  "version": "1.2.0",
  "description": "JavaScript SDK for Firecrawl API",
  "main": "build/cjs/index.js",
  "types": "types/index.d.ts",
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 586e9240..ee55343c 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -64,6 +64,7 @@ export interface FirecrawlDocument {
  html?: string;
  rawHtml?: string;
  links?: string[];
+  extract?: Record<string, any>;
  screenshot?: string;
  metadata?: FirecrawlDocumentMetadata;
 }
@@ -73,12 +74,17 @@ export interface FirecrawlDocument {
 * Defines the options and configurations available for scraping web content.
 */
 export interface ScrapeParams {
-  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "full@scrennshot")[];
+  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[];
  headers?: Record<string, string>;
  includeTags?: string[];
  excludeTags?: string[];
  onlyMainContent?: boolean;
-  waitFor?: number;
+  extract?: {
+    prompt?: string;
+    schema?: z.ZodSchema | any;
+    systemPrompt?: string;
+  };
+  waitFor?: number;
  timeout?: number;
 }
@@ -196,18 +202,20 @@ export default class FirecrawlApp {
      Authorization: `Bearer ${this.apiKey}`,
    } as AxiosRequestHeaders;
    let jsonData: any = { url, ...params };
-    if (jsonData?.extractorOptions?.extractionSchema) {
-      let schema = jsonData.extractorOptions.extractionSchema;
-      // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
-      if (schema instanceof z.ZodSchema) {
+    if (jsonData?.extract?.schema) {
+      let schema = jsonData.extract.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
      }
      jsonData = {
        ...jsonData,
-        extractorOptions: {
-          ...jsonData.extractorOptions,
-          extractionSchema: schema,
-          mode: jsonData.extractorOptions.mode || "llm-extraction",
+        extract: {
+          ...jsonData.extract,
+          schema: schema,
        },
      };
    }
diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts
index 6b5166b3..8b620f85 100644
--- a/apps/js-sdk/firecrawl/types/index.d.ts
+++ b/apps/js-sdk/firecrawl/types/index.d.ts
@@ -1,4 +1,5 @@
 import { AxiosResponse, AxiosRequestHeaders } from "axios";
+import { z } from "zod";
 import { TypedEventTarget } from "typescript-event-target";
 /**
 * Configuration interface for FirecrawlApp.
@@ -58,6 +59,7 @@ export interface FirecrawlDocument {
  html?: string;
  rawHtml?: string;
  links?: string[];
+  extract?: Record<string, any>;
  screenshot?: string;
  metadata?: FirecrawlDocumentMetadata;
 }
@@ -66,11 +68,16 @@ export interface FirecrawlDocument {
 * Defines the options and configurations available for scraping web content.
 */
 export interface ScrapeParams {
-  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "full@scrennshot")[];
+  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[];
  headers?: Record<string, string>;
  includeTags?: string[];
  excludeTags?: string[];
  onlyMainContent?: boolean;
+  extract?: {
+    prompt?: string;
+    schema?: z.ZodSchema | any;
+    systemPrompt?: string;
+  };
  waitFor?: number;
  timeout?: number;
 }
diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py
index efb13939..02c06288 100644
--- a/apps/python-sdk/example.py
+++ b/apps/python-sdk/example.py
@@ -3,7 +3,7 @@ import nest_asyncio
 import uuid
 from firecrawl.firecrawl import FirecrawlApp

-app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
+app = FirecrawlApp(api_key="fc-")

 # Scrape a website:
 scrape_result = app.scrape_url('firecrawl.dev')
@@ -33,63 +33,63 @@ print(crawl_status)

 # LLM Extraction:
 # Define schema to extract contents into using pydantic
-# from pydantic import BaseModel, Field
-# from typing import List
+from pydantic import BaseModel, Field
+from typing import List

-# class ArticleSchema(BaseModel):
-#     title: str
-#     points: int
-#     by: str
-#     commentsURL: str
+class ArticleSchema(BaseModel):
+    title: str
+    points: int
+    by: str
+    commentsURL: str

-# class TopArticlesSchema(BaseModel):
-#     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
+class TopArticlesSchema(BaseModel):
+    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

-# llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-#     'extractorOptions': {
-#         'extractionSchema': TopArticlesSchema.model_json_schema(),
-#         'mode': 'llm-extraction'
-#     },
-#     'pageOptions':{
-#         'onlyMainContent': True
-#     }
-# })
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+    'formats': ['extract'],
+    'extract': {
+        'schema': TopArticlesSchema.model_json_schema()
+    }
+})

-# print(llm_extraction_result['llm_extraction'])
+print(llm_extraction_result['extract'])

 # # Define schema to extract contents into using json schema
-# json_schema = {
-#   "type": "object",
-#   "properties": {
-#     "top": {
-#       "type": "array",
-#       "items": {
-#         "type": "object",
-#         "properties": {
-#           "title": {"type": "string"},
-#           "points": {"type": "number"},
-#           "by": {"type": "string"},
-#           "commentsURL": {"type": "string"}
-#         },
-#         "required": ["title", "points", "by", "commentsURL"]
-#       },
-#       "minItems": 5,
-#       "maxItems": 5,
-#       "description": "Top 5 stories on Hacker News"
-#     }
-#   },
-#   "required": ["top"]
-# }
+json_schema = {
+  "type": "object",
+  "properties": {
+    "top": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "title": {"type": "string"},
+          "points": {"type": "number"},
+          "by": {"type": "string"},
+          "commentsURL": {"type": "string"}
+        },
+        "required": ["title", "points", "by", "commentsURL"]
+      },
+      "minItems": 5,
+      "maxItems": 5,
+      "description": "Top 5 stories on Hacker News"
+    }
+  },
+  "required": ["top"]
+}

-# llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-#     'extractorOptions': {
-#         'extractionSchema': json_schema,
-#         'mode': 'llm-extraction'
-#     },
-#     'pageOptions':{
-#         'onlyMainContent': True
-#     }
-# })
+app2 = FirecrawlApp(api_key="fc-", version="v0")
+
+
+llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': json_schema,
+        'mode': 'llm-extraction'
+    },
+    'pageOptions':{
+        'onlyMainContent': True
+    }
+})

 # print(llm_extraction_result['llm_extraction'])

@@ -124,6 +124,3 @@ async def start_crawl_and_watch():

     # Start the watcher
     await watcher.connect()
-
-# Run the event loop
-await start_crawl_and_watch()
\ No newline at end of file
diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index 13df20d9..4b3807be 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp

-__version__ = "1.1.1"
+__version__ = "1.2.1"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index b7a0bff6..75245e8d 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -59,20 +59,16 @@ class FirecrawlApp:

         # If there are additional params, process them
         if params:
-            # Initialize extractorOptions if present
-            extractor_options = params.get('extractorOptions', {})
-            # Check and convert the extractionSchema if it's a Pydantic model
-            if 'extractionSchema' in extractor_options:
-                if hasattr(extractor_options['extractionSchema'], 'schema'):
-                    extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
-                # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
-                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
-                # Update the scrape_params with the processed extractorOptions
-                scrape_params['extractorOptions'] = extractor_options
+            # Handle extract (for v1)
+            extract = params.get('extract', {})
+            if extract:
+                if 'schema' in extract and hasattr(extract['schema'], 'schema'):
+                    extract['schema'] = extract['schema'].schema()
+                scrape_params['extract'] = extract

             # Include any other params directly at the top level of scrape_params
             for key, value in params.items():
-                if key != 'extractorOptions':
+                if key not in ['extract']:
                     scrape_params[key] = value

         endpoint = f'/v1/scrape'
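
Taken together, the hunks above migrate LLM extraction from the v0 `extractorOptions` shape to the v1 `extract` format. As a usage reference, here is a minimal sketch of calling the updated JS SDK (`@mendable/firecrawl-js@1.2.0`). The `scrapeUrl` method and the loose response typing are assumptions based on the SDK hunks above (document fields such as `extract` merged into the response); the URL targets and API key are placeholders. The SDK converts a Zod schema to JSON Schema itself via `zodToJsonSchema`, as the `src/index.ts` hunk shows, so a raw Zod object can be passed directly.

```typescript
import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";

// Placeholder key; real keys are issued by Firecrawl and start with "fc-".
const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

async function main() {
  // Schema-based extraction: the Zod schema is passed as-is; the SDK
  // converts it to JSON Schema with zodToJsonSchema before sending.
  // Typed loosely here since the exact response type varies by SDK version.
  const withSchema: any = await app.scrapeUrl("https://www.mendable.ai/", {
    formats: ["extract"],
    extract: {
      schema: z.object({
        company_mission: z.string(),
        supports_sso: z.boolean(),
      }),
    },
  });
  console.log(withSchema.extract);

  // Prompt-only extraction (new): no schema, so the LLM chooses the
  // structure of the returned JSON.
  const withPrompt: any = await app.scrapeUrl("https://docs.firecrawl.dev/", {
    formats: ["extract"],
    extract: { prompt: "Extract the company mission from the page." },
  });
  console.log(withPrompt.extract);
}

main().catch(console.error);
```

Note that, per the `v1/scrape.ts` change, requests whose `formats` include `"extract"` are billed at 50 credits, and the `.refine` added in `types.ts` rejects requests that specify the `extract` format without `extract` options (or vice versa), while its `.transform` raises the timeout to 60s for extract jobs when none is set.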