Merge branch 'main' into v1/webhooks

This commit is contained in:
Nicolas 2024-09-01 12:56:11 -03:00
commit faae98ecb8
29 changed files with 521 additions and 256 deletions

View File

@ -229,20 +229,19 @@ Response will be an ordered list from the most relevant to the least relevant.
} }
``` ```
### LLM Extraction (v0) (Beta) ### LLM Extraction (Beta)
Used to extract structured data from scraped pages. Used to extract structured data from scraped pages.
```bash ```bash
curl -X POST https://api.firecrawl.dev/v0/scrape \ curl -X POST https://api.firecrawl.dev/v1/scrape \
-H 'Content-Type: application/json' \ -H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \ -H 'Authorization: Bearer YOUR_API_KEY' \
-d '{ -d '{
"url": "https://www.mendable.ai/", "url": "https://www.mendable.ai/",
"extractorOptions": { "formats": ["extract"],
"mode": "llm-extraction", "extract": {
"extractionPrompt": "Based on the information on the page, extract the information from the schema. ", "schema": {
"extractionSchema": {
"type": "object", "type": "object",
"properties": { "properties": {
"company_mission": { "company_mission": {
@ -296,6 +295,23 @@ curl -X POST https://api.firecrawl.dev/v0/scrape \
} }
``` ```
### Extracting without a schema (New)
You can now extract without a schema by simply passing a `prompt` to the endpoint. The LLM chooses the structure of the data.
```bash
curl -X POST https://api.firecrawl.dev/v1/scrape \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"url": "https://docs.firecrawl.dev/",
"formats": ["extract"],
"extract": {
"prompt": "Extract the company mission from the page."
}
}'
```
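
For Node users, here is a minimal TypeScript sketch of the same prompt-only extraction using the built-in `fetch` (Node 18+). It assumes the v1 response wraps the scraped document under `data`, with the extraction on `data.extract`, as the controller and type changes later in this commit suggest.

```ts
// Sketch: prompt-only extraction against the v1 scrape endpoint (Node 18+ global fetch).
// Replace YOUR_API_KEY with a real key; the response shape mirrors the v1 controller
// below, which returns { success, data } with the scraped document in `data`.
const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer YOUR_API_KEY",
  },
  body: JSON.stringify({
    url: "https://docs.firecrawl.dev/",
    formats: ["extract"],
    extract: { prompt: "Extract the company mission from the page." },
  }),
});

const body = await res.json();
if (body.success) {
  // The LLM-chosen structure lands on the document's `extract` field.
  console.log(body.data.extract);
}
```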
### Search (v0) (Beta) ### Search (v0) (Beta)

View File

@ -94,7 +94,7 @@
"moment": "^2.29.4", "moment": "^2.29.4",
"mongoose": "^8.4.4", "mongoose": "^8.4.4",
"natural": "^7.0.7", "natural": "^7.0.7",
"openai": "^4.52.2", "openai": "^4.57.0",
"pdf-parse": "^1.1.1", "pdf-parse": "^1.1.1",
"pos": "^0.4.2", "pos": "^0.4.2",
"posthog-node": "^4.0.1", "posthog-node": "^4.0.1",

View File

@ -124,7 +124,7 @@ importers:
version: 0.0.28 version: 0.0.28
langchain: langchain:
specifier: ^0.2.8 specifier: ^0.2.8
version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
languagedetect: languagedetect:
specifier: ^2.0.0 specifier: ^2.0.0
version: 2.0.0 version: 2.0.0
@ -147,8 +147,8 @@ importers:
specifier: ^7.0.7 specifier: ^7.0.7
version: 7.0.7(socks@2.8.3) version: 7.0.7(socks@2.8.3)
openai: openai:
specifier: ^4.52.2 specifier: ^4.57.0
version: 4.52.2 version: 4.57.0(zod@3.23.8)
pdf-parse: pdf-parse:
specifier: ^1.1.1 specifier: ^1.1.1
version: 1.1.1 version: 1.1.1
@ -3733,9 +3733,14 @@ packages:
openai@3.3.0: openai@3.3.0:
resolution: {integrity: sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ==} resolution: {integrity: sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ==}
openai@4.52.2: openai@4.57.0:
resolution: {integrity: sha512-mMc0XgFuVSkcm0lRIi8zaw++otC82ZlfkCur1qguXYWPETr/+ZwL9A/vvp3YahX+shpaT6j03dwsmUyLAfmEfg==} resolution: {integrity: sha512-JnwBSIYqiZ3jYjB5f2in8hQ0PRA092c6m+/6dYB0MzK0BEbn+0dioxZsPLBm5idJbg9xzLNOiGVm2OSuhZ+BdQ==}
hasBin: true hasBin: true
peerDependencies:
zod: ^3.23.8
peerDependenciesMeta:
zod:
optional: true
openapi-types@12.1.3: openapi-types@12.1.3:
resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==} resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==}
@ -5319,13 +5324,13 @@ snapshots:
'@js-sdsl/ordered-map@4.4.2': {} '@js-sdsl/ordered-map@4.4.2': {}
'@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)': '@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))':
dependencies: dependencies:
ansi-styles: 5.2.0 ansi-styles: 5.2.0
camelcase: 6.3.0 camelcase: 6.3.0
decamelize: 1.2.0 decamelize: 1.2.0
js-tiktoken: 1.0.12 js-tiktoken: 1.0.12
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
ml-distance: 4.0.1 ml-distance: 4.0.1
mustache: 4.2.0 mustache: 4.2.0
p-queue: 6.6.2 p-queue: 6.6.2
@ -5337,20 +5342,20 @@ snapshots:
- langchain - langchain
- openai - openai
'@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))': '@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
dependencies: dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
js-tiktoken: 1.0.12 js-tiktoken: 1.0.12
openai: 4.52.2 openai: 4.57.0(zod@3.23.8)
zod: 3.23.8 zod: 3.23.8
zod-to-json-schema: 3.23.1(zod@3.23.8) zod-to-json-schema: 3.23.1(zod@3.23.8)
transitivePeerDependencies: transitivePeerDependencies:
- encoding - encoding
- langchain - langchain
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)': '@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))':
dependencies: dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
js-tiktoken: 1.0.12 js-tiktoken: 1.0.12
transitivePeerDependencies: transitivePeerDependencies:
- langchain - langchain
@ -8487,17 +8492,17 @@ snapshots:
kleur@3.0.3: {} kleur@3.0.3: {}
langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
dependencies: dependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
'@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)) '@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) '@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
binary-extensions: 2.3.0 binary-extensions: 2.3.0
js-tiktoken: 1.0.12 js-tiktoken: 1.0.12
js-yaml: 4.1.0 js-yaml: 4.1.0
jsonpointer: 5.0.1 jsonpointer: 5.0.1
langchainhub: 0.0.11 langchainhub: 0.0.11
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
ml-distance: 4.0.1 ml-distance: 4.0.1
openapi-types: 12.1.3 openapi-types: 12.1.3
p-retry: 4.6.2 p-retry: 4.6.2
@ -8524,7 +8529,7 @@ snapshots:
langchainhub@0.0.11: {} langchainhub@0.0.11: {}
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2): langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)):
dependencies: dependencies:
'@types/uuid': 9.0.8 '@types/uuid': 9.0.8
commander: 10.0.1 commander: 10.0.1
@ -8533,9 +8538,9 @@ snapshots:
p-retry: 4.6.2 p-retry: 4.6.2
uuid: 9.0.1 uuid: 9.0.1
optionalDependencies: optionalDependencies:
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2) '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0) langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
openai: 4.52.2 openai: 4.57.0(zod@3.23.8)
languagedetect@2.0.0: {} languagedetect@2.0.0: {}
@ -8928,16 +8933,19 @@ snapshots:
transitivePeerDependencies: transitivePeerDependencies:
- debug - debug
openai@4.52.2: openai@4.57.0(zod@3.23.8):
dependencies: dependencies:
'@types/node': 18.19.39 '@types/node': 18.19.39
'@types/node-fetch': 2.6.11 '@types/node-fetch': 2.6.11
'@types/qs': 6.9.15
abort-controller: 3.0.0 abort-controller: 3.0.0
agentkeepalive: 4.5.0 agentkeepalive: 4.5.0
form-data-encoder: 1.7.2 form-data-encoder: 1.7.2
formdata-node: 4.4.1 formdata-node: 4.4.1
node-fetch: 2.7.0 node-fetch: 2.7.0
web-streams-polyfill: 3.3.3 qs: 6.12.2
optionalDependencies:
zod: 3.23.8
transitivePeerDependencies: transitivePeerDependencies:
- encoding - encoding

View File

@ -1,6 +1,6 @@
### Crawl Website ### Crawl Website
POST http://localhost:3002/v0/scrape HTTP/1.1 POST http://localhost:3002/v0/scrape HTTP/1.1
Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673 Authorization: Bearer fc-
content-type: application/json content-type: application/json
{ {
@ -9,7 +9,7 @@ content-type: application/json
### Check Job Status ### Check Job Status
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1 GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673 Authorization: Bearer fc-
### Check Job Status ### Check Job Status

View File

@ -255,12 +255,25 @@ export async function scrapeController(req: Request, res: Response) {
} }
} }
let doc = result.data;
if (!pageOptions || !pageOptions.includeRawHtml) {
if (doc && doc.rawHtml) {
delete doc.rawHtml;
}
}
if(pageOptions && pageOptions.includeExtract) {
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
delete doc.markdown;
}
}
logJob({ logJob({
job_id: jobId, job_id: jobId,
success: result.success, success: result.success,
message: result.error, message: result.error,
num_docs: 1, num_docs: 1,
docs: [result.data], docs: [doc],
time_taken: timeTakenInSeconds, time_taken: timeTakenInSeconds,
team_id: team_id, team_id: team_id,
mode: "scrape", mode: "scrape",

View File

@ -0,0 +1,38 @@
import { Response } from "express";
import { supabaseGetJobByIdOnlyData } from "../../lib/supabase-jobs";
import { scrapeStatusRateLimiter } from "../../services/rate-limiter";
export async function scrapeStatusController(req: any, res: any) {
try {
const rateLimiter = scrapeStatusRateLimiter;
const incomingIP = (req.headers["x-forwarded-for"] ||
req.socket.remoteAddress) as string;
const iptoken = incomingIP;
await rateLimiter.consume(iptoken);
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
if(job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
return res.status(403).json({
success: false,
error: "You are not allowed to access this resource.",
});
}
return res.status(200).json({
success: true,
data: job?.docs[0],
});
} catch (error) {
if (error instanceof Error && error.message == "Too Many Requests") {
return res.status(429).json({
success: false,
error: "Rate limit exceeded. Please try again later.",
});
} else {
return res.status(500).json({
success: false,
error: "An unexpected error occurred.",
});
}
}
}

View File

@ -1,36 +1,58 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { Logger } from '../../lib/logger'; import { Logger } from "../../lib/logger";
import { Document, legacyDocumentConverter, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types"; import {
Document,
legacyDocumentConverter,
legacyExtractorOptions,
legacyScrapeOptions,
RequestWithAuth,
ScrapeRequest,
scrapeRequestSchema,
ScrapeResponse,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing"; import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from 'uuid'; import { v4 as uuidv4 } from "uuid";
import { numTokensFromString } from "../../lib/LLM-extraction/helpers"; import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job"; import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority"; import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types"; import { PlanType } from "../../types";
export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) { export async function scrapeController(
req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
res: Response<ScrapeResponse>
) {
req.body = scrapeRequestSchema.parse(req.body); req.body = scrapeRequestSchema.parse(req.body);
let earlyReturn = false; let earlyReturn = false;
const origin = req.body.origin; const origin = req.body.origin;
const timeout = req.body.timeout; const timeout = req.body.timeout;
const pageOptions = legacyScrapeOptions(req.body); const pageOptions = legacyScrapeOptions(req.body);
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
const jobId = uuidv4(); const jobId = uuidv4();
const startTime = new Date().getTime(); const startTime = new Date().getTime();
const jobPriority = await getJobPriority({plan: req.auth.plan as PlanType, team_id: req.auth.team_id, basePriority: 10}) const jobPriority = await getJobPriority({
plan: req.auth.plan as PlanType,
team_id: req.auth.team_id,
basePriority: 10,
});
const job = await addScrapeJob({ const job = await addScrapeJob(
{
url: req.body.url, url: req.body.url,
mode: "single_urls", mode: "single_urls",
crawlerOptions: {}, crawlerOptions: {},
team_id: req.auth.team_id, team_id: req.auth.team_id,
pageOptions, pageOptions,
extractorOptions: {}, extractorOptions,
origin: req.body.origin, origin: req.body.origin,
is_scrape: true, is_scrape: true,
}, {}, jobId, jobPriority); },
{},
jobId,
jobPriority
);
let doc: any | undefined; let doc: any | undefined;
try { try {
@ -45,7 +67,11 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
} else { } else {
return res.status(500).json({ return res.status(500).json({
success: false, success: false,
error: "Internal server error", error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
extractorOptions && extractorOptions.mode !== "markdown"
? " - Could be due to LLM parsing issues"
: ""
}`,
}); });
} }
} }
@ -57,7 +83,7 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
return res.status(200).json({ return res.status(200).json({
success: true, success: true,
warning: "No page found", warning: "No page found",
data: doc data: doc,
}); });
} }
@ -66,25 +92,41 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
const endTime = new Date().getTime(); const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000; const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens = (doc && doc.markdown) ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") : 0; const numTokens =
doc && doc.markdown
? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
: 0;
let creditsToBeBilled = 1; // Assuming 1 credit per document let creditsToBeBilled = 1; // Assuming 1 credit per document
if (earlyReturn) { if (earlyReturn) {
// Don't bill if we're early returning // Don't bill if we're early returning
return; return;
} }
if(req.body.extract && req.body.formats.includes("extract")) {
creditsToBeBilled = 50;
}
const billingResult = await billTeam( const billingResult = await billTeam(req.auth.team_id, creditsToBeBilled);
req.auth.team_id,
creditsToBeBilled
);
if (!billingResult.success) { if (!billingResult.success) {
return res.status(402).json({ return res.status(402).json({
success: false, success: false,
error: "Failed to bill team. Insufficient credits or subscription not found.", error:
"Failed to bill team. Insufficient credits or subscription not found.",
}); });
} }
if (!pageOptions || !pageOptions.includeRawHtml) {
if (doc && doc.rawHtml) {
delete doc.rawHtml;
}
}
if(pageOptions && pageOptions.includeExtract) {
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
delete doc.markdown;
}
}
logJob({ logJob({
job_id: jobId, job_id: jobId,
success: true, success: true,

View File

@ -1,7 +1,7 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { z } from "zod"; import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { PageOptions } from "../../lib/entities"; import { ExtractorOptions, PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { PlanType } from "../../types"; import { PlanType } from "../../types";
@ -11,7 +11,8 @@ export type Format =
| "rawHtml" | "rawHtml"
| "links" | "links"
| "screenshot" | "screenshot"
| "screenshot@fullPage"; | "screenshot@fullPage"
| "extract";
export const url = z.preprocess( export const url = z.preprocess(
(x) => { (x) => {
@ -40,6 +41,15 @@ export const url = z.preprocess(
const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes"; const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";
export const extractOptions = z.object({
mode: z.enum(["llm"]).default("llm"),
schema: z.any().optional(),
systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema. Try to extract all the fields even those that might not be marked as required."),
prompt: z.string().optional()
}).strict(strictMessage);
export type ExtractOptions = z.infer<typeof extractOptions>;
export const scrapeOptions = z.object({ export const scrapeOptions = z.object({
formats: z formats: z
.enum([ .enum([
@ -49,6 +59,7 @@ export const scrapeOptions = z.object({
"links", "links",
"screenshot", "screenshot",
"screenshot@fullPage", "screenshot@fullPage",
"extract"
]) ])
.array() .array()
.optional() .optional()
@ -57,17 +68,33 @@ export const scrapeOptions = z.object({
includeTags: z.string().array().optional(), includeTags: z.string().array().optional(),
excludeTags: z.string().array().optional(), excludeTags: z.string().array().optional(),
onlyMainContent: z.boolean().default(true), onlyMainContent: z.boolean().default(true),
timeout: z.number().int().positive().finite().safe().default(30000), // default? timeout: z.number().int().positive().finite().safe().default(30000),
waitFor: z.number().int().nonnegative().finite().safe().default(0), waitFor: z.number().int().nonnegative().finite().safe().default(0),
extract: extractOptions.optional(),
parsePDF: z.boolean().default(true), parsePDF: z.boolean().default(true),
}).strict(strictMessage); }).strict(strictMessage)
export type ScrapeOptions = z.infer<typeof scrapeOptions>; export type ScrapeOptions = z.infer<typeof scrapeOptions>;
export const scrapeRequestSchema = scrapeOptions.extend({ export const scrapeRequestSchema = scrapeOptions.extend({
url, url,
origin: z.string().optional().default("api"), origin: z.string().optional().default("api"),
}).strict(strictMessage); }).strict(strictMessage).refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
const hasExtractOptions = obj.extract !== undefined;
return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
},
{
message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
}
).transform((obj) => {
if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
return { ...obj, timeout: 60000 };
}
return obj;
});
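To make the new validation concrete, here is a reduced standalone sketch (assuming `zod` is installed) of the refine/transform above: the `extract` format and the `extract` options must be provided together, and an extract request without a `timeout` is bumped to 60 seconds. The timeout is left optional here purely for illustration.

```ts
import { z } from "zod";

// Reduced sketch of the v1 request validation above: the "extract" format and the
// `extract` options object must be provided together, and an extract request with
// no explicit timeout is bumped to 60 seconds.
const sketchSchema = z
  .object({
    formats: z.enum(["markdown", "extract"]).array().default(["markdown"]),
    extract: z
      .object({ prompt: z.string().optional(), schema: z.any().optional() })
      .optional(),
    timeout: z.number().int().positive().optional(),
  })
  .refine(
    (obj) => obj.formats.includes("extract") === (obj.extract !== undefined),
    {
      message:
        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
    }
  )
  .transform((obj) =>
    obj.formats.includes("extract") && !obj.timeout ? { ...obj, timeout: 60000 } : obj
  );

// Passes, and timeout becomes 60000 because the extract format was requested:
console.log(sketchSchema.parse({ formats: ["extract"], extract: { prompt: "Extract the title" } }));

// Throws: the format was requested without matching options.
// sketchSchema.parse({ formats: ["extract"] });
```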
// export type ScrapeRequest = { // export type ScrapeRequest = {
// url: string; // url: string;
@ -118,6 +145,13 @@ export const crawlRequestSchema = crawlerOptions.extend({
// scrapeOptions?: Exclude<ScrapeRequest, "url">; // scrapeOptions?: Exclude<ScrapeRequest, "url">;
// }; // };
// export type ExtractorOptions = {
// mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
// extractionPrompt?: string;
// extractionSchema?: Record<string, any>;
// }
export type CrawlRequest = z.infer<typeof crawlRequestSchema>; export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
export const mapRequestSchema = crawlerOptions.extend({ export const mapRequestSchema = crawlerOptions.extend({
@ -126,7 +160,7 @@ export const mapRequestSchema = crawlerOptions.extend({
includeSubdomains: z.boolean().default(true), includeSubdomains: z.boolean().default(true),
search: z.string().optional(), search: z.string().optional(),
ignoreSitemap: z.boolean().default(false), ignoreSitemap: z.boolean().default(false),
limit: z.number().min(1).max(50).default(5000).optional(), limit: z.number().min(1).max(5000).default(5000).optional(),
}).strict(strictMessage); }).strict(strictMessage);
// export type MapRequest = { // export type MapRequest = {
@ -138,6 +172,7 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;
export type Document = { export type Document = {
markdown?: string; markdown?: string;
extract?: string;
html?: string; html?: string;
rawHtml?: string; rawHtml?: string;
links?: string[]; links?: string[];
@ -280,6 +315,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
includeMarkdown: x.formats.includes("markdown"), includeMarkdown: x.formats.includes("markdown"),
includeHtml: x.formats.includes("html"), includeHtml: x.formats.includes("html"),
includeRawHtml: x.formats.includes("rawHtml"), includeRawHtml: x.formats.includes("rawHtml"),
includeExtract: x.formats.includes("extract"),
onlyIncludeTags: x.includeTags, onlyIncludeTags: x.includeTags,
removeTags: x.excludeTags, removeTags: x.excludeTags,
onlyMainContent: x.onlyMainContent, onlyMainContent: x.onlyMainContent,
@ -291,6 +327,15 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
}; };
} }
export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
return {
mode: x.mode ? "llm-extraction" : "markdown",
extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
extractionSchema: x.schema,
userPrompt: x.prompt ?? "",
};
}
export function legacyDocumentConverter(doc: any): Document { export function legacyDocumentConverter(doc: any): Document {
if (doc === null || doc === undefined) return doc; if (doc === null || doc === undefined) return doc;
@ -311,6 +356,7 @@ export function legacyDocumentConverter(doc: any): Document {
links: doc.linksOnPage, links: doc.linksOnPage,
rawHtml: doc.rawHtml, rawHtml: doc.rawHtml,
html: doc.html, html: doc.html,
extract: doc.llm_extraction,
screenshot: doc.screenshot ?? doc.fullPageScreenshot, screenshot: doc.screenshot ?? doc.fullPageScreenshot,
metadata: { metadata: {
...doc.metadata, ...doc.metadata,

View File

@ -15,7 +15,8 @@ export async function generateCompletions(
// const schema = zodToJsonSchema(options.schema) // const schema = zodToJsonSchema(options.schema)
const schema = extractionOptions.extractionSchema; const schema = extractionOptions.extractionSchema;
const prompt = extractionOptions.extractionPrompt; const systemPrompt = extractionOptions.extractionPrompt;
const prompt = extractionOptions.userPrompt;
const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider
@ -30,18 +31,23 @@ export async function generateCompletions(
document: document, document: document,
schema: schema, schema: schema,
prompt: prompt, prompt: prompt,
systemPrompt: systemPrompt,
mode: mode, mode: mode,
}); });
// Validate the JSON output against the schema using AJV // Validate the JSON output against the schema using AJV
if (schema) {
const validate = ajv.compile(schema); const validate = ajv.compile(schema);
if (!validate(completionResult.llm_extraction)) { if (!validate(completionResult.llm_extraction)) {
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc. //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
throw new Error( throw new Error(
`JSON parsing error(s): ${validate.errors `JSON parsing error(s): ${validate.errors
?.map((err) => err.message) ?.map((err) => err.message)
.join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.` .join(
", "
)}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
); );
} }
}
return completionResult; return completionResult;
} catch (error) { } catch (error) {

View File

@ -16,7 +16,6 @@ function prepareOpenAIDoc(
document: Document, document: Document,
mode: "markdown" | "raw-html" mode: "markdown" | "raw-html"
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null { ): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
let markdown = document.markdown; let markdown = document.markdown;
let extractionTarget = document.markdown; let extractionTarget = document.markdown;
@ -33,34 +32,32 @@ function prepareOpenAIDoc(
// ); // );
} }
// count number of tokens // count number of tokens
const numTokens = numTokensFromString(extractionTarget, "gpt-4"); const numTokens = numTokensFromString(extractionTarget, "gpt-4");
if (numTokens > maxTokens) { if (numTokens > maxTokens) {
// trim the document to the maximum number of tokens, tokens != characters // trim the document to the maximum number of tokens, tokens != characters
extractionTarget = extractionTarget.slice(0, (maxTokens * modifier)); extractionTarget = extractionTarget.slice(0, maxTokens * modifier);
} }
return [[{ type: "text", text: extractionTarget }], numTokens]; return [[{ type: "text", text: extractionTarget }], numTokens];
} }
export async function generateOpenAICompletions({ export async function generateOpenAICompletions({
client, client,
model = process.env.MODEL_NAME || "gpt-4o", model = process.env.MODEL_NAME || "gpt-4o-mini",
document, document,
schema, //TODO - add zod dynamic type checking schema, //TODO - add zod dynamic type checking
prompt = defaultPrompt, systemPrompt = defaultPrompt,
prompt,
temperature, temperature,
mode mode,
}: { }: {
client: OpenAI; client: OpenAI;
model?: string; model?: string;
document: Document; document: Document;
schema: any; // This should be replaced with a proper Zod schema type when available schema: any; // This should be replaced with a proper Zod schema type when available
prompt?: string; prompt?: string;
systemPrompt?: string;
temperature?: number; temperature?: number;
mode: "markdown" | "raw-html"; mode: "markdown" | "raw-html";
}): Promise<Document> { }): Promise<Document> {
@ -70,18 +67,46 @@ export async function generateOpenAICompletions({
if (preparedDoc === null) { if (preparedDoc === null) {
return { return {
...document, ...document,
warning: "LLM extraction was not performed since the document's content is empty or missing.", warning:
"LLM extraction was not performed since the document's content is empty or missing.",
}; };
} }
const [content, numTokens] = preparedDoc; const [content, numTokens] = preparedDoc;
const completion = await openai.chat.completions.create({ let completion;
let llmExtraction;
if (prompt && !schema) {
const jsonCompletion = await openai.chat.completions.create({
model, model,
messages: [ messages: [
{ {
role: "system", role: "system",
content: prompt, content: systemPrompt,
},
{ role: "user", content },
{
role: "user",
content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
},
],
response_format: { type: "json_object" },
temperature,
});
try {
llmExtraction = JSON.parse(
jsonCompletion.choices[0].message.content.trim()
);
} catch (e) {
throw new Error("Invalid JSON");
}
} else {
completion = await openai.chat.completions.create({
model,
messages: [
{
role: "system",
content: systemPrompt,
}, },
{ role: "user", content }, { role: "user", content },
], ],
@ -95,20 +120,26 @@ export async function generateOpenAICompletions({
}, },
}, },
], ],
tool_choice: { "type": "function", "function": {"name": "extract_content"}}, tool_choice: { type: "function", function: { name: "extract_content" } },
temperature, temperature,
}); });
const c = completion.choices[0].message.tool_calls[0].function.arguments; const c = completion.choices[0].message.tool_calls[0].function.arguments;
// Extract the LLM extraction content from the completion response // Extract the LLM extraction content from the completion response
const llmExtraction = JSON.parse(c); try {
llmExtraction = JSON.parse(c);
} catch (e) {
throw new Error("Invalid JSON");
}
}
// Return the document with the LLM extraction content added // Return the document with the LLM extraction content added
return { return {
...document, ...document,
llm_extraction: llmExtraction, llm_extraction: llmExtraction,
warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined, warning:
numTokens > maxTokens
? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
: undefined,
}; };
} }
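The prompt-only branch above relies on OpenAI's JSON mode instead of function calling. A minimal standalone sketch of that pattern with the `openai` v4 SDK (assuming `OPENAI_API_KEY` is set; the page content and user request below are placeholders):

```ts
import OpenAI from "openai";

// Sketch of the schema-less path above: ask the model for a JSON object and parse it.
// Assumes OPENAI_API_KEY is set; the model name mirrors the default used in the diff.
const openai = new OpenAI();

const completion = await openai.chat.completions.create({
  model: "gpt-4o-mini",
  messages: [
    {
      role: "system",
      content:
        "Based on the information on the page, extract all the information from the schema.",
    },
    { role: "user", content: "<page markdown goes here>" },
    {
      role: "user",
      content:
        "Transform the above content into structured json output based on the following user request: Extract the company mission.",
    },
  ],
  response_format: { type: "json_object" },
});

let extraction: unknown;
try {
  extraction = JSON.parse(completion.choices[0].message.content?.trim() ?? "");
} catch {
  throw new Error("Invalid JSON");
}
console.log(extraction);
```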

View File

@ -19,3 +19,4 @@ export class CustomError extends Error {
Object.setPrototypeOf(this, CustomError.prototype); Object.setPrototypeOf(this, CustomError.prototype);
} }
} }

View File

@ -12,6 +12,7 @@ export interface Progress {
export type PageOptions = { export type PageOptions = {
includeMarkdown?: boolean; includeMarkdown?: boolean;
includeExtract?: boolean;
onlyMainContent?: boolean; onlyMainContent?: boolean;
includeHtml?: boolean; includeHtml?: boolean;
includeRawHtml?: boolean; includeRawHtml?: boolean;
@ -35,6 +36,7 @@ export type ExtractorOptions = {
mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html"; mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
extractionPrompt?: string; extractionPrompt?: string;
extractionSchema?: Record<string, any>; extractionSchema?: Record<string, any>;
userPrompt?: string;
} }
export type SearchOptions = { export type SearchOptions = {

View File

@ -37,3 +37,22 @@ export const supabaseGetJobsById = async (jobIds: string[]) => {
return data; return data;
}; };
export const supabaseGetJobByIdOnlyData = async (jobId: string) => {
const { data, error } = await supabase_service
.from("firecrawl_jobs")
.select("docs, team_id")
.eq("job_id", jobId)
.single();
if (error) {
return null;
}
if (!data) {
return null;
}
return data;
};

View File

@ -15,6 +15,7 @@ import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { crawlCancelController } from "../controllers/v1/crawl-cancel"; import { crawlCancelController } from "../controllers/v1/crawl-cancel";
import { Logger } from "../lib/logger"; import { Logger } from "../lib/logger";
import { scrapeStatusController } from "../controllers/v1/scrape-status";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview"; // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status"; // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search"; // import { searchController } from "../../src/controllers/v1/search";
@ -124,6 +125,11 @@ v1Router.get(
wrap(crawlStatusController) wrap(crawlStatusController)
); );
v1Router.get(
"/scrape/:jobId",
wrap(scrapeStatusController)
);
v1Router.ws( v1Router.ws(
"/crawl/:jobId", "/crawl/:jobId",
crawlStatusWSController crawlStatusWSController
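
A hedged sketch of calling the new status route from TypeScript against a local instance (the port and job id are taken from the example requests earlier in this commit and are placeholders):

```ts
// Sketch: fetch the stored result of a previously run scrape job by its job id.
// The base URL follows the local example requests in this commit; the job id is a placeholder.
const jobId = "1dd0f924-a36f-4b96-94ea-32ed954dac67";
const res = await fetch(`http://localhost:3002/v1/scrape/${jobId}`);

if (res.status === 429) {
  // scrapeStatusRateLimiter rejected this IP (400 requests per 60 seconds).
  throw new Error("Rate limit exceeded. Please try again later.");
}

const body = await res.json();
if (body.success) {
  console.log(body.data); // first stored document for the job
}
```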

View File

@ -31,7 +31,6 @@ it('should return a list of links on the firecrawl.ai page', async () => {
// Check if the result contains a list of links // Check if the result contains a list of links
expect(result.linksOnPage).toBeDefined(); expect(result.linksOnPage).toBeDefined();
console.log({result});
expect(Array.isArray(result.linksOnPage)).toBe(true); expect(Array.isArray(result.linksOnPage)).toBe(true);
expect(result.linksOnPage.length).toBeGreaterThan(0); expect(result.linksOnPage.length).toBeGreaterThan(0);
expect(result.linksOnPage).toContain('https://flutterbricks.com/features') expect(result.linksOnPage).toContain('https://flutterbricks.com/features')

View File

@ -305,26 +305,21 @@ export class WebScraperDataProvider {
} }
// documents = await this.applyImgAltText(documents); // documents = await this.applyImgAltText(documents);
if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
const extractionMode = this.extractorOptions?.mode ?? "markdown";
const completionMode = extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
if ( if (
(this.extractorOptions.mode === "llm-extraction" || extractionMode === "llm-extraction" ||
this.extractorOptions.mode === "llm-extraction-from-markdown") && extractionMode === "llm-extraction-from-markdown" ||
this.mode === "single_urls" extractionMode === "llm-extraction-from-raw-html"
) { ) {
documents = await generateCompletions( documents = await generateCompletions(
documents, documents,
this.extractorOptions, this.extractorOptions,
"markdown" completionMode
); );
} }
if (
this.extractorOptions.mode === "llm-extraction-from-raw-html" &&
this.mode === "single_urls"
) {
documents = await generateCompletions(
documents,
this.extractorOptions,
"raw-html"
);
} }
return documents.concat(pdfDocuments).concat(docxDocuments); return documents.concat(pdfDocuments).concat(docxDocuments);
} }
@ -588,6 +583,7 @@ export class WebScraperDataProvider {
removeTags: options.pageOptions?.removeTags ?? [], removeTags: options.pageOptions?.removeTags ?? [],
includeMarkdown: options.pageOptions?.includeMarkdown ?? true, includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
includeRawHtml: options.pageOptions?.includeRawHtml ?? false, includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false,
waitFor: options.pageOptions?.waitFor ?? undefined, waitFor: options.pageOptions?.waitFor ?? undefined,
headers: options.pageOptions?.headers ?? undefined, headers: options.pageOptions?.headers ?? undefined,
includeLinks: options.pageOptions?.includeLinks ?? true, includeLinks: options.pageOptions?.includeLinks ?? true,
@ -617,6 +613,8 @@ export class WebScraperDataProvider {
this.priority = options.priority; this.priority = options.priority;
this.teamId = options.teamId ?? null; this.teamId = options.teamId ?? null;
// make sure all urls start with https:// // make sure all urls start with https://
this.urls = this.urls.map((url) => { this.urls = this.urls.map((url) => {
if (!url.trim().startsWith("http")) { if (!url.trim().startsWith("http")) {

View File

@ -130,6 +130,7 @@ export async function scrapSingleUrl(
): Promise<Document> { ): Promise<Document> {
pageOptions = { pageOptions = {
includeMarkdown: pageOptions.includeMarkdown ?? true, includeMarkdown: pageOptions.includeMarkdown ?? true,
includeExtract: pageOptions.includeExtract ?? false,
onlyMainContent: pageOptions.onlyMainContent ?? false, onlyMainContent: pageOptions.onlyMainContent ?? false,
includeHtml: pageOptions.includeHtml ?? false, includeHtml: pageOptions.includeHtml ?? false,
includeRawHtml: pageOptions.includeRawHtml ?? false, includeRawHtml: pageOptions.includeRawHtml ?? false,
@ -388,11 +389,11 @@ export async function scrapSingleUrl(
if (screenshot && screenshot.length > 0) { if (screenshot && screenshot.length > 0) {
document = { document = {
content: text, content: text,
markdown: pageOptions.includeMarkdown ? text : undefined, markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
rawHtml: rawHtml:
pageOptions.includeRawHtml || pageOptions.includeRawHtml ||
extractorOptions?.mode === "llm-extraction-from-raw-html" (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
? rawHtml ? rawHtml
: undefined, : undefined,
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined, linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
@ -407,11 +408,11 @@ export async function scrapSingleUrl(
} else { } else {
document = { document = {
content: text, content: text,
markdown: pageOptions.includeMarkdown ? text : undefined, markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
html: pageOptions.includeHtml ? html : undefined, html: pageOptions.includeHtml ? html : undefined,
rawHtml: rawHtml:
pageOptions.includeRawHtml || pageOptions.includeRawHtml ||
extractorOptions?.mode === "llm-extraction-from-raw-html" (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
? rawHtml ? rawHtml
: undefined, : undefined,
metadata: { metadata: {
@ -434,7 +435,7 @@ export async function scrapSingleUrl(
}); });
return { return {
content: "", content: "",
markdown: pageOptions.includeMarkdown ? "" : undefined, markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
html: "", html: "",
linksOnPage: pageOptions.includeLinks ? [] : undefined, linksOnPage: pageOptions.includeLinks ? [] : undefined,
metadata: { metadata: {

View File

@ -199,21 +199,44 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
); );
} }
// Free credits, no coupons // Free credits, no coupons
if (subscriptionError || !subscription) { if (!subscription || subscriptionError) {
// If there is no active subscription but there are available coupons // If there is no active subscription but there are available coupons
if (couponCredits >= credits) { if (couponCredits >= credits) {
return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits }; return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
} }
const { data: creditUsages, error: creditUsageError } = let creditUsages;
await supabase_service let creditUsageError;
let retries = 0;
const maxRetries = 3;
const retryInterval = 2000; // 2 seconds
while (retries < maxRetries) {
const result = await supabase_service
.from("credit_usage") .from("credit_usage")
.select("credits_used") .select("credits_used")
.is("subscription_id", null) .is("subscription_id", null)
.eq("team_id", team_id); .eq("team_id", team_id);
creditUsages = result.data;
creditUsageError = result.error;
if (!creditUsageError) {
break;
}
retries++;
if (retries < maxRetries) {
await new Promise(resolve => setTimeout(resolve, retryInterval));
}
}
if (creditUsageError) { if (creditUsageError) {
Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
throw new Error( throw new Error(
`Failed to retrieve credit usage for team_id: ${team_id}` `Failed to retrieve credit usage for team_id: ${team_id}`
); );
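As a generic illustration only (not part of this commit), the same bounded retry-with-delay pattern can be factored into a small helper:

```ts
// Generic sketch of the bounded retry pattern used above: retry an async operation
// up to maxRetries times in total, waiting retryInterval ms between attempts,
// stopping early as soon as a result comes back without an error.
async function withRetries<T>(
  op: () => Promise<{ data: T | null; error: unknown }>,
  maxRetries = 3,
  retryInterval = 2000
): Promise<{ data: T | null; error: unknown }> {
  let result = await op();
  for (let attempt = 1; attempt < maxRetries && result.error; attempt++) {
    await new Promise((resolve) => setTimeout(resolve, retryInterval));
    result = await op();
  }
  return result;
}
```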

View File

@ -62,6 +62,7 @@ export function waitForJob(jobId: string, timeout: number) {
clearInterval(int); clearInterval(int);
resolve((await getScrapeQueue().getJob(jobId)).returnvalue); resolve((await getScrapeQueue().getJob(jobId)).returnvalue);
} else if (state === "failed") { } else if (state === "failed") {
// console.log("failed", (await getScrapeQueue().getJob(jobId)).failedReason);
clearInterval(int); clearInterval(int);
reject((await getScrapeQueue().getJob(jobId)).failedReason); reject((await getScrapeQueue().getJob(jobId)).failedReason);
} }

View File

@ -192,17 +192,16 @@ async function processJob(job: Job, token: string) {
job, job,
token, token,
}); });
// Better if we throw here so we capture with the correct error
if(!success) {
throw new Error(message);
}
const end = Date.now(); const end = Date.now();
const timeTakenInSeconds = (end - start) / 1000; const timeTakenInSeconds = (end - start) / 1000;
const rawHtml = docs[0] ? docs[0].rawHtml : ""; const rawHtml = docs[0] ? docs[0].rawHtml : "";
if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
if (docs[0] && docs[0].rawHtml) {
delete docs[0].rawHtml;
}
}
const data = { const data = {
success, success,
result: { result: {

View File

@ -104,6 +104,14 @@ export const devBRateLimiter = new RateLimiterRedis({
duration: 60, // Duration in seconds duration: 60, // Duration in seconds
}); });
export const scrapeStatusRateLimiter = new RateLimiterRedis({
storeClient: redisRateLimitClient,
keyPrefix: "scrape-status",
points: 400,
duration: 60, // Duration in seconds
});
export function getRateLimiter( export function getRateLimiter(
mode: RateLimiterMode, mode: RateLimiterMode,
token: string, token: string,

View File

@ -5,7 +5,6 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
Object.defineProperty(exports, "__esModule", { value: true }); Object.defineProperty(exports, "__esModule", { value: true });
exports.CrawlWatcher = void 0; exports.CrawlWatcher = void 0;
const axios_1 = __importDefault(require("axios")); const axios_1 = __importDefault(require("axios"));
const zod_1 = require("zod");
const zod_to_json_schema_1 = require("zod-to-json-schema"); const zod_to_json_schema_1 = require("zod-to-json-schema");
const isows_1 = require("isows"); const isows_1 = require("isows");
const typescript_event_target_1 = require("typescript-event-target"); const typescript_event_target_1 = require("typescript-event-target");
@ -34,18 +33,19 @@ class FirecrawlApp {
Authorization: `Bearer ${this.apiKey}`, Authorization: `Bearer ${this.apiKey}`,
}; };
let jsonData = { url, ...params }; let jsonData = { url, ...params };
if (jsonData?.extractorOptions?.extractionSchema) { if (jsonData?.extract?.schema) {
let schema = jsonData.extractorOptions.extractionSchema; let schema = jsonData.extract.schema;
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas // Try parsing the schema as a Zod schema
if (schema instanceof zod_1.z.ZodSchema) { try {
schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema); schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema);
} }
catch (error) {
}
jsonData = { jsonData = {
...jsonData, ...jsonData,
extractorOptions: { extract: {
...jsonData.extractorOptions, ...jsonData.extract,
extractionSchema: schema, schema: schema,
mode: jsonData.extractorOptions.mode || "llm-extraction",
}, },
}; };
} }

View File

@ -1,5 +1,4 @@
import axios from "axios"; import axios from "axios";
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema"; import { zodToJsonSchema } from "zod-to-json-schema";
import { WebSocket } from "isows"; import { WebSocket } from "isows";
import { TypedEventTarget } from "typescript-event-target"; import { TypedEventTarget } from "typescript-event-target";
@ -28,18 +27,19 @@ export default class FirecrawlApp {
Authorization: `Bearer ${this.apiKey}`, Authorization: `Bearer ${this.apiKey}`,
}; };
let jsonData = { url, ...params }; let jsonData = { url, ...params };
if (jsonData?.extractorOptions?.extractionSchema) { if (jsonData?.extract?.schema) {
let schema = jsonData.extractorOptions.extractionSchema; let schema = jsonData.extract.schema;
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas // Try parsing the schema as a Zod schema
if (schema instanceof z.ZodSchema) { try {
schema = zodToJsonSchema(schema); schema = zodToJsonSchema(schema);
} }
catch (error) {
}
jsonData = { jsonData = {
...jsonData, ...jsonData,
extractorOptions: { extract: {
...jsonData.extractorOptions, ...jsonData.extract,
extractionSchema: schema, schema: schema,
mode: jsonData.extractorOptions.mode || "llm-extraction",
}, },
}; };
} }

View File

@ -1,6 +1,6 @@
{ {
"name": "@mendable/firecrawl-js", "name": "@mendable/firecrawl-js",
"version": "1.1.0", "version": "1.2.0",
"description": "JavaScript SDK for Firecrawl API", "description": "JavaScript SDK for Firecrawl API",
"main": "build/cjs/index.js", "main": "build/cjs/index.js",
"types": "types/index.d.ts", "types": "types/index.d.ts",

View File

@ -64,6 +64,7 @@ export interface FirecrawlDocument {
html?: string; html?: string;
rawHtml?: string; rawHtml?: string;
links?: string[]; links?: string[];
extract?: Record<any, any>;
screenshot?: string; screenshot?: string;
metadata?: FirecrawlDocumentMetadata; metadata?: FirecrawlDocumentMetadata;
} }
@ -73,11 +74,16 @@ export interface FirecrawlDocument {
* Defines the options and configurations available for scraping web content. * Defines the options and configurations available for scraping web content.
*/ */
export interface ScrapeParams { export interface ScrapeParams {
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "full@scrennshot")[]; formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[];
headers?: Record<string, string>; headers?: Record<string, string>;
includeTags?: string[]; includeTags?: string[];
excludeTags?: string[]; excludeTags?: string[];
onlyMainContent?: boolean; onlyMainContent?: boolean;
extract?: {
prompt?: string;
schema?: z.ZodSchema | any;
systemPrompt?: string;
};
waitFor?: number; waitFor?: number;
timeout?: number; timeout?: number;
} }
@ -196,18 +202,20 @@ export default class FirecrawlApp {
Authorization: `Bearer ${this.apiKey}`, Authorization: `Bearer ${this.apiKey}`,
} as AxiosRequestHeaders; } as AxiosRequestHeaders;
let jsonData: any = { url, ...params }; let jsonData: any = { url, ...params };
if (jsonData?.extractorOptions?.extractionSchema) { if (jsonData?.extract?.schema) {
let schema = jsonData.extractorOptions.extractionSchema; let schema = jsonData.extract.schema;
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
if (schema instanceof z.ZodSchema) { // Try parsing the schema as a Zod schema
try {
schema = zodToJsonSchema(schema); schema = zodToJsonSchema(schema);
} catch (error) {
} }
jsonData = { jsonData = {
...jsonData, ...jsonData,
extractorOptions: { extract: {
...jsonData.extractorOptions, ...jsonData.extract,
extractionSchema: schema, schema: schema,
mode: jsonData.extractorOptions.mode || "llm-extraction",
}, },
}; };
} }
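A hedged usage sketch of the updated JS SDK surface, assuming `scrapeUrl` is the scraping entry point; the Zod schema is converted to JSON schema by the SDK before the request is sent, as shown above:

```ts
import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";

// Sketch: passing a Zod schema through the new `extract` params. The SDK converts it
// with zodToJsonSchema before posting, per the snippet above. Replace the API key;
// `scrapeUrl` is assumed to be the scraping entry point.
const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

const schema = z.object({
  company_mission: z.string(),
});

const result = await app.scrapeUrl("https://www.mendable.ai/", {
  formats: ["extract"],
  extract: { schema },
});

console.log(result);
```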

View File

@ -1,4 +1,5 @@
import { AxiosResponse, AxiosRequestHeaders } from "axios"; import { AxiosResponse, AxiosRequestHeaders } from "axios";
import { z } from "zod";
import { TypedEventTarget } from "typescript-event-target"; import { TypedEventTarget } from "typescript-event-target";
/** /**
* Configuration interface for FirecrawlApp. * Configuration interface for FirecrawlApp.
@ -58,6 +59,7 @@ export interface FirecrawlDocument {
html?: string; html?: string;
rawHtml?: string; rawHtml?: string;
links?: string[]; links?: string[];
extract?: Record<any, any>;
screenshot?: string; screenshot?: string;
metadata?: FirecrawlDocumentMetadata; metadata?: FirecrawlDocumentMetadata;
} }
@ -66,11 +68,16 @@ export interface FirecrawlDocument {
* Defines the options and configurations available for scraping web content. * Defines the options and configurations available for scraping web content.
*/ */
export interface ScrapeParams { export interface ScrapeParams {
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "full@scrennshot")[]; formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[];
headers?: Record<string, string>; headers?: Record<string, string>;
includeTags?: string[]; includeTags?: string[];
excludeTags?: string[]; excludeTags?: string[];
onlyMainContent?: boolean; onlyMainContent?: boolean;
extract?: {
prompt?: string;
schema?: z.ZodSchema | any;
systemPrompt?: string;
};
waitFor?: number; waitFor?: number;
timeout?: number; timeout?: number;
} }

View File

@ -3,7 +3,7 @@ import nest_asyncio
import uuid import uuid
from firecrawl.firecrawl import FirecrawlApp from firecrawl.firecrawl import FirecrawlApp
app = FirecrawlApp(api_key="fc-YOUR_API_KEY") app = FirecrawlApp(api_key="fc-")
# Scrape a website: # Scrape a website:
scrape_result = app.scrape_url('firecrawl.dev') scrape_result = app.scrape_url('firecrawl.dev')
@ -33,63 +33,63 @@ print(crawl_status)
# LLM Extraction: # LLM Extraction:
# Define schema to extract contents into using pydantic # Define schema to extract contents into using pydantic
# from pydantic import BaseModel, Field from pydantic import BaseModel, Field
# from typing import List from typing import List
# class ArticleSchema(BaseModel): class ArticleSchema(BaseModel):
# title: str title: str
# points: int points: int
# by: str by: str
# commentsURL: str commentsURL: str
# class TopArticlesSchema(BaseModel): class TopArticlesSchema(BaseModel):
# top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories") top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
# llm_extraction_result = app.scrape_url('https://news.ycombinator.com', { llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
# 'extractorOptions': { 'formats': ['extract'],
# 'extractionSchema': TopArticlesSchema.model_json_schema(), 'extract': {
# 'mode': 'llm-extraction' 'schema': TopArticlesSchema.model_json_schema()
# }, }
# 'pageOptions':{ })
# 'onlyMainContent': True
# }
# })
# print(llm_extraction_result['llm_extraction']) print(llm_extraction_result['extract'])
# # Define schema to extract contents into using json schema # # Define schema to extract contents into using json schema
# json_schema = { json_schema = {
# "type": "object", "type": "object",
# "properties": { "properties": {
# "top": { "top": {
# "type": "array", "type": "array",
# "items": { "items": {
# "type": "object", "type": "object",
# "properties": { "properties": {
# "title": {"type": "string"}, "title": {"type": "string"},
# "points": {"type": "number"}, "points": {"type": "number"},
# "by": {"type": "string"}, "by": {"type": "string"},
# "commentsURL": {"type": "string"} "commentsURL": {"type": "string"}
# }, },
# "required": ["title", "points", "by", "commentsURL"] "required": ["title", "points", "by", "commentsURL"]
# }, },
# "minItems": 5, "minItems": 5,
# "maxItems": 5, "maxItems": 5,
# "description": "Top 5 stories on Hacker News" "description": "Top 5 stories on Hacker News"
# } }
# }, },
# "required": ["top"] "required": ["top"]
# } }
# llm_extraction_result = app.scrape_url('https://news.ycombinator.com', { app2 = FirecrawlApp(api_key="fc-", version="v0")
# 'extractorOptions': {
# 'extractionSchema': json_schema,
# 'mode': 'llm-extraction' llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
# }, 'extractorOptions': {
# 'pageOptions':{ 'extractionSchema': json_schema,
# 'onlyMainContent': True 'mode': 'llm-extraction'
# } },
# }) 'pageOptions':{
'onlyMainContent': True
}
})
# print(llm_extraction_result['llm_extraction']) # print(llm_extraction_result['llm_extraction'])
@ -124,6 +124,3 @@ async def start_crawl_and_watch():
# Start the watcher # Start the watcher
await watcher.connect() await watcher.connect()
# Run the event loop
await start_crawl_and_watch()

View File

@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp from .firecrawl import FirecrawlApp
__version__ = "1.1.1" __version__ = "1.2.1"
# Define the logger for the Firecrawl project # Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl") logger: logging.Logger = logging.getLogger("firecrawl")

View File

@ -59,20 +59,16 @@ class FirecrawlApp:
# If there are additional params, process them # If there are additional params, process them
if params: if params:
# Initialize extractorOptions if present # Handle extract (for v1)
extractor_options = params.get('extractorOptions', {}) extract = params.get('extract', {})
# Check and convert the extractionSchema if it's a Pydantic model if extract:
if 'extractionSchema' in extractor_options: if 'schema' in extract and hasattr(extract['schema'], 'schema'):
if hasattr(extractor_options['extractionSchema'], 'schema'): extract['schema'] = extract['schema'].schema()
extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema() scrape_params['extract'] = extract
# Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
# Update the scrape_params with the processed extractorOptions
scrape_params['extractorOptions'] = extractor_options
# Include any other params directly at the top level of scrape_params # Include any other params directly at the top level of scrape_params
for key, value in params.items(): for key, value in params.items():
if key != 'extractorOptions': if key not in ['extract']:
scrape_params[key] = value scrape_params[key] = value
endpoint = f'/v1/scrape' endpoint = f'/v1/scrape'