mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-06 01:16:01 +08:00

Merge branch 'main' into v1/webhooks

commit faae98ecb8

28  README.md
@@ -229,20 +229,19 @@ Response will be an ordered list from the most relevant to the least relevant.
 }
 ```

-### LLM Extraction (v0) (Beta)
+### LLM Extraction (Beta)

 Used to extract structured data from scraped pages.

 ```bash
-curl -X POST https://api.firecrawl.dev/v0/scrape \
+curl -X POST https://api.firecrawl.dev/v1/scrape \
     -H 'Content-Type: application/json' \
     -H 'Authorization: Bearer YOUR_API_KEY' \
     -d '{
       "url": "https://www.mendable.ai/",
-      "extractorOptions": {
-        "mode": "llm-extraction",
-        "extractionPrompt": "Based on the information on the page, extract the information from the schema. ",
-        "extractionSchema": {
+      "formats": ["extract"],
+      "extract": {
+        "schema": {
           "type": "object",
           "properties": {
             "company_mission": {
@@ -296,6 +295,23 @@ curl -X POST https://api.firecrawl.dev/v0/scrape \
 }
 ```

+### Extracting without a schema (New)
+
+You can now extract without a schema by just passing a `prompt` to the endpoint. The llm chooses the structure of the data.
+
+```bash
+curl -X POST https://api.firecrawl.dev/v1/scrape \
+    -H 'Content-Type: application/json' \
+    -H 'Authorization: Bearer YOUR_API_KEY' \
+    -d '{
+      "url": "https://docs.firecrawl.dev/",
+      "formats": ["extract"],
+      "extract": {
+        "prompt": "Extract the company mission from the page."
+      }
+    }'
+```
+

 ### Search (v0) (Beta)

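For TypeScript callers, the same schemaless request can be made with `fetch`. This is a hedged sketch rather than documented client code; the endpoint and body mirror the curl example above, `YOUR_API_KEY` is a placeholder, and reading `data.extract` follows the v1 `Document` changes later in this commit.

```ts
// Illustrative sketch only (not part of the diff): the schemaless extract call above,
// issued from TypeScript.
async function extractCompanyMission(): Promise<unknown> {
  const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer YOUR_API_KEY",
    },
    body: JSON.stringify({
      url: "https://docs.firecrawl.dev/",
      formats: ["extract"],
      extract: { prompt: "Extract the company mission from the page." },
    }),
  });
  const json = await res.json();
  // On success the extracted object is returned under data.extract.
  return json.data?.extract;
}
```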
@@ -94,7 +94,7 @@
     "moment": "^2.29.4",
     "mongoose": "^8.4.4",
     "natural": "^7.0.7",
-    "openai": "^4.52.2",
+    "openai": "^4.57.0",
     "pdf-parse": "^1.1.1",
     "pos": "^0.4.2",
     "posthog-node": "^4.0.1",
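The `openai` bump to `^4.57.0` is also why `zod` appears as an optional peer dependency in the lockfile below: recent `openai` releases ship zod-based structured-output helpers. A purely illustrative sketch of that helper API, which this codebase does not actually use:

```ts
// Illustrative only; not used by firecrawl. Assumes OPENAI_API_KEY is set in the
// environment and that the zod structured-output helpers (openai >= 4.55) are available.
import OpenAI from "openai";
import { zodResponseFormat } from "openai/helpers/zod";
import { z } from "zod";

const CompanyInfo = z.object({ company_mission: z.string() });

async function structuredExample() {
  const openai = new OpenAI();
  const completion = await openai.beta.chat.completions.parse({
    model: "gpt-4o-mini",
    messages: [{ role: "user", content: "Summarize the company mission." }],
    response_format: zodResponseFormat(CompanyInfo, "company_info"),
  });
  // The parsed field holds the schema-validated object.
  return completion.choices[0].message.parsed;
}
```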
54  apps/api/pnpm-lock.yaml (generated)
@ -124,7 +124,7 @@ importers:
|
|||||||
version: 0.0.28
|
version: 0.0.28
|
||||||
langchain:
|
langchain:
|
||||||
specifier: ^0.2.8
|
specifier: ^0.2.8
|
||||||
version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
|
version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
|
||||||
languagedetect:
|
languagedetect:
|
||||||
specifier: ^2.0.0
|
specifier: ^2.0.0
|
||||||
version: 2.0.0
|
version: 2.0.0
|
||||||
@ -147,8 +147,8 @@ importers:
|
|||||||
specifier: ^7.0.7
|
specifier: ^7.0.7
|
||||||
version: 7.0.7(socks@2.8.3)
|
version: 7.0.7(socks@2.8.3)
|
||||||
openai:
|
openai:
|
||||||
specifier: ^4.52.2
|
specifier: ^4.57.0
|
||||||
version: 4.52.2
|
version: 4.57.0(zod@3.23.8)
|
||||||
pdf-parse:
|
pdf-parse:
|
||||||
specifier: ^1.1.1
|
specifier: ^1.1.1
|
||||||
version: 1.1.1
|
version: 1.1.1
|
||||||
@ -3733,9 +3733,14 @@ packages:
|
|||||||
openai@3.3.0:
|
openai@3.3.0:
|
||||||
resolution: {integrity: sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ==}
|
resolution: {integrity: sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ==}
|
||||||
|
|
||||||
openai@4.52.2:
|
openai@4.57.0:
|
||||||
resolution: {integrity: sha512-mMc0XgFuVSkcm0lRIi8zaw++otC82ZlfkCur1qguXYWPETr/+ZwL9A/vvp3YahX+shpaT6j03dwsmUyLAfmEfg==}
|
resolution: {integrity: sha512-JnwBSIYqiZ3jYjB5f2in8hQ0PRA092c6m+/6dYB0MzK0BEbn+0dioxZsPLBm5idJbg9xzLNOiGVm2OSuhZ+BdQ==}
|
||||||
hasBin: true
|
hasBin: true
|
||||||
|
peerDependencies:
|
||||||
|
zod: ^3.23.8
|
||||||
|
peerDependenciesMeta:
|
||||||
|
zod:
|
||||||
|
optional: true
|
||||||
|
|
||||||
openapi-types@12.1.3:
|
openapi-types@12.1.3:
|
||||||
resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==}
|
resolution: {integrity: sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==}
|
||||||
@ -5319,13 +5324,13 @@ snapshots:
|
|||||||
|
|
||||||
'@js-sdsl/ordered-map@4.4.2': {}
|
'@js-sdsl/ordered-map@4.4.2': {}
|
||||||
|
|
||||||
'@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)':
|
'@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))':
|
||||||
dependencies:
|
dependencies:
|
||||||
ansi-styles: 5.2.0
|
ansi-styles: 5.2.0
|
||||||
camelcase: 6.3.0
|
camelcase: 6.3.0
|
||||||
decamelize: 1.2.0
|
decamelize: 1.2.0
|
||||||
js-tiktoken: 1.0.12
|
js-tiktoken: 1.0.12
|
||||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||||
ml-distance: 4.0.1
|
ml-distance: 4.0.1
|
||||||
mustache: 4.2.0
|
mustache: 4.2.0
|
||||||
p-queue: 6.6.2
|
p-queue: 6.6.2
|
||||||
@ -5337,20 +5342,20 @@ snapshots:
|
|||||||
- langchain
|
- langchain
|
||||||
- openai
|
- openai
|
||||||
|
|
||||||
'@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
|
'@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
|
||||||
dependencies:
|
dependencies:
|
||||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||||
js-tiktoken: 1.0.12
|
js-tiktoken: 1.0.12
|
||||||
openai: 4.52.2
|
openai: 4.57.0(zod@3.23.8)
|
||||||
zod: 3.23.8
|
zod: 3.23.8
|
||||||
zod-to-json-schema: 3.23.1(zod@3.23.8)
|
zod-to-json-schema: 3.23.1(zod@3.23.8)
|
||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
- encoding
|
- encoding
|
||||||
- langchain
|
- langchain
|
||||||
|
|
||||||
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)':
|
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))':
|
||||||
dependencies:
|
dependencies:
|
||||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||||
js-tiktoken: 1.0.12
|
js-tiktoken: 1.0.12
|
||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
- langchain
|
- langchain
|
||||||
@ -8487,17 +8492,17 @@ snapshots:
|
|||||||
|
|
||||||
kleur@3.0.3: {}
|
kleur@3.0.3: {}
|
||||||
|
|
||||||
langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
|
langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
|
||||||
dependencies:
|
dependencies:
|
||||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||||
'@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
|
'@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
|
||||||
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||||
binary-extensions: 2.3.0
|
binary-extensions: 2.3.0
|
||||||
js-tiktoken: 1.0.12
|
js-tiktoken: 1.0.12
|
||||||
js-yaml: 4.1.0
|
js-yaml: 4.1.0
|
||||||
jsonpointer: 5.0.1
|
jsonpointer: 5.0.1
|
||||||
langchainhub: 0.0.11
|
langchainhub: 0.0.11
|
||||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||||
ml-distance: 4.0.1
|
ml-distance: 4.0.1
|
||||||
openapi-types: 12.1.3
|
openapi-types: 12.1.3
|
||||||
p-retry: 4.6.2
|
p-retry: 4.6.2
|
||||||
@ -8524,7 +8529,7 @@ snapshots:
|
|||||||
|
|
||||||
langchainhub@0.0.11: {}
|
langchainhub@0.0.11: {}
|
||||||
|
|
||||||
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2):
|
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)):
|
||||||
dependencies:
|
dependencies:
|
||||||
'@types/uuid': 9.0.8
|
'@types/uuid': 9.0.8
|
||||||
commander: 10.0.1
|
commander: 10.0.1
|
||||||
@ -8533,9 +8538,9 @@ snapshots:
|
|||||||
p-retry: 4.6.2
|
p-retry: 4.6.2
|
||||||
uuid: 9.0.1
|
uuid: 9.0.1
|
||||||
optionalDependencies:
|
optionalDependencies:
|
||||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))
|
||||||
langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
|
langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
|
||||||
openai: 4.52.2
|
openai: 4.57.0(zod@3.23.8)
|
||||||
|
|
||||||
languagedetect@2.0.0: {}
|
languagedetect@2.0.0: {}
|
||||||
|
|
||||||
@ -8928,16 +8933,19 @@ snapshots:
|
|||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
- debug
|
- debug
|
||||||
|
|
||||||
openai@4.52.2:
|
openai@4.57.0(zod@3.23.8):
|
||||||
dependencies:
|
dependencies:
|
||||||
'@types/node': 18.19.39
|
'@types/node': 18.19.39
|
||||||
'@types/node-fetch': 2.6.11
|
'@types/node-fetch': 2.6.11
|
||||||
|
'@types/qs': 6.9.15
|
||||||
abort-controller: 3.0.0
|
abort-controller: 3.0.0
|
||||||
agentkeepalive: 4.5.0
|
agentkeepalive: 4.5.0
|
||||||
form-data-encoder: 1.7.2
|
form-data-encoder: 1.7.2
|
||||||
formdata-node: 4.4.1
|
formdata-node: 4.4.1
|
||||||
node-fetch: 2.7.0
|
node-fetch: 2.7.0
|
||||||
web-streams-polyfill: 3.3.3
|
qs: 6.12.2
|
||||||
|
optionalDependencies:
|
||||||
|
zod: 3.23.8
|
||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
- encoding
|
- encoding
|
||||||
|
|
||||||
@@ -1,6 +1,6 @@
 ### Crawl Website
 POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673
+Authorization: Bearer fc-
 content-type: application/json

 {
@@ -9,7 +9,7 @@ content-type: application/json

 ### Check Job Status
 GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
-Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673
+Authorization: Bearer fc-


 ### Check Job Status
@@ -255,12 +255,25 @@ export async function scrapeController(req: Request, res: Response) {
     }
   }

+  let doc = result.data;
+  if (!pageOptions || !pageOptions.includeRawHtml) {
+    if (doc && doc.rawHtml) {
+      delete doc.rawHtml;
+    }
+  }
+
+  if(pageOptions && pageOptions.includeExtract) {
+    if(!pageOptions.includeMarkdown && doc && doc.markdown) {
+      delete doc.markdown;
+    }
+  }
+
   logJob({
     job_id: jobId,
     success: result.success,
     message: result.error,
     num_docs: 1,
-    docs: [result.data],
+    docs: [doc],
     time_taken: timeTakenInSeconds,
     team_id: team_id,
     mode: "scrape",
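The same strip-before-return logic is added again to the v1 controller later in this commit. A hypothetical refactor into a shared helper could look like the sketch below; the helper name and import path are assumptions, not part of the diff.

```ts
// Hypothetical helper (not in this commit): post-processing shared by the v0 and v1
// scrape controllers, expressed once.
import type { PageOptions } from "../../lib/entities";

function stripUnrequestedFields(doc: any, pageOptions?: PageOptions): any {
  if (!doc) return doc;
  // rawHtml is only kept when the caller explicitly asked for it.
  if ((!pageOptions || !pageOptions.includeRawHtml) && doc.rawHtml) {
    delete doc.rawHtml;
  }
  // When only "extract" was requested, markdown is dropped from the response.
  if (pageOptions?.includeExtract && !pageOptions.includeMarkdown && doc.markdown) {
    delete doc.markdown;
  }
  return doc;
}
```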
38  apps/api/src/controllers/v1/scrape-status.ts (new file)
@@ -0,0 +1,38 @@
+import { Response } from "express";
+import { supabaseGetJobByIdOnlyData } from "../../lib/supabase-jobs";
+import { scrapeStatusRateLimiter } from "../../services/rate-limiter";
+
+export async function scrapeStatusController(req: any, res: any) {
+  try {
+    const rateLimiter = scrapeStatusRateLimiter;
+    const incomingIP = (req.headers["x-forwarded-for"] ||
+      req.socket.remoteAddress) as string;
+    const iptoken = incomingIP;
+    await rateLimiter.consume(iptoken);
+
+    const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
+
+    if(job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
+      return res.status(403).json({
+        success: false,
+        error: "You are not allowed to access this resource.",
+      });
+    }
+    return res.status(200).json({
+      success: true,
+      data: job?.docs[0],
+    });
+  } catch (error) {
+    if (error instanceof Error && error.message == "Too Many Requests") {
+      return res.status(429).json({
+        success: false,
+        error: "Rate limit exceeded. Please try again later.",
+      });
+    } else {
+      return res.status(500).json({
+        success: false,
+        error: "An unexpected error occurred.",
+      });
+    }
+  }
+}
@@ -1,36 +1,58 @@
 import { Request, Response } from "express";
-import { Logger } from '../../lib/logger';
+import { Logger } from "../../lib/logger";
-import { Document, legacyDocumentConverter, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
+import {
+  Document,
+  legacyDocumentConverter,
+  legacyExtractorOptions,
+  legacyScrapeOptions,
+  RequestWithAuth,
+  ScrapeRequest,
+  scrapeRequestSchema,
+  ScrapeResponse,
+} from "./types";
 import { billTeam } from "../../services/billing/credit_billing";
-import { v4 as uuidv4 } from 'uuid';
+import { v4 as uuidv4 } from "uuid";
 import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
 import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
 import { logJob } from "../../services/logging/log_job";
 import { getJobPriority } from "../../lib/job-priority";
 import { PlanType } from "../../types";

-export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
+export async function scrapeController(
+  req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
+  res: Response<ScrapeResponse>
+) {
   req.body = scrapeRequestSchema.parse(req.body);
   let earlyReturn = false;

   const origin = req.body.origin;
   const timeout = req.body.timeout;
   const pageOptions = legacyScrapeOptions(req.body);
+  const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
   const jobId = uuidv4();

   const startTime = new Date().getTime();
-  const jobPriority = await getJobPriority({plan: req.auth.plan as PlanType, team_id: req.auth.team_id, basePriority: 10})
+  const jobPriority = await getJobPriority({
+    plan: req.auth.plan as PlanType,
+    team_id: req.auth.team_id,
+    basePriority: 10,
+  });

-  const job = await addScrapeJob({
+  const job = await addScrapeJob(
+    {
       url: req.body.url,
       mode: "single_urls",
       crawlerOptions: {},
       team_id: req.auth.team_id,
       pageOptions,
-      extractorOptions: {},
+      extractorOptions,
       origin: req.body.origin,
       is_scrape: true,
-  }, {}, jobId, jobPriority);
+    },
+    {},
+    jobId,
+    jobPriority
+  );

   let doc: any | undefined;
   try {
@@ -45,7 +67,11 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
     } else {
       return res.status(500).json({
         success: false,
-        error: "Internal server error",
+        error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
+          extractorOptions && extractorOptions.mode !== "markdown"
+            ? " - Could be due to LLM parsing issues"
+            : ""
+        }`,
       });
     }
   }
@@ -57,7 +83,7 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
     return res.status(200).json({
       success: true,
       warning: "No page found",
-      data: doc
+      data: doc,
     });
   }

@@ -66,25 +92,41 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,

   const endTime = new Date().getTime();
   const timeTakenInSeconds = (endTime - startTime) / 1000;
-  const numTokens = (doc && doc.markdown) ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") : 0;
+  const numTokens =
+    doc && doc.markdown
+      ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
+      : 0;

   let creditsToBeBilled = 1; // Assuming 1 credit per document
   if (earlyReturn) {
     // Don't bill if we're early returning
     return;
   }
+  if(req.body.extract && req.body.formats.includes("extract")) {
+    creditsToBeBilled = 50;
+  }
+
-  const billingResult = await billTeam(
-    req.auth.team_id,
-    creditsToBeBilled
-  );
+  const billingResult = await billTeam(req.auth.team_id, creditsToBeBilled);
   if (!billingResult.success) {
     return res.status(402).json({
       success: false,
-      error: "Failed to bill team. Insufficient credits or subscription not found.",
+      error:
+        "Failed to bill team. Insufficient credits or subscription not found.",
     });
   }

+  if (!pageOptions || !pageOptions.includeRawHtml) {
+    if (doc && doc.rawHtml) {
+      delete doc.rawHtml;
+    }
+  }
+
+  if(pageOptions && pageOptions.includeExtract) {
+    if(!pageOptions.includeMarkdown && doc && doc.markdown) {
+      delete doc.markdown;
+    }
+  }
+
   logJob({
     job_id: jobId,
     success: true,
@@ -1,7 +1,7 @@
 import { Request, Response } from "express";
 import { z } from "zod";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
-import { PageOptions } from "../../lib/entities";
+import { ExtractorOptions, PageOptions } from "../../lib/entities";
 import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
 import { PlanType } from "../../types";

@@ -11,7 +11,8 @@ export type Format =
   | "rawHtml"
   | "links"
   | "screenshot"
-  | "screenshot@fullPage";
+  | "screenshot@fullPage"
+  | "extract";

 export const url = z.preprocess(
   (x) => {
@@ -40,6 +41,15 @@ export const url = z.preprocess(

 const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";

+export const extractOptions = z.object({
+  mode: z.enum(["llm"]).default("llm"),
+  schema: z.any().optional(),
+  systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema. Try to extract all the fields even those that might not be marked as required."),
+  prompt: z.string().optional()
+}).strict(strictMessage);
+
+export type ExtractOptions = z.infer<typeof extractOptions>;
+
 export const scrapeOptions = z.object({
   formats: z
     .enum([
@@ -49,6 +59,7 @@ export const scrapeOptions = z.object({
       "links",
       "screenshot",
       "screenshot@fullPage",
+      "extract"
     ])
     .array()
     .optional()
@@ -57,17 +68,33 @@
   includeTags: z.string().array().optional(),
   excludeTags: z.string().array().optional(),
   onlyMainContent: z.boolean().default(true),
-  timeout: z.number().int().positive().finite().safe().default(30000), // default?
+  timeout: z.number().int().positive().finite().safe().default(30000),
   waitFor: z.number().int().nonnegative().finite().safe().default(0),
+  extract: extractOptions.optional(),
   parsePDF: z.boolean().default(true),
-}).strict(strictMessage);
+}).strict(strictMessage)
+

 export type ScrapeOptions = z.infer<typeof scrapeOptions>;

 export const scrapeRequestSchema = scrapeOptions.extend({
   url,
   origin: z.string().optional().default("api"),
-}).strict(strictMessage);
+}).strict(strictMessage).refine(
+  (obj) => {
+    const hasExtractFormat = obj.formats?.includes("extract");
+    const hasExtractOptions = obj.extract !== undefined;
+    return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
+  },
+  {
+    message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
+  }
+).transform((obj) => {
+  if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
+    return { ...obj, timeout: 60000 };
+  }
+  return obj;
+});

 // export type ScrapeRequest = {
 //   url: string;
@@ -118,6 +145,13 @@ export const crawlRequestSchema = crawlerOptions.extend({
 //   scrapeOptions?: Exclude<ScrapeRequest, "url">;
 // };

+// export type ExtractorOptions = {
+//   mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
+//   extractionPrompt?: string;
+//   extractionSchema?: Record<string, any>;
+// }
+
+
 export type CrawlRequest = z.infer<typeof crawlRequestSchema>;

 export const mapRequestSchema = crawlerOptions.extend({
@@ -126,7 +160,7 @@
   includeSubdomains: z.boolean().default(true),
   search: z.string().optional(),
   ignoreSitemap: z.boolean().default(false),
-  limit: z.number().min(1).max(50).default(5000).optional(),
+  limit: z.number().min(1).max(5000).default(5000).optional(),
 }).strict(strictMessage);

 // export type MapRequest = {
@@ -138,6 +172,7 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;

 export type Document = {
   markdown?: string;
+  extract?: string;
   html?: string;
   rawHtml?: string;
   links?: string[];
@@ -280,6 +315,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
     includeMarkdown: x.formats.includes("markdown"),
     includeHtml: x.formats.includes("html"),
     includeRawHtml: x.formats.includes("rawHtml"),
+    includeExtract: x.formats.includes("extract"),
     onlyIncludeTags: x.includeTags,
     removeTags: x.excludeTags,
     onlyMainContent: x.onlyMainContent,
@@ -291,6 +327,15 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
   };
 }

+export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
+  return {
+    mode: x.mode ? "llm-extraction" : "markdown",
+    extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
+    extractionSchema: x.schema,
+    userPrompt: x.prompt ?? "",
+  };
+}
+
 export function legacyDocumentConverter(doc: any): Document {
   if (doc === null || doc === undefined) return doc;

@@ -311,6 +356,7 @@ export function legacyDocumentConverter(doc: any): Document {
     links: doc.linksOnPage,
     rawHtml: doc.rawHtml,
     html: doc.html,
+    extract: doc.llm_extraction,
     screenshot: doc.screenshot ?? doc.fullPageScreenshot,
     metadata: {
       ...doc.metadata,
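A small sketch (not from the diff) of what the `refine` on `scrapeRequestSchema` accepts and rejects when the schema is exercised directly with `parse`:

```ts
import { scrapeRequestSchema } from "./types";

// Passes: "extract" appears in formats AND matching extract options are provided.
scrapeRequestSchema.parse({
  url: "https://docs.firecrawl.dev/",
  formats: ["extract"],
  extract: { prompt: "Extract the company mission from the page." },
});

// Throws a ZodError: "extract" format without extract options (and the reverse) fails
// the refine with "When 'extract' format is specified, 'extract' options must be
// provided, and vice versa".
// scrapeRequestSchema.parse({ url: "https://docs.firecrawl.dev/", formats: ["extract"] });
```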
@@ -15,7 +15,8 @@ export async function generateCompletions(
   // const schema = zodToJsonSchema(options.schema)

   const schema = extractionOptions.extractionSchema;
-  const prompt = extractionOptions.extractionPrompt;
+  const systemPrompt = extractionOptions.extractionPrompt;
+  const prompt = extractionOptions.userPrompt;

   const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider

@@ -30,18 +31,23 @@
       document: document,
       schema: schema,
       prompt: prompt,
+      systemPrompt: systemPrompt,
       mode: mode,
     });
     // Validate the JSON output against the schema using AJV
+    if (schema) {
       const validate = ajv.compile(schema);
       if (!validate(completionResult.llm_extraction)) {
         //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
         throw new Error(
           `JSON parsing error(s): ${validate.errors
             ?.map((err) => err.message)
-            .join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
+            .join(
+              ", "
+            )}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
         );
       }
+    }

     return completionResult;
   } catch (error) {
|
@ -16,7 +16,6 @@ function prepareOpenAIDoc(
|
|||||||
document: Document,
|
document: Document,
|
||||||
mode: "markdown" | "raw-html"
|
mode: "markdown" | "raw-html"
|
||||||
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
|
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
|
||||||
|
|
||||||
let markdown = document.markdown;
|
let markdown = document.markdown;
|
||||||
|
|
||||||
let extractionTarget = document.markdown;
|
let extractionTarget = document.markdown;
|
||||||
@ -33,34 +32,32 @@ function prepareOpenAIDoc(
|
|||||||
// );
|
// );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// count number of tokens
|
// count number of tokens
|
||||||
const numTokens = numTokensFromString(extractionTarget, "gpt-4");
|
const numTokens = numTokensFromString(extractionTarget, "gpt-4");
|
||||||
|
|
||||||
if (numTokens > maxTokens) {
|
if (numTokens > maxTokens) {
|
||||||
// trim the document to the maximum number of tokens, tokens != characters
|
// trim the document to the maximum number of tokens, tokens != characters
|
||||||
extractionTarget = extractionTarget.slice(0, (maxTokens * modifier));
|
extractionTarget = extractionTarget.slice(0, maxTokens * modifier);
|
||||||
}
|
}
|
||||||
|
|
||||||
return [[{ type: "text", text: extractionTarget }], numTokens];
|
return [[{ type: "text", text: extractionTarget }], numTokens];
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function generateOpenAICompletions({
|
export async function generateOpenAICompletions({
|
||||||
client,
|
client,
|
||||||
model = process.env.MODEL_NAME || "gpt-4o",
|
model = process.env.MODEL_NAME || "gpt-4o-mini",
|
||||||
document,
|
document,
|
||||||
schema, //TODO - add zod dynamic type checking
|
schema, //TODO - add zod dynamic type checking
|
||||||
prompt = defaultPrompt,
|
systemPrompt = defaultPrompt,
|
||||||
|
prompt,
|
||||||
temperature,
|
temperature,
|
||||||
mode
|
mode,
|
||||||
}: {
|
}: {
|
||||||
client: OpenAI;
|
client: OpenAI;
|
||||||
model?: string;
|
model?: string;
|
||||||
document: Document;
|
document: Document;
|
||||||
schema: any; // This should be replaced with a proper Zod schema type when available
|
schema: any; // This should be replaced with a proper Zod schema type when available
|
||||||
prompt?: string;
|
prompt?: string;
|
||||||
|
systemPrompt?: string;
|
||||||
temperature?: number;
|
temperature?: number;
|
||||||
mode: "markdown" | "raw-html";
|
mode: "markdown" | "raw-html";
|
||||||
}): Promise<Document> {
|
}): Promise<Document> {
|
||||||
@ -70,18 +67,46 @@ export async function generateOpenAICompletions({
|
|||||||
if (preparedDoc === null) {
|
if (preparedDoc === null) {
|
||||||
return {
|
return {
|
||||||
...document,
|
...document,
|
||||||
warning: "LLM extraction was not performed since the document's content is empty or missing.",
|
warning:
|
||||||
|
"LLM extraction was not performed since the document's content is empty or missing.",
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const [content, numTokens] = preparedDoc;
|
const [content, numTokens] = preparedDoc;
|
||||||
|
|
||||||
const completion = await openai.chat.completions.create({
|
let completion;
|
||||||
|
let llmExtraction;
|
||||||
|
if (prompt && !schema) {
|
||||||
|
const jsonCompletion = await openai.chat.completions.create({
|
||||||
model,
|
model,
|
||||||
messages: [
|
messages: [
|
||||||
{
|
{
|
||||||
role: "system",
|
role: "system",
|
||||||
content: prompt,
|
content: systemPrompt,
|
||||||
|
},
|
||||||
|
{ role: "user", content },
|
||||||
|
{
|
||||||
|
role: "user",
|
||||||
|
content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
response_format: { type: "json_object" },
|
||||||
|
temperature,
|
||||||
|
});
|
||||||
|
|
||||||
|
try {
|
||||||
|
llmExtraction = JSON.parse(
|
||||||
|
jsonCompletion.choices[0].message.content.trim()
|
||||||
|
);
|
||||||
|
} catch (e) {
|
||||||
|
throw new Error("Invalid JSON");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
completion = await openai.chat.completions.create({
|
||||||
|
model,
|
||||||
|
messages: [
|
||||||
|
{
|
||||||
|
role: "system",
|
||||||
|
content: systemPrompt,
|
||||||
},
|
},
|
||||||
{ role: "user", content },
|
{ role: "user", content },
|
||||||
],
|
],
|
||||||
@ -95,20 +120,26 @@ export async function generateOpenAICompletions({
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
tool_choice: { "type": "function", "function": {"name": "extract_content"}},
|
tool_choice: { type: "function", function: { name: "extract_content" } },
|
||||||
temperature,
|
temperature,
|
||||||
});
|
});
|
||||||
|
|
||||||
const c = completion.choices[0].message.tool_calls[0].function.arguments;
|
const c = completion.choices[0].message.tool_calls[0].function.arguments;
|
||||||
|
|
||||||
// Extract the LLM extraction content from the completion response
|
// Extract the LLM extraction content from the completion response
|
||||||
const llmExtraction = JSON.parse(c);
|
try {
|
||||||
|
llmExtraction = JSON.parse(c);
|
||||||
|
} catch (e) {
|
||||||
|
throw new Error("Invalid JSON");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Return the document with the LLM extraction content added
|
// Return the document with the LLM extraction content added
|
||||||
return {
|
return {
|
||||||
...document,
|
...document,
|
||||||
llm_extraction: llmExtraction,
|
llm_extraction: llmExtraction,
|
||||||
warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` : undefined,
|
warning:
|
||||||
|
numTokens > maxTokens
|
||||||
|
? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
|
||||||
|
: undefined,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -19,3 +19,4 @@ export class CustomError extends Error {
|
|||||||
Object.setPrototypeOf(this, CustomError.prototype);
|
Object.setPrototypeOf(this, CustomError.prototype);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -12,6 +12,7 @@ export interface Progress {
|
|||||||
|
|
||||||
export type PageOptions = {
|
export type PageOptions = {
|
||||||
includeMarkdown?: boolean;
|
includeMarkdown?: boolean;
|
||||||
|
includeExtract?: boolean;
|
||||||
onlyMainContent?: boolean;
|
onlyMainContent?: boolean;
|
||||||
includeHtml?: boolean;
|
includeHtml?: boolean;
|
||||||
includeRawHtml?: boolean;
|
includeRawHtml?: boolean;
|
||||||
@ -35,6 +36,7 @@ export type ExtractorOptions = {
|
|||||||
mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
|
mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
|
||||||
extractionPrompt?: string;
|
extractionPrompt?: string;
|
||||||
extractionSchema?: Record<string, any>;
|
extractionSchema?: Record<string, any>;
|
||||||
|
userPrompt?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export type SearchOptions = {
|
export type SearchOptions = {
|
||||||
@@ -37,3 +37,22 @@ export const supabaseGetJobsById = async (jobIds: string[]) => {

   return data;
 };
+
+
+export const supabaseGetJobByIdOnlyData = async (jobId: string) => {
+  const { data, error } = await supabase_service
+    .from("firecrawl_jobs")
+    .select("docs, team_id")
+    .eq("job_id", jobId)
+    .single();
+
+  if (error) {
+    return null;
+  }
+
+  if (!data) {
+    return null;
+  }
+
+  return data;
+};
@@ -15,6 +15,7 @@ import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { crawlCancelController } from "../controllers/v1/crawl-cancel";
 import { Logger } from "../lib/logger";
+import { scrapeStatusController } from "../controllers/v1/scrape-status";
 // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
 // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
 // import { searchController } from "../../src/controllers/v1/search";
@@ -124,6 +125,11 @@ v1Router.get(
   wrap(crawlStatusController)
 );

+v1Router.get(
+  "/scrape/:jobId",
+  wrap(scrapeStatusController)
+);
+
 v1Router.ws(
   "/crawl/:jobId",
   crawlStatusWSController
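An illustrative call to the new route registered above. The job id is a placeholder, any auth middleware applied elsewhere is ignored here, and the response shape follows `scrapeStatusController`: `{ success: true, data: <first stored document> }` for the permitted team, 403 otherwise.

```ts
// Sketch only, not part of the diff.
async function getScrapeJobData(jobId: string) {
  const res = await fetch(`http://localhost:3002/v1/scrape/${jobId}`);
  if (res.status === 429) {
    throw new Error("Rate limit exceeded. Please try again later.");
  }
  return res.json(); // { success: boolean, data?: ..., error?: string }
}
```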
@@ -31,7 +31,6 @@ it('should return a list of links on the firecrawl.ai page', async () => {

   // Check if the result contains a list of links
   expect(result.linksOnPage).toBeDefined();
-  console.log({result});
   expect(Array.isArray(result.linksOnPage)).toBe(true);
   expect(result.linksOnPage.length).toBeGreaterThan(0);
   expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
@ -305,26 +305,21 @@ export class WebScraperDataProvider {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// documents = await this.applyImgAltText(documents);
|
// documents = await this.applyImgAltText(documents);
|
||||||
|
if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
|
||||||
|
const extractionMode = this.extractorOptions?.mode ?? "markdown";
|
||||||
|
const completionMode = extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
|
||||||
|
|
||||||
if (
|
if (
|
||||||
(this.extractorOptions.mode === "llm-extraction" ||
|
extractionMode === "llm-extraction" ||
|
||||||
this.extractorOptions.mode === "llm-extraction-from-markdown") &&
|
extractionMode === "llm-extraction-from-markdown" ||
|
||||||
this.mode === "single_urls"
|
extractionMode === "llm-extraction-from-raw-html"
|
||||||
) {
|
) {
|
||||||
documents = await generateCompletions(
|
documents = await generateCompletions(
|
||||||
documents,
|
documents,
|
||||||
this.extractorOptions,
|
this.extractorOptions,
|
||||||
"markdown"
|
completionMode
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
if (
|
|
||||||
this.extractorOptions.mode === "llm-extraction-from-raw-html" &&
|
|
||||||
this.mode === "single_urls"
|
|
||||||
) {
|
|
||||||
documents = await generateCompletions(
|
|
||||||
documents,
|
|
||||||
this.extractorOptions,
|
|
||||||
"raw-html"
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
return documents.concat(pdfDocuments).concat(docxDocuments);
|
return documents.concat(pdfDocuments).concat(docxDocuments);
|
||||||
}
|
}
|
||||||
@ -588,6 +583,7 @@ export class WebScraperDataProvider {
|
|||||||
removeTags: options.pageOptions?.removeTags ?? [],
|
removeTags: options.pageOptions?.removeTags ?? [],
|
||||||
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
|
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
|
||||||
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
|
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
|
||||||
|
includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false,
|
||||||
waitFor: options.pageOptions?.waitFor ?? undefined,
|
waitFor: options.pageOptions?.waitFor ?? undefined,
|
||||||
headers: options.pageOptions?.headers ?? undefined,
|
headers: options.pageOptions?.headers ?? undefined,
|
||||||
includeLinks: options.pageOptions?.includeLinks ?? true,
|
includeLinks: options.pageOptions?.includeLinks ?? true,
|
||||||
@ -617,6 +613,8 @@ export class WebScraperDataProvider {
|
|||||||
this.priority = options.priority;
|
this.priority = options.priority;
|
||||||
this.teamId = options.teamId ?? null;
|
this.teamId = options.teamId ?? null;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// make sure all urls start with https://
|
// make sure all urls start with https://
|
||||||
this.urls = this.urls.map((url) => {
|
this.urls = this.urls.map((url) => {
|
||||||
if (!url.trim().startsWith("http")) {
|
if (!url.trim().startsWith("http")) {
|
||||||
|
@ -130,6 +130,7 @@ export async function scrapSingleUrl(
|
|||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
pageOptions = {
|
pageOptions = {
|
||||||
includeMarkdown: pageOptions.includeMarkdown ?? true,
|
includeMarkdown: pageOptions.includeMarkdown ?? true,
|
||||||
|
includeExtract: pageOptions.includeExtract ?? false,
|
||||||
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
||||||
includeHtml: pageOptions.includeHtml ?? false,
|
includeHtml: pageOptions.includeHtml ?? false,
|
||||||
includeRawHtml: pageOptions.includeRawHtml ?? false,
|
includeRawHtml: pageOptions.includeRawHtml ?? false,
|
||||||
@ -388,11 +389,11 @@ export async function scrapSingleUrl(
|
|||||||
if (screenshot && screenshot.length > 0) {
|
if (screenshot && screenshot.length > 0) {
|
||||||
document = {
|
document = {
|
||||||
content: text,
|
content: text,
|
||||||
markdown: pageOptions.includeMarkdown ? text : undefined,
|
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
||||||
html: pageOptions.includeHtml ? html : undefined,
|
html: pageOptions.includeHtml ? html : undefined,
|
||||||
rawHtml:
|
rawHtml:
|
||||||
pageOptions.includeRawHtml ||
|
pageOptions.includeRawHtml ||
|
||||||
extractorOptions?.mode === "llm-extraction-from-raw-html"
|
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
||||||
? rawHtml
|
? rawHtml
|
||||||
: undefined,
|
: undefined,
|
||||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||||
@ -407,11 +408,11 @@ export async function scrapSingleUrl(
|
|||||||
} else {
|
} else {
|
||||||
document = {
|
document = {
|
||||||
content: text,
|
content: text,
|
||||||
markdown: pageOptions.includeMarkdown ? text : undefined,
|
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
||||||
html: pageOptions.includeHtml ? html : undefined,
|
html: pageOptions.includeHtml ? html : undefined,
|
||||||
rawHtml:
|
rawHtml:
|
||||||
pageOptions.includeRawHtml ||
|
pageOptions.includeRawHtml ||
|
||||||
extractorOptions?.mode === "llm-extraction-from-raw-html"
|
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
||||||
? rawHtml
|
? rawHtml
|
||||||
: undefined,
|
: undefined,
|
||||||
metadata: {
|
metadata: {
|
||||||
@ -434,7 +435,7 @@ export async function scrapSingleUrl(
|
|||||||
});
|
});
|
||||||
return {
|
return {
|
||||||
content: "",
|
content: "",
|
||||||
markdown: pageOptions.includeMarkdown ? "" : undefined,
|
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
|
||||||
html: "",
|
html: "",
|
||||||
linksOnPage: pageOptions.includeLinks ? [] : undefined,
|
linksOnPage: pageOptions.includeLinks ? [] : undefined,
|
||||||
metadata: {
|
metadata: {
|
||||||
@@ -199,21 +199,44 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
       );
     }

     // Free credits, no coupons
-    if (subscriptionError || !subscription) {
+    if (!subscription || subscriptionError) {

       // If there is no active subscription but there are available coupons
       if (couponCredits >= credits) {
         return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
       }

-      const { data: creditUsages, error: creditUsageError } =
-        await supabase_service
+      let creditUsages;
+      let creditUsageError;
+      let retries = 0;
+      const maxRetries = 3;
+      const retryInterval = 2000; // 2 seconds
+
+      while (retries < maxRetries) {
+        const result = await supabase_service
           .from("credit_usage")
           .select("credits_used")
           .is("subscription_id", null)
           .eq("team_id", team_id);
+
+        creditUsages = result.data;
+        creditUsageError = result.error;
+
+        if (!creditUsageError) {
+          break;
+        }
+
+        retries++;
+        if (retries < maxRetries) {
+          await new Promise(resolve => setTimeout(resolve, retryInterval));
+        }
+      }

       if (creditUsageError) {
+        Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
         throw new Error(
           `Failed to retrieve credit usage for team_id: ${team_id}`
         );
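The retry added above is three attempts with a 2-second pause and a log line after the final failure. The same behaviour could be factored into a small helper; this is a hedged sketch, not code from the commit, and `withRetries` is a hypothetical name:

```typescript
// Hypothetical helper equivalent to the inlined retry loop above.
async function withRetries<T>(
  query: () => Promise<{ data: T | null; error: unknown }>,
  maxRetries = 3,
  retryIntervalMs = 2000
): Promise<{ data: T | null; error: unknown }> {
  let last: { data: T | null; error: unknown } = { data: null, error: null };
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    last = await query();
    if (!last.error) break; // success, stop retrying
    if (attempt < maxRetries) {
      // wait before the next attempt
      await new Promise((resolve) => setTimeout(resolve, retryIntervalMs));
    }
  }
  return last;
}
```

Called with the Supabase `credit_usage` query as `query`, this reproduces the three attempts and 2-second spacing of the loop in the hunk.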
@@ -62,6 +62,7 @@ export function waitForJob(jobId: string, timeout: number) {
         clearInterval(int);
         resolve((await getScrapeQueue().getJob(jobId)).returnvalue);
       } else if (state === "failed") {
+        // console.log("failed", (await getScrapeQueue().getJob(jobId)).failedReason);
         clearInterval(int);
         reject((await getScrapeQueue().getJob(jobId)).failedReason);
       }
@@ -192,17 +192,16 @@ async function processJob(job: Job, token: string) {
       job,
       token,
     });

+    // Better if we throw here so we capture with the correct error
+    if(!success) {
+      throw new Error(message);
+    }
     const end = Date.now();
     const timeTakenInSeconds = (end - start) / 1000;

     const rawHtml = docs[0] ? docs[0].rawHtml : "";

-    if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
-      if (docs[0] && docs[0].rawHtml) {
-        delete docs[0].rawHtml;
-      }
-    }
-
     const data = {
       success,
       result: {
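Throwing when `success` is false hands the real error back to the queue instead of returning a nominally successful job that carries an error payload. Assuming this worker is BullMQ (the `(job, token)` processor signature suggests it), a thrown error moves the job to the failed state and its message becomes `failedReason`, which is exactly what `waitForJob` above rejects with. A hedged sketch of that mechanism; the queue name and `runScrape` are illustrative stand-ins:

```typescript
import { Worker, Job } from "bullmq";

// Hypothetical stand-in for the real scrape pipeline.
async function runScrape(
  job: Job
): Promise<{ success: boolean; message: string; docs: unknown[] }> {
  return { success: false, message: `scrape failed for job ${job.id}`, docs: [] };
}

// Illustrative worker: an error thrown by the processor marks the job as failed,
// and its message is stored as job.failedReason (what waitForJob rejects with).
const worker = new Worker(
  "scrapeQueue", // illustrative queue name
  async (job) => {
    const { success, message, docs } = await runScrape(job);
    if (!success) {
      throw new Error(message);
    }
    return { success, docs };
  },
  { connection: { host: "localhost", port: 6379 } }
);
```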
@@ -104,6 +104,14 @@ export const devBRateLimiter = new RateLimiterRedis({
   duration: 60, // Duration in seconds
 });

+export const scrapeStatusRateLimiter = new RateLimiterRedis({
+  storeClient: redisRateLimitClient,
+  keyPrefix: "scrape-status",
+  points: 400,
+  duration: 60, // Duration in seconds
+});
+
 export function getRateLimiter(
   mode: RateLimiterMode,
   token: string,
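The new `scrapeStatusRateLimiter` follows the same `rate-limiter-flexible` pattern as the existing limiters, allowing 400 requests per 60 seconds under the `scrape-status` key prefix. A hedged sketch of how a status route might consume it; the Express middleware and the choice of key are illustrative, not part of this commit:

```typescript
import { NextFunction, Request, Response } from "express";
import { scrapeStatusRateLimiter } from "./rate-limiter"; // illustrative import path

// Illustrative middleware: at most 400 scrape-status checks per key per minute.
export async function scrapeStatusRateLimit(
  req: Request,
  res: Response,
  next: NextFunction
) {
  try {
    // consume one point per request; keying on the client IP is an assumption
    await scrapeStatusRateLimiter.consume(req.ip ?? "unknown");
    next();
  } catch {
    res.status(429).json({ error: "Rate limit exceeded for scrape status requests." });
  }
}
```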
@@ -5,7 +5,6 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.CrawlWatcher = void 0;
 const axios_1 = __importDefault(require("axios"));
-const zod_1 = require("zod");
 const zod_to_json_schema_1 = require("zod-to-json-schema");
 const isows_1 = require("isows");
 const typescript_event_target_1 = require("typescript-event-target");
@@ -34,18 +33,19 @@ class FirecrawlApp {
             Authorization: `Bearer ${this.apiKey}`,
         };
         let jsonData = { url, ...params };
-        if (jsonData?.extractorOptions?.extractionSchema) {
-            let schema = jsonData.extractorOptions.extractionSchema;
-            // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
-            if (schema instanceof zod_1.z.ZodSchema) {
+        if (jsonData?.extract?.schema) {
+            let schema = jsonData.extract.schema;
+            // Try parsing the schema as a Zod schema
+            try {
                 schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema);
             }
+            catch (error) {
+            }
             jsonData = {
                 ...jsonData,
-                extractorOptions: {
-                    ...jsonData.extractorOptions,
-                    extractionSchema: schema,
-                    mode: jsonData.extractorOptions.mode || "llm-extraction",
+                extract: {
+                    ...jsonData.extract,
+                    schema: schema,
                 },
             };
         }
@@ -1,5 +1,4 @@
 import axios from "axios";
-import { z } from "zod";
 import { zodToJsonSchema } from "zod-to-json-schema";
 import { WebSocket } from "isows";
 import { TypedEventTarget } from "typescript-event-target";
@@ -28,18 +27,19 @@ export default class FirecrawlApp {
             Authorization: `Bearer ${this.apiKey}`,
         };
         let jsonData = { url, ...params };
-        if (jsonData?.extractorOptions?.extractionSchema) {
-            let schema = jsonData.extractorOptions.extractionSchema;
-            // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
-            if (schema instanceof z.ZodSchema) {
+        if (jsonData?.extract?.schema) {
+            let schema = jsonData.extract.schema;
+            // Try parsing the schema as a Zod schema
+            try {
                 schema = zodToJsonSchema(schema);
             }
+            catch (error) {
+            }
            jsonData = {
                ...jsonData,
-                extractorOptions: {
-                    ...jsonData.extractorOptions,
-                    extractionSchema: schema,
-                    mode: jsonData.extractorOptions.mode || "llm-extraction",
+                extract: {
+                    ...jsonData.extract,
+                    schema: schema,
                },
            };
        }
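In both SDK builds the `instanceof z.ZodSchema` check is replaced by attempting the conversion and swallowing any failure, likely because `instanceof` breaks when the caller's copy of `zod` is a different package instance than the SDK's; that is also why the direct `zod` import can be dropped. A small standalone sketch of the pattern; `toJsonSchema` is a hypothetical name:

```typescript
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";

// Convert Zod schemas to JSON Schema; pass anything else (e.g. a plain JSON Schema) through.
function toJsonSchema(schema: unknown): unknown {
  try {
    return zodToJsonSchema(schema as z.ZodTypeAny);
  } catch {
    return schema; // not a Zod schema: forward unchanged
  }
}

const ArticleSchema = z.object({ title: z.string(), points: z.number() });
console.log(toJsonSchema(ArticleSchema));      // JSON Schema generated from the Zod schema
console.log(toJsonSchema({ type: "object" })); // already JSON Schema, returned as-is
```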
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.1.0",
+  "version": "1.2.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "build/cjs/index.js",
   "types": "types/index.d.ts",
@@ -64,6 +64,7 @@ export interface FirecrawlDocument {
   html?: string;
   rawHtml?: string;
   links?: string[];
+  extract?: Record<any, any>;
   screenshot?: string;
   metadata?: FirecrawlDocumentMetadata;
 }
@@ -73,11 +74,16 @@ export interface FirecrawlDocument {
  * Defines the options and configurations available for scraping web content.
  */
 export interface ScrapeParams {
-  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "full@scrennshot")[];
+  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[];
   headers?: Record<string, string>;
   includeTags?: string[];
   excludeTags?: string[];
   onlyMainContent?: boolean;
+  extract?: {
+    prompt?: string;
+    schema?: z.ZodSchema | any;
+    systemPrompt?: string;
+  };
   waitFor?: number;
   timeout?: number;
 }
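With `"extract"` added to `formats` and the new `extract` options block, a v1 scrape that returns structured data reads roughly as below. This is a hedged usage sketch: the schema fields are made up, and the response is assumed to expose the `extract` property added to `FirecrawlDocument` above.

```typescript
import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

// Hypothetical schema; the SDK converts it to JSON Schema before sending the request.
const schema = z.object({
  title: z.string(),
  summary: z.string(),
});

async function main() {
  const result = await app.scrapeUrl("https://firecrawl.dev", {
    formats: ["extract"],
    extract: { schema },
  });
  if (result && "extract" in result) {
    console.log(result.extract); // structured data shaped by the schema
  }
}

main();
```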
@ -196,18 +202,20 @@ export default class FirecrawlApp {
|
|||||||
Authorization: `Bearer ${this.apiKey}`,
|
Authorization: `Bearer ${this.apiKey}`,
|
||||||
} as AxiosRequestHeaders;
|
} as AxiosRequestHeaders;
|
||||||
let jsonData: any = { url, ...params };
|
let jsonData: any = { url, ...params };
|
||||||
if (jsonData?.extractorOptions?.extractionSchema) {
|
if (jsonData?.extract?.schema) {
|
||||||
let schema = jsonData.extractorOptions.extractionSchema;
|
let schema = jsonData.extract.schema;
|
||||||
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
|
|
||||||
if (schema instanceof z.ZodSchema) {
|
// Try parsing the schema as a Zod schema
|
||||||
|
try {
|
||||||
schema = zodToJsonSchema(schema);
|
schema = zodToJsonSchema(schema);
|
||||||
|
} catch (error) {
|
||||||
|
|
||||||
}
|
}
|
||||||
jsonData = {
|
jsonData = {
|
||||||
...jsonData,
|
...jsonData,
|
||||||
extractorOptions: {
|
extract: {
|
||||||
...jsonData.extractorOptions,
|
...jsonData.extract,
|
||||||
extractionSchema: schema,
|
schema: schema,
|
||||||
mode: jsonData.extractorOptions.mode || "llm-extraction",
|
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
apps/js-sdk/firecrawl/types/index.d.ts (vendored)
@@ -1,4 +1,5 @@
 import { AxiosResponse, AxiosRequestHeaders } from "axios";
+import { z } from "zod";
 import { TypedEventTarget } from "typescript-event-target";
 /**
  * Configuration interface for FirecrawlApp.
@@ -58,6 +59,7 @@ export interface FirecrawlDocument {
     html?: string;
     rawHtml?: string;
     links?: string[];
+    extract?: Record<any, any>;
     screenshot?: string;
     metadata?: FirecrawlDocumentMetadata;
 }
@@ -66,11 +68,16 @@ export interface FirecrawlDocument {
  * Defines the options and configurations available for scraping web content.
  */
 export interface ScrapeParams {
-    formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "full@scrennshot")[];
+    formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[];
     headers?: Record<string, string>;
     includeTags?: string[];
     excludeTags?: string[];
     onlyMainContent?: boolean;
+    extract?: {
+        prompt?: string;
+        schema?: z.ZodSchema | any;
+        systemPrompt?: string;
+    };
     waitFor?: number;
     timeout?: number;
 }
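The optional `prompt` and `systemPrompt` fields cover schema-less extraction, where the model decides the structure of the returned data. A short hedged SDK sketch (URL and prompt are placeholders):

```typescript
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

async function main() {
  // Prompt-only extraction: no schema, the LLM chooses the output structure.
  const result = await app.scrapeUrl("https://firecrawl.dev", {
    formats: ["extract"],
    extract: { prompt: "Summarize what this product does and list its main features." },
  });
  if (result && "extract" in result) {
    console.log(result.extract);
  }
}

main();
```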
@@ -3,7 +3,7 @@ import nest_asyncio
 import uuid
 from firecrawl.firecrawl import FirecrawlApp

-app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
+app = FirecrawlApp(api_key="fc-")

 # Scrape a website:
 scrape_result = app.scrape_url('firecrawl.dev')
@ -33,63 +33,63 @@ print(crawl_status)
|
|||||||
|
|
||||||
# LLM Extraction:
|
# LLM Extraction:
|
||||||
# Define schema to extract contents into using pydantic
|
# Define schema to extract contents into using pydantic
|
||||||
# from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
# from typing import List
|
from typing import List
|
||||||
|
|
||||||
# class ArticleSchema(BaseModel):
|
class ArticleSchema(BaseModel):
|
||||||
# title: str
|
title: str
|
||||||
# points: int
|
points: int
|
||||||
# by: str
|
by: str
|
||||||
# commentsURL: str
|
commentsURL: str
|
||||||
|
|
||||||
# class TopArticlesSchema(BaseModel):
|
class TopArticlesSchema(BaseModel):
|
||||||
# top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
|
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
|
||||||
|
|
||||||
# llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
|
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
|
||||||
# 'extractorOptions': {
|
'formats': ['extract'],
|
||||||
# 'extractionSchema': TopArticlesSchema.model_json_schema(),
|
'extract': {
|
||||||
# 'mode': 'llm-extraction'
|
'schema': TopArticlesSchema.model_json_schema()
|
||||||
# },
|
}
|
||||||
# 'pageOptions':{
|
})
|
||||||
# 'onlyMainContent': True
|
|
||||||
# }
|
|
||||||
# })
|
|
||||||
|
|
||||||
# print(llm_extraction_result['llm_extraction'])
|
print(llm_extraction_result['extract'])
|
||||||
|
|
||||||
# # Define schema to extract contents into using json schema
|
# # Define schema to extract contents into using json schema
|
||||||
# json_schema = {
|
json_schema = {
|
||||||
# "type": "object",
|
"type": "object",
|
||||||
# "properties": {
|
"properties": {
|
||||||
# "top": {
|
"top": {
|
||||||
# "type": "array",
|
"type": "array",
|
||||||
# "items": {
|
"items": {
|
||||||
# "type": "object",
|
"type": "object",
|
||||||
# "properties": {
|
"properties": {
|
||||||
# "title": {"type": "string"},
|
"title": {"type": "string"},
|
||||||
# "points": {"type": "number"},
|
"points": {"type": "number"},
|
||||||
# "by": {"type": "string"},
|
"by": {"type": "string"},
|
||||||
# "commentsURL": {"type": "string"}
|
"commentsURL": {"type": "string"}
|
||||||
# },
|
},
|
||||||
# "required": ["title", "points", "by", "commentsURL"]
|
"required": ["title", "points", "by", "commentsURL"]
|
||||||
# },
|
},
|
||||||
# "minItems": 5,
|
"minItems": 5,
|
||||||
# "maxItems": 5,
|
"maxItems": 5,
|
||||||
# "description": "Top 5 stories on Hacker News"
|
"description": "Top 5 stories on Hacker News"
|
||||||
# }
|
}
|
||||||
# },
|
},
|
||||||
# "required": ["top"]
|
"required": ["top"]
|
||||||
# }
|
}
|
||||||
|
|
||||||
# llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
|
app2 = FirecrawlApp(api_key="fc-", version="v0")
|
||||||
# 'extractorOptions': {
|
|
||||||
# 'extractionSchema': json_schema,
|
|
||||||
# 'mode': 'llm-extraction'
|
llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
|
||||||
# },
|
'extractorOptions': {
|
||||||
# 'pageOptions':{
|
'extractionSchema': json_schema,
|
||||||
# 'onlyMainContent': True
|
'mode': 'llm-extraction'
|
||||||
# }
|
},
|
||||||
# })
|
'pageOptions':{
|
||||||
|
'onlyMainContent': True
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
# print(llm_extraction_result['llm_extraction'])
|
# print(llm_extraction_result['llm_extraction'])
|
||||||
|
|
||||||
@ -124,6 +124,3 @@ async def start_crawl_and_watch():
|
|||||||
|
|
||||||
# Start the watcher
|
# Start the watcher
|
||||||
await watcher.connect()
|
await watcher.connect()
|
||||||
|
|
||||||
# Run the event loop
|
|
||||||
await start_crawl_and_watch()
|
|
@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp

-__version__ = "1.1.1"
+__version__ = "1.2.1"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
|
@ -59,20 +59,16 @@ class FirecrawlApp:
|
|||||||
|
|
||||||
# If there are additional params, process them
|
# If there are additional params, process them
|
||||||
if params:
|
if params:
|
||||||
# Initialize extractorOptions if present
|
# Handle extract (for v1)
|
||||||
extractor_options = params.get('extractorOptions', {})
|
extract = params.get('extract', {})
|
||||||
# Check and convert the extractionSchema if it's a Pydantic model
|
if extract:
|
||||||
if 'extractionSchema' in extractor_options:
|
if 'schema' in extract and hasattr(extract['schema'], 'schema'):
|
||||||
if hasattr(extractor_options['extractionSchema'], 'schema'):
|
extract['schema'] = extract['schema'].schema()
|
||||||
extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
|
scrape_params['extract'] = extract
|
||||||
# Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
|
|
||||||
extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
|
|
||||||
# Update the scrape_params with the processed extractorOptions
|
|
||||||
scrape_params['extractorOptions'] = extractor_options
|
|
||||||
|
|
||||||
# Include any other params directly at the top level of scrape_params
|
# Include any other params directly at the top level of scrape_params
|
||||||
for key, value in params.items():
|
for key, value in params.items():
|
||||||
if key != 'extractorOptions':
|
if key not in ['extract']:
|
||||||
scrape_params[key] = value
|
scrape_params[key] = value
|
||||||
|
|
||||||
endpoint = f'/v1/scrape'
|
endpoint = f'/v1/scrape'
|
||||||