mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-16 17:56:02 +08:00
fix
parent 2ffde5abc1
commit fea102dac6
@@ -12,6 +12,7 @@ content-type: application/json
     "url":"https://firecrawl.dev"
 }
 
+
 ### Crawl Website
 # @name crawl
 POST {{baseUrl}}/v1/crawl HTTP/1.1
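For reference, the crawl request touched by this hunk can also be issued from TypeScript. A minimal sketch, assuming Node 18+ (built-in fetch), the same TEST_API_KEY the .http file reads, a default base URL of http://localhost:3002, and a response of the form { id } for the queued job; none of these assumptions are shown by the diff itself.

// Sketch of the "### Crawl Website" request in TypeScript.
// Assumed: Node 18+ global fetch; the endpoint answers { id: string }.
const baseUrl = process.env.FIRECRAWL_API_URL ?? "http://localhost:3002";

async function startCrawl(url: string): Promise<string> {
  const res = await fetch(`${baseUrl}/v1/crawl`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "content-type": "application/json",
    },
    body: JSON.stringify({ url }),
  });
  if (!res.ok) throw new Error(`crawl failed: ${res.status}`);
  const body = (await res.json()) as { id: string };
  return body.id; // job id for later status polling (assumed response shape)
}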
@@ -120,3 +121,91 @@ content-type: application/json
 GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 
+### Scrape with JSON Schema Extraction
+# @name scrapeWithSchema
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+    "url": "https://firecrawl.dev",
+    "formats": ["json"],
+    "jsonOptions": {
+        "schema": {
+            "type": "object",
+            "properties": {
+                "description": {
+                    "type": "string",
+                    "description": "Describe the site"
+                },
+                "respect_robots_txt": {
+                    "type": ["boolean","null"],
+                    "description": "Does firecrawl respect the robots.txt files?"
+                }
+            },
+            "required": ["description", "respect_robots_txt"]
+        }
+        // "systemPrompt": "You are an expert web scraper." // Optional system prompt
+    }
+}
+
+
+### Scrape with JSON Schema Extraction
+# @name scrapeWithSchema
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+    "url": "https://firecrawl.dev",
+    "formats": ["json"],
+    "jsonOptions": {
+        "schema": {
+            "type": "object",
+            "properties": {
+                "description": {
+                    "type": "string",
+                    "description": "Describe the site"
+                }
+
+            },
+            "required": ["description"]
+        }
+        // "systemPrompt": "You are an expert web scraper." // Optional system prompt
+    }
+}
+
+### Scrape to Extract Array of Titles
+# @name scrapeItemsArray
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+    "url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params",
+    "formats": ["json"],
+    "jsonOptions": {
+        "prompt": "Extract all the main article or blog post titles from the page into an array.",
+        "schema": {
+            "type": "object",
+            "properties": {
+                "items": {
+                    "type": "array",
+                    "description": "An array containing the extracted items.",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "title": {
+                                "type": "string",
+                                "description": "The title of a single article or blog post."
+                            }
+                        },
+                        "required": ["title"]
+                    }
+                }
+            },
+            "required": ["items"]
+        }
+        // "systemPrompt": "You are an expert structured data extractor."
+    }
+}
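The three added requests all exercise the "json" format with a jsonOptions.schema, so the extractor must return output conforming to a caller-supplied JSON Schema; the third one additionally nests an array-of-objects schema so each extracted title is validated individually. Below is a rough TypeScript client for the first request, reusing baseUrl and the env conventions from the sketch above. The location of the structured result (body.data.json) is an assumption based on the requested format, not something this diff shows.

// Sketch of "### Scrape with JSON Schema Extraction" from TypeScript.
// SiteInfo mirrors the schema in the request body above.
interface SiteInfo {
  description: string;
  respect_robots_txt: boolean | null;
}

async function scrapeWithSchema(): Promise<SiteInfo> {
  const res = await fetch(`${baseUrl}/v1/scrape`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "content-type": "application/json",
    },
    body: JSON.stringify({
      url: "https://firecrawl.dev",
      formats: ["json"],
      jsonOptions: {
        schema: {
          type: "object",
          properties: {
            description: { type: "string", description: "Describe the site" },
            respect_robots_txt: {
              type: ["boolean", "null"],
              description: "Does firecrawl respect the robots.txt files?",
            },
          },
          required: ["description", "respect_robots_txt"],
        },
      },
    }),
  });
  const body = await res.json();
  return body.data.json as SiteInfo; // assumed response shape
}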
@@ -230,15 +230,18 @@ export async function extractData({
       }),
     );
   }
+  console.log("smartscrapeResults", smartscrapeResults);
 
   const scrapedPages = smartscrapeResults.map(
     (result) => result.scrapedPages,
   );
-  const htmls = scrapedPages.map((page) => page.html);
+  console.log("scrapedPages", scrapedPages);
+  const htmls = scrapedPages.flat().map((page) => page.html);
+  console.log("htmls", htmls);
   const markdowns = await Promise.all(
     htmls.map(async (html) => await parseMarkdown(html)),
   );
+  console.log("markdowns", markdowns);
   extractedData = await Promise.all(
     markdowns.map(async (markdown) => {
       const newExtractOptions = {
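The substantive change here is the added .flat(): smartscrapeResults.map((result) => result.scrapedPages) yields an array of page arrays, one inner array per smartscrape result, so the old scrapedPages.map((page) => page.html) read .html off the inner arrays and collected undefined values. Flattening one level first restores a flat list of pages; the new console.log calls are debug tracing around each stage. A minimal sketch of the bug, with hypothetical data:

// Hypothetical shape: each smartscrape result carries a list of pages.
interface ScrapedPage { html: string }

const scrapedPages: ScrapedPage[][] = [
  [{ html: "<p>a</p>" }, { html: "<p>b</p>" }],
  [{ html: "<p>c</p>" }],
];

// Before the fix: the callback receives an inner ScrapedPage[] array,
// which has no .html property, so every element comes back undefined.
const broken = scrapedPages.map((page) => (page as any).html); // [undefined, undefined]

// After the fix: flatten one level, then collect each page's html.
const fixed = scrapedPages.flat().map((page) => page.html);
// ["<p>a</p>", "<p>b</p>", "<p>c</p>"]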