Merge branch 'tom/extract-v2' of https://github.com/mendableai/firecrawl into tom/extract-v2
Commit 9786bc2fc0
@@ -12,6 +12,7 @@ content-type: application/json
  "url":"https://firecrawl.dev"
}

+
### Crawl Website
# @name crawl
POST {{baseUrl}}/v1/crawl HTTP/1.1
@@ -120,3 +121,91 @@ content-type: application/json
GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}

### Scrape with JSON Schema Extraction
# @name scrapeWithSchema
POST {{baseUrl}}/v1/scrape HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
  "url": "https://firecrawl.dev",
  "formats": ["json"],
  "jsonOptions": {
    "schema": {
      "type": "object",
      "properties": {
        "description": {
          "type": "string",
          "description": "Describe the site"
        },
        "respect_robots_txt": {
          "type": ["boolean", "null"],
          "description": "Does firecrawl respect the robots.txt files?"
        }
      },
      "required": ["description", "respect_robots_txt"]
    }
    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
  }
}
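
For reference, the request above can also be issued from code. A minimal TypeScript sketch using global fetch; it assumes Node 18+, an API key in a FIRECRAWL_API_KEY environment variable, and the hosted API base URL standing in for {{baseUrl}} — the body simply mirrors the request shown:

// Sketch only, not the committed code: POST /v1/scrape with a JSON Schema.
// Assumes Node 18+ (global fetch) and FIRECRAWL_API_KEY in the environment.
const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    "content-type": "application/json",
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    formats: ["json"],
    jsonOptions: {
      schema: {
        type: "object",
        properties: {
          description: { type: "string", description: "Describe the site" },
          respect_robots_txt: {
            type: ["boolean", "null"],
            description: "Does firecrawl respect the robots.txt files?",
          },
        },
        required: ["description", "respect_robots_txt"],
      },
    },
  }),
});
console.log(await res.json());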

### Scrape with JSON Schema Extraction
# @name scrapeWithSchema
POST {{baseUrl}}/v1/scrape HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
  "url": "https://firecrawl.dev",
  "formats": ["json"],
  "jsonOptions": {
    "schema": {
      "type": "object",
      "properties": {
        "description": {
          "type": "string",
          "description": "Describe the site"
        }
      },
      "required": ["description"]
    }
    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
  }
}

### Scrape to Extract Array of Titles
# @name scrapeItemsArray
POST {{baseUrl}}/v1/scrape HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
  "url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params",
  "formats": ["json"],
  "jsonOptions": {
    "prompt": "Extract all the main article or blog post titles from the page into an array.",
    "schema": {
      "type": "object",
      "properties": {
        "items": {
          "type": "array",
          "description": "An array containing the extracted items.",
          "items": {
            "type": "object",
            "properties": {
              "title": {
                "type": "string",
                "description": "The title of a single article or blog post."
              }
            },
            "required": ["title"]
          }
        }
      },
      "required": ["items"]
    }
    // "systemPrompt": "You are an expert structured data extractor."
  }
}
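
The array schema above implies a result shape like the following; a hedged TypeScript sketch in which the type names are hypothetical and only the fields and descriptions come from the schema:

// Hypothetical types mirroring the scrapeItemsArray schema above.
interface ExtractedItem {
  // "The title of a single article or blog post."
  title: string;
}

interface ItemsArrayExtract {
  // "An array containing the extracted items."
  items: ExtractedItem[];
}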

@@ -1,5 +1,8 @@
import { logger } from "../../../lib/logger";
-import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
+import {
+  generateCompletions,
+  GenerateCompletionsOptions,
+} from "../../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "../build-document";
import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
import { Document } from "../../../controllers/v1/types";
@@ -74,5 +77,5 @@ export async function batchExtractPromise(
    },
    warning: warning,
    sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
-  }
+  };
}

@@ -1,5 +1,8 @@
import { logger } from "../../../lib/logger";
-import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
+import {
+  generateCompletions,
+  GenerateCompletionsOptions,
+} from "../../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "../build-document";
import { Document, TokenUsage } from "../../../controllers/v1/types";
import { getModel } from "../../../lib/generic-ai";
@@ -57,7 +60,6 @@ export async function singleAnswerCompletion({
  ),
};

-
// const completion = await generateCompletions({
//   logger: logger.child({ module: "extract", method: "generateCompletions" }),
//   options: {
@@ -78,12 +80,7 @@ export async function singleAnswerCompletion({
// );
return {
  extract: completion.extract,
-  tokenUsage: {
-    promptTokens: 0,
-    completionTokens: 0,
-    totalTokens: 0,
-    model: "gemini-2.0-flash",
-  },
+  tokenUsage: completion.tokenUsage,
  sources: singleAnswerDocs.map(
    (doc) => doc.metadata.url || doc.metadata.sourceURL || "",
  ),
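
The hunk above replaces a hard-coded usage object with the completion's actual numbers; the removed literal documents the fields involved. For orientation, a sketch of that shape — the interface itself is an assumption inferred from the removed lines (the real TokenUsage type lives in ../../../controllers/v1/types):

// Assumed shape, inferred from the removed literal above.
interface TokenUsage {
  promptTokens: number;
  completionTokens: number;
  totalTokens: number;
  model: string;
}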
@@ -455,11 +455,12 @@ export async function performExtraction(
  );

  // Race between timeout and completion
-  const multiEntityCompletion = await completionPromise as Awaited<ReturnType<typeof batchExtractPromise>>;
+  const multiEntityCompletion = (await completionPromise) as Awaited<
+    ReturnType<typeof batchExtractPromise>
+  >;

  // TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema

  // Track multi-entity extraction tokens
  if (multiEntityCompletion) {
    tokenUsage.push(multiEntityCompletion.totalUsage);
@@ -520,7 +521,9 @@ export async function performExtraction(
  );
  extractionResults.push(...validResults);
  // Merge all extracts from valid results into a single array
-  const extractArrays = validResults.map(r => Array.isArray(r.extract) ? r.extract : [r.extract]);
+  const extractArrays = validResults.map((r) =>
+    Array.isArray(r.extract) ? r.extract : [r.extract],
+  );
  const mergedExtracts = extractArrays.flat();
  multiEntityCompletions.push(...mergedExtracts);
  logger.debug("All multi-entity completion chunks finished.", {
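
The merge step above normalizes each result's extract to an array before flattening, so single-object and array extracts combine uniformly. A standalone TypeScript sketch of that pattern, with hypothetical inputs:

// Hypothetical inputs: extract may be a single object or an array.
const sampleResults = [
  { extract: { title: "A" } },
  { extract: [{ title: "B" }, { title: "C" }] },
];
const merged = sampleResults
  .map((r) => (Array.isArray(r.extract) ? r.extract : [r.extract]))
  .flat();
console.log(merged); // [{ title: "A" }, { title: "B" }, { title: "C" }]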
@@ -692,10 +695,7 @@ export async function performExtraction(
  });
  logger.debug("Done generating singleAnswer completions.");

-  singleAnswerResult = transformArrayToObject(
-    rSchema,
-    completionResult,
-  );
+  singleAnswerResult = transformArrayToObject(rSchema, completionResult);

  singleAnswerResult = deduplicateObjectsArray(singleAnswerResult);
@@ -11,7 +11,7 @@ const commonSmartScrapeProperties = {
  shouldUseSmartscrape: {
    type: "boolean",
    description:
-      "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.",
+      "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text). SmartScrape can perform these actions to access the data.",
  },
  // Note: extractedData is added dynamically in prepareSmartScrapeSchema
};
@@ -22,13 +22,13 @@ const commonReasoningPromptProperties = {
    type: ["string", "null"],
    // Using the more detailed multi-step description as the common one
    description:
-      "Reasoning for why a SmartScrape step/action is needed. Explain which data is missing or requires interaction.",
+      "Reasoning for why a SmartScrape is needed. Explain which data is missing or requires interaction.",
  },
  smartscrape_prompt: {
    type: ["string", "null"],
    // Using the more detailed multi-step description as the common one
    description:
-      "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X', 'scroll down').",
+      "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X'). Don't mention anything about extraction; SmartScrape just returns page content.",
  },
};
@@ -52,7 +52,7 @@ const multiSmartScrapeWrapperSchemaDefinition = {
  smartScrapePages: {
    type: "array",
    description:
-      "Make an entry for each page we want to run smart scrape on.",
+      "Make an entry for each page we want to run smart scrape on; no matter how many actions a page needs, it should be one entry per page.",
    items: {
      type: "object",
      properties: {
@@ -185,7 +185,7 @@ export async function extractData({
  //WRAP SCHEMA
  const schema = extractOptions.options.schema;
  const logger = extractOptions.logger;
-  const isSingleUrl = urls.length === 0;
+  const isSingleUrl = urls.length === 1;
  console.log("!!!!!!!!!!!!!!!!!!hereee");
  const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl);
  const extractOptionsNewSchema = {
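
The -/+ pair above fixes the single-URL predicate: a single-page extraction has exactly one URL, not zero. A trivial check of the corrected logic, with a hypothetical input:

// With the old predicate, a one-URL call was treated as multi-URL.
const urls = ["https://firecrawl.dev"];
const isSingleUrl = urls.length === 1; // true, as intended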
@@ -239,15 +239,18 @@ export async function extractData({
      }),
    );
  }
  console.log("smartscrapeResults", smartscrapeResults);

  const scrapedPages = smartscrapeResults.map(
    (result) => result.scrapedPages,
  );
-  const htmls = scrapedPages.map((page) => page.html);
+  console.log("scrapedPages", scrapedPages);
+  const htmls = scrapedPages.flat().map((page) => page.html);
+  console.log("htmls", htmls);
  const markdowns = await Promise.all(
    htmls.map(async (html) => await parseMarkdown(html)),
  );
+  console.log("markdowns", markdowns);
  extractedData = await Promise.all(
    markdowns.map(async (markdown) => {
      const newExtractOptions = {
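
The key fix above is the flat(): mapping each smartscrape result to its scrapedPages yields an array of page arrays, so the pages must be flattened before pulling out each page's html. A minimal illustration with hypothetical data:

// Hypothetical: two results, each contributing a list of scraped pages.
const pages = [
  [{ html: "<p>a</p>" }],
  [{ html: "<p>b</p>" }, { html: "<p>c</p>" }],
];
const htmls = pages.flat().map((page) => page.html);
console.log(htmls); // ["<p>a</p>", "<p>b</p>", "<p>c</p>"]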
@@ -413,6 +413,7 @@ export async function performLLMExtract(
-  // model: getModel("o3-mini", "openai"), // Keeping existing model selection
+  // model: getModel("o3-mini", "openai"),
+  // model: getModel("qwen-qwq-32b", "groq"),
  // model: getModel("gemini-2.0-flash", "google"),
  model: getModel("gemini-2.5-pro-exp-03-25", "google"),
};