mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-18 07:35:55 +08:00
Merge branch 'tom/extract-v2' of https://github.com/mendableai/firecrawl into tom/extract-v2
This commit is contained in:
commit
9786bc2fc0
@ -12,6 +12,7 @@ content-type: application/json
|
|||||||
"url":"https://firecrawl.dev"
|
"url":"https://firecrawl.dev"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
### Crawl Website
|
### Crawl Website
|
||||||
# @name crawl
|
# @name crawl
|
||||||
POST {{baseUrl}}/v1/crawl HTTP/1.1
|
POST {{baseUrl}}/v1/crawl HTTP/1.1
|
||||||
@ -120,3 +121,91 @@ content-type: application/json
|
|||||||
GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
|
GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
|
||||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
|
|
||||||
|
### Scrape with JSON Schema Extraction
|
||||||
|
# @name scrapeWithSchema
|
||||||
|
POST {{baseUrl}}/v1/scrape HTTP/1.1
|
||||||
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
|
content-type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"url": "https://firecrawl.dev",
|
||||||
|
"formats": ["json"],
|
||||||
|
"jsonOptions": {
|
||||||
|
"schema": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"description": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Describe the site"
|
||||||
|
},
|
||||||
|
"respect_robots_txt": {
|
||||||
|
"type": ["boolean","null"],
|
||||||
|
"description": "Does firecrawl respect the robots.txt files?"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["description", "respect_robots_txt"]
|
||||||
|
}
|
||||||
|
// "systemPrompt": "You are an expert web scraper." // Optional system prompt
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
### Scrape with JSON Schema Extraction
|
||||||
|
# @name scrapeWithSchema
|
||||||
|
POST {{baseUrl}}/v1/scrape HTTP/1.1
|
||||||
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
|
content-type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"url": "https://firecrawl.dev",
|
||||||
|
"formats": ["json"],
|
||||||
|
"jsonOptions": {
|
||||||
|
"schema": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"description": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Describe the site"
|
||||||
|
}
|
||||||
|
|
||||||
|
},
|
||||||
|
"required": ["description" ]
|
||||||
|
}
|
||||||
|
// "systemPrompt": "You are an expert web scraper." // Optional system prompt
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
### Scrape to Extract Array of Titles
|
||||||
|
# @name scrapeItemsArray
|
||||||
|
POST {{baseUrl}}/v1/scrape HTTP/1.1
|
||||||
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
|
content-type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params",
|
||||||
|
"formats": ["json"],
|
||||||
|
"jsonOptions": {
|
||||||
|
"prompt": "Extract all the main article or blog post titles from the page into an array.",
|
||||||
|
"schema": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"items": {
|
||||||
|
"type": "array",
|
||||||
|
"description": "An array containing the extracted items.",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"title": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The title of a single article or blog post."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["title"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["items"]
|
||||||
|
}
|
||||||
|
// "systemPrompt": "You are an expert structured data extractor."
|
||||||
|
}
|
||||||
|
}
|
@ -1,5 +1,8 @@
|
|||||||
import { logger } from "../../../lib/logger";
|
import { logger } from "../../../lib/logger";
|
||||||
import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
|
import {
|
||||||
|
generateCompletions,
|
||||||
|
GenerateCompletionsOptions,
|
||||||
|
} from "../../../scraper/scrapeURL/transformers/llmExtract";
|
||||||
import { buildDocument } from "../build-document";
|
import { buildDocument } from "../build-document";
|
||||||
import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
|
import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
|
||||||
import { Document } from "../../../controllers/v1/types";
|
import { Document } from "../../../controllers/v1/types";
|
||||||
@ -74,5 +77,5 @@ export async function batchExtractPromise(
|
|||||||
},
|
},
|
||||||
warning: warning,
|
warning: warning,
|
||||||
sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
|
sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
import { logger } from "../../../lib/logger";
|
import { logger } from "../../../lib/logger";
|
||||||
import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
|
import {
|
||||||
|
generateCompletions,
|
||||||
|
GenerateCompletionsOptions,
|
||||||
|
} from "../../../scraper/scrapeURL/transformers/llmExtract";
|
||||||
import { buildDocument } from "../build-document";
|
import { buildDocument } from "../build-document";
|
||||||
import { Document, TokenUsage } from "../../../controllers/v1/types";
|
import { Document, TokenUsage } from "../../../controllers/v1/types";
|
||||||
import { getModel } from "../../../lib/generic-ai";
|
import { getModel } from "../../../lib/generic-ai";
|
||||||
@ -39,7 +42,7 @@ export async function singleAnswerCompletion({
|
|||||||
model: getModel("gemini-2.0-flash", "google"),
|
model: getModel("gemini-2.0-flash", "google"),
|
||||||
};
|
};
|
||||||
|
|
||||||
const { extractedDataArray, warning } = await extractData({
|
const { extractedDataArray, warning } = await extractData({
|
||||||
extractOptions: generationOptions,
|
extractOptions: generationOptions,
|
||||||
urls: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""),
|
urls: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""),
|
||||||
});
|
});
|
||||||
@ -57,7 +60,6 @@ export async function singleAnswerCompletion({
|
|||||||
),
|
),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
// const completion = await generateCompletions({
|
// const completion = await generateCompletions({
|
||||||
// logger: logger.child({ module: "extract", method: "generateCompletions" }),
|
// logger: logger.child({ module: "extract", method: "generateCompletions" }),
|
||||||
// options: {
|
// options: {
|
||||||
@ -78,12 +80,7 @@ export async function singleAnswerCompletion({
|
|||||||
// );
|
// );
|
||||||
return {
|
return {
|
||||||
extract: completion.extract,
|
extract: completion.extract,
|
||||||
tokenUsage: {
|
tokenUsage: completion.tokenUsage,
|
||||||
promptTokens: 0,
|
|
||||||
completionTokens: 0,
|
|
||||||
totalTokens: 0,
|
|
||||||
model: "gemini-2.0-flash",
|
|
||||||
},
|
|
||||||
sources: singleAnswerDocs.map(
|
sources: singleAnswerDocs.map(
|
||||||
(doc) => doc.metadata.url || doc.metadata.sourceURL || "",
|
(doc) => doc.metadata.url || doc.metadata.sourceURL || "",
|
||||||
),
|
),
|
||||||
|
@ -455,10 +455,11 @@ export async function performExtraction(
|
|||||||
);
|
);
|
||||||
|
|
||||||
// Race between timeout and completion
|
// Race between timeout and completion
|
||||||
const multiEntityCompletion = await completionPromise as Awaited<ReturnType<typeof batchExtractPromise>>;
|
const multiEntityCompletion = (await completionPromise) as Awaited<
|
||||||
|
ReturnType<typeof batchExtractPromise>
|
||||||
|
>;
|
||||||
|
|
||||||
// TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema
|
// TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema
|
||||||
|
|
||||||
|
|
||||||
// Track multi-entity extraction tokens
|
// Track multi-entity extraction tokens
|
||||||
if (multiEntityCompletion) {
|
if (multiEntityCompletion) {
|
||||||
@ -520,7 +521,9 @@ export async function performExtraction(
|
|||||||
);
|
);
|
||||||
extractionResults.push(...validResults);
|
extractionResults.push(...validResults);
|
||||||
// Merge all extracts from valid results into a single array
|
// Merge all extracts from valid results into a single array
|
||||||
const extractArrays = validResults.map(r => Array.isArray(r.extract) ? r.extract : [r.extract]);
|
const extractArrays = validResults.map((r) =>
|
||||||
|
Array.isArray(r.extract) ? r.extract : [r.extract],
|
||||||
|
);
|
||||||
const mergedExtracts = extractArrays.flat();
|
const mergedExtracts = extractArrays.flat();
|
||||||
multiEntityCompletions.push(...mergedExtracts);
|
multiEntityCompletions.push(...mergedExtracts);
|
||||||
logger.debug("All multi-entity completion chunks finished.", {
|
logger.debug("All multi-entity completion chunks finished.", {
|
||||||
@ -692,10 +695,7 @@ export async function performExtraction(
|
|||||||
});
|
});
|
||||||
logger.debug("Done generating singleAnswer completions.");
|
logger.debug("Done generating singleAnswer completions.");
|
||||||
|
|
||||||
singleAnswerResult = transformArrayToObject(
|
singleAnswerResult = transformArrayToObject(rSchema, completionResult);
|
||||||
rSchema,
|
|
||||||
completionResult,
|
|
||||||
);
|
|
||||||
|
|
||||||
singleAnswerResult = deduplicateObjectsArray(singleAnswerResult);
|
singleAnswerResult = deduplicateObjectsArray(singleAnswerResult);
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@ const commonSmartScrapeProperties = {
|
|||||||
shouldUseSmartscrape: {
|
shouldUseSmartscrape: {
|
||||||
type: "boolean",
|
type: "boolean",
|
||||||
description:
|
description:
|
||||||
"Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.",
|
"Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text). SmartScrape can perform these actions to access the data.",
|
||||||
},
|
},
|
||||||
// Note: extractedData is added dynamically in prepareSmartScrapeSchema
|
// Note: extractedData is added dynamically in prepareSmartScrapeSchema
|
||||||
};
|
};
|
||||||
@ -22,13 +22,13 @@ const commonReasoningPromptProperties = {
|
|||||||
type: ["string", "null"],
|
type: ["string", "null"],
|
||||||
// Using the more detailed multi-step description as the common one
|
// Using the more detailed multi-step description as the common one
|
||||||
description:
|
description:
|
||||||
"Reasoning for why a SmartScrape step/action is needed. Explain which data is missing or requires interaction.",
|
"Reasoning for why a SmartScrape is needed. Explain which data is missing or requires interaction.",
|
||||||
},
|
},
|
||||||
smartscrape_prompt: {
|
smartscrape_prompt: {
|
||||||
type: ["string", "null"],
|
type: ["string", "null"],
|
||||||
// Using the more detailed multi-step description as the common one
|
// Using the more detailed multi-step description as the common one
|
||||||
description:
|
description:
|
||||||
"Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X', 'scroll down').",
|
"Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X''). Dont mention anything about extraction, smartscrape just returns page content",
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -52,7 +52,7 @@ const multiSmartScrapeWrapperSchemaDefinition = {
|
|||||||
smartScrapePages: {
|
smartScrapePages: {
|
||||||
type: "array",
|
type: "array",
|
||||||
description:
|
description:
|
||||||
"Make an entry for each page we want to run smart scrape on.",
|
"Make an entry for each page we want to run smart scrape on, no matter how many actions it should be one entry per page.",
|
||||||
items: {
|
items: {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
@ -185,7 +185,7 @@ export async function extractData({
|
|||||||
//WRAP SCHEMA
|
//WRAP SCHEMA
|
||||||
const schema = extractOptions.options.schema;
|
const schema = extractOptions.options.schema;
|
||||||
const logger = extractOptions.logger;
|
const logger = extractOptions.logger;
|
||||||
const isSingleUrl = urls.length === 0;
|
const isSingleUrl = urls.length === 1;
|
||||||
console.log("!!!!!!!!!!!!!!!!!!hereee");
|
console.log("!!!!!!!!!!!!!!!!!!hereee");
|
||||||
const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl);
|
const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl);
|
||||||
const extractOptionsNewSchema = {
|
const extractOptionsNewSchema = {
|
||||||
@ -239,15 +239,18 @@ export async function extractData({
|
|||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
console.log("smartscrapeResults", smartscrapeResults);
|
||||||
|
|
||||||
const scrapedPages = smartscrapeResults.map(
|
const scrapedPages = smartscrapeResults.map(
|
||||||
(result) => result.scrapedPages,
|
(result) => result.scrapedPages,
|
||||||
);
|
);
|
||||||
const htmls = scrapedPages.map((page) => page.html);
|
console.log("scrapedPages", scrapedPages);
|
||||||
|
const htmls = scrapedPages.flat().map((page) => page.html);
|
||||||
|
console.log("htmls", htmls);
|
||||||
const markdowns = await Promise.all(
|
const markdowns = await Promise.all(
|
||||||
htmls.map(async (html) => await parseMarkdown(html)),
|
htmls.map(async (html) => await parseMarkdown(html)),
|
||||||
);
|
);
|
||||||
|
console.log("markdowns", markdowns);
|
||||||
extractedData = await Promise.all(
|
extractedData = await Promise.all(
|
||||||
markdowns.map(async (markdown) => {
|
markdowns.map(async (markdown) => {
|
||||||
const newExtractOptions = {
|
const newExtractOptions = {
|
||||||
|
@ -413,6 +413,7 @@ export async function performLLMExtract(
|
|||||||
// model: getModel("o3-mini", "openai"), // Keeping existing model selection
|
// model: getModel("o3-mini", "openai"), // Keeping existing model selection
|
||||||
// model: getModel("o3-mini", "openai"),
|
// model: getModel("o3-mini", "openai"),
|
||||||
// model: getModel("qwen-qwq-32b", "groq"),
|
// model: getModel("qwen-qwq-32b", "groq"),
|
||||||
|
// model: getModel("gemini-2.0-flash", "google"),
|
||||||
model: getModel("gemini-2.5-pro-exp-03-25", "google"),
|
model: getModel("gemini-2.5-pro-exp-03-25", "google"),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user