commit 9fdbd3e4d6
parent e2dfc94640

    wip
@@ -12,7 +12,6 @@ content-type: application/json
   "url":"https://firecrawl.dev"
 }
 
-
 ### Crawl Website
 # @name crawl
 POST {{baseUrl}}/v1/crawl HTTP/1.1
@@ -73,22 +72,32 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json
 
 {
-  "urls": [
-    "https://firecrawl.dev/blog"
+  "urls":[
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/numbered-large-pagination/with-params"
   ],
-  "origin": "api-sdk",
-  "prompt": "Extract all the blog titles from the page, is multity entity = true",
-  "schema": {
-    "type": "object",
-    "properties": {
-      "blog_titles": {
-        "type": "array",
-        "items": {
-          "type": "string"
+  "origin":"api-sdk",
+  "prompt":"Get all products in the page. Use the pagination buttons to navigate. It has 1200 products.",
+  "schema":{
+    "type":"object",
+    "properties":{
+      "products": {
+        "type":"array",
+        "items":{
+          "type":"object",
+          "properties":{
+            "name":{
+              "type":"string"
+            },
+            "description":{
+              "type":"string"
+            }
+          }
         }
       }
     },
-    "required": ["blog_titles"]
+    "required":[
+      "products"
+    ]
   }
 }
 
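For orientation, the new request body above can be replayed outside the .http file. A minimal TypeScript sketch, assuming Node 18+ fetch, a local baseUrl, and TEST_API_KEY in the environment; the endpoint path is a placeholder, since the hunk header does not show which request in the file this body belongs to:

// Sketch only: post the updated request body from this hunk.
// endpointPath is hypothetical -- adjust it to the actual request.
const baseUrl = process.env.BASE_URL ?? "http://localhost:3002";
const endpointPath = "/v1/extract"; // assumption, not shown in the hunk

async function sendRequest(): Promise<unknown> {
  const res = await fetch(`${baseUrl}${endpointPath}`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "content-type": "application/json",
    },
    body: JSON.stringify({
      urls: [
        "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/numbered-large-pagination/with-params",
      ],
      origin: "api-sdk",
      prompt:
        "Get all products in the page. Use the pagination buttons to navigate. It has 1200 products.",
      schema: {
        type: "object",
        properties: {
          products: {
            type: "array",
            items: {
              type: "object",
              properties: {
                name: { type: "string" },
                description: { type: "string" },
              },
            },
          },
        },
        required: ["products"],
      },
    }),
  });
  return res.json(); // response shape not shown in this diff
}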
@@ -121,91 +130,3 @@ content-type: application/json
 GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 
-### Scrape with JSON Schema Extraction
-# @name scrapeWithSchema
-POST {{baseUrl}}/v1/scrape HTTP/1.1
-Authorization: Bearer {{$dotenv TEST_API_KEY}}
-content-type: application/json
-
-{
-  "url": "https://firecrawl.dev",
-  "formats": ["json"],
-  "jsonOptions": {
-    "schema": {
-      "type": "object",
-      "properties": {
-        "description": {
-          "type": "string",
-          "description": "Describe the site"
-        },
-        "respect_robots_txt": {
-          "type": ["boolean","null"],
-          "description": "Does firecrawl respect the robots.txt files?"
-        }
-      },
-      "required": ["description", "respect_robots_txt"]
-    }
-    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
-  }
-}
-
-
-### Scrape with JSON Schema Extraction
-# @name scrapeWithSchema
-POST {{baseUrl}}/v1/scrape HTTP/1.1
-Authorization: Bearer {{$dotenv TEST_API_KEY}}
-content-type: application/json
-
-{
-  "url": "https://firecrawl.dev",
-  "formats": ["json"],
-  "jsonOptions": {
-    "schema": {
-      "type": "object",
-      "properties": {
-        "description": {
-          "type": "string",
-          "description": "Describe the site"
-        }
-
-      },
-      "required": ["description" ]
-    }
-    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
-  }
-}
-
-### Scrape to Extract Array of Titles
-# @name scrapeItemsArray
-POST {{baseUrl}}/v1/scrape HTTP/1.1
-Authorization: Bearer {{$dotenv TEST_API_KEY}}
-content-type: application/json
-
-{
-  "url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params",
-  "formats": ["json"],
-  "jsonOptions": {
-    "prompt": "Extract all the main article or blog post titles from the page into an array.",
-    "schema": {
-      "type": "object",
-      "properties": {
-        "items": {
-          "type": "array",
-          "description": "An array containing the extracted items.",
-          "items": {
-            "type": "object",
-            "properties": {
-              "title": {
-                "type": "string",
-                "description": "The title of a single article or blog post."
-              }
-            },
-            "required": ["title"]
-          }
-        }
-      },
-      "required": ["items"]
-    }
-    // "systemPrompt": "You are an expert structured data extractor."
-  }
-}
@@ -69,9 +69,12 @@ Key Instructions:
 2. **Content Grounding:** Extract information *only* if it is explicitly present in the provided markdown. Do NOT infer or fabricate information.
 3. **Missing Information:** If a piece of information required by the schema cannot be found in the markdown, use \`null\` for that field's value.
 4. **SmartScrape Recommendation:**
-    *   Assess if the *full* required data seems unavailable in the current markdown likely because user interaction (like clicking or scrolling) is needed to reveal it.
-    *   If interaction seems necessary to get the complete data, set \`shouldUseSmartscrape\` to \`true\` in your response and provide a clear \`reasoning\` and \`prompt\` for the SmartScrape tool.
-    *   Otherwise, set \`shouldUseSmartscrape\` to \`false\`.
+    *   Assess if the *full* required data seems unavailable in the current markdown likely because:
+        -   Content requires user interaction to reveal (e.g., clicking buttons, hovering, scrolling)
+        -   Content uses pagination (e.g., "Load More" buttons, numbered pagination, infinite scroll)
+        -   Content is dynamically loaded after user actions
+    *   If the content requires user interaction or pagination to be fully accessible, set \`shouldUseSmartscrape\` to \`true\` in your response and provide a clear \`reasoning\` and \`prompt\` for the SmartScrape tool.
+    *   If the content is simply JavaScript rendered but doesn't require interaction, set \`shouldUseSmartscrape\` to \`false\`.
 5. **Output Format:** Your final output MUST be a single, valid JSON object conforming precisely to the schema. Do not include any explanatory text outside the JSON structure.`,
 ),
 prompt: z.string().max(10000).optional(),
@@ -55,10 +55,18 @@ export async function batchExtractPromise(
     model: getModel("gemini-2.0-flash", "google"),
   };
 
-  const { extractedDataArray, warning } = await extractData({
-    extractOptions: generationOptions,
-    urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
-  });
+  let extractedDataArray: any[] = [];
+  let warning: string | undefined;
+  try {
+    const { extractedDataArray: e, warning: w } = await extractData({
+      extractOptions: generationOptions,
+      urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
+    });
+    extractedDataArray = e;
+    warning = w;
+  } catch (error) {
+    console.error(">>>>>>>error>>>>>\n", error);
+  }
 
   await fs.writeFile(
     `logs/extractedDataArray-${crypto.randomUUID()}.json`,
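The net effect of this hunk is that a failing extractData call no longer rejects batchExtractPromise; the batch continues with an empty result set. A self-contained sketch of that swallow-and-continue pattern; the guarded helper is hypothetical, not from the codebase:

// Swallow-and-continue: downgrade a rejected promise to a fallback value
// so one bad document cannot fail the whole batch.
async function guarded<T>(op: () => Promise<T>, fallback: T): Promise<T> {
  try {
    return await op();
  } catch (error) {
    console.error("operation failed, continuing with fallback", error);
    return fallback;
  }
}

// Hypothetical usage mirroring the hunk:
// const { extractedDataArray, warning } = await guarded(
//   () => extractData({ extractOptions, urls }),
//   { extractedDataArray: [], warning: undefined },
// );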
@@ -27,9 +27,10 @@ const commonReasoningPromptProperties = {
   },
   smartscrape_prompt: {
     type: ["string", "null"],
-    // Using the more detailed multi-step description as the common one
-    description:
-      "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X''). Dont mention anything about extraction, smartscrape just returns page content",
+    description: `A clear, outcome-focused prompt describing what information to find on the page.
+      Example: "Find the product specifications in the expandable section" rather than "Click the button to reveal product specs".
+      Used by the smart scraping agent to determine what actions to take.
+      Dont mention anything about extraction, smartscrape just returns page content.`
   },
 };
 
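For illustration, hypothetical prompt values on either side of the convention the new description encodes:

// Hypothetical examples only; the convention (outcome-focused, no
// extraction talk) comes from the new description text above.
const goodPrompts = [
  "Find the product specifications in the expandable section",
  "Find all products across the numbered pagination",
];
const badPrompts = [
  "Click the button to reveal product specs", // action-focused
  "Extract the product names as JSON",        // mentions extraction
];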
@@ -199,12 +200,12 @@ export async function extractData({
   let extract, warning, totalUsage;
 
   try {
-    const { extract: x, warning: y, totalUsage: z } = await generateCompletions(
+    const { extract: e, warning: w, totalUsage: t } = await generateCompletions(
       { ...extractOptionsNewSchema, model: getModel("gemini-2.5-pro-exp-03-25", "google") }
     );
-    extract = x;
-    warning = y;
-    totalUsage = z;
+    extract = e;
+    warning = w;
+    totalUsage = t;
   } catch (error) {
     console.log("failed during extractSmartScrape.ts:generateCompletions", error);
   }
@@ -222,52 +223,56 @@ export async function extractData({
   console.log("shouldUseSmartscrape", extract?.shouldUseSmartscrape);
   console.log("smartscrape_reasoning", extract?.smartscrape_reasoning);
   console.log("smartscrape_prompt", extract?.smartscrape_prompt);
-  if (extract?.shouldUseSmartscrape) {
-    let smartscrapeResults;
-    if (isSingleUrl) {
-      smartscrapeResults = [
-        await smartScrape(urls[0], extract?.smartscrape_prompt),
-      ];
-    } else {
-      const pages = extract?.smartscrapePages;
-      //do it async promiseall instead
-      smartscrapeResults = await Promise.all(
-        pages.map(async (page) => {
-          return await smartScrape(
-            urls[page.page_index],
-            page.smartscrape_prompt,
-          );
-        }),
-      );
-    }
-    console.log("smartscrapeResults", smartscrapeResults);
-
-    const scrapedPages = smartscrapeResults.map(
-      (result) => result.scrapedPages,
-    );
-    console.log("scrapedPages", scrapedPages);
-    const htmls = scrapedPages.flat().map((page) => page.html);
-    console.log("htmls", htmls);
-    const markdowns = await Promise.all(
-      htmls.map(async (html) => await parseMarkdown(html)),
-    );
-    console.log("markdowns", markdowns);
-    extractedData = await Promise.all(
-      markdowns.map(async (markdown) => {
-        const newExtractOptions = {
-          ...extractOptions,
-          markdown: markdown,
-        };
-        const { extract, warning, totalUsage, model } =
-          await generateCompletions(newExtractOptions);
-        return extract;
-      }),
-    );
-
-    // console.log("markdowns", markdowns);
-    // extractedData = smartscrapeResult;
-  } else {
-    extractedData = [extractedData];
+  try {
+    if (extract?.shouldUseSmartscrape) {
+      let smartscrapeResults;
+      if (isSingleUrl) {
+        smartscrapeResults = [
+          await smartScrape(urls[0], extract?.smartscrape_prompt),
+        ];
+      } else {
+        const pages = extract?.smartscrapePages;
+        //do it async promiseall instead
+        smartscrapeResults = await Promise.all(
+          pages.map(async (page) => {
+            return await smartScrape(
+              urls[page.page_index],
+              page.smartscrape_prompt,
+            );
+          }),
+        );
+      }
+      console.log("smartscrapeResults", smartscrapeResults);
+
+      const scrapedPages = smartscrapeResults.map(
+        (result) => result.scrapedPages,
+      );
+      console.log("scrapedPages", scrapedPages);
+      const htmls = scrapedPages.flat().map((page) => page.html);
+      console.log("htmls", htmls);
+      const markdowns = await Promise.all(
+        htmls.map(async (html) => await parseMarkdown(html)),
+      );
+      console.log("markdowns", markdowns);
+      extractedData = await Promise.all(
+        markdowns.map(async (markdown) => {
+          const newExtractOptions = {
+            ...extractOptions,
+            markdown: markdown,
+          };
+          const { extract, warning, totalUsage, model } =
+            await generateCompletions(newExtractOptions);
+          return extract;
+        }),
+      );
+
+      // console.log("markdowns", markdowns);
+      // extractedData = smartscrapeResult;
+    } else {
+      extractedData = [extractedData];
+    }
+  } catch (error) {
+    console.error(">>>>>>>extractSmartScrape.ts error>>>>>\n", error);
   }
 
   return { extractedDataArray: extractedData, warning: warning };
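Stripped of the logging, the happy path inside the new try block is a four-stage pipeline. A condensed sketch, using smartScrape, parseMarkdown, and generateCompletions as they appear in the hunk; the wrapper function and loosened any types are illustrative only:

// Condensed view of the SmartScrape happy path wrapped by the new try/catch:
// scrape -> collect pages -> HTML to markdown -> one completion per page.
async function runSmartScrapePipeline(
  urls: string[],
  pagePrompts: string[],
  extractOptions: any,
): Promise<any[]> {
  const results = await Promise.all(
    urls.map((url, i) => smartScrape(url, pagePrompts[i])),
  );
  const htmls = results
    .flatMap((r) => r.scrapedPages)
    .map((page) => page.html);
  const markdowns = await Promise.all(htmls.map((h) => parseMarkdown(h)));
  return Promise.all(
    markdowns.map((markdown) =>
      generateCompletions({ ...extractOptions, markdown }).then(
        (r) => r.extract,
      ),
    ),
  );
}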
@@ -46,33 +46,53 @@ export async function smartScrape(
   try {
     logger.info("Initiating smart scrape request", { url, prompt });
 
-    // Pass schema type as generic parameter to robustFetch
+    // Pass schema type as generic parameter to robustFeth
     const response = await robustFetch<typeof smartScrapeResultSchema>({
       url: `${process.env.SMART_SCRAPE_API_URL}/smart-scrape`,
       method: "POST",
       body: {
         url,
         prompt,
-        thinkingModel: {
-          model: "gemini-2.5-pro-exp-03-25",
-          provider: "google",
-          supportTools: true,
-          toolChoice: "required",
-          cost: {
-            input: 1.3,
-            output: 5,
-          },
-        },
-        toolModel: {
-          model: "gemini-2.0-flash",
-          provider: "google",
-        },
+        models: {
+          thinkingModel: {
+            model: "gemini-2.5-pro-exp-03-25",
+            provider: "google",
+            supportTools: true,
+            toolChoice: "required",
+            cost: {
+              input: 1.3,
+              output: 5,
+            },
+          },
+          toolModel: {
+            model: "gemini-2.0-flash",
+            provider: "google",
+          }
+        }
       },
       schema: smartScrapeResultSchema, // Pass the schema instance for validation
       logger,
       mock: null, // Keep mock null if not mocking
     });
 
+    // Check if the response indicates a 500 error
+    // Use type assertion to handle the error response structure
+    const errorResponse = response as unknown as {
+      success: boolean;
+      error?: string;
+      details?: string;
+    };
+
+    if (errorResponse && errorResponse.success === false && errorResponse.error) {
+      logger.error("Smart scrape returned error response", {
+        url,
+        prompt,
+        error: errorResponse.error,
+        details: errorResponse.details || "No details provided"
+      });
+      throw new Error(`Smart scrape failed: ${errorResponse.error}${errorResponse.details ? ` - ${errorResponse.details}` : ''}`);
+    }
+
     logger.info("Smart scrape successful", {
       url,
       prompt,
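The restructuring above nests both model configurations under a single models key in the request body. Sketching the resulting shape as TypeScript types; the field names and nesting come from the hunk, while the type names themselves are invented:

// Illustrative types for the new request body shape.
interface ModelConfig {
  model: string;
  provider: string;
  supportTools?: boolean;
  toolChoice?: string;
  cost?: { input: number; output: number };
}

interface SmartScrapeRequestBody {
  url: string;
  prompt: string;
  models: {
    thinkingModel: ModelConfig; // was a top-level body key before this commit
    toolModel: ModelConfig;     // was a top-level body key before this commit
  };
}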
@@ -80,9 +100,34 @@ export async function smartScrape(
     });
     return response; // The response type now matches SmartScrapeResult
   } catch (error) {
-    logger.error("Smart scrape request failed", { url, prompt, error });
+    // Safely extract error information without circular references
+    const errorInfo = {
+      message: error instanceof Error ? error.message : String(error),
+      name: error instanceof Error ? error.name : 'Unknown',
+      stack: error instanceof Error ? error.stack : undefined,
+      // Extract cause safely if it exists
+      cause: error instanceof Error && error.cause
+        ? (error.cause instanceof Error
+          ? { message: error.cause.message, name: error.cause.name, stack: error.cause.stack }
+          : typeof error.cause === 'object'
+            ? {
+                ...Object.fromEntries(
+                  Object.entries(error.cause)
+                    .filter(([_, v]) => v !== null && typeof v !== 'object')
+                ),
+                error: (error.cause as any)?.error?.message || (error.cause as any)?.error
+              }
+            : String(error.cause))
+        : undefined
+    };
+
+    logger.error("Smart scrape request failed", {
+      url,
+      prompt,
+      error: JSON.stringify(errorInfo)
+    });
+
     // Rethrowing the error to be handled by the caller
-    // Consider more specific error handling or wrapping if needed
     throw new Error(`Failed to smart scrape URL: ${url}`, { cause: error });
   }
 }
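The rewritten catch block projects the error onto plain, primitive-valued fields before stringifying. A standalone demonstration of the two failure modes this avoids: Error instances serializing to "{}", and circular structures making JSON.stringify throw (demo code, not from the codebase):

// 1) Error fields (message, name, stack) are non-enumerable, so a raw
//    Error serializes to "{}" and the log entry carries no information.
const err = new Error("connect ECONNREFUSED");
console.log(JSON.stringify(err)); // "{}"

// 2) Context objects attached to an error (request/response state, etc.)
//    can be cyclic, and a cycle makes JSON.stringify throw.
const ctx: Record<string, unknown> = { url: "https://example.com" };
ctx.self = ctx;
try {
  JSON.stringify(ctx);
} catch (e) {
  console.log((e as Error).message); // "Converting circular structure to JSON"
}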
|