This commit is contained in:
rafaelmmiller 2025-04-04 19:11:03 -03:00
parent e2dfc94640
commit 9fdbd3e4d6
5 changed files with 156 additions and 174 deletions

View File

@ -12,7 +12,6 @@ content-type: application/json
"url":"https://firecrawl.dev" "url":"https://firecrawl.dev"
} }
### Crawl Website ### Crawl Website
# @name crawl # @name crawl
POST {{baseUrl}}/v1/crawl HTTP/1.1 POST {{baseUrl}}/v1/crawl HTTP/1.1
@ -73,22 +72,32 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json content-type: application/json
{ {
"urls": [ "urls":[
"https://firecrawl.dev/blog" "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/numbered-large-pagination/with-params"
], ],
"origin": "api-sdk", "origin":"api-sdk",
"prompt": "Extract all the blog titles from the page, is multity entity = true", "prompt":"Get all products in the page. Use the pagination buttons to navigate. It has 1200 products.",
"schema": { "schema":{
"type": "object", "type":"object",
"properties": { "properties":{
"blog_titles": { "products": {
"type": "array", "type":"array",
"items": { "items":{
"type": "string" "type":"object",
"properties":{
"name":{
"type":"string"
},
"description":{
"type":"string"
}
}
} }
} }
}, },
"required": ["blog_titles"] "required":[
"products"
]
} }
} }
@ -121,91 +130,3 @@ content-type: application/json
GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1 GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}} Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Scrape with JSON Schema Extraction
# @name scrapeWithSchema
POST {{baseUrl}}/v1/scrape HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json
{
"url": "https://firecrawl.dev",
"formats": ["json"],
"jsonOptions": {
"schema": {
"type": "object",
"properties": {
"description": {
"type": "string",
"description": "Describe the site"
},
"respect_robots_txt": {
"type": ["boolean","null"],
"description": "Does firecrawl respect the robots.txt files?"
}
},
"required": ["description", "respect_robots_txt"]
}
// "systemPrompt": "You are an expert web scraper." // Optional system prompt
}
}
### Scrape with JSON Schema Extraction
# @name scrapeWithSchema
POST {{baseUrl}}/v1/scrape HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json
{
"url": "https://firecrawl.dev",
"formats": ["json"],
"jsonOptions": {
"schema": {
"type": "object",
"properties": {
"description": {
"type": "string",
"description": "Describe the site"
}
},
"required": ["description" ]
}
// "systemPrompt": "You are an expert web scraper." // Optional system prompt
}
}
### Scrape to Extract Array of Titles
# @name scrapeItemsArray
POST {{baseUrl}}/v1/scrape HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json
{
"url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params",
"formats": ["json"],
"jsonOptions": {
"prompt": "Extract all the main article or blog post titles from the page into an array.",
"schema": {
"type": "object",
"properties": {
"items": {
"type": "array",
"description": "An array containing the extracted items.",
"items": {
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "The title of a single article or blog post."
}
},
"required": ["title"]
}
}
},
"required": ["items"]
}
// "systemPrompt": "You are an expert structured data extractor."
}
}

View File

@ -69,9 +69,12 @@ Key Instructions:
2. **Content Grounding:** Extract information *only* if it is explicitly present in the provided markdown. Do NOT infer or fabricate information. 2. **Content Grounding:** Extract information *only* if it is explicitly present in the provided markdown. Do NOT infer or fabricate information.
3. **Missing Information:** If a piece of information required by the schema cannot be found in the markdown, use \`null\` for that field's value. 3. **Missing Information:** If a piece of information required by the schema cannot be found in the markdown, use \`null\` for that field's value.
4. **SmartScrape Recommendation:** 4. **SmartScrape Recommendation:**
* Assess if the *full* required data seems unavailable in the current markdown likely because user interaction (like clicking or scrolling) is needed to reveal it. * Assess if the *full* required data seems unavailable in the current markdown likely because:
* If interaction seems necessary to get the complete data, set \`shouldUseSmartscrape\` to \`true\` in your response and provide a clear \`reasoning\` and \`prompt\` for the SmartScrape tool. - Content requires user interaction to reveal (e.g., clicking buttons, hovering, scrolling)
* Otherwise, set \`shouldUseSmartscrape\` to \`false\`. - Content uses pagination (e.g., "Load More" buttons, numbered pagination, infinite scroll)
- Content is dynamically loaded after user actions
* If the content requires user interaction or pagination to be fully accessible, set \`shouldUseSmartscrape\` to \`true\` in your response and provide a clear \`reasoning\` and \`prompt\` for the SmartScrape tool.
* If the content is simply JavaScript rendered but doesn't require interaction, set \`shouldUseSmartscrape\` to \`false\`.
5. **Output Format:** Your final output MUST be a single, valid JSON object conforming precisely to the schema. Do not include any explanatory text outside the JSON structure.`, 5. **Output Format:** Your final output MUST be a single, valid JSON object conforming precisely to the schema. Do not include any explanatory text outside the JSON structure.`,
), ),
prompt: z.string().max(10000).optional(), prompt: z.string().max(10000).optional(),

View File

@ -55,10 +55,18 @@ export async function batchExtractPromise(
model: getModel("gemini-2.0-flash", "google"), model: getModel("gemini-2.0-flash", "google"),
}; };
const { extractedDataArray, warning } = await extractData({ let extractedDataArray: any[] = [];
extractOptions: generationOptions, let warning: string | undefined;
urls: [doc.metadata.sourceURL || doc.metadata.url || ""], try {
}); const { extractedDataArray: e, warning: w } = await extractData({
extractOptions: generationOptions,
urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
});
extractedDataArray = e;
warning = w;
} catch (error) {
console.error(">>>>>>>error>>>>>\n", error);
}
await fs.writeFile( await fs.writeFile(
`logs/extractedDataArray-${crypto.randomUUID()}.json`, `logs/extractedDataArray-${crypto.randomUUID()}.json`,

View File

@ -27,9 +27,10 @@ const commonReasoningPromptProperties = {
}, },
smartscrape_prompt: { smartscrape_prompt: {
type: ["string", "null"], type: ["string", "null"],
// Using the more detailed multi-step description as the common one description: `A clear, outcome-focused prompt describing what information to find on the page.
description: Example: "Find the product specifications in the expandable section" rather than "Click the button to reveal product specs".
"Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X''). Dont mention anything about extraction, smartscrape just returns page content", Used by the smart scraping agent to determine what actions to take.
Dont mention anything about extraction, smartscrape just returns page content.`
}, },
}; };
@ -199,12 +200,12 @@ export async function extractData({
let extract, warning, totalUsage; let extract, warning, totalUsage;
try { try {
const { extract: x, warning: y, totalUsage: z } = await generateCompletions( const { extract: e, warning: w, totalUsage: t } = await generateCompletions(
{ ...extractOptionsNewSchema, model: getModel("gemini-2.5-pro-exp-03-25", "google") } { ...extractOptionsNewSchema, model: getModel("gemini-2.5-pro-exp-03-25", "google") }
); );
extract = x; extract = e;
warning = y; warning = w;
totalUsage = z; totalUsage = t;
} catch (error) { } catch (error) {
console.log("failed during extractSmartScrape.ts:generateCompletions", error); console.log("failed during extractSmartScrape.ts:generateCompletions", error);
} }
@ -222,52 +223,56 @@ export async function extractData({
console.log("shouldUseSmartscrape", extract?.shouldUseSmartscrape); console.log("shouldUseSmartscrape", extract?.shouldUseSmartscrape);
console.log("smartscrape_reasoning", extract?.smartscrape_reasoning); console.log("smartscrape_reasoning", extract?.smartscrape_reasoning);
console.log("smartscrape_prompt", extract?.smartscrape_prompt); console.log("smartscrape_prompt", extract?.smartscrape_prompt);
if (extract?.shouldUseSmartscrape) { try {
let smartscrapeResults; if (extract?.shouldUseSmartscrape) {
if (isSingleUrl) { let smartscrapeResults;
smartscrapeResults = [ if (isSingleUrl) {
await smartScrape(urls[0], extract?.smartscrape_prompt), smartscrapeResults = [
]; await smartScrape(urls[0], extract?.smartscrape_prompt),
} else { ];
const pages = extract?.smartscrapePages; } else {
//do it async promiseall instead const pages = extract?.smartscrapePages;
smartscrapeResults = await Promise.all( //do it async promiseall instead
pages.map(async (page) => { smartscrapeResults = await Promise.all(
return await smartScrape( pages.map(async (page) => {
urls[page.page_index], return await smartScrape(
page.smartscrape_prompt, urls[page.page_index],
); page.smartscrape_prompt,
);
}),
);
}
console.log("smartscrapeResults", smartscrapeResults);
const scrapedPages = smartscrapeResults.map(
(result) => result.scrapedPages,
);
console.log("scrapedPages", scrapedPages);
const htmls = scrapedPages.flat().map((page) => page.html);
console.log("htmls", htmls);
const markdowns = await Promise.all(
htmls.map(async (html) => await parseMarkdown(html)),
);
console.log("markdowns", markdowns);
extractedData = await Promise.all(
markdowns.map(async (markdown) => {
const newExtractOptions = {
...extractOptions,
markdown: markdown,
};
const { extract, warning, totalUsage, model } =
await generateCompletions(newExtractOptions);
return extract;
}), }),
); );
// console.log("markdowns", markdowns);
// extractedData = smartscrapeResult;
} else {
extractedData = [extractedData];
} }
console.log("smartscrapeResults", smartscrapeResults); } catch (error) {
console.error(">>>>>>>extractSmartScrape.ts error>>>>>\n", error);
const scrapedPages = smartscrapeResults.map(
(result) => result.scrapedPages,
);
console.log("scrapedPages", scrapedPages);
const htmls = scrapedPages.flat().map((page) => page.html);
console.log("htmls", htmls);
const markdowns = await Promise.all(
htmls.map(async (html) => await parseMarkdown(html)),
);
console.log("markdowns", markdowns);
extractedData = await Promise.all(
markdowns.map(async (markdown) => {
const newExtractOptions = {
...extractOptions,
markdown: markdown,
};
const { extract, warning, totalUsage, model } =
await generateCompletions(newExtractOptions);
return extract;
}),
);
// console.log("markdowns", markdowns);
// extractedData = smartscrapeResult;
} else {
extractedData = [extractedData];
} }
return { extractedDataArray: extractedData, warning: warning }; return { extractedDataArray: extractedData, warning: warning };

View File

@ -46,33 +46,53 @@ export async function smartScrape(
try { try {
logger.info("Initiating smart scrape request", { url, prompt }); logger.info("Initiating smart scrape request", { url, prompt });
// Pass schema type as generic parameter to robustFetch // Pass schema type as generic parameter to robustFetch
const response = await robustFetch<typeof smartScrapeResultSchema>({ const response = await robustFetch<typeof smartScrapeResultSchema>({
url: `${process.env.SMART_SCRAPE_API_URL}/smart-scrape`, url: `${process.env.SMART_SCRAPE_API_URL}/smart-scrape`,
method: "POST", method: "POST",
body: { body: {
url, url,
prompt, prompt,
thinkingModel: { models: {
model: "gemini-2.5-pro-exp-03-25", thinkingModel: {
provider: "google", model: "gemini-2.5-pro-exp-03-25",
supportTools: true, provider: "google",
toolChoice: "required", supportTools: true,
cost: { toolChoice: "required",
input: 1.3, cost: {
output: 5, input: 1.3,
output: 5,
},
}, },
}, toolModel: {
toolModel: { model: "gemini-2.0-flash",
model: "gemini-2.0-flash", provider: "google",
provider: "google", }
}, }
}, },
schema: smartScrapeResultSchema, // Pass the schema instance for validation schema: smartScrapeResultSchema, // Pass the schema instance for validation
logger, logger,
mock: null, // Keep mock null if not mocking mock: null, // Keep mock null if not mocking
}); });
// Check if the response indicates a 500 error
// Use type assertion to handle the error response structure
const errorResponse = response as unknown as {
success: boolean;
error?: string;
details?: string;
};
if (errorResponse && errorResponse.success === false && errorResponse.error) {
logger.error("Smart scrape returned error response", {
url,
prompt,
error: errorResponse.error,
details: errorResponse.details || "No details provided"
});
throw new Error(`Smart scrape failed: ${errorResponse.error}${errorResponse.details ? ` - ${errorResponse.details}` : ''}`);
}
logger.info("Smart scrape successful", { logger.info("Smart scrape successful", {
url, url,
prompt, prompt,
@ -80,9 +100,34 @@ export async function smartScrape(
}); });
return response; // The response type now matches SmartScrapeResult return response; // The response type now matches SmartScrapeResult
} catch (error) { } catch (error) {
logger.error("Smart scrape request failed", { url, prompt, error }); // Safely extract error information without circular references
const errorInfo = {
message: error instanceof Error ? error.message : String(error),
name: error instanceof Error ? error.name : 'Unknown',
stack: error instanceof Error ? error.stack : undefined,
// Extract cause safely if it exists
cause: error instanceof Error && error.cause
? (error.cause instanceof Error
? { message: error.cause.message, name: error.cause.name, stack: error.cause.stack }
: typeof error.cause === 'object'
? {
...Object.fromEntries(
Object.entries(error.cause)
.filter(([_, v]) => v !== null && typeof v !== 'object')
),
error: (error.cause as any)?.error?.message || (error.cause as any)?.error
}
: String(error.cause))
: undefined
};
logger.error("Smart scrape request failed", {
url,
prompt,
error: JSON.stringify(errorInfo)
});
// Rethrowing the error to be handled by the caller // Rethrowing the error to be handled by the caller
// Consider more specific error handling or wrapping if needed
throw new Error(`Failed to smart scrape URL: ${url}`, { cause: error }); throw new Error(`Failed to smart scrape URL: ${url}`, { cause: error });
} }
} }