diff --git a/apps/api/requests.http b/apps/api/requests.http index 26fa9d07..0b56b118 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -12,7 +12,6 @@ content-type: application/json "url":"https://firecrawl.dev" } - ### Crawl Website # @name crawl POST {{baseUrl}}/v1/crawl HTTP/1.1 @@ -73,22 +72,32 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { - "urls": [ - "https://firecrawl.dev/blog" + "urls":[ + "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/numbered-large-pagination/with-params" ], - "origin": "api-sdk", - "prompt": "Extract all the blog titles from the page, is multity entity = true", - "schema": { - "type": "object", - "properties": { - "blog_titles": { - "type": "array", - "items": { - "type": "string" + "origin":"api-sdk", + "prompt":"Get all products in the page. Use the pagination buttons to navigate. It has 1200 products.", + "schema":{ + "type":"object", + "properties":{ + "products": { + "type":"array", + "items":{ + "type":"object", + "properties":{ + "name":{ + "type":"string" + }, + "description":{ + "type":"string" + } + } } } }, - "required": ["blog_titles"] + "required":[ + "products" + ] } } @@ -121,91 +130,3 @@ content-type: application/json GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} -### Scrape with JSON Schema Extraction -# @name scrapeWithSchema -POST {{baseUrl}}/v1/scrape HTTP/1.1 -Authorization: Bearer {{$dotenv TEST_API_KEY}} -content-type: application/json - -{ - "url": "https://firecrawl.dev", - "formats": ["json"], - "jsonOptions": { - "schema": { - "type": "object", - "properties": { - "description": { - "type": "string", - "description": "Describe the site" - }, - "respect_robots_txt": { - "type": ["boolean","null"], - "description": "Does firecrawl respect the robots.txt files?" 
- } - }, - "required": ["description", "respect_robots_txt"] - } - // "systemPrompt": "You are an expert web scraper." // Optional system prompt - } -} - - -### Scrape with JSON Schema Extraction -# @name scrapeWithSchema -POST {{baseUrl}}/v1/scrape HTTP/1.1 -Authorization: Bearer {{$dotenv TEST_API_KEY}} -content-type: application/json - -{ - "url": "https://firecrawl.dev", - "formats": ["json"], - "jsonOptions": { - "schema": { - "type": "object", - "properties": { - "description": { - "type": "string", - "description": "Describe the site" - } - - }, - "required": ["description" ] - } - // "systemPrompt": "You are an expert web scraper." // Optional system prompt - } -} - -### Scrape to Extract Array of Titles -# @name scrapeItemsArray -POST {{baseUrl}}/v1/scrape HTTP/1.1 -Authorization: Bearer {{$dotenv TEST_API_KEY}} -content-type: application/json - -{ - "url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params", - "formats": ["json"], - "jsonOptions": { - "prompt": "Extract all the main article or blog post titles from the page into an array.", - "schema": { - "type": "object", - "properties": { - "items": { - "type": "array", - "description": "An array containing the extracted items.", - "items": { - "type": "object", - "properties": { - "title": { - "type": "string", - "description": "The title of a single article or blog post." - } - }, - "required": ["title"] - } - } - }, - "required": ["items"] - } - // "systemPrompt": "You are an expert structured data extractor." - } -} \ No newline at end of file diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 758c7cc0..ff3bc716 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -69,9 +69,12 @@ Key Instructions: 2. **Content Grounding:** Extract information *only* if it is explicitly present in the provided markdown. Do NOT infer or fabricate information. 3. 
**Missing Information:** If a piece of information required by the schema cannot be found in the markdown, use \`null\` for that field's value. 4. **SmartScrape Recommendation:** - * Assess if the *full* required data seems unavailable in the current markdown likely because user interaction (like clicking or scrolling) is needed to reveal it. - * If interaction seems necessary to get the complete data, set \`shouldUseSmartscrape\` to \`true\` in your response and provide a clear \`reasoning\` and \`prompt\` for the SmartScrape tool. - * Otherwise, set \`shouldUseSmartscrape\` to \`false\`. + * Assess if the *full* required data seems unavailable in the current markdown likely because: + - Content requires user interaction to reveal (e.g., clicking buttons, hovering, scrolling) + - Content uses pagination (e.g., "Load More" buttons, numbered pagination, infinite scroll) + - Content is dynamically loaded after user actions + * If the content requires user interaction or pagination to be fully accessible, set \`shouldUseSmartscrape\` to \`true\` in your response and provide a clear \`reasoning\` and \`prompt\` for the SmartScrape tool. + * If the content is simply JavaScript rendered but doesn't require interaction, set \`shouldUseSmartscrape\` to \`false\`. 5. **Output Format:** Your final output MUST be a single, valid JSON object conforming precisely to the schema. 
Do not include any explanatory text outside the JSON structure.`, ), prompt: z.string().max(10000).optional(), diff --git a/apps/api/src/lib/extract/completions/batchExtract.ts b/apps/api/src/lib/extract/completions/batchExtract.ts index 32b11420..b13b1213 100644 --- a/apps/api/src/lib/extract/completions/batchExtract.ts +++ b/apps/api/src/lib/extract/completions/batchExtract.ts @@ -55,10 +55,18 @@ export async function batchExtractPromise( model: getModel("gemini-2.0-flash", "google"), }; - const { extractedDataArray, warning } = await extractData({ - extractOptions: generationOptions, - urls: [doc.metadata.sourceURL || doc.metadata.url || ""], - }); + let extractedDataArray: any[] = []; + let warning: string | undefined; + try { + const { extractedDataArray: e, warning: w } = await extractData({ + extractOptions: generationOptions, + urls: [doc.metadata.sourceURL || doc.metadata.url || ""], + }); + extractedDataArray = e; + warning = w; + } catch (error) { + console.error(">>>>>>>error>>>>>\n", error); + } await fs.writeFile( `logs/extractedDataArray-${crypto.randomUUID()}.json`, diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index c1c9df34..1ca50d83 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -27,9 +27,10 @@ const commonReasoningPromptProperties = { }, smartscrape_prompt: { type: ["string", "null"], - // Using the more detailed multi-step description as the common one - description: - "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X''). Dont mention anything about extraction, smartscrape just returns page content", + description: `A clear, outcome-focused prompt describing what information to find on the page. + Example: "Find the product specifications in the expandable section" rather than "Click the button to reveal product specs". 
+ Used by the smart scraping agent to determine what actions to take. + Dont mention anything about extraction, smartscrape just returns page content.` }, }; @@ -199,12 +200,12 @@ export async function extractData({ let extract, warning, totalUsage; try { - const { extract: x, warning: y, totalUsage: z } = await generateCompletions( + const { extract: e, warning: w, totalUsage: t } = await generateCompletions( { ...extractOptionsNewSchema, model: getModel("gemini-2.5-pro-exp-03-25", "google") } ); - extract = x; - warning = y; - totalUsage = z; + extract = e; + warning = w; + totalUsage = t; } catch (error) { console.log("failed during extractSmartScrape.ts:generateCompletions", error); } @@ -222,52 +223,56 @@ export async function extractData({ console.log("shouldUseSmartscrape", extract?.shouldUseSmartscrape); console.log("smartscrape_reasoning", extract?.smartscrape_reasoning); console.log("smartscrape_prompt", extract?.smartscrape_prompt); - if (extract?.shouldUseSmartscrape) { - let smartscrapeResults; - if (isSingleUrl) { - smartscrapeResults = [ - await smartScrape(urls[0], extract?.smartscrape_prompt), - ]; - } else { - const pages = extract?.smartscrapePages; - //do it async promiseall instead - smartscrapeResults = await Promise.all( - pages.map(async (page) => { - return await smartScrape( - urls[page.page_index], - page.smartscrape_prompt, - ); + try { + if (extract?.shouldUseSmartscrape) { + let smartscrapeResults; + if (isSingleUrl) { + smartscrapeResults = [ + await smartScrape(urls[0], extract?.smartscrape_prompt), + ]; + } else { + const pages = extract?.smartscrapePages; + //do it async promiseall instead + smartscrapeResults = await Promise.all( + pages.map(async (page) => { + return await smartScrape( + urls[page.page_index], + page.smartscrape_prompt, + ); + }), + ); + } + console.log("smartscrapeResults", smartscrapeResults); + + const scrapedPages = smartscrapeResults.map( + (result) => result.scrapedPages, + ); + console.log("scrapedPages", 
scrapedPages); + const htmls = scrapedPages.flat().map((page) => page.html); + console.log("htmls", htmls); + const markdowns = await Promise.all( + htmls.map(async (html) => await parseMarkdown(html)), + ); + console.log("markdowns", markdowns); + extractedData = await Promise.all( + markdowns.map(async (markdown) => { + const newExtractOptions = { + ...extractOptions, + markdown: markdown, + }; + const { extract, warning, totalUsage, model } = + await generateCompletions(newExtractOptions); + return extract; }), ); + + // console.log("markdowns", markdowns); + // extractedData = smartscrapeResult; + } else { + extractedData = [extractedData]; } - console.log("smartscrapeResults", smartscrapeResults); - - const scrapedPages = smartscrapeResults.map( - (result) => result.scrapedPages, - ); - console.log("scrapedPages", scrapedPages); - const htmls = scrapedPages.flat().map((page) => page.html); - console.log("htmls", htmls); - const markdowns = await Promise.all( - htmls.map(async (html) => await parseMarkdown(html)), - ); - console.log("markdowns", markdowns); - extractedData = await Promise.all( - markdowns.map(async (markdown) => { - const newExtractOptions = { - ...extractOptions, - markdown: markdown, - }; - const { extract, warning, totalUsage, model } = - await generateCompletions(newExtractOptions); - return extract; - }), - ); - - // console.log("markdowns", markdowns); - // extractedData = smartscrapeResult; - } else { - extractedData = [extractedData]; + } catch (error) { + console.error(">>>>>>>extractSmartScrape.ts error>>>>>\n", error); } return { extractedDataArray: extractedData, warning: warning }; diff --git a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts index 777d7082..b6c493d8 100644 --- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts @@ -46,33 +46,53 @@ export async function smartScrape( try { logger.info("Initiating smart scrape 
request", { url, prompt }); - // Pass schema type as generic parameter to robustFetch + // Pass schema type as generic parameter to robustFetch const response = await robustFetch({ url: `${process.env.SMART_SCRAPE_API_URL}/smart-scrape`, method: "POST", body: { url, prompt, - thinkingModel: { - model: "gemini-2.5-pro-exp-03-25", - provider: "google", - supportTools: true, - toolChoice: "required", - cost: { - input: 1.3, - output: 5, + models: { + thinkingModel: { + model: "gemini-2.5-pro-exp-03-25", + provider: "google", + supportTools: true, + toolChoice: "required", + cost: { + input: 1.3, + output: 5, + }, }, - }, - toolModel: { - model: "gemini-2.0-flash", - provider: "google", - }, + toolModel: { + model: "gemini-2.0-flash", + provider: "google", + } + } }, schema: smartScrapeResultSchema, // Pass the schema instance for validation logger, mock: null, // Keep mock null if not mocking }); + // Check if the response indicates a 500 error + // Use type assertion to handle the error response structure + const errorResponse = response as unknown as { + success: boolean; + error?: string; + details?: string; + }; + + if (errorResponse && errorResponse.success === false && errorResponse.error) { + logger.error("Smart scrape returned error response", { + url, + prompt, + error: errorResponse.error, + details: errorResponse.details || "No details provided" + }); + throw new Error(`Smart scrape failed: ${errorResponse.error}${errorResponse.details ? ` - ${errorResponse.details}` : ''}`); + } + logger.info("Smart scrape successful", { url, prompt, @@ -80,9 +100,34 @@ export async function smartScrape( }); return response; // The response type now matches SmartScrapeResult } catch (error) { - logger.error("Smart scrape request failed", { url, prompt, error }); + // Safely extract error information without circular references + const errorInfo = { + message: error instanceof Error ? error.message : String(error), + name: error instanceof Error ? 
error.name : 'Unknown', + stack: error instanceof Error ? error.stack : undefined, + // Extract cause safely if it exists + cause: error instanceof Error && error.cause + ? (error.cause instanceof Error + ? { message: error.cause.message, name: error.cause.name, stack: error.cause.stack } + : typeof error.cause === 'object' + ? { + ...Object.fromEntries( + Object.entries(error.cause) + .filter(([_, v]) => v !== null && typeof v !== 'object') + ), + error: (error.cause as any)?.error?.message || (error.cause as any)?.error + } + : String(error.cause)) + : undefined + }; + + logger.error("Smart scrape request failed", { + url, + prompt, + error: JSON.stringify(errorInfo) + }); + // Rethrowing the error to be handled by the caller - // Consider more specific error handling or wrapping if needed throw new Error(`Failed to smart scrape URL: ${url}`, { cause: error }); } }