wip

2025-08-15 17:45:54 +08:00 · 2025-04-04 19:11:03 -03:00 · 2025-04-04 19:11:03 -03:00 · 9fdbd3e4d6
commit 9fdbd3e4d6
parent e2dfc94640
5 changed files with 156 additions and 174 deletions
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@ -12,7 +12,6 @@ content-type: application/json
  "url":"https://firecrawl.dev"
 }
 ### Crawl Website
 # @name crawl
 POST {{baseUrl}}/v1/crawl HTTP/1.1
@ -73,22 +72,32 @@ Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json
 {
-  "urls": [
+  "urls":[
-    "https://firecrawl.dev/blog"
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/numbered-large-pagination/with-params"
  ],
-  "origin": "api-sdk",
+  "origin":"api-sdk",
-  "prompt": "Extract all the blog titles from the page, is multity entity = true",
+  "prompt":"Get all products in the page. Use the pagination buttons to navigate. It has 1200 products.",
-  "schema": {
+  "schema":{
-    "type": "object",
+    "type":"object",
-    "properties": {
+    "properties":{
-      "blog_titles": {
+      "products": {
-        "type": "array",
+        "type":"array",
-        "items": {
+        "items":{
-          "type": "string"
+          "type":"object",
          "properties":{
            "name":{
              "type":"string"
            },
            "description":{
              "type":"string"
            }
          }
        }
      }
    },
-    "required": ["blog_titles"]
+    "required":[
      "products"
    ]
  }
 }
@ -121,91 +130,3 @@ content-type: application/json
 GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 ### Scrape with JSON Schema Extraction
 # @name scrapeWithSchema
 POST {{baseUrl}}/v1/scrape HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json
 {
  "url": "https://firecrawl.dev",
  "formats": ["json"],
  "jsonOptions": {
    "schema": {
      "type": "object",
      "properties": {
        "description": { 
          "type": "string",
          "description": "Describe the site"
        },
        "respect_robots_txt": {
          "type": ["boolean","null"],
          "description": "Does firecrawl respect the robots.txt files?"
        }
      },
      "required": ["description", "respect_robots_txt"]
    }
    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
  }
 }
 ### Scrape with JSON Schema Extraction
 # @name scrapeWithSchema
 POST {{baseUrl}}/v1/scrape HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json
 {
  "url": "https://firecrawl.dev",
  "formats": ["json"],
  "jsonOptions": {
    "schema": {
      "type": "object",
      "properties": {
        "description": { 
          "type": "string",
          "description": "Describe the site"
        }
      },
      "required": ["description" ]
    }
    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
  }
 }
 ### Scrape to Extract Array of Titles
 # @name scrapeItemsArray
 POST {{baseUrl}}/v1/scrape HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 content-type: application/json
 {
  "url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params",
    "formats": ["json"],
  "jsonOptions": {
    "prompt": "Extract all the main article or blog post titles from the page into an array.",
    "schema": {
      "type": "object",
      "properties": {
        "items": {
          "type": "array",
          "description": "An array containing the extracted items.",
          "items": { 
            "type": "object",
            "properties": {
              "title": {
                "type": "string",
                "description": "The title of a single article or blog post."
              }
            },
            "required": ["title"] 
          }
        }
      },
      "required": ["items"] 
    }
    // "systemPrompt": "You are an expert structured data extractor." 
  }
 }
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@ -69,9 +69,12 @@ Key Instructions:
 2.  **Content Grounding:** Extract information *only* if it is explicitly present in the provided markdown. Do NOT infer or fabricate information.
 3.  **Missing Information:** If a piece of information required by the schema cannot be found in the markdown, use \`null\` for that field's value.
 4.  **SmartScrape Recommendation:**
-    *   Assess if the *full* required data seems unavailable in the current markdown likely because user interaction (like clicking or scrolling) is needed to reveal it.
+    *   Assess if the *full* required data seems unavailable in the current markdown likely because:
-    *   If interaction seems necessary to get the complete data, set \`shouldUseSmartscrape\` to \`true\` in your response and provide a clear \`reasoning\` and \`prompt\` for the SmartScrape tool.
+        - Content requires user interaction to reveal (e.g., clicking buttons, hovering, scrolling)
-    *   Otherwise, set \`shouldUseSmartscrape\` to \`false\`.
+        - Content uses pagination (e.g., "Load More" buttons, numbered pagination, infinite scroll)
        - Content is dynamically loaded after user actions
    *   If the content requires user interaction or pagination to be fully accessible, set \`shouldUseSmartscrape\` to \`true\` in your response and provide a clear \`reasoning\` and \`prompt\` for the SmartScrape tool.
    *   If the content is simply JavaScript rendered but doesn't require interaction, set \`shouldUseSmartscrape\` to \`false\`.
 5.  **Output Format:** Your final output MUST be a single, valid JSON object conforming precisely to the schema. Do not include any explanatory text outside the JSON structure.`,
      ),
    prompt: z.string().max(10000).optional(),
--- a/apps/api/src/lib/extract/completions/batchExtract.ts
+++ b/apps/api/src/lib/extract/completions/batchExtract.ts
@ -55,10 +55,18 @@ export async function batchExtractPromise(
    model: getModel("gemini-2.0-flash", "google"),
  };
-  const { extractedDataArray, warning } = await extractData({
+  let extractedDataArray: any[] = [];
-    extractOptions: generationOptions,
+  let warning: string | undefined;
-    urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
+  try {
-  });
+    const { extractedDataArray: e, warning: w } = await extractData({
      extractOptions: generationOptions,
      urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
    });
    extractedDataArray = e;
    warning = w;
  } catch (error) {
    console.error(">>>>>>>error>>>>>\n", error);
  }
  await fs.writeFile(
    `logs/extractedDataArray-${crypto.randomUUID()}.json`,
--- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
@ -27,9 +27,10 @@ const commonReasoningPromptProperties = {
  },
  smartscrape_prompt: {
    type: ["string", "null"],
-    // Using the more detailed multi-step description as the common one
+    description: `A clear, outcome-focused prompt describing what information to find on the page. 
-    description:
+      Example: "Find the product specifications in the expandable section" rather than "Click the button to reveal product specs".
-      "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X''). Dont mention anything about extraction, smartscrape just returns page content",
+      Used by the smart scraping agent to determine what actions to take.
      Dont mention anything about extraction, smartscrape just returns page content.`
  },
 };
@ -199,12 +200,12 @@ export async function extractData({
  let extract, warning, totalUsage;
  try {
-    const { extract: x, warning: y, totalUsage: z } = await generateCompletions(
+    const { extract: e, warning: w, totalUsage: t } = await generateCompletions(
      { ...extractOptionsNewSchema, model: getModel("gemini-2.5-pro-exp-03-25", "google") }
    );
-    extract = x;
+    extract = e;
-    warning = y;
+    warning = w;
-    totalUsage = z;
+    totalUsage = t;
  } catch (error) {
    console.log("failed during extractSmartScrape.ts:generateCompletions", error);
  }
@ -222,52 +223,56 @@ export async function extractData({
  console.log("shouldUseSmartscrape", extract?.shouldUseSmartscrape);
  console.log("smartscrape_reasoning", extract?.smartscrape_reasoning);
  console.log("smartscrape_prompt", extract?.smartscrape_prompt);
-  if (extract?.shouldUseSmartscrape) {
+  try {
-    let smartscrapeResults;
+    if (extract?.shouldUseSmartscrape) {
-    if (isSingleUrl) {
+      let smartscrapeResults;
-      smartscrapeResults = [
+      if (isSingleUrl) {
-        await smartScrape(urls[0], extract?.smartscrape_prompt),
+        smartscrapeResults = [
-      ];
+          await smartScrape(urls[0], extract?.smartscrape_prompt),
-    } else {
+        ];
-      const pages = extract?.smartscrapePages;
+      } else {
-      //do it async promiseall instead
+        const pages = extract?.smartscrapePages;
-      smartscrapeResults = await Promise.all(
+        //do it async promiseall instead
-        pages.map(async (page) => {
+        smartscrapeResults = await Promise.all(
-          return await smartScrape(
+          pages.map(async (page) => {
-            urls[page.page_index],
+            return await smartScrape(
-            page.smartscrape_prompt,
+              urls[page.page_index],
-          );
+              page.smartscrape_prompt,
            );
          }),
        );
      }
      console.log("smartscrapeResults", smartscrapeResults);
      const scrapedPages = smartscrapeResults.map(
        (result) => result.scrapedPages,
      );
      console.log("scrapedPages", scrapedPages);
      const htmls = scrapedPages.flat().map((page) => page.html);
      console.log("htmls", htmls);
      const markdowns = await Promise.all(
        htmls.map(async (html) => await parseMarkdown(html)),
      );
      console.log("markdowns", markdowns);
      extractedData = await Promise.all(
        markdowns.map(async (markdown) => {
          const newExtractOptions = {
            ...extractOptions,
            markdown: markdown,
          };
          const { extract, warning, totalUsage, model } =
            await generateCompletions(newExtractOptions);
          return extract;
        }),
      );
      // console.log("markdowns", markdowns);
      // extractedData = smartscrapeResult;
    } else {
      extractedData = [extractedData];
    }
-    console.log("smartscrapeResults", smartscrapeResults);
+  } catch (error) {
-
+    console.error(">>>>>>>extractSmartScrape.ts error>>>>>\n", error);
    const scrapedPages = smartscrapeResults.map(
      (result) => result.scrapedPages,
    );
    console.log("scrapedPages", scrapedPages);
    const htmls = scrapedPages.flat().map((page) => page.html);
    console.log("htmls", htmls);
    const markdowns = await Promise.all(
      htmls.map(async (html) => await parseMarkdown(html)),
    );
    console.log("markdowns", markdowns);
    extractedData = await Promise.all(
      markdowns.map(async (markdown) => {
        const newExtractOptions = {
          ...extractOptions,
          markdown: markdown,
        };
        const { extract, warning, totalUsage, model } =
          await generateCompletions(newExtractOptions);
        return extract;
      }),
    );
    // console.log("markdowns", markdowns);
    // extractedData = smartscrapeResult;
  } else {
    extractedData = [extractedData];
  }
  return { extractedDataArray: extractedData, warning: warning };
--- a/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/smartScrape.ts
@ -46,33 +46,53 @@ export async function smartScrape(
  try {
    logger.info("Initiating smart scrape request", { url, prompt });
-    // Pass schema type as generic parameter to robustFetch
+    // Pass schema type as generic parameter to robustFetch
    const response = await robustFetch<typeof smartScrapeResultSchema>({
      url: `${process.env.SMART_SCRAPE_API_URL}/smart-scrape`,
      method: "POST",
      body: {
        url,
        prompt,
-        thinkingModel: {
+        models: {
-          model: "gemini-2.5-pro-exp-03-25",
+          thinkingModel: {
-          provider: "google",
+            model: "gemini-2.5-pro-exp-03-25",
-          supportTools: true,
+            provider: "google",
-          toolChoice: "required",
+            supportTools: true,
-          cost: {
+            toolChoice: "required",
-            input: 1.3,
+            cost: {
-            output: 5,
+              input: 1.3,
              output: 5,
            },
          },
-        },
+          toolModel: {
-        toolModel: {
+            model: "gemini-2.0-flash",
-          model: "gemini-2.0-flash",
+            provider: "google",
-          provider: "google",
+          }
-        },
+        }
      },
      schema: smartScrapeResultSchema, // Pass the schema instance for validation
      logger,
      mock: null, // Keep mock null if not mocking
    });
    // Check if the response indicates a 500 error
    // Use type assertion to handle the error response structure
    const errorResponse = response as unknown as { 
      success: boolean; 
      error?: string; 
      details?: string;
    };
    if (errorResponse && errorResponse.success === false && errorResponse.error) {
      logger.error("Smart scrape returned error response", {
        url,
        prompt,
        error: errorResponse.error,
        details: errorResponse.details || "No details provided"
      });
      throw new Error(`Smart scrape failed: ${errorResponse.error}${errorResponse.details ? ` - ${errorResponse.details}` : ''}`);
    }
    logger.info("Smart scrape successful", {
      url,
      prompt,
@ -80,9 +100,34 @@ export async function smartScrape(
    });
    return response; // The response type now matches SmartScrapeResult
  } catch (error) {
-    logger.error("Smart scrape request failed", { url, prompt, error });
+    // Safely extract error information without circular references
    const errorInfo = {
      message: error instanceof Error ? error.message : String(error),
      name: error instanceof Error ? error.name : 'Unknown',
      stack: error instanceof Error ? error.stack : undefined,
      // Extract cause safely if it exists
      cause: error instanceof Error && error.cause
        ? (error.cause instanceof Error
            ? { message: error.cause.message, name: error.cause.name, stack: error.cause.stack }
            : typeof error.cause === 'object'
              ? {
                  ...Object.fromEntries(
                    Object.entries(error.cause)
                      .filter(([_, v]) => v !== null && typeof v !== 'object')
                  ),
                  error: (error.cause as any)?.error?.message || (error.cause as any)?.error
                }
              : String(error.cause))
        : undefined
    };
    logger.error("Smart scrape request failed", { 
      url, 
      prompt, 
      error: JSON.stringify(errorInfo)
    });
    // Rethrowing the error to be handled by the caller
    // Consider more specific error handling or wrapping if needed
    throw new Error(`Failed to smart scrape URL: ${url}`, { cause: error });
  }
 }