mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-16 17:56:02 +08:00
fix
parent 2ffde5abc1
commit fea102dac6
@@ -12,6 +12,7 @@ content-type: application/json
     "url":"https://firecrawl.dev"
 }
 
+
 ### Crawl Website
 # @name crawl
 POST {{baseUrl}}/v1/crawl HTTP/1.1
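For reference, the crawl request touched by this hunk can also be issued from TypeScript. A minimal sketch, assuming Node 18+ (built-in fetch), the same TEST_API_KEY the .http file reads, a default base URL of http://localhost:3002, and a response of the form { id } for the queued job; none of these assumptions are shown by the diff itself.

// Sketch of the "### Crawl Website" request in TypeScript.
// Assumed: Node 18+ global fetch; the endpoint answers { id: string }.
const baseUrl = process.env.FIRECRAWL_API_URL ?? "http://localhost:3002";

async function startCrawl(url: string): Promise<string> {
  const res = await fetch(`${baseUrl}/v1/crawl`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "content-type": "application/json",
    },
    body: JSON.stringify({ url }),
  });
  if (!res.ok) throw new Error(`crawl failed: ${res.status}`);
  const body = (await res.json()) as { id: string };
  return body.id; // job id for later status polling (assumed response shape)
}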
@@ -120,3 +121,91 @@ content-type: application/json
 GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 
+### Scrape with JSON Schema Extraction
+# @name scrapeWithSchema
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+    "url": "https://firecrawl.dev",
+    "formats": ["json"],
+    "jsonOptions": {
+        "schema": {
+            "type": "object",
+            "properties": {
+                "description": {
+                    "type": "string",
+                    "description": "Describe the site"
+                },
+                "respect_robots_txt": {
+                    "type": ["boolean","null"],
+                    "description": "Does firecrawl respect the robots.txt files?"
+                }
+            },
+            "required": ["description", "respect_robots_txt"]
+        }
+        // "systemPrompt": "You are an expert web scraper." // Optional system prompt
+    }
+}
+
+
+### Scrape with JSON Schema Extraction
+# @name scrapeWithSchema
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+    "url": "https://firecrawl.dev",
+    "formats": ["json"],
+    "jsonOptions": {
+        "schema": {
+            "type": "object",
+            "properties": {
+                "description": {
+                    "type": "string",
+                    "description": "Describe the site"
+                }
+
+            },
+            "required": ["description"]
+        }
+        // "systemPrompt": "You are an expert web scraper." // Optional system prompt
+    }
+}
+
+### Scrape to Extract Array of Titles
+# @name scrapeItemsArray
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+    "url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params",
+    "formats": ["json"],
+    "jsonOptions": {
+        "prompt": "Extract all the main article or blog post titles from the page into an array.",
+        "schema": {
+            "type": "object",
+            "properties": {
+                "items": {
+                    "type": "array",
+                    "description": "An array containing the extracted items.",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "title": {
+                                "type": "string",
+                                "description": "The title of a single article or blog post."
+                            }
+                        },
+                        "required": ["title"]
+                    }
+                }
+            },
+            "required": ["items"]
+        }
+        // "systemPrompt": "You are an expert structured data extractor."
+    }
+}
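The three added requests all exercise the "json" format with a jsonOptions.schema, so the extractor must return output conforming to a caller-supplied JSON Schema; the third one additionally nests an array-of-objects schema so each extracted title is validated individually. Below is a rough TypeScript client for the first request, reusing baseUrl and the env conventions from the sketch above. The location of the structured result (body.data.json) is an assumption based on the requested format, not something this diff shows.

// Sketch of "### Scrape with JSON Schema Extraction" from TypeScript.
// SiteInfo mirrors the schema in the request body above.
interface SiteInfo {
  description: string;
  respect_robots_txt: boolean | null;
}

async function scrapeWithSchema(): Promise<SiteInfo> {
  const res = await fetch(`${baseUrl}/v1/scrape`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "content-type": "application/json",
    },
    body: JSON.stringify({
      url: "https://firecrawl.dev",
      formats: ["json"],
      jsonOptions: {
        schema: {
          type: "object",
          properties: {
            description: { type: "string", description: "Describe the site" },
            respect_robots_txt: {
              type: ["boolean", "null"],
              description: "Does firecrawl respect the robots.txt files?",
            },
          },
          required: ["description", "respect_robots_txt"],
        },
      },
    }),
  });
  const body = await res.json();
  return body.data.json as SiteInfo; // assumed response shape
}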
@@ -230,15 +230,18 @@ export async function extractData({
       }),
     );
   }
+  console.log("smartscrapeResults", smartscrapeResults);
 
   const scrapedPages = smartscrapeResults.map(
     (result) => result.scrapedPages,
   );
-  const htmls = scrapedPages.map((page) => page.html);
+  console.log("scrapedPages", scrapedPages);
+  const htmls = scrapedPages.flat().map((page) => page.html);
+  console.log("htmls", htmls);
   const markdowns = await Promise.all(
     htmls.map(async (html) => await parseMarkdown(html)),
   );
+  console.log("markdowns", markdowns);
   extractedData = await Promise.all(
     markdowns.map(async (markdown) => {
       const newExtractOptions = {
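The substantive change here is the added .flat(): smartscrapeResults.map((result) => result.scrapedPages) yields an array of page arrays, one inner array per smartscrape result, so the old scrapedPages.map((page) => page.html) read .html off the inner arrays and collected undefined values. Flattening one level first restores a flat list of pages; the new console.log calls are debug tracing around each stage. A minimal sketch of the bug, with hypothetical data:

// Hypothetical shape: each smartscrape result carries a list of pages.
interface ScrapedPage { html: string }

const scrapedPages: ScrapedPage[][] = [
  [{ html: "<p>a</p>" }, { html: "<p>b</p>" }],
  [{ html: "<p>c</p>" }],
];

// Before the fix: the callback receives an inner ScrapedPage[] array,
// which has no .html property, so every element comes back undefined.
const broken = scrapedPages.map((page) => (page as any).html); // [undefined, undefined]

// After the fix: flatten one level, then collect each page's html.
const fixed = scrapedPages.flat().map((page) => page.html);
// ["<p>a</p>", "<p>b</p>", "<p>c</p>"]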