From fea102dac6f3d8182b7e60848ed137347f071dd4 Mon Sep 17 00:00:00 2001
From: Thomas Kosmas <thomas510111@gmail.com>
Date: Thu, 3 Apr 2025 23:23:32 +0300
Subject: [PATCH] fix: flatten smartScrape scrapedPages before extracting html; add scrape request samples

---
 apps/api/requests.http                        | 89 +++++++++++++++++++
 .../scrapeURL/lib/extractSmartScrape.ts       |  7 +-
 2 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/apps/api/requests.http b/apps/api/requests.http
index a3997371..26fa9d07 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -12,6 +12,7 @@ content-type: application/json
   "url":"https://firecrawl.dev"
 }
 
+
 ### Crawl Website
 # @name crawl
 POST {{baseUrl}}/v1/crawl HTTP/1.1
@@ -120,3 +121,91 @@ content-type: application/json
 GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 
+### Scrape with JSON Schema Extraction
+# @name scrapeWithSchema
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+  "url": "https://firecrawl.dev",
+  "formats": ["json"],
+  "jsonOptions": {
+    "schema": {
+      "type": "object",
+      "properties": {
+        "description": { 
+          "type": "string",
+          "description": "Describe the site"
+        },
+        "respect_robots_txt": {
+          "type": ["boolean","null"],
+          "description": "Does firecrawl respect the robots.txt files?"
+        }
+      },
+      "required": ["description", "respect_robots_txt"]
+    }
+    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
+  }
+}
+
+
+### Scrape with JSON Schema Extraction (single required field)
+# @name scrapeWithSchemaMinimal
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+  "url": "https://firecrawl.dev",
+  "formats": ["json"],
+  "jsonOptions": {
+    "schema": {
+      "type": "object",
+      "properties": {
+        "description": { 
+          "type": "string",
+          "description": "Describe the site"
+        }
+
+      },
+      "required": ["description" ]
+    }
+    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
+  }
+}
+
+### Scrape to Extract Array of Titles
+# @name scrapeItemsArray
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+  "url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params",
+  "formats": ["json"],
+  "jsonOptions": {
+    "prompt": "Extract all the main article or blog post titles from the page into an array.",
+    "schema": {
+      "type": "object",
+      "properties": {
+        "items": {
+          "type": "array",
+          "description": "An array containing the extracted items.",
+          "items": { 
+            "type": "object",
+            "properties": {
+              "title": {
+                "type": "string",
+                "description": "The title of a single article or blog post."
+              }
+            },
+            "required": ["title"] 
+          }
+        }
+      },
+      "required": ["items"] 
+    }
+    // "systemPrompt": "You are an expert structured data extractor." 
+  }
+}
\ No newline at end of file
diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
index d7fd4e6a..3682b2be 100644
--- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
@@ -230,15 +230,18 @@ export async function extractData({
         }),
       );
     }
+    console.log("smartscrapeResults", smartscrapeResults);
 
     const scrapedPages = smartscrapeResults.map(
       (result) => result.scrapedPages,
     );
-    const htmls = scrapedPages.map((page) => page.html);
+    console.log("scrapedPages", scrapedPages);
+    const htmls = scrapedPages.flat().map((page) => page.html);
+    console.log("htmls", htmls);
     const markdowns = await Promise.all(
       htmls.map(async (html) => await parseMarkdown(html)),
     );
-
+    console.log("markdowns", markdowns);
     extractedData = await Promise.all(
       markdowns.map(async (markdown) => {
         const newExtractOptions = {