fix: flatten nested scrapedPages before extracting HTML in extractSmartScrape

2025-08-15 15:15:55 +08:00 · 2025-04-03 23:23:32 +03:00 · 2025-04-03 23:23:32 +03:00 · fea102dac6
commit fea102dac6
parent 2ffde5abc1
2 changed files with 94 additions and 2 deletions
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@ -12,6 +12,7 @@ content-type: application/json
  "url":"https://firecrawl.dev"
 }

+
 ### Crawl Website
 # @name crawl
 POST {{baseUrl}}/v1/crawl HTTP/1.1
@ -120,3 +121,91 @@ content-type: application/json
 GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}

+### Scrape with JSON Schema Extraction
+# @name scrapeWithSchema
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+  "url": "https://firecrawl.dev",
+  "formats": ["json"],
+  "jsonOptions": {
+    "schema": {
+      "type": "object",
+      "properties": {
+        "description": { 
+          "type": "string",
+          "description": "Describe the site"
+        },
+        "respect_robots_txt": {
+          "type": ["boolean","null"],
+          "description": "Does firecrawl respect the robots.txt files?"
+        }
+      },
+      "required": ["description", "respect_robots_txt"]
+    }
+    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
+  }
+}
+
+
+### Scrape with JSON Schema Extraction (single required field)
+# @name scrapeWithSchemaSingleField
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+  "url": "https://firecrawl.dev",
+  "formats": ["json"],
+  "jsonOptions": {
+    "schema": {
+      "type": "object",
+      "properties": {
+        "description": { 
+          "type": "string",
+          "description": "Describe the site"
+        }
+
+      },
+      "required": ["description" ]
+    }
+    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
+  }
+}
+
+### Scrape to Extract Array of Titles
+# @name scrapeItemsArray
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+  "url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params",
+    "formats": ["json"],
+  "jsonOptions": {
+    "prompt": "Extract all the main article or blog post titles from the page into an array.",
+    "schema": {
+      "type": "object",
+      "properties": {
+        "items": {
+          "type": "array",
+          "description": "An array containing the extracted items.",
+          "items": { 
+            "type": "object",
+            "properties": {
+              "title": {
+                "type": "string",
+                "description": "The title of a single article or blog post."
+              }
+            },
+            "required": ["title"] 
+          }
+        }
+      },
+      "required": ["items"] 
+    }
+    // "systemPrompt": "You are an expert structured data extractor." 
+  }
+}
--- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
@ -230,15 +230,18 @@ export async function extractData({
        }),
      );
    }
+    console.log("smartscrapeResults", smartscrapeResults);

    const scrapedPages = smartscrapeResults.map(
      (result) => result.scrapedPages,
    );
-    const htmls = scrapedPages.map((page) => page.html);
+    console.log("scrapedPages", scrapedPages);
+    const htmls = scrapedPages.flat().map((page) => page.html);
+    console.log("htmls", htmls);
    const markdowns = await Promise.all(
      htmls.map(async (html) => await parseMarkdown(html)),
    );
-
+    console.log("markdowns", markdowns);
    extractedData = await Promise.all(
      markdowns.map(async (markdown) => {
        const newExtractOptions = {