From fea102dac6f3d8182b7e60848ed137347f071dd4 Mon Sep 17 00:00:00 2001 From: Thomas Kosmas Date: Thu, 3 Apr 2025 23:23:32 +0300 Subject: [PATCH] fix --- apps/api/requests.http | 89 +++++++++++++++++++ .../scrapeURL/lib/extractSmartScrape.ts | 7 +- 2 files changed, 94 insertions(+), 2 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index a3997371..26fa9d07 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -12,6 +12,7 @@ content-type: application/json "url":"https://firecrawl.dev" } + ### Crawl Website # @name crawl POST {{baseUrl}}/v1/crawl HTTP/1.1 @@ -120,3 +121,91 @@ content-type: application/json GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} +### Scrape with JSON Schema Extraction +# @name scrapeWithSchema +POST {{baseUrl}}/v1/scrape HTTP/1.1 +Authorization: Bearer {{$dotenv TEST_API_KEY}} +content-type: application/json + +{ + "url": "https://firecrawl.dev", + "formats": ["json"], + "jsonOptions": { + "schema": { + "type": "object", + "properties": { + "description": { + "type": "string", + "description": "Describe the site" + }, + "respect_robots_txt": { + "type": ["boolean","null"], + "description": "Does firecrawl respect the robots.txt files?" + } + }, + "required": ["description", "respect_robots_txt"] + } + // "systemPrompt": "You are an expert web scraper." // Optional system prompt + } +} + + +### Scrape with JSON Schema Extraction +# @name scrapeWithSchema +POST {{baseUrl}}/v1/scrape HTTP/1.1 +Authorization: Bearer {{$dotenv TEST_API_KEY}} +content-type: application/json + +{ + "url": "https://firecrawl.dev", + "formats": ["json"], + "jsonOptions": { + "schema": { + "type": "object", + "properties": { + "description": { + "type": "string", + "description": "Describe the site" + } + + }, + "required": ["description" ] + } + // "systemPrompt": "You are an expert web scraper." // Optional system prompt + } +} + +### Scrape to Extract Array of Titles +# @name scrapeItemsArray +POST {{baseUrl}}/v1/scrape HTTP/1.1 +Authorization: Bearer {{$dotenv TEST_API_KEY}} +content-type: application/json + +{ + "url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params", + "formats": ["json"], + "jsonOptions": { + "prompt": "Extract all the main article or blog post titles from the page into an array.", + "schema": { + "type": "object", + "properties": { + "items": { + "type": "array", + "description": "An array containing the extracted items.", + "items": { + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "The title of a single article or blog post." + } + }, + "required": ["title"] + } + } + }, + "required": ["items"] + } + // "systemPrompt": "You are an expert structured data extractor." + } +} \ No newline at end of file diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index d7fd4e6a..3682b2be 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -230,15 +230,18 @@ export async function extractData({ }), ); } + console.log("smartscrapeResults", smartscrapeResults); const scrapedPages = smartscrapeResults.map( (result) => result.scrapedPages, ); - const htmls = scrapedPages.map((page) => page.html); + console.log("scrapedPages", scrapedPages); + const htmls = scrapedPages.flat().map((page) => page.html); + console.log("htmls", htmls); const markdowns = await Promise.all( htmls.map(async (html) => await parseMarkdown(html)), ); - + console.log("markdowns", markdowns); extractedData = await Promise.all( markdowns.map(async (markdown) => { const newExtractOptions = {