From 2ffde5abc11eec1a550f2aecae54c2d594d1dfb1 Mon Sep 17 00:00:00 2001
From: Thomas Kosmas
Date: Thu, 3 Apr 2025 23:01:18 +0300
Subject: [PATCH 1/2] fixes

---
 .../lib/extract/completions/batchExtract.ts   |  9 ++++---
 .../lib/extract/completions/singleAnswer.ts   | 26 +++++++++++--------
 .../api/src/lib/extract/extraction-service.ts | 16 ++++++------
 .../scrapeURL/lib/extractSmartScrape.ts       | 10 +++----
 .../scrapeURL/transformers/llmExtract.ts      |  3 ++-
 5 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/apps/api/src/lib/extract/completions/batchExtract.ts b/apps/api/src/lib/extract/completions/batchExtract.ts
index 6db4e826..32b11420 100644
--- a/apps/api/src/lib/extract/completions/batchExtract.ts
+++ b/apps/api/src/lib/extract/completions/batchExtract.ts
@@ -1,5 +1,8 @@
 import { logger } from "../../../lib/logger";
-import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
+import {
+  generateCompletions,
+  GenerateCompletionsOptions,
+} from "../../../scraper/scrapeURL/transformers/llmExtract";
 import { buildDocument } from "../build-document";
 import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
 import { Document } from "../../../controllers/v1/types";
@@ -54,7 +57,7 @@ export async function batchExtractPromise(
 
   const { extractedDataArray, warning } = await extractData({
     extractOptions: generationOptions,
-    url: doc.metadata.sourceURL || doc.metadata.url || "",
+    urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
   });
 
   await fs.writeFile(
@@ -74,5 +77,5 @@ export async function batchExtractPromise(
     },
     warning: warning,
     sources: [doc.metadata.url || doc.metadata.sourceURL || ""],
-  }
+  };
 }
diff --git a/apps/api/src/lib/extract/completions/singleAnswer.ts b/apps/api/src/lib/extract/completions/singleAnswer.ts
index 3205596f..014be390 100644
--- a/apps/api/src/lib/extract/completions/singleAnswer.ts
+++ b/apps/api/src/lib/extract/completions/singleAnswer.ts
@@ -1,5 +1,8 @@
 import { logger } from "../../../lib/logger";
-import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract";
+import {
+  generateCompletions,
+  GenerateCompletionsOptions,
+} from "../../../scraper/scrapeURL/transformers/llmExtract";
 import { buildDocument } from "../build-document";
 import { Document, TokenUsage } from "../../../controllers/v1/types";
 import { getModel } from "../../../lib/generic-ai";
@@ -32,15 +35,17 @@ export async function singleAnswerCompletion({
       systemPrompt:
         (systemPrompt ? `${systemPrompt}\n` : "") +
         "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. In case you can't find the information and the string is required, instead of 'N/A' or 'Not specified', return an empty string: ''; if it's not a string and you can't find the information, return null. Be concise and always follow the schema if provided.",
-      prompt: "Today is: " + new Date().toISOString() + ".\n" + prompt,
-      schema: rSchema,
-    },
-    markdown: singleAnswerDocs.map((x, i) => `[ID: ${i}]` + buildDocument(x)).join("\n"),
-    isExtractEndpoint: true,
-    model: getModel("gemini-2.0-flash", "google"),
-  };
+      prompt: "Today is: " + new Date().toISOString() + ".\n" + prompt,
+      schema: rSchema,
+    },
+    markdown: singleAnswerDocs
+      .map((x, i) => `[ID: ${i}]` + buildDocument(x))
+      .join("\n"),
+    isExtractEndpoint: true,
+    model: getModel("gemini-2.0-flash", "google"),
+  };
 
-  const { extractedDataArray, warning } = await extractData({
+  const { extractedDataArray, warning } = await extractData({
     extractOptions: generationOptions,
     urls,
   });
@@ -58,7 +63,6 @@ export async function singleAnswerCompletion({
     ),
   };
 
-
   // const completion = await generateCompletions({
   //   logger: logger.child({ module: "extract", method: "generateCompletions" }),
   //   options: {
@@ -79,7 +83,7 @@ export async function singleAnswerCompletion({
   // );
   return {
     extract: completion.extract,
-    tokenUsage: completion.totalUsage,
+    tokenUsage: completion.tokenUsage,
     sources: singleAnswerDocs.map(
       (doc) => doc.metadata.url || doc.metadata.sourceURL || "",
     ),
diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts
index e9c33c9a..857daa07 100644
--- a/apps/api/src/lib/extract/extraction-service.ts
+++ b/apps/api/src/lib/extract/extraction-service.ts
@@ -455,10 +455,11 @@ export async function performExtraction(
       );
 
       // Race between timeout and completion
-      const multiEntityCompletion = await completionPromise as Awaited<ReturnType<typeof batchExtractPromise>>;
+      const multiEntityCompletion = (await completionPromise) as Awaited<
+        ReturnType<typeof batchExtractPromise>
+      >;
 
       // TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema
-
       // Track multi-entity extraction tokens
       if (multiEntityCompletion) {
@@ -520,7 +521,9 @@ export async function performExtraction(
       );
       extractionResults.push(...validResults);
       // Merge all extracts from valid results into a single array
-      const extractArrays = validResults.map(r => Array.isArray(r.extract) ? r.extract : [r.extract]);
+      const extractArrays = validResults.map((r) =>
+        Array.isArray(r.extract) ? r.extract : [r.extract],
+      );
       const mergedExtracts = extractArrays.flat();
       multiEntityCompletions.push(...mergedExtracts);
       logger.debug("All multi-entity completion chunks finished.", {
@@ -684,7 +687,7 @@ export async function performExtraction(
         tokenUsage: singleAnswerTokenUsage,
         sources: singleAnswerSources,
       } = await singleAnswerCompletion({
-        url: request.urls?.[0] || "",
+        urls: [request.urls?.[0] || ""],
         singleAnswerDocs,
         rSchema,
         links,
@@ -693,10 +696,7 @@ export async function performExtraction(
       });
       logger.debug("Done generating singleAnswer completions.");
 
-      singleAnswerResult = transformArrayToObject(
-        rSchema,
-        completionResult,
-      );
+      singleAnswerResult = transformArrayToObject(rSchema, completionResult);
 
       singleAnswerResult = deduplicateObjectsArray(singleAnswerResult);
 
diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
index 58427d45..d7fd4e6a 100644
--- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
@@ -11,7 +11,7 @@ const commonSmartScrapeProperties = {
   shouldUseSmartscrape: {
     type: "boolean",
     description:
-      "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.",
+      "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text). SmartScrape can perform these actions to access the data.",
   },
   // Note: extractedData is added dynamically in prepareSmartScrapeSchema
 };
@@ -22,13 +22,13 @@ const commonReasoningPromptProperties = {
   reasoning: {
     type: ["string", "null"],
     // Using the more detailed multi-step description as the common one
     description:
-      "Reasoning for why a SmartScrape step/action is needed. Explain which data is missing or requires interaction.",
+      "Reasoning for why SmartScrape is needed. Explain which data is missing or requires interaction.",
   },
   smartscrape_prompt: {
     type: ["string", "null"],
     // Using the more detailed multi-step description as the common one
     description:
-      "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X', 'scroll down').",
+      "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X'). Don't mention anything about extraction; SmartScrape only returns page content.",
   },
 };
@@ -52,7 +52,7 @@ const multiSmartScrapeWrapperSchemaDefinition = {
     smartScrapePages: {
       type: "array",
       description:
-        "Make an entry for each page we want to run smart scrape on.",
+        "Make an entry for each page we want to run SmartScrape on; no matter how many actions a page needs, it should be one entry per page.",
       items: {
         type: "object",
         properties: {
@@ -185,7 +185,7 @@ export async function extractData({
   //WRAP SCHEMA
   const schema = extractOptions.options.schema;
   const logger = extractOptions.logger;
-  const isSingleUrl = urls.length === 0;
+  const isSingleUrl = urls.length === 1;
   console.log("!!!!!!!!!!!!!!!!!!hereee");
   const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl);
   const extractOptionsNewSchema = {
diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
index 903eeb54..d12817c0 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
@@ -413,12 +413,13 @@ export async function performLLMExtract(
     // model: getModel("o3-mini", "openai"), // Keeping existing model selection
     // model: getModel("o3-mini", "openai"),
     // model: getModel("qwen-qwq-32b", "groq"),
+    // model: getModel("gemini-2.0-flash", "google"),
     model: getModel("gemini-2.5-pro-exp-03-25", "google"),
   };
 
   const { extractedDataArray, warning } = await extractData({
     extractOptions: generationOptions,
-    url: meta.url,
+    urls: [meta.url],
   });
 
   //TODO: add merge here

From fea102dac6f3d8182b7e60848ed137347f071dd4 Mon Sep 17 00:00:00 2001
From: Thomas Kosmas
Date: Thu, 3 Apr 2025 23:23:32 +0300
Subject: [PATCH 2/2] fix

---
 apps/api/requests.http                       | 89 +++++++++++++++++++
 .../scrapeURL/lib/extractSmartScrape.ts      |  7 +++++--
 2 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/apps/api/requests.http b/apps/api/requests.http
index a3997371..26fa9d07 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -12,6 +12,7 @@ content-type: application/json
 "url":"https://firecrawl.dev"
 }
 
+
 ### Crawl Website
 # @name crawl
 POST {{baseUrl}}/v1/crawl HTTP/1.1
@@ -120,3 +121,91 @@
 GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
 
+### Scrape with JSON Schema Extraction
+# @name scrapeWithSchema
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+  "url": "https://firecrawl.dev",
+  "formats": ["json"],
+  "jsonOptions": {
+    "schema": {
+      "type": "object",
+      "properties": {
+        "description": {
+          "type": "string",
+          "description": "Describe the site"
+        },
+        "respect_robots_txt": {
+          "type": ["boolean","null"],
+          "description": "Does Firecrawl respect robots.txt files?"
+        }
+      },
+      "required": ["description", "respect_robots_txt"]
+    }
+    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
+  }
+}
+
+
+### Scrape with JSON Schema Extraction (description only)
+# @name scrapeWithSchemaSimple
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+  "url": "https://firecrawl.dev",
+  "formats": ["json"],
+  "jsonOptions": {
+    "schema": {
+      "type": "object",
+      "properties": {
+        "description": {
+          "type": "string",
+          "description": "Describe the site"
+        }
+      },
+      "required": ["description"]
+    }
+    // "systemPrompt": "You are an expert web scraper." // Optional system prompt
+  }
+}
+
+### Scrape to Extract Array of Titles
+# @name scrapeItemsArray
+POST {{baseUrl}}/v1/scrape HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+  "url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params",
+  "formats": ["json"],
+  "jsonOptions": {
+    "prompt": "Extract all the main article or blog post titles from the page into an array.",
+    "schema": {
+      "type": "object",
+      "properties": {
+        "items": {
+          "type": "array",
+          "description": "An array containing the extracted items.",
+          "items": {
+            "type": "object",
+            "properties": {
+              "title": {
+                "type": "string",
+                "description": "The title of a single article or blog post."
+              }
+            },
+            "required": ["title"]
+          }
+        }
+      },
+      "required": ["items"]
+    }
+    // "systemPrompt": "You are an expert structured data extractor."
+  }
+}
\ No newline at end of file
diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
index d7fd4e6a..3682b2be 100644
--- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts
@@ -230,15 +230,18 @@ export async function extractData({
       }),
     );
   }
+  console.log("smartscrapeResults", smartscrapeResults);
 
   const scrapedPages = smartscrapeResults.map(
     (result) => result.scrapedPages,
   );
-  const htmls = scrapedPages.map((page) => page.html);
+  console.log("scrapedPages", scrapedPages);
+  const htmls = scrapedPages.flat().map((page) => page.html);
+  console.log("htmls", htmls);
   const markdowns = await Promise.all(
     htmls.map(async (html) => await parseMarkdown(html)),
   );
-
+  console.log("markdowns", markdowns);
   extractedData = await Promise.all(
     markdowns.map(async (markdown) => {
      const newExtractOptions = {
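
The extractData call-site migration is the same in all three callers touched above (batchExtract.ts, singleAnswer.ts, llmExtract.ts): the scalar `url` field becomes a `urls` array, and extractData now detects the single-URL case by array length. A minimal before/after sketch in TypeScript, using only names that appear in the hunks above:

    // Before: extractData took a single URL.
    const { extractedDataArray, warning } = await extractData({
      extractOptions: generationOptions,
      url: meta.url,
    });

    // After: it takes urls: string[]; single-URL callers wrap their URL
    // in a one-element array.
    const { extractedDataArray, warning } = await extractData({
      extractOptions: generationOptions,
      urls: [meta.url],
    });

    // Inside extractData, the single-URL check keys off the array length.
    // The earlier check (urls.length === 0) could never hold for these
    // callers, since every call site passes at least one entry.
    const isSingleUrl = urls.length === 1;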