diff --git a/apps/api/requests.http b/apps/api/requests.http index a3997371..26fa9d07 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -12,6 +12,7 @@ content-type: application/json "url":"https://firecrawl.dev" } + ### Crawl Website # @name crawl POST {{baseUrl}}/v1/crawl HTTP/1.1 @@ -120,3 +121,91 @@ content-type: application/json GET {{baseUrl}}/v1/llmstxt/{{generateLlmsTxtId}} HTTP/1.1 Authorization: Bearer {{$dotenv TEST_API_KEY}} +### Scrape with JSON Schema Extraction +# @name scrapeWithSchema +POST {{baseUrl}}/v1/scrape HTTP/1.1 +Authorization: Bearer {{$dotenv TEST_API_KEY}} +content-type: application/json + +{ + "url": "https://firecrawl.dev", + "formats": ["json"], + "jsonOptions": { + "schema": { + "type": "object", + "properties": { + "description": { + "type": "string", + "description": "Describe the site" + }, + "respect_robots_txt": { + "type": ["boolean","null"], + "description": "Does firecrawl respect the robots.txt files?" + } + }, + "required": ["description", "respect_robots_txt"] + } + // "systemPrompt": "You are an expert web scraper." // Optional system prompt + } +} + + +### Scrape with JSON Schema Extraction +# @name scrapeWithSchema +POST {{baseUrl}}/v1/scrape HTTP/1.1 +Authorization: Bearer {{$dotenv TEST_API_KEY}} +content-type: application/json + +{ + "url": "https://firecrawl.dev", + "formats": ["json"], + "jsonOptions": { + "schema": { + "type": "object", + "properties": { + "description": { + "type": "string", + "description": "Describe the site" + } + + }, + "required": ["description" ] + } + // "systemPrompt": "You are an expert web scraper." // Optional system prompt + } +} + +### Scrape to Extract Array of Titles +# @name scrapeItemsArray +POST {{baseUrl}}/v1/scrape HTTP/1.1 +Authorization: Bearer {{$dotenv TEST_API_KEY}} +content-type: application/json + +{ + "url": "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/load-more/with-params", + "formats": ["json"], + "jsonOptions": { + "prompt": "Extract all the main article or blog post titles from the page into an array.", + "schema": { + "type": "object", + "properties": { + "items": { + "type": "array", + "description": "An array containing the extracted items.", + "items": { + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "The title of a single article or blog post." + } + }, + "required": ["title"] + } + } + }, + "required": ["items"] + } + // "systemPrompt": "You are an expert structured data extractor." + } +} \ No newline at end of file diff --git a/apps/api/src/lib/extract/completions/batchExtract.ts b/apps/api/src/lib/extract/completions/batchExtract.ts index 4225313b..32b11420 100644 --- a/apps/api/src/lib/extract/completions/batchExtract.ts +++ b/apps/api/src/lib/extract/completions/batchExtract.ts @@ -1,5 +1,8 @@ import { logger } from "../../../lib/logger"; -import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract"; +import { + generateCompletions, + GenerateCompletionsOptions, +} from "../../../scraper/scrapeURL/transformers/llmExtract"; import { buildDocument } from "../build-document"; import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types"; import { Document } from "../../../controllers/v1/types"; @@ -74,5 +77,5 @@ export async function batchExtractPromise( }, warning: warning, sources: [doc.metadata.url || doc.metadata.sourceURL || ""], - } + }; } diff --git a/apps/api/src/lib/extract/completions/singleAnswer.ts b/apps/api/src/lib/extract/completions/singleAnswer.ts index 52db04e4..51aa346d 100644 --- a/apps/api/src/lib/extract/completions/singleAnswer.ts +++ b/apps/api/src/lib/extract/completions/singleAnswer.ts @@ -1,5 +1,8 @@ import { logger } from "../../../lib/logger"; -import { generateCompletions, GenerateCompletionsOptions } from "../../../scraper/scrapeURL/transformers/llmExtract"; +import { + generateCompletions, + GenerateCompletionsOptions, +} from "../../../scraper/scrapeURL/transformers/llmExtract"; import { buildDocument } from "../build-document"; import { Document, TokenUsage } from "../../../controllers/v1/types"; import { getModel } from "../../../lib/generic-ai"; @@ -39,7 +42,7 @@ export async function singleAnswerCompletion({ model: getModel("gemini-2.0-flash", "google"), }; - const { extractedDataArray, warning } = await extractData({ + const { extractedDataArray, warning } = await extractData({ extractOptions: generationOptions, urls: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""), }); @@ -57,7 +60,6 @@ export async function singleAnswerCompletion({ ), }; - // const completion = await generateCompletions({ // logger: logger.child({ module: "extract", method: "generateCompletions" }), // options: { @@ -78,12 +80,7 @@ export async function singleAnswerCompletion({ // ); return { extract: completion.extract, - tokenUsage: { - promptTokens: 0, - completionTokens: 0, - totalTokens: 0, - model: "gemini-2.0-flash", - }, + tokenUsage: completion.tokenUsage, sources: singleAnswerDocs.map( (doc) => doc.metadata.url || doc.metadata.sourceURL || "", ), diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 33bdbf6c..d75d84b7 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -455,10 +455,11 @@ export async function performExtraction( ); // Race between timeout and completion - const multiEntityCompletion = await completionPromise as Awaited>; + const multiEntityCompletion = (await completionPromise) as Awaited< + ReturnType + >; // TODO: merge multiEntityCompletion.extract to fit the multiEntitySchema - // Track multi-entity extraction tokens if (multiEntityCompletion) { @@ -520,7 +521,9 @@ export async function performExtraction( ); extractionResults.push(...validResults); // Merge all extracts from valid results into a single array - const extractArrays = validResults.map(r => Array.isArray(r.extract) ? r.extract : [r.extract]); + const extractArrays = validResults.map((r) => + Array.isArray(r.extract) ? r.extract : [r.extract], + ); const mergedExtracts = extractArrays.flat(); multiEntityCompletions.push(...mergedExtracts); logger.debug("All multi-entity completion chunks finished.", { @@ -692,10 +695,7 @@ export async function performExtraction( }); logger.debug("Done generating singleAnswer completions."); - singleAnswerResult = transformArrayToObject( - rSchema, - completionResult, - ); + singleAnswerResult = transformArrayToObject(rSchema, completionResult); singleAnswerResult = deduplicateObjectsArray(singleAnswerResult); diff --git a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts index 76bd4984..fdfdd1d0 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractSmartScrape.ts @@ -11,7 +11,7 @@ const commonSmartScrapeProperties = { shouldUseSmartscrape: { type: "boolean", description: - "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text, scrolling down to load more content). SmartScrape can perform these actions to access the data.", + "Set to `true` if any of the extractedData is null and you think you can find the information by performing user-like interactions (e.g., clicking buttons/accordions to reveal hidden text). SmartScrape can perform these actions to access the data.", }, // Note: extractedData is added dynamically in prepareSmartScrapeSchema }; @@ -22,13 +22,13 @@ const commonReasoningPromptProperties = { type: ["string", "null"], // Using the more detailed multi-step description as the common one description: - "Reasoning for why a SmartScrape step/action is needed. Explain which data is missing or requires interaction.", + "Reasoning for why a SmartScrape is needed. Explain which data is missing or requires interaction.", }, smartscrape_prompt: { type: ["string", "null"], // Using the more detailed multi-step description as the common one description: - "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X', 'scroll down').", + "Prompt detailing the specific actions SmartScrape should perform (e.g., 'click button X''). Dont mention anything about extraction, smartscrape just returns page content", }, }; @@ -52,7 +52,7 @@ const multiSmartScrapeWrapperSchemaDefinition = { smartScrapePages: { type: "array", description: - "Make an entry for each page we want to run smart scrape on.", + "Make an entry for each page we want to run smart scrape on, no matter how many actions it should be one entry per page.", items: { type: "object", properties: { @@ -185,7 +185,7 @@ export async function extractData({ //WRAP SCHEMA const schema = extractOptions.options.schema; const logger = extractOptions.logger; - const isSingleUrl = urls.length === 0; + const isSingleUrl = urls.length === 1; console.log("!!!!!!!!!!!!!!!!!!hereee"); const { schemaToUse } = prepareSmartScrapeSchema(schema, logger, isSingleUrl); const extractOptionsNewSchema = { @@ -239,15 +239,18 @@ export async function extractData({ }), ); } + console.log("smartscrapeResults", smartscrapeResults); const scrapedPages = smartscrapeResults.map( (result) => result.scrapedPages, ); - const htmls = scrapedPages.map((page) => page.html); + console.log("scrapedPages", scrapedPages); + const htmls = scrapedPages.flat().map((page) => page.html); + console.log("htmls", htmls); const markdowns = await Promise.all( htmls.map(async (html) => await parseMarkdown(html)), ); - + console.log("markdowns", markdowns); extractedData = await Promise.all( markdowns.map(async (markdown) => { const newExtractOptions = { diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 3a917c1f..d12817c0 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -413,6 +413,7 @@ export async function performLLMExtract( // model: getModel("o3-mini", "openai"), // Keeping existing model selection // model: getModel("o3-mini", "openai"), // model: getModel("qwen-qwq-32b", "groq"), + // model: getModel("gemini-2.0-flash", "google"), model: getModel("gemini-2.5-pro-exp-03-25", "google"), };