From 34b40f6a23abe5fe15b935828f3e344f8e3ec2f4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 18 Jan 2025 17:17:42 -0300 Subject: [PATCH] Nick: --- apps/api/src/controllers/v1/scrape.ts | 3 +- apps/api/src/controllers/v1/types.ts | 36 ++++++++++++++++--- .../scrapeURL/transformers/llmExtract.ts | 7 +++- 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 1ea28995..f4bc45b5 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -33,6 +33,7 @@ export async function scrapeController( basePriority: 10, }); + await addScrapeJob( { url: req.body.url, @@ -96,7 +97,7 @@ export async function scrapeController( // Don't bill if we're early returning return; } - if (req.body.extract && req.body.formats.includes("extract")) { + if (req.body.extract && req.body.formats.includes("extract") ) { creditsToBeBilled = 5; } diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index ee141625..5573fbaa 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -125,6 +125,7 @@ export const scrapeOptions = z "screenshot", "screenshot@fullPage", "extract", + "json" ]) .array() .optional() @@ -139,7 +140,10 @@ export const scrapeOptions = z onlyMainContent: z.boolean().default(true), timeout: z.number().int().positive().finite().safe().optional(), waitFor: z.number().int().nonnegative().finite().safe().default(0), + // Deprecate this to jsonOptions extract: extractOptions.optional(), + // New + jsonOptions: extractOptions.optional(), mobile: z.boolean().default(false), parsePDF: z.boolean().default(true), actions: actionsSchema.optional(), @@ -242,20 +246,43 @@ export const scrapeRequestSchema = scrapeOptions (obj) => { const hasExtractFormat = obj.formats?.includes("extract"); const hasExtractOptions = obj.extract !== undefined; + const hasJsonFormat = obj.formats?.includes("json"); + const hasJsonOptions = obj.jsonOptions !== undefined; return ( (hasExtractFormat && hasExtractOptions) || - (!hasExtractFormat && !hasExtractOptions) + (!hasExtractFormat && !hasExtractOptions) || + (hasJsonFormat && hasJsonOptions) || + (!hasJsonFormat && !hasJsonOptions) ); }, { message: - "When 'extract' format is specified, 'extract' options must be provided, and vice versa", + "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa", }, ) .transform((obj) => { - if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) { - return { ...obj, timeout: 60000 }; + // Handle timeout + if ((obj.formats?.includes("extract") || obj.extract || obj.formats?.includes("json") || obj.jsonOptions) && !obj.timeout) { + obj = { ...obj, timeout: 60000 }; } + + if(obj.formats?.includes("json")) { + obj.formats.push("extract"); + } + + // Convert JSON options to extract options if needed + if (obj.jsonOptions && !obj.extract) { + obj = { + ...obj, + extract: { + prompt: obj.jsonOptions.prompt, + systemPrompt: obj.jsonOptions.systemPrompt, + schema: obj.jsonOptions.schema, + mode: "llm" + } + }; + } + return obj; }); @@ -410,6 +437,7 @@ export type Document = { links?: string[]; screenshot?: string; extract?: any; + json?: any; warning?: string; actions?: { screenshots?: string[]; diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 5fdc3b75..be97036f 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -233,7 +233,12 @@ export async function performLLMExtract( document.markdown, document.warning, ); - document.extract = extract; + + if (meta.options.formats.includes("json")) { + document.json = extract; + } else { + document.extract = extract; + } document.warning = warning; }