From a4f15260a7fef1f8c1ddef72dad9d5c95e5fbb15 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 12 Nov 2024 12:23:24 -0500 Subject: [PATCH] Nick: --- apps/api/src/controllers/v1/extract.ts | 22 ++++++++++------------ apps/api/src/lib/extract/completions.ts | 5 ++--- apps/api/src/routes/v1.ts | 10 ++++++++++ 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index 4ac32ddd..ebe248ba 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -10,8 +10,6 @@ import { import { v4 as uuidv4 } from "uuid"; import { getJobPriority } from "../../lib/job-priority"; import { PlanType } from "../../types"; -import { rerankDocuments } from "../../lib/extract/reranker"; -import { generateBasicCompletion } from "../../lib/extract/completions"; import { getMapResults } from "./map"; @@ -40,7 +38,7 @@ export async function extractController( const mappedDocuments: MapDocument[] = []; const prompt = req.body.prompt; - const keywords = await generateBasicCompletion(`If the user's prompt is: "${prompt}", what are the most important keywords besides the extraction task? Output only the keywords, separated by commas.`); + // const keywords = await generateBasicCompletion(`If the user's prompt is: "${prompt}", what are the most important keywords besides the extraction task? Output only the keywords, separated by commas.`); for (const url of urls) { if (url.endsWith("/*")) { @@ -57,15 +55,15 @@ export async function extractController( subId: req.acuc?.sub_id, includeMetadata: true }); - // top 3 links - const top3Links = (mapResults.links as MapDocument[]).slice(0, 3); - console.log(top3Links); - // console.log(top3Links); - mappedDocuments.push(...(mapResults.links as MapDocument[])); - // transform mappedUrls to just documents - // we quickly rerank - const rerank = await rerankDocuments(mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`), "What URLs are most relevant to the following prompt: " + (req.body.prompt || '').toLocaleLowerCase().replace("extract", " ").replace("extract ", " ")); - console.log(rerank); + // // top 3 links + // const top3Links = (mapResults.links as MapDocument[]).slice(0, 3); + // console.log(top3Links); + // // console.log(top3Links); + // mappedDocuments.push(...(mapResults.links as MapDocument[])); + // // transform mappedUrls to just documents + // // we quickly rerank + // const rerank = await rerankDocuments(mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`), "What URLs are most relevant to the following prompt: " + (req.body.prompt || '').toLocaleLowerCase().replace("extract", " ").replace("extract ", " ")); + // console.log(rerank); } else { mappedDocuments.push({ url }); } diff --git a/apps/api/src/lib/extract/completions.ts b/apps/api/src/lib/extract/completions.ts index 230584b4..fa75594b 100644 --- a/apps/api/src/lib/extract/completions.ts +++ b/apps/api/src/lib/extract/completions.ts @@ -82,7 +82,7 @@ export async function generateFinalExtraction({ const jsonCompletion = await openai.beta.chat.completions.parse({ model, messages: [ - { role: "system", content: systemPrompt }, + { role: "system", content: systemPrompt ?? "" }, { role: "user", content: [{ type: "text", text: extractionContent }] }, { role: "user", @@ -108,9 +108,8 @@ export async function generateFinalExtraction({ } const extraction = jsonCompletion.choices[0].message.parsed; - return { - content: extraction, + content: extraction ?? "", metadata: { numTokens, warning, diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 3eaace3b..e6055a99 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -18,6 +18,7 @@ import { logger } from "../lib/logger"; import { scrapeStatusController } from "../controllers/v1/scrape-status"; import { concurrencyCheckController } from "../controllers/v1/concurrency-check"; import { batchScrapeController } from "../controllers/v1/batch-scrape"; +import { extractController } from "../controllers/v1/extract"; // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview"; // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status"; // import { searchController } from "../../src/controllers/v1/search"; @@ -178,6 +179,14 @@ v1Router.ws( crawlStatusWSController ); +v1Router.post( + "/extract", + authMiddleware(RateLimiterMode.Scrape), + checkCreditsMiddleware(1), + blocklistMiddleware, + wrap(extractController) +); + // v1Router.post("/crawlWebsitePreview", crawlPreviewController); @@ -199,3 +208,4 @@ v1Router.delete( // Health/Probe routes // v1Router.get("/health/liveness", livenessController); // v1Router.get("/health/readiness", readinessController); +