This commit is contained in:
Nicolas 2024-11-12 12:23:24 -05:00
parent d430cfcbfb
commit a4f15260a7
3 changed files with 22 additions and 15 deletions

View File

@ -10,8 +10,6 @@ import {
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { getJobPriority } from "../../lib/job-priority"; import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types"; import { PlanType } from "../../types";
import { rerankDocuments } from "../../lib/extract/reranker";
import { generateBasicCompletion } from "../../lib/extract/completions";
import { getMapResults } from "./map"; import { getMapResults } from "./map";
@ -40,7 +38,7 @@ export async function extractController(
const mappedDocuments: MapDocument[] = []; const mappedDocuments: MapDocument[] = [];
const prompt = req.body.prompt; const prompt = req.body.prompt;
const keywords = await generateBasicCompletion(`If the user's prompt is: "${prompt}", what are the most important keywords besides the extraction task? Output only the keywords, separated by commas.`); // const keywords = await generateBasicCompletion(`If the user's prompt is: "${prompt}", what are the most important keywords besides the extraction task? Output only the keywords, separated by commas.`);
for (const url of urls) { for (const url of urls) {
if (url.endsWith("/*")) { if (url.endsWith("/*")) {
@ -57,15 +55,15 @@ export async function extractController(
subId: req.acuc?.sub_id, subId: req.acuc?.sub_id,
includeMetadata: true includeMetadata: true
}); });
// top 3 links // // top 3 links
const top3Links = (mapResults.links as MapDocument[]).slice(0, 3); // const top3Links = (mapResults.links as MapDocument[]).slice(0, 3);
console.log(top3Links); // console.log(top3Links);
// console.log(top3Links); // // console.log(top3Links);
mappedDocuments.push(...(mapResults.links as MapDocument[])); // mappedDocuments.push(...(mapResults.links as MapDocument[]));
// transform mappedUrls to just documents // // transform mappedUrls to just documents
// we quickly rerank // // we quickly rerank
const rerank = await rerankDocuments(mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`), "What URLs are most relevant to the following prompt: " + (req.body.prompt || '').toLocaleLowerCase().replace("extract", " ").replace("extract ", " ")); // const rerank = await rerankDocuments(mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`), "What URLs are most relevant to the following prompt: " + (req.body.prompt || '').toLocaleLowerCase().replace("extract", " ").replace("extract ", " "));
console.log(rerank); // console.log(rerank);
} else { } else {
mappedDocuments.push({ url }); mappedDocuments.push({ url });
} }

View File

@ -82,7 +82,7 @@ export async function generateFinalExtraction({
const jsonCompletion = await openai.beta.chat.completions.parse({ const jsonCompletion = await openai.beta.chat.completions.parse({
model, model,
messages: [ messages: [
{ role: "system", content: systemPrompt }, { role: "system", content: systemPrompt ?? "" },
{ role: "user", content: [{ type: "text", text: extractionContent }] }, { role: "user", content: [{ type: "text", text: extractionContent }] },
{ {
role: "user", role: "user",
@ -108,9 +108,8 @@ export async function generateFinalExtraction({
} }
const extraction = jsonCompletion.choices[0].message.parsed; const extraction = jsonCompletion.choices[0].message.parsed;
return { return {
content: extraction, content: extraction ?? "",
metadata: { metadata: {
numTokens, numTokens,
warning, warning,

View File

@ -18,6 +18,7 @@ import { logger } from "../lib/logger";
import { scrapeStatusController } from "../controllers/v1/scrape-status"; import { scrapeStatusController } from "../controllers/v1/scrape-status";
import { concurrencyCheckController } from "../controllers/v1/concurrency-check"; import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
import { batchScrapeController } from "../controllers/v1/batch-scrape"; import { batchScrapeController } from "../controllers/v1/batch-scrape";
import { extractController } from "../controllers/v1/extract";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview"; // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status"; // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search"; // import { searchController } from "../../src/controllers/v1/search";
@ -178,6 +179,14 @@ v1Router.ws(
crawlStatusWSController crawlStatusWSController
); );
// Registers the POST "/extract" route on v1Router.
// Handlers are passed in this order:
//   1. authMiddleware(RateLimiterMode.Scrape) - note that extract is registered
//      with the Scrape rate-limiter mode, not a dedicated extract mode.
//   2. checkCreditsMiddleware(1) - NOTE(review): presumably requires at least
//      1 credit before proceeding; confirm against the middleware.
//   3. blocklistMiddleware
//   4. wrap(extractController) - NOTE(review): wrap() presumably forwards
//      errors from the async controller to the error handler; confirm.
v1Router.post(
"/extract",
authMiddleware(RateLimiterMode.Scrape),
checkCreditsMiddleware(1),
blocklistMiddleware,
wrap(extractController)
);
// v1Router.post("/crawlWebsitePreview", crawlPreviewController); // v1Router.post("/crawlWebsitePreview", crawlPreviewController);
@ -199,3 +208,4 @@ v1Router.delete(
// Health/Probe routes // Health/Probe routes
// v1Router.get("/health/liveness", livenessController); // v1Router.get("/health/liveness", livenessController);
// v1Router.get("/health/readiness", readinessController); // v1Router.get("/health/readiness", readinessController);