From 67a29898742a7e0df28ee73c78a98d90e2a6ddd1 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Wed, 20 Nov 2024 12:48:10 -0800
Subject: [PATCH] Nick: fixes

---
Review note: unlike the original commit, buildDocument() in this revision
returns the markdown together with the metadata section (the original built
that string, logged it, and then returned the markdown alone). The per-document
console.log is dropped. Stat and hunk counts for build-document.ts are
adjusted to match.

 apps/api/src/controllers/v1/extract.ts             | 14 +++++++++-----
 apps/api/src/lib/extract/build-document.ts         | 16 ++++++++++++++++
 .../scrapeURL/transformers/llmExtract.ts           |  2 +-
 3 files changed, 26 insertions(+), 6 deletions(-)
 create mode 100644 apps/api/src/lib/extract/build-document.ts

diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts
index 6d9b4a2c..ededd63d 100644
--- a/apps/api/src/controllers/v1/extract.ts
+++ b/apps/api/src/controllers/v1/extract.ts
@@ -23,6 +23,7 @@ import { getJobPriority } from "../../lib/job-priority";
 import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { getMapResults } from "./map";
+import { buildDocument } from "../../lib/extract/build-document";
 
 configDotenv();
 const redis = new Redis(process.env.REDIS_URL!);
@@ -47,7 +48,7 @@ export async function extractController(
     if (url.includes('/*') || req.body.allowExternalLinks) {
       // Handle glob pattern URLs
       const baseUrl = url.replace('/*', '');
-      const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
+      // const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
 
       const allowExternalLinks = req.body.allowExternalLinks ?? true;
       let urlWithoutWww = baseUrl.replace("www.", "");
@@ -76,9 +77,11 @@ export async function extractController(
       let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
 
       // Filter by path prefix if present
-      if (pathPrefix) {
-        mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
-      }
+      // console.log("pathPrefix", pathPrefix);
+      // wrong
+      // if (pathPrefix) {
+      //   mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
+      // }
 
       if (req.body.prompt) {
         const linksAndScores : { link: string, linkWithContext: string, score: number, originalIndex: number }[] = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
@@ -170,6 +173,7 @@ export async function extractController(
     });
   }
 
+  console.log("docs", docs.length);
   const completions = await generateOpenAICompletions(
     logger.child({ method: "extractController/generateOpenAICompletions" }),
     {
@@ -178,7 +182,7 @@ export async function extractController(
       prompt: req.body.prompt,
       schema: req.body.schema,
     },
-    docs.map(x => x.markdown).join('\n')
+    docs.map(x => buildDocument(x)).join('\n')
   );
 
   // console.log("completions", completions);
diff --git a/apps/api/src/lib/extract/build-document.ts b/apps/api/src/lib/extract/build-document.ts
new file mode 100644
index 00000000..7486d2a7
--- /dev/null
+++ b/apps/api/src/lib/extract/build-document.ts
@@ -0,0 +1,16 @@
+import { Document } from "../../controllers/v1/types";
+
+/**
+ * Renders a scraped document as a single string for the LLM extraction prompt.
+ *
+ * @param document - Document whose `markdown` and `metadata` are read.
+ * @returns The markdown (empty string when it is null/undefined) followed by
+ *   a metadata section containing `metadata` as 2-space-indented JSON.
+ */
+export function buildDocument(document: Document): string {
+  // Default before interpolating: `${undefined}` would emit the literal text "undefined".
+  const markdown = document.markdown ?? "";
+  const metadataJson = JSON.stringify(document.metadata, null, 2);
+
+  return `${markdown}\nHere is the metadata for the document:\n${metadataJson}`;
+}
diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
index d8004c07..41d76779 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
@@ -63,7 +63,7 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract
     let warning: string | undefined;
 
     const openai = new OpenAI();
-    const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o";
+    const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
 
     if (markdown === undefined) {
         throw new Error("document.markdown is undefined -- this is unexpected");