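Nick: fixes (disable path-prefix link filtering in extract, add buildDocument helper, default model to gpt-4o-mini)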

This commit is contained in:
Nicolas 2024-11-20 12:48:10 -08:00
parent 28696da6b2
commit 67a2989874
3 changed files with 27 additions and 6 deletions

View File

@ -23,6 +23,7 @@ import { getJobPriority } from "../../lib/job-priority";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract"; import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { getMapResults } from "./map"; import { getMapResults } from "./map";
import { buildDocument } from "../../lib/extract/build-document";
configDotenv(); configDotenv();
const redis = new Redis(process.env.REDIS_URL!); const redis = new Redis(process.env.REDIS_URL!);
@ -47,7 +48,7 @@ export async function extractController(
if (url.includes('/*') || req.body.allowExternalLinks) { if (url.includes('/*') || req.body.allowExternalLinks) {
// Handle glob pattern URLs // Handle glob pattern URLs
const baseUrl = url.replace('/*', ''); const baseUrl = url.replace('/*', '');
const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any // const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
const allowExternalLinks = req.body.allowExternalLinks ?? true; const allowExternalLinks = req.body.allowExternalLinks ?? true;
let urlWithoutWww = baseUrl.replace("www.", ""); let urlWithoutWww = baseUrl.replace("www.", "");
@ -76,9 +77,11 @@ export async function extractController(
let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`); let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
// Filter by path prefix if present // Filter by path prefix if present
if (pathPrefix) { // console.log("pathPrefix", pathPrefix);
mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`)); // wrong
} // if (pathPrefix) {
// mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
// }
if (req.body.prompt) { if (req.body.prompt) {
const linksAndScores : { link: string, linkWithContext: string, score: number, originalIndex: number }[] = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl); const linksAndScores : { link: string, linkWithContext: string, score: number, originalIndex: number }[] = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
@ -170,6 +173,7 @@ export async function extractController(
}); });
} }
console.log("docs", docs.length);
const completions = await generateOpenAICompletions( const completions = await generateOpenAICompletions(
logger.child({ method: "extractController/generateOpenAICompletions" }), logger.child({ method: "extractController/generateOpenAICompletions" }),
{ {
@ -178,7 +182,7 @@ export async function extractController(
prompt: req.body.prompt, prompt: req.body.prompt,
schema: req.body.schema, schema: req.body.schema,
}, },
docs.map(x => x.markdown).join('\n') docs.map(x => buildDocument(x)).join('\n')
); );
// console.log("completions", completions); // console.log("completions", completions);

View File

@ -0,0 +1,17 @@
import { Document } from "../../controllers/v1/types";
/**
 * Renders a document as a single string: its markdown followed by a
 * pretty-printed JSON dump of its metadata.
 *
 * The extract controller maps each scraped document through this function and
 * joins the results with newlines before handing them to the completion call,
 * so the returned text must carry everything the model should see.
 *
 * @param document - The document whose `markdown` and `metadata` are read.
 * @returns The markdown (empty string when it is absent) with the metadata
 *   section appended after it.
 */
export function buildDocument(document: Document): string {
  const metadata = document.metadata;
  const markdown = document.markdown;

  const documentMetadataString = `\nHere is the metadata for the document:\n${JSON.stringify(
    metadata,
    null,
    2
  )}`;

  // Fall back to "" so a missing markdown is not interpolated as the literal
  // text "undefined", and return the combined string rather than the bare
  // markdown so the metadata section actually reaches the caller.
  return `${markdown ?? ""}${documentMetadataString}`;
}

View File

@ -63,7 +63,7 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract
let warning: string | undefined; let warning: string | undefined;
const openai = new OpenAI(); const openai = new OpenAI();
const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o"; const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
if (markdown === undefined) { if (markdown === undefined) {
throw new Error("document.markdown is undefined -- this is unexpected"); throw new Error("document.markdown is undefined -- this is unexpected");