Nick: fixes

This commit is contained in:
Nicolas 2024-11-20 12:48:10 -08:00
parent 28696da6b2
commit 67a2989874
3 changed files with 27 additions and 6 deletions

View File

@ -23,6 +23,7 @@ import { getJobPriority } from "../../lib/job-priority";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { getMapResults } from "./map";
import { buildDocument } from "../../lib/extract/build-document";
// Runs at module load. Presumably dotenv's configDotenv, which populates
// process.env from a .env file — its import is outside this view; confirm.
configDotenv();
// Module-level Redis client built from REDIS_URL. The `!` only silences the
// type checker: an unset REDIS_URL is not detected on this line.
const redis = new Redis(process.env.REDIS_URL!);
@ -47,7 +48,7 @@ export async function extractController(
if (url.includes('/*') || req.body.allowExternalLinks) {
// Handle glob pattern URLs
const baseUrl = url.replace('/*', '');
const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
// const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
const allowExternalLinks = req.body.allowExternalLinks ?? true;
let urlWithoutWww = baseUrl.replace("www.", "");
@ -76,9 +77,11 @@ export async function extractController(
let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
// Filter by path prefix if present
if (pathPrefix) {
mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
}
// console.log("pathPrefix", pathPrefix);
// NOTE: the path-prefix filter below was wrong, so it is disabled.
// if (pathPrefix) {
// mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
// }
if (req.body.prompt) {
const linksAndScores : { link: string, linkWithContext: string, score: number, originalIndex: number }[] = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
@ -170,6 +173,7 @@ export async function extractController(
});
}
console.log("docs", docs.length);
const completions = await generateOpenAICompletions(
logger.child({ method: "extractController/generateOpenAICompletions" }),
{
@ -178,7 +182,7 @@ export async function extractController(
prompt: req.body.prompt,
schema: req.body.schema,
},
docs.map(x => x.markdown).join('\n')
docs.map(x => buildDocument(x)).join('\n')
);
// console.log("completions", completions);

View File

@ -0,0 +1,17 @@
import { Document } from "../../controllers/v1/types";
/**
 * Serializes a document into a single string: its markdown followed by a
 * pretty-printed JSON dump of its metadata.
 *
 * Previously the combined string was built and then discarded (only the
 * markdown was returned), a missing markdown was interpolated as the literal
 * text "undefined", and the whole document was dumped to stdout on each call.
 *
 * @param document - Document whose `markdown` and `metadata` are serialized.
 * @returns The markdown (empty string when absent) followed by the metadata
 *   section; the section is omitted when the document carries no metadata.
 */
export function buildDocument(document: Document): string {
  // Fall back to "" so a missing body never becomes the text "undefined".
  const markdown = document.markdown ?? "";
  const metadata = document.metadata;

  // JSON.stringify(undefined) yields undefined, which would be interpolated
  // as "undefined" — skip the section entirely instead.
  if (metadata == null) {
    return markdown;
  }

  const documentMetadataString = `\nHere is the metadata for the document:\n${JSON.stringify(
    metadata,
    null,
    2
  )}`;

  return `${markdown}${documentMetadataString}`;
}

View File

@ -63,7 +63,7 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract
let warning: string | undefined;
const openai = new OpenAI();
const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o";
const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
if (markdown === undefined) {
throw new Error("document.markdown is undefined -- this is unexpected");