Nick: extract fixes

This commit is contained in:
Nicolas 2024-12-17 16:58:35 -03:00
parent 79e335636a
commit b9f621bed5
4 changed files with 55 additions and 25 deletions

View File

@ -24,6 +24,8 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { getMapResults } from "./map";
import { buildDocument } from "../../lib/extract/build-document";
import { generateBasicCompletion } from "../../lib/LLM-extraction";
import { buildRefrasedPrompt } from "../../lib/extract/build-prompts";
configDotenv();
const redis = new Redis(process.env.REDIS_URL!);
@ -63,30 +65,35 @@ export async function extractController(
const allowExternalLinks = req.body.allowExternalLinks;
let urlWithoutWww = baseUrl.replace("www.", "");
let mapUrl =
req.body.prompt && allowExternalLinks
? `${req.body.prompt} ${urlWithoutWww}`
: req.body.prompt
? `${req.body.prompt} site:${urlWithoutWww}`
: `site:${urlWithoutWww}`;
let rephrasedPrompt = req.body.prompt;
if (req.body.prompt) {
rephrasedPrompt =
(await generateBasicCompletion(
buildRefrasedPrompt(req.body.prompt, baseUrl),
)) ?? req.body.prompt;
}
const mapResults = await getMapResults({
url: baseUrl,
search: req.body.prompt,
search: rephrasedPrompt,
teamId: req.auth.team_id,
plan: req.auth.plan,
allowExternalLinks,
origin: req.body.origin,
limit: req.body.limit,
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
ignoreSitemap: !selfHosted ? true : false,
ignoreSitemap: false,
includeMetadata: true,
includeSubdomains: req.body.includeSubdomains,
});
// console.log("mapResults", mapResults);
let mappedLinks = mapResults.links as MapDocument[];
if (mappedLinks.length === 0) {
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
}
// Limit number of links to MAX_EXTRACT_LIMIT
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
@ -94,20 +101,19 @@ export async function extractController(
(x) =>
`url: ${x.url}, title: ${x.title}, description: ${x.description}`,
);
// console.log("mappedLinksRerank", mappedLinksRerank);
// Filter by path prefix if present
// wrong
// if (pathPrefix) {
// mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
// }
if (req.body.prompt) {
let searchQuery =
req.body.prompt && allowExternalLinks
? `${req.body.prompt} ${urlWithoutWww}`
: req.body.prompt
? `${req.body.prompt} site:${urlWithoutWww}`
: `site:${urlWithoutWww}`;
// Get similarity scores between the search query and each link's context
const linksAndScores = await performRanking(
mappedLinksRerank,
mappedLinks.map((l) => l.url),
mapUrl,
searchQuery,
);
// First try with high threshold
@ -153,20 +159,16 @@ export async function extractController(
} else {
// Handle direct URLs without glob pattern
if (!isUrlBlocked(url)) {
// console.log("url", url);
return [url];
}
return [];
}
});
// console.log("urlPromises", urlPromises.length);
// Wait for all URL processing to complete and flatten results
const processedUrls = await Promise.all(urlPromises);
const flattenedUrls = processedUrls.flat().filter(url => url); // Filter out any null/undefined values
const flattenedUrls = processedUrls.flat().filter((url) => url); // Filter out any null/undefined values
links.push(...flattenedUrls);
// console.log("links", links.length, "flattenedUrls", flattenedUrls.length);
if (links.length === 0) {
return res.status(400).json({
@ -212,7 +214,7 @@ export async function extractController(
}
return doc;
} catch (e) {
logger.error(`Error in scrapeController: ${e}`);
logger.error(`Error in extractController: ${e}`);
if (
e instanceof Error &&
(e.message.startsWith("Job wait") || e.message === "timeout")
@ -316,4 +318,3 @@ function filterAndProcessLinks(
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
);
}

View File

@ -62,3 +62,17 @@ export async function generateCompletions(
return completions;
}
/**
 * Sends `prompt` as a single user message to the chat-completions endpoint
 * and returns the text of the first choice.
 *
 * The model name is read from the `MODEL_NAME` environment variable and
 * defaults to "gpt-4o-mini" when that variable is unset.
 *
 * @param prompt - Text sent as the only message (role "user") of the request.
 * @returns The first choice's message content, or `null` when the response
 *   carries no choices or the first choice has no content. Callers can
 *   therefore fall back with `??`.
 */
export async function generateBasicCompletion(
  prompt: string,
): Promise<string | null> {
  // NOTE(review): the client is built with no arguments, so it presumably
  // picks its credentials up from the environment — confirm against the SDK.
  const openai = new OpenAI();
  const model = process.env.MODEL_NAME ?? "gpt-4o-mini";
  const completion = await openai.chat.completions.create({
    model,
    messages: [{ role: "user", content: prompt }],
  });
  // An empty `choices` array must yield `null` instead of a TypeError, so
  // that a caller's `?? fallback` can actually take effect.
  return completion.choices[0]?.message?.content ?? null;
}

View File

@ -0,0 +1,14 @@
/**
 * Builds the instruction text asking a language model to turn a free-form
 * user prompt into a short search query scoped to a given site.
 *
 * @param prompt - The original user prompt; embedded verbatim inside double quotes.
 * @param url - The site the search should target; embedded verbatim in the first sentence.
 * @returns The full instruction text, one section per line.
 */
export function buildRefrasedPrompt(prompt: string, url: string): string {
  // Numbered requirements the rephrased query has to satisfy.
  const requirements = [
    "1. Maintains the core intent of the original prompt",
    "2. Uses relevant keywords",
    "3. Is optimized for search engine results",
    "4. Is concise and focused",
    "5. Short is better than long",
  ];

  const sections = [
    "You are a search query optimizer. Your task is to rephrase the following prompt into an effective search query that will find relevant results about this topic on " +
      url +
      ".",
    'Original prompt: "' + prompt + '"',
    "Provide a rephrased search query that:",
    ...requirements,
    "Return only the rephrased search query, without any explanation or additional text.",
  ];

  return sections.join("\n");
}

View File

@ -122,3 +122,4 @@
// },
// };
// }