mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-15 00:05:52 +08:00
Nick: extract fixes
This commit is contained in:
parent
79e335636a
commit
b9f621bed5
@ -24,6 +24,8 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/
|
|||||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||||
import { getMapResults } from "./map";
|
import { getMapResults } from "./map";
|
||||||
import { buildDocument } from "../../lib/extract/build-document";
|
import { buildDocument } from "../../lib/extract/build-document";
|
||||||
|
import { generateBasicCompletion } from "../../lib/LLM-extraction";
|
||||||
|
import { buildRefrasedPrompt } from "../../lib/extract/build-prompts";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
const redis = new Redis(process.env.REDIS_URL!);
|
const redis = new Redis(process.env.REDIS_URL!);
|
||||||
@ -63,30 +65,35 @@ export async function extractController(
|
|||||||
|
|
||||||
const allowExternalLinks = req.body.allowExternalLinks;
|
const allowExternalLinks = req.body.allowExternalLinks;
|
||||||
let urlWithoutWww = baseUrl.replace("www.", "");
|
let urlWithoutWww = baseUrl.replace("www.", "");
|
||||||
let mapUrl =
|
|
||||||
req.body.prompt && allowExternalLinks
|
let rephrasedPrompt = req.body.prompt;
|
||||||
? `${req.body.prompt} ${urlWithoutWww}`
|
if (req.body.prompt) {
|
||||||
: req.body.prompt
|
rephrasedPrompt =
|
||||||
? `${req.body.prompt} site:${urlWithoutWww}`
|
(await generateBasicCompletion(
|
||||||
: `site:${urlWithoutWww}`;
|
buildRefrasedPrompt(req.body.prompt, baseUrl),
|
||||||
|
)) ?? req.body.prompt;
|
||||||
|
}
|
||||||
|
|
||||||
const mapResults = await getMapResults({
|
const mapResults = await getMapResults({
|
||||||
url: baseUrl,
|
url: baseUrl,
|
||||||
search: req.body.prompt,
|
search: rephrasedPrompt,
|
||||||
teamId: req.auth.team_id,
|
teamId: req.auth.team_id,
|
||||||
plan: req.auth.plan,
|
plan: req.auth.plan,
|
||||||
allowExternalLinks,
|
allowExternalLinks,
|
||||||
origin: req.body.origin,
|
origin: req.body.origin,
|
||||||
limit: req.body.limit,
|
limit: req.body.limit,
|
||||||
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
|
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
|
||||||
ignoreSitemap: !selfHosted ? true : false,
|
ignoreSitemap: false,
|
||||||
includeMetadata: true,
|
includeMetadata: true,
|
||||||
includeSubdomains: req.body.includeSubdomains,
|
includeSubdomains: req.body.includeSubdomains,
|
||||||
});
|
});
|
||||||
|
|
||||||
// console.log("mapResults", mapResults);
|
|
||||||
|
|
||||||
let mappedLinks = mapResults.links as MapDocument[];
|
let mappedLinks = mapResults.links as MapDocument[];
|
||||||
|
|
||||||
|
if (mappedLinks.length === 0) {
|
||||||
|
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
|
||||||
|
}
|
||||||
|
|
||||||
// Limit number of links to MAX_EXTRACT_LIMIT
|
// Limit number of links to MAX_EXTRACT_LIMIT
|
||||||
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
|
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
|
||||||
|
|
||||||
@ -94,20 +101,19 @@ export async function extractController(
|
|||||||
(x) =>
|
(x) =>
|
||||||
`url: ${x.url}, title: ${x.title}, description: ${x.description}`,
|
`url: ${x.url}, title: ${x.title}, description: ${x.description}`,
|
||||||
);
|
);
|
||||||
// console.log("mappedLinksRerank", mappedLinksRerank);
|
|
||||||
|
|
||||||
// Filter by path prefix if present
|
|
||||||
// wrong
|
|
||||||
// if (pathPrefix) {
|
|
||||||
// mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
|
|
||||||
// }
|
|
||||||
|
|
||||||
if (req.body.prompt) {
|
if (req.body.prompt) {
|
||||||
|
let searchQuery =
|
||||||
|
req.body.prompt && allowExternalLinks
|
||||||
|
? `${req.body.prompt} ${urlWithoutWww}`
|
||||||
|
: req.body.prompt
|
||||||
|
? `${req.body.prompt} site:${urlWithoutWww}`
|
||||||
|
: `site:${urlWithoutWww}`;
|
||||||
// Get similarity scores between the search query and each link's context
|
// Get similarity scores between the search query and each link's context
|
||||||
const linksAndScores = await performRanking(
|
const linksAndScores = await performRanking(
|
||||||
mappedLinksRerank,
|
mappedLinksRerank,
|
||||||
mappedLinks.map((l) => l.url),
|
mappedLinks.map((l) => l.url),
|
||||||
mapUrl,
|
searchQuery,
|
||||||
);
|
);
|
||||||
|
|
||||||
// First try with high threshold
|
// First try with high threshold
|
||||||
@ -153,20 +159,16 @@ export async function extractController(
|
|||||||
} else {
|
} else {
|
||||||
// Handle direct URLs without glob pattern
|
// Handle direct URLs without glob pattern
|
||||||
if (!isUrlBlocked(url)) {
|
if (!isUrlBlocked(url)) {
|
||||||
// console.log("url", url);
|
|
||||||
return [url];
|
return [url];
|
||||||
}
|
}
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// console.log("urlPromises", urlPromises.length);
|
|
||||||
|
|
||||||
// Wait for all URL processing to complete and flatten results
|
// Wait for all URL processing to complete and flatten results
|
||||||
const processedUrls = await Promise.all(urlPromises);
|
const processedUrls = await Promise.all(urlPromises);
|
||||||
const flattenedUrls = processedUrls.flat().filter(url => url); // Filter out any null/undefined values
|
const flattenedUrls = processedUrls.flat().filter((url) => url); // Filter out any null/undefined values
|
||||||
links.push(...flattenedUrls);
|
links.push(...flattenedUrls);
|
||||||
// console.log("links", links.length, "flattenedUrls", flattenedUrls.length);
|
|
||||||
|
|
||||||
if (links.length === 0) {
|
if (links.length === 0) {
|
||||||
return res.status(400).json({
|
return res.status(400).json({
|
||||||
@ -212,7 +214,7 @@ export async function extractController(
|
|||||||
}
|
}
|
||||||
return doc;
|
return doc;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.error(`Error in scrapeController: ${e}`);
|
logger.error(`Error in extractController: ${e}`);
|
||||||
if (
|
if (
|
||||||
e instanceof Error &&
|
e instanceof Error &&
|
||||||
(e.message.startsWith("Job wait") || e.message === "timeout")
|
(e.message.startsWith("Job wait") || e.message === "timeout")
|
||||||
@ -316,4 +318,3 @@ function filterAndProcessLinks(
|
|||||||
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
|
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -62,3 +62,17 @@ export async function generateCompletions(
|
|||||||
|
|
||||||
return completions;
|
return completions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Sends `prompt` as a single user message to the OpenAI chat completions API
 * and returns the text of the first choice.
 *
 * The model name is read from the `MODEL_NAME` environment variable and falls
 * back to "gpt-4o-mini" when that variable is unset.
 *
 * Rejections from the client call are not caught here; they propagate to the
 * caller.
 *
 * @param prompt - Text used as the content of the only (user) message.
 * @returns The first choice's message content, or `null` when the response
 *   contains no choices or the first choice carries no content.
 */
export async function generateBasicCompletion(
  prompt: string,
): Promise<string | null> {
  const openai = new OpenAI();
  const model = process.env.MODEL_NAME ?? "gpt-4o-mini";

  const completion = await openai.chat.completions.create({
    model,
    messages: [{ role: "user", content: prompt }],
  });

  // Guard against an empty `choices` array (or a choice without a message):
  // yield null instead of throwing a TypeError, so callers can fall back with `??`.
  return completion.choices[0]?.message?.content ?? null;
}
|
||||||
|
14
apps/api/src/lib/extract/build-prompts.ts
Normal file
14
apps/api/src/lib/extract/build-prompts.ts
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
/**
 * Builds the instruction text that asks a model to rewrite a prompt as a
 * short search query targeting `url`.
 *
 * @param prompt - The original prompt to be rephrased. It is embedded as a
 *   JSON string literal so that quotes or line breaks inside it cannot end
 *   the quoted section early or add extra instruction lines.
 * @param url - The site the search query should find results on; inserted
 *   verbatim into the first sentence.
 * @returns The complete instruction text.
 */
export function buildRefrasedPrompt(prompt: string, url: string): string {
  // JSON.stringify yields `"..."` with inner quotes, backslashes and control
  // characters escaped; for a prompt without such characters the output is
  // identical to wrapping it in plain double quotes.
  const quotedPrompt = JSON.stringify(prompt);

  return `You are a search query optimizer. Your task is to rephrase the following prompt into an effective search query that will find relevant results about this topic on ${url}.

Original prompt: ${quotedPrompt}

Provide a rephrased search query that:
1. Maintains the core intent of the original prompt
2. Uses relevant keywords
3. Is optimized for search engine results
4. Is concise and focused
5. Short is better than long

Return only the rephrased search query, without any explanation or additional text.`;
}
|
@ -122,3 +122,4 @@
|
|||||||
// },
|
// },
|
||||||
// };
|
// };
|
||||||
// }
|
// }
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user