mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-16 02:35:54 +08:00
Nick: fixes
This commit is contained in:
parent
28696da6b2
commit
67a2989874
@ -23,6 +23,7 @@ import { getJobPriority } from "../../lib/job-priority";
|
||||
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { getMapResults } from "./map";
|
||||
import { buildDocument } from "../../lib/extract/build-document";
|
||||
|
||||
configDotenv();
|
||||
const redis = new Redis(process.env.REDIS_URL!);
|
||||
@ -47,7 +48,7 @@ export async function extractController(
|
||||
if (url.includes('/*') || req.body.allowExternalLinks) {
|
||||
// Handle glob pattern URLs
|
||||
const baseUrl = url.replace('/*', '');
|
||||
const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
|
||||
// const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
|
||||
|
||||
const allowExternalLinks = req.body.allowExternalLinks ?? true;
|
||||
let urlWithoutWww = baseUrl.replace("www.", "");
|
||||
@ -76,9 +77,11 @@ export async function extractController(
|
||||
let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
|
||||
|
||||
// Filter by path prefix if present
|
||||
if (pathPrefix) {
|
||||
mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
|
||||
}
|
||||
// console.log("pathPrefix", pathPrefix);
|
||||
// wrong
|
||||
// if (pathPrefix) {
|
||||
// mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
|
||||
// }
|
||||
|
||||
if (req.body.prompt) {
|
||||
const linksAndScores : { link: string, linkWithContext: string, score: number, originalIndex: number }[] = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
|
||||
@ -170,6 +173,7 @@ export async function extractController(
|
||||
});
|
||||
}
|
||||
|
||||
console.log("docs", docs.length);
|
||||
const completions = await generateOpenAICompletions(
|
||||
logger.child({ method: "extractController/generateOpenAICompletions" }),
|
||||
{
|
||||
@ -178,7 +182,7 @@ export async function extractController(
|
||||
prompt: req.body.prompt,
|
||||
schema: req.body.schema,
|
||||
},
|
||||
docs.map(x => x.markdown).join('\n')
|
||||
docs.map(x => buildDocument(x)).join('\n')
|
||||
);
|
||||
|
||||
// console.log("completions", completions);
|
||||
|
17
apps/api/src/lib/extract/build-document.ts
Normal file
17
apps/api/src/lib/extract/build-document.ts
Normal file
@ -0,0 +1,17 @@
|
||||
import { Document } from "../../controllers/v1/types";
|
||||
|
||||
/**
 * Builds the text representation of a scraped document.
 *
 * The result is the document's markdown only; metadata is not part of the
 * returned string.
 *
 * NOTE(review): an earlier revision also assembled a "markdown + pretty-printed
 * metadata" string, but that string was only written to `console.log` and never
 * returned. The debug log and the dead string construction were removed because
 * they dumped every document's full content to stdout and could throw from
 * `JSON.stringify` on metadata that is not serializable. If metadata was meant
 * to be included in the output, that needs to be an explicit, separate change —
 * confirm with the caller's expectations.
 *
 * @param document - The scraped document; only its `markdown` field is read.
 * @returns The document's markdown, or an empty string when the markdown is
 *   `null` or `undefined`.
 */
export function buildDocument(document: Document): string {
  return document.markdown ?? "";
}
|
@ -63,7 +63,7 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract
|
||||
let warning: string | undefined;
|
||||
|
||||
const openai = new OpenAI();
|
||||
const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o";
|
||||
const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
|
||||
|
||||
if (markdown === undefined) {
|
||||
throw new Error("document.markdown is undefined -- this is unexpected");
|
||||
|
Loading…
x
Reference in New Issue
Block a user