mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-18 03:45:53 +08:00
Nick: fixes
This commit is contained in:
parent
28696da6b2
commit
67a2989874
@ -23,6 +23,7 @@ import { getJobPriority } from "../../lib/job-priority";
|
|||||||
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
|
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
|
||||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||||
import { getMapResults } from "./map";
|
import { getMapResults } from "./map";
|
||||||
|
import { buildDocument } from "../../lib/extract/build-document";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
const redis = new Redis(process.env.REDIS_URL!);
|
const redis = new Redis(process.env.REDIS_URL!);
|
||||||
@ -47,7 +48,7 @@ export async function extractController(
|
|||||||
if (url.includes('/*') || req.body.allowExternalLinks) {
|
if (url.includes('/*') || req.body.allowExternalLinks) {
|
||||||
// Handle glob pattern URLs
|
// Handle glob pattern URLs
|
||||||
const baseUrl = url.replace('/*', '');
|
const baseUrl = url.replace('/*', '');
|
||||||
const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
|
// const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
|
||||||
|
|
||||||
const allowExternalLinks = req.body.allowExternalLinks ?? true;
|
const allowExternalLinks = req.body.allowExternalLinks ?? true;
|
||||||
let urlWithoutWww = baseUrl.replace("www.", "");
|
let urlWithoutWww = baseUrl.replace("www.", "");
|
||||||
@ -76,9 +77,11 @@ export async function extractController(
|
|||||||
let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
|
let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
|
||||||
|
|
||||||
// Filter by path prefix if present
|
// Filter by path prefix if present
|
||||||
if (pathPrefix) {
|
// console.log("pathPrefix", pathPrefix);
|
||||||
mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
|
// wrong
|
||||||
}
|
// if (pathPrefix) {
|
||||||
|
// mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
|
||||||
|
// }
|
||||||
|
|
||||||
if (req.body.prompt) {
|
if (req.body.prompt) {
|
||||||
const linksAndScores : { link: string, linkWithContext: string, score: number, originalIndex: number }[] = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
|
const linksAndScores : { link: string, linkWithContext: string, score: number, originalIndex: number }[] = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
|
||||||
@ -170,6 +173,7 @@ export async function extractController(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
console.log("docs", docs.length);
|
||||||
const completions = await generateOpenAICompletions(
|
const completions = await generateOpenAICompletions(
|
||||||
logger.child({ method: "extractController/generateOpenAICompletions" }),
|
logger.child({ method: "extractController/generateOpenAICompletions" }),
|
||||||
{
|
{
|
||||||
@ -178,7 +182,7 @@ export async function extractController(
|
|||||||
prompt: req.body.prompt,
|
prompt: req.body.prompt,
|
||||||
schema: req.body.schema,
|
schema: req.body.schema,
|
||||||
},
|
},
|
||||||
docs.map(x => x.markdown).join('\n')
|
docs.map(x => buildDocument(x)).join('\n')
|
||||||
);
|
);
|
||||||
|
|
||||||
// console.log("completions", completions);
|
// console.log("completions", completions);
|
||||||
|
17
apps/api/src/lib/extract/build-document.ts
Normal file
17
apps/api/src/lib/extract/build-document.ts
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
import { Document } from "../../controllers/v1/types";
|
||||||
|
|
||||||
|
/**
 * Builds the text representation of a scraped document: its markdown followed
 * by a pretty-printed JSON dump of its metadata.
 *
 * The result is a single string, so callers can join several documents
 * together before passing them on (e.g. to the completion step).
 *
 * @param document - Scraped document; only `markdown` and `metadata` are read.
 * @returns The markdown followed by the metadata section. A missing markdown
 *   is treated as an empty string, and the metadata section is omitted when
 *   the document carries no metadata.
 */
export function buildDocument(document: Document): string {
  const markdown = document.markdown ?? "";
  const metadata = document.metadata;

  // JSON.stringify(undefined) returns undefined, which a template literal
  // would render as the literal text "undefined" -- skip the section instead.
  if (metadata === undefined || metadata === null) {
    return markdown;
  }

  const documentMetadataString = `\nHere is the metadata for the document:\n${JSON.stringify(
    metadata,
    null,
    2
  )}`;

  // Return the combined string; returning only the markdown would silently
  // drop the metadata section that was just built.
  return `${markdown}${documentMetadataString}`;
}
|
@ -63,7 +63,7 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract
|
|||||||
let warning: string | undefined;
|
let warning: string | undefined;
|
||||||
|
|
||||||
const openai = new OpenAI();
|
const openai = new OpenAI();
|
||||||
const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o";
|
const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
|
||||||
|
|
||||||
if (markdown === undefined) {
|
if (markdown === undefined) {
|
||||||
throw new Error("document.markdown is undefined -- this is unexpected");
|
throw new Error("document.markdown is undefined -- this is unexpected");
|
||||||
|
Loading…
x
Reference in New Issue
Block a user