mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-05 20:16:03 +08:00
Nick:
This commit is contained in:
parent
d430cfcbfb
commit
a4f15260a7
@ -10,8 +10,6 @@ import {
|
|||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { getJobPriority } from "../../lib/job-priority";
|
import { getJobPriority } from "../../lib/job-priority";
|
||||||
import { PlanType } from "../../types";
|
import { PlanType } from "../../types";
|
||||||
import { rerankDocuments } from "../../lib/extract/reranker";
|
|
||||||
import { generateBasicCompletion } from "../../lib/extract/completions";
|
|
||||||
import { getMapResults } from "./map";
|
import { getMapResults } from "./map";
|
||||||
|
|
||||||
|
|
||||||
@ -40,7 +38,7 @@ export async function extractController(
|
|||||||
const mappedDocuments: MapDocument[] = [];
|
const mappedDocuments: MapDocument[] = [];
|
||||||
|
|
||||||
const prompt = req.body.prompt;
|
const prompt = req.body.prompt;
|
||||||
const keywords = await generateBasicCompletion(`If the user's prompt is: "${prompt}", what are the most important keywords besides the extraction task? Output only the keywords, separated by commas.`);
|
// const keywords = await generateBasicCompletion(`If the user's prompt is: "${prompt}", what are the most important keywords besides the extraction task? Output only the keywords, separated by commas.`);
|
||||||
|
|
||||||
for (const url of urls) {
|
for (const url of urls) {
|
||||||
if (url.endsWith("/*")) {
|
if (url.endsWith("/*")) {
|
||||||
@ -57,15 +55,15 @@ export async function extractController(
|
|||||||
subId: req.acuc?.sub_id,
|
subId: req.acuc?.sub_id,
|
||||||
includeMetadata: true
|
includeMetadata: true
|
||||||
});
|
});
|
||||||
// top 3 links
|
// // top 3 links
|
||||||
const top3Links = (mapResults.links as MapDocument[]).slice(0, 3);
|
// const top3Links = (mapResults.links as MapDocument[]).slice(0, 3);
|
||||||
console.log(top3Links);
|
// console.log(top3Links);
|
||||||
// console.log(top3Links);
|
// // console.log(top3Links);
|
||||||
mappedDocuments.push(...(mapResults.links as MapDocument[]));
|
// mappedDocuments.push(...(mapResults.links as MapDocument[]));
|
||||||
// transform mappedUrls to just documents
|
// // transform mappedUrls to just documents
|
||||||
// we quickly rerank
|
// // we quickly rerank
|
||||||
const rerank = await rerankDocuments(mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`), "What URLs are most relevant to the following prompt: " + (req.body.prompt || '').toLocaleLowerCase().replace("extract", " ").replace("extract ", " "));
|
// const rerank = await rerankDocuments(mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`), "What URLs are most relevant to the following prompt: " + (req.body.prompt || '').toLocaleLowerCase().replace("extract", " ").replace("extract ", " "));
|
||||||
console.log(rerank);
|
// console.log(rerank);
|
||||||
} else {
|
} else {
|
||||||
mappedDocuments.push({ url });
|
mappedDocuments.push({ url });
|
||||||
}
|
}
|
||||||
|
@ -82,7 +82,7 @@ export async function generateFinalExtraction({
|
|||||||
const jsonCompletion = await openai.beta.chat.completions.parse({
|
const jsonCompletion = await openai.beta.chat.completions.parse({
|
||||||
model,
|
model,
|
||||||
messages: [
|
messages: [
|
||||||
{ role: "system", content: systemPrompt },
|
{ role: "system", content: systemPrompt ?? "" },
|
||||||
{ role: "user", content: [{ type: "text", text: extractionContent }] },
|
{ role: "user", content: [{ type: "text", text: extractionContent }] },
|
||||||
{
|
{
|
||||||
role: "user",
|
role: "user",
|
||||||
@ -108,9 +108,8 @@ export async function generateFinalExtraction({
|
|||||||
}
|
}
|
||||||
|
|
||||||
const extraction = jsonCompletion.choices[0].message.parsed;
|
const extraction = jsonCompletion.choices[0].message.parsed;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
content: extraction,
|
content: extraction ?? "",
|
||||||
metadata: {
|
metadata: {
|
||||||
numTokens,
|
numTokens,
|
||||||
warning,
|
warning,
|
||||||
|
@ -18,6 +18,7 @@ import { logger } from "../lib/logger";
|
|||||||
import { scrapeStatusController } from "../controllers/v1/scrape-status";
|
import { scrapeStatusController } from "../controllers/v1/scrape-status";
|
||||||
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
|
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
|
||||||
import { batchScrapeController } from "../controllers/v1/batch-scrape";
|
import { batchScrapeController } from "../controllers/v1/batch-scrape";
|
||||||
|
import { extractController } from "../controllers/v1/extract";
|
||||||
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
|
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
|
||||||
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
|
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
|
||||||
// import { searchController } from "../../src/controllers/v1/search";
|
// import { searchController } from "../../src/controllers/v1/search";
|
||||||
@ -178,6 +179,14 @@ v1Router.ws(
|
|||||||
crawlStatusWSController
|
crawlStatusWSController
|
||||||
);
|
);
|
||||||
|
|
||||||
|
v1Router.post(
|
||||||
|
"/extract",
|
||||||
|
authMiddleware(RateLimiterMode.Scrape),
|
||||||
|
checkCreditsMiddleware(1),
|
||||||
|
blocklistMiddleware,
|
||||||
|
wrap(extractController)
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
|
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
|
||||||
@ -199,3 +208,4 @@ v1Router.delete(
|
|||||||
// Health/Probe routes
|
// Health/Probe routes
|
||||||
// v1Router.get("/health/liveness", livenessController);
|
// v1Router.get("/health/liveness", livenessController);
|
||||||
// v1Router.get("/health/readiness", readinessController);
|
// v1Router.get("/health/readiness", readinessController);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user