mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-20 14:49:08 +08:00
Nick: async background index
This commit is contained in:
parent
7a31306be5
commit
e6da214aeb
@ -7,6 +7,8 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/
|
||||
import { buildDocument } from "./build-document";
|
||||
import { billTeam } from "../../services/billing/credit_billing";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
|
||||
import { saveCrawl, StoredCrawl } from "../crawl-redis";
|
||||
|
||||
interface ExtractServiceOptions {
|
||||
request: ExtractRequest;
|
||||
@ -24,6 +26,18 @@ interface ExtractResult {
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
 * Reduces a URL to the origin it should be crawled from: scheme, host and,
 * when one is given explicitly, a non-default port.
 *
 * A trailing `/*` wildcard suffix is removed before parsing.
 *
 * @param url - Absolute URL, optionally ending in `/*`.
 * @returns `<protocol>//<host>` for a parseable URL; otherwise the input
 *   string (with any trailing `/*` removed) unchanged.
 */
function getRootDomain(url: string): string {
  // Drop the trailing "/*" wildcard so the remainder can be parsed as a URL.
  const target = url.endsWith("/*") ? url.slice(0, -2) : url;

  try {
    const { protocol, host } = new URL(target);
    // `host` keeps an explicit non-default port, whereas `hostname` drops it;
    // without the port the result would point at a different server.
    return `${protocol}//${host}`;
  } catch {
    // Not a parseable absolute URL: hand the (de-wildcarded) input back as-is.
    return target;
  }
}
|
||||
|
||||
export async function performExtraction(options: ExtractServiceOptions): Promise<ExtractResult> {
|
||||
const { request, teamId, plan, subId } = options;
|
||||
const scrapeId = crypto.randomUUID();
|
||||
@ -112,6 +126,62 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
|
||||
});
|
||||
}
|
||||
|
||||
// Kickoff background crawl for indexing root domains
|
||||
const rootDomains = new Set(request.urls.map(getRootDomain));
|
||||
rootDomains.forEach(async url => {
|
||||
const crawlId = crypto.randomUUID();
|
||||
|
||||
// Create and save crawl configuration first
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: url,
|
||||
crawlerOptions: {
|
||||
maxDepth: 15,
|
||||
limit: 5000,
|
||||
includePaths: [],
|
||||
excludePaths: [],
|
||||
ignoreSitemap: false,
|
||||
includeSubdomains: true,
|
||||
allowExternalLinks: false,
|
||||
allowBackwardLinks: true
|
||||
},
|
||||
scrapeOptions: {
|
||||
formats: ["markdown"],
|
||||
onlyMainContent: true,
|
||||
waitFor: 0,
|
||||
mobile: false,
|
||||
removeBase64Images: true,
|
||||
fastMode: false,
|
||||
parsePDF: true,
|
||||
skipTlsVerification: false,
|
||||
},
|
||||
internalOptions: {
|
||||
disableSmartWaitCache: true,
|
||||
isBackgroundIndex: true
|
||||
},
|
||||
team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||
createdAt: Date.now(),
|
||||
plan: "hobby", // make it a low concurrency
|
||||
};
|
||||
|
||||
// Save the crawl configuration
|
||||
await saveCrawl(crawlId, sc);
|
||||
|
||||
// Then kick off the job
|
||||
await _addScrapeJobToBullMQ({
|
||||
url,
|
||||
mode: "kickoff" as const,
|
||||
team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||
plan: "hobby", // make it a low concurrency
|
||||
crawlerOptions: sc.crawlerOptions,
|
||||
scrapeOptions: sc.scrapeOptions,
|
||||
internalOptions: sc.internalOptions,
|
||||
origin: "index",
|
||||
crawl_id: crawlId,
|
||||
webhook: null,
|
||||
v1: true,
|
||||
}, {}, crypto.randomUUID(), 50);
|
||||
});
|
||||
|
||||
// Bill team for usage
|
||||
billTeam(teamId, subId, links.length * 5).catch((error) => {
|
||||
logger.error(
|
||||
|
@ -16,6 +16,7 @@ import {
|
||||
ScrapeUrlResponse,
|
||||
} from "../scraper/scrapeURL";
|
||||
import { Engine } from "../scraper/scrapeURL/engines";
|
||||
import { indexPage } from "../lib/extract/index/pinecone";
|
||||
configDotenv();
|
||||
|
||||
export async function startWebScraperPipeline({
|
||||
@ -173,6 +174,12 @@ export async function runWebScraper({
|
||||
creditsToBeBilled = 5;
|
||||
}
|
||||
|
||||
// If the team is the background index team, return the response
|
||||
if(team_id === process.env.BACKGROUND_INDEX_TEAM_ID!) {
|
||||
return response;
|
||||
}
|
||||
|
||||
|
||||
billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
|
||||
logger.error(
|
||||
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
|
||||
|
@ -153,6 +153,7 @@ export type InternalOptions = {
|
||||
v0DisableJsDom?: boolean;
|
||||
|
||||
disableSmartWaitCache?: boolean; // Passed along to fire-engine
|
||||
isBackgroundIndex?: boolean;
|
||||
};
|
||||
|
||||
export type EngineResultsTracker = {
|
||||
|
@ -166,6 +166,7 @@ const testSuiteTokens = [
|
||||
"4c2638d",
|
||||
"cbb3462", // don't remove (s-ai)
|
||||
"824abcd", // don't remove (s-ai)
|
||||
"0966288",
|
||||
];
|
||||
|
||||
const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"];
|
||||
|
Loading…
x
Reference in New Issue
Block a user