mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-20 19:59:12 +08:00
Nick: async background index
This commit is contained in:
parent
7a31306be5
commit
e6da214aeb
@ -7,6 +7,8 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/
|
|||||||
import { buildDocument } from "./build-document";
|
import { buildDocument } from "./build-document";
|
||||||
import { billTeam } from "../../services/billing/credit_billing";
|
import { billTeam } from "../../services/billing/credit_billing";
|
||||||
import { logJob } from "../../services/logging/log_job";
|
import { logJob } from "../../services/logging/log_job";
|
||||||
|
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
|
||||||
|
import { saveCrawl, StoredCrawl } from "../crawl-redis";
|
||||||
|
|
||||||
interface ExtractServiceOptions {
|
interface ExtractServiceOptions {
|
||||||
request: ExtractRequest;
|
request: ExtractRequest;
|
||||||
@ -24,6 +26,18 @@ interface ExtractResult {
|
|||||||
error?: string;
|
error?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reduces a URL to its scheme and authority (e.g. `https://example.com`).
 *
 * A trailing `/*` wildcard suffix is removed before parsing. An explicit,
 * non-default port is preserved so that the returned value still points at
 * the same server as the input (`http://localhost:3000/a` yields
 * `http://localhost:3000`, not `http://localhost`).
 *
 * @param url - Absolute URL, optionally ending in `/*`.
 * @returns `protocol//host` of the URL, or the input (minus any trailing
 *          `/*`) when it cannot be parsed as a URL.
 */
function getRootDomain(url: string): string {
  // Drop the glob suffix up front so the fallback below returns the same
  // trimmed string the parser was given.
  const target = url.endsWith("/*") ? url.slice(0, -2) : url;

  try {
    const parsed = new URL(target);
    // `host` (unlike `hostname`) keeps a non-default port.
    return `${parsed.protocol}//${parsed.host}`;
  } catch {
    // Not a parseable absolute URL: hand back the trimmed input unchanged.
    return target;
  }
}
|
||||||
|
|
||||||
export async function performExtraction(options: ExtractServiceOptions): Promise<ExtractResult> {
|
export async function performExtraction(options: ExtractServiceOptions): Promise<ExtractResult> {
|
||||||
const { request, teamId, plan, subId } = options;
|
const { request, teamId, plan, subId } = options;
|
||||||
const scrapeId = crypto.randomUUID();
|
const scrapeId = crypto.randomUUID();
|
||||||
@ -112,6 +126,62 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Kickoff background crawl for indexing root domains
|
||||||
|
const rootDomains = new Set(request.urls.map(getRootDomain));
|
||||||
|
rootDomains.forEach(async url => {
|
||||||
|
const crawlId = crypto.randomUUID();
|
||||||
|
|
||||||
|
// Create and save crawl configuration first
|
||||||
|
const sc: StoredCrawl = {
|
||||||
|
originUrl: url,
|
||||||
|
crawlerOptions: {
|
||||||
|
maxDepth: 15,
|
||||||
|
limit: 5000,
|
||||||
|
includePaths: [],
|
||||||
|
excludePaths: [],
|
||||||
|
ignoreSitemap: false,
|
||||||
|
includeSubdomains: true,
|
||||||
|
allowExternalLinks: false,
|
||||||
|
allowBackwardLinks: true
|
||||||
|
},
|
||||||
|
scrapeOptions: {
|
||||||
|
formats: ["markdown"],
|
||||||
|
onlyMainContent: true,
|
||||||
|
waitFor: 0,
|
||||||
|
mobile: false,
|
||||||
|
removeBase64Images: true,
|
||||||
|
fastMode: false,
|
||||||
|
parsePDF: true,
|
||||||
|
skipTlsVerification: false,
|
||||||
|
},
|
||||||
|
internalOptions: {
|
||||||
|
disableSmartWaitCache: true,
|
||||||
|
isBackgroundIndex: true
|
||||||
|
},
|
||||||
|
team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||||
|
createdAt: Date.now(),
|
||||||
|
plan: "hobby", // make it a low concurrency
|
||||||
|
};
|
||||||
|
|
||||||
|
// Save the crawl configuration
|
||||||
|
await saveCrawl(crawlId, sc);
|
||||||
|
|
||||||
|
// Then kick off the job
|
||||||
|
await _addScrapeJobToBullMQ({
|
||||||
|
url,
|
||||||
|
mode: "kickoff" as const,
|
||||||
|
team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||||
|
plan: "hobby", // make it a low concurrency
|
||||||
|
crawlerOptions: sc.crawlerOptions,
|
||||||
|
scrapeOptions: sc.scrapeOptions,
|
||||||
|
internalOptions: sc.internalOptions,
|
||||||
|
origin: "index",
|
||||||
|
crawl_id: crawlId,
|
||||||
|
webhook: null,
|
||||||
|
v1: true,
|
||||||
|
}, {}, crypto.randomUUID(), 50);
|
||||||
|
});
|
||||||
|
|
||||||
// Bill team for usage
|
// Bill team for usage
|
||||||
billTeam(teamId, subId, links.length * 5).catch((error) => {
|
billTeam(teamId, subId, links.length * 5).catch((error) => {
|
||||||
logger.error(
|
logger.error(
|
||||||
|
@ -16,6 +16,7 @@ import {
|
|||||||
ScrapeUrlResponse,
|
ScrapeUrlResponse,
|
||||||
} from "../scraper/scrapeURL";
|
} from "../scraper/scrapeURL";
|
||||||
import { Engine } from "../scraper/scrapeURL/engines";
|
import { Engine } from "../scraper/scrapeURL/engines";
|
||||||
|
import { indexPage } from "../lib/extract/index/pinecone";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
export async function startWebScraperPipeline({
|
export async function startWebScraperPipeline({
|
||||||
@ -173,6 +174,12 @@ export async function runWebScraper({
|
|||||||
creditsToBeBilled = 5;
|
creditsToBeBilled = 5;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the team is the background index team, return the response
|
||||||
|
if(team_id === process.env.BACKGROUND_INDEX_TEAM_ID!) {
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
|
billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
|
||||||
logger.error(
|
logger.error(
|
||||||
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
|
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
|
||||||
|
@ -153,6 +153,7 @@ export type InternalOptions = {
|
|||||||
v0DisableJsDom?: boolean;
|
v0DisableJsDom?: boolean;
|
||||||
|
|
||||||
disableSmartWaitCache?: boolean; // Passed along to fire-engine
|
disableSmartWaitCache?: boolean; // Passed along to fire-engine
|
||||||
|
isBackgroundIndex?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type EngineResultsTracker = {
|
export type EngineResultsTracker = {
|
||||||
|
@ -166,6 +166,7 @@ const testSuiteTokens = [
|
|||||||
"4c2638d",
|
"4c2638d",
|
||||||
"cbb3462", // don't remove (s-ai)
|
"cbb3462", // don't remove (s-ai)
|
||||||
"824abcd", // don't remove (s-ai)
|
"824abcd", // don't remove (s-ai)
|
||||||
|
"0966288",
|
||||||
];
|
];
|
||||||
|
|
||||||
const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"];
|
const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"];
|
||||||
|
Loading…
x
Reference in New Issue
Block a user