mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 22:19:03 +08:00
Update extraction-service.ts
This commit is contained in:
parent
bd81b41d5f
commit
33632d2fe3
@ -127,60 +127,60 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
|
||||
}
|
||||
|
||||
// Kickoff background crawl for indexing root domains
|
||||
const rootDomains = new Set(request.urls.map(getRootDomain));
|
||||
rootDomains.forEach(async url => {
|
||||
const crawlId = crypto.randomUUID();
|
||||
// const rootDomains = new Set(request.urls.map(getRootDomain));
|
||||
// rootDomains.forEach(async url => {
|
||||
// const crawlId = crypto.randomUUID();
|
||||
|
||||
// Create and save crawl configuration first
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: url,
|
||||
crawlerOptions: {
|
||||
maxDepth: 15,
|
||||
limit: 5000,
|
||||
includePaths: [],
|
||||
excludePaths: [],
|
||||
ignoreSitemap: false,
|
||||
includeSubdomains: true,
|
||||
allowExternalLinks: false,
|
||||
allowBackwardLinks: true
|
||||
},
|
||||
scrapeOptions: {
|
||||
formats: ["markdown"],
|
||||
onlyMainContent: true,
|
||||
waitFor: 0,
|
||||
mobile: false,
|
||||
removeBase64Images: true,
|
||||
fastMode: false,
|
||||
parsePDF: true,
|
||||
skipTlsVerification: false,
|
||||
},
|
||||
internalOptions: {
|
||||
disableSmartWaitCache: true,
|
||||
isBackgroundIndex: true
|
||||
},
|
||||
team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||
createdAt: Date.now(),
|
||||
plan: "hobby", // make it a low concurrency
|
||||
};
|
||||
// // Create and save crawl configuration first
|
||||
// const sc: StoredCrawl = {
|
||||
// originUrl: url,
|
||||
// crawlerOptions: {
|
||||
// maxDepth: 15,
|
||||
// limit: 5000,
|
||||
// includePaths: [],
|
||||
// excludePaths: [],
|
||||
// ignoreSitemap: false,
|
||||
// includeSubdomains: true,
|
||||
// allowExternalLinks: false,
|
||||
// allowBackwardLinks: true
|
||||
// },
|
||||
// scrapeOptions: {
|
||||
// formats: ["markdown"],
|
||||
// onlyMainContent: true,
|
||||
// waitFor: 0,
|
||||
// mobile: false,
|
||||
// removeBase64Images: true,
|
||||
// fastMode: false,
|
||||
// parsePDF: true,
|
||||
// skipTlsVerification: false,
|
||||
// },
|
||||
// internalOptions: {
|
||||
// disableSmartWaitCache: true,
|
||||
// isBackgroundIndex: true
|
||||
// },
|
||||
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||
// createdAt: Date.now(),
|
||||
// plan: "hobby", // make it a low concurrency
|
||||
// };
|
||||
|
||||
// Save the crawl configuration
|
||||
await saveCrawl(crawlId, sc);
|
||||
// // Save the crawl configuration
|
||||
// await saveCrawl(crawlId, sc);
|
||||
|
||||
// Then kick off the job
|
||||
await _addScrapeJobToBullMQ({
|
||||
url,
|
||||
mode: "kickoff" as const,
|
||||
team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||
plan: "hobby", // make it a low concurrency
|
||||
crawlerOptions: sc.crawlerOptions,
|
||||
scrapeOptions: sc.scrapeOptions,
|
||||
internalOptions: sc.internalOptions,
|
||||
origin: "index",
|
||||
crawl_id: crawlId,
|
||||
webhook: null,
|
||||
v1: true,
|
||||
}, {}, crypto.randomUUID(), 50);
|
||||
});
|
||||
// // Then kick off the job
|
||||
// await _addScrapeJobToBullMQ({
|
||||
// url,
|
||||
// mode: "kickoff" as const,
|
||||
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||
// plan: "hobby", // make it a low concurrency
|
||||
// crawlerOptions: sc.crawlerOptions,
|
||||
// scrapeOptions: sc.scrapeOptions,
|
||||
// internalOptions: sc.internalOptions,
|
||||
// origin: "index",
|
||||
// crawl_id: crawlId,
|
||||
// webhook: null,
|
||||
// v1: true,
|
||||
// }, {}, crypto.randomUUID(), 50);
|
||||
// });
|
||||
|
||||
// Bill team for usage
|
||||
billTeam(teamId, subId, links.length * 5).catch((error) => {
|
||||
|
Loading…
x
Reference in New Issue
Block a user