mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 10:55:57 +08:00
Update extraction-service.ts
This commit is contained in:
parent
bd81b41d5f
commit
33632d2fe3
@ -127,60 +127,60 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Kickoff background crawl for indexing root domains
|
// Kickoff background crawl for indexing root domains
|
||||||
const rootDomains = new Set(request.urls.map(getRootDomain));
|
// const rootDomains = new Set(request.urls.map(getRootDomain));
|
||||||
rootDomains.forEach(async url => {
|
// rootDomains.forEach(async url => {
|
||||||
const crawlId = crypto.randomUUID();
|
// const crawlId = crypto.randomUUID();
|
||||||
|
|
||||||
// Create and save crawl configuration first
|
// // Create and save crawl configuration first
|
||||||
const sc: StoredCrawl = {
|
// const sc: StoredCrawl = {
|
||||||
originUrl: url,
|
// originUrl: url,
|
||||||
crawlerOptions: {
|
// crawlerOptions: {
|
||||||
maxDepth: 15,
|
// maxDepth: 15,
|
||||||
limit: 5000,
|
// limit: 5000,
|
||||||
includePaths: [],
|
// includePaths: [],
|
||||||
excludePaths: [],
|
// excludePaths: [],
|
||||||
ignoreSitemap: false,
|
// ignoreSitemap: false,
|
||||||
includeSubdomains: true,
|
// includeSubdomains: true,
|
||||||
allowExternalLinks: false,
|
// allowExternalLinks: false,
|
||||||
allowBackwardLinks: true
|
// allowBackwardLinks: true
|
||||||
},
|
// },
|
||||||
scrapeOptions: {
|
// scrapeOptions: {
|
||||||
formats: ["markdown"],
|
// formats: ["markdown"],
|
||||||
onlyMainContent: true,
|
// onlyMainContent: true,
|
||||||
waitFor: 0,
|
// waitFor: 0,
|
||||||
mobile: false,
|
// mobile: false,
|
||||||
removeBase64Images: true,
|
// removeBase64Images: true,
|
||||||
fastMode: false,
|
// fastMode: false,
|
||||||
parsePDF: true,
|
// parsePDF: true,
|
||||||
skipTlsVerification: false,
|
// skipTlsVerification: false,
|
||||||
},
|
// },
|
||||||
internalOptions: {
|
// internalOptions: {
|
||||||
disableSmartWaitCache: true,
|
// disableSmartWaitCache: true,
|
||||||
isBackgroundIndex: true
|
// isBackgroundIndex: true
|
||||||
},
|
// },
|
||||||
team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||||
createdAt: Date.now(),
|
// createdAt: Date.now(),
|
||||||
plan: "hobby", // make it a low concurrency
|
// plan: "hobby", // make it a low concurrency
|
||||||
};
|
// };
|
||||||
|
|
||||||
// Save the crawl configuration
|
// // Save the crawl configuration
|
||||||
await saveCrawl(crawlId, sc);
|
// await saveCrawl(crawlId, sc);
|
||||||
|
|
||||||
// Then kick off the job
|
// // Then kick off the job
|
||||||
await _addScrapeJobToBullMQ({
|
// await _addScrapeJobToBullMQ({
|
||||||
url,
|
// url,
|
||||||
mode: "kickoff" as const,
|
// mode: "kickoff" as const,
|
||||||
team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
// team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
|
||||||
plan: "hobby", // make it a low concurrency
|
// plan: "hobby", // make it a low concurrency
|
||||||
crawlerOptions: sc.crawlerOptions,
|
// crawlerOptions: sc.crawlerOptions,
|
||||||
scrapeOptions: sc.scrapeOptions,
|
// scrapeOptions: sc.scrapeOptions,
|
||||||
internalOptions: sc.internalOptions,
|
// internalOptions: sc.internalOptions,
|
||||||
origin: "index",
|
// origin: "index",
|
||||||
crawl_id: crawlId,
|
// crawl_id: crawlId,
|
||||||
webhook: null,
|
// webhook: null,
|
||||||
v1: true,
|
// v1: true,
|
||||||
}, {}, crypto.randomUUID(), 50);
|
// }, {}, crypto.randomUUID(), 50);
|
||||||
});
|
// });
|
||||||
|
|
||||||
// Bill team for usage
|
// Bill team for usage
|
||||||
billTeam(teamId, subId, links.length * 5).catch((error) => {
|
billTeam(teamId, subId, links.length * 5).catch((error) => {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user