Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-15 10:15:55 +08:00)

various queue fixes

commit ccfada98ca (parent 7a03275575)
@@ -8,21 +8,13 @@ import {
   toLegacyCrawlerOptions,
 } from "./types";
 import {
-  addCrawlJob,
-  addCrawlJobs,
   crawlToCrawler,
-  lockURL,
-  lockURLs,
   saveCrawl,
   StoredCrawl,
 } from "../../lib/crawl-redis";
 import { logCrawl } from "../../services/logging/crawl_log";
-import { getScrapeQueue } from "../../services/queue-service";
-import { _addScrapeJobToBullMQ, addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
+import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
 import { logger as _logger } from "../../lib/logger";
-import { getJobPriority } from "../../lib/job-priority";
-import { callWebhook } from "../../services/webhook";
-import { scrapeOptions as scrapeOptionsSchema } from "./types";
 
 export async function crawlController(
   req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
@@ -315,6 +315,22 @@ export async function lockURLs(
   return res;
 }
 
+export async function lockURLsIndividually(
+  id: string,
+  sc: StoredCrawl,
+  jobs: { id: string; url: string; }[],
+) {
+  const out: typeof jobs = [];
+
+  for (const job of jobs) {
+    if (await lockURL(id, sc, job.url)) {
+      out.push(job);
+    }
+  }
+
+  return out;
+}
+
 export function crawlToCrawler(
   id: string,
   sc: StoredCrawl,
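The new lockURLsIndividually helper differs from the bulk lockURLs above it: it takes { id, url } pairs, attempts lockURL for each one, and returns only the entries whose lock was newly acquired. A minimal usage sketch (the enqueueOnlyNewUrls wrapper and the candidate ids are hypothetical, and the import path assumes a caller two directories above lib/):

import { lockURLsIndividually, StoredCrawl } from "../../lib/crawl-redis";

// Hypothetical wrapper: lock candidate URLs one by one and keep only the jobs
// whose URL was not already locked for this crawl.
async function enqueueOnlyNewUrls(crawlId: string, sc: StoredCrawl, urls: string[]) {
  const candidates = urls.map((url, i) => ({ id: `candidate-${i}`, url }));
  // Entries for URLs that are already locked (e.g. queued earlier) are dropped here.
  const locked = await lockURLsIndividually(crawlId, sc, candidates);
  return locked; // the caller would enqueue scrape jobs only for these
}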
@@ -471,7 +471,7 @@ export class WebCrawler {
   }
 
   private async tryFetchSitemapLinks(url: string, urlsHandler: (urls: string[]) => unknown): Promise<number> {
-    const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;
+    const sitemapUrl = url.endsWith(".xml") ? url : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
 
     let sitemapCount: number = 0;
 
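The one-line change in tryFetchSitemapLinks avoids producing a double slash when the crawl URL already ends with "/". A standalone before/after illustration (example.com is just a placeholder):

const url = "https://example.com/";

// Old interpolation: always appended "/sitemap.xml".
const before = `${url}/sitemap.xml`; // "https://example.com//sitemap.xml"

// New interpolation: only adds the slash when it is missing.
const after = `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`; // "https://example.com/sitemap.xml"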
@@ -26,7 +26,9 @@ import {
   getCrawlJobs,
   lockURL,
   lockURLs,
+  lockURLsIndividually,
   normalizeURL,
+  saveCrawl,
 } from "../lib/crawl-redis";
 import { StoredCrawl } from "../lib/crawl-redis";
 import { addScrapeJob, addScrapeJobs } from "./queue-jobs";
@@ -480,6 +482,47 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
     const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
     const crawler = crawlToCrawler(job.data.crawl_id, sc);
 
+    logger.debug("Locking URL...");
+    await lockURL(job.data.crawl_id, sc, job.data.url);
+    const jobId = uuidv4();
+    logger.debug("Adding scrape job to Redis...", { jobId });
+    await addScrapeJob(
+      {
+        url: job.data.url,
+        mode: "single_urls",
+        team_id: job.data.team_id,
+        crawlerOptions: job.data.crawlerOptions,
+        scrapeOptions: scrapeOptions.parse(job.data.scrapeOptions),
+        internalOptions: sc.internalOptions,
+        plan: job.data.plan!,
+        origin: job.data.origin,
+        crawl_id: job.data.crawl_id,
+        webhook: job.data.webhook,
+        v1: job.data.v1,
+        isCrawlSourceScrape: true,
+      },
+      {
+        priority: 15,
+      },
+      jobId,
+    );
+    logger.debug("Adding scrape job to BullMQ...", { jobId });
+    await addCrawlJob(job.data.crawl_id, jobId);
+
+    if (job.data.webhook) {
+      logger.debug("Calling webhook with crawl.started...", {
+        webhook: job.data.webhook,
+      });
+      await callWebhook(
+        job.data.team_id,
+        job.data.crawl_id,
+        null,
+        job.data.webhook,
+        true,
+        "crawl.started",
+      );
+    }
+
     const sitemap = sc.crawlerOptions.ignoreSitemap
       ? 0
       : await crawler.tryGetSitemap(async (urls) => {
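With this hunk the kickoff job locks and enqueues the crawl's source URL (priority 15, isCrawlSourceScrape: true) and fires the crawl.started webhook before sitemap discovery runs; previously, as the next hunk shows, the source URL was only queued when no sitemap was found and the webhook was sent at the end. Because the source URL's lock is taken first, a sitemap entry for the same URL gets filtered out later. A standalone sketch of that lock semantics, using an in-memory Set in place of Redis (illustrative only):

// Illustrative only: the Redis-backed lockURL behaves like this Set here — the
// second attempt to lock the same URL fails, so the duplicate is never enqueued.
const lockedUrls = new Set<string>();

function tryLock(url: string): boolean {
  if (lockedUrls.has(url)) return false; // already locked -> skip this job
  lockedUrls.add(url);
  return true;
}

tryLock("https://example.com/"); // the kickoff job locks the source URL first

const sitemapUrls = ["https://example.com/", "https://example.com/about"];
const toEnqueue = sitemapUrls.filter(tryLock);
// -> ["https://example.com/about"]: the source URL is not queued a second time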
@@ -522,68 +565,29 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
           });
 
           logger.debug("Locking URLs...");
-          await lockURLs(
+          const lockedIds = await lockURLsIndividually(
             job.data.crawl_id,
             sc,
-            jobs.map((x) => x.data.url),
+            jobs.map((x) => ({ id: x.opts.jobId, url: x.data.url })),
           );
+          const lockedJobs = jobs.filter(x => lockedIds.find(y => y.id === x.opts.jobId));
           logger.debug("Adding scrape jobs to Redis...");
           await addCrawlJobs(
             job.data.crawl_id,
-            jobs.map((x) => x.opts.jobId),
+            lockedJobs.map((x) => x.opts.jobId),
           );
           logger.debug("Adding scrape jobs to BullMQ...");
-          await addScrapeJobs(jobs);
+          await addScrapeJobs(lockedJobs);
         });
 
     if (sitemap === 0) {
       logger.debug("Sitemap not found or ignored.", {
         ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
       });
-
-      logger.debug("Locking URL...");
-      await lockURL(job.data.crawl_id, sc, job.data.url);
-      const jobId = uuidv4();
-      logger.debug("Adding scrape job to Redis...", { jobId });
-      await addScrapeJob(
-        {
-          url: job.data.url,
-          mode: "single_urls",
-          team_id: job.data.team_id,
-          crawlerOptions: job.data.crawlerOptions,
-          scrapeOptions: scrapeOptions.parse(job.data.scrapeOptions),
-          internalOptions: sc.internalOptions,
-          plan: job.data.plan!,
-          origin: job.data.origin,
-          crawl_id: job.data.crawl_id,
-          webhook: job.data.webhook,
-          v1: job.data.v1,
-          isCrawlSourceScrape: true,
-        },
-        {
-          priority: 15,
-        },
-        jobId,
-      );
-      logger.debug("Adding scrape job to BullMQ...", { jobId });
-      await addCrawlJob(job.data.crawl_id, jobId);
     }
 
     logger.debug("Done queueing jobs!");
 
-    if (job.data.webhook) {
-      logger.debug("Calling webhook with crawl.started...", {
-        webhook: job.data.webhook,
-      });
-      await callWebhook(
-        job.data.team_id,
-        job.data.crawl_id,
-        null,
-        job.data.webhook,
-        true,
-        "crawl.started",
-      );
-    }
-
     return { success: true };
   } catch (error) {
     logger.error("An error occurred!", { error });
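The sitemap batch is now filtered in the queue worker: only jobs whose URL was newly locked by lockURLsIndividually are registered via addCrawlJobs and submitted via addScrapeJobs, which prevents re-queueing URLs that are already in flight (such as the source URL locked in the previous hunk). The filter uses Array.prototype.find inside Array.prototype.filter; an equivalent sketch with a Set lookup, in case the batch grows large (names mirror the worker's variables, but the function itself is hypothetical):

// Sketch: same result as jobs.filter(x => lockedIds.find(y => y.id === x.opts.jobId)),
// but with O(1) id lookups via a Set.
function filterLockedJobs<T extends { opts: { jobId: string } }>(
  jobs: T[],
  lockedIds: { id: string; url: string }[],
): T[] {
  const locked = new Set(lockedIds.map((y) => y.id));
  return jobs.filter((x) => locked.has(x.opts.jobId));
}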
@@ -720,16 +724,18 @@ async function processJob(job: Job & { id: string }, token: string) {
       ) {
         const crawler = crawlToCrawler(job.data.crawl_id, sc);
         if (
-          crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null
+          crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null &&
+          !job.data.isCrawlSourceScrape
         ) {
-          if (job.data.isCrawlSourceScrape) {
-            // TODO: re-fetch sitemap for redirect target domain
-            // TODO: reset crawl source url to new target
-          } else {
-            throw new Error(
-              "Redirected target URL is not allowed by crawlOptions",
-            ); // TODO: make this its own error type that is ignored by error tracking
-          }
+          throw new Error(
+            "Redirected target URL is not allowed by crawlOptions",
+          ); // TODO: make this its own error type that is ignored by error tracking
+        }
+
+        if (job.data.isCrawlSourceScrape) {
+          // TODO: re-fetch sitemap for redirect target domain
+          sc.originUrl = doc.metadata.url;
+          await saveCrawl(job.data.crawl_id, sc);
         }
 
         if (isUrlBlocked(doc.metadata.url)) {
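In processJob, the redirect check now only rejects a scrape when the redirected URL is filtered out by the crawler and the job is not the crawl's source scrape; when it is the source scrape, the crawl's stored originUrl is rebased to the redirect target and persisted with saveCrawl (previously that case was a TODO no-op). A condensed sketch of the new policy (the surrounding check that the final URL differs from the requested sourceURL is assumed from context, and handleRedirect is a hypothetical name):

// filteredOut stands for crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null.
function handleRedirect(
  filteredOut: boolean,
  isCrawlSourceScrape: boolean,
): "reject" | "rebase-crawl-origin" | "continue" {
  if (filteredOut && !isCrawlSourceScrape) {
    return "reject"; // throws "Redirected target URL is not allowed by crawlOptions"
  }
  if (isCrawlSourceScrape) {
    return "rebase-crawl-origin"; // sc.originUrl = doc.metadata.url; await saveCrawl(...)
  }
  return "continue";
}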