feat(crawl): add parameter to treat differing query parameters as different URLs (#892)

* add `ignoreQueryParameters` parameter to `crawlerOptions`

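* add code to make it work: pass the stored crawl to `normalizeURL` so the query string is stripped only when `ignoreQueryParameters` is set (or the crawl has no crawler options)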
This commit is contained in:
Gergő Móricz 2024-11-11 21:36:22 +01:00 committed by GitHub
parent 5cb46dc494
commit a8dc75f762
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 19 additions and 16 deletions

View File

@ -195,6 +195,7 @@ export async function crawlController(req: Request, res: Response) {
await lockURLs( await lockURLs(
id, id,
sc,
jobs.map((x) => x.data.url) jobs.map((x) => x.data.url)
); );
await addCrawlJobs( await addCrawlJobs(

View File

@ -76,6 +76,7 @@ export async function batchScrapeController(
await lockURLs( await lockURLs(
id, id,
sc,
jobs.map((x) => x.data.url) jobs.map((x) => x.data.url)
); );
await addCrawlJobs( await addCrawlJobs(

View File

@ -133,6 +133,7 @@ export async function crawlController(
await lockURLs( await lockURLs(
id, id,
sc,
jobs.map((x) => x.data.url) jobs.map((x) => x.data.url)
); );
await addCrawlJobs( await addCrawlJobs(

View File

@ -205,6 +205,7 @@ const crawlerOptions = z.object({
allowExternalLinks: z.boolean().default(false), allowExternalLinks: z.boolean().default(false),
ignoreSitemap: z.boolean().default(true), ignoreSitemap: z.boolean().default(true),
deduplicateSimilarURLs: z.boolean().default(true), deduplicateSimilarURLs: z.boolean().default(true),
ignoreQueryParameters: z.boolean().default(false),
}).strict(strictMessage); }).strict(strictMessage);
// export type CrawlerOptions = { // export type CrawlerOptions = {
@ -460,6 +461,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
allowExternalContentLinks: x.allowExternalLinks, allowExternalContentLinks: x.allowExternalLinks,
ignoreSitemap: x.ignoreSitemap, ignoreSitemap: x.ignoreSitemap,
deduplicateSimilarURLs: x.deduplicateSimilarURLs, deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
}; };
} }
@ -474,6 +476,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
allowExternalLinks: x.allowExternalContentLinks, allowExternalLinks: x.allowExternalContentLinks,
ignoreSitemap: x.ignoreSitemap, ignoreSitemap: x.ignoreSitemap,
deduplicateSimilarURLs: x.deduplicateSimilarURLs, deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
}), }),
internalOptions: { internalOptions: {
v0CrawlOnlyUrls: x.returnOnlyUrls, v0CrawlOnlyUrls: x.returnOnlyUrls,

View File

@ -90,9 +90,11 @@ export async function getThrottledJobs(teamId: string): Promise<string[]> {
return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity); return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity);
} }
export function normalizeURL(url: string): string { export function normalizeURL(url: string, sc: StoredCrawl): string {
const urlO = new URL(url); const urlO = new URL(url);
urlO.search = ""; if (!sc.crawlerOptions || sc.crawlerOptions.ignoreQueryParameters) {
urlO.search = "";
}
urlO.hash = ""; urlO.hash = "";
return urlO.href; return urlO.href;
} }
@ -130,12 +132,15 @@ export function generateURLPermutations(url: string | URL): URL[] {
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> { export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
if (typeof sc.crawlerOptions?.limit === "number") { if (typeof sc.crawlerOptions?.limit === "number") {
if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) { if (await redisConnection.scard("crawl:" + id + ":visited_unique") >= sc.crawlerOptions.limit) {
return false; return false;
} }
} }
url = normalizeURL(url); url = normalizeURL(url, sc);
await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");
let res: boolean; let res: boolean;
if (!sc.crawlerOptions.deduplicateSimilarURLs) { if (!sc.crawlerOptions.deduplicateSimilarURLs) {
@ -150,18 +155,9 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
} }
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap /// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
export async function lockURLs(id: string, urls: string[]): Promise<boolean> { export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Promise<boolean> {
urls = urls.map(url => { urls = urls.map(url => {
try { return normalizeURL(url, sc);
const urlO = new URL(url);
urlO.search = "";
urlO.hash = "";
return urlO.href;
} catch (error) {
logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
}
return url;
}); });
const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0 const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0

View File

@ -320,7 +320,7 @@ async function processJob(job: Job & { id: string }, token: string) {
if (job.data.crawl_id) { if (job.data.crawl_id) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url) !== normalizeURL(doc.metadata.sourceURL)) { if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc)) {
logger.debug("Was redirected, locking new URL..."); logger.debug("Was redirected, locking new URL...");
await lockURL(job.data.crawl_id, sc, doc.metadata.url); await lockURL(job.data.crawl_id, sc, doc.metadata.url);
} }

View File

@ -155,6 +155,7 @@ export interface CrawlParams {
scrapeOptions?: CrawlScrapeOptions; scrapeOptions?: CrawlScrapeOptions;
webhook?: string; webhook?: string;
deduplicateSimilarURLs?: boolean; deduplicateSimilarURLs?: boolean;
ignoreQueryParameters?: boolean;
} }
/** /**