Merge pull request #878 from mendableai/mog/deduplicate-urls

feat(crawl): Similar URL deduplication
Nicolas 2024-11-11 14:33:13 -05:00 committed by GitHub
commit 56a1ac07a4
5 changed files with 91 additions and 9 deletions

View File

@@ -203,6 +203,7 @@ const crawlerOptions = z.object({
   allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
   allowExternalLinks: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
+  deduplicateSimilarURLs: z.boolean().default(true),
 }).strict(strictMessage);
 
 // export type CrawlerOptions = {
@@ -457,6 +458,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     allowBackwardCrawling: x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
     ignoreSitemap: x.ignoreSitemap,
+    deduplicateSimilarURLs: x.deduplicateSimilarURLs,
   };
 }
@@ -470,7 +472,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
       allowBackwardLinks: x.allowBackwardCrawling,
       allowExternalLinks: x.allowExternalContentLinks,
       ignoreSitemap: x.ignoreSitemap,
-      // TODO: returnOnlyUrls support
+      deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     }),
     internalOptions: {
       v0CrawlOnlyUrls: x.returnOnlyUrls,
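
The new deduplicateSimilarURLs flag lives in the v1 crawler options schema and defaults to true, so http/https and www/non-www variants of the same page are treated as one URL unless the caller opts out. A minimal sketch of opting out in a crawl request follows; the endpoint and the surrounding fields are assumed from the existing v1 API, and only deduplicateSimilarURLs comes from this change:

// Hypothetical request body; everything except deduplicateSimilarURLs is assumed
// from the existing crawlerOptions schema shown above.
const body = {
  url: "https://firecrawl.dev",
  limit: 100,
  ignoreSitemap: true,
  deduplicateSimilarURLs: false, // opt out: treat http/https and www variants as distinct URLs
};

await fetch("https://api.firecrawl.dev/v1/crawl", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR-API-KEY",
  },
  body: JSON.stringify(body),
});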

View File

@@ -0,0 +1,33 @@
+import { generateURLPermutations } from "./crawl-redis";
+
+describe("generateURLPermutations", () => {
+  it("generates permutations correctly", () => {
+    const bareHttps = generateURLPermutations("https://firecrawl.dev").map(x => x.href);
+    expect(bareHttps.length).toBe(4);
+    expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
+    expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true);
+    expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true);
+    expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);
+
+    const bareHttp = generateURLPermutations("http://firecrawl.dev").map(x => x.href);
+    expect(bareHttp.length).toBe(4);
+    expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
+    expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true);
+    expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true);
+    expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);
+
+    const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(x => x.href);
+    expect(wwwHttps.length).toBe(4);
+    expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
+    expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true);
+    expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true);
+    expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);
+
+    const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(x => x.href);
+    expect(wwwHttp.length).toBe(4);
+    expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
+    expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true);
+    expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true);
+    expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true);
+  })
+});

View File

@@ -90,6 +90,44 @@ export async function getThrottledJobs(teamId: string): Promise<string[]> {
   return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity);
 }
 
+export function normalizeURL(url: string): string {
+  const urlO = new URL(url);
+  urlO.search = "";
+  urlO.hash = "";
+  return urlO.href;
+}
+
+export function generateURLPermutations(url: string | URL): URL[] {
+  const urlO = new URL(url);
+
+  // Construct two versions, one with www., one without
+  const urlWithWWW = new URL(urlO);
+  const urlWithoutWWW = new URL(urlO);
+
+  if (urlO.hostname.startsWith("www.")) {
+    urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
+  } else {
+    urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
+  }
+
+  let permutations = [urlWithWWW, urlWithoutWWW];
+
+  // Construct more versions for http/https
+  permutations = permutations.flatMap(urlO => {
+    if (!["http:", "https:"].includes(urlO.protocol)) {
+      return [urlO];
+    }
+
+    const urlWithHTTP = new URL(urlO);
+    const urlWithHTTPS = new URL(urlO);
+    urlWithHTTP.protocol = "http:";
+    urlWithHTTPS.protocol = "https:";
+
+    return [urlWithHTTP, urlWithHTTPS];
+  });
+
+  return permutations;
+}
+
 export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
   if (typeof sc.crawlerOptions?.limit === "number") {
     if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
@@ -97,16 +135,16 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
     }
   }
 
-  try {
-    const urlO = new URL(url);
-    urlO.search = "";
-    urlO.hash = "";
-    url = urlO.href;
-  } catch (error) {
-    logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
-  }
-
-  const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
+  url = normalizeURL(url);
+
+  let res: boolean;
+  if (!sc.crawlerOptions.deduplicateSimilarURLs) {
+    res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
+  } else {
+    const permutations = generateURLPermutations(url);
+    res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
+  }
+
   await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
   return res;
 }
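
With deduplication enabled, lockURL adds every scheme/www permutation of the normalized URL to the crawl's visited set in a single SADD. Redis returns the number of members that were actually new, so the lock succeeds only when that count equals permutations.length, i.e. no variant of the page has been visited before. A small illustrative sketch of that counting rule with an in-memory Set standing in for Redis (the tryLock helper is hypothetical, not part of this diff):

// Illustrative sketch of the dedup-locking rule; a Set stands in for Redis SADD.
function tryLock(visited: Set<string>, permutations: string[]): boolean {
  let added = 0;
  for (const href of permutations) {
    if (!visited.has(href)) {
      visited.add(href);
      added++; // counts like SADD's return value: number of newly added members
    }
  }
  // Lock succeeds only if *no* permutation had been seen before.
  return added === permutations.length;
}

const visited = new Set<string>();
tryLock(visited, ["https://firecrawl.dev/", "http://firecrawl.dev/", "https://www.firecrawl.dev/", "http://www.firecrawl.dev/"]); // true: first visit
tryLock(visited, ["https://www.firecrawl.dev/", "http://www.firecrawl.dev/", "https://firecrawl.dev/", "http://firecrawl.dev/"]); // false: same page, different variant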

View File

@@ -23,6 +23,7 @@ import {
   getCrawl,
   getCrawlJobs,
   lockURL,
+  normalizeURL,
 } from "../lib/crawl-redis";
 import { StoredCrawl } from "../lib/crawl-redis";
 import { addScrapeJob } from "./queue-jobs";
@@ -319,6 +320,11 @@ async function processJob(job: Job & { id: string }, token: string) {
   if (job.data.crawl_id) {
     const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
 
+    if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url) !== normalizeURL(doc.metadata.sourceURL)) {
+      logger.debug("Was redirected, locking new URL...");
+      await lockURL(job.data.crawl_id, sc, doc.metadata.url);
+    }
+
     await logJob({
       job_id: job.id as string,
       success: true,
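
The worker change covers redirects: once a page is scraped, if the final URL differs from the requested source URL after normalizeURL (which strips query and fragment), the destination URL is locked as well so other jobs in the same crawl do not scrape it again. A small sketch of the comparison with illustrative values:

// Sketch of the redirect check; the URLs below are made-up examples.
import { normalizeURL } from "../lib/crawl-redis";

const sourceURL = "https://firecrawl.dev/blog?ref=newsletter"; // what the crawl requested
const finalURL = "https://firecrawl.dev/blog/";                // where the server redirected

if (normalizeURL(finalURL) !== normalizeURL(sourceURL)) {
  // the page redirected to a different normalized URL;
  // lock the destination too so it is not scraped again
}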

View File

@@ -86,6 +86,8 @@ export interface CrawlScrapeOptions {
     country?: string;
     languages?: string[];
   };
+  skipTlsVerification?: boolean;
+  removeBase64Images?: boolean;
 }
 
 export type Action = {
@@ -151,6 +153,7 @@ export interface CrawlParams {
   ignoreSitemap?: boolean;
   scrapeOptions?: CrawlScrapeOptions;
   webhook?: string;
+  deduplicateSimilarURLs?: boolean;
 }
 
 /**
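
On the JS SDK side the flag surfaces as CrawlParams.deduplicateSimilarURLs. A hedged usage sketch; the FirecrawlApp/crawlUrl call shape is assumed from the existing SDK and is not part of this diff:

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

// deduplicateSimilarURLs comes from the CrawlParams addition above; the other
// parameters follow the existing interface.
const result = await app.crawlUrl("https://firecrawl.dev", {
  limit: 50,
  deduplicateSimilarURLs: true, // collapse http/https and www/non-www variants of a page
});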