mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-07 18:49:00 +08:00
fix(crawl-redis): normalize URL before locking
This commit is contained in:
parent
96245e387d
commit
fe721fffbe
@ -1,5 +1,6 @@
|
||||
import { WebCrawler } from "../scraper/WebScraper/crawler";
|
||||
import { redisConnection } from "../services/queue-service";
|
||||
import { Logger } from "./logger";
|
||||
|
||||
export type StoredCrawl = {
|
||||
originUrl: string;
|
||||
@ -88,6 +89,16 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
const urlO = new URL(url);
|
||||
urlO.search = "";
|
||||
urlO.hash = "";
|
||||
url = urlO.href;
|
||||
} catch (error) {
|
||||
Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
||||
}
|
||||
|
||||
const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
||||
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
||||
return res;
|
||||
@ -95,6 +106,19 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
|
||||
|
||||
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
|
||||
export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
|
||||
urls = urls.map(url => {
|
||||
try {
|
||||
const urlO = new URL(url);
|
||||
urlO.search = "";
|
||||
urlO.hash = "";
|
||||
return urlO.href;
|
||||
} catch (error) {
|
||||
Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
||||
}
|
||||
|
||||
return url;
|
||||
});
|
||||
|
||||
const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
|
||||
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
||||
return res;
|
||||
|
Loading…
x
Reference in New Issue
Block a user