mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-07 18:49:00 +08:00
fix(crawl-redis): normalize URL before locking
This commit is contained in:
parent
96245e387d
commit
fe721fffbe
@ -1,5 +1,6 @@
|
|||||||
import { WebCrawler } from "../scraper/WebScraper/crawler";
|
import { WebCrawler } from "../scraper/WebScraper/crawler";
|
||||||
import { redisConnection } from "../services/queue-service";
|
import { redisConnection } from "../services/queue-service";
|
||||||
|
import { Logger } from "./logger";
|
||||||
|
|
||||||
export type StoredCrawl = {
|
export type StoredCrawl = {
|
||||||
originUrl: string;
|
originUrl: string;
|
||||||
@ -88,6 +89,16 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const urlO = new URL(url);
|
||||||
|
urlO.search = "";
|
||||||
|
urlO.hash = "";
|
||||||
|
url = urlO.href;
|
||||||
|
} catch (error) {
|
||||||
|
Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
||||||
|
}
|
||||||
|
|
||||||
const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
||||||
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
||||||
return res;
|
return res;
|
||||||
@ -95,6 +106,19 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
|
|||||||
|
|
||||||
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
|
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
|
||||||
export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
|
export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
|
||||||
|
urls = urls.map(url => {
|
||||||
|
try {
|
||||||
|
const urlO = new URL(url);
|
||||||
|
urlO.search = "";
|
||||||
|
urlO.hash = "";
|
||||||
|
return urlO.href;
|
||||||
|
} catch (error) {
|
||||||
|
Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
|
||||||
|
}
|
||||||
|
|
||||||
|
return url;
|
||||||
|
});
|
||||||
|
|
||||||
const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
|
const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
|
||||||
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
||||||
return res;
|
return res;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user