mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-06 00:36:08 +08:00
fix handling of badly formatted URLs
This commit is contained in:
parent
ba6f29cdda
commit
e2ddc6c65c
@ -230,8 +230,11 @@ export class WebCrawler {
|
||||
|
||||
const $ = load(html);
|
||||
$("a").each((_, element) => {
|
||||
const href = $(element).attr("href");
|
||||
let href = $(element).attr("href");
|
||||
if (href) {
|
||||
if (href.match(/^https?:\/[^\/]/)) {
|
||||
href = href.replace(/^https?:\/[^\/]/, "$&/");
|
||||
}
|
||||
const u = this.filterURL(href, url);
|
||||
if (u !== null) {
|
||||
links.push(u);
|
||||
|
@ -352,12 +352,10 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
|
||||
if (job.data.crawlerOptions !== null) {
|
||||
if (!sc.cancelled) {
|
||||
const newURL = new URL(doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!);
|
||||
const useNewURLAsBase = newURL.hostname.split(".").slice(-2).join(".") === new URL(sc.originUrl!).hostname.split(".").slice(-2).join(".");
|
||||
const crawler = crawlToCrawler(job.data.crawl_id, sc, useNewURLAsBase ? newURL.href : undefined);
|
||||
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!);
|
||||
|
||||
const links = crawler.filterLinks(
|
||||
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
|
||||
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!),
|
||||
Infinity,
|
||||
sc.crawlerOptions?.maxDepth ?? 10
|
||||
);
|
||||
|
Loading…
x
Reference in New Issue
Block a user