Merge pull request #336 from snippet/allow-external-content-links
[Proposal] new feature allowExternalContentLinks
This commit is contained in:
commit f0f449fe51
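For orientation only (not part of this commit), here is a minimal sketch of how a client might enable the new option through the /v0/crawl endpoint. The route, headers, and crawlerOptions fields mirror the E2E test added below; the base URL, port, and API-key handling are assumptions.

// Hypothetical client-side sketch; assumes a Firecrawl API reachable at FIRECRAWL_URL.
const FIRECRAWL_URL = process.env.FIRECRAWL_URL ?? "http://localhost:3002";

async function startExternalContentCrawl(apiKey: string): Promise<string> {
  const res = await fetch(`${FIRECRAWL_URL}/v0/crawl`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      url: "https://mendable.ai",
      crawlerOptions: {
        allowExternalContentLinks: true, // new flag: follow links that leave the initial domain
        ignoreSitemap: true,
        returnOnlyUrls: true,
        limit: 50,
      },
    }),
  });
  const body = await res.json();
  return body.jobId; // then poll /v0/crawl/status/{jobId} until status is "completed", as the test below does
}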
@@ -826,6 +826,46 @@ describe("E2E Tests for API Routes", () => {
       expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
     }, 180000);
 
+    it.concurrent("should crawl external content links when allowed", async () => {
+      const crawlInitResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://mendable.ai",
+          crawlerOptions: {
+            allowExternalContentLinks: true,
+            ignoreSitemap: true,
+            returnOnlyUrls: true,
+            limit: 50
+          }
+        });
+
+      expect(crawlInitResponse.statusCode).toBe(200);
+      expect(crawlInitResponse.body).toHaveProperty("jobId");
+
+      let crawlStatus: string;
+      let crawlData = [];
+      while (crawlStatus !== "completed") {
+        const statusResponse = await request(TEST_URL)
+          .get(`/v0/crawl/status/${crawlInitResponse.body.jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+        crawlStatus = statusResponse.body.status;
+        if (statusResponse.body.data) {
+          crawlData = statusResponse.body.data;
+        }
+        if (crawlStatus !== "completed") {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+      console.log(crawlData)
+      expect(crawlData.length).toBeGreaterThan(0);
+      expect(crawlData).toEqual(expect.arrayContaining([
+        expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }),
+        expect.objectContaining({ url: expect.stringContaining("https://mendable.ai/pricing") }),
+        expect.objectContaining({ url: expect.stringContaining("https://x.com/CalebPeffer") })
+      ]));
+    }, 180000); // 3 minutes timeout
   });
 
   describe("POST /v0/crawlWebsitePreview", () => {
@@ -52,6 +52,7 @@ export type CrawlerOptions = {
   ignoreSitemap?: boolean;
   mode?: "default" | "fast"; // have a mode of some sort
   allowBackwardCrawling?: boolean;
+  allowExternalContentLinks?: boolean;
 }
 
 export type WebScraperOptions = {
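For illustration only, a sketch of a CrawlerOptions value using the new field; every field name comes from the type in this hunk, and the values match the E2E test above.

// Hypothetical options object built from the CrawlerOptions type shown here.
const crawlerOptions: CrawlerOptions = {
  mode: "default",
  ignoreSitemap: true,
  allowBackwardCrawling: false,
  allowExternalContentLinks: true, // opt in to crawling links on other domains
};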
@@ -23,6 +23,7 @@ export class WebCrawler {
   private robots: any;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
+  private allowExternalContentLinks: boolean;
 
   constructor({
     initialUrl,
@@ -32,7 +33,8 @@ export class WebCrawler {
     limit = 10000,
     generateImgAltText = false,
     maxCrawledDepth = 10,
-    allowBackwardCrawling = false
+    allowBackwardCrawling = false,
+    allowExternalContentLinks = false
   }: {
     initialUrl: string;
     includes?: string[];
@@ -42,6 +44,7 @@ export class WebCrawler {
     generateImgAltText?: boolean;
     maxCrawledDepth?: number;
     allowBackwardCrawling?: boolean;
+    allowExternalContentLinks?: boolean;
   }) {
     this.initialUrl = initialUrl;
     this.baseUrl = new URL(initialUrl).origin;
@@ -55,6 +58,7 @@
     this.maxCrawledDepth = maxCrawledDepth ?? 10;
     this.generateImgAltText = generateImgAltText ?? false;
     this.allowBackwardCrawling = allowBackwardCrawling ?? false;
+    this.allowExternalContentLinks = allowExternalContentLinks ?? false;
   }
 
   private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
@@ -98,9 +102,10 @@
       const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
 
       // Ensure the protocol and hostname match, and the path starts with the initial URL's path
-      if (linkHostname !== initialHostname) {
-        return false;
-      }
+      // commented to able to handling external link on allowExternalContentLinks
+      // if (linkHostname !== initialHostname) {
+      //   return false;
+      // }
 
       if (!this.allowBackwardCrawling) {
         if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
@@ -278,15 +283,24 @@
       const path = urlObj.pathname;
 
 
-        if (
-          this.isInternalLink(fullUrl) &&
-          this.noSections(fullUrl) &&
-          // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
-          // this.matchesIncludes(path) &&
-          !this.matchesExcludes(path) &&
-          this.isRobotsAllowed(fullUrl)
-        ) {
-          links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
+        if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
+          if (this.isInternalLink(fullUrl) &&
+            this.noSections(fullUrl) &&
+            !this.matchesExcludes(path) &&
+            this.isRobotsAllowed(fullUrl)
+          ) {
+            links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
+          }
+        } else { // EXTERNAL LINKS
+          if (
+            this.isInternalLink(url) &&
+            this.allowExternalContentLinks &&
+            !this.isSocialMediaOrEmail(fullUrl) &&
+            !this.matchesExcludes(fullUrl, true) &&
+            !this.isExternalMainPage(fullUrl)
+          ) {
+            links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
+          }
         }
       }
     });
@@ -320,9 +334,41 @@
     return this.includes.some((pattern) => new RegExp(pattern).test(url));
   }
 
-  private matchesExcludes(url: string): boolean {
-    if (this.excludes.length === 0 || this.excludes[0] == "") return false;
-    return this.excludes.some((pattern) => new RegExp(pattern).test(url));
+  private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
+    return this.excludes.some((pattern) => {
+      if (onlyDomains)
+        return this.matchesExcludesExternalDomains(url);
+
+      return this.excludes.some((pattern) => new RegExp(pattern).test(url));
+    });
   }
+
+  // supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com"
+  private matchesExcludesExternalDomains(url: string) {
+    try {
+      const urlObj = new URL(url);
+      const hostname = urlObj.hostname;
+      const pathname = urlObj.pathname;
+
+      for (let domain of this.excludes) {
+        let domainObj = new URL('http://' + domain.replace(/^https?:\/\//, ''));
+        let domainHostname = domainObj.hostname;
+        let domainPathname = domainObj.pathname;
+
+        if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) {
+          if (pathname.startsWith(domainPathname)) {
+            return true;
+          }
+        }
+      }
+      return false;
+    } catch (e) {
+      return false;
+    }
+  }
+
+  private isExternalMainPage(url:string):boolean {
+    return !Boolean(url.split("/").slice(3).filter(subArray => subArray.length > 0).length)
+  }
 
   private noSections(link: string): boolean {
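The "supported formats" comment above can be made concrete with a small standalone sketch (not part of the commit) that restates matchesExcludesExternalDomains: an exclude entry matches when its hostname equals the link's hostname (or is a parent domain of it) and its path is a prefix of the link's path. The example domains below are hypothetical.

// Standalone restatement of the domain matching shown above, for illustration.
function excludedByDomain(url: string, excludes: string[]): boolean {
  try {
    const { hostname, pathname } = new URL(url);
    for (const domain of excludes) {
      const domainObj = new URL("http://" + domain.replace(/^https?:\/\//, ""));
      const sameOrSubdomain =
        hostname === domainObj.hostname || hostname.endsWith(`.${domainObj.hostname}`);
      if (sameOrSubdomain && pathname.startsWith(domainObj.pathname)) return true;
    }
    return false;
  } catch (e) {
    return false; // malformed URLs are treated as not excluded, as in the diff
  }
}

excludedByDomain("https://blog.example.com/post", ["example.com"]);      // true  (subdomain match)
excludedByDomain("https://example.com/blog/post", ["example.com/blog"]); // true  (path-prefix match)
excludedByDomain("https://example.com/pricing", ["example.com/blog"]);   // false (different path)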
@@ -375,6 +421,10 @@
       "instagram.com",
       "pinterest.com",
       "mailto:",
+      "github.com",
+      "calendly.com",
+      "discord.gg",
+      "discord.com",
     ];
     return socialMediaOrEmail.some((ext) => url.includes(ext));
   }
 
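Putting the crawler changes together: with allowExternalContentLinks enabled, a discovered link that leaves the initial domain is kept only if it is not a social-media/email link, not excluded by domain, and not a bare external homepage. A standalone sketch (not part of the commit) of that decision, using only the helpers visible in the hunks above, shows why the E2E test expects https://firecrawl.dev/?ref=mendable+banner in the results but not https://firecrawl.dev/ on its own. The isInternalLink(url) check on the originating page is omitted, socialMediaOrEmail is only the subset visible in the hunk above, and domain excludes are assumed empty.

// Illustrative sketch; list and helpers mirror the diff, example URLs are hypothetical.
const socialMediaOrEmail = [
  "instagram.com", "pinterest.com", "mailto:",
  "github.com", "calendly.com", "discord.gg", "discord.com",
];

const isSocialMediaOrEmail = (url: string) =>
  socialMediaOrEmail.some((ext) => url.includes(ext));

// true when the URL has no non-empty path segments, e.g. "https://firecrawl.dev/"
const isExternalMainPage = (url: string) =>
  !Boolean(url.split("/").slice(3).filter((seg) => seg.length > 0).length);

const keepExternalLink = (fullUrl: string) =>
  !isSocialMediaOrEmail(fullUrl) && !isExternalMainPage(fullUrl);

keepExternalLink("https://firecrawl.dev/?ref=mendable+banner"); // true  - external page, kept
keepExternalLink("https://firecrawl.dev/");                     // false - bare external homepage, dropped
keepExternalLink("https://discord.gg/example");                 // false - social/community link, dropped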
@@ -40,6 +40,7 @@ export class WebScraperDataProvider {
     "gpt-4-turbo";
   private crawlerMode: string = "default";
   private allowBackwardCrawling: boolean = false;
+  private allowExternalContentLinks: boolean = false;
 
   authorize(): void {
     throw new Error("Method not implemented.");
@@ -173,6 +174,7 @@
       limit: this.limit,
       generateImgAltText: this.generateImgAltText,
       allowBackwardCrawling: this.allowBackwardCrawling,
+      allowExternalContentLinks: this.allowExternalContentLinks,
     });
 
     let links = await crawler.start(
@@ -496,6 +498,7 @@
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
     this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
     this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
+    this.allowExternalContentLinks = options.crawlerOptions?.allowExternalContentLinks ?? false;
 
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {