Merge pull request #336 from snippet/allow-external-content-links

[Proposal] new feature allowExternalContentLinks
This commit is contained in:
Rafael Miller 2024-07-02 09:45:21 -03:00 committed by GitHub
commit f0f449fe51
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 110 additions and 16 deletions

View File

@ -826,6 +826,46 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
}, 180000); }, 180000);
it.concurrent("should crawl external content links when allowed", async () => {
  // Kick off a crawl with external-content links enabled.
  const crawlInitResponse = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send({
      url: "https://mendable.ai",
      crawlerOptions: {
        allowExternalContentLinks: true,
        ignoreSitemap: true,
        returnOnlyUrls: true,
        limit: 50
      }
    });

  expect(crawlInitResponse.statusCode).toBe(200);
  expect(crawlInitResponse.body).toHaveProperty("jobId");

  // Initialize so the variable is definitely assigned before the first
  // comparison (strict TS flags the original use-before-assignment).
  let crawlStatus: string = "";
  let crawlData = [];
  while (crawlStatus !== "completed") {
    const statusResponse = await request(TEST_URL)
      .get(`/v0/crawl/status/${crawlInitResponse.body.jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
    crawlStatus = statusResponse.body.status;
    if (statusResponse.body.data) {
      crawlData = statusResponse.body.data;
    }
    // Bail out on a terminal failure instead of spinning until the
    // 3-minute jest timeout fires.
    if (crawlStatus === "failed") {
      throw new Error("Crawl job failed before completing");
    }
    if (crawlStatus !== "completed") {
      await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
    }
  }

  expect(crawlData.length).toBeGreaterThan(0);
  // The crawl of an internal page should surface these external links.
  expect(crawlData).toEqual(expect.arrayContaining([
    expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }),
    expect.objectContaining({ url: expect.stringContaining("https://mendable.ai/pricing") }),
    expect.objectContaining({ url: expect.stringContaining("https://x.com/CalebPeffer") })
  ]));
}, 180000); // 3 minutes timeout
}); });
describe("POST /v0/crawlWebsitePreview", () => { describe("POST /v0/crawlWebsitePreview", () => {

View File

@ -52,6 +52,7 @@ export type CrawlerOptions = {
ignoreSitemap?: boolean; ignoreSitemap?: boolean;
mode?: "default" | "fast"; // have a mode of some sort mode?: "default" | "fast"; // have a mode of some sort
allowBackwardCrawling?: boolean; allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean;
} }
export type WebScraperOptions = { export type WebScraperOptions = {

View File

@ -23,6 +23,7 @@ export class WebCrawler {
private robots: any; private robots: any;
private generateImgAltText: boolean; private generateImgAltText: boolean;
private allowBackwardCrawling: boolean; private allowBackwardCrawling: boolean;
private allowExternalContentLinks: boolean;
constructor({ constructor({
initialUrl, initialUrl,
@ -32,7 +33,8 @@ export class WebCrawler {
limit = 10000, limit = 10000,
generateImgAltText = false, generateImgAltText = false,
maxCrawledDepth = 10, maxCrawledDepth = 10,
allowBackwardCrawling = false allowBackwardCrawling = false,
allowExternalContentLinks = false
}: { }: {
initialUrl: string; initialUrl: string;
includes?: string[]; includes?: string[];
@ -42,6 +44,7 @@ export class WebCrawler {
generateImgAltText?: boolean; generateImgAltText?: boolean;
maxCrawledDepth?: number; maxCrawledDepth?: number;
allowBackwardCrawling?: boolean; allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean;
}) { }) {
this.initialUrl = initialUrl; this.initialUrl = initialUrl;
this.baseUrl = new URL(initialUrl).origin; this.baseUrl = new URL(initialUrl).origin;
@ -55,6 +58,7 @@ export class WebCrawler {
this.maxCrawledDepth = maxCrawledDepth ?? 10; this.maxCrawledDepth = maxCrawledDepth ?? 10;
this.generateImgAltText = generateImgAltText ?? false; this.generateImgAltText = generateImgAltText ?? false;
this.allowBackwardCrawling = allowBackwardCrawling ?? false; this.allowBackwardCrawling = allowBackwardCrawling ?? false;
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
} }
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
@ -98,9 +102,10 @@ export class WebCrawler {
const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
// Ensure the protocol and hostname match, and the path starts with the initial URL's path // Ensure the protocol and hostname match, and the path starts with the initial URL's path
// Host check disabled so links to other hosts can be handled when
// allowExternalContentLinks is enabled (external links are filtered later).
// if (linkHostname !== initialHostname) {
//   return false;
// }
if (!this.allowBackwardCrawling) { if (!this.allowBackwardCrawling) {
if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
@ -278,16 +283,25 @@ export class WebCrawler {
const path = urlObj.pathname; const path = urlObj.pathname;
if ( if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
this.isInternalLink(fullUrl) && if (this.isInternalLink(fullUrl) &&
this.noSections(fullUrl) && this.noSections(fullUrl) &&
// The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
// this.matchesIncludes(path) &&
!this.matchesExcludes(path) && !this.matchesExcludes(path) &&
this.isRobotsAllowed(fullUrl) this.isRobotsAllowed(fullUrl)
) { ) {
links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
} }
} else { // EXTERNAL LINKS
if (
this.isInternalLink(url) &&
this.allowExternalContentLinks &&
!this.isSocialMediaOrEmail(fullUrl) &&
!this.matchesExcludes(fullUrl, true) &&
!this.isExternalMainPage(fullUrl)
) {
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
}
}
} }
}); });
@ -320,9 +334,41 @@ export class WebCrawler {
return this.includes.some((pattern) => new RegExp(pattern).test(url)); return this.includes.some((pattern) => new RegExp(pattern).test(url));
} }
// Checks `url` against the configured exclude patterns.
// When `onlyDomains` is true the excludes are treated as domain/path
// prefixes (external-link mode); otherwise they are regex patterns.
// Fix: the previous version wrapped everything in an extra
// `this.excludes.some(...)` whose callback ignored its own `pattern`,
// re-running the full inner scan (and the domain matcher) once per
// exclude entry, and it dropped the blank-excludes guard.
private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
  if (onlyDomains) {
    return this.matchesExcludesExternalDomains(url);
  }
  // An empty or blank exclude list excludes nothing; without this guard
  // a [""] entry would compile to `new RegExp("")` and match every URL.
  if (this.excludes.length === 0 || this.excludes[0] == "") return false;
  return this.excludes.some((pattern) => new RegExp(pattern).test(url));
}
// supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com"
private matchesExcludesExternalDomains(url: string) {
try {
const urlObj = new URL(url);
const hostname = urlObj.hostname;
const pathname = urlObj.pathname;
for (let domain of this.excludes) {
let domainObj = new URL('http://' + domain.replace(/^https?:\/\//, ''));
let domainHostname = domainObj.hostname;
let domainPathname = domainObj.pathname;
if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) {
if (pathname.startsWith(domainPathname)) {
return true;
}
}
}
return false;
} catch (e) {
return false;
}
}
// Returns true when `url` points at a site root — i.e. it has no
// non-empty path segments beyond "scheme://host/" (e.g. "https://example.com/").
private isExternalMainPage(url: string): boolean {
  const pathSegments = url.split("/").slice(3);
  const nonEmptySegments = pathSegments.filter((segment) => segment.length > 0);
  return nonEmptySegments.length === 0;
}
private noSections(link: string): boolean { private noSections(link: string): boolean {
@ -375,6 +421,10 @@ export class WebCrawler {
"instagram.com", "instagram.com",
"pinterest.com", "pinterest.com",
"mailto:", "mailto:",
"github.com",
"calendly.com",
"discord.gg",
"discord.com",
]; ];
return socialMediaOrEmail.some((ext) => url.includes(ext)); return socialMediaOrEmail.some((ext) => url.includes(ext));
} }

View File

@ -40,6 +40,7 @@ export class WebScraperDataProvider {
"gpt-4-turbo"; "gpt-4-turbo";
private crawlerMode: string = "default"; private crawlerMode: string = "default";
private allowBackwardCrawling: boolean = false; private allowBackwardCrawling: boolean = false;
private allowExternalContentLinks: boolean = false;
authorize(): void { authorize(): void {
throw new Error("Method not implemented."); throw new Error("Method not implemented.");
@ -173,6 +174,7 @@ export class WebScraperDataProvider {
limit: this.limit, limit: this.limit,
generateImgAltText: this.generateImgAltText, generateImgAltText: this.generateImgAltText,
allowBackwardCrawling: this.allowBackwardCrawling, allowBackwardCrawling: this.allowBackwardCrawling,
allowExternalContentLinks: this.allowExternalContentLinks,
}); });
let links = await crawler.start( let links = await crawler.start(
@ -496,6 +498,7 @@ export class WebScraperDataProvider {
this.crawlerMode = options.crawlerOptions?.mode ?? "default"; this.crawlerMode = options.crawlerOptions?.mode ?? "default";
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false; this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false; this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
this.allowExternalContentLinks = options.crawlerOptions?.allowExternalContentLinks ?? false;
// make sure all urls start with https:// // make sure all urls start with https://
this.urls = this.urls.map((url) => { this.urls = this.urls.map((url) => {