Merge branch 'main' into nsc/new-extract

Nicolas 2024-11-20 16:41:13 -08:00
commit c78dae178b
8 changed files with 1534 additions and 10 deletions

View File

@@ -2,7 +2,7 @@
Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to set up the project locally so you can run it yourself (and contribute).
If you're contributing, note that the process is similar to other open source repos, i.e. fork firecrawl, make changes, run tests, open a PR. If you have any questions or would like help getting on board, reach out to hello@mendable.ai or submit an issue!
If you're contributing, note that the process is similar to other open source repos, i.e. fork firecrawl, make changes, run tests, open a PR. If you have any questions or would like help getting on board, reach out to help@firecrawl.com or submit an issue!
## Running the project locally

View File

@@ -77,10 +77,10 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
- **LLM-ready formats**: markdown, structured data, screenshot, HTML, links, metadata
- **The hard stuff**: proxies, anti-bot mechanisms, dynamic content (js-rendered), output parsing, orchestration
- **Customizability**: exclude tags, crawl behind auth walls with custom headers, max crawl depth, etc...
- **Media parsing**: pdfs, docx, images.
- **Reliability first**: designed to get the data you need - no matter how hard it is.
- **Media parsing**: pdfs, docx, images
- **Reliability first**: designed to get the data you need - no matter how hard it is
- **Actions**: click, scroll, input, wait and more before extracting data
- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint
- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint.
You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev)

View File

@@ -230,6 +230,7 @@ const crawlerOptions = z.object({
limit: z.number().default(10000), // default?
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
allowExternalLinks: z.boolean().default(false),
allowSubdomains: z.boolean().default(false),
ignoreSitemap: z.boolean().default(true),
deduplicateSimilarURLs: z.boolean().default(true),
ignoreQueryParameters: z.boolean().default(false),
@@ -502,6 +503,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
generateImgAltText: false,
allowBackwardCrawling: x.allowBackwardLinks,
allowExternalContentLinks: x.allowExternalLinks,
allowSubdomains: x.allowSubdomains,
ignoreSitemap: x.ignoreSitemap,
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
@@ -517,6 +519,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
maxDepth: x.maxDepth,
allowBackwardLinks: x.allowBackwardCrawling,
allowExternalLinks: x.allowExternalContentLinks,
allowSubdomains: x.allowSubdomains,
ignoreSitemap: x.ignoreSitemap,
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
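Taken together, the three hunks above thread the new allowSubdomains flag from the request schema through both legacy-option converters. A minimal sketch of the round trip, assuming the remaining schema fields fall back to their defaults (the input here is made up):

// Sketch only: parse a hypothetical request body against the crawlerOptions
// schema above, then push it through the converters touched by this diff.
const parsed = crawlerOptions.parse({ allowSubdomains: true });

const legacy = toLegacyCrawlerOptions(parsed);
// legacy.allowSubdomains === true, alongside allowBackwardCrawling and allowExternalContentLinks

const { crawlOptions } = fromLegacyCrawlerOptions(legacy);
// crawlOptions.allowSubdomains survives the round trip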

View File

@@ -148,7 +148,8 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
} else {
const permutations = generateURLPermutations(url);
res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
const x = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href)));
res = x === permutations.length;
}
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
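The lockURL change only splits the original one-liner so the SADD return value can be inspected before comparing it to the number of permutations; the locking semantics are unchanged. A hedged restatement of the idea, with generateURLPermutations and the Redis key as in the diff and a made-up URL:

// A URL counts as newly locked only if every one of its permutations was
// previously unseen: SADD returns how many members were actually added, so a
// result smaller than permutations.length means some permutation was already visited.
const permutations = generateURLPermutations("https://example.com/page");
const added = await redisConnection.sadd(
  "crawl:" + id + ":visited",
  ...permutations.map((p) => p.href),
);
const isNew = added === permutations.length;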
@@ -179,6 +180,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
});
if (sc.robots !== undefined) {

View File

@@ -23,6 +23,7 @@ export class WebCrawler {
private generateImgAltText: boolean;
private allowBackwardCrawling: boolean;
private allowExternalContentLinks: boolean;
private allowSubdomains: boolean;
constructor({
jobId,
@@ -35,7 +36,8 @@
generateImgAltText = false,
maxCrawledDepth = 10,
allowBackwardCrawling = false,
allowExternalContentLinks = false
allowExternalContentLinks = false,
allowSubdomains = false,
}: {
jobId: string;
initialUrl: string;
@@ -48,6 +50,7 @@
maxCrawledDepth?: number;
allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean;
allowSubdomains?: boolean;
}) {
this.jobId = jobId;
this.initialUrl = initialUrl;
@@ -63,6 +66,7 @@
this.generateImgAltText = generateImgAltText ?? false;
this.allowBackwardCrawling = allowBackwardCrawling ?? false;
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
this.allowSubdomains = allowSubdomains ?? false;
}
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
@@ -214,6 +218,10 @@
}
}
if (this.allowSubdomains && !this.isSocialMediaOrEmail(fullUrl) && this.isSubdomain(fullUrl)) {
return fullUrl;
}
return null;
}
@@ -222,8 +230,11 @@
const $ = load(html);
$("a").each((_, element) => {
const href = $(element).attr("href");
let href = $(element).attr("href");
if (href) {
if (href.match(/^https?:\/[^\/]/)) {
href = href.replace(/^https?:\//, "$&/");
}
const u = this.filterURL(href, url);
if (u !== null) {
links.push(u);
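The switch from const to let above lets the crawler repair hrefs whose protocol separator is missing a slash before they reach filterURL. A small sketch of what the regex pair does, with made-up inputs:

// "$&" in the replacement re-inserts the matched "http(s):/" prefix, and the
// appended "/" restores the missing slash; well-formed URLs never match the test.
const fixProtocol = (href: string): string =>
  /^https?:\/[^\/]/.test(href) ? href.replace(/^https?:\//, "$&/") : href;

fixProtocol("https:/example.com/page"); // -> "https://example.com/page"
fixProtocol("http:/foo.bar/baz");       // -> "http://foo.bar/baz"
fixProtocol("https://already.fine/x");  // -> unchanged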
@@ -297,6 +308,10 @@
return linkDomain === baseDomain;
}
private isSubdomain(link: string): boolean {
return new URL(link, this.baseUrl).hostname.endsWith("." + new URL(this.baseUrl).hostname.split(".").slice(-2).join("."));
}
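Combined with the filterURL hunk above, a link is now kept when allowSubdomains is set, it is not a social/email URL, and isSubdomain matches; isSubdomain keeps the last two labels of the base hostname and checks for that suffix. A standalone sketch of the check, assuming a made-up base URL:

// Mirrors the isSubdomain logic from the diff outside the class, with an assumed base.
const baseUrl = "https://firecrawl.dev";

const isSubdomain = (link: string): boolean => {
  const baseRoot = new URL(baseUrl).hostname.split(".").slice(-2).join("."); // "firecrawl.dev"
  return new URL(link, baseUrl).hostname.endsWith("." + baseRoot);
};

isSubdomain("https://docs.firecrawl.dev/sdk"); // true  -> followed when allowSubdomains is on
isSubdomain("https://firecrawl.dev/pricing");  // false -> the bare domain stays on the existing same-domain path
isSubdomain("https://example.com/");           // false -> still treated as external

Note that the last-two-labels heuristic treats anything under a shared public suffix (for example a base hosted on *.co.uk) as a subdomain, which may be broader than intended for such domains.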
public isFile(url: string): boolean {
const fileExtensions = [
".png",

View File

@@ -350,12 +350,12 @@ async function processJob(job: Job & { id: string }, token: string) {
await addCrawlJobDone(job.data.crawl_id, job.id);
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
if (job.data.crawlerOptions !== null) {
if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl);
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!);
const links = crawler.filterLinks(
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!),
Infinity,
sc.crawlerOptions?.maxDepth ?? 10
);

View File

@@ -160,6 +160,7 @@ const testSuiteTokens = [
"6c46abb",
"cb0ff78",
"fd769b2",
"4c2638d",
"cbb3462", // don't remove (s-ai)
"824abcd" // don't remove (s-ai)
];

File diff suppressed because one or more lines are too long