Merge branch 'main' into nsc/new-extract
This commit is contained in:
commit c78dae178b
@@ -2,7 +2,7 @@
 
 Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to set up the project locally, so you can run it on your own (and contribute).
 
-If you're contributing, note that the process is similar to other open source repos, i.e. fork firecrawl, make changes, run tests, open a PR. If you have any questions, or would like help getting on board, reach out to hello@mendable.ai or submit an issue!
+If you're contributing, note that the process is similar to other open source repos, i.e. fork firecrawl, make changes, run tests, open a PR. If you have any questions, or would like help getting on board, reach out to help@firecrawl.com or submit an issue!
 
 ## Running the project locally
@@ -77,10 +77,10 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
 
 - **LLM-ready formats**: markdown, structured data, screenshot, HTML, links, metadata
 - **The hard stuff**: proxies, anti-bot mechanisms, dynamic content (js-rendered), output parsing, orchestration
 - **Customizability**: exclude tags, crawl behind auth walls with custom headers, max crawl depth, etc...
-- **Media parsing**: pdfs, docx, images.
-- **Reliability first**: designed to get the data you need - no matter how hard it is.
+- **Media parsing**: pdfs, docx, images
+- **Reliability first**: designed to get the data you need - no matter how hard it is
 - **Actions**: click, scroll, input, wait and more before extracting data
-- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint
+- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint.
 
 You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev)
@@ -230,6 +230,7 @@ const crawlerOptions = z.object({
   limit: z.number().default(10000), // default?
   allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
   allowExternalLinks: z.boolean().default(false),
+  allowSubdomains: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
   deduplicateSimilarURLs: z.boolean().default(true),
   ignoreQueryParameters: z.boolean().default(false),
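For context, a minimal sketch of how the new `allowSubdomains` flag behaves in a schema like the one above: parsing a request body that omits the other fields still yields the zod defaults. The subset schema and request payload below are illustrative, not the real definitions.

```ts
import { z } from "zod";

// Trimmed-down stand-in for the crawlerOptions schema above (hypothetical subset).
const crawlerOptions = z.object({
  limit: z.number().default(10000),
  allowExternalLinks: z.boolean().default(false),
  allowSubdomains: z.boolean().default(false),
  ignoreSitemap: z.boolean().default(true),
});

// An incoming request body that only sets the new flag...
const parsed = crawlerOptions.parse({ allowSubdomains: true });
// ...still gets every other default filled in by zod:
// { limit: 10000, allowExternalLinks: false, allowSubdomains: true, ignoreSitemap: true }
console.log(parsed);
```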
@@ -502,6 +503,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     generateImgAltText: false,
     allowBackwardCrawling: x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
+    allowSubdomains: x.allowSubdomains,
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
@@ -517,6 +519,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
     maxDepth: x.maxDepth,
     allowBackwardLinks: x.allowBackwardCrawling,
     allowExternalLinks: x.allowExternalContentLinks,
+    allowSubdomains: x.allowSubdomains,
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
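Note that `allowSubdomains` keeps the same name on both sides of the legacy mapping, unlike `allowBackwardLinks` / `allowBackwardCrawling`. A rough sketch of that round trip, with simplified stand-in types rather than the real `CrawlerOptions`:

```ts
// Simplified stand-ins for the v1 and legacy option shapes (hypothetical subset).
type CrawlerOptions = { allowBackwardLinks: boolean; allowSubdomains: boolean };
type LegacyCrawlerOptions = { allowBackwardCrawling: boolean; allowSubdomains: boolean };

function toLegacy(x: CrawlerOptions): LegacyCrawlerOptions {
  // v1 "allowBackwardLinks" becomes legacy "allowBackwardCrawling";
  // "allowSubdomains" passes through unchanged.
  return { allowBackwardCrawling: x.allowBackwardLinks, allowSubdomains: x.allowSubdomains };
}

function fromLegacy(x: LegacyCrawlerOptions): CrawlerOptions {
  return { allowBackwardLinks: x.allowBackwardCrawling, allowSubdomains: x.allowSubdomains };
}

// Round-tripping preserves both flags.
const roundTripped = fromLegacy(toLegacy({ allowBackwardLinks: true, allowSubdomains: true }));
console.log(roundTripped); // { allowBackwardLinks: true, allowSubdomains: true }
```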
@@ -148,7 +148,8 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
     res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
   } else {
     const permutations = generateURLPermutations(url);
-    res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
+    const x = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href)));
+    res = x === permutations.length;
   }
 
   await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
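The split assignment above still relies on the same Redis semantics: `SADD` returns how many of the supplied members were newly added to the set. A small standalone sketch, assuming an ioredis connection and a made-up key name:

```ts
import Redis from "ioredis";

const redis = new Redis();

async function demo() {
  // SADD returns how many of the given members were not already in the set.
  await redis.sadd("demo:visited", "https://example.com/a"); // -> 1 (new)
  const added = await redis.sadd(
    "demo:visited",
    "https://example.com/a", // already present
    "https://example.com/b"  // new
  );
  console.log(added); // 1, not 2
  // So in lockURL, "all permutations were newly added" (added === permutations.length)
  // means no equivalent form of this URL has been visited before.
}

demo().then(() => redis.quit());
```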
@@ -179,6 +180,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
     generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
     allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
     allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
+    allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
   });
 
   if (sc.robots !== undefined) {
@@ -23,6 +23,7 @@ export class WebCrawler {
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
+  private allowSubdomains: boolean;
 
   constructor({
     jobId,
@@ -35,7 +36,8 @@ export class WebCrawler {
     generateImgAltText = false,
     maxCrawledDepth = 10,
     allowBackwardCrawling = false,
-    allowExternalContentLinks = false
+    allowExternalContentLinks = false,
+    allowSubdomains = false,
   }: {
     jobId: string;
     initialUrl: string;
@@ -48,6 +50,7 @@ export class WebCrawler {
     maxCrawledDepth?: number;
     allowBackwardCrawling?: boolean;
     allowExternalContentLinks?: boolean;
+    allowSubdomains?: boolean;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -63,6 +66,7 @@ export class WebCrawler {
     this.generateImgAltText = generateImgAltText ?? false;
     this.allowBackwardCrawling = allowBackwardCrawling ?? false;
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
+    this.allowSubdomains = allowSubdomains ?? false;
   }
 
   public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
@@ -214,6 +218,10 @@ export class WebCrawler {
       }
     }
 
+    if (this.allowSubdomains && !this.isSocialMediaOrEmail(fullUrl) && this.isSubdomain(fullUrl)) {
+      return fullUrl;
+    }
+
     return null;
   }
 
@@ -222,8 +230,11 @@
 
     const $ = load(html);
     $("a").each((_, element) => {
-      const href = $(element).attr("href");
+      let href = $(element).attr("href");
       if (href) {
+        if (href.match(/^https?:\/[^\/]/)) {
+          href = href.replace(/^https?:\//, "$&/");
+        }
         const u = this.filterURL(href, url);
         if (u !== null) {
           links.push(u);
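The new `href` normalization targets anchors whose scheme lost a slash (e.g. `https:/example.com/page`); in the replacement string, `$&` stands for the whole match, so the missing `/` is appended right after it. A standalone sketch of the same check:

```ts
// "$&" in String.prototype.replace() inserts the matched substring itself,
// so "https:/" (or "http:/") gains the missing second slash.
function fixProtocolSlash(href: string): string {
  if (href.match(/^https?:\/[^\/]/)) {
    return href.replace(/^https?:\//, "$&/");
  }
  return href;
}

console.log(fixProtocolSlash("https:/example.com/page"));  // "https://example.com/page"
console.log(fixProtocolSlash("https://example.com/page")); // unchanged
console.log(fixProtocolSlash("/relative/path"));           // unchanged
```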
@@ -297,6 +308,10 @@
     return linkDomain === baseDomain;
   }
 
+  private isSubdomain(link: string): boolean {
+    return new URL(link, this.baseUrl).hostname.endsWith("." + new URL(this.baseUrl).hostname.split(".").slice(-2).join("."));
+  }
+
   public isFile(url: string): boolean {
     const fileExtensions = [
       ".png",
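The new `isSubdomain` helper keeps the last two labels of the base hostname and accepts any hostname ending in `.` plus that suffix. A quick sketch of how the check behaves, using a standalone function rather than the class method:

```ts
// Standalone stand-in for WebCrawler.isSubdomain, using the same suffix check.
function isSubdomain(link: string, baseUrl: string): boolean {
  const registrable = new URL(baseUrl).hostname.split(".").slice(-2).join(".");
  return new URL(link, baseUrl).hostname.endsWith("." + registrable);
}

console.log(isSubdomain("https://docs.firecrawl.dev/sdks", "https://firecrawl.dev")); // true
console.log(isSubdomain("https://firecrawl.dev/pricing", "https://firecrawl.dev"));   // false (same host, not a subdomain)
console.log(isSubdomain("https://example.com", "https://firecrawl.dev"));             // false
// Note: only the last two labels are kept, so a base like "https://foo.co.uk" reduces
// to "co.uk" and any *.co.uk host would pass; multi-part public suffixes are not special-cased.
```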
@@ -350,12 +350,12 @@ async function processJob(job: Job & { id: string }, token: string) {
 
     await addCrawlJobDone(job.data.crawl_id, job.id);
 
-    if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
+    if (job.data.crawlerOptions !== null) {
       if (!sc.cancelled) {
-        const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl);
+        const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!);
 
         const links = crawler.filterLinks(
-          crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
+          crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!),
           Infinity,
           sc.crawlerOptions?.maxDepth ?? 10
         );
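The `originUrl` fallbacks here switch from `as string` to the non-null assertion `!`; both are compile-time-only and erase in the emitted JavaScript. A minimal illustration, using a simplified stand-in for the stored-crawl type (not the real `StoredCrawl`):

```ts
// Simplified stand-in; in the real code originUrl comes from the stored crawl state.
interface StoredCrawlLike {
  originUrl?: string; // assumed optional, which is why an assertion is needed
}

function pickBase(sc: StoredCrawlLike, metadataUrl?: string): string {
  // "!" asserts originUrl is defined at this point; it does not change runtime behavior,
  // and it only removes null/undefined from the type rather than forcing a broader cast.
  return metadataUrl ?? sc.originUrl!;
}

console.log(pickBase({ originUrl: "https://firecrawl.dev" }));                               // "https://firecrawl.dev"
console.log(pickBase({ originUrl: "https://firecrawl.dev" }, "https://firecrawl.dev/blog")); // "https://firecrawl.dev/blog"
```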
@@ -160,6 +160,7 @@ const testSuiteTokens = [
   "6c46abb",
   "cb0ff78",
   "fd769b2",
   "4c2638d",
   "cbb3462", // don't remove (s-ai)
+  "824abcd" // don't remove (s-ai)
 ];
File diff suppressed because one or more lines are too long