Merge branch 'main' into nsc/new-extract

Nicolas 2024-11-20 16:41:13 -08:00
commit c78dae178b
8 changed files with 1534 additions and 10 deletions

View File

@@ -2,7 +2,7 @@
Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to set up the project locally so you can run it yourself (and contribute).
If you're contributing, note that the process is similar to other open source repos, i.e. fork firecrawl, make changes, run tests, open a PR. If you have any questions or would like help getting on board, reach out to hello@mendable.ai or submit an issue!
If you're contributing, note that the process is similar to other open source repos, i.e. fork firecrawl, make changes, run tests, open a PR. If you have any questions or would like help getting on board, reach out to help@firecrawl.com or submit an issue!
## Running the project locally

View File

@@ -77,10 +77,10 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
- **LLM-ready formats**: markdown, structured data, screenshot, HTML, links, metadata
- **The hard stuff**: proxies, anti-bot mechanisms, dynamic content (js-rendered), output parsing, orchestration
- **Customizability**: exclude tags, crawl behind auth walls with custom headers, max crawl depth, etc...
- **Media parsing**: pdfs, docx, images.
- **Reliability first**: designed to get the data you need - no matter how hard it is.
- **Media parsing**: pdfs, docx, images
- **Reliability first**: designed to get the data you need - no matter how hard it is
- **Actions**: click, scroll, input, wait and more before extracting data
- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint
- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint.
You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev)

View File

@@ -230,6 +230,7 @@ const crawlerOptions = z.object({
limit: z.number().default(10000), // default?
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
allowExternalLinks: z.boolean().default(false),
allowSubdomains: z.boolean().default(false),
ignoreSitemap: z.boolean().default(true),
deduplicateSimilarURLs: z.boolean().default(true),
ignoreQueryParameters: z.boolean().default(false),
@@ -502,6 +503,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
generateImgAltText: false,
allowBackwardCrawling: x.allowBackwardLinks,
allowExternalContentLinks: x.allowExternalLinks,
allowSubdomains: x.allowSubdomains,
ignoreSitemap: x.ignoreSitemap,
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
@@ -517,6 +519,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
maxDepth: x.maxDepth,
allowBackwardLinks: x.allowBackwardCrawling,
allowExternalLinks: x.allowExternalContentLinks,
allowSubdomains: x.allowSubdomains,
ignoreSitemap: x.ignoreSitemap,
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
ignoreQueryParameters: x.ignoreQueryParameters,
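Taken together, the three hunks above thread the new allowSubdomains flag from the request schema through both legacy-option converters. A minimal sketch of the round trip, assuming the remaining schema fields fall back to their defaults (the input here is made up):

// Sketch only: parse a hypothetical request body against the crawlerOptions
// schema above, then push it through the converters touched by this diff.
const parsed = crawlerOptions.parse({ allowSubdomains: true });

const legacy = toLegacyCrawlerOptions(parsed);
// legacy.allowSubdomains === true, alongside allowBackwardCrawling and allowExternalContentLinks

const { crawlOptions } = fromLegacyCrawlerOptions(legacy);
// crawlOptions.allowSubdomains survives the round trip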

View File

@@ -148,7 +148,8 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
} else {
const permutations = generateURLPermutations(url);
res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
const x = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href)));
res = x === permutations.length;
}
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
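The lockURL change only splits the original one-liner so the SADD return value can be inspected before comparing it to the number of permutations; the locking semantics are unchanged. A hedged restatement of the idea, with generateURLPermutations and the Redis key as in the diff and a made-up URL:

// A URL counts as newly locked only if every one of its permutations was
// previously unseen: SADD returns how many members were actually added, so a
// result smaller than permutations.length means some permutation was already visited.
const permutations = generateURLPermutations("https://example.com/page");
const added = await redisConnection.sadd(
  "crawl:" + id + ":visited",
  ...permutations.map((p) => p.href),
);
const isNew = added === permutations.length;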
@@ -179,6 +180,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
});
if (sc.robots !== undefined) {

View File

@@ -23,6 +23,7 @@ export class WebCrawler {
private generateImgAltText: boolean;
private allowBackwardCrawling: boolean;
private allowExternalContentLinks: boolean;
private allowSubdomains: boolean;
constructor({
jobId,
@@ -35,7 +36,8 @@
generateImgAltText = false,
maxCrawledDepth = 10,
allowBackwardCrawling = false,
allowExternalContentLinks = false
allowExternalContentLinks = false,
allowSubdomains = false,
}: {
jobId: string;
initialUrl: string;
@@ -48,6 +50,7 @@
maxCrawledDepth?: number;
allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean;
allowSubdomains?: boolean;
}) {
this.jobId = jobId;
this.initialUrl = initialUrl;
@@ -63,6 +66,7 @@
this.generateImgAltText = generateImgAltText ?? false;
this.allowBackwardCrawling = allowBackwardCrawling ?? false;
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
this.allowSubdomains = allowSubdomains ?? false;
}
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
@@ -214,6 +218,10 @@
}
}
if (this.allowSubdomains && !this.isSocialMediaOrEmail(fullUrl) && this.isSubdomain(fullUrl)) {
return fullUrl;
}
return null;
}
@@ -222,8 +230,11 @@
const $ = load(html);
$("a").each((_, element) => {
const href = $(element).attr("href");
let href = $(element).attr("href");
if (href) {
if (href.match(/^https?:\/[^\/]/)) {
href = href.replace(/^https?:\//, "$&/");
}
const u = this.filterURL(href, url);
if (u !== null) {
links.push(u);
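The switch from const to let above lets the crawler repair hrefs whose protocol separator is missing a slash before they reach filterURL. A small sketch of what the regex pair does, with made-up inputs:

// "$&" in the replacement re-inserts the matched "http(s):/" prefix, and the
// appended "/" restores the missing slash; well-formed URLs never match the test.
const fixProtocol = (href: string): string =>
  /^https?:\/[^\/]/.test(href) ? href.replace(/^https?:\//, "$&/") : href;

fixProtocol("https:/example.com/page"); // -> "https://example.com/page"
fixProtocol("http:/foo.bar/baz");       // -> "http://foo.bar/baz"
fixProtocol("https://already.fine/x");  // -> unchanged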
@@ -297,6 +308,10 @@
return linkDomain === baseDomain;
}
private isSubdomain(link: string): boolean {
return new URL(link, this.baseUrl).hostname.endsWith("." + new URL(this.baseUrl).hostname.split(".").slice(-2).join("."));
}
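Combined with the filterURL hunk above, a link is now kept when allowSubdomains is set, it is not a social/email URL, and isSubdomain matches; isSubdomain keeps the last two labels of the base hostname and checks for that suffix. A standalone sketch of the check, assuming a made-up base URL:

// Mirrors the isSubdomain logic from the diff outside the class, with an assumed base.
const baseUrl = "https://firecrawl.dev";

const isSubdomain = (link: string): boolean => {
  const baseRoot = new URL(baseUrl).hostname.split(".").slice(-2).join("."); // "firecrawl.dev"
  return new URL(link, baseUrl).hostname.endsWith("." + baseRoot);
};

isSubdomain("https://docs.firecrawl.dev/sdk"); // true  -> followed when allowSubdomains is on
isSubdomain("https://firecrawl.dev/pricing");  // false -> the bare domain stays on the existing same-domain path
isSubdomain("https://example.com/");           // false -> still treated as external

Note that the last-two-labels heuristic treats anything under a shared public suffix (for example a base hosted on *.co.uk) as a subdomain, which may be broader than intended for such domains.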
public isFile(url: string): boolean {
const fileExtensions = [
".png",

View File

@@ -350,12 +350,12 @@ async function processJob(job: Job & { id: string }, token: string) {
await addCrawlJobDone(job.data.crawl_id, job.id);
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
if (job.data.crawlerOptions !== null) {
if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl);
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!);
const links = crawler.filterLinks(
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!),
Infinity,
sc.crawlerOptions?.maxDepth ?? 10
);

View File

@@ -160,6 +160,7 @@ const testSuiteTokens = [
"6c46abb",
"cb0ff78",
"fd769b2",
"4c2638d",
"cbb3462", // don't remove (s-ai)
"824abcd" // don't remove (s-ai)
];

File diff suppressed because one or more lines are too long