diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts
index 9496d569..fac53b38 100644
--- a/apps/api/src/scraper/WebScraper/utils/metadata.ts
+++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts
@@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
   description = soup('meta[name="description"]').attr("content") || null;

   // Assuming the language is part of the URL as per the regex pattern
-  const pattern = /([a-zA-Z]+-[A-Z]{2})/;
-  const match = pattern.exec(url);
-  language = match ? match[1] : null;
+  language = soup('html').attr('lang') || null;

   keywords = soup('meta[name="keywords"]').attr("content") || null;
   robots = soup('meta[name="robots"]').attr("content") || null;
diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
index b0623b8d..724996bc 100644
--- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
+++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
@@ -1,4 +1,4 @@
-import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, ScrapeParams, ScrapeResponse } from '../../../index';
+import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index';
 import { v4 as uuidv4 } from 'uuid';
 import dotenv from 'dotenv';
 import { describe, test, expect } from '@jest/globals';
@@ -66,6 +66,7 @@ describe('FirecrawlApp E2E Tests', () => {
     expect(response.data?.rawHtml).toContain("<h1");
@@ -74,7 +75,7 @@ describe('FirecrawlApp E2E Tests', () => {
   test.concurrent('should throw error for blocklisted URL on crawl', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
     const blocklistedUrl = "https://twitter.com/fake-test";
-    await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
+    await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
   });

   test.concurrent('should return successful response for crawl and wait for completion', async () => {
@@ -145,14 +146,13 @@ describe('FirecrawlApp E2E Tests', () => {
     expect(response.data?.[0]).not.toHaveProperty("rawHtml");
     expect(response.data?.[0]).not.toHaveProperty("screenshot");
     expect(response.data?.[0]).not.toHaveProperty("links");
-
     expect(response.data?.[0]).toHaveProperty("metadata");
     expect(response.data?.[0].metadata).toHaveProperty("title");
     expect(response.data?.[0].metadata).toHaveProperty("description");
     expect(response.data?.[0].metadata).toHaveProperty("language");
     expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
     expect(response.data?.[0].metadata).toHaveProperty("statusCode");
-    expect(response.data?.[0].metadata).toHaveProperty("error");
+    expect(response.data?.[0].metadata).not.toHaveProperty("error");
   }, 60000); // 60 seconds timeout

   test.concurrent('should return successful response for crawl with options and wait for completion', async () => {
@@ -203,7 +203,7 @@ describe('FirecrawlApp E2E Tests', () => {
     expect(response.data?.[0].metadata).toHaveProperty("language");
     expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
     expect(response.data?.[0].metadata).toHaveProperty("statusCode");
-    expect(response.data?.[0].metadata).toHaveProperty("error");
+    expect(response.data?.[0].metadata).not.toHaveProperty("error");
   }, 60000); // 60 seconds timeout

   test.concurrent('should handle idempotency key for crawl', async () => {
@@ -211,23 +211,23 @@ describe('FirecrawlApp E2E Tests', () => {
     const uniqueIdempotencyKey = uuidv4();
     const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse;
     expect(response).not.toBeNull();
-    expect(response.jobId).toBeDefined();
+    expect(response.id).toBeDefined();

     await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
   });

   test.concurrent('should check crawl status', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as CrawlStatusResponse;
+    const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse;
     expect(response).not.toBeNull();
-    expect(response.jobId).toBeDefined();
+    expect(response.id).toBeDefined();

-    let statusResponse: any = await app.checkCrawlStatus(response.jobId);
+    let statusResponse: any = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
     const maxChecks = 15;
     let checks = 0;

     while (statusResponse.status === 'scraping' && checks < maxChecks) {
-      await new Promise(resolve => setTimeout(resolve, 1000));
+      await new Promise(resolve => setTimeout(resolve, 5000));
       expect(statusResponse).not.toHaveProperty("partial_data"); // v0
       expect(statusResponse).not.toHaveProperty("current"); // v0
       expect(statusResponse).toHaveProperty("data");
@@ -238,44 +238,70 @@ describe('FirecrawlApp E2E Tests', () => {
       expect(statusResponse).toHaveProperty("next");
       expect(statusResponse.totalCount).toBeGreaterThan(0);
       expect(statusResponse.creditsUsed).toBeGreaterThan(0);
-      expect(statusResponse.expiresAt).toBeGreaterThan(Date.now());
+      expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
       expect(statusResponse.status).toBe("scraping");
       expect(statusResponse.next).toContain("/v1/crawl/");
-      statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponse;
+      statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
       checks++;
     }

+    expect(statusResponse).not.toBeNull();
+    expect(statusResponse).toHaveProperty("totalCount");
+    expect(statusResponse.totalCount).toBeGreaterThan(0);
+    expect(statusResponse).toHaveProperty("creditsUsed");
+    expect(statusResponse.creditsUsed).toBeGreaterThan(0);
+    expect(statusResponse).toHaveProperty("expiresAt");
+    expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
+    expect(statusResponse).toHaveProperty("status");
+    expect(statusResponse.status).toBe("completed");
+    expect(statusResponse.data?.length).toBeGreaterThan(0);
+    expect(statusResponse.data?.[0]).toHaveProperty("markdown");
+    expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10);
+    expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0
+    expect(statusResponse.data?.[0]).toHaveProperty("html");
+    expect(statusResponse.data?.[0].html).toContain("<div");
+  }, 60000); // 60 seconds timeout
+
+  test.concurrent('should throw error for invalid API key on map', async () => {
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
+  });
+
+  test.concurrent('should throw error for blocklisted URL on map', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const blocklistedUrl = "https://facebook.com/fake-test";
+    await expect(app.mapUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
+  });
+
+  test.concurrent('should return successful response with valid preview token', async () => {
+    const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
+    const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
     expect(response).not.toBeNull();
-    expect(response).toHaveProperty("totalCount");
-    expect(response.totalCount).toBeGreaterThan(0);
-    expect(response).toHaveProperty("creditsUsed");
-    expect(response.creditsUsed).toBeGreaterThan(0);
-    expect(response).toHaveProperty("expiresAt");
-    expect(response.expiresAt).toBeGreaterThan(Date.now());
-    expect(response).toHaveProperty("status");
-    expect(response.status).toBe("completed");
-    expect(response).toHaveProperty("next");
-    expect(response.next).toContain("/v1/crawl/");
-    expect(response.data?.length).toBeGreaterThan(0);
-    expect(response.data?.[0]).toHaveProperty("markdown");
-    expect(response.data?.[0].markdown).toContain("_Roast_");
-    expect(response.data?.[0]).not.toHaveProperty('content'); // v0
-    expect(response.data?.[0].markdown).toContain("_Roast_");
-    expect(response.data?.[0]).toHaveProperty("html");
-    expect(response.data?.[0].html).toContain("<h1");
-  }, 60000); // 60 seconds timeout
+  }, 30000); // 30 seconds timeout
+
+  test.concurrent('should return successful response for valid map', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
+    expect(response).not.toBeNull();
+
+    expect(response.links?.length).toBeGreaterThan(0);
+    expect(response.links?.[0]).toContain("https://");
+    const filteredLinks = response.links?.filter((link: string) => link.includes("roastmywebsite.ai"));
+    expect(filteredLinks?.length).toBeGreaterThan(0);
+  }, 30000); // 30 seconds timeout
 });
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index c280206c..90c86a2a 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -214,7 +214,7 @@ export interface CrawlParamsV0 {
  * Defines the structure of the response received after initiating a crawl.
  */
 export interface CrawlResponse {
-  jobId?: string;
+  id?: string;
   url?: string;
   success: boolean;
   error?: string;
@@ -281,7 +281,7 @@ export interface MapParams {
  */
 export interface MapResponse {
   success: boolean;
-  data?: string[];
+  links?: string[];
   error?: string;
 }

@@ -458,36 +458,53 @@ export default class FirecrawlApp {
         headers
       );
       if (response.status === 200) {
-        const jobId: string = this.version == 'v0' ? response.data.jobId : response.data.id;
+        const id: string = this.version == 'v0' ? response.data.jobId : response.data.id;
         let checkUrl: string | undefined = undefined;
         if (waitUntilDone) {
           if (this.version == 'v1') { checkUrl = response.data.url }
-          return this.monitorJobStatus(jobId, headers, pollInterval, checkUrl);
+          return this.monitorJobStatus(id, headers, pollInterval, checkUrl);
         } else {
-          return { success: true, jobId };
+          if (this.version == 'v0') {
+            return {
+              success: true,
+              jobId: id
+            } as CrawlResponseV0;
+          } else {
+            return {
+              success: true,
+              id: id
+            } as CrawlResponse;
+          }
         }
       } else {
        this.handleError(response, "start crawl job");
       }
     } catch (error: any) {
-      console.log(error);
-      throw new Error(error.message);
+      if (error.response.data.error) {
+        throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
+      } else {
+        throw new Error(error.message);
+      }
     }
     return { success: false, error: "Internal server error." };
   }

   /**
    * Checks the status of a crawl job using the Firecrawl API.
-   * @param jobId - The job ID of the crawl operation.
+   * @param id - The ID of the crawl operation.
    * @returns The response containing the job status.
    */
-  async checkCrawlStatus(jobId: string): Promise<CrawlStatusResponse | ErrorResponse> {
+  async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | ErrorResponse> {
+    if (!id) {
+      throw new Error("No crawl ID provided");
+    }
+
     const headers: AxiosRequestHeaders = this.prepareHeaders();
     try {
       const response: AxiosResponse = await this.getRequest(
         this.version == 'v1' ?
-          this.apiUrl + `/${this.version}/crawl/${jobId}` :
-          this.apiUrl + `/${this.version}/crawl/status/${jobId}`,
+          this.apiUrl + `/${this.version}/crawl/${id}` :
+          this.apiUrl + `/${this.version}/crawl/status/${id}`,
         headers
       );
       if (response.status === 200) {
@@ -508,8 +525,12 @@ export default class FirecrawlApp {
         return {
           success: true,
           status: response.data.status,
+          totalCount: response.data.totalCount,
+          creditsUsed: response.data.creditsUsed,
+          expiresAt: new Date(response.data.expiresAt),
+          next: response.data.next,
           data: response.data.data,
-          error: response.data.error,
+          error: response.data.error
         } as CrawlStatusResponse;
       }
     } else {
@@ -537,7 +558,7 @@ export default class FirecrawlApp {
     }
   }

-  async map(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse> {
+  async mapUrl(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse> {
     if (this.version == 'v0') {
       throw new Error("Map is not supported in v0");
     }
@@ -604,23 +625,23 @@ export default class FirecrawlApp {
   /**
    * Monitors the status of a crawl job until completion or failure.
-   * @param jobId - The job ID of the crawl operation.
+   * @param id - The ID of the crawl operation.
    * @param headers - The headers for the request.
    * @param checkInterval - Interval in seconds for job status checks.
    * @returns The final job status or data.
    */
   async monitorJobStatus(
-    jobId: string,
+    id: string,
     headers: AxiosRequestHeaders,
     checkInterval: number,
     checkUrl?: string
-  ): Promise<CrawlStatusResponse> {
+  ): Promise<CrawlStatusResponse | ErrorResponse> {
     let apiUrl: string = '';
     while (true) {
       if (this.version == 'v1') {
-        apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${jobId}`;
+        apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${id}`;
       } else if (this.version == 'v0') {
-        apiUrl = checkUrl ?? this.apiUrl + `/v0/crawl/status/${jobId}`;
+        apiUrl = checkUrl ?? this.apiUrl + `/v0/crawl/status/${id}`;
       }
       const statusResponse: AxiosResponse = await this.getRequest(
         apiUrl,