diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts
index 9496d569..fac53b38 100644
--- a/apps/api/src/scraper/WebScraper/utils/metadata.ts
+++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts
@@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
description = soup('meta[name="description"]').attr("content") || null;
- // Assuming the language is part of the URL as per the regex pattern
- const pattern = /([a-zA-Z]+-[A-Z]{2})/;
- const match = pattern.exec(url);
- language = match ? match[1] : null;
+ language = soup('html').attr('lang') || null;
keywords = soup('meta[name="keywords"]').attr("content") || null;
robots = soup('meta[name="robots"]').attr("content") || null;
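
Side note on the behavior change above: the language now comes from the document's own lang attribute instead of a locale-looking substring in the URL. A minimal sketch of the difference, assuming cheerio's load (the URL and HTML below are made up for illustration):

import { load } from "cheerio";

// Hypothetical inputs, for illustration only.
const url = "https://example.com/foo-BA/page";
const soup = load('<html lang="fr"><body></body></html>');

// Old approach (removed): any "xx-YY"-shaped substring in the URL was treated
// as a language code, so an unrelated path segment like "foo-BA" matched.
const pattern = /([a-zA-Z]+-[A-Z]{2})/;
const match = pattern.exec(url);
console.log(match ? match[1] : null); // "foo-BA", a false positive

// New approach: report the language the page itself declares.
console.log(soup('html').attr('lang') || null); // "fr"
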
diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
index b0623b8d..724996bc 100644
--- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
+++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
@@ -1,4 +1,4 @@
-import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, ScrapeParams, ScrapeResponse } from '../../../index';
+import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index';
import { v4 as uuidv4 } from 'uuid';
import dotenv from 'dotenv';
import { describe, test, expect } from '@jest/globals';
@@ -66,6 +66,7 @@ describe('FirecrawlApp E2E Tests', () => {
expect(response.data?.rawHtml).toContain("<h1");
@@ ... @@ describe('FirecrawlApp E2E Tests', () => {
test.concurrent('should throw error for blocklisted URL on crawl', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const blocklistedUrl = "https://twitter.com/fake-test";
- await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
+ await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
});
test.concurrent('should return successful response for crawl and wait for completion', async () => {
@@ -145,14 +146,13 @@ describe('FirecrawlApp E2E Tests', () => {
expect(response.data?.[0]).not.toHaveProperty("rawHtml");
expect(response.data?.[0]).not.toHaveProperty("screenshot");
expect(response.data?.[0]).not.toHaveProperty("links");
-
expect(response.data?.[0]).toHaveProperty("metadata");
expect(response.data?.[0].metadata).toHaveProperty("title");
expect(response.data?.[0].metadata).toHaveProperty("description");
expect(response.data?.[0].metadata).toHaveProperty("language");
expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
expect(response.data?.[0].metadata).toHaveProperty("statusCode");
- expect(response.data?.[0].metadata).toHaveProperty("error");
+ expect(response.data?.[0].metadata).not.toHaveProperty("error");
}, 60000); // 60 seconds timeout
test.concurrent('should return successful response for crawl with options and wait for completion', async () => {
@@ -203,7 +203,7 @@ describe('FirecrawlApp E2E Tests', () => {
expect(response.data?.[0].metadata).toHaveProperty("language");
expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
expect(response.data?.[0].metadata).toHaveProperty("statusCode");
- expect(response.data?.[0].metadata).toHaveProperty("error");
+ expect(response.data?.[0].metadata).not.toHaveProperty("error");
}, 60000); // 60 seconds timeout
test.concurrent('should handle idempotency key for crawl', async () => {
@@ -211,23 +211,23 @@ describe('FirecrawlApp E2E Tests', () => {
const uniqueIdempotencyKey = uuidv4();
const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse;
expect(response).not.toBeNull();
- expect(response.jobId).toBeDefined();
+ expect(response.id).toBeDefined();
await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
});
test.concurrent('should check crawl status', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
- const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as CrawlStatusResponse;
+ const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse;
expect(response).not.toBeNull();
- expect(response.jobId).toBeDefined();
+ expect(response.id).toBeDefined();
- let statusResponse: any = await app.checkCrawlStatus(response.jobId);
+ let statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
const maxChecks = 15;
let checks = 0;
while (statusResponse.status === 'scraping' && checks < maxChecks) {
- await new Promise(resolve => setTimeout(resolve, 1000));
+ await new Promise(resolve => setTimeout(resolve, 5000));
expect(statusResponse).not.toHaveProperty("partial_data"); // v0
expect(statusResponse).not.toHaveProperty("current"); // v0
expect(statusResponse).toHaveProperty("data");
@@ -238,44 +238,70 @@ describe('FirecrawlApp E2E Tests', () => {
expect(statusResponse).toHaveProperty("next");
expect(statusResponse.totalCount).toBeGreaterThan(0);
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
- expect(statusResponse.expiresAt).toBeGreaterThan(Date.now());
+ expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
expect(statusResponse.status).toBe("scraping");
expect(statusResponse.next).toContain("/v1/crawl/");
- statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponse;
+ statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
checks++;
}
+ expect(statusResponse).not.toBeNull();
+ expect(statusResponse).toHaveProperty("totalCount");
+ expect(statusResponse.totalCount).toBeGreaterThan(0);
+ expect(statusResponse).toHaveProperty("creditsUsed");
+ expect(statusResponse.creditsUsed).toBeGreaterThan(0);
+ expect(statusResponse).toHaveProperty("expiresAt");
+ expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
+ expect(statusResponse).toHaveProperty("status");
+ expect(statusResponse.status).toBe("completed");
+ expect(statusResponse.data?.length).toBeGreaterThan(0);
+ expect(statusResponse.data?.[0]).toHaveProperty("markdown");
+ expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10);
+ expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0
+ expect(statusResponse.data?.[0]).toHaveProperty("html");
+ expect(statusResponse.data?.[0].html).toContain("<div");
+ }, 60000); // 60 seconds timeout
+
+ test.concurrent('should throw error for invalid API key on map', async () => {
+ const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+ await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
+ });
+
+ test.concurrent('should throw error for blocklisted URL on map', async () => {
+ const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+ const blocklistedUrl = "https://facebook.com/fake-test";
+ await expect(app.mapUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
+ });
+
+ test.concurrent('should return successful response with valid preview token', async () => {
+ const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
+ const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
+ expect(response).not.toBeNull();
- expect(response).toHaveProperty("totalCount");
- expect(response.totalCount).toBeGreaterThan(0);
- expect(response).toHaveProperty("creditsUsed");
- expect(response.creditsUsed).toBeGreaterThan(0);
- expect(response).toHaveProperty("expiresAt");
- expect(response.expiresAt).toBeGreaterThan(Date.now());
- expect(response).toHaveProperty("status");
- expect(response.status).toBe("completed");
- expect(response).toHaveProperty("next");
- expect(response.next).toContain("/v1/crawl/");
- expect(response.data?.length).toBeGreaterThan(0);
- expect(response.data?.[0]).toHaveProperty("markdown");
- expect(response.data?.[0].markdown).toContain("_Roast_");
- expect(response.data?.[0]).not.toHaveProperty('content'); // v0
- expect(response.data?.[0].markdown).toContain("_Roast_");
- expect(response.data?.[0]).toHaveProperty("html");
- expect(response.data?.[0].html).toContain("<div");
- }, 60000); // 60 seconds timeout
+ });
+
+ test.concurrent('should return successful response with valid API key', async () => {
+ const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+ const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
+ expect(response).not.toBeNull();
+
+ expect(response.links?.length).toBeGreaterThan(0);
+ expect(response.links?.[0]).toContain("https://");
+ const filteredLinks = response.links?.filter((link: string) => link.includes("roastmywebsite.ai"));
+ expect(filteredLinks?.length).toBeGreaterThan(0);
+ }, 30000); // 30 seconds timeout
});
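
Taken together, the test changes above track the v1 response shape: crawlUrl returns an id rather than a jobId, checkCrawlStatus takes that id, and polling continues while status is "scraping". A minimal consumer sketch of that flow, assuming the published package name @mendable/firecrawl-js and a placeholder API key:

import FirecrawlApp, { CrawlResponse, CrawlStatusResponse } from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-KEY" }); // placeholder key

async function crawlAndWait(url: string) {
  // Start the crawl without waiting for completion (waitUntilDone = false, as in the tests).
  const started = await app.crawlUrl(url, {}, false) as CrawlResponse;
  if (!started.id) throw new Error(started.error ?? "no crawl id returned");

  // Poll until the job leaves the "scraping" state, mirroring the status test above.
  let status = await app.checkCrawlStatus(started.id) as CrawlStatusResponse;
  while (status.status === "scraping") {
    await new Promise((resolve) => setTimeout(resolve, 5000));
    status = await app.checkCrawlStatus(started.id) as CrawlStatusResponse;
  }
  return status; // on completion, status.data holds the scraped documents
}
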
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index c280206c..90c86a2a 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -214,7 +214,7 @@ export interface CrawlParamsV0 {
* Defines the structure of the response received after initiating a crawl.
*/
export interface CrawlResponse {
- jobId?: string;
+ id?: string;
url?: string;
success: boolean;
error?: string;
@@ -281,7 +281,7 @@ export interface MapParams {
*/
export interface MapResponse {
success: boolean;
- data?: string[];
+ links?: string[];
error?: string;
}
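
With the data field renamed to links, map consumers read a flat list of URLs. A short usage sketch under the same assumptions as the example above:

import FirecrawlApp, { MapResponse } from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-KEY" }); // placeholder key

async function mapSite(url: string) {
  const response = await app.mapUrl(url) as MapResponse;
  // links (formerly data) is a flat array of discovered URLs.
  const sameSite = response.links?.filter((link: string) => link.includes(new URL(url).hostname)) ?? [];
  console.log(`${sameSite.length} links on ${new URL(url).hostname}`);
  return sameSite;
}
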
@@ -458,36 +458,53 @@ export default class FirecrawlApp {
headers
);
if (response.status === 200) {
- const jobId: string = this.version == 'v0' ? response.data.jobId : response.data.id;
+ const id: string = this.version == 'v0' ? response.data.jobId : response.data.id;
let checkUrl: string | undefined = undefined;
if (waitUntilDone) {
if (this.version == 'v1') { checkUrl = response.data.url }
- return this.monitorJobStatus(jobId, headers, pollInterval, checkUrl);
+ return this.monitorJobStatus(id, headers, pollInterval, checkUrl);
} else {
- return { success: true, jobId };
+ if (this.version == 'v0') {
+ return {
+ success: true,
+ jobId: id
+ } as CrawlResponseV0;
+ } else {
+ return {
+ success: true,
+ id: id
+ } as CrawlResponse;
+ }
}
} else {
this.handleError(response, "start crawl job");
}
} catch (error: any) {
- console.log(error);
- throw new Error(error.message);
+ if (error.response?.data?.error) {
+ throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error}${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
+ } else {
+ throw new Error(error.message);
+ }
}
return { success: false, error: "Internal server error." };
}
/**
* Checks the status of a crawl job using the Firecrawl API.
- * @param jobId - The job ID of the crawl operation.
+ * @param id - The ID of the crawl operation.
* @returns The response containing the job status.
*/
- async checkCrawlStatus(jobId: string): Promise<CrawlStatusResponse | ErrorResponse> {
+ async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | ErrorResponse> {
+ if (!id) {
+ throw new Error("No crawl ID provided");
+ }
+
const headers: AxiosRequestHeaders = this.prepareHeaders();
try {
const response: AxiosResponse = await this.getRequest(
this.version == 'v1' ?
- this.apiUrl + `/${this.version}/crawl/${jobId}` :
- this.apiUrl + `/${this.version}/crawl/status/${jobId}`,
+ this.apiUrl + `/${this.version}/crawl/${id}` :
+ this.apiUrl + `/${this.version}/crawl/status/${id}`,
headers
);
if (response.status === 200) {
@@ -508,8 +525,12 @@ export default class FirecrawlApp {
return {
success: true,
status: response.data.status,
+ totalCount: response.data.totalCount,
+ creditsUsed: response.data.creditsUsed,
+ expiresAt: new Date(response.data.expiresAt),
+ next: response.data.next,
data: response.data.data,
- error: response.data.error,
+ error: response.data.error
} as CrawlStatusResponse;
}
} else {
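
checkCrawlStatus now returns a normalized payload, notably parsing expiresAt into a Date, which is why the tests compare expiresAt.getTime() against Date.now(). A small sketch of consuming that shape, same assumptions as above (the crawl id is hypothetical):

import FirecrawlApp, { CrawlStatusResponse } from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-KEY" }); // placeholder key

async function reportStatus(crawlId: string) {
  const status = await app.checkCrawlStatus(crawlId) as CrawlStatusResponse;
  // expiresAt arrives as a Date rather than a raw timestamp, so date math is direct.
  const secondsLeft = Math.round((status.expiresAt.getTime() - Date.now()) / 1000);
  console.log(`status=${status.status}, creditsUsed=${status.creditsUsed}, expires in ${secondsLeft}s`);
}
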
@@ -537,7 +558,7 @@ export default class FirecrawlApp {
}
}
- async map(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse> {
+ async mapUrl(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse> {
if (this.version == 'v0') {
throw new Error("Map is not supported in v0");
}
@@ -604,23 +625,23 @@ export default class FirecrawlApp {
/**
* Monitors the status of a crawl job until completion or failure.
- * @param jobId - The job ID of the crawl operation.
+ * @param id - The ID of the crawl operation.
* @param headers - The headers for the request.
* @param checkInterval - Interval in seconds for job status checks.
* @returns The final job status or data.
*/
async monitorJobStatus(
- jobId: string,
+ id: string,
headers: AxiosRequestHeaders,
checkInterval: number,
checkUrl?: string
- ): Promise<any> {
+ ): Promise<CrawlStatusResponse> {
let apiUrl: string = '';
while (true) {
if (this.version == 'v1') {
- apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${jobId}`;
+ apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${id}`;
} else if (this.version == 'v0') {
- apiUrl = checkUrl ?? this.apiUrl + `/v0/crawl/status/${jobId}`;
+ apiUrl = checkUrl ?? this.apiUrl + `/v0/crawl/status/${id}`;
}
const statusResponse: AxiosResponse = await this.getRequest(
apiUrl,