Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl, synced 2025-08-12 02:49:03 +08:00
tests passing now
commit e9d6ca197e (parent 0f48823c9c)
@@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
   description = soup('meta[name="description"]').attr("content") || null;
 
-  // Assuming the language is part of the URL as per the regex pattern
-  const pattern = /([a-zA-Z]+-[A-Z]{2})/;
-  const match = pattern.exec(url);
-  language = match ? match[1] : null;
+  language = soup('html').attr('lang') || null;
 
   keywords = soup('meta[name="keywords"]').attr("content") || null;
   robots = soup('meta[name="robots"]').attr("content") || null;
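For reference, a minimal sketch of the new behavior in isolation, assuming a standalone script (the HTML input is illustrative; `cheerio.load` is the standard Cheerio entry point):

import * as cheerio from 'cheerio';

const html = '<html lang="en-US"><head><title>Example</title></head></html>'; // illustrative input
const soup = cheerio.load(html);
// The language now comes from the <html lang> attribute rather than a locale
// regex over the URL, so pages without a locale in their path still report one.
const language = soup('html').attr('lang') || null; // "en-US"
console.log(language);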
@@ -1,4 +1,4 @@
-import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, ScrapeParams, ScrapeResponse } from '../../../index';
+import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index';
 import { v4 as uuidv4 } from 'uuid';
 import dotenv from 'dotenv';
 import { describe, test, expect } from '@jest/globals';
@@ -66,6 +66,7 @@ describe('FirecrawlApp E2E Tests', () => {
     expect(response.data?.rawHtml).toContain("<h1");
-    expect(response.data?.screenshot).not.toBeUndefined();
+    expect(response.data?.screenshot).not.toBeNull();
+    expect(response.data?.screenshot).toContain("https://");
     expect(response.data?.links).not.toBeNull();
     expect(response.data?.links?.length).toBeGreaterThan(0);
     expect(response.data?.links?.[0]).toContain("https://");
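For reference, a minimal sketch of the scrape call these expectations cover, assuming the published package name `@mendable/firecrawl-js` and a placeholder API key:

import FirecrawlApp, { ScrapeParams, ScrapeResponse } from '@mendable/firecrawl-js';

async function scrapeWithFormats(): Promise<void> {
  const app = new FirecrawlApp({ apiKey: 'fc-YOUR-KEY' }); // placeholder key
  const response = await app.scrapeUrl('https://roastmywebsite.ai', {
    formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
  } as ScrapeParams) as ScrapeResponse;
  // Per the updated test, screenshots come back as hosted URLs, not raw image data
  console.log(response.data?.screenshot, response.data?.links?.[0]);
}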
@@ -121,7 +122,7 @@ describe('FirecrawlApp E2E Tests', () => {
   test.concurrent('should throw error for blocklisted URL on crawl', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
     const blocklistedUrl = "https://twitter.com/fake-test";
-    await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
+    await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
   });
 
   test.concurrent('should return successful response for crawl and wait for completion', async () => {
@@ -145,14 +146,13 @@ describe('FirecrawlApp E2E Tests', () => {
     expect(response.data?.[0]).not.toHaveProperty("rawHtml");
     expect(response.data?.[0]).not.toHaveProperty("screenshot");
     expect(response.data?.[0]).not.toHaveProperty("links");
 
     expect(response.data?.[0]).toHaveProperty("metadata");
     expect(response.data?.[0].metadata).toHaveProperty("title");
     expect(response.data?.[0].metadata).toHaveProperty("description");
     expect(response.data?.[0].metadata).toHaveProperty("language");
     expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
     expect(response.data?.[0].metadata).toHaveProperty("statusCode");
-    expect(response.data?.[0].metadata).toHaveProperty("error");
+    expect(response.data?.[0].metadata).not.toHaveProperty("error");
   }, 60000); // 60 seconds timeout
 
   test.concurrent('should return successful response for crawl with options and wait for completion', async () => {
@@ -203,7 +203,7 @@ describe('FirecrawlApp E2E Tests', () => {
     expect(response.data?.[0].metadata).toHaveProperty("language");
     expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
     expect(response.data?.[0].metadata).toHaveProperty("statusCode");
-    expect(response.data?.[0].metadata).toHaveProperty("error");
+    expect(response.data?.[0].metadata).not.toHaveProperty("error");
   }, 60000); // 60 seconds timeout
 
   test.concurrent('should handle idempotency key for crawl', async () => {
@@ -211,23 +211,23 @@ describe('FirecrawlApp E2E Tests', () => {
     const uniqueIdempotencyKey = uuidv4();
     const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse;
     expect(response).not.toBeNull();
-    expect(response.jobId).toBeDefined();
+    expect(response.id).toBeDefined();
 
     await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
   });
 
   test.concurrent('should check crawl status', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as CrawlStatusResponse;
+    const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse;
     expect(response).not.toBeNull();
-    expect(response.jobId).toBeDefined();
+    expect(response.id).toBeDefined();
 
-    let statusResponse: any = await app.checkCrawlStatus(response.jobId);
+    let statusResponse: any = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
     const maxChecks = 15;
     let checks = 0;
 
     while (statusResponse.status === 'scraping' && checks < maxChecks) {
-      await new Promise(resolve => setTimeout(resolve, 1000));
+      await new Promise(resolve => setTimeout(resolve, 5000));
       expect(statusResponse).not.toHaveProperty("partial_data"); // v0
       expect(statusResponse).not.toHaveProperty("current"); // v0
       expect(statusResponse).toHaveProperty("data");
@@ -238,44 +238,70 @@ describe('FirecrawlApp E2E Tests', () => {
       expect(statusResponse).toHaveProperty("next");
       expect(statusResponse.totalCount).toBeGreaterThan(0);
       expect(statusResponse.creditsUsed).toBeGreaterThan(0);
-      expect(statusResponse.expiresAt).toBeGreaterThan(Date.now());
+      expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
       expect(statusResponse.status).toBe("scraping");
       expect(statusResponse.next).toContain("/v1/crawl/");
-      statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponse;
+      statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
       checks++;
     }
 
     expect(statusResponse).not.toBeNull();
     expect(statusResponse).toHaveProperty("totalCount");
     expect(statusResponse.totalCount).toBeGreaterThan(0);
     expect(statusResponse).toHaveProperty("creditsUsed");
     expect(statusResponse.creditsUsed).toBeGreaterThan(0);
     expect(statusResponse).toHaveProperty("expiresAt");
     expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
     expect(statusResponse).toHaveProperty("status");
     expect(statusResponse.status).toBe("completed");
     expect(statusResponse.data?.length).toBeGreaterThan(0);
     expect(statusResponse.data?.[0]).toHaveProperty("markdown");
     expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10);
     expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0
     expect(statusResponse.data?.[0]).toHaveProperty("html");
     expect(statusResponse.data?.[0].html).toContain("<div");
     expect(statusResponse.data?.[0]).toHaveProperty("rawHtml");
     expect(statusResponse.data?.[0].rawHtml).toContain("<div");
     expect(statusResponse.data?.[0]).toHaveProperty("screenshot");
     expect(statusResponse.data?.[0].screenshot).toContain("https://");
     expect(statusResponse.data?.[0]).toHaveProperty("links");
     expect(statusResponse.data?.[0].links).not.toBeNull();
     expect(statusResponse.data?.[0].links?.length).toBeGreaterThan(0);
     expect(statusResponse.data?.[0]).toHaveProperty("metadata");
     expect(statusResponse.data?.[0].metadata).toHaveProperty("title");
     expect(statusResponse.data?.[0].metadata).toHaveProperty("description");
     expect(statusResponse.data?.[0].metadata).toHaveProperty("language");
     expect(statusResponse.data?.[0].metadata).toHaveProperty("sourceURL");
     expect(statusResponse.data?.[0].metadata).toHaveProperty("statusCode");
     expect(statusResponse.data?.[0].metadata).not.toHaveProperty("error");
   }, 60000); // 60 seconds timeout
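For reference, a minimal usage sketch of the v1 start-then-poll flow exercised above, assuming the published package name `@mendable/firecrawl-js` and a placeholder API key:

import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse } from '@mendable/firecrawl-js';

async function crawlAndPoll(): Promise<void> {
  const app = new FirecrawlApp({ apiKey: 'fc-YOUR-KEY' }); // placeholder key
  // Start the crawl without waiting (waitUntilDone = false), then poll by the v1 `id`
  const started = await app.crawlUrl('https://firecrawl.dev', {} as CrawlParams, false) as CrawlResponse;
  let status = await app.checkCrawlStatus(started.id) as CrawlStatusResponse;
  while (status.status === 'scraping') {
    await new Promise(resolve => setTimeout(resolve, 5000)); // same 5s interval as the test
    status = await app.checkCrawlStatus(started.id) as CrawlStatusResponse;
  }
  console.log(status.status, status.totalCount, status.creditsUsed);
}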
 
+  test.concurrent('should throw error for invalid API key on map', async () => {
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
+  });
+
+  test.concurrent('should throw error for blocklisted URL on map', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const blocklistedUrl = "https://facebook.com/fake-test";
+    await expect(app.mapUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
+  });
 
   test.concurrent('should return successful response with valid preview token', async () => {
     const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
     const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
     expect(response).not.toBeNull();
-    expect(response).toHaveProperty("totalCount");
-    expect(response.totalCount).toBeGreaterThan(0);
-    expect(response).toHaveProperty("creditsUsed");
-    expect(response.creditsUsed).toBeGreaterThan(0);
-    expect(response).toHaveProperty("expiresAt");
-    expect(response.expiresAt).toBeGreaterThan(Date.now());
-    expect(response).toHaveProperty("status");
-    expect(response.status).toBe("completed");
-    expect(response).toHaveProperty("next");
-    expect(response.next).toContain("/v1/crawl/");
-    expect(response.data?.length).toBeGreaterThan(0);
-    expect(response.data?.[0]).toHaveProperty("markdown");
-    expect(response.data?.[0].markdown).toContain("_Roast_");
-    expect(response.data?.[0]).not.toHaveProperty('content'); // v0
-    expect(response.data?.[0].markdown).toContain("_Roast_");
-    expect(response.data?.[0]).toHaveProperty("html");
-    expect(response.data?.[0].html).toContain("<h1");
-    expect(response.data?.[0]).toHaveProperty("rawHtml");
-    expect(response.data?.[0].rawHtml).toContain("<h1");
-    expect(response.data?.[0]).toHaveProperty("screenshot");
-    expect(response.data?.[0].screenshot).toContain("https://");
-    expect(response.data?.[0]).toHaveProperty("links");
-    expect(response.data?.[0].links).not.toBeNull();
-    expect(response.data?.[0].links?.length).toBeGreaterThan(0);
-    expect(response.data?.[0]).toHaveProperty("metadata");
-    expect(response.data?.[0].metadata).toHaveProperty("title");
-    expect(response.data?.[0].metadata).toHaveProperty("description");
-    expect(response.data?.[0].metadata).toHaveProperty("language");
-    expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
-    expect(response.data?.[0].metadata).toHaveProperty("statusCode");
-    expect(response.data?.[0].metadata).toHaveProperty("error");
-  }, 35000); // 35 seconds timeout
+    expect(response.links?.length).toBeGreaterThan(0);
+  }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response for valid map', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
     const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
     expect(response).not.toBeNull();
 
     expect(response.links?.length).toBeGreaterThan(0);
     expect(response.links?.[0]).toContain("https://");
     const filteredLinks = response.links?.filter((link: string) => link.includes("roastmywebsite.ai"));
     expect(filteredLinks?.length).toBeGreaterThan(0);
   }, 30000); // 30 seconds timeout
 });
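For reference, a minimal sketch of the new map endpoint usage, assuming the published package name `@mendable/firecrawl-js` and a placeholder API key:

import FirecrawlApp, { MapResponse } from '@mendable/firecrawl-js';

async function mapSite(): Promise<void> {
  const app = new FirecrawlApp({ apiKey: 'fc-YOUR-KEY' }); // placeholder key
  const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
  // v1 map returns discovered URLs under `links` (renamed from `data` in the
  // MapResponse interface change below)
  const sameSite = response.links?.filter((link: string) => link.includes('roastmywebsite.ai'));
  console.log(response.links?.[0], sameSite?.length);
}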
@@ -214,7 +214,7 @@ export interface CrawlParamsV0 {
  * Defines the structure of the response received after initiating a crawl.
  */
 export interface CrawlResponse {
-  jobId?: string;
+  id?: string;
   url?: string;
   success: boolean;
   error?: string;
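A hedged sketch of handling both start-response shapes after this rename; whether the SDK exports its v0 type is an assumption, so a local structural stand-in is used:

import { CrawlResponse } from '@mendable/firecrawl-js';

// Local stand-in for the SDK's v0 shape (assumed, for illustration only)
type CrawlResponseV0Like = { jobId?: string; success: boolean; error?: string };

function startedCrawlId(res: CrawlResponse | CrawlResponseV0Like): string | undefined {
  // v1 start responses now carry `id`; v0 responses keep `jobId`
  return 'id' in res ? res.id : res.jobId;
}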
@@ -281,7 +281,7 @@ export interface MapParams {
  */
 export interface MapResponse {
   success: boolean;
-  data?: string[];
+  links?: string[];
   error?: string;
 }
@@ -458,36 +458,53 @@ export default class FirecrawlApp {
         headers
       );
       if (response.status === 200) {
-        const jobId: string = this.version == 'v0' ? response.data.jobId : response.data.id;
+        const id: string = this.version == 'v0' ? response.data.jobId : response.data.id;
         let checkUrl: string | undefined = undefined;
         if (waitUntilDone) {
           if (this.version == 'v1') { checkUrl = response.data.url }
-          return this.monitorJobStatus(jobId, headers, pollInterval, checkUrl);
+          return this.monitorJobStatus(id, headers, pollInterval, checkUrl);
         } else {
-          return { success: true, jobId };
+          if (this.version == 'v0') {
+            return {
+              success: true,
+              jobId: id
+            } as CrawlResponseV0;
+          } else {
+            return {
+              success: true,
+              id: id
+            } as CrawlResponse;
+          }
         }
       } else {
         this.handleError(response, "start crawl job");
       }
     } catch (error: any) {
-      console.log(error);
-      throw new Error(error.message);
+      if (error.response.data.error) {
+        throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
+      } else {
+        throw new Error(error.message);
+      }
     }
     return { success: false, error: "Internal server error." };
   }
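For reference, a sketch of what the richer error surface looks like to a caller (placeholder key; the URL reuses the blocklist case from the tests above):

import FirecrawlApp from '@mendable/firecrawl-js';

async function demoErrorMessage(): Promise<void> {
  const app = new FirecrawlApp({ apiKey: 'fc-YOUR-KEY' }); // placeholder key
  try {
    await app.crawlUrl('https://twitter.com/fake-test'); // blocklisted per the tests
  } catch (err: any) {
    // With the change above, a server-provided `error` body is folded into the
    // message, e.g. "Request failed with status code 403. Error: URL is blocked. ..."
    console.error(err.message);
  }
}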
   /**
    * Checks the status of a crawl job using the Firecrawl API.
-   * @param jobId - The job ID of the crawl operation.
+   * @param id - The ID of the crawl operation.
    * @returns The response containing the job status.
    */
-  async checkCrawlStatus(jobId: string): Promise<CrawlStatusResponse | CrawlStatusResponseV0> {
+  async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | CrawlStatusResponseV0> {
+    if (!id) {
+      throw new Error("No crawl ID provided");
+    }
+
     const headers: AxiosRequestHeaders = this.prepareHeaders();
     try {
       const response: AxiosResponse = await this.getRequest(
         this.version == 'v1' ?
-          this.apiUrl + `/${this.version}/crawl/${jobId}` :
-          this.apiUrl + `/${this.version}/crawl/status/${jobId}`,
+          this.apiUrl + `/${this.version}/crawl/${id}` :
+          this.apiUrl + `/${this.version}/crawl/status/${id}`,
         headers
       );
       if (response.status === 200) {
@@ -508,8 +525,12 @@ export default class FirecrawlApp {
         return {
           success: true,
           status: response.data.status,
+          totalCount: response.data.totalCount,
+          creditsUsed: response.data.creditsUsed,
+          expiresAt: new Date(response.data.expiresAt),
+          next: response.data.next,
           data: response.data.data,
-          error: response.data.error,
+          error: response.data.error
         } as CrawlStatusResponse;
       }
     } else {
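Because `expiresAt` is now mapped through `new Date(...)`, callers compare it with `getTime()`, matching the updated tests; a minimal sketch with a placeholder key and an illustrative crawl ID:

import FirecrawlApp, { CrawlStatusResponse } from '@mendable/firecrawl-js';

async function checkExpiry(): Promise<void> {
  const app = new FirecrawlApp({ apiKey: 'fc-YOUR-KEY' }); // placeholder key
  const status = await app.checkCrawlStatus('some-crawl-id') as CrawlStatusResponse; // illustrative id
  // expiresAt is a Date, not an epoch number, after this change
  if (status.expiresAt.getTime() > Date.now()) {
    console.log(`Results live until ${status.expiresAt.toISOString()}`);
  }
}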
@@ -537,7 +558,7 @@ export default class FirecrawlApp {
     }
   }
 
-  async map(url: string, params?: MapParams): Promise<MapResponse> {
+  async mapUrl(url: string, params?: MapParams): Promise<MapResponse> {
     if (this.version == 'v0') {
       throw new Error("Map is not supported in v0");
     }
@@ -604,23 +625,23 @@ export default class FirecrawlApp {
 
   /**
    * Monitors the status of a crawl job until completion or failure.
-   * @param jobId - The job ID of the crawl operation.
+   * @param id - The ID of the crawl operation.
    * @param headers - The headers for the request.
    * @param checkInterval - Interval in seconds for job status checks.
    * @returns The final job status or data.
    */
   async monitorJobStatus(
-    jobId: string,
+    id: string,
     headers: AxiosRequestHeaders,
     checkInterval: number,
     checkUrl?: string
-  ): Promise<any> {
+  ): Promise<CrawlStatusResponse | CrawlStatusResponseV0> {
     let apiUrl: string = '';
     while (true) {
       if (this.version == 'v1') {
-        apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${jobId}`;
+        apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${id}`;
       } else if (this.version == 'v0') {
-        apiUrl = checkUrl ?? this.apiUrl + `/v0/crawl/status/${jobId}`;
+        apiUrl = checkUrl ?? this.apiUrl + `/v0/crawl/status/${id}`;
       }
       const statusResponse: AxiosResponse = await this.getRequest(
         apiUrl,
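With `monitorJobStatus` now returning `CrawlStatusResponse | CrawlStatusResponseV0` instead of `any`, callers narrow the union; a sketch under the assumptions that only the v1 shape declares `creditsUsed` and that the v0 type may not be exported (a local stand-in is used):

import { CrawlStatusResponse } from '@mendable/firecrawl-js';

// Assumed v0 shape for illustration; the real CrawlStatusResponseV0 lives in the SDK's types
type CrawlStatusResponseV0Like = { status: string; data?: Array<{ content?: string }> };

function firstDocumentText(res: CrawlStatusResponse | CrawlStatusResponseV0Like): string | undefined {
  if ('creditsUsed' in res) {
    return res.data?.[0]?.markdown; // v1 documents expose markdown
  }
  return res.data?.[0]?.content; // v0 documents exposed content
}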