tests passing now

This commit is contained in:
rafaelsideguide 2024-08-20 20:00:41 -03:00
parent 0f48823c9c
commit e9d6ca197e
3 changed files with 110 additions and 65 deletions

View File

@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
description = soup('meta[name="description"]').attr("content") || null; description = soup('meta[name="description"]').attr("content") || null;
// Assuming the language is part of the URL as per the regex pattern // Assuming the language is part of the URL as per the regex pattern
const pattern = /([a-zA-Z]+-[A-Z]{2})/; language = soup('html').attr('lang') || null;
const match = pattern.exec(url);
language = match ? match[1] : null;
keywords = soup('meta[name="keywords"]').attr("content") || null; keywords = soup('meta[name="keywords"]').attr("content") || null;
robots = soup('meta[name="robots"]').attr("content") || null; robots = soup('meta[name="robots"]').attr("content") || null;

View File

@ -1,4 +1,4 @@
import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, ScrapeParams, ScrapeResponse } from '../../../index'; import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index';
import { v4 as uuidv4 } from 'uuid'; import { v4 as uuidv4 } from 'uuid';
import dotenv from 'dotenv'; import dotenv from 'dotenv';
import { describe, test, expect } from '@jest/globals'; import { describe, test, expect } from '@jest/globals';
@ -66,6 +66,7 @@ describe('FirecrawlApp E2E Tests', () => {
expect(response.data?.rawHtml).toContain("<h1"); expect(response.data?.rawHtml).toContain("<h1");
expect(response.data?.screenshot).not.toBeUndefined(); expect(response.data?.screenshot).not.toBeUndefined();
expect(response.data?.screenshot).not.toBeNull(); expect(response.data?.screenshot).not.toBeNull();
expect(response.data?.screenshot).toContain("https://");
expect(response.data?.links).not.toBeNull(); expect(response.data?.links).not.toBeNull();
expect(response.data?.links?.length).toBeGreaterThan(0); expect(response.data?.links?.length).toBeGreaterThan(0);
expect(response.data?.links?.[0]).toContain("https://"); expect(response.data?.links?.[0]).toContain("https://");
@ -121,7 +122,7 @@ describe('FirecrawlApp E2E Tests', () => {
test.concurrent('should throw error for blocklisted URL on crawl', async () => { test.concurrent('should throw error for blocklisted URL on crawl', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const blocklistedUrl = "https://twitter.com/fake-test"; const blocklistedUrl = "https://twitter.com/fake-test";
await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
}); });
test.concurrent('should return successful response for crawl and wait for completion', async () => { test.concurrent('should return successful response for crawl and wait for completion', async () => {
@ -145,14 +146,13 @@ describe('FirecrawlApp E2E Tests', () => {
expect(response.data?.[0]).not.toHaveProperty("rawHtml"); expect(response.data?.[0]).not.toHaveProperty("rawHtml");
expect(response.data?.[0]).not.toHaveProperty("screenshot"); expect(response.data?.[0]).not.toHaveProperty("screenshot");
expect(response.data?.[0]).not.toHaveProperty("links"); expect(response.data?.[0]).not.toHaveProperty("links");
expect(response.data?.[0]).toHaveProperty("metadata"); expect(response.data?.[0]).toHaveProperty("metadata");
expect(response.data?.[0].metadata).toHaveProperty("title"); expect(response.data?.[0].metadata).toHaveProperty("title");
expect(response.data?.[0].metadata).toHaveProperty("description"); expect(response.data?.[0].metadata).toHaveProperty("description");
expect(response.data?.[0].metadata).toHaveProperty("language"); expect(response.data?.[0].metadata).toHaveProperty("language");
expect(response.data?.[0].metadata).toHaveProperty("sourceURL"); expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
expect(response.data?.[0].metadata).toHaveProperty("statusCode"); expect(response.data?.[0].metadata).toHaveProperty("statusCode");
expect(response.data?.[0].metadata).toHaveProperty("error"); expect(response.data?.[0].metadata).not.toHaveProperty("error");
}, 60000); // 60 seconds timeout }, 60000); // 60 seconds timeout
test.concurrent('should return successful response for crawl with options and wait for completion', async () => { test.concurrent('should return successful response for crawl with options and wait for completion', async () => {
@ -203,7 +203,7 @@ describe('FirecrawlApp E2E Tests', () => {
expect(response.data?.[0].metadata).toHaveProperty("language"); expect(response.data?.[0].metadata).toHaveProperty("language");
expect(response.data?.[0].metadata).toHaveProperty("sourceURL"); expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
expect(response.data?.[0].metadata).toHaveProperty("statusCode"); expect(response.data?.[0].metadata).toHaveProperty("statusCode");
expect(response.data?.[0].metadata).toHaveProperty("error"); expect(response.data?.[0].metadata).not.toHaveProperty("error");
}, 60000); // 60 seconds timeout }, 60000); // 60 seconds timeout
test.concurrent('should handle idempotency key for crawl', async () => { test.concurrent('should handle idempotency key for crawl', async () => {
@ -211,23 +211,23 @@ describe('FirecrawlApp E2E Tests', () => {
const uniqueIdempotencyKey = uuidv4(); const uniqueIdempotencyKey = uuidv4();
const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse; const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse;
expect(response).not.toBeNull(); expect(response).not.toBeNull();
expect(response.jobId).toBeDefined(); expect(response.id).toBeDefined();
await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409"); await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
}); });
test.concurrent('should check crawl status', async () => { test.concurrent('should check crawl status', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as CrawlStatusResponse; const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse;
expect(response).not.toBeNull(); expect(response).not.toBeNull();
expect(response.jobId).toBeDefined(); expect(response.id).toBeDefined();
let statusResponse: any = await app.checkCrawlStatus(response.jobId); let statusResponse: any = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
const maxChecks = 15; const maxChecks = 15;
let checks = 0; let checks = 0;
while (statusResponse.status === 'scraping' && checks < maxChecks) { while (statusResponse.status === 'scraping' && checks < maxChecks) {
await new Promise(resolve => setTimeout(resolve, 1000)); await new Promise(resolve => setTimeout(resolve, 5000));
expect(statusResponse).not.toHaveProperty("partial_data"); // v0 expect(statusResponse).not.toHaveProperty("partial_data"); // v0
expect(statusResponse).not.toHaveProperty("current"); // v0 expect(statusResponse).not.toHaveProperty("current"); // v0
expect(statusResponse).toHaveProperty("data"); expect(statusResponse).toHaveProperty("data");
@ -238,44 +238,70 @@ describe('FirecrawlApp E2E Tests', () => {
expect(statusResponse).toHaveProperty("next"); expect(statusResponse).toHaveProperty("next");
expect(statusResponse.totalCount).toBeGreaterThan(0); expect(statusResponse.totalCount).toBeGreaterThan(0);
expect(statusResponse.creditsUsed).toBeGreaterThan(0); expect(statusResponse.creditsUsed).toBeGreaterThan(0);
expect(statusResponse.expiresAt).toBeGreaterThan(Date.now()); expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
expect(statusResponse.status).toBe("scraping"); expect(statusResponse.status).toBe("scraping");
expect(statusResponse.next).toContain("/v1/crawl/"); expect(statusResponse.next).toContain("/v1/crawl/");
statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponse; statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
checks++; checks++;
} }
expect(statusResponse).not.toBeNull();
expect(statusResponse).toHaveProperty("totalCount");
expect(statusResponse.totalCount).toBeGreaterThan(0);
expect(statusResponse).toHaveProperty("creditsUsed");
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
expect(statusResponse).toHaveProperty("expiresAt");
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
expect(statusResponse).toHaveProperty("status");
expect(statusResponse.status).toBe("completed");
expect(statusResponse.data?.length).toBeGreaterThan(0);
expect(statusResponse.data?.[0]).toHaveProperty("markdown");
expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10);
expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0
expect(statusResponse.data?.[0]).toHaveProperty("html");
expect(statusResponse.data?.[0].html).toContain("<div");
expect(statusResponse.data?.[0]).toHaveProperty("rawHtml");
expect(statusResponse.data?.[0].rawHtml).toContain("<div");
expect(statusResponse.data?.[0]).toHaveProperty("screenshot");
expect(statusResponse.data?.[0].screenshot).toContain("https://");
expect(statusResponse.data?.[0]).toHaveProperty("links");
expect(statusResponse.data?.[0].links).not.toBeNull();
expect(statusResponse.data?.[0].links?.length).toBeGreaterThan(0);
expect(statusResponse.data?.[0]).toHaveProperty("metadata");
expect(statusResponse.data?.[0].metadata).toHaveProperty("title");
expect(statusResponse.data?.[0].metadata).toHaveProperty("description");
expect(statusResponse.data?.[0].metadata).toHaveProperty("language");
expect(statusResponse.data?.[0].metadata).toHaveProperty("sourceURL");
expect(statusResponse.data?.[0].metadata).toHaveProperty("statusCode");
expect(statusResponse.data?.[0].metadata).not.toHaveProperty("error");
}, 60000); // 60 seconds timeout
test.concurrent('should throw error for invalid API key on map', async () => {
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
});
test.concurrent('should throw error for blocklisted URL on map', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const blocklistedUrl = "https://facebook.com/fake-test";
await expect(app.mapUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
});
test.concurrent('should return successful response with valid preview token', async () => {
const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
expect(response).not.toBeNull(); expect(response).not.toBeNull();
expect(response).toHaveProperty("totalCount"); expect(response.links?.length).toBeGreaterThan(0);
expect(response.totalCount).toBeGreaterThan(0); }, 30000); // 30 seconds timeout
expect(response).toHaveProperty("creditsUsed");
expect(response.creditsUsed).toBeGreaterThan(0); test.concurrent('should return successful response for valid map', async () => {
expect(response).toHaveProperty("expiresAt"); const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
expect(response.expiresAt).toBeGreaterThan(Date.now()); const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
expect(response).toHaveProperty("status"); expect(response).not.toBeNull();
expect(response.status).toBe("completed");
expect(response).toHaveProperty("next"); expect(response.links?.length).toBeGreaterThan(0);
expect(response.next).toContain("/v1/crawl/"); expect(response.links?.[0]).toContain("https://");
expect(response.data?.length).toBeGreaterThan(0); const filteredLinks = response.links?.filter((link: string) => link.includes("roastmywebsite.ai"));
expect(response.data?.[0]).toHaveProperty("markdown"); expect(filteredLinks?.length).toBeGreaterThan(0);
expect(response.data?.[0].markdown).toContain("_Roast_"); }, 30000); // 30 seconds timeout
expect(response.data?.[0]).not.toHaveProperty('content'); // v0
expect(response.data?.[0].markdown).toContain("_Roast_");
expect(response.data?.[0]).toHaveProperty("html");
expect(response.data?.[0].html).toContain("<h1");
expect(response.data?.[0]).toHaveProperty("rawHtml");
expect(response.data?.[0].rawHtml).toContain("<h1");
expect(response.data?.[0]).toHaveProperty("screenshot");
expect(response.data?.[0].screenshot).toContain("https://");
expect(response.data?.[0]).toHaveProperty("links");
expect(response.data?.[0].links).not.toBeNull();
expect(response.data?.[0].links?.length).toBeGreaterThan(0);
expect(response.data?.[0]).toHaveProperty("metadata");
expect(response.data?.[0].metadata).toHaveProperty("title");
expect(response.data?.[0].metadata).toHaveProperty("description");
expect(response.data?.[0].metadata).toHaveProperty("language");
expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
expect(response.data?.[0].metadata).toHaveProperty("statusCode");
expect(response.data?.[0].metadata).toHaveProperty("error");
}, 35000); // 35 seconds timeout
}); });

View File

@ -214,7 +214,7 @@ export interface CrawlParamsV0 {
* Defines the structure of the response received after initiating a crawl. * Defines the structure of the response received after initiating a crawl.
*/ */
export interface CrawlResponse { export interface CrawlResponse {
jobId?: string; id?: string;
url?: string; url?: string;
success: boolean; success: boolean;
error?: string; error?: string;
@ -281,7 +281,7 @@ export interface MapParams {
*/ */
export interface MapResponse { export interface MapResponse {
success: boolean; success: boolean;
data?: string[]; links?: string[];
error?: string; error?: string;
} }
@ -458,36 +458,53 @@ export default class FirecrawlApp {
headers headers
); );
if (response.status === 200) { if (response.status === 200) {
const jobId: string = this.version == 'v0' ? response.data.jobId : response.data.id; const id: string = this.version == 'v0' ? response.data.jobId : response.data.id;
let checkUrl: string | undefined = undefined; let checkUrl: string | undefined = undefined;
if (waitUntilDone) { if (waitUntilDone) {
if (this.version == 'v1') { checkUrl = response.data.url } if (this.version == 'v1') { checkUrl = response.data.url }
return this.monitorJobStatus(jobId, headers, pollInterval, checkUrl); return this.monitorJobStatus(id, headers, pollInterval, checkUrl);
} else { } else {
return { success: true, jobId }; if (this.version == 'v0') {
return {
success: true,
jobId: id
} as CrawlResponseV0;
} else {
return {
success: true,
id: id
} as CrawlResponse;
}
} }
} else { } else {
this.handleError(response, "start crawl job"); this.handleError(response, "start crawl job");
} }
} catch (error: any) { } catch (error: any) {
console.log(error); if (error.response.data.error) {
throw new Error(error.message); throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
} else {
throw new Error(error.message);
}
} }
return { success: false, error: "Internal server error." }; return { success: false, error: "Internal server error." };
} }
/** /**
* Checks the status of a crawl job using the Firecrawl API. * Checks the status of a crawl job using the Firecrawl API.
* @param jobId - The job ID of the crawl operation. * @param id - The ID of the crawl operation.
* @returns The response containing the job status. * @returns The response containing the job status.
*/ */
async checkCrawlStatus(jobId: string): Promise<CrawlStatusResponse | CrawlStatusResponseV0> { async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | CrawlStatusResponseV0> {
if (!id) {
throw new Error("No crawl ID provided");
}
const headers: AxiosRequestHeaders = this.prepareHeaders(); const headers: AxiosRequestHeaders = this.prepareHeaders();
try { try {
const response: AxiosResponse = await this.getRequest( const response: AxiosResponse = await this.getRequest(
this.version == 'v1' ? this.version == 'v1' ?
this.apiUrl + `/${this.version}/crawl/${jobId}` : this.apiUrl + `/${this.version}/crawl/${id}` :
this.apiUrl + `/${this.version}/crawl/status/${jobId}`, this.apiUrl + `/${this.version}/crawl/status/${id}`,
headers headers
); );
if (response.status === 200) { if (response.status === 200) {
@ -508,8 +525,12 @@ export default class FirecrawlApp {
return { return {
success: true, success: true,
status: response.data.status, status: response.data.status,
totalCount: response.data.totalCount,
creditsUsed: response.data.creditsUsed,
expiresAt: new Date(response.data.expiresAt),
next: response.data.next,
data: response.data.data, data: response.data.data,
error: response.data.error, error: response.data.error
} as CrawlStatusResponse; } as CrawlStatusResponse;
} }
} else { } else {
@ -537,7 +558,7 @@ export default class FirecrawlApp {
} }
} }
async map(url: string, params?: MapParams): Promise<MapResponse> { async mapUrl(url: string, params?: MapParams): Promise<MapResponse> {
if (this.version == 'v0') { if (this.version == 'v0') {
throw new Error("Map is not supported in v0"); throw new Error("Map is not supported in v0");
} }
@ -604,23 +625,23 @@ export default class FirecrawlApp {
/** /**
* Monitors the status of a crawl job until completion or failure. * Monitors the status of a crawl job until completion or failure.
* @param jobId - The job ID of the crawl operation. * @param id - The ID of the crawl operation.
* @param headers - The headers for the request. * @param headers - The headers for the request.
* @param checkInterval - Interval in seconds for job status checks. * @param checkInterval - Interval in seconds for job status checks.
* @returns The final job status or data. * @returns The final job status or data.
*/ */
async monitorJobStatus( async monitorJobStatus(
jobId: string, id: string,
headers: AxiosRequestHeaders, headers: AxiosRequestHeaders,
checkInterval: number, checkInterval: number,
checkUrl?: string checkUrl?: string
): Promise<any> { ): Promise<CrawlStatusResponse | CrawlStatusResponseV0> {
let apiUrl: string = ''; let apiUrl: string = '';
while (true) { while (true) {
if (this.version == 'v1') { if (this.version == 'v1') {
apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${jobId}`; apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${id}`;
} else if (this.version == 'v0') { } else if (this.version == 'v0') {
apiUrl = checkUrl ?? this.apiUrl + `/v0/crawl/status/${jobId}`; apiUrl = checkUrl ?? this.apiUrl + `/v0/crawl/status/${id}`;
} }
const statusResponse: AxiosResponse = await this.getRequest( const statusResponse: AxiosResponse = await this.getRequest(
apiUrl, apiUrl,