From d599d31e638cbfdc2bda6adcc72f00f7c29df773 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 6 Aug 2024 17:33:39 -0300
Subject: [PATCH 01/16] wip

---
 .../src/__tests__/e2e_withAuth/index.test.ts  |  58 ++++-----
 .../__tests__/v1/e2e_withAuth/index.test.ts   | 122 ++++++++++++++++++
 2 files changed, 151 insertions(+), 29 deletions(-)
 create mode 100644 apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts

diff --git a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts
index ad917de4..91dfb9e1 100644
--- a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts
@@ -1,4 +1,4 @@
-import FirecrawlApp from '../../index';
+import FirecrawlApp, { CrawlResponseV0, FirecrawlDocumentV0, JobStatusResponseV0, ScrapeResponseV0, SearchResponseV0 } from '../../index';
 import { v4 as uuidv4 } from 'uuid';
 import dotenv from 'dotenv';
 import { describe, test, expect } from '@jest/globals';
@@ -11,31 +11,31 @@ const API_URL = "http://127.0.0.1:3002";
 describe('FirecrawlApp E2E Tests', () => {
   test.concurrent('should throw error for no API key', async () => {
     expect(() => {
-      new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
+      new FirecrawlApp({ apiKey: null, apiUrl: API_URL, version: "v0" });
     }).toThrow("No API key provided");
   });
 
   test.concurrent('should throw error for invalid API key on scrape', async () => {
-    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" });
     await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
   });
 
   test.concurrent('should throw error for blocklisted URL on scrape', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
     const blocklistedUrl = "https://facebook.com/fake-test";
     await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
   });
 
   test.concurrent('should return successful response with valid preview token', async () => {
-    const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
-    const response = await app.scrapeUrl('https://roastmywebsite.ai');
+    const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL, version: "v0" });
+    const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponseV0;
     expect(response).not.toBeNull();
     expect(response.data?.content).toContain("_Roast_");
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response for valid scrape', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.scrapeUrl('https://roastmywebsite.ai');
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponseV0;
     expect(response).not.toBeNull();
     expect(response.data?.content).toContain("_Roast_");
     expect(response.data).toHaveProperty('markdown');
@@ -44,8 +44,8 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response with valid API key and include HTML', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } });
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } }) as ScrapeResponseV0;
     expect(response).not.toBeNull();
     expect(response.data?.content).toContain("_Roast_");
     expect(response.data?.markdown).toContain("_Roast_");
@@ -53,41 +53,41 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response for valid scrape with PDF file', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf');
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponseV0;
     expect(response).not.toBeNull();
     expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001');
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponseV0;
    expect(response).not.toBeNull();
     expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should throw error for invalid API key on crawl', async () => {
-    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" });
     await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
   });
 
   test.concurrent('should throw error for blocklisted URL on crawl', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
     const blocklistedUrl = "https://twitter.com/fake-test";
     await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
   });
 
   test.concurrent('should return successful response for crawl and wait for completion', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30);
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30) as CrawlResponseV0;
     expect(response).not.toBeNull();
     expect(response[0].content).toContain("_Roast_");
   }, 60000); // 60 seconds timeout
 
   test.concurrent('should handle idempotency key for crawl', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
     const uniqueIdempotencyKey = uuidv4();
-    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey);
+    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey) as CrawlResponseV0;
     expect(response).not.toBeNull();
     expect(response.jobId).toBeDefined();
@@ -95,12 +95,12 @@ describe('FirecrawlApp E2E Tests', () => {
   });
 
   test.concurrent('should check crawl status', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false);
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as JobStatusResponseV0;
     expect(response).not.toBeNull();
     expect(response.jobId).toBeDefined();
 
-    let statusResponse = await app.checkCrawlStatus(response.jobId);
+    let statusResponse: any = await app.checkCrawlStatus(response.jobId);
     const maxChecks = 15;
     let checks = 0;
 
@@ -108,7 +108,7 @@ describe('FirecrawlApp E2E Tests', () => {
       await new Promise(resolve => setTimeout(resolve, 1000));
       expect(statusResponse.partial_data).not.toBeNull();
       expect(statusResponse.current).toBeGreaterThanOrEqual(1);
-      statusResponse = await app.checkCrawlStatus(response.jobId);
+      statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponseV0;
       checks++;
     }
 
@@ -121,20 +121,20 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 35000); // 35 seconds timeout
 
   test.concurrent('should return successful response for search', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.search("test query");
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
+    const response = await app.search("test query") as SearchResponseV0;
     expect(response).not.toBeNull();
     expect(response?.data?.[0]?.content).toBeDefined();
     expect(response?.data?.length).toBeGreaterThan(2);
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should throw error for invalid API key on search', async () => {
-    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" });
     await expect(invalidApp.search("test query")).rejects.toThrow("Request failed with status code 401");
   });
 
   test.concurrent('should perform LLM extraction', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
     const response = await app.scrapeUrl("https://mendable.ai", {
       extractorOptions: {
         mode: 'llm-extraction',
@@ -149,7 +149,7 @@ describe('FirecrawlApp E2E Tests', () => {
         required: ['company_mission', 'supports_sso', 'is_open_source']
       }
     }
-    });
+    }) as ScrapeResponseV0;
     expect(response).not.toBeNull();
     expect(response.data?.llm_extraction).toBeDefined();
     const llmExtraction = response.data?.llm_extraction;

diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
new file mode 100644
index 00000000..9042d02e
--- /dev/null
+++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
@@ -0,0 +1,122 @@
+import FirecrawlApp, { CrawlResponse, JobStatusResponse, ScrapeResponse } from '../../../index';
+import { v4 as uuidv4 } from 'uuid';
+import dotenv from 'dotenv';
+import { describe, test, expect } from '@jest/globals';
+
+dotenv.config();
+
+const TEST_API_KEY = process.env.TEST_API_KEY;
+const API_URL = "http://127.0.0.1:3002";
+
+describe('FirecrawlApp E2E Tests', () => {
+  test.concurrent('should throw error for no API key', async () => {
+    expect(() => {
+      new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
+    }).toThrow("No API key provided");
+  });
+
+  test.concurrent('should throw error for invalid API key on scrape', async () => {
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
+  });
+
+  test.concurrent('should throw error for blocklisted URL on scrape', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const blocklistedUrl = "https://facebook.com/fake-test";
+    await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
+  });
+
+  test.concurrent('should return successful response with valid preview token', async () => {
+    const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
+    const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;
+    expect(response).not.toBeNull();
+    expect(response.data?.content).toContain("_Roast_");
+  }, 30000); // 30 seconds timeout
+
+  test.concurrent('should return successful response for valid scrape', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;
+    expect(response).not.toBeNull();
+    expect(response.data?.content).toContain("_Roast_");
+    expect(response.data).toHaveProperty('markdown');
+    expect(response.data).toHaveProperty('metadata');
+    expect(response.data).not.toHaveProperty('html');
+  }, 30000); // 30 seconds timeout
+
+  test.concurrent('should return successful response with valid API key and include HTML', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } }) as ScrapeResponse;
+    expect(response).not.toBeNull();
+    expect(response.data?.content).toContain("_Roast_");
+    expect(response.data?.markdown).toContain("_Roast_");
+    expect(response.data?.html).toContain("<h1");
+  }, 30000); // 30 seconds timeout
+
+  test.concurrent('should return successful response for valid scrape with PDF file', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponse;
+    expect(response).not.toBeNull();
+    expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+  }, 30000); // 30 seconds timeout
+
+  test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponse;
+    expect(response).not.toBeNull();
+    expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+  }, 30000); // 30 seconds timeout
+
+  test.concurrent('should throw error for invalid API key on crawl', async () => {
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
+  });
+
+  test.concurrent('should throw error for blocklisted URL on crawl', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const blocklistedUrl = "https://twitter.com/fake-test";
+    await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
+  });
+
+  test.concurrent('should return successful response for crawl and wait for completion', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30) as JobStatusResponse;
+    expect(response).not.toBeNull();
+    expect(response.data?.[0].content).toContain("_Roast_");
+  }, 60000); // 60 seconds timeout
+
+  test.concurrent('should handle idempotency key for crawl', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const uniqueIdempotencyKey = uuidv4();
+    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey) as CrawlResponse;
+    expect(response).not.toBeNull();
+    expect(response.jobId).toBeDefined();
+
+    await expect(app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
+  });
+
+  test.concurrent('should check crawl status', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as JobStatusResponse;
+    expect(response).not.toBeNull();
+    expect(response.jobId).toBeDefined();
+
+    let statusResponse: any = await app.checkCrawlStatus(response.jobId);
+    const maxChecks = 15;
+    let checks = 0;
+
+    while (statusResponse.status === 'active' && checks < maxChecks) {
+      await new Promise(resolve => setTimeout(resolve, 1000));
+      expect(statusResponse.partial_data).not.toBeNull();
+      expect(statusResponse.current).toBeGreaterThanOrEqual(1);
+      statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponse;
+      checks++;
+    }
+
+    expect(statusResponse).not.toBeNull();
+    expect(statusResponse.success).toBe(true);
+    expect(statusResponse.status).toBe('completed');
+    expect(statusResponse.total).toEqual(statusResponse.current);
+    expect(statusResponse.current_step).not.toBeNull();
+    expect(statusResponse?.data?.length).toBeGreaterThan(0);
+  }, 35000); // 35 seconds timeout
+});

From 3fb2307010dbbdd3ce55f580371998d03e97d31a Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 6 Aug 2024 17:34:13 -0300
Subject: [PATCH 02/16] Update index.ts

---
 apps/js-sdk/firecrawl/src/index.ts | 143 +++++++++++++++++++++++----
 1 file changed, 125 insertions(+), 18 deletions(-)

diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index a42d4618..329d1800 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -7,12 +7,52 @@ import { zodToJsonSchema } from "zod-to-json-schema";
 export interface FirecrawlAppConfig {
   apiKey?: string | null;
   apiUrl?: string | null;
+  version?: "v0" | "v1";
 }
 
 /**
  * Metadata for a Firecrawl document.
  */
 export interface FirecrawlDocumentMetadata {
+  title?: string;
+  description?: string;
+  language?: string;
+  keywords?: string;
+  robots?: string;
+  ogTitle?: string;
+  ogDescription?: string;
+  ogUrl?: string;
+  ogImage?: string;
+  ogAudio?: string;
+  ogDeterminer?: string;
+  ogLocale?: string;
+  ogLocaleAlternate?: string[];
+  ogSiteName?: string;
+  ogVideo?: string;
+  dctermsCreated?: string;
+  dcDateCreated?: string;
+  dcDate?: string;
+  dctermsType?: string;
+  dcType?: string;
+  dctermsAudience?: string;
+  dctermsSubject?: string;
+  dcSubject?: string;
+  dcDescription?: string;
+  dctermsKeywords?: string;
+  modifiedTime?: string;
+  publishedTime?: string;
+  articleTag?: string;
+  articleSection?: string;
+  sourceURL?: string;
+  statusCode?: number;
+  error?: string;
+  [key: string]: any;
+}
+
+/**
+ * Metadata for a Firecrawl document on v0.
+ */
+export interface FirecrawlDocumentMetadataV0 {
   title?: string;
   description?: string;
   language?: string;
@@ -52,6 +92,19 @@ export interface FirecrawlDocumentMetadata {
  * Document interface for Firecrawl.
  */
 export interface FirecrawlDocument {
+  url?: string;
+  content: string;
+  markdown?: string;
+  html?: string;
+  rawHtml?: string;
+  linksOnPage?: string[];
+  metadata: FirecrawlDocumentMetadata;
+}
+
+/**
+ * Document interface for Firecrawl on v0.
+ */
+export interface FirecrawlDocumentV0 {
   id?: string;
   url?: string;
   content: string;
@@ -61,11 +114,10 @@ export interface FirecrawlDocument {
   createdAt?: Date;
   updatedAt?: Date;
   type?: string;
-  metadata: FirecrawlDocumentMetadata;
+  metadata: FirecrawlDocumentMetadataV0;
   childrenLinks?: string[];
   provider?: string;
   warning?: string;
-
   index?: number;
 }
 
@@ -74,17 +126,29 @@ export interface FirecrawlDocument {
  */
 export interface ScrapeResponse {
   success: boolean;
+  warning?: string;
   data?: FirecrawlDocument;
   error?: string;
 }
+
+/**
+ * Response interface for scraping operations on v0.
+ */
+export interface ScrapeResponseV0 {
+  success: boolean;
+  data?: FirecrawlDocumentV0;
+  error?: string;
+}
+
 /**
  * Response interface for searching operations.
 */
-export interface SearchResponse {
+export interface SearchResponseV0 {
   success: boolean;
   data?: FirecrawlDocument[];
   error?: string;
 }
+
 /**
 * Response interface for crawling operations.
 */
@@ -94,21 +158,46 @@ export interface CrawlResponse {
   data?: FirecrawlDocument[];
   error?: string;
 }
+
+/**
+ * Response interface for crawling operations on v0.
+ */
+export interface CrawlResponseV0 {
+  success: boolean;
+  jobId?: string;
+  data?: FirecrawlDocument[];
+  error?: string;
+}
+
 /**
 * Response interface for job status checks.
 */
 export interface JobStatusResponse {
+  success: boolean;
+  totalCount: number;
+  creditsUsed: number;
+  expiresAt: Date;
+  status: "scraping" | "completed" | "failed";
+  next: string;
+  data?: FirecrawlDocument[];
+  error?: string;
+}
+
+/**
+ * Response interface for job status checks on v0.
+ */
+export interface JobStatusResponseV0 {
   success: boolean;
   status: string;
   current?: number;
   current_url?: string;
   current_step?: string;
   total?: number;
-  jobId?: string;
-  data?: FirecrawlDocument[];
-  partial_data?: FirecrawlDocument[];
+  data?: FirecrawlDocumentV0[];
+  partial_data?: FirecrawlDocumentV0[];
   error?: string;
 }
+
 /**
 * Generic parameter interface.
 */
@@ -126,14 +215,16 @@ export interface Params {
 export default class FirecrawlApp {
   private apiKey: string;
   private apiUrl: string;
+  private version: "v0" | "v1";
 
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
    */
-  constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
+  constructor({ apiKey = null, apiUrl = null, version = "v1" }: FirecrawlAppConfig) {
     this.apiKey = apiKey || "";
     this.apiUrl = apiUrl || "https://api.firecrawl.dev";
+    this.version = version;
     if (!this.apiKey) {
       throw new Error("No API key provided");
     }
@@ -143,12 +234,17 @@ export default class FirecrawlApp {
    * Scrapes a URL using the Firecrawl API.
    * @param {string} url - The URL to scrape.
    * @param {Params | null} params - Additional parameters for the scrape request.
-   * @returns {Promise<ScrapeResponse>} The response from the scrape operation.
+   * @returns {Promise<ScrapeResponse | ScrapeResponseV0>} The response from the scrape operation.
    */
   async scrapeUrl(
     url: string,
-    params: Params | null = null
-  ): Promise<ScrapeResponse> {
+    params: Params | null = null,
+    version: "v0" | "v1" = "v1"
+  ): Promise<ScrapeResponse | ScrapeResponseV0> {
+    if (version) {
+      this.version = version;
+    }
+
     const headers: AxiosRequestHeaders = {
       "Content-Type": "application/json",
       Authorization: `Bearer ${this.apiKey}`,
@@ -171,7 +267,7 @@ export default class FirecrawlApp {
     }
     try {
       const response: AxiosResponse = await axios.post(
-        this.apiUrl + "/v0/scrape",
+        this.apiUrl + `/${this.version}/scrape`,
         jsonData,
         { headers }
       );
@@ -200,7 +296,11 @@ export default class FirecrawlApp {
   async search(
     query: string,
     params: Params | null = null
-  ): Promise<SearchResponse> {
+  ): Promise<SearchResponseV0> {
+    if (this.version === "v1") {
+      throw new Error("Search is not supported in v1");
+    }
+
     const headers: AxiosRequestHeaders = {
       "Content-Type": "application/json",
       Authorization: `Bearer ${this.apiKey}`,
@@ -245,8 +345,13 @@ export default class FirecrawlApp {
     params: Params | null = null,
     waitUntilDone: boolean = true,
     pollInterval: number = 2,
-    idempotencyKey?: string
-  ): Promise<CrawlResponse | any> {
+    idempotencyKey?: string,
+    version: "v0" | "v1" = "v1"
+  ): Promise<CrawlResponse | CrawlResponseV0 | JobStatusResponse | JobStatusResponseV0> {
+    if (version) {
+      this.version = version;
+    }
+
     const headers = this.prepareHeaders(idempotencyKey);
     let jsonData: Params = { url };
     if (params) {
       jsonData = { ...jsonData, ...params };
     }
     try {
       const response: AxiosResponse = await this.postRequest(
-        this.apiUrl + "/v0/crawl",
+        this.apiUrl + `/${this.version}/crawl`,
         jsonData,
         headers
       );
@@ -278,13 +383,15 @@ export default class FirecrawlApp {
   /**
    * Checks the status of a crawl job using the Firecrawl API.
    * @param {string} jobId - The job ID of the crawl operation.
-   * @returns {Promise<JobStatusResponse>} The response containing the job status.
+   * @returns {Promise<JobStatusResponse | JobStatusResponseV0>} The response containing the job status.
    */
-  async checkCrawlStatus(jobId: string): Promise<JobStatusResponse> {
+  async checkCrawlStatus(jobId: string): Promise<JobStatusResponse | JobStatusResponseV0> {
     const headers: AxiosRequestHeaders = this.prepareHeaders();
     try {
       const response: AxiosResponse = await this.getRequest(
-        this.apiUrl + `/v0/crawl/status/${jobId}`,
+        this.version == 'v1' ?
+          this.apiUrl + `/${this.version}/crawl/${jobId}` :
+          this.apiUrl + `/${this.version}/crawl/status/${jobId}`,
         headers
       );
       if (response.status === 200) {

From c16437e933133764d0917cc1eea397985d35969e Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 7 Aug 2024 17:05:18 -0300
Subject: [PATCH 03/16] fixed bunch of types

---
 .../__tests__/v1/e2e_withAuth/index.test.ts   | 204 ++++++++++++++++--
 apps/js-sdk/firecrawl/src/index.ts            |   4 +-
 2 files changed, 186 insertions(+), 22 deletions(-)

diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
index 9042d02e..cf6181fe 100644
--- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
+++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
@@ -30,40 +30,84 @@ describe('FirecrawlApp E2E Tests', () => {
     const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
     const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;
     expect(response).not.toBeNull();
-    expect(response.data?.content).toContain("_Roast_");
+    expect(response.data?.markdown).toContain("_Roast_");
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response for valid scrape', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
     const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;
     expect(response).not.toBeNull();
-    expect(response.data?.content).toContain("_Roast_");
+    expect(response.data).not.toHaveProperty('content'); // v0
+    expect(response.data).not.toHaveProperty('html');
+    expect(response.data).not.toHaveProperty('rawHtml');
+    expect(response.data).not.toHaveProperty('screenshot');
+    expect(response.data).not.toHaveProperty('links');
+
+    expect(response.data).toHaveProperty('markdown');
     expect(response.data).toHaveProperty('metadata');
-    expect(response.data).not.toHaveProperty('html');
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response with valid API key and include HTML', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } }) as ScrapeResponse;
+    const response = await app.scrapeUrl(
+      'https://roastmywebsite.ai', {
+        formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
+        headers: { "x-key": "test" },
+        includeTags: ['h1'],
+        excludeTags: ['h2'],
+        onlyMainContent: true,
+        timeout: 30000,
+        waitFor: 1000
+    }) as ScrapeResponse;
     expect(response).not.toBeNull();
+    expect(response.data).not.toHaveProperty('content'); // v0
     expect(response.data?.markdown).toContain("_Roast_");
     expect(response.data?.html).toContain("<h1");
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response for valid scrape with PDF file', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
     const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponse;
     expect(response).not.toBeNull();
-    expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+    expect(response.data?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
     const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponse;
     expect(response).not.toBeNull();
-    expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+    expect(response.data?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should throw error for invalid API key on crawl', async () => {
@@ -79,19 +123,102 @@ describe('FirecrawlApp E2E Tests', () => {
 
   test.concurrent('should return successful response for crawl and wait for completion', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30) as JobStatusResponse;
+    const response = await app.crawlUrl('https://roastmywebsite.ai', {}, true, 30) as JobStatusResponse;
     expect(response).not.toBeNull();
-    expect(response.data?.[0].content).toContain("_Roast_");
+
+    expect(response).toHaveProperty("totalCount");
+    expect(response.totalCount).toBeGreaterThan(0);
+    expect(response).toHaveProperty("creditsUsed");
+    expect(response.creditsUsed).toBeGreaterThan(0);
+    expect(response).toHaveProperty("expiresAt");
+    expect(response.expiresAt).toBeGreaterThan(Date.now());
+    expect(response).toHaveProperty("status");
+    expect(response.status).toBe("completed");
+    expect(response).toHaveProperty("next");
+    expect(response.next).toBeDefined();
+    expect(response.data?.length).toBeGreaterThan(0);
+    expect(response.data?.[0]).toHaveProperty("markdown");
+    expect(response.data?.[0].markdown).toContain("_Roast_");
+    expect(response.data?.[0]).not.toHaveProperty('content'); // v0
+    expect(response.data?.[0].markdown).toContain("_Roast_");
+    expect(response.data?.[0]).not.toHaveProperty("html");
+    expect(response.data?.[0]).not.toHaveProperty("rawHtml");
+    expect(response.data?.[0]).not.toHaveProperty("screenshot");
+    expect(response.data?.[0]).not.toHaveProperty("links");
+
+    expect(response.data?.[0]).toHaveProperty("metadata");
+    expect(response.data?.[0].metadata).toHaveProperty("title");
+    expect(response.data?.[0].metadata).toHaveProperty("description");
+    expect(response.data?.[0].metadata).toHaveProperty("language");
+    expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
+    expect(response.data?.[0].metadata).toHaveProperty("statusCode");
+    expect(response.data?.[0].metadata).toHaveProperty("error");
   }, 60000); // 60 seconds timeout
 
+  test.concurrent('should return successful response for crawl and wait for completion', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.crawlUrl('https://roastmywebsite.ai', {
+      crawlerOptions: {
+        excludePaths: ['blog/*'],
+        includePaths: ['/'],
+        maxDepth: 2,
+        ignoreSitemap: true,
+        limit: 10,
+        allowBackwardLinks: true,
+        allowExternalLinks: true,
+      },
+      pageOptions: {
+        formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
+        headers: { "x-key": "test" },
+        includeTags: ['h1'],
+        excludeTags: ['h2'],
+        onlyMainContent: true,
+        timeout: 30000,
+        waitFor: 1000
+      }
+    }, true, 30) as JobStatusResponse;
+    expect(response).not.toBeNull();
+    expect(response).toHaveProperty("totalCount");
+    expect(response.totalCount).toBeGreaterThan(0);
+    expect(response).toHaveProperty("creditsUsed");
+    expect(response.creditsUsed).toBeGreaterThan(0);
+    expect(response).toHaveProperty("expiresAt");
+    expect(response.expiresAt).toBeGreaterThan(Date.now());
+    expect(response).toHaveProperty("status");
+    expect(response.status).toBe("completed");
+    expect(response).toHaveProperty("next");
+    expect(response.next).toContain("/v1/crawl/");
+    expect(response.data?.length).toBeGreaterThan(0);
+    expect(response.data?.[0]).toHaveProperty("markdown");
+    expect(response.data?.[0].markdown).toContain("_Roast_");
+    expect(response.data?.[0]).not.toHaveProperty('content'); // v0
+    expect(response.data?.[0].markdown).toContain("_Roast_");
+    expect(response.data?.[0]).toHaveProperty("html");
+    expect(response.data?.[0].html).toContain("<h1");
+  }, 60000); // 60 seconds timeout
 
   test.concurrent('should handle idempotency key for crawl', async () => {
     const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
     const uniqueIdempotencyKey = uuidv4();
-    const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey) as CrawlResponse;
+    const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse;
     expect(response).not.toBeNull();
     expect(response.jobId).toBeDefined();
 
-    await expect(app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
+    await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
   });
 
   test.concurrent('should check crawl status', async () => {
@@ -104,19 +231,56 @@ describe('FirecrawlApp E2E Tests', () => {
     const maxChecks = 15;
     let checks = 0;
 
-    while (statusResponse.status === 'active' && checks < maxChecks) {
+    while (statusResponse.status === 'scraping' && checks < maxChecks) {
       await new Promise(resolve => setTimeout(resolve, 1000));
-      expect(statusResponse.partial_data).not.toBeNull();
-      expect(statusResponse.current).toBeGreaterThanOrEqual(1);
+      expect(statusResponse).not.toHaveProperty("partial_data"); // v0
+      expect(statusResponse).not.toHaveProperty("current"); // v0
+      expect(statusResponse).toHaveProperty("data");
+      expect(statusResponse).toHaveProperty("totalCount");
+      expect(statusResponse).toHaveProperty("creditsUsed");
+      expect(statusResponse).toHaveProperty("expiresAt");
+      expect(statusResponse).toHaveProperty("status");
+      expect(statusResponse).toHaveProperty("next");
+      expect(statusResponse.totalCount).toBeGreaterThan(0);
+      expect(statusResponse.creditsUsed).toBeGreaterThan(0);
+      expect(statusResponse.expiresAt).toBeGreaterThan(Date.now());
+      expect(statusResponse.status).toBe("scraping");
+      expect(statusResponse.next).toContain("/v1/crawl/");
       statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponse;
       checks++;
     }
 
-    expect(statusResponse).not.toBeNull();
-    expect(statusResponse.success).toBe(true);
-    expect(statusResponse.status).toBe('completed');
-    expect(statusResponse.total).toEqual(statusResponse.current);
-    expect(statusResponse.current_step).not.toBeNull();
-    expect(statusResponse?.data?.length).toBeGreaterThan(0);
+    expect(response).not.toBeNull();
+    expect(response).toHaveProperty("totalCount");
+    expect(response.totalCount).toBeGreaterThan(0);
+    expect(response).toHaveProperty("creditsUsed");
+    expect(response.creditsUsed).toBeGreaterThan(0);
+    expect(response).toHaveProperty("expiresAt");
+    expect(response.expiresAt).toBeGreaterThan(Date.now());
+    expect(response).toHaveProperty("status");
+    expect(response.status).toBe("completed");
+    expect(response).toHaveProperty("next");
+    expect(response.next).toContain("/v1/crawl/");
+    expect(response.data?.length).toBeGreaterThan(0);
+    expect(response.data?.[0]).toHaveProperty("markdown");
+    expect(response.data?.[0].markdown).toContain("_Roast_");
+    expect(response.data?.[0]).not.toHaveProperty('content'); // v0
+    expect(response.data?.[0].markdown).toContain("_Roast_");
+    expect(response.data?.[0]).toHaveProperty("html");
+    expect(response.data?.[0].html).toContain("<h1");
+  }, 35000); // 35 seconds timeout
 });

From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Thu, 8 Aug 2024 11:41:13 -0300
Subject: [PATCH 04/16] typescript fixes

---
 apps/api/src/controllers/v1/crawl.ts  |   2 +-
 apps/api/src/controllers/v1/scrape.ts |   3 +-
 apps/js-sdk/example.js                |  74 +----
 apps/js-sdk/example.ts                |  74 +----
 apps/js-sdk/exampleV0.js              |  85 ++++++
 apps/js-sdk/exampleV0.ts              |  95 +++++++
 apps/js-sdk/firecrawl/src/index.ts    | 378 +++++++++++++++++---------
 apps/js-sdk/firecrawl/tsconfig.json   |   6 +-
 8 files changed, 458 insertions(+), 259 deletions(-)
 create mode 100644 apps/js-sdk/exampleV0.js
 create mode 100644 apps/js-sdk/exampleV0.ts

diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts
index a00ad7ca..514a2cd3 100644
--- a/apps/api/src/controllers/v1/crawl.ts
+++ b/apps/api/src/controllers/v1/crawl.ts
@@ -26,7 +26,7 @@ export async function crawlController(req: Request, res: Response) {
   //     limit: number
   //     allowBackwardLinks: boolean >> TODO: CHANGE THIS NAME???
   //     allowExternalLinks: boolean
-  //     ignoreSitemap: number
+  //     ignoreSitemap: boolean
   //   }
   //   scrapeOptions: Exclude
   // }

diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts
index bf529ad2..30c13dca 100644
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@@ -36,12 +36,13 @@ export async function scrapeController(req: Request, res: Response) {
   //   headers: {
   //     "x-key": "test"
   //   },
-  //   formats: ["markdown", "html", "rawHtml", "content", "linksOnPage", "screenshot", "fullPageScreenshot"],
+  //   formats: ["markdown", "html", "rawHtml", "content", "links", "screenshot"],
   //   includeTags: ["test"],
   //   excludeTags: ["test"],
   //   onlyMainContent: false,
   //   timeout: 30000,
   //   waitFor: number
+  //   screenshotMode: "desktop" | "full-desktop" | "mobile" | "full-mobile";
   // }
 
   try {

diff --git a/apps/js-sdk/example.js b/apps/js-sdk/example.js
index 7f198598..b4ee7747 100644
--- a/apps/js-sdk/example.js
+++ b/apps/js-sdk/example.js
@@ -1,16 +1,16 @@
-import { v4 as uuidv4 } from 'uuid';
-import FirecrawlApp from '@mendable/firecrawl-js';
-import { z } from "zod";
+import FirecrawlApp from './firecrawl/src/index'; //'@mendable/firecrawl-js';
 
 const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
 
 // Scrape a website:
 const scrapeResult = await app.scrapeUrl('firecrawl.dev');
-console.log(scrapeResult.data.content)
+
+if (scrapeResult.data) {
+  console.log(scrapeResult.data.markdown)
+}
 
 // Crawl a website:
-const idempotencyKey = uuidv4(); // optional
-const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
+const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
 console.log(crawlResult)
 
 const jobId = await crawlResult['jobId'];
@@ -19,67 +19,15 @@ console.log(jobId);
 let job;
 while (true) {
   job = await app.checkCrawlStatus(jobId);
-  if (job.status == 'completed') {
+  if (job.status === 'completed') {
     break;
   }
   await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
 }
 
-console.log(job.data[0].content);
-
-// Search for a query:
-const query = 'what is mendable?'
-const searchResult = await app.search(query)
-console.log(searchResult)
-
-// LLM Extraction:
-// Define schema to extract contents into using zod schema
-const zodSchema = z.object({
-  top: z
-    .array(
-      z.object({
-        title: z.string(),
-        points: z.number(),
-        by: z.string(),
-        commentsURL: z.string(),
-      })
-    )
-    .length(5)
-    .describe("Top 5 stories on Hacker News"),
-});
-
-let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
-  extractorOptions: { extractionSchema: zodSchema },
-});
-
-console.log(llmExtractionResult.data.llm_extraction);
-
-// Define schema to extract contents into using json schema
-const jsonSchema = {
-  "type": "object",
-  "properties": {
-    "top": {
-      "type": "array",
-      "items": {
-        "type": "object",
-        "properties": {
-          "title": {"type": "string"},
-          "points": {"type": "number"},
-          "by": {"type": "string"},
-          "commentsURL": {"type": "string"}
-        },
-        "required": ["title", "points", "by", "commentsURL"]
-      },
-      "minItems": 5,
-      "maxItems": 5,
-      "description": "Top 5 stories on Hacker News"
-    }
-  },
-  "required": ["top"]
+if (job.data) {
+  console.log(job.data[0].markdown);
 }
-llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
-  extractorOptions: { extractionSchema: jsonSchema },
-});
-
-console.log(llmExtractionResult.data.llm_extraction);
\ No newline at end of file
+const mapResult = await app.map('https://firecrawl.dev');
+console.log(mapResult)

diff --git a/apps/js-sdk/example.ts b/apps/js-sdk/example.ts
index f314c080..056695b8 100644
--- a/apps/js-sdk/example.ts
+++ b/apps/js-sdk/example.ts
@@ -1,5 +1,5 @@
-import FirecrawlApp, { JobStatusResponse } from './firecrawl/src/index' //'@mendable/firecrawl-js';
-import { z } from "zod";
+import FirecrawlApp from './firecrawl/src/index' //'@mendable/firecrawl-js';
+import { CrawlStatusResponse } from './firecrawl/src/index';
 
 const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
 
@@ -7,7 +7,7 @@ const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
 const scrapeResult = await app.scrapeUrl('firecrawl.dev');
 
 if (scrapeResult.data) {
-  console.log(scrapeResult.data.content)
+  console.log(scrapeResult.data.markdown)
 }
 
 // Crawl a website:
@@ -17,9 +17,9 @@ console.log(crawlResult)
 const jobId: string = await crawlResult['jobId'];
 console.log(jobId);
 
-let job: JobStatusResponse;
+let job: CrawlStatusResponse;
 while (true) {
-  job = await app.checkCrawlStatus(jobId);
+  job = await app.checkCrawlStatus(jobId) as CrawlStatusResponse;
   if (job.status === 'completed') {
     break;
   }
@@ -27,66 +27,8 @@ while (true) {
 }
 
 if (job.data) {
-  console.log(job.data[0].content);
-}
-
-// Search for a query:
-const query = 'what is mendable?'
-const searchResult = await app.search(query)
-
-// LLM Extraction:
-// Define schema to extract contents into using zod schema
-const zodSchema = z.object({
-  top: z
-    .array(
-      z.object({
-        title: z.string(),
-        points: z.number(),
-        by: z.string(),
-        commentsURL: z.string(),
-      })
-    )
-    .length(5)
-    .describe("Top 5 stories on Hacker News"),
-});
-
-let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
-  extractorOptions: { extractionSchema: zodSchema },
-});
-
-if (llmExtractionResult.data) {
-  console.log(llmExtractionResult.data.llm_extraction);
-}
-
-// Define schema to extract contents into using json schema
-const jsonSchema = {
-  "type": "object",
-  "properties": {
-    "top": {
-      "type": "array",
-      "items": {
-        "type": "object",
-        "properties": {
-          "title": {"type": "string"},
-          "points": {"type": "number"},
-          "by": {"type": "string"},
-          "commentsURL": {"type": "string"}
-        },
-        "required": ["title", "points", "by", "commentsURL"]
-      },
-      "minItems": 5,
-      "maxItems": 5,
-      "description": "Top 5 stories on Hacker News"
-    }
-  },
-  "required": ["top"]
-}
-
-llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
-  extractorOptions: { extractionSchema: jsonSchema },
-});
-
-if (llmExtractionResult.data) {
-  console.log(llmExtractionResult.data.llm_extraction);
+  console.log(job.data[0].markdown);
 }
 
+const mapResult = await app.map('https://firecrawl.dev');
+console.log(mapResult)

diff --git a/apps/js-sdk/exampleV0.js b/apps/js-sdk/exampleV0.js
new file mode 100644
index 00000000..7f198598
--- /dev/null
+++ b/apps/js-sdk/exampleV0.js
@@ -0,0 +1,85 @@
+import { v4 as uuidv4 } from 'uuid';
+import FirecrawlApp from '@mendable/firecrawl-js';
+import { z } from "zod";
+
+const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
+
+// Scrape a website:
+const scrapeResult = await app.scrapeUrl('firecrawl.dev');
+console.log(scrapeResult.data.content)
+
+// Crawl a website:
+const idempotencyKey = uuidv4(); // optional
+const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
+console.log(crawlResult)
+
+const jobId = await crawlResult['jobId'];
+console.log(jobId);
+
+let job;
+while (true) {
+  job = await app.checkCrawlStatus(jobId);
+  if (job.status == 'completed') {
+    break;
+  }
+  await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
+}
+
+console.log(job.data[0].content);
+
+// Search for a query:
+const query = 'what is mendable?'
+const searchResult = await app.search(query)
+console.log(searchResult)
+
+// LLM Extraction:
+// Define schema to extract contents into using zod schema
+const zodSchema = z.object({
+  top: z
+    .array(
+      z.object({
+        title: z.string(),
+        points: z.number(),
+        by: z.string(),
+        commentsURL: z.string(),
+      })
+    )
+    .length(5)
+    .describe("Top 5 stories on Hacker News"),
+});
+
+let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: zodSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);
+
+// Define schema to extract contents into using json schema
+const jsonSchema = {
+  "type": "object",
+  "properties": {
+    "top": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "title": {"type": "string"},
+          "points": {"type": "number"},
+          "by": {"type": "string"},
+          "commentsURL": {"type": "string"}
+        },
+        "required": ["title", "points", "by", "commentsURL"]
+      },
+      "minItems": 5,
+      "maxItems": 5,
+      "description": "Top 5 stories on Hacker News"
+    }
+  },
+  "required": ["top"]
+}
+
+llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: jsonSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);
\ No newline at end of file

diff --git a/apps/js-sdk/exampleV0.ts b/apps/js-sdk/exampleV0.ts
new file mode 100644
index 00000000..58c46b6a
--- /dev/null
+++ b/apps/js-sdk/exampleV0.ts
@@ -0,0 +1,95 @@
+import FirecrawlApp, { ScrapeResponseV0, CrawlStatusResponseV0, SearchResponseV0 } from './firecrawl/src/index' //'@mendable/firecrawl-js';
+import { z } from "zod";
+
+const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY", version: "v0"});
+
+// Scrape a website:
+const scrapeResult = await app.scrapeUrl('firecrawl.dev') as ScrapeResponseV0;
+
+if (scrapeResult.data) {
+  console.log(scrapeResult.data.content)
+}
+
+// Crawl a website:
+const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
+console.log(crawlResult)
+
+const jobId: string = await crawlResult['jobId'];
+console.log(jobId);
+
+let job: CrawlStatusResponseV0;
+while (true) {
+  job = await app.checkCrawlStatus(jobId) as CrawlStatusResponseV0;
+  if (job.status === 'completed') {
+    break;
+  }
+  await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
+}
+
+if (job.data) {
+  console.log(job.data[0].content);
+}
+
+// Search for a query:
+const query = 'what is mendable?'
+const searchResult = await app.search(query) as SearchResponseV0;
+if (searchResult.data) {
+  console.log(searchResult.data[0].content)
+}
+
+// LLM Extraction:
+// Define schema to extract contents into using zod schema
+const zodSchema = z.object({
+  top: z
+    .array(
+      z.object({
+        title: z.string(),
+        points: z.number(),
+        by: z.string(),
+        commentsURL: z.string(),
+      })
+    )
+    .length(5)
+    .describe("Top 5 stories on Hacker News"),
+});
+
+let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: zodSchema },
+});
+
+if (llmExtractionResult.data) {
+  console.log(llmExtractionResult.data[0].llm_extraction);
+}
+
+// Define schema to extract contents into using json schema
+const jsonSchema = {
+  "type": "object",
+  "properties": {
+    "top": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "title": {"type": "string"},
+          "points": {"type": "number"},
+          "by": {"type": "string"},
+          "commentsURL": {"type": "string"}
+        },
+        "required": ["title", "points", "by", "commentsURL"]
+      },
+      "minItems": 5,
+      "maxItems": 5,
+      "description": "Top 5 stories on Hacker News"
+    }
+  },
+  "required": ["top"]
+}
+
+llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: jsonSchema },
+});
+
+if (llmExtractionResult.data) {
+  console.log(llmExtractionResult.data[0].llm_extraction);
+}
+

diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 504efdfa..cbec3644 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -1,8 +1,12 @@
 import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
 import { z } from "zod";
 import { zodToJsonSchema } from "zod-to-json-schema";
+
 /**
  * Configuration interface for FirecrawlApp.
+ * @param apiKey - Optional API key for authentication.
+ * @param apiUrl - Optional base URL of the API; defaults to 'https://api.firecrawl.dev'.
+ * @param version - API version, either 'v0' or 'v1'.
 */
 export interface FirecrawlAppConfig {
   apiKey?: string | null;
   apiUrl?: string | null;
@@ -12,6 +16,7 @@ export interface FirecrawlAppConfig {
 
 /**
 * Metadata for a Firecrawl document.
+ * Includes various optional properties for document metadata.
 */
 export interface FirecrawlDocumentMetadata {
   title?: string;
@@ -46,43 +51,15 @@ export interface FirecrawlDocumentMetadata {
   sourceURL?: string;
   statusCode?: number;
   error?: string;
-  [key: string]: any;
+  [key: string]: any; // Allows for additional metadata properties not explicitly defined.
 }
 
 /**
 * Metadata for a Firecrawl document on v0.
+ * Similar to FirecrawlDocumentMetadata but includes properties specific to API version v0.
 */
 export interface FirecrawlDocumentMetadataV0 {
-  title?: string;
-  description?: string;
-  language?: string;
-  keywords?: string;
-  robots?: string;
-  ogTitle?: string;
-  ogDescription?: string;
-  ogUrl?: string;
-  ogImage?: string;
-  ogAudio?: string;
-  ogDeterminer?: string;
-  ogLocale?: string;
-  ogLocaleAlternate?: string[];
-  ogSiteName?: string;
-  ogVideo?: string;
-  dctermsCreated?: string;
-  dcDateCreated?: string;
-  dcDate?: string;
-  dctermsType?: string;
-  dcType?: string;
-  dctermsAudience?: string;
-  dctermsSubject?: string;
-  dcSubject?: string;
-  dcDescription?: string;
-  dctermsKeywords?: string;
-  modifiedTime?: string;
-  publishedTime?: string;
-  articleTag?: string;
-  articleSection?: string;
-  sourceURL?: string;
+  // Similar properties as FirecrawlDocumentMetadata with additional v0 specific adjustments
   pageStatusCode?: number;
   pageError?: string;
   [key: string]: any;
@@ -90,6 +67,7 @@ export interface FirecrawlDocumentMetadataV0 {
 
 /**
 * Document interface for Firecrawl.
+ * Represents a document retrieved or processed by Firecrawl.
 */
 export interface FirecrawlDocument {
   url?: string;
@@ -103,6 +81,7 @@ export interface FirecrawlDocument {
 
 /**
 * Document interface for Firecrawl on v0.
+ * Represents a document specifically for API version v0 with additional properties.
 */
 export interface FirecrawlDocumentV0 {
   id?: string;
@@ -121,8 +100,49 @@ export interface FirecrawlDocumentV0 {
   index?: number;
 }
 
+/**
+ * Parameters for scraping operations.
+ * Defines the options and configurations available for scraping web content.
+ */
+export interface ScrapeParams {
+  formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot")[];
+  headers?: Record<string, string>;
+  includeTags?: string[];
+  excludeTags?: string[];
+  onlyMainContent?: boolean;
+  screenshotMode?: "desktop" | "full-desktop" | "mobile" | "full-mobile";
+  waitFor?: number;
+  timeout?: number;
+}
+
+/**
+ * Parameters for scraping operations on v0.
+ * Includes page and extractor options specific to API version v0.
+ */
+export interface ScrapeParamsV0 {
+  pageOptions?: {
+    headers?: Record<string, string>;
+    includeHtml?: boolean;
+    includeRawHtml?: boolean;
+    onlyIncludeTags?: string[];
+    onlyMainContent?: boolean;
+    removeTags?: string[];
+    replaceAllPathsWithAbsolutePaths?: boolean;
+    screenshot?: boolean;
+    fullPageScreenshot?: boolean;
+    waitFor?: number;
+  };
+  extractorOptions?: {
+    mode?: "markdown" | "llm-extraction" | "llm-extraction-from-raw-html" | "llm-extraction-from-markdown";
+    extractionPrompt?: string;
+    extractionSchema?: Record<string, any> | z.ZodSchema | any;
+  };
+  timeout?: number;
+}
+
 /**
 * Response interface for scraping operations.
+ * Defines the structure of the response received after a scraping operation.
 */
 export interface ScrapeResponse {
   success: boolean;
@@ -133,6 +153,7 @@ export interface ScrapeResponse {
 
 /**
 * Response interface for scraping operations on v0.
+ * Similar to ScrapeResponse but tailored for responses from API version v0.
 */
 export interface ScrapeResponseV0 {
   success: boolean;
@@ -141,38 +162,71 @@ export interface ScrapeResponseV0 {
 }
 
 /**
- * Response interface for searching operations.
+ * Parameters for crawling operations.
+ * Includes options for both scraping and mapping during a crawl.
 */
-export interface SearchResponseV0 {
-  success: boolean;
-  data?: FirecrawlDocument[];
-  error?: string;
+export interface CrawlParams {
+  scrapeOptions?: ScrapeParams;
+  crawlerOptions?: MapParams;
+}
+
+/**
+ * Parameters for crawling operations on v0.
+ * Tailored for API version v0, includes specific options for crawling.
+ */
+export interface CrawlParamsV0 {
+  crawlerOptions?: {
+    includes?: string[];
+    excludes?: string[];
+    generateImgAltText?: boolean;
+    returnOnlyUrls?: boolean;
+    maxDepth?: number;
+    mode?: "default" | "fast";
+    ignoreSitemap?: boolean;
+    limit?: number;
+    allowBackwardCrawling?: boolean;
+    allowExternalContentLinks?: boolean;
+  };
+  pageOptions?: {
+    headers?: Record<string, string>;
+    includeHtml?: boolean;
+    includeRawHtml?: boolean;
+    onlyIncludeTags?: string[];
+    onlyMainContent?: boolean;
+    removeTags?: string[];
+    replaceAllPathsWithAbsolutePaths?: boolean;
+    screenshot?: boolean;
+    fullPageScreenshot?: boolean;
+    waitFor?: number;
+  };
+}
 
 /**
 * Response interface for crawling operations.
+ * Defines the structure of the response received after initiating a crawl.
 */
 export interface CrawlResponse {
-  success: boolean;
   jobId?: string;
-  data?: FirecrawlDocument[];
+  url?: string;
+  success: boolean;
   error?: string;
 }
 
 /**
 * Response interface for crawling operations on v0.
+ * Similar to CrawlResponse but tailored for responses from API version v0.
 */
 export interface CrawlResponseV0 {
-  success: boolean;
   jobId?: string;
-  data?: FirecrawlDocument[];
+  success: boolean;
   error?: string;
 }
 
 /**
 * Response interface for job status checks.
+ * Provides detailed status of a crawl job including progress and results.
 */
-export interface JobStatusResponse {
+export interface CrawlStatusResponse {
   success: boolean;
   totalCount: number;
   creditsUsed: number;
   expiresAt: Date;
@@ -185,8 +239,9 @@ export interface CrawlStatusResponse {
   status: "scraping" | "completed" | "failed";
   next: string;
   data?: FirecrawlDocument[];
   error?: string;
 }
 
 /**
 * Response interface for job status checks on v0.
+ * Tailored for API version v0, provides status and partial data of a crawl job.
 */
-export interface JobStatusResponseV0 {
+export interface CrawlStatusResponseV0 {
   success: boolean;
   status: string;
   current?: number;
@@ -199,18 +254,58 @@ export interface CrawlStatusResponseV0 {
   current_url?: string;
   current_step?: string;
   total?: number;
   data?: FirecrawlDocumentV0[];
   partial_data?: FirecrawlDocumentV0[];
   error?: string;
 }
 
 /**
- * Generic parameter interface.
+ * Parameters for mapping operations.
+ * Defines options for mapping URLs during a crawl.
 */
-export interface Params {
-  [key: string]: any;
-  extractorOptions?: {
-    extractionSchema: z.ZodSchema | any;
-    mode?: "llm-extraction";
-    extractionPrompt?: string;
-  };
-}
+export interface MapParams {
+  includePaths?: string[]
+  excludePaths?: string[]
+  maxDepth?: number
+  limit?: number
+  allowBackwardLinks?: boolean
+  allowExternalLinks?: boolean
+  ignoreSitemap?: boolean
+}
+
+/**
+ * Response interface for mapping operations.
+ * Defines the structure of the response received after a mapping operation.
+ */
+export interface MapResponse {
+  success: boolean;
+  data?: string[];
+  error?: string;
+}
+
+/**
+ * Parameters for searching operations on v0.
+ * Tailored for API version v0, includes specific options for searching content.
+ */
+export interface SearchParamsV0 {
+  pageOptions?: {
+    onlyMainContent?: boolean;
+    fetchPageContent?: boolean;
+    includeHtml?: boolean;
+    includeRawHtml?: boolean;
+  };
+  searchOptions?: {
+    limit?: number;
+  };
+}
+
+/**
+ * Response interface for searching operations on v0.
+ * Defines the structure of the response received after a search operation on v0.
+ */
+export interface SearchResponseV0 {
+  success: boolean;
+  data?: FirecrawlDocumentV0[];
+  error?: string;
+}
+
 /**
 * Main class for interacting with the Firecrawl API.
+ * Provides methods for scraping, searching, crawling, and mapping web content.
 */
 export default class FirecrawlApp {
   private apiKey: string;
   private apiUrl: string;
   private version: "v0" | "v1";
 
   /**
    * Initializes a new instance of the FirecrawlApp class.
-   * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
+   * @param config - Configuration options for the FirecrawlApp instance.
    */
   constructor({ apiKey = null, apiUrl = null, version = "v1" }: FirecrawlAppConfig) {
     this.apiKey = apiKey || "";
@@ -232,26 +327,21 @@ export default class FirecrawlApp {
 
   /**
    * Scrapes a URL using the Firecrawl API.
-   * @param {string} url - The URL to scrape.
-   * @param {Params | null} params - Additional parameters for the scrape request.
-   * @returns {Promise<ScrapeResponse | ScrapeResponseV0>} The response from the scrape operation.
+   * @param url - The URL to scrape.
+   * @param params - Additional parameters for the scrape request.
+   * @returns The response from the scrape operation.
    */
   async scrapeUrl(
     url: string,
-    params: Params | null = null,
-    version: "v0" | "v1" = "v1"
+    params?: ScrapeParams | ScrapeParamsV0
   ): Promise<ScrapeResponse | ScrapeResponseV0> {
-    if (version) {
-      this.version = version;
-    }
-
     const headers: AxiosRequestHeaders = {
       "Content-Type": "application/json",
       Authorization: `Bearer ${this.apiKey}`,
     } as AxiosRequestHeaders;
-    let jsonData: Params = { url, ...params };
-    if (params?.extractorOptions?.extractionSchema) {
-      let schema = params.extractorOptions.extractionSchema;
+    let jsonData: any = { url, ...params };
+    if (jsonData?.extractorOptions?.extractionSchema) {
+      let schema = jsonData.extractorOptions.extractionSchema;
       // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
       if (schema instanceof z.ZodSchema) {
         schema = zodToJsonSchema(schema);
       }
       jsonData = {
         ...jsonData,
         extractorOptions: {
-          ...params.extractorOptions,
+          ...jsonData.extractorOptions,
           extractionSchema: schema,
-          mode: params.extractorOptions.mode || "llm-extraction",
+          mode: jsonData.extractorOptions.mode || "llm-extraction",
         },
       };
     }
     try {
       const response: AxiosResponse = await axios.post(
         this.apiUrl + `/${this.version}/scrape`,
         jsonData,
         { headers }
       );
       if (response.status === 200) {
         const responseData = response.data;
         if (responseData.success) {
-          return responseData;
+          if (this.version == 'v0') {
+            return responseData as ScrapeResponseV0;
+          } else {
+            return responseData as ScrapeResponse;
+          }
         } else {
           throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
         }
 
   /**
    * Searches for a query using the Firecrawl API.
-   * @param {string} query - The query to search for.
-   * @param {Params | null} params - Additional parameters for the search request.
-   * @returns {Promise} The response from the search operation.
+   * @param query - The query to search for.
+   * @param params - Additional parameters for the search request.
+   * @returns The response from the search operation.
    */
   async search(
     query: string,
-    params: Params | null = null
+    params?: SearchParamsV0
   ): Promise<SearchResponseV0> {
     if (this.version === "v1") {
       throw new Error("Search is not supported in v1");
     }
 
     const headers: AxiosRequestHeaders = {
       "Content-Type": "application/json",
       Authorization: `Bearer ${this.apiKey}`,
     } as AxiosRequestHeaders;
-    let jsonData: Params = { query };
+    let jsonData: any = { query };
     if (params) {
       jsonData = { ...jsonData, ...params };
     }
 
   /**
    * Initiates a crawl job for a URL using the Firecrawl API.
-   * @param {string} url - The URL to crawl.
- * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. + * @param url - The URL to crawl. + * @param params - Additional parameters for the crawl request. + * @param waitUntilDone - Whether to wait for the crawl job to complete. + * @param pollInterval - Time in seconds for job status checks. + * @param idempotencyKey - Optional idempotency key for the request. + * @returns The response from the crawl operation. */ async crawlUrl( url: string, - params: Params | null = null, + params?: CrawlParams | CrawlParamsV0, waitUntilDone: boolean = true, pollInterval: number = 2, - idempotencyKey?: string, - version: "v0" | "v1" = "v1" - ): Promise { - if (version) { - this.version = version; - } - + idempotencyKey?: string + ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: Params = { url }; - if (params) { - jsonData = { ...jsonData, ...params }; - } + let jsonData: any = { url, ...params }; try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/${this.version}/crawl`, @@ -382,10 +468,10 @@ export default class FirecrawlApp { /** * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. + * @param jobId - The job ID of the crawl operation. + * @returns The response containing the job status. */ - async checkCrawlStatus(jobId: string): Promise { + async checkCrawlStatus(jobId: string): Promise { const headers: AxiosRequestHeaders = this.prepareHeaders(); try { const response: AxiosResponse = await this.getRequest( @@ -395,38 +481,80 @@ export default class FirecrawlApp { headers ); if (response.status === 200) { - return { - success: true, - status: response.data.status, - current: response.data.current, - current_url: response.data.current_url, - current_step: response.data.current_step, - total: response.data.total, - data: response.data.data, - partial_data: !response.data.data - ? response.data.partial_data - : undefined, - }; + if (this.version == 'v0') { + return { + success: true, + status: response.data.status, + current: response.data.current, + current_url: response.data.current_url, + current_step: response.data.current_step, + total: response.data.total, + data: response.data.data, + partial_data: !response.data.data + ? 
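+              // v0 exposes partial_data only while the final data is absent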
response.data.partial_data + : undefined, + } as CrawlStatusResponseV0; + } else if (this.version == 'v1') { + return { + success: true, + status: response.data.status, + data: response.data.data, + error: response.data.error, + } as CrawlStatusResponse; + } } else { this.handleError(response, "check crawl status"); } } catch (error: any) { throw new Error(error.message); } - return { - success: false, - status: "unknown", - current: 0, - current_url: "", - current_step: "", - total: 0, - error: "Internal server error.", - }; + + if (this.version == 'v0') { + return { + success: false, + status: "unknown", + current: 0, + current_url: "", + current_step: "", + total: 0, + error: "Internal server error.", + } as CrawlStatusResponseV0; + } else { + return { + success: false, + error: "Internal server error.", + } as CrawlStatusResponse; + } + } + + async map(url: string, params?: MapParams): Promise { + if (this.version == 'v0') { + throw new Error("Map is not supported in v0"); + } + const headers = this.prepareHeaders(); + let jsonData: { url: string } & MapParams = { url, ...params }; + + try { + const response: AxiosResponse = await this.postRequest( + this.apiUrl + `/${this.version}/map`, + jsonData, + headers + ); + if (response.status === 200) { + return response.data as MapResponse; + } else { + this.handleError(response, "map"); + } + } catch (error: any) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." } as MapResponse; } /** * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. + * @param idempotencyKey - Optional key to ensure idempotency. + * @returns The prepared headers. */ prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders { return { @@ -438,14 +566,14 @@ export default class FirecrawlApp { /** * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. + * @param url - The URL to send the request to. + * @param data - The data to send in the request. + * @param headers - The headers for the request. + * @returns The response from the POST request. */ postRequest( url: string, - data: Params, + data: any, headers: AxiosRequestHeaders ): Promise { return axios.post(url, data, { headers }); @@ -453,9 +581,9 @@ export default class FirecrawlApp { /** * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. + * @param url - The URL to send the request to. + * @param headers - The headers for the request. + * @returns The response from the GET request. */ getRequest( url: string, @@ -466,10 +594,10 @@ export default class FirecrawlApp { /** * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. + * @param jobId - The job ID of the crawl operation. + * @param headers - The headers for the request. + * @param checkInterval - Interval in seconds for job status checks. + * @returns The final job status or data. 
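+   * Normally driven by crawlUrl; a direct call is a sketch like the following
+   * (assumes jobId came from a crawl started with waitUntilDone set to false):
+   * @example
+   * const status = await app.monitorJobStatus(jobId, app.prepareHeaders(), 2);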
*/ async monitorJobStatus( jobId: string, diff --git a/apps/js-sdk/firecrawl/tsconfig.json b/apps/js-sdk/firecrawl/tsconfig.json index d7764a46..56f13ced 100644 --- a/apps/js-sdk/firecrawl/tsconfig.json +++ b/apps/js-sdk/firecrawl/tsconfig.json @@ -11,7 +11,7 @@ // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ /* Language and Environment */ - "target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ + "target": "es2020", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ // "jsx": "preserve", /* Specify what JSX code is generated. */ // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */ @@ -25,9 +25,9 @@ // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ /* Modules */ - "module": "NodeNext", /* Specify what module code is generated. */ + "module": "commonjs", /* Specify what module code is generated. */ "rootDir": "./src", /* Specify the root folder within your source files. */ - "moduleResolution": "nodenext", /* Specify how TypeScript looks up a file from a given module specifier. */ + "moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */ // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. 
*/
From 0b8df5e264187c4c8d0f0ce7a0aad2569d646e31 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Thu, 8 Aug 2024 14:25:09 -0300
Subject: [PATCH 05/16] python sdk and tests

---
 apps/python-sdk/examplev0.py                  |  75 ++++++++
 .../firecrawl/__tests__/e2e_withAuth/test.py  |  32 ++--
 .../__tests__/v1/e2e_withAuth/.env.example    |   3 +
 .../__tests__/v1/e2e_withAuth/__init__.py     |   0
 .../test.cpython-311-pytest-8.2.1.pyc         | Bin 0 -> 44947 bytes
 .../__tests__/v1/e2e_withAuth/test.py         | 168 ++++++++++++++++++
 apps/python-sdk/firecrawl/firecrawl.py        |  75 +++++---
 7 files changed, 317 insertions(+), 36 deletions(-)
 create mode 100644 apps/python-sdk/examplev0.py
 create mode 100644 apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example
 create mode 100644 apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py
 create mode 100644 apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc
 create mode 100644 apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py

diff --git a/apps/python-sdk/examplev0.py b/apps/python-sdk/examplev0.py
new file mode 100644
index 00000000..d80fa795
--- /dev/null
+++ b/apps/python-sdk/examplev0.py
@@ -0,0 +1,75 @@
+import uuid
+from firecrawl.firecrawl import FirecrawlApp
+
+app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0")  # pin v0; the SDK defaults to v1 and this example uses v0-style options
+
+# Scrape a website:
+scrape_result = app.scrape_url('firecrawl.dev')
+print(scrape_result['markdown'])
+
+# Crawl a website:
+idempotency_key = str(uuid.uuid4()) # optional idempotency key
+crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
+print(crawl_result)
+
+# LLM Extraction:
+# Define schema to extract contents into using pydantic
+from pydantic import BaseModel, Field
+from typing import List

+class ArticleSchema(BaseModel):
+    title: str
+    points: int
+    by: str
+    commentsURL: str

+class TopArticlesSchema(BaseModel):
+    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': TopArticlesSchema.model_json_schema(),
+        'mode': 'llm-extraction'
+    },
+    'pageOptions':{
+        'onlyMainContent': True
+    }
+})

+print(llm_extraction_result['llm_extraction'])

+# Define schema to extract contents into using json schema
+json_schema = {
+  "type": "object",
+  "properties": {
+    "top": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "title": {"type": "string"},
+          "points": {"type": "number"},
+          "by": {"type": "string"},
+          "commentsURL": {"type": "string"}
+        },
+        "required": ["title", "points", "by", "commentsURL"]
+      },
+      "minItems": 5,
+      "maxItems": 5,
+      "description": "Top 5 stories on Hacker News"
+    }
+  },
+  "required": ["top"]
+}

+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': json_schema,
+        'mode': 'llm-extraction'
+    },
+    'pageOptions':{
+        'onlyMainContent': True
+    }
+})

+print(llm_extraction_result['llm_extraction'])
\ No newline at end of file
diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
index 452d4982..457c206a 100644
--- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
+++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
@@ -20,31 +20,31 @@ FirecrawlApp = firecrawl.FirecrawlApp
 
 def test_no_api_key():
     with pytest.raises(Exception) as excinfo:
-
invalid_app = FirecrawlApp(api_url=API_URL) + invalid_app = FirecrawlApp(api_url=API_URL, version='v0') assert "No API key provided" in str(excinfo.value) def test_scrape_url_invalid_api_key(): - invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0') with pytest.raises(Exception) as excinfo: invalid_app.scrape_url('https://firecrawl.dev') assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) def test_blocklisted_url(): blocklisted_url = "https://facebook.com/fake-test" - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') with pytest.raises(Exception) as excinfo: app.scrape_url(blocklisted_url) assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) def test_successful_response_with_valid_preview_token(): - app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") + app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0') response = app.scrape_url('https://roastmywebsite.ai') assert response is not None assert 'content' in response assert "_Roast_" in response['content'] def test_scrape_url_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.scrape_url('https://roastmywebsite.ai') assert response is not None assert 'content' in response @@ -54,7 +54,7 @@ def test_scrape_url_e2e(): assert "_Roast_" in response['content'] def test_successful_response_with_valid_api_key_and_include_html(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}}) assert response is not None assert 'content' in response @@ -66,7 +66,7 @@ def test_successful_response_with_valid_api_key_and_include_html(): assert " 0 @@ -104,7 +104,7 @@ def test_crawl_url_wait_for_completion_e2e(): assert "_Roast_" in response[0]['content'] def test_crawl_url_with_idempotency_key_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') uniqueIdempotencyKey = str(uuid4()) response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey) assert response is not None @@ -117,7 +117,7 @@ def test_crawl_url_with_idempotency_key_e2e(): assert "Conflict: Failed to start crawl job due to a conflict. 
Idempotency key already used" in str(excinfo.value)
 
 def test_check_crawl_status_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
     assert response is not None
     assert 'jobId' in response
@@ -131,20 +131,20 @@ def test_check_crawl_status_e2e():
     assert len(status_response['data']) > 0
 
 def test_search_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.search("test query")
     assert response is not None
     assert 'content' in response[0]
     assert len(response) > 2
 
 def test_search_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
     with pytest.raises(Exception) as excinfo:
         invalid_app.search("test query")
     assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
 def test_llm_extraction():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.scrape_url("https://mendable.ai", {
         'extractorOptions': {
             'mode': 'llm-extraction',
diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example
new file mode 100644
index 00000000..904887bf
--- /dev/null
+++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example
@@ -0,0 +1,3 @@
+API_URL=http://localhost:3002
+ABSOLUTE_FIRECRAWL_PATH=/Users/user/firecrawl/apps/python-sdk/firecrawl/firecrawl.py
+TEST_API_KEY=fc-YOUR_API_KEY
\ No newline at end of file
diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ba1f1324fe139772739cdae776d127cd5002ca8
GIT binary patch
literal 44947
[44947-byte compiled-bytecode literal omitted; the diff header and opening tests of the new apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py were lost in the same garbled span]
+    assert 'content' not in response[0]
+    assert 'markdown' in response[0]
+    assert "_Roast_" in response[0]['markdown']
+
+def test_crawl_url_with_idempotency_key_e2e():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    uniqueIdempotencyKey = str(uuid4())
+    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
+    assert response is not None
+    assert len(response) > 0
+    assert 'content' in response[0]
+    assert "_Roast_" in response[0]['content']
+
+    with pytest.raises(Exception) as excinfo:
+        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
+    assert "Conflict: Failed to start crawl job due to a conflict. 
Idempotency key already used" in str(excinfo.value) + +def test_check_crawl_status_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False) + assert response is not None + assert 'jobId' in response + + time.sleep(30) # wait for 30 seconds + status_response = app.check_crawl_status(response['jobId']) + assert status_response is not None + assert 'status' in status_response + assert status_response['status'] == 'completed' + assert 'data' in status_response + assert len(status_response['data']) > 0 + +def test_search_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + with pytest.raises(NotImplementedError) as excinfo: + app.search("test query") + assert "Search is not supported in v1" in str(excinfo.value) + +def test_llm_extraction(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.scrape_url("https://mendable.ai", { + 'extractorOptions': { + 'mode': 'llm-extraction', + 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + 'extractionSchema': { + 'type': 'object', + 'properties': { + 'company_mission': {'type': 'string'}, + 'supports_sso': {'type': 'boolean'}, + 'is_open_source': {'type': 'boolean'} + }, + 'required': ['company_mission', 'supports_sso', 'is_open_source'] + } + } + }) + assert response is not None + assert 'llm_extraction' in response + llm_extraction = response['llm_extraction'] + assert 'company_mission' in llm_extraction + assert isinstance(llm_extraction['supports_sso'], bool) + assert isinstance(llm_extraction['is_open_source'], bool) + +def test_map_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") + response = app.map_url('https://roastmywebsite.ai') + assert response is not None + assert isinstance(response, list) + \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 7ec0d33f..25c9663e 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -19,24 +19,22 @@ import requests logger : logging.Logger = logging.getLogger("firecrawl") class FirecrawlApp: - """ - Initialize the FirecrawlApp instance. + def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, version: str = 'v1') -> None: + """ + Initialize the FirecrawlApp instance with API key, API URL, and version. - Args: - api_key (Optional[str]): API key for authenticating with the Firecrawl API. - api_url (Optional[str]): Base URL for the Firecrawl API. - """ - def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: - self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') - if self.api_key is None: - logger.warning("No API key provided") - raise ValueError('No API key provided') - else: - logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key) - - self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') - if self.api_url != 'https://api.firecrawl.dev': - logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url) + Args: + api_key (Optional[str]): API key for authenticating with the Firecrawl API. + api_url (Optional[str]): Base URL for the Firecrawl API. + version (str): API version, either 'v0' or 'v1'. 
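+
+        Example (a sketch; the key is a placeholder)::
+
+            app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0")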
+ """ + self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + self.version = version + if self.api_key is None: + logger.warning("No API key provided") + raise ValueError('No API key provided') + logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key} and version: {self.version}") def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ @@ -75,9 +73,11 @@ class FirecrawlApp: for key, value in params.items(): if key != 'extractorOptions': scrape_params[key] = value + + endpoint = f'/{self.version}/scrape' # Make the POST request with the prepared headers and JSON data response = requests.post( - f'{self.api_url}/v0/scrape', + f'{self.api_url}{endpoint}', headers=headers, json=scrape_params, ) @@ -104,6 +104,9 @@ class FirecrawlApp: Raises: Exception: If the search request fails. """ + if self.version == 'v1': + raise NotImplementedError("Search is not supported in v1") + headers = self._prepare_headers() json_data = {'query': query} if params: @@ -145,11 +148,12 @@ class FirecrawlApp: Raises: Exception: If the crawl job initiation or monitoring fails. """ + endpoint = f'/{self.version}/crawl' headers = self._prepare_headers(idempotency_key) json_data = {'url': url} if params: json_data.update(params) - response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers) + response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) if response.status_code == 200: job_id = response.json().get('jobId') if wait_until_done: @@ -172,13 +176,44 @@ class FirecrawlApp: Raises: Exception: If the status check request fails. """ + endpoint = f'/{self.version}/crawl/status/{job_id}' headers = self._prepare_headers() - response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) + response = self._get_request(f'{self.api_url}{endpoint}', headers) if response.status_code == 200: return response.json() else: self._handle_error(response, 'check crawl status') + def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: + """ + Perform a map search using the Firecrawl API. + """ + if self.version == 'v0': + raise NotImplementedError("Map is not supported in v0") + + endpoint = f'/{self.version}/map' + headers = self._prepare_headers() + + # Prepare the base scrape parameters with the URL + json_data = {'url': url} + if params: + json_data.update(params) + + # Make the POST request with the prepared headers and JSON data + response = requests.post( + f'{self.api_url}{endpoint}', + headers=headers, + json=json_data, + ) + if response.status_code == 200: + response = response.json() + if response['success'] and 'data' in response: + return response['data'] + else: + raise Exception(f'Failed to map URL. Error: {response["error"]}') + else: + self._handle_error(response, 'map') + def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: """ Prepare the headers for API requests. 
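
A quick sketch of the version switch added above (illustrative only; the key is
a placeholder, and the endpoints shown follow this patch):

    from firecrawl.firecrawl import FirecrawlApp

    # v0 keeps the legacy endpoints, including search
    v0_app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0")
    results = v0_app.search("test query")             # POSTs to /v0/search

    # v1 is now the default; search raises NotImplementedError there
    v1_app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
    data = v1_app.scrape_url("https://firecrawl.dev")  # POSTs to /v1/scrape
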
From 32aba4416737da7f7e9b2c88e82887cf6c0e7f94 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 19 Aug 2024 13:37:20 -0300 Subject: [PATCH 06/16] fixing merge issues --- .../src/__tests__/v1/e2e_withAuth/index.test.ts | 15 ++++++++------- apps/js-sdk/firecrawl/src/index.ts | 10 +++++++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index cf6181fe..b6f6b5e9 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -1,4 +1,4 @@ -import FirecrawlApp, { CrawlResponse, JobStatusResponse, ScrapeResponse } from '../../../index'; +import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, ScrapeParams, ScrapeResponse } from '../../../index'; import { v4 as uuidv4 } from 'uuid'; import dotenv from 'dotenv'; import { describe, test, expect } from '@jest/globals'; @@ -80,8 +80,10 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.metadata).toHaveProperty("ogLocaleAlternate"); expect(response.data?.metadata).toHaveProperty("ogSiteName"); expect(response.data?.metadata).toHaveProperty("sourceURL"); - expect(response.data?.metadata).toHaveProperty("pageStatusCode"); - expect(response.data?.metadata.pageError).toBeUndefined(); + expect(response.data?.metadata).not.toHaveProperty("pageStatusCode"); + expect(response.data?.metadata).toHaveProperty("statusCode"); + expect(response.data?.metadata).not.toHaveProperty("pageError"); + expect(response.data?.metadata.error).toBeUndefined(); expect(response.data?.metadata.title).toBe("Roast My Website"); expect(response.data?.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 
🌶️"); expect(response.data?.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl"); @@ -123,9 +125,8 @@ describe('FirecrawlApp E2E Tests', () => { test.concurrent('should return successful response for crawl and wait for completion', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.crawlUrl('https://roastmywebsite.ai', {}, true, 30) as JobStatusResponse; + const response = await app.crawlUrl('https://roastmywebsite.ai', {}, true, 30) as CrawlStatusResponse; expect(response).not.toBeNull(); - expect(response).toHaveProperty("totalCount"); expect(response.totalCount).toBeGreaterThan(0); expect(response).toHaveProperty("creditsUsed"); @@ -176,7 +177,7 @@ describe('FirecrawlApp E2E Tests', () => { timeout: 30000, waitFor: 1000 } - }, true, 30) as JobStatusResponse; + } as CrawlParams, true, 30) as CrawlStatusResponse; expect(response).not.toBeNull(); expect(response).toHaveProperty("totalCount"); expect(response.totalCount).toBeGreaterThan(0); @@ -223,7 +224,7 @@ describe('FirecrawlApp E2E Tests', () => { test.concurrent('should check crawl status', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as JobStatusResponse; + const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as CrawlStatusResponse; expect(response).not.toBeNull(); expect(response.jobId).toBeDefined(); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index cbec3644..a534fff8 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -167,7 +167,15 @@ export interface ScrapeResponseV0 { */ export interface CrawlParams { scrapeOptions?: ScrapeParams; - crawlerOptions?: MapParams; + crawlerOptions?: { + includePaths?: string[] + excludePaths?: string[] + maxDepth?: number + limit?: number + allowBackwardLinks?: boolean + allowExternalLinks?: boolean + ignoreSitemap?: boolean + }; } /** From e160d5529a238a800c5109706fb13cdf5fe3208e Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 20 Aug 2024 09:22:38 -0300 Subject: [PATCH 07/16] fixed test --- .../firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index b6f6b5e9..d911b335 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -95,7 +95,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.metadata.ogLocaleAlternate).toStrictEqual([]); expect(response.data?.metadata.ogSiteName).toBe("Roast My Website"); expect(response.data?.metadata.sourceURL).toBe("https://roastmywebsite.ai"); - expect(response.data?.metadata.pageStatusCode).toBe(200); + expect(response.data?.metadata.statusCode).toBe(200); }, 30000); // 30 seconds timeout test.concurrent('should return successful response for valid scrape with PDF file', async () => { From fa89d2e535dc3c3b57d06474dbb82efec0f3de9e Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:37:24 -0300 Subject: [PATCH 08/16] v1 
support for crawl/monitor status --- .../__tests__/v1/e2e_withAuth/index.test.ts | 14 +++++-------- apps/js-sdk/firecrawl/src/index.ts | 21 +++++++++++++------ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index d911b335..ba0bf9a6 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -132,16 +132,14 @@ describe('FirecrawlApp E2E Tests', () => { expect(response).toHaveProperty("creditsUsed"); expect(response.creditsUsed).toBeGreaterThan(0); expect(response).toHaveProperty("expiresAt"); - expect(response.expiresAt).toBeGreaterThan(Date.now()); + expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now()); expect(response).toHaveProperty("status"); expect(response.status).toBe("completed"); - expect(response).toHaveProperty("next"); - expect(response.next).toBeDefined(); + expect(response).not.toHaveProperty("next"); // wait until done expect(response.data?.length).toBeGreaterThan(0); expect(response.data?.[0]).toHaveProperty("markdown"); expect(response.data?.[0].markdown).toContain("_Roast_"); expect(response.data?.[0]).not.toHaveProperty('content'); // v0 - expect(response.data?.[0].markdown).toContain("_Roast_"); expect(response.data?.[0]).not.toHaveProperty("html"); expect(response.data?.[0]).not.toHaveProperty("rawHtml"); expect(response.data?.[0]).not.toHaveProperty("screenshot"); @@ -156,7 +154,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.[0].metadata).toHaveProperty("error"); }, 60000); // 60 seconds timeout - test.concurrent('should return successful response for crawl and wait for completion', async () => { + test.concurrent('should return successful response for crawl with options and wait for completion', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { @@ -184,16 +182,14 @@ describe('FirecrawlApp E2E Tests', () => { expect(response).toHaveProperty("creditsUsed"); expect(response.creditsUsed).toBeGreaterThan(0); expect(response).toHaveProperty("expiresAt"); - expect(response.expiresAt).toBeGreaterThan(Date.now()); + expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now()); expect(response).toHaveProperty("status"); expect(response.status).toBe("completed"); - expect(response).toHaveProperty("next"); - expect(response.next).toContain("/v1/crawl/"); + expect(response).not.toHaveProperty("next"); expect(response.data?.length).toBeGreaterThan(0); expect(response.data?.[0]).toHaveProperty("markdown"); expect(response.data?.[0].markdown).toContain("_Roast_"); expect(response.data?.[0]).not.toHaveProperty('content'); // v0 - expect(response.data?.[0].markdown).toContain("_Roast_"); expect(response.data?.[0]).toHaveProperty("html"); expect(response.data?.[0].html).toContain(" { + let apiUrl: string = ''; while (true) { + if (this.version == 'v1') { + apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${jobId}`; + } else if (this.version == 'v0') { + apiUrl = checkUrl ?? 
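+          // v0 keeps the legacy /v0/crawl/status/{jobId} polling path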
this.apiUrl + `/v0/crawl/status/${jobId}`; + } const statusResponse: AxiosResponse = await this.getRequest( - this.apiUrl + `/v0/crawl/status/${jobId}`, + apiUrl, headers ); if (statusResponse.status === 200) { const statusData = statusResponse.data; if (statusData.status === "completed") { if ("data" in statusData) { - return statusData.data; + return this.version == 'v0' ? statusData.data : statusData; } else { throw new Error("Crawl job completed but no data was returned"); } } else if ( - ["active", "paused", "pending", "queued"].includes(statusData.status) + ["active", "paused", "pending", "queued", "scraping"].includes(statusData.status) ) { if (checkInterval < 2) { checkInterval = 2; From 537fc689b66ccc5a26695febbceaffbb0dc7cab3 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:08:02 -0300 Subject: [PATCH 09/16] fixing request --- .../__tests__/v1/e2e_withAuth/index.test.ts | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index ba0bf9a6..a5060b6e 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -157,22 +157,19 @@ describe('FirecrawlApp E2E Tests', () => { test.concurrent('should return successful response for crawl with options and wait for completion', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.crawlUrl('https://roastmywebsite.ai', { - crawlerOptions: { - excludePaths: ['blog/*'], - includePaths: ['/'], - maxDepth: 2, - ignoreSitemap: true, - limit: 10, - allowBackwardLinks: true, - allowExternalLinks: true, - }, - pageOptions: { + excludePaths: ['blog/*'], + includePaths: ['/'], + maxDepth: 2, + ignoreSitemap: true, + limit: 10, + allowBackwardLinks: true, + allowExternalLinks: true, + scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], headers: { "x-key": "test" }, includeTags: ['h1'], excludeTags: ['h2'], onlyMainContent: true, - timeout: 30000, waitFor: 1000 } } as CrawlParams, true, 30) as CrawlStatusResponse; From 9d64c8eedcfd8d291b5213d69bc6a6c5e8e68a6a Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:24:45 -0300 Subject: [PATCH 10/16] screenshot should not be undefined also --- .../js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index a5060b6e..b0623b8d 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -64,6 +64,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.markdown).toContain("_Roast_"); expect(response.data?.html).toContain(" Date: Tue, 20 Aug 2024 20:00:41 -0300 Subject: [PATCH 11/16] tests passing now --- .../src/scraper/WebScraper/utils/metadata.ts | 4 +- .../__tests__/v1/e2e_withAuth/index.test.ts | 114 +++++++++++------- apps/js-sdk/firecrawl/src/index.ts | 57 ++++++--- 3 files changed, 110 insertions(+), 65 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts 
b/apps/api/src/scraper/WebScraper/utils/metadata.ts index 9496d569..fac53b38 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { description = soup('meta[name="description"]').attr("content") || null; // Assuming the language is part of the URL as per the regex pattern - const pattern = /([a-zA-Z]+-[A-Z]{2})/; - const match = pattern.exec(url); - language = match ? match[1] : null; + language = soup('html').attr('lang') || null; keywords = soup('meta[name="keywords"]').attr("content") || null; robots = soup('meta[name="robots"]').attr("content") || null; diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index b0623b8d..724996bc 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -1,4 +1,4 @@ -import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, ScrapeParams, ScrapeResponse } from '../../../index'; +import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index'; import { v4 as uuidv4 } from 'uuid'; import dotenv from 'dotenv'; import { describe, test, expect } from '@jest/globals'; @@ -66,6 +66,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.rawHtml).toContain(" { test.concurrent('should throw error for blocklisted URL on crawl', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const blocklistedUrl = "https://twitter.com/fake-test"; - await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); + await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. 
Firecrawl currently does not support social media scraping due to policy restrictions."); }); test.concurrent('should return successful response for crawl and wait for completion', async () => { @@ -145,14 +146,13 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.[0]).not.toHaveProperty("rawHtml"); expect(response.data?.[0]).not.toHaveProperty("screenshot"); expect(response.data?.[0]).not.toHaveProperty("links"); - expect(response.data?.[0]).toHaveProperty("metadata"); expect(response.data?.[0].metadata).toHaveProperty("title"); expect(response.data?.[0].metadata).toHaveProperty("description"); expect(response.data?.[0].metadata).toHaveProperty("language"); expect(response.data?.[0].metadata).toHaveProperty("sourceURL"); expect(response.data?.[0].metadata).toHaveProperty("statusCode"); - expect(response.data?.[0].metadata).toHaveProperty("error"); + expect(response.data?.[0].metadata).not.toHaveProperty("error"); }, 60000); // 60 seconds timeout test.concurrent('should return successful response for crawl with options and wait for completion', async () => { @@ -203,7 +203,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.[0].metadata).toHaveProperty("language"); expect(response.data?.[0].metadata).toHaveProperty("sourceURL"); expect(response.data?.[0].metadata).toHaveProperty("statusCode"); - expect(response.data?.[0].metadata).toHaveProperty("error"); + expect(response.data?.[0].metadata).not.toHaveProperty("error"); }, 60000); // 60 seconds timeout test.concurrent('should handle idempotency key for crawl', async () => { @@ -211,23 +211,23 @@ describe('FirecrawlApp E2E Tests', () => { const uniqueIdempotencyKey = uuidv4(); const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse; expect(response).not.toBeNull(); - expect(response.jobId).toBeDefined(); + expect(response.id).toBeDefined(); await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409"); }); test.concurrent('should check crawl status', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as CrawlStatusResponse; + const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse; expect(response).not.toBeNull(); - expect(response.jobId).toBeDefined(); + expect(response.id).toBeDefined(); - let statusResponse: any = await app.checkCrawlStatus(response.jobId); + let statusResponse: any = await app.checkCrawlStatus(response.id) as CrawlStatusResponse; const maxChecks = 15; let checks = 0; while (statusResponse.status === 'scraping' && checks < maxChecks) { - await new Promise(resolve => setTimeout(resolve, 1000)); + await new Promise(resolve => setTimeout(resolve, 5000)); expect(statusResponse).not.toHaveProperty("partial_data"); // v0 expect(statusResponse).not.toHaveProperty("current"); // v0 expect(statusResponse).toHaveProperty("data"); @@ -238,44 +238,70 @@ describe('FirecrawlApp E2E Tests', () => { expect(statusResponse).toHaveProperty("next"); expect(statusResponse.totalCount).toBeGreaterThan(0); expect(statusResponse.creditsUsed).toBeGreaterThan(0); - expect(statusResponse.expiresAt).toBeGreaterThan(Date.now()); + 
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now()); expect(statusResponse.status).toBe("scraping"); expect(statusResponse.next).toContain("/v1/crawl/"); - statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponse; + statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse; checks++; } + expect(statusResponse).not.toBeNull(); + expect(statusResponse).toHaveProperty("totalCount"); + expect(statusResponse.totalCount).toBeGreaterThan(0); + expect(statusResponse).toHaveProperty("creditsUsed"); + expect(statusResponse.creditsUsed).toBeGreaterThan(0); + expect(statusResponse).toHaveProperty("expiresAt"); + expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now()); + expect(statusResponse).toHaveProperty("status"); + expect(statusResponse.status).toBe("completed"); + expect(statusResponse.data?.length).toBeGreaterThan(0); + expect(statusResponse.data?.[0]).toHaveProperty("markdown"); + expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10); + expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0 + expect(statusResponse.data?.[0]).toHaveProperty("html"); + expect(statusResponse.data?.[0].html).toContain(" { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + }); + + test.concurrent('should throw error for blocklisted URL on map', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const blocklistedUrl = "https://facebook.com/fake-test"; + await expect(app.mapUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); + }); + + test.concurrent('should return successful response with valid preview token', async () => { + const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL }); + const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; expect(response).not.toBeNull(); - expect(response).toHaveProperty("totalCount"); - expect(response.totalCount).toBeGreaterThan(0); - expect(response).toHaveProperty("creditsUsed"); - expect(response.creditsUsed).toBeGreaterThan(0); - expect(response).toHaveProperty("expiresAt"); - expect(response.expiresAt).toBeGreaterThan(Date.now()); - expect(response).toHaveProperty("status"); - expect(response.status).toBe("completed"); - expect(response).toHaveProperty("next"); - expect(response.next).toContain("/v1/crawl/"); - expect(response.data?.length).toBeGreaterThan(0); - expect(response.data?.[0]).toHaveProperty("markdown"); - expect(response.data?.[0].markdown).toContain("_Roast_"); - expect(response.data?.[0]).not.toHaveProperty('content'); // v0 - expect(response.data?.[0].markdown).toContain("_Roast_"); - expect(response.data?.[0]).toHaveProperty("html"); - expect(response.data?.[0].html).toContain(" { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; + expect(response).not.toBeNull(); + + expect(response.links?.length).toBeGreaterThan(0); + expect(response.links?.[0]).toContain("https://"); + const filteredLinks = response.links?.filter((link: string) => link.includes("roastmywebsite.ai")); + expect(filteredLinks?.length).toBeGreaterThan(0); + }, 30000); // 30 seconds timeout }); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index c280206c..90c86a2a 100644 
--- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -214,7 +214,7 @@ export interface CrawlParamsV0 { * Defines the structure of the response received after initiating a crawl. */ export interface CrawlResponse { - jobId?: string; + id?: string; url?: string; success: boolean; error?: string; @@ -281,7 +281,7 @@ export interface MapParams { */ export interface MapResponse { success: boolean; - data?: string[]; + links?: string[]; error?: string; } @@ -458,36 +458,53 @@ export default class FirecrawlApp { headers ); if (response.status === 200) { - const jobId: string = this.version == 'v0' ? response.data.jobId : response.data.id; + const id: string = this.version == 'v0' ? response.data.jobId : response.data.id; let checkUrl: string | undefined = undefined; if (waitUntilDone) { if (this.version == 'v1') { checkUrl = response.data.url } - return this.monitorJobStatus(jobId, headers, pollInterval, checkUrl); + return this.monitorJobStatus(id, headers, pollInterval, checkUrl); } else { - return { success: true, jobId }; + if (this.version == 'v0') { + return { + success: true, + jobId: id + } as CrawlResponseV0; + } else { + return { + success: true, + id: id + } as CrawlResponse; + } } } else { this.handleError(response, "start crawl job"); } } catch (error: any) { - console.log(error); - throw new Error(error.message); + if (error.response.data.error) { + throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`); + } else { + throw new Error(error.message); + } } return { success: false, error: "Internal server error." }; } /** * Checks the status of a crawl job using the Firecrawl API. - * @param jobId - The job ID of the crawl operation. + * @param id - The ID of the crawl operation. * @returns The response containing the job status. */ - async checkCrawlStatus(jobId: string): Promise { + async checkCrawlStatus(id?: string): Promise { + if (!id) { + throw new Error("No crawl ID provided"); + } + const headers: AxiosRequestHeaders = this.prepareHeaders(); try { const response: AxiosResponse = await this.getRequest( this.version == 'v1' ? - this.apiUrl + `/${this.version}/crawl/${jobId}` : - this.apiUrl + `/${this.version}/crawl/status/${jobId}`, + this.apiUrl + `/${this.version}/crawl/${id}` : + this.apiUrl + `/${this.version}/crawl/status/${id}`, headers ); if (response.status === 200) { @@ -508,8 +525,12 @@ export default class FirecrawlApp { return { success: true, status: response.data.status, + totalCount: response.data.totalCount, + creditsUsed: response.data.creditsUsed, + expiresAt: new Date(response.data.expiresAt), + next: response.data.next, data: response.data.data, - error: response.data.error, + error: response.data.error } as CrawlStatusResponse; } } else { @@ -537,7 +558,7 @@ export default class FirecrawlApp { } } - async map(url: string, params?: MapParams): Promise { + async mapUrl(url: string, params?: MapParams): Promise { if (this.version == 'v0') { throw new Error("Map is not supported in v0"); } @@ -604,23 +625,23 @@ export default class FirecrawlApp { /** * Monitors the status of a crawl job until completion or failure. - * @param jobId - The job ID of the crawl operation. + * @param id - The ID of the crawl operation. * @param headers - The headers for the request. * @param checkInterval - Interval in seconds for job status checks. * @returns The final job status or data. 
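+   * When the v1 crawl endpoint returns a status URL, crawlUrl passes it in as checkUrl and it is polled directly.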
*/ async monitorJobStatus( - jobId: string, + id: string, headers: AxiosRequestHeaders, checkInterval: number, checkUrl?: string - ): Promise { + ): Promise { let apiUrl: string = ''; while (true) { if (this.version == 'v1') { - apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${jobId}`; + apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${id}`; } else if (this.version == 'v0') { - apiUrl = checkUrl ?? this.apiUrl + `/v0/crawl/status/${jobId}`; + apiUrl = checkUrl ?? this.apiUrl + `/v0/crawl/status/${id}`; } const statusResponse: AxiosResponse = await this.getRequest( apiUrl, From a4686e3c8c3e79507d9f8a68f2d66ec916337d5f Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 21 Aug 2024 15:56:48 -0300 Subject: [PATCH 12/16] fixing tests --- .../firecrawl/__tests__/e2e_withAuth/test.py | 6 +- apps/python-sdk/firecrawl/firecrawl.py | 80 +++++++++++++++---- 2 files changed, 70 insertions(+), 16 deletions(-) diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py index 457c206a..8945d74d 100644 --- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py @@ -7,7 +7,7 @@ from dotenv import load_dotenv load_dotenv() -API_URL = "http://127.0.0.1:3002"; +API_URL = "http://127.0.0.1:3002" ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py" TEST_API_KEY = os.getenv('TEST_API_KEY') @@ -46,6 +46,8 @@ def test_successful_response_with_valid_preview_token(): def test_scrape_url_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.scrape_url('https://roastmywebsite.ai') + print(response) + assert response is not None assert 'content' in response assert 'markdown' in response @@ -145,7 +147,7 @@ def test_search_invalid_api_key(): def test_llm_extraction(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') - response = app.scrape_url("https://mendable.ai", { + response = app.scrape_url("https://firecrawl.dev", { 'extractorOptions': { 'mode': 'llm-extraction', 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 25c9663e..f67afbdb 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -155,20 +155,30 @@ class FirecrawlApp: json_data.update(params) response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) if response.status_code == 200: - job_id = response.json().get('jobId') - if wait_until_done: - return self._monitor_job_status(job_id, headers, poll_interval) + if self.version == 'v0': + id = response.json().get('jobId') else: - return {'jobId': job_id} + id = response.json().get('id') + + if wait_until_done: + check_url = None + if self.version == 'v1': + check_url = response.json().get('url') + return self._monitor_job_status(id, headers, poll_interval, check_url) + else: + if self.version == 'v0': + return {'jobId': id} + else: + return {'id': id} else: self._handle_error(response, 'start crawl job') - def check_crawl_status(self, job_id: str) -> Any: + def check_crawl_status(self, id: str) -> Any: """ Check the status of a crawl job using the Firecrawl API. Args: - job_id (str): The ID of the crawl job. + id (str): The ID of the crawl job. Returns: Any: The status of the crawl job. 
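+
+        Example (a sketch; assumes the crawl was started with wait_until_done=False)::
+
+            started = app.crawl_url("https://firecrawl.dev", None, False)
+            status = app.check_crawl_status(started["id"])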
@@ -176,11 +186,38 @@ class FirecrawlApp:
 
         Raises:
             Exception: If the status check request fails.
         """
-        endpoint = f'/{self.version}/crawl/status/{job_id}'
+
+        if self.version == 'v0':
+            endpoint = f'/{self.version}/crawl/status/{id}'
+        else:
+            endpoint = f'/{self.version}/crawl/{id}'
+
         headers = self._prepare_headers()
         response = self._get_request(f'{self.api_url}{endpoint}', headers)
         if response.status_code == 200:
-            return response.json()
+            data = response.json()
+            if self.version == 'v0':
+                return {
+                    'success': True,
+                    'status': data.get('status'),
+                    'current': data.get('current'),
+                    'current_url': data.get('current_url'),
+                    'current_step': data.get('current_step'),
+                    'total': data.get('total'),
+                    'data': data.get('data'),
+                    'partial_data': data.get('partial_data') if not data.get('data') else None,
+                }
+            elif self.version == 'v1':
+                return {
+                    'success': True,
+                    'status': data.get('status'),
+                    'totalCount': data.get('totalCount'),
+                    'creditsUsed': data.get('creditsUsed'),
+                    'expiresAt': data.get('expiresAt'),
+                    'next': data.get('next'),
+                    'data': data.get('data'),
+                    'error': data.get('error')
+                }
         else:
             self._handle_error(response, 'check crawl status')
@@ -292,15 +329,15 @@ class FirecrawlApp:
             return response
         return response
 
-    def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
+    def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int, check_url: Optional[str] = None) -> Any:
        """
        Monitor the status of a crawl job until completion.

        Args:
-            job_id (str): The ID of the crawl job.
+            id (str): The ID of the crawl job.
            headers (Dict[str, str]): The headers to include in the status check requests.
            poll_interval (int): Seconds between status checks.
-
+            check_url (Optional[str]): The URL to check for the crawl job.
        Returns:
            Any: The crawl results if the job is completed successfully.

        Raises:
            Exception: If the job fails or an error occurs during status checks.
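
        Example (a minimal sketch, not part of this patch; assumes a reachable
        Firecrawl API, a valid key, and a placeholder URL):
            app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
            result = app.crawl_url('https://example.com', None, True, 2)
            # crawl_url() with wait_until_done=True drives this loop, polling
            # /v1/crawl/{id} (or /v0/crawl/status/{id} on v0) every
            # poll_interval seconds until the job completes or fails.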
""" while True: - status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) + api_url = '' + if (self.version == 'v0'): + if check_url: + api_url = check_url + else: + api_url = f'{self.api_url}/v0/crawl/status/{id}' + else: + if check_url: + api_url = check_url + else: + api_url = f'{self.api_url}/v1/crawl/{id}' + + status_response = self._get_request(api_url, headers) if status_response.status_code == 200: status_data = status_response.json() if status_data['status'] == 'completed': if 'data' in status_data: - return status_data['data'] + if self.version == 'v0': + return status_data['data'] + else: + return status_data else: raise Exception('Crawl job completed but no data was returned') - elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']: + elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']: poll_interval=max(poll_interval,2) time.sleep(poll_interval) # Wait for the specified interval before checking again else: From 0b37cbce4a7dd7a96b0be76abbec84482cdf586f Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 21 Aug 2024 15:58:51 -0300 Subject: [PATCH 13/16] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 91b7ef48..bcd1e3d1 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ apps/test-suite/load-test-results/test-run-report.json apps/playwright-service-ts/node_modules/ apps/playwright-service-ts/package-lock.json +*.pyc From ab88a75c70ceaa780530e6248e29b182e9d2da09 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:38:34 -0300 Subject: [PATCH 14/16] fixes sdks --- .../__tests__/v1/e2e_withAuth/index.test.ts | 103 +++--- apps/js-sdk/firecrawl/src/index.ts | 10 +- .../__tests__/v1/e2e_withAuth/test.py | 308 ++++++++++++++---- apps/python-sdk/firecrawl/firecrawl.py | 18 +- 4 files changed, 317 insertions(+), 122 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index 724996bc..81c870f5 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -30,24 +30,24 @@ describe('FirecrawlApp E2E Tests', () => { const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL }); const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse; expect(response).not.toBeNull(); - expect(response.data?.markdown).toContain("_Roast_"); + expect(response?.markdown).toContain("_Roast_"); }, 30000); // 30 seconds timeout test.concurrent('should return successful response for valid scrape', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse; expect(response).not.toBeNull(); - expect(response.data).not.toHaveProperty('content'); // v0 - expect(response.data).not.toHaveProperty('html'); - expect(response.data).not.toHaveProperty('rawHtml'); - expect(response.data).not.toHaveProperty('screenshot'); - expect(response.data).not.toHaveProperty('links'); + expect(response).not.toHaveProperty('content'); // v0 + expect(response).not.toHaveProperty('html'); + expect(response).not.toHaveProperty('rawHtml'); + 
expect(response).not.toHaveProperty('screenshot');
+    expect(response).not.toHaveProperty('links');
-    expect(response.data).toHaveProperty('markdown');
-    expect(response.data).toHaveProperty('metadata');
+    expect(response).toHaveProperty('markdown');
+    expect(response).toHaveProperty('metadata');
  }, 30000); // 30 seconds timeout
 
-  test.concurrent('should return successful response with valid API key and include HTML', async () => {
+  test.concurrent('should return successful response with valid API key and options', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const response = await app.scrapeUrl(
      'https://roastmywebsite.ai', {
@@ -60,58 +60,58 @@ describe('FirecrawlApp E2E Tests', () => {
      waitFor: 1000
    }) as ScrapeResponse;
    expect(response).not.toBeNull();
-    expect(response.data).not.toHaveProperty('content'); // v0
-    expect(response.data?.markdown).toContain("_Roast_");
-    expect(response.data?.html).toContain("<h1");
+    expect(response).not.toHaveProperty('content'); // v0
+    expect(response?.markdown).toContain("_Roast_");
+    expect(response?.html).toContain("<h1");
  }, 30000); // 30 seconds timeout
 
  test.concurrent('should return successful response for valid scrape with PDF file', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponse;
    expect(response).not.toBeNull();
-    expect(response.data?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+    expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
  }, 30000); // 30 seconds timeout
 
  test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
    const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponse;
    expect(response).not.toBeNull();
-    expect(response.data?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
+    expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
  }, 30000); // 30 seconds timeout
 
  test.concurrent('should throw error for invalid API key on crawl', async () => {
@@ -304,4 +304,9 @@ describe('FirecrawlApp E2E Tests', () => {
    const filteredLinks = response.links?.filter((link: string) => link.includes("roastmywebsite.ai"));
    expect(filteredLinks?.length).toBeGreaterThan(0);
  }, 30000); // 30 seconds timeout
+
+  test('should throw NotImplementedError for search on v1', async () => {
+    const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
+    await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1");
+  });
});
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 90c86a2a..90617de1 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -144,10 +144,9 @@ export interface ScrapeParamsV0 {
 * Response interface for scraping operations.
 * Defines the structure of the response received after a scraping operation.
 */
-export interface ScrapeResponse {
+export interface ScrapeResponse extends FirecrawlDocument {
  success: boolean;
  warning?: string;
-  data?: FirecrawlDocument;
  error?: string;
}

@@ -375,7 +374,12 @@ export default class FirecrawlApp {
      if (this.version == 'v0') {
        return responseData as ScrapeResponseV0;
      } else {
-        return responseData as ScrapeResponse;
+        return {
+          success: true,
+          warning: responseData.warning,
+          error: responseData.error,
+          ...responseData.data
+        } as ScrapeResponse;
      }
    } else {
      throw new Error(`Failed to scrape URL.
Error: ${responseData.error}`);
diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
index 517d8cf9..5fb2c674 100644
--- a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
+++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
@@ -4,6 +4,7 @@ import time
 import os
 from uuid import uuid4
 from dotenv import load_dotenv
+from datetime import datetime
 
 load_dotenv()
 
@@ -27,42 +28,92 @@ def test_scrape_url_invalid_api_key():
     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
     with pytest.raises(Exception) as excinfo:
         invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
+    assert "Unauthorized: Invalid token" in str(excinfo.value)
 
 def test_blocklisted_url():
     blocklisted_url = "https://facebook.com/fake-test"
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
     with pytest.raises(Exception) as excinfo:
         app.scrape_url(blocklisted_url)
-    assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
+    assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
 
 def test_successful_response_with_valid_preview_token():
     app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
     response = app.scrape_url('https://roastmywebsite.ai')
     assert response is not None
-    assert 'content' in response
-    assert "_Roast_" in response['content']
+    assert "_Roast_" in response['markdown']
+    assert "content" not in response
+    assert "html" not in response
+    assert "metadata" in response
+    assert "links" not in response
+    assert "rawHtml" not in response
 
-def test_scrape_url_e2e():
+def test_successful_response_for_valid_scrape():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
     response = app.scrape_url('https://roastmywebsite.ai')
     assert response is not None
-    assert 'content' not in response
     assert 'markdown' in response
-    assert 'metadata' in response
-    assert 'html' not in response
     assert "_Roast_" in response['markdown']
+    assert 'metadata' in response
+    assert 'content' not in response
+    assert 'html' not in response
+    assert 'rawHtml' not in response
+    assert 'screenshot' not in response
+    assert 'links' not in response
 
-def test_successful_response_with_valid_api_key_and_include_html():
+def test_successful_response_with_valid_api_key_and_options():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
-    response = app.scrape_url('https://roastmywebsite.ai', { 'formats': [ 'markdown', 'html' ]})
+    params = {
+        'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
+        'headers': {'x-key': 'test'},
+        'includeTags': ['h1'],
+        'excludeTags': ['h2'],
+        'onlyMainContent': True,
+        'timeout': 30000,
+        'waitFor': 1000
+    }
+    response = app.scrape_url('https://roastmywebsite.ai', params)
     assert response is not None
     assert 'content' not in response
     assert 'markdown' in response
     assert 'html' in response
-    assert 'metadata' in response
+    assert 'rawHtml' in response
+    assert 'screenshot' in response
+    assert 'links' in response
     assert "_Roast_" in response['markdown']
+    assert "<h1" in response['html']
+    assert "<h1" in response['rawHtml']
+    assert len(response['links']) > 0
+    assert "https://" in response['links'][0]
+    assert 'metadata' in response
+    assert 'title' in response['metadata']
+    assert 'description' in
response['metadata'] + assert 'keywords' in response['metadata'] + assert 'robots' in response['metadata'] + assert 'ogTitle' in response['metadata'] + assert 'ogDescription' in response['metadata'] + assert 'ogUrl' in response['metadata'] + assert 'ogImage' in response['metadata'] + assert 'ogLocaleAlternate' in response['metadata'] + assert 'ogSiteName' in response['metadata'] + assert 'sourceURL' in response['metadata'] + assert 'statusCode' in response['metadata'] + assert 'pageStatusCode' not in response['metadata'] + assert 'pageError' not in response['metadata'] + assert 'error' not in response['metadata'] + assert response['metadata']['title'] == "Roast My Website" + assert response['metadata']['description'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + assert response['metadata']['keywords'] == "Roast My Website,Roast,Website,GitHub,Firecrawl" + assert response['metadata']['robots'] == "follow, index" + assert response['metadata']['ogTitle'] == "Roast My Website" + assert response['metadata']['ogDescription'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + assert response['metadata']['ogUrl'] == "https://www.roastmywebsite.ai" + assert response['metadata']['ogImage'] == "https://www.roastmywebsite.ai/og.png" + assert response['metadata']['ogLocaleAlternate'] == [] + assert response['metadata']['ogSiteName'] == "Roast My Website" + assert response['metadata']['sourceURL'] == "https://roastmywebsite.ai" + assert response['metadata']['statusCode'] == 200 def test_successful_response_for_valid_scrape_with_pdf_file(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) @@ -70,65 +121,202 @@ def test_successful_response_for_valid_scrape_with_pdf_file(): assert response is not None assert 'content' not in response assert 'metadata' in response - assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content'] + assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown'] def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001') - time.sleep(6) # wait for 6 seconds + time.sleep(1) # wait for 1 second assert response is not None - assert 'content' not in response - assert 'metadata' in response - assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content'] + assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown'] def test_crawl_url_invalid_api_key(): invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") with pytest.raises(Exception) as excinfo: invalid_app.crawl_url('https://firecrawl.dev') - assert "Unexpected error during start crawl job: Status code 401. 
Unauthorized: Invalid token" in str(excinfo.value) + assert "Unauthorized: Invalid token" in str(excinfo.value) def test_should_return_error_for_blocklisted_url(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) blocklisted_url = "https://twitter.com/fake-test" with pytest.raises(Exception) as excinfo: app.crawl_url(blocklisted_url) - assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) + assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) def test_crawl_url_wait_for_completion_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) - response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True) + response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, True, 30) assert response is not None - assert len(response) > 0 - assert 'content' not in response[0] - assert 'markdown' in response[0] - assert "_Roast_" in response[0]['markdown'] + assert 'totalCount' in response + assert response['totalCount'] > 0 + assert 'creditsUsed' in response + assert response['creditsUsed'] > 0 + assert 'expiresAt' in response + assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() + assert 'status' in response + assert response['status'] == 'completed' + assert 'next' not in response + assert len(response['data']) > 0 + assert 'markdown' in response['data'][0] + assert "_Roast_" in response['data'][0]['markdown'] + assert 'content' not in response['data'][0] + assert 'html' not in response['data'][0] + assert 'rawHtml' not in response['data'][0] + assert 'screenshot' not in response['data'][0] + assert 'links' not in response['data'][0] + assert 'metadata' in response['data'][0] + assert 'title' in response['data'][0]['metadata'] + assert 'description' in response['data'][0]['metadata'] + assert 'language' in response['data'][0]['metadata'] + assert 'sourceURL' in response['data'][0]['metadata'] + assert 'statusCode' in response['data'][0]['metadata'] + assert 'error' not in response['data'][0]['metadata'] + +def test_crawl_url_with_options_and_wait_for_completion(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.crawl_url('https://roastmywebsite.ai', { + 'excludePaths': ['blog/*'], + 'includePaths': ['/'], + 'maxDepth': 2, + 'ignoreSitemap': True, + 'limit': 10, + 'allowBackwardLinks': True, + 'allowExternalLinks': True, + 'scrapeOptions': { + 'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], + 'headers': {"x-key": "test"}, + 'includeTags': ['h1'], + 'excludeTags': ['h2'], + 'onlyMainContent': True, + 'waitFor': 1000 + } + }, True, 30) + assert response is not None + assert 'totalCount' in response + assert response['totalCount'] > 0 + assert 'creditsUsed' in response + assert response['creditsUsed'] > 0 + assert 'expiresAt' in response + assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() + assert 'status' in response + assert response['status'] == 'completed' + assert 'next' not in response + assert len(response['data']) > 0 + assert 'markdown' in response['data'][0] + assert "_Roast_" in response['data'][0]['markdown'] + assert 'content' not in response['data'][0] + assert 'html' in response['data'][0] + assert " 0 + assert 'metadata' in 
response['data'][0]
+    assert 'title' in response['data'][0]['metadata']
+    assert 'description' in response['data'][0]['metadata']
+    assert 'language' in response['data'][0]['metadata']
+    assert 'sourceURL' in response['data'][0]['metadata']
+    assert 'statusCode' in response['data'][0]['metadata']
+    assert 'error' not in response['data'][0]['metadata']
 
 def test_crawl_url_with_idempotency_key_e2e():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
     uniqueIdempotencyKey = str(uuid4())
-    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
+    response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, False, 2, uniqueIdempotencyKey)
     assert response is not None
-    assert len(response) > 0
-    assert 'content' in response[0]
-    assert "_Roast_" in response[0]['content']
+    assert 'id' in response
 
     with pytest.raises(Exception) as excinfo:
-        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
+        app.crawl_url('https://firecrawl.dev', {'excludePaths': ['blog/*']}, True, 2, uniqueIdempotencyKey)
+    assert "Idempotency key already used" in str(excinfo.value)
 
 def test_check_crawl_status_e2e():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
-    response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
+    response = app.crawl_url('https://firecrawl.dev', {'scrapeOptions': {'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}}, False)
     assert response is not None
-    assert 'jobId' in response
+    assert 'id' in response
 
-    time.sleep(30) # wait for 30 seconds
-    status_response = app.check_crawl_status(response['jobId'])
+    max_checks = 15
+    checks = 0
+    status_response = app.check_crawl_status(response['id'])
+
+    while status_response['status'] == 'scraping' and checks < max_checks:
+        time.sleep(1) # wait for 1 second
+        assert 'partial_data' not in status_response
+        assert 'current' not in status_response
+        assert 'data' in status_response
+        assert 'totalCount' in status_response
+        assert 'creditsUsed' in status_response
+        assert 'expiresAt' in status_response
+        assert 'status' in status_response
+        assert 'next' in status_response
+        assert status_response['totalCount'] > 0
+        assert status_response['creditsUsed'] > 0
+        assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
+        assert status_response['status'] == 'scraping'
+        assert '/v1/crawl/' in status_response['next']
+        status_response = app.check_crawl_status(response['id'])
+        checks += 1
     assert status_response is not None
+    assert 'totalCount' in status_response
+    assert status_response['totalCount'] > 0
+    assert 'creditsUsed' in status_response
+    assert status_response['creditsUsed'] > 0
+    assert 'expiresAt' in status_response
+    assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
     assert 'status' in status_response
     assert status_response['status'] == 'completed'
-    assert 'data' in status_response
     assert len(status_response['data']) > 0
+    assert 'markdown' in status_response['data'][0]
+    assert len(status_response['data'][0]['markdown']) > 10
+    assert 'content' not in status_response['data'][0]
+    assert 'html' in status_response['data'][0]
+    assert "<h1" in status_response['data'][0]['html']
+    assert len(status_response['data'][0]['links']) > 0
+    assert 'metadata' in status_response['data'][0]
+    assert 'title' in
status_response['data'][0]['metadata'] + assert 'description' in status_response['data'][0]['metadata'] + assert 'language' in status_response['data'][0]['metadata'] + assert 'sourceURL' in status_response['data'][0]['metadata'] + assert 'statusCode' in status_response['data'][0]['metadata'] + assert 'error' not in status_response['data'][0]['metadata'] + +def test_invalid_api_key_on_map(): + invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL) + with pytest.raises(Exception) as excinfo: + invalid_app.map_url('https://roastmywebsite.ai') + assert "Unauthorized: Invalid token" in str(excinfo.value) + +def test_blocklisted_url_on_map(): + app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) + blocklisted_url = "https://facebook.com/fake-test" + with pytest.raises(Exception) as excinfo: + app.map_url(blocklisted_url) + assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) + +def test_successful_response_with_valid_preview_token_on_map(): + app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL) + response = app.map_url('https://roastmywebsite.ai') + assert response is not None + assert len(response) > 0 + +def test_successful_response_for_valid_map(): + app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) + response = app.map_url('https://roastmywebsite.ai') + assert response is not None + assert len(response) > 0 + assert any("https://" in link for link in response) + filtered_links = [link for link in response if "roastmywebsite.ai" in link] + assert len(filtered_links) > 0 def test_search_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) @@ -136,33 +324,29 @@ def test_search_e2e(): app.search("test query") assert "Search is not supported in v1" in str(excinfo.value) -def test_llm_extraction(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) - response = app.scrape_url("https://mendable.ai", { - 'extractorOptions': { - 'mode': 'llm-extraction', - 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", - 'extractionSchema': { - 'type': 'object', - 'properties': { - 'company_mission': {'type': 'string'}, - 'supports_sso': {'type': 'boolean'}, - 'is_open_source': {'type': 'boolean'} - }, - 'required': ['company_mission', 'supports_sso', 'is_open_source'] - } - } - }) - assert response is not None - assert 'llm_extraction' in response - llm_extraction = response['llm_extraction'] - assert 'company_mission' in llm_extraction - assert isinstance(llm_extraction['supports_sso'], bool) - assert isinstance(llm_extraction['is_open_source'], bool) +# def test_llm_extraction(): +# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) +# response = app.scrape_url("https://mendable.ai", { +# 'extractorOptions': { +# 'mode': 'llm-extraction', +# 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", +# 'extractionSchema': { +# 'type': 'object', +# 'properties': { +# 'company_mission': {'type': 'string'}, +# 'supports_sso': {'type': 'boolean'}, +# 'is_open_source': {'type': 'boolean'} +# }, +# 'required': ['company_mission', 'supports_sso', 'is_open_source'] +# } +# } +# }) +# assert response is not None +# assert 'llm_extraction' in response +# llm_extraction = response['llm_extraction'] +# assert 'company_mission' in llm_extraction +# assert 
isinstance(llm_extraction['supports_sso'], bool) +# assert isinstance(llm_extraction['is_open_source'], bool) + -def test_map_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") - response = app.map_url('https://roastmywebsite.ai') - assert response is not None - assert isinstance(response, list) \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index f67afbdb..4f71cc78 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -244,8 +244,9 @@ class FirecrawlApp: ) if response.status_code == 200: response = response.json() - if response['success'] and 'data' in response: - return response['data'] + print(response) + if response['success'] and 'links' in response: + return response['links'] else: raise Exception(f'Failed to map URL. Error: {response["error"]}') else: @@ -387,18 +388,19 @@ class FirecrawlApp: Raises: Exception: An exception with a message containing the status code and error details from the response. """ - error_message = response.json().get('error', 'No additional error details provided.') + error_message = response.json().get('error', 'No error message provided.') + error_details = response.json().get('details', 'No additional error details provided.') if response.status_code == 402: - message = f"Payment Required: Failed to {action}. {error_message}" + message = f"Payment Required: Failed to {action}. {error_message} - {error_details}" elif response.status_code == 408: - message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}" + message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}" elif response.status_code == 409: - message = f"Conflict: Failed to {action} due to a conflict. {error_message}" + message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}" elif response.status_code == 500: - message = f"Internal Server Error: Failed to {action}. {error_message}" + message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}" else: - message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}" + message = f"Unexpected error during {action}: Status code {response.status_code}. 
{error_message} - {error_details}")
 
         # Raise an HTTPError with the custom message and attach the response
         raise requests.exceptions.HTTPError(message, response=response)

From 7473b74021fa477563d6a231ceb5b44c18576a5e Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Thu, 22 Aug 2024 15:15:45 -0300
Subject: [PATCH 15/16] fix: html and rawHtmls for pdfs

---
 apps/api/src/controllers/v0/crawlPreview.ts   |  2 +-
 apps/api/src/controllers/v0/scrape.ts         | 10 ++++-
 apps/api/src/controllers/v0/search.ts         | 10 ++---
 apps/api/src/lib/default-values.ts            |  4 +-
 apps/api/src/scraper/WebScraper/index.ts      | 25 ++++++++---
 apps/api/src/scraper/WebScraper/single_url.ts | 43 ++++++++++++-------
 apps/api/src/services/queue-worker.ts         |  6 ++-
 7 files changed, 68 insertions(+), 32 deletions(-)

diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts
index 21a4a930..356da835 100644
--- a/apps/api/src/controllers/v0/crawlPreview.ts
+++ b/apps/api/src/controllers/v0/crawlPreview.ts
@@ -44,7 +44,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
   }
 
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: true, removeTags: [] };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
 
   // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
   //   try {
diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts
index 4e1b696d..20d29f26 100644
--- a/apps/api/src/controllers/v0/scrape.ts
+++ b/apps/api/src/controllers/v0/scrape.ts
@@ -74,7 +74,15 @@ export async function scrapeHelper(
 
   // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
   if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
-    delete doc.rawHtml;
+    if (doc.rawHtml) {
+      delete doc.rawHtml;
+    }
+  }
+
+  if (!pageOptions.includeHtml) {
+    if (doc.html) {
+      delete doc.html;
+    }
   }
 
   return {
diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts
index 34d415a5..79f6d74a 100644
--- a/apps/api/src/controllers/v0/search.ts
+++ b/apps/api/src/controllers/v0/search.ts
@@ -132,11 +132,11 @@ export async function searchController(req: Request, res: Response) {
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
   const pageOptions = req.body.pageOptions ?? {
-    includeHtml: true,
-    onlyMainContent: true,
-    fetchPageContent: true,
-    removeTags: [],
-    fallback: false,
+    includeHtml: req.body.pageOptions?.includeHtml ?? false,
+    onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
+    fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
+    removeTags: req.body.pageOptions?.removeTags ?? [],
+    fallback: req.body.pageOptions?.fallback ?? false,
   };
   const origin = req.body.origin ??
"api"; diff --git a/apps/api/src/lib/default-values.ts b/apps/api/src/lib/default-values.ts index cdf4605d..152f47d7 100644 --- a/apps/api/src/lib/default-values.ts +++ b/apps/api/src/lib/default-values.ts @@ -4,7 +4,7 @@ export const defaultTimeout = 45000; // 45 seconds export const defaultPageOptions = { onlyMainContent: false, - includeHtml: true, + includeHtml: false, waitFor: 0, screenshot: false, fullPageScreenshot: false, @@ -17,7 +17,7 @@ export const defaultCrawlerOptions = { export const defaultCrawlPageOptions = { onlyMainContent: false, - includeHtml: true, + includeHtml: false, removeTags: [], parsePDF: true } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 04b861b1..f56f378e 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -296,6 +296,12 @@ export class WebScraperDataProvider { if (this.pageOptions.includeMarkdown) { documents = this.applyPathReplacements(documents); } + + if (!this.pageOptions.includeHtml) { + for (let document of documents) { + delete document.html; + } + } // documents = await this.applyImgAltText(documents); if ( @@ -572,12 +578,19 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { - onlyMainContent: false, - includeHtml: true, - replaceAllPathsWithAbsolutePaths: false, - parsePDF: true, - removeTags: [], + this.pageOptions = { + onlyMainContent: options.pageOptions?.onlyMainContent ?? false, + includeHtml: options.pageOptions?.includeHtml ?? false, + replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false, + parsePDF: options.pageOptions?.parsePDF ?? true, + removeTags: options.pageOptions?.removeTags ?? [], + includeMarkdown: options.pageOptions?.includeMarkdown ?? true, + includeRawHtml: options.pageOptions?.includeRawHtml ?? false, + waitFor: options.pageOptions?.waitFor ?? undefined, + headers: options.pageOptions?.headers ?? undefined, + includeLinks: options.pageOptions?.includeLinks ?? true, + fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false, + screenshot: options.pageOptions?.screenshot ?? false, }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; this.replaceAllPathsWithAbsolutePaths = diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 9f8419b6..58e0185e 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -122,23 +122,36 @@ function getScrapingFallbackOrder( export async function scrapSingleUrl( jobId: string, urlToScrap: string, - pageOptions: PageOptions = { - includeMarkdown: true, - onlyMainContent: true, - includeHtml: true, - includeRawHtml: false, - waitFor: 0, - screenshot: false, - fullPageScreenshot: false, - headers: undefined, - includeLinks: true - }, - extractorOptions: ExtractorOptions = { - mode: "llm-extraction-from-markdown", - }, - existingHtml: string = "", + pageOptions: PageOptions, + extractorOptions?: ExtractorOptions, + existingHtml?: string, priority?: number, ): Promise { + pageOptions = { + includeMarkdown: pageOptions.includeMarkdown ?? true, + onlyMainContent: pageOptions.onlyMainContent ?? false, + includeHtml: pageOptions.includeHtml ?? false, + includeRawHtml: pageOptions.includeRawHtml ?? false, + waitFor: pageOptions.waitFor ?? 
undefined, + screenshot: pageOptions.screenshot ?? false, + fullPageScreenshot: pageOptions.fullPageScreenshot ?? false, + headers: pageOptions.headers ?? undefined, + includeLinks: pageOptions.includeLinks ?? true, + replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? false, + parsePDF: pageOptions.parsePDF ?? true, + removeTags: pageOptions.removeTags ?? [], + } + + if (extractorOptions) { + extractorOptions = { + mode: extractorOptions.mode ?? "llm-extraction-from-markdown", + } + } + + if (!existingHtml) { + existingHtml = ""; + } + urlToScrap = urlToScrap.trim(); const attemptScraping = async ( diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index afd80f42..80d53954 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -130,10 +130,12 @@ async function processJob(job: Job, token: string) { const end = Date.now(); const timeTakenInSeconds = (end - start) / 1000; - const rawHtml = docs[0].rawHtml; + const rawHtml = docs[0] ? docs[0].rawHtml : ""; if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) { - delete docs[0].rawHtml; + if (docs[0] && docs[0].rawHtml) { + delete docs[0].rawHtml; + } } const data = { From a37681bdff2e6bff8ac47a3b015c48afbcb28eec Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:16:46 -0300 Subject: [PATCH 16/16] fix: replace jest, removed map for v0 --- .../src/__tests__/e2e_withAuth/index.test.ts | 98 +------------------ 1 file changed, 3 insertions(+), 95 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 82ed5bfe..330f8130 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -538,7 +538,7 @@ describe("E2E Tests for v0 API Routes", () => { const urls = completedResponse.body.data.map( (item: any) => item.metadata?.sourceURL ); - expect(urls.length).toBeGreaterThan(1); + expect(urls.length).toBeGreaterThanOrEqual(1); // Check if all URLs have a maximum depth of 1 urls.forEach((url: string) => { @@ -762,11 +762,11 @@ describe("E2E Tests for v0 API Routes", () => { .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://jestjs.io" }); + .send({ url: "https://docs.tatum.io", crawlerOptions: { limit: 200 } }); expect(crawlResponse.statusCode).toBe(200); - await new Promise((r) => setTimeout(r, 20000)); + await new Promise((r) => setTimeout(r, 10000)); const responseCancel = await request(TEST_URL) .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) @@ -869,96 +869,4 @@ describe("E2E Tests for v0 API Routes", () => { 60000 ); // 60 secs }); - - describe("POST /v0/map", () => { - it.concurrent( - "should return a list of links for mendable.ai without subdomains included", - async () => { - const response = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - }); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - expect(response.body.links).not.toContain("https://docs.mendable.ai"); - expect(Array.isArray(response.body.links)).toBe(true); - 
expect(response.body.links.length).toBeGreaterThan(0); - }, - 60000 - ); // 60 secs - - it.concurrent( - "should return a list of links for a given URL with subdomains included", - async () => { - const response = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://python.langchain.com", - includeSubdomains: true, - }); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - expect(Array.isArray(response.body.links)).toBe(true); - expect(response.body.links.length).toBeGreaterThan(0); - }, - 60000 - ); // 60 secs - - it.concurrent( - "should return a list of links for a given URL with subdomains and search", - async () => { - const response = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://python.langchain.com", - includeSubdomains: true, - search: "agents", - }); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - expect(response.body.links).toContain( - "https://api.python.langchain.com/en/latest/_modules/langchain/agents/openai_functions_agent/base.html" - ); - expect(Array.isArray(response.body.links)).toBe(true); - expect(response.body.links.length).toBeGreaterThan(0); - response.body.links.forEach((link) => { - expect(link).toContain("python.langchain.com"); - }); - }, - 60000 - ); // 60 secs - - it.concurrent( - "should handle invalid URL input gracefully", - async () => { - const response = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "invalid-url", - includeSubdomains: true, - search: "agents", - }); - - expect(response.statusCode).toBe(400); - expect(response.body).toHaveProperty("success", false); - expect(response.body).toHaveProperty("details"); - }, - 60000 - ); // 60 secs - }); });
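
Taken together, these patches reshape the v1 SDK surface: scrape results are flattened onto the response object instead of nesting under `data`, crawl kickoffs return `id` rather than `jobId`, and map results come back under `links`. A minimal consumer-side sketch of the resulting JS API follows; it assumes the package name `@mendable/firecrawl-js`, a placeholder URL, and a placeholder API key, and is illustrative rather than part of the patch series:

    import FirecrawlApp, { ScrapeResponse, CrawlResponse, MapResponse } from "@mendable/firecrawl-js";

    const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

    async function demo() {
      // v1 scrape: document fields (markdown, metadata, ...) live on the response itself.
      const scrape = await app.scrapeUrl("https://example.com") as ScrapeResponse;
      console.log(scrape.markdown, scrape.metadata);

      // v1 crawl without waiting: only { success, id } comes back; poll via checkCrawlStatus.
      const crawl = await app.crawlUrl("https://example.com", undefined, false) as CrawlResponse;
      const status = await app.checkCrawlStatus(crawl.id);
      console.log(status);

      // v1 map: links are returned under `links`, not `data`.
      const map = await app.mapUrl("https://example.com") as MapResponse;
      console.log(map.links?.length);
    }

    demo();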