From d599d31e638cbfdc2bda6adcc72f00f7c29df773 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 6 Aug 2024 17:33:39 -0300 Subject: [PATCH 01/16] wip --- .../src/__tests__/e2e_withAuth/index.test.ts | 58 ++++----- .../__tests__/v1/e2e_withAuth/index.test.ts | 122 ++++++++++++++++++ 2 files changed, 151 insertions(+), 29 deletions(-) create mode 100644 apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts diff --git a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts index ad917de4..91dfb9e1 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts @@ -1,4 +1,4 @@ -import FirecrawlApp from '../../index'; +import FirecrawlApp, { CrawlResponseV0, FirecrawlDocumentV0, JobStatusResponseV0, ScrapeResponseV0, SearchResponseV0 } from '../../index'; import { v4 as uuidv4 } from 'uuid'; import dotenv from 'dotenv'; import { describe, test, expect } from '@jest/globals'; @@ -11,31 +11,31 @@ const API_URL = "http://127.0.0.1:3002"; describe('FirecrawlApp E2E Tests', () => { test.concurrent('should throw error for no API key', async () => { expect(() => { - new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); + new FirecrawlApp({ apiKey: null, apiUrl: API_URL, version: "v0" }); }).toThrow("No API key provided"); }); test.concurrent('should throw error for invalid API key on scrape', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" }); await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); }); test.concurrent('should throw error for blocklisted URL on scrape', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" }); const blocklistedUrl = "https://facebook.com/fake-test"; await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); }); test.concurrent('should return successful response with valid preview token', async () => { - const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL }); - const response = await app.scrapeUrl('https://roastmywebsite.ai'); + const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL, version: "v0" }); + const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponseV0; expect(response).not.toBeNull(); expect(response.data?.content).toContain("_Roast_"); }, 30000); // 30 seconds timeout test.concurrent('should return successful response for valid scrape', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.scrapeUrl('https://roastmywebsite.ai'); + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" }); + const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponseV0; expect(response).not.toBeNull(); expect(response.data?.content).toContain("_Roast_"); expect(response.data).toHaveProperty('markdown'); @@ -44,8 +44,8 @@ describe('FirecrawlApp E2E Tests', () => { }, 30000); // 30 seconds timeout test.concurrent('should return successful response with valid API key and include 
HTML', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } }); + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" }); + const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } }) as ScrapeResponseV0; expect(response).not.toBeNull(); expect(response.data?.content).toContain("_Roast_"); expect(response.data?.markdown).toContain("_Roast_"); @@ -53,41 +53,41 @@ describe('FirecrawlApp E2E Tests', () => { }, 30000); // 30 seconds timeout test.concurrent('should return successful response for valid scrape with PDF file', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf'); + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" }); + const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponseV0; expect(response).not.toBeNull(); expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); }, 30000); // 30 seconds timeout test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001'); + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" }); + const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponseV0; expect(response).not.toBeNull(); expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); }, 30000); // 30 seconds timeout test.concurrent('should throw error for invalid API key on crawl', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" }); await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); }); test.concurrent('should throw error for blocklisted URL on crawl', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" }); const blocklistedUrl = "https://twitter.com/fake-test"; await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); }); test.concurrent('should return successful response for crawl and wait for completion', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30); + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" }); + const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30) as CrawlResponseV0; expect(response).not.toBeNull(); expect(response[0].content).toContain("_Roast_"); }, 60000); // 60 seconds timeout test.concurrent('should handle idempotency key for crawl', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); 
+ const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" }); const uniqueIdempotencyKey = uuidv4(); - const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey); + const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey) as CrawlResponseV0; expect(response).not.toBeNull(); expect(response.jobId).toBeDefined(); @@ -95,12 +95,12 @@ describe('FirecrawlApp E2E Tests', () => { }); test.concurrent('should check crawl status', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false); + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" }); + const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as JobStatusResponseV0; expect(response).not.toBeNull(); expect(response.jobId).toBeDefined(); - let statusResponse = await app.checkCrawlStatus(response.jobId); + let statusResponse: any = await app.checkCrawlStatus(response.jobId); const maxChecks = 15; let checks = 0; @@ -108,7 +108,7 @@ describe('FirecrawlApp E2E Tests', () => { await new Promise(resolve => setTimeout(resolve, 1000)); expect(statusResponse.partial_data).not.toBeNull(); expect(statusResponse.current).toBeGreaterThanOrEqual(1); - statusResponse = await app.checkCrawlStatus(response.jobId); + statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponseV0; checks++; } @@ -121,20 +121,20 @@ describe('FirecrawlApp E2E Tests', () => { }, 35000); // 35 seconds timeout test.concurrent('should return successful response for search', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.search("test query"); + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" }); + const response = await app.search("test query") as SearchResponseV0; expect(response).not.toBeNull(); expect(response?.data?.[0]?.content).toBeDefined(); expect(response?.data?.length).toBeGreaterThan(2); }, 30000); // 30 seconds timeout test.concurrent('should throw error for invalid API key on search', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" }); await expect(invalidApp.search("test query")).rejects.toThrow("Request failed with status code 401"); }); test.concurrent('should perform LLM extraction', async () => { - const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" }); const response = await app.scrapeUrl("https://mendable.ai", { extractorOptions: { mode: 'llm-extraction', @@ -149,7 +149,7 @@ describe('FirecrawlApp E2E Tests', () => { required: ['company_mission', 'supports_sso', 'is_open_source'] } } - }); + }) as ScrapeResponseV0; expect(response).not.toBeNull(); expect(response.data?.llm_extraction).toBeDefined(); const llmExtraction = response.data?.llm_extraction; diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts new file mode 100644 index 00000000..9042d02e --- 
/dev/null +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -0,0 +1,122 @@ +import FirecrawlApp, { CrawlResponse, JobStatusResponse, ScrapeResponse } from '../../../index'; +import { v4 as uuidv4 } from 'uuid'; +import dotenv from 'dotenv'; +import { describe, test, expect } from '@jest/globals'; + +dotenv.config(); + +const TEST_API_KEY = process.env.TEST_API_KEY; +const API_URL = "http://127.0.0.1:3002"; + +describe('FirecrawlApp E2E Tests', () => { + test.concurrent('should throw error for no API key', async () => { + expect(() => { + new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); + }).toThrow("No API key provided"); + }); + + test.concurrent('should throw error for invalid API key on scrape', async () => { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + }); + + test.concurrent('should throw error for blocklisted URL on scrape', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const blocklistedUrl = "https://facebook.com/fake-test"; + await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); + }); + + test.concurrent('should return successful response with valid preview token', async () => { + const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL }); + const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse; + expect(response).not.toBeNull(); + expect(response.data?.content).toContain("_Roast_"); + }, 30000); // 30 seconds timeout + + test.concurrent('should return successful response for valid scrape', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse; + expect(response).not.toBeNull(); + expect(response.data?.content).toContain("_Roast_"); + expect(response.data).toHaveProperty('markdown'); + expect(response.data).toHaveProperty('metadata'); + expect(response.data).not.toHaveProperty('html'); + }, 30000); // 30 seconds timeout + + test.concurrent('should return successful response with valid API key and include HTML', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } }) as ScrapeResponse; + expect(response).not.toBeNull(); + expect(response.data?.content).toContain("_Roast_"); + expect(response.data?.markdown).toContain("_Roast_"); + expect(response.data?.html).toContain("
+    assert len(response) > 0
+    assert 'content' not in response[0]
+    assert 'markdown' in response[0]
+    assert "_Roast_" in response[0]['markdown']
+
+def test_crawl_url_with_idempotency_key_e2e():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    uniqueIdempotencyKey = str(uuid4())
+    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
+    assert response is not None
+    assert len(response) > 0
+    assert 'content' in response[0]
+    assert "_Roast_" in response[0]['content']
+
+    with pytest.raises(Exception) as excinfo:
+        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
+    assert "Conflict: Failed to start crawl job due to a conflict. 
Idempotency key already used" in str(excinfo.value) + +def test_check_crawl_status_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False) + assert response is not None + assert 'jobId' in response + + time.sleep(30) # wait for 30 seconds + status_response = app.check_crawl_status(response['jobId']) + assert status_response is not None + assert 'status' in status_response + assert status_response['status'] == 'completed' + assert 'data' in status_response + assert len(status_response['data']) > 0 + +def test_search_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + with pytest.raises(NotImplementedError) as excinfo: + app.search("test query") + assert "Search is not supported in v1" in str(excinfo.value) + +def test_llm_extraction(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.scrape_url("https://mendable.ai", { + 'extractorOptions': { + 'mode': 'llm-extraction', + 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + 'extractionSchema': { + 'type': 'object', + 'properties': { + 'company_mission': {'type': 'string'}, + 'supports_sso': {'type': 'boolean'}, + 'is_open_source': {'type': 'boolean'} + }, + 'required': ['company_mission', 'supports_sso', 'is_open_source'] + } + } + }) + assert response is not None + assert 'llm_extraction' in response + llm_extraction = response['llm_extraction'] + assert 'company_mission' in llm_extraction + assert isinstance(llm_extraction['supports_sso'], bool) + assert isinstance(llm_extraction['is_open_source'], bool) + +def test_map_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") + response = app.map_url('https://roastmywebsite.ai') + assert response is not None + assert isinstance(response, list) + \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 7ec0d33f..25c9663e 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -19,24 +19,22 @@ import requests logger : logging.Logger = logging.getLogger("firecrawl") class FirecrawlApp: - """ - Initialize the FirecrawlApp instance. + def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, version: str = 'v1') -> None: + """ + Initialize the FirecrawlApp instance with API key, API URL, and version. - Args: - api_key (Optional[str]): API key for authenticating with the Firecrawl API. - api_url (Optional[str]): Base URL for the Firecrawl API. - """ - def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: - self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') - if self.api_key is None: - logger.warning("No API key provided") - raise ValueError('No API key provided') - else: - logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key) - - self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') - if self.api_url != 'https://api.firecrawl.dev': - logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url) + Args: + api_key (Optional[str]): API key for authenticating with the Firecrawl API. + api_url (Optional[str]): Base URL for the Firecrawl API. + version (str): API version, either 'v0' or 'v1'. 
+ """ + self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + self.version = version + if self.api_key is None: + logger.warning("No API key provided") + raise ValueError('No API key provided') + logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key} and version: {self.version}") def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: """ @@ -75,9 +73,11 @@ class FirecrawlApp: for key, value in params.items(): if key != 'extractorOptions': scrape_params[key] = value + + endpoint = f'/{self.version}/scrape' # Make the POST request with the prepared headers and JSON data response = requests.post( - f'{self.api_url}/v0/scrape', + f'{self.api_url}{endpoint}', headers=headers, json=scrape_params, ) @@ -104,6 +104,9 @@ class FirecrawlApp: Raises: Exception: If the search request fails. """ + if self.version == 'v1': + raise NotImplementedError("Search is not supported in v1") + headers = self._prepare_headers() json_data = {'query': query} if params: @@ -145,11 +148,12 @@ class FirecrawlApp: Raises: Exception: If the crawl job initiation or monitoring fails. """ + endpoint = f'/{self.version}/crawl' headers = self._prepare_headers(idempotency_key) json_data = {'url': url} if params: json_data.update(params) - response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers) + response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) if response.status_code == 200: job_id = response.json().get('jobId') if wait_until_done: @@ -172,13 +176,44 @@ class FirecrawlApp: Raises: Exception: If the status check request fails. """ + endpoint = f'/{self.version}/crawl/status/{job_id}' headers = self._prepare_headers() - response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) + response = self._get_request(f'{self.api_url}{endpoint}', headers) if response.status_code == 200: return response.json() else: self._handle_error(response, 'check crawl status') + def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: + """ + Perform a map search using the Firecrawl API. + """ + if self.version == 'v0': + raise NotImplementedError("Map is not supported in v0") + + endpoint = f'/{self.version}/map' + headers = self._prepare_headers() + + # Prepare the base scrape parameters with the URL + json_data = {'url': url} + if params: + json_data.update(params) + + # Make the POST request with the prepared headers and JSON data + response = requests.post( + f'{self.api_url}{endpoint}', + headers=headers, + json=json_data, + ) + if response.status_code == 200: + response = response.json() + if response['success'] and 'data' in response: + return response['data'] + else: + raise Exception(f'Failed to map URL. Error: {response["error"]}') + else: + self._handle_error(response, 'map') + def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: """ Prepare the headers for API requests. 
From 32aba4416737da7f7e9b2c88e82887cf6c0e7f94 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 19 Aug 2024 13:37:20 -0300 Subject: [PATCH 06/16] fixing merge issues --- .../src/__tests__/v1/e2e_withAuth/index.test.ts | 15 ++++++++------- apps/js-sdk/firecrawl/src/index.ts | 10 +++++++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index cf6181fe..b6f6b5e9 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -1,4 +1,4 @@ -import FirecrawlApp, { CrawlResponse, JobStatusResponse, ScrapeResponse } from '../../../index'; +import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, ScrapeParams, ScrapeResponse } from '../../../index'; import { v4 as uuidv4 } from 'uuid'; import dotenv from 'dotenv'; import { describe, test, expect } from '@jest/globals'; @@ -80,8 +80,10 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.metadata).toHaveProperty("ogLocaleAlternate"); expect(response.data?.metadata).toHaveProperty("ogSiteName"); expect(response.data?.metadata).toHaveProperty("sourceURL"); - expect(response.data?.metadata).toHaveProperty("pageStatusCode"); - expect(response.data?.metadata.pageError).toBeUndefined(); + expect(response.data?.metadata).not.toHaveProperty("pageStatusCode"); + expect(response.data?.metadata).toHaveProperty("statusCode"); + expect(response.data?.metadata).not.toHaveProperty("pageError"); + expect(response.data?.metadata.error).toBeUndefined(); expect(response.data?.metadata.title).toBe("Roast My Website"); expect(response.data?.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 
🌶️"); expect(response.data?.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl"); @@ -123,9 +125,8 @@ describe('FirecrawlApp E2E Tests', () => { test.concurrent('should return successful response for crawl and wait for completion', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.crawlUrl('https://roastmywebsite.ai', {}, true, 30) as JobStatusResponse; + const response = await app.crawlUrl('https://roastmywebsite.ai', {}, true, 30) as CrawlStatusResponse; expect(response).not.toBeNull(); - expect(response).toHaveProperty("totalCount"); expect(response.totalCount).toBeGreaterThan(0); expect(response).toHaveProperty("creditsUsed"); @@ -176,7 +177,7 @@ describe('FirecrawlApp E2E Tests', () => { timeout: 30000, waitFor: 1000 } - }, true, 30) as JobStatusResponse; + } as CrawlParams, true, 30) as CrawlStatusResponse; expect(response).not.toBeNull(); expect(response).toHaveProperty("totalCount"); expect(response.totalCount).toBeGreaterThan(0); @@ -223,7 +224,7 @@ describe('FirecrawlApp E2E Tests', () => { test.concurrent('should check crawl status', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as JobStatusResponse; + const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as CrawlStatusResponse; expect(response).not.toBeNull(); expect(response.jobId).toBeDefined(); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index cbec3644..a534fff8 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -167,7 +167,15 @@ export interface ScrapeResponseV0 { */ export interface CrawlParams { scrapeOptions?: ScrapeParams; - crawlerOptions?: MapParams; + crawlerOptions?: { + includePaths?: string[] + excludePaths?: string[] + maxDepth?: number + limit?: number + allowBackwardLinks?: boolean + allowExternalLinks?: boolean + ignoreSitemap?: boolean + }; } /** From e160d5529a238a800c5109706fb13cdf5fe3208e Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 20 Aug 2024 09:22:38 -0300 Subject: [PATCH 07/16] fixed test --- .../firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index b6f6b5e9..d911b335 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -95,7 +95,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.metadata.ogLocaleAlternate).toStrictEqual([]); expect(response.data?.metadata.ogSiteName).toBe("Roast My Website"); expect(response.data?.metadata.sourceURL).toBe("https://roastmywebsite.ai"); - expect(response.data?.metadata.pageStatusCode).toBe(200); + expect(response.data?.metadata.statusCode).toBe(200); }, 30000); // 30 seconds timeout test.concurrent('should return successful response for valid scrape with PDF file', async () => { From fa89d2e535dc3c3b57d06474dbb82efec0f3de9e Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:37:24 -0300 Subject: [PATCH 08/16] v1 
support for crawl/monitor status --- .../__tests__/v1/e2e_withAuth/index.test.ts | 14 +++++-------- apps/js-sdk/firecrawl/src/index.ts | 21 +++++++++++++------ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index d911b335..ba0bf9a6 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -132,16 +132,14 @@ describe('FirecrawlApp E2E Tests', () => { expect(response).toHaveProperty("creditsUsed"); expect(response.creditsUsed).toBeGreaterThan(0); expect(response).toHaveProperty("expiresAt"); - expect(response.expiresAt).toBeGreaterThan(Date.now()); + expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now()); expect(response).toHaveProperty("status"); expect(response.status).toBe("completed"); - expect(response).toHaveProperty("next"); - expect(response.next).toBeDefined(); + expect(response).not.toHaveProperty("next"); // wait until done expect(response.data?.length).toBeGreaterThan(0); expect(response.data?.[0]).toHaveProperty("markdown"); expect(response.data?.[0].markdown).toContain("_Roast_"); expect(response.data?.[0]).not.toHaveProperty('content'); // v0 - expect(response.data?.[0].markdown).toContain("_Roast_"); expect(response.data?.[0]).not.toHaveProperty("html"); expect(response.data?.[0]).not.toHaveProperty("rawHtml"); expect(response.data?.[0]).not.toHaveProperty("screenshot"); @@ -156,7 +154,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.[0].metadata).toHaveProperty("error"); }, 60000); // 60 seconds timeout - test.concurrent('should return successful response for crawl and wait for completion', async () => { + test.concurrent('should return successful response for crawl with options and wait for completion', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { @@ -184,16 +182,14 @@ describe('FirecrawlApp E2E Tests', () => { expect(response).toHaveProperty("creditsUsed"); expect(response.creditsUsed).toBeGreaterThan(0); expect(response).toHaveProperty("expiresAt"); - expect(response.expiresAt).toBeGreaterThan(Date.now()); + expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now()); expect(response).toHaveProperty("status"); expect(response.status).toBe("completed"); - expect(response).toHaveProperty("next"); - expect(response.next).toContain("/v1/crawl/"); + expect(response).not.toHaveProperty("next"); expect(response.data?.length).toBeGreaterThan(0); expect(response.data?.[0]).toHaveProperty("markdown"); expect(response.data?.[0].markdown).toContain("_Roast_"); expect(response.data?.[0]).not.toHaveProperty('content'); // v0 - expect(response.data?.[0].markdown).toContain("_Roast_"); expect(response.data?.[0]).toHaveProperty("html"); expect(response.data?.[0].html).toContain(" { + let apiUrl: string = ''; while (true) { + if (this.version == 'v1') { + apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${jobId}`; + } else if (this.version == 'v0') { + apiUrl = checkUrl ?? 
this.apiUrl + `/v0/crawl/status/${jobId}`; + } const statusResponse: AxiosResponse = await this.getRequest( - this.apiUrl + `/v0/crawl/status/${jobId}`, + apiUrl, headers ); if (statusResponse.status === 200) { const statusData = statusResponse.data; if (statusData.status === "completed") { if ("data" in statusData) { - return statusData.data; + return this.version == 'v0' ? statusData.data : statusData; } else { throw new Error("Crawl job completed but no data was returned"); } } else if ( - ["active", "paused", "pending", "queued"].includes(statusData.status) + ["active", "paused", "pending", "queued", "scraping"].includes(statusData.status) ) { if (checkInterval < 2) { checkInterval = 2; From 537fc689b66ccc5a26695febbceaffbb0dc7cab3 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:08:02 -0300 Subject: [PATCH 09/16] fixing request --- .../__tests__/v1/e2e_withAuth/index.test.ts | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index ba0bf9a6..a5060b6e 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -157,22 +157,19 @@ describe('FirecrawlApp E2E Tests', () => { test.concurrent('should return successful response for crawl with options and wait for completion', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.crawlUrl('https://roastmywebsite.ai', { - crawlerOptions: { - excludePaths: ['blog/*'], - includePaths: ['/'], - maxDepth: 2, - ignoreSitemap: true, - limit: 10, - allowBackwardLinks: true, - allowExternalLinks: true, - }, - pageOptions: { + excludePaths: ['blog/*'], + includePaths: ['/'], + maxDepth: 2, + ignoreSitemap: true, + limit: 10, + allowBackwardLinks: true, + allowExternalLinks: true, + scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], headers: { "x-key": "test" }, includeTags: ['h1'], excludeTags: ['h2'], onlyMainContent: true, - timeout: 30000, waitFor: 1000 } } as CrawlParams, true, 30) as CrawlStatusResponse; From 9d64c8eedcfd8d291b5213d69bc6a6c5e8e68a6a Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:24:45 -0300 Subject: [PATCH 10/16] screenshot should not be undefined also --- .../js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index a5060b6e..b0623b8d 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -64,6 +64,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.markdown).toContain("_Roast_"); expect(response.data?.html).toContain("
Date: Tue, 20 Aug 2024 20:00:41 -0300 Subject: [PATCH 11/16] tests passing now --- .../src/scraper/WebScraper/utils/metadata.ts | 4 +- .../__tests__/v1/e2e_withAuth/index.test.ts | 114 +++++++++++------- apps/js-sdk/firecrawl/src/index.ts | 57 ++++++--- 3 files changed, 110 insertions(+), 65 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index 9496d569..fac53b38 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { description = soup('meta[name="description"]').attr("content") || null; // Assuming the language is part of the URL as per the regex pattern - const pattern = /([a-zA-Z]+-[A-Z]{2})/; - const match = pattern.exec(url); - language = match ? match[1] : null; + language = soup('html').attr('lang') || null; keywords = soup('meta[name="keywords"]').attr("content") || null; robots = soup('meta[name="robots"]').attr("content") || null; diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index b0623b8d..724996bc 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -1,4 +1,4 @@ -import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, ScrapeParams, ScrapeResponse } from '../../../index'; +import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index'; import { v4 as uuidv4 } from 'uuid'; import dotenv from 'dotenv'; import { describe, test, expect } from '@jest/globals'; @@ -66,6 +66,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.rawHtml).toContain("
{ test.concurrent('should throw error for blocklisted URL on crawl', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const blocklistedUrl = "https://twitter.com/fake-test"; - await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); + await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."); }); test.concurrent('should return successful response for crawl and wait for completion', async () => { @@ -145,14 +146,13 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.[0]).not.toHaveProperty("rawHtml"); expect(response.data?.[0]).not.toHaveProperty("screenshot"); expect(response.data?.[0]).not.toHaveProperty("links"); - expect(response.data?.[0]).toHaveProperty("metadata"); expect(response.data?.[0].metadata).toHaveProperty("title"); expect(response.data?.[0].metadata).toHaveProperty("description"); expect(response.data?.[0].metadata).toHaveProperty("language"); expect(response.data?.[0].metadata).toHaveProperty("sourceURL"); expect(response.data?.[0].metadata).toHaveProperty("statusCode"); - expect(response.data?.[0].metadata).toHaveProperty("error"); + expect(response.data?.[0].metadata).not.toHaveProperty("error"); }, 60000); // 60 seconds timeout test.concurrent('should return successful response for crawl with options and wait for completion', async () => { @@ -203,7 +203,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(response.data?.[0].metadata).toHaveProperty("language"); expect(response.data?.[0].metadata).toHaveProperty("sourceURL"); expect(response.data?.[0].metadata).toHaveProperty("statusCode"); - expect(response.data?.[0].metadata).toHaveProperty("error"); + expect(response.data?.[0].metadata).not.toHaveProperty("error"); }, 60000); // 60 seconds timeout test.concurrent('should handle idempotency key for crawl', async () => { @@ -211,23 +211,23 @@ describe('FirecrawlApp E2E Tests', () => { const uniqueIdempotencyKey = uuidv4(); const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse; expect(response).not.toBeNull(); - expect(response.jobId).toBeDefined(); + expect(response.id).toBeDefined(); await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409"); }); test.concurrent('should check crawl status', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as CrawlStatusResponse; + const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse; expect(response).not.toBeNull(); - expect(response.jobId).toBeDefined(); + expect(response.id).toBeDefined(); - let statusResponse: any = await app.checkCrawlStatus(response.jobId); + let statusResponse: any = await app.checkCrawlStatus(response.id) as CrawlStatusResponse; const maxChecks = 15; let checks = 0; while (statusResponse.status === 'scraping' && checks < maxChecks) { - await new Promise(resolve => setTimeout(resolve, 1000)); + await new Promise(resolve => setTimeout(resolve, 5000)); expect(statusResponse).not.toHaveProperty("partial_data"); // v0 expect(statusResponse).not.toHaveProperty("current"); // v0 
expect(statusResponse).toHaveProperty("data"); @@ -238,44 +238,70 @@ describe('FirecrawlApp E2E Tests', () => { expect(statusResponse).toHaveProperty("next"); expect(statusResponse.totalCount).toBeGreaterThan(0); expect(statusResponse.creditsUsed).toBeGreaterThan(0); - expect(statusResponse.expiresAt).toBeGreaterThan(Date.now()); + expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now()); expect(statusResponse.status).toBe("scraping"); expect(statusResponse.next).toContain("/v1/crawl/"); - statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponse; + statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse; checks++; } + expect(statusResponse).not.toBeNull(); + expect(statusResponse).toHaveProperty("totalCount"); + expect(statusResponse.totalCount).toBeGreaterThan(0); + expect(statusResponse).toHaveProperty("creditsUsed"); + expect(statusResponse.creditsUsed).toBeGreaterThan(0); + expect(statusResponse).toHaveProperty("expiresAt"); + expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now()); + expect(statusResponse).toHaveProperty("status"); + expect(statusResponse.status).toBe("completed"); + expect(statusResponse.data?.length).toBeGreaterThan(0); + expect(statusResponse.data?.[0]).toHaveProperty("markdown"); + expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10); + expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0 + expect(statusResponse.data?.[0]).toHaveProperty("html"); + expect(statusResponse.data?.[0].html).toContain("
{ + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + }); + + test.concurrent('should throw error for blocklisted URL on map', async () => { + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const blocklistedUrl = "https://facebook.com/fake-test"; + await expect(app.mapUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); + }); + + test.concurrent('should return successful response with valid preview token', async () => { + const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL }); + const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; expect(response).not.toBeNull(); - expect(response).toHaveProperty("totalCount"); - expect(response.totalCount).toBeGreaterThan(0); - expect(response).toHaveProperty("creditsUsed"); - expect(response.creditsUsed).toBeGreaterThan(0); - expect(response).toHaveProperty("expiresAt"); - expect(response.expiresAt).toBeGreaterThan(Date.now()); - expect(response).toHaveProperty("status"); - expect(response.status).toBe("completed"); - expect(response).toHaveProperty("next"); - expect(response.next).toContain("/v1/crawl/"); - expect(response.data?.length).toBeGreaterThan(0); - expect(response.data?.[0]).toHaveProperty("markdown"); - expect(response.data?.[0].markdown).toContain("_Roast_"); - expect(response.data?.[0]).not.toHaveProperty('content'); // v0 - expect(response.data?.[0].markdown).toContain("_Roast_"); - expect(response.data?.[0]).toHaveProperty("html"); - expect(response.data?.[0].html).toContain("{ + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); + const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; + expect(response).not.toBeNull(); + + expect(response.links?.length).toBeGreaterThan(0); + expect(response.links?.[0]).toContain("https://"); + const filteredLinks = response.links?.filter((link: string) => link.includes("roastmywebsite.ai")); + expect(filteredLinks?.length).toBeGreaterThan(0); + }, 30000); // 30 seconds timeout }); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index c280206c..90c86a2a 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -214,7 +214,7 @@ export interface CrawlParamsV0 { * Defines the structure of the response received after initiating a crawl. */ export interface CrawlResponse { - jobId?: string; + id?: string; url?: string; success: boolean; error?: string; @@ -281,7 +281,7 @@ export interface MapParams { */ export interface MapResponse { success: boolean; - data?: string[]; + links?: string[]; error?: string; } @@ -458,36 +458,53 @@ export default class FirecrawlApp { headers ); if (response.status === 200) { - const jobId: string = this.version == 'v0' ? response.data.jobId : response.data.id; + const id: string = this.version == 'v0' ? 
response.data.jobId : response.data.id; let checkUrl: string | undefined = undefined; if (waitUntilDone) { if (this.version == 'v1') { checkUrl = response.data.url } - return this.monitorJobStatus(jobId, headers, pollInterval, checkUrl); + return this.monitorJobStatus(id, headers, pollInterval, checkUrl); } else { - return { success: true, jobId }; + if (this.version == 'v0') { + return { + success: true, + jobId: id + } as CrawlResponseV0; + } else { + return { + success: true, + id: id + } as CrawlResponse; + } } } else { this.handleError(response, "start crawl job"); } } catch (error: any) { - console.log(error); - throw new Error(error.message); + if (error.response.data.error) { + throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`); + } else { + throw new Error(error.message); + } } return { success: false, error: "Internal server error." }; } /** * Checks the status of a crawl job using the Firecrawl API. - * @param jobId - The job ID of the crawl operation. + * @param id - The ID of the crawl operation. * @returns The response containing the job status. */ - async checkCrawlStatus(jobId: string): Promise
{ + async checkCrawlStatus(id?: string): Promise { + if (!id) { + throw new Error("No crawl ID provided"); + } + const headers: AxiosRequestHeaders = this.prepareHeaders(); try { const response: AxiosResponse = await this.getRequest( this.version == 'v1' ? - this.apiUrl + `/${this.version}/crawl/${jobId}` : - this.apiUrl + `/${this.version}/crawl/status/${jobId}`, + this.apiUrl + `/${this.version}/crawl/${id}` : + this.apiUrl + `/${this.version}/crawl/status/${id}`, headers ); if (response.status === 200) { @@ -508,8 +525,12 @@ export default class FirecrawlApp { return { success: true, status: response.data.status, + totalCount: response.data.totalCount, + creditsUsed: response.data.creditsUsed, + expiresAt: new Date(response.data.expiresAt), + next: response.data.next, data: response.data.data, - error: response.data.error, + error: response.data.error } as CrawlStatusResponse; } } else { @@ -537,7 +558,7 @@ export default class FirecrawlApp { } } - async map(url: string, params?: MapParams): Promise { + async mapUrl(url: string, params?: MapParams): Promise { if (this.version == 'v0') { throw new Error("Map is not supported in v0"); } @@ -604,23 +625,23 @@ export default class FirecrawlApp { /** * Monitors the status of a crawl job until completion or failure. - * @param jobId - The job ID of the crawl operation. + * @param id - The ID of the crawl operation. * @param headers - The headers for the request. * @param checkInterval - Interval in seconds for job status checks. * @returns The final job status or data. */ async monitorJobStatus( - jobId: string, + id: string, headers: AxiosRequestHeaders, checkInterval: number, checkUrl?: string - ): Promise { + ): Promise { let apiUrl: string = ''; while (true) { if (this.version == 'v1') { - apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${jobId}`; + apiUrl = checkUrl ?? this.apiUrl + `/v1/crawl/${id}`; } else if (this.version == 'v0') { - apiUrl = checkUrl ?? this.apiUrl + `/v0/crawl/status/${jobId}`; + apiUrl = checkUrl ?? 
this.apiUrl + `/v0/crawl/status/${id}`; } const statusResponse: AxiosResponse = await this.getRequest( apiUrl, From a4686e3c8c3e79507d9f8a68f2d66ec916337d5f Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 21 Aug 2024 15:56:48 -0300 Subject: [PATCH 12/16] fixing tests --- .../firecrawl/__tests__/e2e_withAuth/test.py | 6 +- apps/python-sdk/firecrawl/firecrawl.py | 80 +++++++++++++++---- 2 files changed, 70 insertions(+), 16 deletions(-) diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py index 457c206a..8945d74d 100644 --- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py @@ -7,7 +7,7 @@ from dotenv import load_dotenv load_dotenv() -API_URL = "http://127.0.0.1:3002"; +API_URL = "http://127.0.0.1:3002" ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py" TEST_API_KEY = os.getenv('TEST_API_KEY') @@ -46,6 +46,8 @@ def test_successful_response_with_valid_preview_token(): def test_scrape_url_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.scrape_url('https://roastmywebsite.ai') + print(response) + assert response is not None assert 'content' in response assert 'markdown' in response @@ -145,7 +147,7 @@ def test_search_invalid_api_key(): def test_llm_extraction(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') - response = app.scrape_url("https://mendable.ai", { + response = app.scrape_url("https://firecrawl.dev", { 'extractorOptions': { 'mode': 'llm-extraction', 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 25c9663e..f67afbdb 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -155,20 +155,30 @@ class FirecrawlApp: json_data.update(params) response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) if response.status_code == 200: - job_id = response.json().get('jobId') - if wait_until_done: - return self._monitor_job_status(job_id, headers, poll_interval) + if self.version == 'v0': + id = response.json().get('jobId') else: - return {'jobId': job_id} + id = response.json().get('id') + + if wait_until_done: + check_url = None + if self.version == 'v1': + check_url = response.json().get('url') + return self._monitor_job_status(id, headers, poll_interval, check_url) + else: + if self.version == 'v0': + return {'jobId': id} + else: + return {'id': id} else: self._handle_error(response, 'start crawl job') - def check_crawl_status(self, job_id: str) -> Any: + def check_crawl_status(self, id: str) -> Any: """ Check the status of a crawl job using the Firecrawl API. Args: - job_id (str): The ID of the crawl job. + id (str): The ID of the crawl job. Returns: Any: The status of the crawl job. @@ -176,11 +186,38 @@ class FirecrawlApp: Raises: Exception: If the status check request fails. 
""" - endpoint = f'/{self.version}/crawl/status/{job_id}' + + if self.version == 'v0': + endpoint = f'/{self.version}/crawl/status/{id}' + else: + endpoint = f'/{self.version}/crawl/{id}' + headers = self._prepare_headers() response = self._get_request(f'{self.api_url}{endpoint}', headers) if response.status_code == 200: - return response.json() + data = response.json() + if self.version == 'v0': + return { + 'success': True, + 'status': data.get('status'), + 'current': data.get('current'), + 'current_url': data.get('current_url'), + 'current_step': data.get('current_step'), + 'total': data.get('total'), + 'data': data.get('data'), + 'partial_data': data.get('partial_data') if not data.get('data') else None, + } + elif self.version == 'v1': + return { + 'success': True, + 'status': data.get('status'), + 'totalCount': data.get('totalCount'), + 'creditsUsed': data.get('creditsUsed'), + 'expiresAt': data.get('expiresAt'), + 'next': data.get('next'), + 'data': data.get('data'), + 'error': data.get('error') + } else: self._handle_error(response, 'check crawl status') @@ -292,15 +329,15 @@ class FirecrawlApp: return response return response - def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any: + def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int, check_url: Optional[str] = None) -> Any: """ Monitor the status of a crawl job until completion. Args: - job_id (str): The ID of the crawl job. + id (str): The ID of the crawl job. headers (Dict[str, str]): The headers to include in the status check requests. poll_interval (int): Secounds between status checks. - + check_url (Optional[str]): The URL to check for the crawl job. Returns: Any: The crawl results if the job is completed successfully. @@ -308,15 +345,30 @@ class FirecrawlApp: Exception: If the job fails or an error occurs during status checks. 
""" while True: - status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) + api_url = '' + if (self.version == 'v0'): + if check_url: + api_url = check_url + else: + api_url = f'{self.api_url}/v0/crawl/status/{id}' + else: + if check_url: + api_url = check_url + else: + api_url = f'{self.api_url}/v1/crawl/{id}' + + status_response = self._get_request(api_url, headers) if status_response.status_code == 200: status_data = status_response.json() if status_data['status'] == 'completed': if 'data' in status_data: - return status_data['data'] + if self.version == 'v0': + return status_data['data'] + else: + return status_data else: raise Exception('Crawl job completed but no data was returned') - elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']: + elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']: poll_interval=max(poll_interval,2) time.sleep(poll_interval) # Wait for the specified interval before checking again else: From 0b37cbce4a7dd7a96b0be76abbec84482cdf586f Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 21 Aug 2024 15:58:51 -0300 Subject: [PATCH 13/16] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 91b7ef48..bcd1e3d1 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ apps/test-suite/load-test-results/test-run-report.json apps/playwright-service-ts/node_modules/ apps/playwright-service-ts/package-lock.json +*.pyc From ab88a75c70ceaa780530e6248e29b182e9d2da09 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:38:34 -0300 Subject: [PATCH 14/16] fixes sdks --- .../__tests__/v1/e2e_withAuth/index.test.ts | 103 +++--- apps/js-sdk/firecrawl/src/index.ts | 10 +- .../__tests__/v1/e2e_withAuth/test.py | 308 ++++++++++++++---- apps/python-sdk/firecrawl/firecrawl.py | 18 +- 4 files changed, 317 insertions(+), 122 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index 724996bc..81c870f5 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -30,24 +30,24 @@ describe('FirecrawlApp E2E Tests', () => { const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL }); const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse; expect(response).not.toBeNull(); - expect(response.data?.markdown).toContain("_Roast_"); + expect(response?.markdown).toContain("_Roast_"); }, 30000); // 30 seconds timeout test.concurrent('should return successful response for valid scrape', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse; expect(response).not.toBeNull(); - expect(response.data).not.toHaveProperty('content'); // v0 - expect(response.data).not.toHaveProperty('html'); - expect(response.data).not.toHaveProperty('rawHtml'); - expect(response.data).not.toHaveProperty('screenshot'); - expect(response.data).not.toHaveProperty('links'); + expect(response).not.toHaveProperty('content'); // v0 + expect(response).not.toHaveProperty('html'); + expect(response).not.toHaveProperty('rawHtml'); + 
expect(response).not.toHaveProperty('screenshot'); + expect(response).not.toHaveProperty('links'); - expect(response.data).toHaveProperty('markdown'); - expect(response.data).toHaveProperty('metadata'); + expect(response).toHaveProperty('markdown'); + expect(response).toHaveProperty('metadata'); }, 30000); // 30 seconds timeout - test.concurrent('should return successful response with valid API key and include HTML', async () => { + test.concurrent('should return successful response with valid API key and options', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.scrapeUrl( 'https://roastmywebsite.ai', { @@ -60,58 +60,58 @@ describe('FirecrawlApp E2E Tests', () => { waitFor: 1000 }) as ScrapeResponse; expect(response).not.toBeNull(); - expect(response.data).not.toHaveProperty('content'); // v0 - expect(response.data?.markdown).toContain("_Roast_"); - expect(response.data?.html).toContain(" { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponse; expect(response).not.toBeNull(); - expect(response.data?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); }, 30000); // 30 seconds timeout test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponse; expect(response).not.toBeNull(); - expect(response.data?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); + expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); }, 30000); // 30 seconds timeout test.concurrent('should throw error for invalid API key on crawl', async () => { @@ -304,4 +304,9 @@ describe('FirecrawlApp E2E Tests', () => { const filteredLinks = response.links?.filter((link: string) => link.includes("roastmywebsite.ai")); expect(filteredLinks?.length).toBeGreaterThan(0); }, 30000); // 30 seconds timeout + + test('should throw NotImplementedError for search on v1', async () => { + const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY }); + await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1"); + }); }); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 90c86a2a..90617de1 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -144,10 +144,9 @@ export interface ScrapeParamsV0 { * Response interface for scraping operations. * Defines the structure of the response received after a scraping operation. */ -export interface ScrapeResponse { +export interface ScrapeResponse extends FirecrawlDocument { success: boolean; warning?: string; - data?: FirecrawlDocument; error?: string; } @@ -375,7 +374,12 @@ export default class FirecrawlApp { if (this.version == 'v0') { return responseData as ScrapeResponseV0; } else { - return responseData as ScrapeResponse; + return { + success: true, + warning: responseData.warning, + error: responseData.error, + ...responseData.data + } as ScrapeResponse; } } else { throw new Error(`Failed to scrape URL. 
Error: ${responseData.error}`); diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py index 517d8cf9..5fb2c674 100644 --- a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py @@ -4,6 +4,7 @@ import time import os from uuid import uuid4 from dotenv import load_dotenv +from datetime import datetime load_dotenv() @@ -27,42 +28,92 @@ def test_scrape_url_invalid_api_key(): invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") with pytest.raises(Exception) as excinfo: invalid_app.scrape_url('https://firecrawl.dev') - assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) + assert "Unauthorized: Invalid token" in str(excinfo.value) def test_blocklisted_url(): blocklisted_url = "https://facebook.com/fake-test" app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) with pytest.raises(Exception) as excinfo: app.scrape_url(blocklisted_url) - assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) + assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) def test_successful_response_with_valid_preview_token(): app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") response = app.scrape_url('https://roastmywebsite.ai') assert response is not None - assert 'content' in response - assert "_Roast_" in response['content'] + assert "_Roast_" in response['markdown'] + assert "content" not in response + assert "html" not in response + assert "metadata" in response + assert "links" not in response + assert "rawHtml" not in response -def test_scrape_url_e2e(): +def test_successful_response_for_valid_scrape(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) response = app.scrape_url('https://roastmywebsite.ai') assert response is not None - assert 'content' not in response assert 'markdown' in response - assert 'metadata' in response - assert 'html' not in response assert "_Roast_" in response['markdown'] + assert 'metadata' in response + assert 'content' not in response + assert 'html' not in response + assert 'rawHtml' not in response + assert 'screenshot' not in response + assert 'links' not in response -def test_successful_response_with_valid_api_key_and_include_html(): +def test_successful_response_with_valid_api_key_and_options(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) - response = app.scrape_url('https://roastmywebsite.ai', { 'formats': [ 'markdown', 'html' ]}) + params = { + 'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], + 'headers': {'x-key': 'test'}, + 'includeTags': ['h1'], + 'excludeTags': ['h2'], + 'onlyMainContent': True, + 'timeout': 30000, + 'waitFor': 1000 + } + response = app.scrape_url('https://roastmywebsite.ai', params) assert response is not None assert 'content' not in response assert 'markdown' in response assert 'html' in response - assert 'metadata' in response + assert 'rawHtml' in response + assert 'screenshot' in response + assert 'links' in response assert "_Roast_" in response['markdown'] assert "
+    assert "<h1" in response['html']
+    assert "<h1" in response['rawHtml']
+    assert "https://" in response['screenshot']
+    assert len(response['links']) > 0
+    assert "https://" in response['links'][0]
+    assert 'metadata' in response
+    assert 'title' in response['metadata']
+    assert 'description' in response['metadata']
+    assert 'keywords' in response['metadata']
+    assert 'robots' in response['metadata']
+    assert 'ogTitle' in response['metadata']
+    assert 'ogDescription' in response['metadata']
+    assert 'ogUrl' in response['metadata']
+    assert 'ogImage' in response['metadata']
+    assert 'ogLocaleAlternate' in response['metadata']
+    assert 'ogSiteName' in response['metadata']
+    assert 'sourceURL' in response['metadata']
+    assert 'statusCode' in response['metadata']
+    assert 'pageStatusCode' not in response['metadata']
+    assert 'pageError' not in response['metadata']
+    assert 'error' not in response['metadata']
+    assert response['metadata']['title'] == "Roast My Website"
+    assert response['metadata']['description'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
+    assert response['metadata']['keywords'] == "Roast My Website,Roast,Website,GitHub,Firecrawl"
+    assert response['metadata']['robots'] == "follow, index"
+    assert response['metadata']['ogTitle'] == "Roast My Website"
+    assert response['metadata']['ogDescription'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
+    assert response['metadata']['ogUrl'] == "https://www.roastmywebsite.ai"
+    assert response['metadata']['ogImage'] == "https://www.roastmywebsite.ai/og.png"
+    assert response['metadata']['ogLocaleAlternate'] == []
+    assert response['metadata']['ogSiteName'] == "Roast My Website"
+    assert response['metadata']['sourceURL'] == "https://roastmywebsite.ai"
+    assert response['metadata']['statusCode'] == 200
 
 def test_successful_response_for_valid_scrape_with_pdf_file():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -70,65 +121,202 @@ def test_successful_response_for_valid_scrape_with_pdf_file():
     assert response is not None
     assert 'content' not in response
     assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
+    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
 
 def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
     response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
-    time.sleep(6) # wait for 6 seconds
+    time.sleep(1) # wait for 1 second
     assert response is not None
-    assert 'content' not in response
-    assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
+    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
 
 def test_crawl_url_invalid_api_key():
     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
     with pytest.raises(Exception) as excinfo:
         invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unexpected error during start crawl job: Status code 401.
Unauthorized: Invalid token" in str(excinfo.value) + assert "Unauthorized: Invalid token" in str(excinfo.value) def test_should_return_error_for_blocklisted_url(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) blocklisted_url = "https://twitter.com/fake-test" with pytest.raises(Exception) as excinfo: app.crawl_url(blocklisted_url) - assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value) + assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) def test_crawl_url_wait_for_completion_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) - response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True) + response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, True, 30) assert response is not None - assert len(response) > 0 - assert 'content' not in response[0] - assert 'markdown' in response[0] - assert "_Roast_" in response[0]['markdown'] + assert 'totalCount' in response + assert response['totalCount'] > 0 + assert 'creditsUsed' in response + assert response['creditsUsed'] > 0 + assert 'expiresAt' in response + assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() + assert 'status' in response + assert response['status'] == 'completed' + assert 'next' not in response + assert len(response['data']) > 0 + assert 'markdown' in response['data'][0] + assert "_Roast_" in response['data'][0]['markdown'] + assert 'content' not in response['data'][0] + assert 'html' not in response['data'][0] + assert 'rawHtml' not in response['data'][0] + assert 'screenshot' not in response['data'][0] + assert 'links' not in response['data'][0] + assert 'metadata' in response['data'][0] + assert 'title' in response['data'][0]['metadata'] + assert 'description' in response['data'][0]['metadata'] + assert 'language' in response['data'][0]['metadata'] + assert 'sourceURL' in response['data'][0]['metadata'] + assert 'statusCode' in response['data'][0]['metadata'] + assert 'error' not in response['data'][0]['metadata'] + +def test_crawl_url_with_options_and_wait_for_completion(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.crawl_url('https://roastmywebsite.ai', { + 'excludePaths': ['blog/*'], + 'includePaths': ['/'], + 'maxDepth': 2, + 'ignoreSitemap': True, + 'limit': 10, + 'allowBackwardLinks': True, + 'allowExternalLinks': True, + 'scrapeOptions': { + 'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], + 'headers': {"x-key": "test"}, + 'includeTags': ['h1'], + 'excludeTags': ['h2'], + 'onlyMainContent': True, + 'waitFor': 1000 + } + }, True, 30) + assert response is not None + assert 'totalCount' in response + assert response['totalCount'] > 0 + assert 'creditsUsed' in response + assert response['creditsUsed'] > 0 + assert 'expiresAt' in response + assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() + assert 'status' in response + assert response['status'] == 'completed' + assert 'next' not in response + assert len(response['data']) > 0 + assert 'markdown' in response['data'][0] + assert "_Roast_" in response['data'][0]['markdown'] + assert 'content' not in response['data'][0] + assert 'html' in response['data'][0] + assert "
0 + assert 'metadata' in response['data'][0] + assert 'title' in response['data'][0]['metadata'] + assert 'description' in response['data'][0]['metadata'] + assert 'language' in response['data'][0]['metadata'] + assert 'sourceURL' in response['data'][0]['metadata'] + assert 'statusCode' in response['data'][0]['metadata'] + assert 'error' not in response['data'][0]['metadata'] def test_crawl_url_with_idempotency_key_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) uniqueIdempotencyKey = str(uuid4()) - response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey) + response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, False, 2, uniqueIdempotencyKey) assert response is not None - assert len(response) > 0 - assert 'content' in response[0] - assert "_Roast_" in response[0]['content'] + assert 'id' in response with pytest.raises(Exception) as excinfo: - app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey) - assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value) + app.crawl_url('https://firecrawl.dev', {'excludePaths': ['blog/*']}, True, 2, uniqueIdempotencyKey) + assert "Idempotency key already used" in str(excinfo.value) def test_check_crawl_status_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) - response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False) + response = app.crawl_url('https://firecrawl.dev', {'scrapeOptions': {'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}}, False) assert response is not None - assert 'jobId' in response + assert 'id' in response - time.sleep(30) # wait for 30 seconds - status_response = app.check_crawl_status(response['jobId']) + max_checks = 15 + checks = 0 + status_response = app.check_crawl_status(response['id']) + + while status_response['status'] == 'scraping' and checks < max_checks: + time.sleep(1) # wait for 1 second + assert 'partial_data' not in status_response + assert 'current' not in status_response + assert 'data' in status_response + assert 'totalCount' in status_response + assert 'creditsUsed' in status_response + assert 'expiresAt' in status_response + assert 'status' in status_response + assert 'next' in status_response + assert status_response['totalCount'] > 0 + assert status_response['creditsUsed'] > 0 + assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() + assert status_response['status'] == 'scraping' + assert '/v1/crawl/' in status_response['next'] + status_response = app.check_crawl_status(response['id']) + checks += 1 + assert status_response is not None + assert 'totalCount' in status_response + assert status_response['totalCount'] > 0 + assert 'creditsUsed' in status_response + assert status_response['creditsUsed'] > 0 + assert 'expiresAt' in status_response + assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now() assert 'status' in status_response assert status_response['status'] == 'completed' - assert 'data' in status_response assert len(status_response['data']) > 0 + assert 'markdown' in status_response['data'][0] + assert len(status_response['data'][0]['markdown']) > 10 + assert 'content' not in status_response['data'][0] + assert 'html' in status_response['data'][0] + assert "
0 + assert 'metadata' in status_response['data'][0] + assert 'title' in status_response['data'][0]['metadata'] + assert 'description' in status_response['data'][0]['metadata'] + assert 'language' in status_response['data'][0]['metadata'] + assert 'sourceURL' in status_response['data'][0]['metadata'] + assert 'statusCode' in status_response['data'][0]['metadata'] + assert 'error' not in status_response['data'][0]['metadata'] + +def test_invalid_api_key_on_map(): + invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL) + with pytest.raises(Exception) as excinfo: + invalid_app.map_url('https://roastmywebsite.ai') + assert "Unauthorized: Invalid token" in str(excinfo.value) + +def test_blocklisted_url_on_map(): + app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) + blocklisted_url = "https://facebook.com/fake-test" + with pytest.raises(Exception) as excinfo: + app.map_url(blocklisted_url) + assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value) + +def test_successful_response_with_valid_preview_token_on_map(): + app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL) + response = app.map_url('https://roastmywebsite.ai') + assert response is not None + assert len(response) > 0 + +def test_successful_response_for_valid_map(): + app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL) + response = app.map_url('https://roastmywebsite.ai') + assert response is not None + assert len(response) > 0 + assert any("https://" in link for link in response) + filtered_links = [link for link in response if "roastmywebsite.ai" in link] + assert len(filtered_links) > 0 def test_search_e2e(): app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) @@ -136,33 +324,29 @@ def test_search_e2e(): app.search("test query") assert "Search is not supported in v1" in str(excinfo.value) -def test_llm_extraction(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) - response = app.scrape_url("https://mendable.ai", { - 'extractorOptions': { - 'mode': 'llm-extraction', - 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", - 'extractionSchema': { - 'type': 'object', - 'properties': { - 'company_mission': {'type': 'string'}, - 'supports_sso': {'type': 'boolean'}, - 'is_open_source': {'type': 'boolean'} - }, - 'required': ['company_mission', 'supports_sso', 'is_open_source'] - } - } - }) - assert response is not None - assert 'llm_extraction' in response - llm_extraction = response['llm_extraction'] - assert 'company_mission' in llm_extraction - assert isinstance(llm_extraction['supports_sso'], bool) - assert isinstance(llm_extraction['is_open_source'], bool) +# def test_llm_extraction(): +# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) +# response = app.scrape_url("https://mendable.ai", { +# 'extractorOptions': { +# 'mode': 'llm-extraction', +# 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", +# 'extractionSchema': { +# 'type': 'object', +# 'properties': { +# 'company_mission': {'type': 'string'}, +# 'supports_sso': {'type': 'boolean'}, +# 'is_open_source': {'type': 'boolean'} +# }, +# 'required': ['company_mission', 'supports_sso', 'is_open_source'] +# } +# } +# }) +# assert response is not None +# assert 'llm_extraction' in response +# llm_extraction = 
response['llm_extraction'] +# assert 'company_mission' in llm_extraction +# assert isinstance(llm_extraction['supports_sso'], bool) +# assert isinstance(llm_extraction['is_open_source'], bool) + -def test_map_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") - response = app.map_url('https://roastmywebsite.ai') - assert response is not None - assert isinstance(response, list) \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index f67afbdb..4f71cc78 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -244,8 +244,9 @@ class FirecrawlApp: ) if response.status_code == 200: response = response.json() - if response['success'] and 'data' in response: - return response['data'] + print(response) + if response['success'] and 'links' in response: + return response['links'] else: raise Exception(f'Failed to map URL. Error: {response["error"]}') else: @@ -387,18 +388,19 @@ class FirecrawlApp: Raises: Exception: An exception with a message containing the status code and error details from the response. """ - error_message = response.json().get('error', 'No additional error details provided.') + error_message = response.json().get('error', 'No error message provided.') + error_details = response.json().get('details', 'No additional error details provided.') if response.status_code == 402: - message = f"Payment Required: Failed to {action}. {error_message}" + message = f"Payment Required: Failed to {action}. {error_message} - {error_details}" elif response.status_code == 408: - message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}" + message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}" elif response.status_code == 409: - message = f"Conflict: Failed to {action} due to a conflict. {error_message}" + message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}" elif response.status_code == 500: - message = f"Internal Server Error: Failed to {action}. {error_message}" + message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}" else: - message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}" + message = f"Unexpected error during {action}: Status code {response.status_code}. 
{error_message} - {error_details}" # Raise an HTTPError with the custom message and attach the response raise requests.exceptions.HTTPError(message, response=response) From 7473b74021fa477563d6a231ceb5b44c18576a5e Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:15:45 -0300 Subject: [PATCH 15/16] fix: html and rawlhtmls for pdfs --- apps/api/src/controllers/v0/crawlPreview.ts | 2 +- apps/api/src/controllers/v0/scrape.ts | 10 ++++- apps/api/src/controllers/v0/search.ts | 10 ++--- apps/api/src/lib/default-values.ts | 4 +- apps/api/src/scraper/WebScraper/index.ts | 25 ++++++++--- apps/api/src/scraper/WebScraper/single_url.ts | 43 ++++++++++++------- apps/api/src/services/queue-worker.ts | 6 ++- 7 files changed, 68 insertions(+), 32 deletions(-) diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index 21a4a930..356da835 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -44,7 +44,7 @@ export async function crawlPreviewController(req: Request, res: Response) { } const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: true, removeTags: [] }; + const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] }; // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? // try { diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 4e1b696d..20d29f26 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -74,7 +74,15 @@ export async function scrapeHelper( // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") { - delete doc.rawHtml; + if (doc.rawHtml) { + delete doc.rawHtml; + } + } + + if (!pageOptions.includeHtml) { + if (doc.html) { + delete doc.html; + } } return { diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index 34d415a5..79f6d74a 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -132,11 +132,11 @@ export async function searchController(req: Request, res: Response) { } const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { - includeHtml: true, - onlyMainContent: true, - fetchPageContent: true, - removeTags: [], - fallback: false, + includeHtml: req.body.pageOptions?.includeHtml ?? false, + onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false, + fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true, + removeTags: req.body.pageOptions?.removeTags ?? [], + fallback: req.body.pageOptions?.fallback ?? false, }; const origin = req.body.origin ?? 
"api"; diff --git a/apps/api/src/lib/default-values.ts b/apps/api/src/lib/default-values.ts index cdf4605d..152f47d7 100644 --- a/apps/api/src/lib/default-values.ts +++ b/apps/api/src/lib/default-values.ts @@ -4,7 +4,7 @@ export const defaultTimeout = 45000; // 45 seconds export const defaultPageOptions = { onlyMainContent: false, - includeHtml: true, + includeHtml: false, waitFor: 0, screenshot: false, fullPageScreenshot: false, @@ -17,7 +17,7 @@ export const defaultCrawlerOptions = { export const defaultCrawlPageOptions = { onlyMainContent: false, - includeHtml: true, + includeHtml: false, removeTags: [], parsePDF: true } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 04b861b1..f56f378e 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -296,6 +296,12 @@ export class WebScraperDataProvider { if (this.pageOptions.includeMarkdown) { documents = this.applyPathReplacements(documents); } + + if (!this.pageOptions.includeHtml) { + for (let document of documents) { + delete document.html; + } + } // documents = await this.applyImgAltText(documents); if ( @@ -572,12 +578,19 @@ export class WebScraperDataProvider { this.limit = options.crawlerOptions?.limit ?? 10000; this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = options.pageOptions ?? { - onlyMainContent: false, - includeHtml: true, - replaceAllPathsWithAbsolutePaths: false, - parsePDF: true, - removeTags: [], + this.pageOptions = { + onlyMainContent: options.pageOptions?.onlyMainContent ?? false, + includeHtml: options.pageOptions?.includeHtml ?? false, + replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false, + parsePDF: options.pageOptions?.parsePDF ?? true, + removeTags: options.pageOptions?.removeTags ?? [], + includeMarkdown: options.pageOptions?.includeMarkdown ?? true, + includeRawHtml: options.pageOptions?.includeRawHtml ?? false, + waitFor: options.pageOptions?.waitFor ?? undefined, + headers: options.pageOptions?.headers ?? undefined, + includeLinks: options.pageOptions?.includeLinks ?? true, + fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false, + screenshot: options.pageOptions?.screenshot ?? false, }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; this.replaceAllPathsWithAbsolutePaths = diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 9f8419b6..58e0185e 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -122,23 +122,36 @@ function getScrapingFallbackOrder( export async function scrapSingleUrl( jobId: string, urlToScrap: string, - pageOptions: PageOptions = { - includeMarkdown: true, - onlyMainContent: true, - includeHtml: true, - includeRawHtml: false, - waitFor: 0, - screenshot: false, - fullPageScreenshot: false, - headers: undefined, - includeLinks: true - }, - extractorOptions: ExtractorOptions = { - mode: "llm-extraction-from-markdown", - }, - existingHtml: string = "", + pageOptions: PageOptions, + extractorOptions?: ExtractorOptions, + existingHtml?: string, priority?: number, ): Promise{ + pageOptions = { + includeMarkdown: pageOptions.includeMarkdown ?? true, + onlyMainContent: pageOptions.onlyMainContent ?? false, + includeHtml: pageOptions.includeHtml ?? false, + includeRawHtml: pageOptions.includeRawHtml ?? false, + waitFor: pageOptions.waitFor ?? 
undefined, + screenshot: pageOptions.screenshot ?? false, + fullPageScreenshot: pageOptions.fullPageScreenshot ?? false, + headers: pageOptions.headers ?? undefined, + includeLinks: pageOptions.includeLinks ?? true, + replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? false, + parsePDF: pageOptions.parsePDF ?? true, + removeTags: pageOptions.removeTags ?? [], + } + + if (extractorOptions) { + extractorOptions = { + mode: extractorOptions.mode ?? "llm-extraction-from-markdown", + } + } + + if (!existingHtml) { + existingHtml = ""; + } + urlToScrap = urlToScrap.trim(); const attemptScraping = async ( diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index afd80f42..80d53954 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -130,10 +130,12 @@ async function processJob(job: Job, token: string) { const end = Date.now(); const timeTakenInSeconds = (end - start) / 1000; - const rawHtml = docs[0].rawHtml; + const rawHtml = docs[0] ? docs[0].rawHtml : ""; if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) { - delete docs[0].rawHtml; + if (docs[0] && docs[0].rawHtml) { + delete docs[0].rawHtml; + } } const data = { From a37681bdff2e6bff8ac47a3b015c48afbcb28eec Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:16:46 -0300 Subject: [PATCH 16/16] fix: replace jest, removed map for v0 --- .../src/__tests__/e2e_withAuth/index.test.ts | 98 +------------------ 1 file changed, 3 insertions(+), 95 deletions(-) diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 82ed5bfe..330f8130 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -538,7 +538,7 @@ describe("E2E Tests for v0 API Routes", () => { const urls = completedResponse.body.data.map( (item: any) => item.metadata?.sourceURL ); - expect(urls.length).toBeGreaterThan(1); + expect(urls.length).toBeGreaterThanOrEqual(1); // Check if all URLs have a maximum depth of 1 urls.forEach((url: string) => { @@ -762,11 +762,11 @@ describe("E2E Tests for v0 API Routes", () => { .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://jestjs.io" }); + .send({ url: "https://docs.tatum.io", crawlerOptions: { limit: 200 } }); expect(crawlResponse.statusCode).toBe(200); - await new Promise((r) => setTimeout(r, 20000)); + await new Promise((r) => setTimeout(r, 10000)); const responseCancel = await request(TEST_URL) .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) @@ -869,96 +869,4 @@ describe("E2E Tests for v0 API Routes", () => { 60000 ); // 60 secs }); - - describe("POST /v0/map", () => { - it.concurrent( - "should return a list of links for mendable.ai without subdomains included", - async () => { - const response = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - }); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - expect(response.body.links).not.toContain("https://docs.mendable.ai"); - expect(Array.isArray(response.body.links)).toBe(true); - 
expect(response.body.links.length).toBeGreaterThan(0); - }, - 60000 - ); // 60 secs - - it.concurrent( - "should return a list of links for a given URL with subdomains included", - async () => { - const response = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://python.langchain.com", - includeSubdomains: true, - }); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - expect(Array.isArray(response.body.links)).toBe(true); - expect(response.body.links.length).toBeGreaterThan(0); - }, - 60000 - ); // 60 secs - - it.concurrent( - "should return a list of links for a given URL with subdomains and search", - async () => { - const response = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://python.langchain.com", - includeSubdomains: true, - search: "agents", - }); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - expect(response.body.links).toContain( - "https://api.python.langchain.com/en/latest/_modules/langchain/agents/openai_functions_agent/base.html" - ); - expect(Array.isArray(response.body.links)).toBe(true); - expect(response.body.links.length).toBeGreaterThan(0); - response.body.links.forEach((link) => { - expect(link).toContain("python.langchain.com"); - }); - }, - 60000 - ); // 60 secs - - it.concurrent( - "should handle invalid URL input gracefully", - async () => { - const response = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "invalid-url", - includeSubdomains: true, - search: "agents", - }); - - expect(response.statusCode).toBe(400); - expect(response.body).toHaveProperty("success", false); - expect(response.body).toHaveProperty("details"); - }, - 60000 - ); // 60 secs - }); });
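
A minimal sketch of the v1 Python surface these patches converge on (illustrative
only; it mirrors the e2e tests above and assumes a running API at API_URL with a
valid TEST_API_KEY):

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)  # v1 unless version='v0' is passed

    # Scrape: v1 returns a flat document, with no 'data' wrapper and no legacy 'content' field.
    doc = app.scrape_url('https://roastmywebsite.ai', {'formats': ['markdown', 'links']})
    assert 'markdown' in doc and 'content' not in doc

    # Crawl: start without waiting, then poll /v1/crawl/{id} through check_crawl_status.
    started = app.crawl_url('https://firecrawl.dev', {'limit': 10}, False)
    status = app.check_crawl_status(started['id'])
    # status carries: success, status ('scraping' while running), totalCount,
    # creditsUsed, expiresAt, next, data

    # Map: v1 returns the discovered links directly, as a list of URLs.
    links = app.map_url('https://roastmywebsite.ai')

    # Search is not supported in v1 and raises ("Search is not supported in v1").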