From 0b8df5e264187c4c8d0f0ce7a0aad2569d646e31 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Thu, 8 Aug 2024 14:25:09 -0300
Subject: [PATCH] python sdk and tests

---
 apps/python-sdk/examplev0.py                  |  75 ++++++++
 .../firecrawl/__tests__/e2e_withAuth/test.py  |  32 ++--
 .../__tests__/v1/e2e_withAuth/.env.example    |   3 +
 .../__tests__/v1/e2e_withAuth/__init__.py     |   0
 .../test.cpython-311-pytest-8.2.1.pyc         | Bin 0 -> 44947 bytes
 .../__tests__/v1/e2e_withAuth/test.py         | 168 ++++++++++++++++++
 apps/python-sdk/firecrawl/firecrawl.py        |  75 +++++---
 7 files changed, 317 insertions(+), 36 deletions(-)
 create mode 100644 apps/python-sdk/examplev0.py
 create mode 100644 apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example
 create mode 100644 apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py
 create mode 100644 apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc
 create mode 100644 apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py

diff --git a/apps/python-sdk/examplev0.py b/apps/python-sdk/examplev0.py
new file mode 100644
index 00000000..d80fa795
--- /dev/null
+++ b/apps/python-sdk/examplev0.py
@@ -0,0 +1,75 @@
+import uuid
+from firecrawl.firecrawl import FirecrawlApp
+
+app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
+
+# Scrape a website:
+scrape_result = app.scrape_url('firecrawl.dev')
+print(scrape_result['markdown'])
+
+# Crawl a website:
+idempotency_key = str(uuid.uuid4())  # optional idempotency key
+crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
+print(crawl_result)
+
+# LLM Extraction:
+# Define schema to extract contents into using pydantic
+from pydantic import BaseModel, Field
+from typing import List
+
+class ArticleSchema(BaseModel):
+    title: str
+    points: int
+    by: str
+    commentsURL: str
+
+class TopArticlesSchema(BaseModel):
+    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
+
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': TopArticlesSchema.model_json_schema(),
+        'mode': 'llm-extraction'
+    },
+    'pageOptions':{
+        'onlyMainContent': True
+    }
+})
+
+print(llm_extraction_result['llm_extraction'])
+
+# Define schema to extract contents into using json schema
+json_schema = {
+    "type": "object",
+    "properties": {
+        "top": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "title": {"type": "string"},
+                    "points": {"type": "number"},
+                    "by": {"type": "string"},
+                    "commentsURL": {"type": "string"}
+                },
+                "required": ["title", "points", "by", "commentsURL"]
+            },
+            "minItems": 5,
+            "maxItems": 5,
+            "description": "Top 5 stories on Hacker News"
+        }
+    },
+    "required": ["top"]
+}
+
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': json_schema,
+        'mode': 'llm-extraction'
+    },
+    'pageOptions':{
+        'onlyMainContent': True
+    }
+})
+
+print(llm_extraction_result['llm_extraction'])
\ No newline at end of file
diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
index 452d4982..457c206a 100644
--- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
+++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
@@ -20,31 +20,31 @@ FirecrawlApp = firecrawl.FirecrawlApp
 
 def test_no_api_key():
     with pytest.raises(Exception) as excinfo:
-        invalid_app = FirecrawlApp(api_url=API_URL)
+        invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
     assert "No API key provided" in str(excinfo.value)
 
 def test_scrape_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
     with pytest.raises(Exception) as excinfo:
         invalid_app.scrape_url('https://firecrawl.dev')
     assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
 def test_blocklisted_url():
     blocklisted_url = "https://facebook.com/fake-test"
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
    with pytest.raises(Exception) as excinfo:
         app.scrape_url(blocklisted_url)
     assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
 
 def test_successful_response_with_valid_preview_token():
-    app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
+    app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
     response = app.scrape_url('https://roastmywebsite.ai')
     assert response is not None
     assert 'content' in response
     assert "_Roast_" in response['content']
 
 def test_scrape_url_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.scrape_url('https://roastmywebsite.ai')
     assert response is not None
     assert 'content' in response
@@ -54,7 +54,7 @@ def test_scrape_url_e2e():
     assert "_Roast_" in response['content']
 
 def test_successful_response_with_valid_api_key_and_include_html():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
     assert response is not None
     assert 'content' in response
@@ -66,7 +66,7 @@ def test_successful_response_with_valid_api_key_and_include_html():
     assert "
 0
@@ -104,7 +104,7 @@ def test_crawl_url_wait_for_completion_e2e():
     assert "_Roast_" in response[0]['content']
 
 def test_crawl_url_with_idempotency_key_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     uniqueIdempotencyKey = str(uuid4())
     response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
     assert response is not None
@@ -117,7 +117,7 @@ def test_crawl_url_with_idempotency_key_e2e():
     assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
 
 def test_check_crawl_status_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
     assert response is not None
     assert 'jobId' in response
@@ -131,20 +131,20 @@ def test_check_crawl_status_e2e():
     assert len(status_response['data']) > 0
 
 def test_search_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.search("test query")
     assert response is not None
     assert 'content' in response[0]
     assert len(response) > 2
 
 def test_search_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
     with pytest.raises(Exception) as excinfo:
         invalid_app.search("test query")
     assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
 def test_llm_extraction():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.scrape_url("https://mendable.ai", {
         'extractorOptions': {
             'mode': 'llm-extraction',
diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example
new file mode 100644
index 00000000..904887bf
--- /dev/null
+++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example
@@ -0,0 +1,3 @@
+API_URL=http://localhost:3002
+ABSOLUTE_FIRECRAWL_PATH=/Users/user/firecrawl/apps/python-sdk/firecrawl/firecrawl.py
+TEST_API_KEY=fc-YOUR_API_KEY
\ No newline at end of file
diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ba1f1324fe139772739cdae776d127cd5002ca8
GIT binary patch
literal 44947
[base85-encoded binary payload of the compiled test module (44947 bytes) omitted]
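Like the v0 suite above, which rebinds FirecrawlApp from a dynamically loaded module, the v1 e2e tests are presumably wired to load the SDK straight from the checkout using the values in .env.example. A minimal sketch of that wiring, assuming python-dotenv and the standard importlib loader; the variable names are illustrative, not the patch's exact test code:

import importlib.util
import os

from dotenv import load_dotenv  # assumes python-dotenv is available

load_dotenv()  # reads API_URL, TEST_API_KEY, ABSOLUTE_FIRECRAWL_PATH from a local .env

API_URL = os.getenv('API_URL', 'http://localhost:3002')
TEST_API_KEY = os.getenv('TEST_API_KEY')
ABSOLUTE_FIRECRAWL_PATH = os.getenv('ABSOLUTE_FIRECRAWL_PATH')

# Import firecrawl.py directly from source so the tests exercise the checked-out SDK.
spec = importlib.util.spec_from_file_location("firecrawl", ABSOLUTE_FIRECRAWL_PATH)
firecrawl = importlib.util.module_from_spec(spec)
spec.loader.exec_module(firecrawl)
FirecrawlApp = firecrawl.FirecrawlApp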
diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
new file mode 100644
--- /dev/null
+++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
+    assert 'content' not in response[0]
+    assert 'markdown' in response[0]
+    assert "_Roast_" in response[0]['markdown']
+
+def test_crawl_url_with_idempotency_key_e2e():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    uniqueIdempotencyKey = str(uuid4())
+    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
+    assert response is not None
+    assert len(response) > 0
+    assert 'content' in response[0]
+    assert "_Roast_" in response[0]['content']
+
+    with pytest.raises(Exception) as excinfo:
+        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
+    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
+
+def test_check_crawl_status_e2e():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
+    assert response is not None
+    assert 'jobId' in response
+
+    time.sleep(30)  # wait for 30 seconds
+    status_response = app.check_crawl_status(response['jobId'])
+    assert status_response is not None
+    assert 'status' in status_response
+    assert status_response['status'] == 'completed'
+    assert 'data' in status_response
+    assert len(status_response['data']) > 0
+
+def test_search_e2e():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    with pytest.raises(NotImplementedError) as excinfo:
+        app.search("test query")
+    assert "Search is not supported in v1" in str(excinfo.value)
+
+def test_llm_extraction():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.scrape_url("https://mendable.ai", {
+        'extractorOptions': {
+            'mode': 'llm-extraction',
+            'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+            'extractionSchema': {
+                'type': 'object',
+                'properties': {
+                    'company_mission': {'type': 'string'},
+                    'supports_sso': {'type': 'boolean'},
+                    'is_open_source': {'type': 'boolean'}
+                },
+                'required': ['company_mission', 'supports_sso', 'is_open_source']
+            }
+        }
+    })
+    assert response is not None
+    assert 'llm_extraction' in response
+    llm_extraction = response['llm_extraction']
+    assert 'company_mission' in llm_extraction
+    assert isinstance(llm_extraction['supports_sso'], bool)
+    assert isinstance(llm_extraction['is_open_source'], bool)
+
+def test_map_e2e():
+    app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
+    response = app.map_url('https://roastmywebsite.ai')
+    assert response is not None
+    assert isinstance(response, list)
+
\ No newline at end of file
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 7ec0d33f..25c9663e 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -19,24 +19,22 @@ import requests
 
 logger : logging.Logger = logging.getLogger("firecrawl")
 
 class FirecrawlApp:
-    """
-    Initialize the FirecrawlApp instance.
+    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, version: str = 'v1') -> None:
+        """
+        Initialize the FirecrawlApp instance with API key, API URL, and version.
 
-    Args:
-        api_key (Optional[str]): API key for authenticating with the Firecrawl API.
-        api_url (Optional[str]): Base URL for the Firecrawl API.
-    """
-    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
-        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
-        if self.api_key is None:
-            logger.warning("No API key provided")
-            raise ValueError('No API key provided')
-        else:
-            logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
-
-        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
-        if self.api_url != 'https://api.firecrawl.dev':
-            logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
+        Args:
+            api_key (Optional[str]): API key for authenticating with the Firecrawl API.
+            api_url (Optional[str]): Base URL for the Firecrawl API.
+            version (str): API version, either 'v0' or 'v1'.
+        """
+        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+        self.version = version
+        if self.api_key is None:
+            logger.warning("No API key provided")
+            raise ValueError('No API key provided')
+        logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key} and version: {self.version}")
 
     def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
         """
@@ -75,9 +73,11 @@ class FirecrawlApp:
             for key, value in params.items():
                 if key != 'extractorOptions':
                     scrape_params[key] = value
+
+        endpoint = f'/{self.version}/scrape'
         # Make the POST request with the prepared headers and JSON data
         response = requests.post(
-            f'{self.api_url}/v0/scrape',
+            f'{self.api_url}{endpoint}',
             headers=headers,
             json=scrape_params,
         )
@@ -104,6 +104,9 @@ class FirecrawlApp:
        Raises:
            Exception: If the search request fails.
        """
+        if self.version == 'v1':
+            raise NotImplementedError("Search is not supported in v1")
+
        headers = self._prepare_headers()
        json_data = {'query': query}
        if params:
@@ -145,11 +148,12 @@ class FirecrawlApp:
        Raises:
            Exception: If the crawl job initiation or monitoring fails.
        """
+        endpoint = f'/{self.version}/crawl'
        headers = self._prepare_headers(idempotency_key)
        json_data = {'url': url}
        if params:
            json_data.update(params)
-        response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
            job_id = response.json().get('jobId')
            if wait_until_done:
@@ -172,13 +176,44 @@ class FirecrawlApp:
        Raises:
            Exception: If the status check request fails.
        """
+        endpoint = f'/{self.version}/crawl/status/{job_id}'
        headers = self._prepare_headers()
-        response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
+        response = self._get_request(f'{self.api_url}{endpoint}', headers)
        if response.status_code == 200:
            return response.json()
        else:
            self._handle_error(response, 'check crawl status')
 
+    def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+        """
+        Perform a map search using the Firecrawl API.
+        """
+        if self.version == 'v0':
+            raise NotImplementedError("Map is not supported in v0")
+
+        endpoint = f'/{self.version}/map'
+        headers = self._prepare_headers()
+
+        # Prepare the base scrape parameters with the URL
+        json_data = {'url': url}
+        if params:
+            json_data.update(params)
+
+        # Make the POST request with the prepared headers and JSON data
+        response = requests.post(
+            f'{self.api_url}{endpoint}',
+            headers=headers,
+            json=json_data,
+        )
+        if response.status_code == 200:
+            response = response.json()
+            if response['success'] and 'data' in response:
+                return response['data']
+            else:
+                raise Exception(f'Failed to map URL. Error: {response["error"]}')
+        else:
+            self._handle_error(response, 'map')
+
     def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
         Prepare the headers for API requests.
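For quick reference, a brief usage sketch of the client surface this patch adds, in the spirit of examplev0.py; the URLs and API key are placeholders, and the comments only restate what the diff above implements:

from firecrawl.firecrawl import FirecrawlApp

# Defaults to the new v1 endpoints; pass version='v0' to keep the old behaviour.
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
v0_app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version='v0')

scrape_result = app.scrape_url('https://firecrawl.dev')  # POST /v1/scrape
links = app.map_url('https://firecrawl.dev')             # POST /v1/map, returns the response's 'data' payload
print(links)

# search() is rejected on v1 with NotImplementedError("Search is not supported in v1"),
# and map_url() is likewise rejected when version='v0'.

Passing version='v0' keeps the client on the old /v0 endpoints, which is exactly what the updated v0 e2e suite does.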