From 170a8ebfe58c9f4ae377976ab956119e2b0b5773 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 27 Aug 2024 11:58:42 -0300 Subject: [PATCH] Nick: --- apps/api/src/routes/v1.ts | 5 +- .../build/lib/firecrawl/__init__.py | 1 - .../build/lib/firecrawl/firecrawl.py | 299 ------------------ .../dist/firecrawl-py-0.0.12.tar.gz | Bin 6754 -> 0 bytes .../dist/firecrawl_py-0.0.12-py3-none-any.whl | Bin 6407 -> 0 bytes .../python-sdk/firecrawl_py.egg-info/PKG-INFO | 179 ----------- .../firecrawl_py.egg-info/SOURCES.txt | 9 - .../dependency_links.txt | 1 - .../firecrawl_py.egg-info/requires.txt | 3 - .../firecrawl_py.egg-info/top_level.txt | 1 - 10 files changed, 2 insertions(+), 496 deletions(-) delete mode 100644 apps/python-sdk/build/lib/firecrawl/__init__.py delete mode 100644 apps/python-sdk/build/lib/firecrawl/firecrawl.py delete mode 100644 apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz delete mode 100644 apps/python-sdk/dist/firecrawl_py-0.0.12-py3-none-any.whl delete mode 100644 apps/python-sdk/firecrawl_py.egg-info/PKG-INFO delete mode 100644 apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt delete mode 100644 apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt delete mode 100644 apps/python-sdk/firecrawl_py.egg-info/requires.txt delete mode 100644 apps/python-sdk/firecrawl_py.egg-info/top_level.txt diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index a9f4d396..27da0a1a 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -7,16 +7,14 @@ import { mapController } from "../controllers/v1/map"; import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types"; import { RateLimiterMode } from "../types"; import { authenticateUser } from "../controllers/auth"; -import { Logger } from "../lib/logger"; import { createIdempotencyKey } from "../services/idempotency/create"; import { validateIdempotencyKey } from "../services/idempotency/validate"; -import { ZodError } from "zod"; import { checkTeamCredits } from "../services/billing/credit_billing"; -import { v4 as uuidv4 } from "uuid"; import expressWs from "express-ws"; import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { crawlCancelController } from "../controllers/v1/crawl-cancel"; +import { Logger } from "../lib/logger"; // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview"; // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status"; // import { searchController } from "../../src/controllers/v1/search"; @@ -33,6 +31,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R } const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum); if (!success) { + Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`); return res.status(402).json({ success: false, error: "Insufficient credits" }); } req.account = { remainingCredits } diff --git a/apps/python-sdk/build/lib/firecrawl/__init__.py b/apps/python-sdk/build/lib/firecrawl/__init__.py deleted file mode 100644 index e7f8063d..00000000 --- a/apps/python-sdk/build/lib/firecrawl/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .firecrawl import FirecrawlApp diff --git a/apps/python-sdk/build/lib/firecrawl/firecrawl.py b/apps/python-sdk/build/lib/firecrawl/firecrawl.py deleted file mode 100644 index 3f50c798..00000000 --- a/apps/python-sdk/build/lib/firecrawl/firecrawl.py +++ /dev/null @@ -1,299 +0,0 @@ -""" -FirecrawlApp Module - -This module provides a class `FirecrawlApp` for interacting with the Firecrawl API. -It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs, -and check the status of these jobs. The module uses requests for HTTP communication -and handles retries for certain HTTP status codes. - -Classes: - - FirecrawlApp: Main class for interacting with the Firecrawl API. -""" - -import os -import time -from typing import Any, Dict, Optional - -import requests - - -class FirecrawlApp: - """ - Initialize the FirecrawlApp instance. - - Args: - api_key (Optional[str]): API key for authenticating with the Firecrawl API. - api_url (Optional[str]): Base URL for the Firecrawl API. - """ - def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: - self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') - if self.api_key is None: - raise ValueError('No API key provided') - self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') - def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: - """ - Scrape the specified URL using the Firecrawl API. - - Args: - url (str): The URL to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scrape request. - - Returns: - Any: The scraped data if the request is successful. - - Raises: - Exception: If the scrape request fails. - """ - - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}' - } - # Prepare the base scrape parameters with the URL - scrape_params = {'url': url} - - # If there are additional params, process them - if params: - # Initialize extractorOptions if present - extractor_options = params.get('extractorOptions', {}) - # Check and convert the extractionSchema if it's a Pydantic model - if 'extractionSchema' in extractor_options: - if hasattr(extractor_options['extractionSchema'], 'schema'): - extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema() - # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided - extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction') - # Update the scrape_params with the processed extractorOptions - scrape_params['extractorOptions'] = extractor_options - - # Include any other params directly at the top level of scrape_params - for key, value in params.items(): - if key != 'extractorOptions': - scrape_params[key] = value - # Make the POST request with the prepared headers and JSON data - response = requests.post( - f'{self.api_url}/v0/scrape', - headers=headers, - json=scrape_params, - ) - if response.status_code == 200: - response = response.json() - if response['success'] and 'data' in response: - return response['data'] - else: - raise Exception(f'Failed to scrape URL. Error: {response["error"]}') - elif response.status_code in [402, 408, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') - else: - raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') - - def search(self, query, params=None): - """ - Perform a search using the Firecrawl API. - - Args: - query (str): The search query. - params (Optional[Dict[str, Any]]): Additional parameters for the search request. - - Returns: - Any: The search results if the request is successful. - - Raises: - Exception: If the search request fails. - """ - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}' - } - json_data = {'query': query} - if params: - json_data.update(params) - response = requests.post( - f'{self.api_url}/v0/search', - headers=headers, - json=json_data - ) - if response.status_code == 200: - response = response.json() - - if response['success'] and 'data' in response: - return response['data'] - else: - raise Exception(f'Failed to search. Error: {response["error"]}') - - elif response.status_code in [402, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') - else: - raise Exception(f'Failed to search. Status code: {response.status_code}') - - def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None): - """ - Initiate a crawl job for the specified URL using the Firecrawl API. - - Args: - url (str): The URL to crawl. - params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. - wait_until_done (bool): Whether to wait until the crawl job is completed. - timeout (int): Timeout between status checks when waiting for job completion. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. - - Returns: - Any: The crawl job ID or the crawl results if waiting until completion. - - Raises: - Exception: If the crawl job initiation or monitoring fails. - """ - headers = self._prepare_headers(idempotency_key) - json_data = {'url': url} - if params: - json_data.update(params) - response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers) - if response.status_code == 200: - job_id = response.json().get('jobId') - if wait_until_done: - return self._monitor_job_status(job_id, headers, timeout) - else: - return {'jobId': job_id} - else: - self._handle_error(response, 'start crawl job') - - def check_crawl_status(self, job_id): - """ - Check the status of a crawl job using the Firecrawl API. - - Args: - job_id (str): The ID of the crawl job. - - Returns: - Any: The status of the crawl job. - - Raises: - Exception: If the status check request fails. - """ - headers = self._prepare_headers() - response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) - if response.status_code == 200: - return response.json() - else: - self._handle_error(response, 'check crawl status') - - def _prepare_headers(self, idempotency_key=None): - """ - Prepare the headers for API requests. - - Args: - idempotency_key (Optional[str]): A unique key to ensure idempotency of requests. - - Returns: - Dict[str, str]: The headers including content type, authorization, and optionally idempotency key. - """ - if idempotency_key: - return { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}', - 'x-idempotency-key': idempotency_key - } - - return { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}', - } - - def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5): - """ - Make a POST request with retries. - - Args: - url (str): The URL to send the POST request to. - data (Dict[str, Any]): The JSON data to include in the POST request. - headers (Dict[str, str]): The headers to include in the POST request. - retries (int): Number of retries for the request. - backoff_factor (float): Backoff factor for retries. - - Returns: - requests.Response: The response from the POST request. - - Raises: - requests.RequestException: If the request fails after the specified retries. - """ - for attempt in range(retries): - response = requests.post(url, headers=headers, json=data) - if response.status_code == 502: - time.sleep(backoff_factor * (2 ** attempt)) - else: - return response - return response - - def _get_request(self, url, headers, retries=3, backoff_factor=0.5): - """ - Make a GET request with retries. - - Args: - url (str): The URL to send the GET request to. - headers (Dict[str, str]): The headers to include in the GET request. - retries (int): Number of retries for the request. - backoff_factor (float): Backoff factor for retries. - - Returns: - requests.Response: The response from the GET request. - - Raises: - requests.RequestException: If the request fails after the specified retries. - """ - for attempt in range(retries): - response = requests.get(url, headers=headers) - if response.status_code == 502: - time.sleep(backoff_factor * (2 ** attempt)) - else: - return response - return response - - def _monitor_job_status(self, job_id, headers, timeout): - """ - Monitor the status of a crawl job until completion. - - Args: - job_id (str): The ID of the crawl job. - headers (Dict[str, str]): The headers to include in the status check requests. - timeout (int): Timeout between status checks. - - Returns: - Any: The crawl results if the job is completed successfully. - - Raises: - Exception: If the job fails or an error occurs during status checks. - """ - while True: - status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) - if status_response.status_code == 200: - status_data = status_response.json() - if status_data['status'] == 'completed': - if 'data' in status_data: - return status_data['data'] - else: - raise Exception('Crawl job completed but no data was returned') - elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']: - timeout=max(timeout,2) - time.sleep(timeout) # Wait for the specified timeout before checking again - else: - raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}') - else: - self._handle_error(status_response, 'check crawl status') - - def _handle_error(self, response, action): - """ - Handle errors from API responses. - - Args: - response (requests.Response): The response object from the API request. - action (str): Description of the action that was being performed. - - Raises: - Exception: An exception with a message containing the status code and error details from the response. - """ - if response.status_code in [402, 408, 409, 500]: - error_message = response.json().get('error', 'Unknown error occurred') - raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') - else: - raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}') diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz deleted file mode 100644 index 83cd72218ad5d077ab60e7b4eded13c825e9b3b0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6754 zcma)=RZyIPvaJUv*x*iZcY?bI2o6C5gS!)42G>AvXK)W5AOt5!kl^m_?*5;>>sH;z zQ|Dz>SFcsoul+5`SX5L45(6uEz{=Lm(%j9|%Yog+hnT4&_Ueqk}hiHA*_S$;3>(sM04~RP#Idv*~4iRnlj$n*7 z6A9acykw|(eIGURlK+@%^ssL1=T&C!DZ=^e7Ye(SmH-~Ck0QR}F5MfuciJ@nYhFBT z?u>jn$3lKtr+9O2s~Yw%>zZCY^m+<4Cfe!hTBc~H3f)hEuID1cC{#`|DVFGW^B2B8 zjgvvOj-k~+`YqJitG4PE09OKP>s^lTVeO9>DOX+7=d}|+ZeSZ^1`x>y87JJi5C1pv z-PS?G(?I}QWV(a=mhkmoysO#xLBij5Q;T(>WMb8+AUpU#p55(!k%z`Uf%8C`Fsi>U3xF1G5`~y-Xd!~PFz}^E^0Gg6~qi( zKcsUCeDda{s|{6zuQL;gSmWbi%QeBxB+)FDIiIn!mcmuFG2b?3P92x=bNlos&c0=Z zNy)~dy)5>b>T4l&5&wh$CCxSgm-CLQ5n8dI%6U1_XuaVMhHsjzg&-Zm6kSk6vT|J& z`vd47(=FDEhyaGa4w%Ki)ci43^Jd~ZoW@bYYoBX{M1h{vPxPF^6qQJjlE0t`-Sf2% z@ru;!W+?&#f=KY0$sz4>mugTw_HZ!tv~@C0^9&&$&in|H+$e`XP2x;(;dwyL&3S`@ z>Q1rLNTQP<<$ktS$n-Y5+}z&U;t`i_IDhanm1Vz&yj*NYhUAIY7)a&r+aIs?+TSnB zPTx7FcmhQ=jg?cVIt{btAx%H5VO5N_WvBPue>@ZXDU&Y0uBSpX&y5=0f;J%(5S6%w zhqnH59EUb!!6iNg&`~vOB)9O>b$v^KGoH4={q3IeZ?6i}&*}u@+{>Re1I@u(`W++Q zcbdM?9WH!%BprCfWPg=jiCXA0fBlm1Fc2So>eCY-ODHA16{IfW)`1kfcPd2Re2$H^ z1&gN@DAfq57{~w#;=cASzOt6{2iF{tKRS9B`2w>;{-Em;%ft=atg_g-XWun6rRl$-UN~I z&wXw&tUlCy8b1xDyy=pjCyDxih?|vMFy{8rk)1;*Loh@?pdUjTZf>az^~s8u0&j+B zV%mD?Aq^g<*F>HJf5cV)({20JZ>O&+Zt_Bk(?_0R-Q~qm=RqPGT6mP-8f0qL$OA>a zZ(V`wL?n+N_b7muT#%BxiS*XM5Ahd}kz}xg;BVAdmSYK~bygm!G^9$Ba~p-K_I{M? zI9V=wfgZ6rnL-nze0;{yoGR(Lo*%S`$XuxAGy*|vz6XEwmSE42$|*|mYYoJuNZVRW zvbFdWX+FO*g9}^j{*bhX&(*XiA9~6W<b^>aOz7V#U`M4{bUauWLpcpK%y_?(N8-S>op&By5VHH9b% z>UXb`-uUdn@(S&G8|i$;BX4#Wrg8EwdA`FjA@7;q^V1?-1^!enl; z3JA?3{16?7s;D0nmf)dzluU*D3^sS>7KZG~%_Z9OTa$329A{OKX+>X76y77Z!p3=K&v%7QRaMg$#}4pb)v0kim>l z*2Y%M7iCo8*Kg4BC@O{n>St_QMRgzcO#vpJAp_Zp_K(gWLB3gg0D)&HOLrCrs%t)5 zY|NE9L@;h$z^C!3tgXVr3{%dmo)8fvvZfis#}=k=K?Qcad9aW%KfFqRTICB=1uO>Cl4~*69nFaI*Je#Y(Iwhg3s78aU?S1q=tKhY2q^B zP54OL>1U|mlvM*ZSxrB0a=LgD2PUyyjtdv2*Xv*W+ZWkpKI5Frf_4gCK2t_f;-IEpyB?0vkA+;yn2>{ZIj7@ zSRw=2QU8wZ@KccF$iu_Ur0GM{!SHw`OoFmhe-FpqC-woUq@~j{lw&m}W&&M;HWcY+djc(ejzG<(IJ4h`4q|{KVWYt;N`Y{}9y;50z z&I3r+JIMh-v-wpsOMRatD|(k>j~38t3?+0q>2r6V9`8qxAEUizkaaW1_PD*;C$%iw zGv_Z?GGj?Ar=42Js+yMX@3cvayKia3-W84LoAF|kA`zSDiwDUl5qs(owMP0S`1gdf z_Tn}-T7FJu9hem@vr`z`gSd||C=wTve(5y!-Iyh?K z*XBgsT*$kxf;6hQZsz#C7}@fdOFi3Jf5s?CQW%gOk=;ELz~WUe84sDbVkj9nwJ41W z4?W14PdL94esuF*wxXZ3Ql*9*?P`ttq2Zgl8!^md1-WLqP88dS>4|)y-DK(+XSF&= zFkEO+bU`VK|L})ODiqb1e=H)iXWvCji!_=9zFCA23;8)ZjL<6UyF@#IL&uE!+%nhZ zR{w zFdnf6qvfI`Q+3RBMm-?>OfZ@ReXna`E0)>yQ0ZL_cvTC0Q3gn3$;j|{1j`GuZp7A= zQ!f0K8{)HBUWpBg=HjsDX+h+rI7c-i4kc9z3pP@|cCFvvWI2)$Zi; z;x`l0>!CFW0yf7XG;iyx`tiX1*B3y43s@C>>$+}en_gW0uxxMyJ$L{HLtp0yQT9fC zU=Pr;4+o78K>qsi?Z&yMb=k7*@dMy4Dx6Uqs`!5uH3H?Ss%qgktxlk^g&a6Mco=W& zgL;nKLCcOKXWALD=>hO5IJXxl3w(sl}418TPc13<$1rEC85<{mkK0+@f2$fjxS zBr_x&jqEghp@fOY0xe#Cxo4_mU!_{6=Pm(jQYeO2<}q!Zfzr*3+A;v`d>wpTSB2)E zb~OXNZ~M&h&^|alXP~XEW7Yp{{un&*=JhPH^d>Q=RiTYRpBIxC5IpjSc*t5}hg<}P zq^SV6%-eXzr7DA;h^kOe@HG)~bGVID<1vk0v_ex8^ccc1@)q-`bi;Px$}Iwxki!h6 zCrLxGq$S+JO$?aKMx>Q8tIw%zZcJ9<8cP5XUF1lFWy)ttlQtqq*ByjK)So`J$Jp56 zN=7P&<{?Vc!jPx6{mZF)B+K*HN_Tqexx-`#We*|c)jn%q+D)Pjwq85CY!c?r=$sP3 zG?Ua-xp#94)86qk`wUjdjq~NzR!K}#0(-h@9Egy57z~nL%m4hhmSo|*y zT9drTdGfve7n=`5qu!KT8~NIIf#Z{Dw}%q%4f~=ox~x)F=^qW=rj`T=?tihMv+^C(R*K<`U1C{D z%)9QRWKLwVlr0zy>r>PV*rE0y;64$TQ5*{TT1GP-fpWtYN|m{xAwAufOx=**jCMg4 zrw*RBKqR1srJJzOjR&sODN4dDk}Tc+?&Fv?WQ*nbNozH8an~%Vk6d3btpE+U>ZS&- z0_Etwbs)<4RG zj32^dn$UWha!tZtnrGKdD5t6S%qgj*uI!lAP*>rHU|yM*XzGk9;SI{e3&vPF3R%}n zb&AfRpEk(iz#x&lP4|UoaTJTL zgnE%Hklt{!hI6LCl)F%OVsAKo)XN!1FMN5zRZB}Opgv)5`A!cIa(3YN+^B>4PSpk<5xK7Gprs9S%K3Z~ zYRvyPf3;Kl(iEFL@v%vnt6=9pboImt%z@8RPa4%Px0?^w^u=itg_*rJxyyn>%9eog znS1}FJJ)5EJ=rU5mov$RunV@;WS^+rn$3y~nk8G$=aye#qV`5^Xh>{#ZxsR3Sibgo zT-5_f*Le%9+UhLVXn>L`^|C+pU9f*e12sJ8WPPx<)kfdwgDFE}nv;(&eE(?FuO_oG zF5maLF9PZl-{K(}W6QStU14T~t4~@fS9^{lXhLe;;|#Yk@^1!DViWKEHy1@g|G4z) z#M15YA5EJZ7#(359H;l*OZpO!QEvmJyfZNNlTn0GK2b$nXg5Mo-n63-+O)LNi}_+0c`e}S>&()K!wfRv%<(}+>A_CLt55u~DUJZs><@pU zzf)laRMf0-Sm(*a#``u?v(^QIN^0iXh;TMH#&^coojO&#_{Tj9P#~K zfUO`;SMXV)2NyL9>Gd2s1dns*uii<#m9U)#BlYB>?ZIma2`ss2rIR)dD2id&fTyOw$&LPn8dHyGZ z-ORl5(e5njz)62UQfGdULxUSdvrQWE=~)9=*!yG>pOi8CaO&sGncqA6d0IEehJv4& zkq0X8TU|!(^pZ}?5{_C>)Hr8&MhlgpRj5i_05xv;&4Dv}kYE6Fa#vT&M5=Ph&ZELm zm|b{&bj}Yx_mo+!TW45~X!vOAKfyw0i0z_VZc$wwg=Tr%@6MhrnD8GZsM z+43{le9_5DeD6%y_5FReUWu)Sod_h?T=7>leHu+z5`kM2yXTtb=h1SG`cyl92G-uc z8IpF0 za06I;3b52DVAZk=D|j5g z!jW%fkNUuDIY^{3CG+{je)%@;Uvyi0FHFmGcz&tCvA9%li{8kEvNuP*5E2fk7mZeB zb7bBV{dzc_hBonQ9s5|qbQ`2|8@F|Ae4Ei4_xy*)?Q6Yqc*4~U-SCc(B&dnE<1$*? zsH$1ZSu)JCNf8fW?K3P<@F%>Q22xW0p?t^;d}6eqSMkCXRM{p)bW+S=?I2}}d1@>`2AP_%sb6Xodz+yUqw z1N&=0>=Wc1n0R|>&=RL!1L)ePy_I4JB?ohp2^3#mGxYs3m?faY57sk zvM9FuU_CpEtyMJ7W0`$|pR6P+`qjmSaEZ)P=i?m-xHR5#m)oL{BJv0;YT@ZXMK_|GBD7g$&wd&x04n@ zHt%#fEgg6N)s-?#mDgfs;5MlpuKAHzw@dm9O>2=cmvv-%?BRur0&=k;2$RwD!!DtJ zkX4E=%9IMV`sN3Uk=m5PNaCl#BCC76!4hwpD=vE?K{@7ZEVz^h7f*$|{SyZ*6sR-5 z2~Qo~0alv0Qf(Ls&DuPSY?WzO97O7$e7Q&ybNH^W>H7O>9KbB!a1i;>hd45X5XZ$JPk7hSSEKiC_)yCYaOQkw>4~U zn1SNm%p*kj?LX}B^tibVSW}7z_--%=V(ASCmYbO+|KqyUwSf;bP4H2K6&9`0slTHcmG!)_<*S`8wB-WD z7~jWmRsQ|*Wz%ov?7>H_2+i8=lK)+YhUFhEFIj>>xruU`+>gcn^$QiUMM{_2G#Uzv zOdV%G=rlSN3MW~zQi;^M47|H0%L~aVXbcokoQp9mf6Ev+4A)P&tRK*L0QvuzdcxKqFA;z;gZc%k}l(qr}tX_rRMX U%KtxJGl~P?v1J`8&;4H6s%_YCd?3GV(l z_trb-+|+sRcGs$|{YU@m>s@PAefw)=IV5C4004jvATNs1ipgJr>bpnOixV znK&DH*s>cMTG?B<8XB@WddV^Ok7$g_4Y0^6u*tHFsPql8_OUamI0j?9ASQG~04V>7 zEKB^Zm5#|Y{{aX9T(SWG#Q%Z(6aL6t&@ptFdx0NtQMjCs6OY-8;jFtn9f;Fcux+$-x=rb?vCm(9??zI zGVo*|zL_;%z$Cc6TWG_SIv}>1$$lclNlYd~WfT z1eqtukV?_%PF?S03{cTIe)%;KhC;;4*o_t7#PA~uhB7}kJxS5^_Sb9PwII2LfYyc= z^iEY;lU`(Mam0=5FCni}^_WW%i}N{4kriAdawNJ@p7K1yiXP>UQbI+3)zo!97Sb%i zw9%rlP>bZlElpykf}wi2nJ!IY%TXGEf;WHp%vA4gKzgK^|O5$2n$sf zMT)+%?^fr$x*6LhsbpFqF%e*T@;P{K-)r6o;c@hM&B?$A2F-_C5LmgoPdKx_KNp z=|aC;m;-)#$2?^1uBRbEgn9s6)62{W@1ArN$5Di`Q%(Sd$fBe?`%UfjR zPmvP1C3z6OImwreV4=n!BJ4!awtDsyyE)pV=c;VjHu@p*Kn>xm` zX41Q{*_$zjY?m632C;Hf2k31WUl?mwzY{~$NlumQZ&7OQ3yy2L#~p7j8eiG~Ql+50 zM#%UYAp|#S1!7J0`}&9~GNbxy=mtQ&zOYti*f}D(S3^CB;=Iw?!vJ-lkj|y4w$+MT zL_}ruJzECNPbDITYdN8rp9RW5a)xkE*glGww@UHyN_BQ}bHScY zDz@?ISX2ujh$Cy)!R=;HE$t1^aicL}Bg!NevyfwgR1@EN)t(&R7cov1-a)w9vx(J5?>8gqRJC=S-BEvh%!aQoBm2RgXg`> z0Y!f(3dE;%j-s)>WZ>KNg@cnS4l9bCA!sJ*SAL$<}JQ!ie ziq-K?_Dp(u*_C(u^XkR}e-N7z5QI-?MD@w4x5unqVv7tU1b6eP^=SHDnE2~m88P3~ z!$SH8m|Ab$dFzB5k1SM!ncRq3u0YMX^81q2E89I@Y}swoT1B zT#CnHmNW~k9f4*!4u!D(O%t2HNVxqOKy+QXxY{;l~?x`mcTfDI$aan-F0FFq^kh4&eRGC z?wU27)bo$Ky3jhn|FL7x`Ha+ zS*IK|11e5AFYQnEFC#;KNW>v@#<(!}n2w@<#BJ;{4*MeQv_@=(+JXPE^qe1KLhqB; z^-o>`kEjiEj&KX{7kGm!_&`yToc=ytZqx+5;mp00s1uPXLM(c>Oor+uRDu)%Iqfd^ z9Qlidn$_BZ>}CegQW_0hD@VwDuF4F%hN2u1&pt<8*d%#zHjl0U8}7D=NL~d5Bvlvw zih$ukOyhzV6&p>ae`=3 zs&w2!t7SLIVug-{W-5=4!F@#+Fhr@rdPPm!#ll%7z1!3DvpiyJyC137+=p?4fu6U{ z#=pR!=g(Z_kquKpnVZFPrFP_2{Yw1{vNa%EVTU5+xg3Ex7kJNNrTDJG@o)_1D!f{$aWUaAkR*1 zeFEco(x2rB#rnAnVgxJ>?U!@?>I8P`I9R*Ho zY;)PJ3cDs4ZJBrSQKUnbzD@YPW&dK-YpPSH)Lq^fBDZIcfMKC68tUuMTtVOm@;dd~ z8*&atwHTt-NHfPZ5GFlHJ+%Z^d-k*6jjyZIidh7?;eaLm3Meh;AV6!i6>seZ6UvJfm!a1Q{JzAvhb-(B_aQX&5La#en(@G?R z*`iW*?VRV%;Qf0)`*s2-UI(bo2{paEn6(?B==bc1 zqG*~f&7W~Mj=TRl_c{TGkq@TkGVHOstdw12`|{@&)_D^FrA>1-YyB&Em`a*FF_xc2 z1WZPCU|UkkYiV(DXd()a;@B;kfvNTTLObyFrA{LA&qxavIeuZ|ZRbPzK7DOz57I4y zDI!^n!K^Z`s3>;jl<5yclX~8+W7!x(j zG%xPyM~ylhyOpBIJZJ3bFa^(>Ftu?mZ(}RC#5C?^%w<>BXJr;^3-(DmIIASm%cZ@) zK_2EwesG_}xd5Mdk7c<8Fh+d$z4a)jFRSm`X^H1`y#Jb~9@wtI`&PoxRYxu*!|fm zv(oiPYhtj~Gm@2MKu#fPo%X?ff7SbrA0ZgwjXlqi1b5BRmWFIlary^JUmtr-LoycL zsM7j2dhdRtE6Mj)v0>g5iF`o(%e(GAdRP3|t5I_d0Kn!c0Ko8f?`r7i#md3P!N$qO zW@_c)%4%hA?!c}fp)M*esxDfqrSCxAgy*+p*v|0iWvw>o&_{uey8Q0=wC=`TiqH-Y zI_r?LiO0KAow05t&{*>10O#{1Gh;d}4gxd78yNQ&Kior3CYtztV2!*>8k>-%)~x96I2hU1 z@pxxb`)vQbDVsid(&o+=W_Kg9^y2ofEj#FjI;nTc=QHE9dwRKg$1>%~J)8D}W0Jo7 zVA(~Y@t1z3L}DI!`|3v4&T@KfUwgujw3>JdkXVs^jvB0wu| zqalrCmQ~$RTnW;*fR*QWyF(RON)M+4 z_yPB3;cst1adK+3vum-ky5NTseblb0sjan*jjI!-urs}d&dRaScs|`<#H}3l{^0W2 zp8XT0@$2F&ZL3=6hg`U3!>}@4w^{_-R7;_!1<+^eRvS{+8SAU@o%gveKUO`o+wciG zK&szN^M`tRr$<=-01o?GUag@wGw7x_wEP3swElUSl$ep({azQ2Ee2jJ2U^`po%14h z`ti;4IlI^~jIa%{y?MM@3^4G?jO=CTsTgsnwm=mZ^icBowrdssq4>0H9}|gax9U(v zm?)9~H?)Er$BVw`0$pE}IZX<~PZQ?botcM#!6F)!V~L=jrbncBs-!eBZ@{G3OvNzP zNK=}@wo5Z#%)De-aX@${H=~Q;O#{7^Gam$&&3N1*e=hg{lgsP(y6k2}Xy?FP4@LXQ z??hVpkXi=c()qDQm#SFEjiK66QR`cny9!u3A8N0s^GXWBz#EAOzk`V3mVvw73NaQ= z=aR>?7v&5JSH(hO>sH6(xN5t%Jpcl6twp0tE}VV=$7h!dKRQb%FiaV;qaR)peNe>~ z!raq-_F9A)Xq5Xa{(O5j;P6u2G{j$=^{S|BCzSkv-lAhZB2+t`0wvTGuNXH4&=~^K z6Qqe1F=>If22y#eQh%%-eNbJvG!Zn#tEw;m5#F^U`cv6(+B>K`_!%gp%dkaw+z;

SN9g>2XN3#&MPB1hSw7VM(*%? z51K>67|8E(N%OYFF4Okr*7BS_12?JogGoC1ul2n|8(GNp1N zb(OY?HS@sTRP=gk+zq=LY|JqXE7d4H+EN=TG+OG8c?2ue1wdD8RZ%mEGPxwem&wKz zeuc3O!0ykXt77_nflKP!RdKNg%X;BLMws5-U|XkRNPB2ASlcT5)b3rZ=N;yjY->rs#vl6ZE3N={UR zS!JM-*a`h+P}JKVt>fZ9<8G`2EFq_7%)IiIvWy0l+N(P!^VJAE?<;8C3363 z{6^cIf)PlMy24M)JyA`G9p}hW_^5g#gTw}AXe^#|jeO^FP2hlFX~>34 zL4<|rgl}(@MngjP05|9B#Vz-yHwWG@i3J+VNWJ01+tY0un79J3xC_8rO^P}9mQf0@ zizxidk*yhT#>~SE^3X*t%+EtTajC2o`XqQby7VU|SKW$n|L3-HTKV?`|zDL z7CHA?SmxR0Xv%m!*?x>OYc*t{!%`A*aqp84opdWUgFSZ*ePq}sjX=AfB!wH4En&N@6kW&U7JWe_1I6V+c*=J&MOO4ZEP-V~^B zZCv#B=H@22aAxm8xEQMoX_7Kl(Uf0gb=l685U}jz2cQ-N+@weg0A}*|(yX_1r6S|) z^0}Cpe)j!)TbdBT)s^h;tfdd&gs^C zU6(wY->^vE-7kfGEvrhlJjAH+aEw|a$fQl5gSG4%UC|vZ(5XnspVM;Ri25!Nc2(7W zSeDWf&!irGCM-r$gUCp@*JR0>VpQjS`w8=2ISQn2gZ+)E-!B2Z*DH7O(2gOvQ*j-N zJ@oulML=3}8-+7>pE7S%9sWqbq?D(Izg-D$xWRaWa9<__B0#UL@r0I+vcax|1CfMG zgEY7GX1h`S1NwKj#xE{KFvubnksD4cBgZ(8FMf)1E^W=Bf%$%{n>T4%7#WS{bj8Hr zb?{oPd*k-ZOhq&TkwK6_f=#@cYQAB(|n(s;n4^=rPV!B2iUb@_@8A6(?C57 zmrOB+d_97c@S{>RTJ(B*&?Sbr@)%Ei&(Mfak<@XFrNx(g5DHi1t9{!dx{^8U<=X37sJvy{MiuS)c zwYSm|67rZM(h5Uk*mA5)!-FGVRXHZPmz|Y{bZ(y}lXws3Z|VwF*jQSQ1? z?y+@X{|Nio8Ot>Gb5_|Qn0&SBb1;*_22e)6UiEnq&-Bp1h}@LIVCniQ;&0;Qxr3iz zJ&JlvK>^JFRh+AXqoJ*tyO}MUtEcOqDeKDb(5a6f30V;UfW^o9Bd+~3P5no2sw(kD zNmV>_NFiHH1Q2-SlZ}#!u~CV}rAZxU@XO499G)7EUV6HHq25$)KeGL>UPi5*uo%JF z^Q%K3D&HGdCrwP_23>vz8(%&qqYGj-a%2xubg*HU;8vdDNp-AdPV-R?QckhVy)|pu zq13$bi_EE@-teFH+?zTcA)m`f^J}M>h_(ujI?$MGwBN)8^ebPTI0}ELrjrB6GETeG zG_6m;rK*fbtBy_~uua>kB$JQrT&D-0Ekzuhq=jD1X-e}cjUM7sxj6EW1fgmihAj!A zdnu7GO78^^Xjkz8t9J8df?u?L`MTqhB$$YrQk)-;QYNGs^kk_|Ut=xkG%xm>&xO!o zLxB8QZ&r0XRZKGrCnO!5~42lH<-{;>=G z_k{90^Y`ibi`n@2{eP387o9sd_l^YORB{}23+{{J2Pd)NOjxCWv0zk>fg X1IltJsDHeL`gs3%oE=3.8 -Description-Content-Type: text/markdown - -# Firecrawl Python SDK - -The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API. - -## Installation - -To install the Firecrawl Python SDK, you can use pip: - -```bash -pip install firecrawl-py -``` - -## Usage - -1. Get an API key from [firecrawl.dev](https://firecrawl.dev) -2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class. - - -Here's an example of how to use the SDK: - -```python -from firecrawl import FirecrawlApp - -# Initialize the FirecrawlApp with your API key -app = FirecrawlApp(api_key='your_api_key') - -# Scrape a single URL -url = 'https://mendable.ai' -scraped_data = app.scrape_url(url) - -# Crawl a website -crawl_url = 'https://mendable.ai' -params = { - 'pageOptions': { - 'onlyMainContent': True - } -} -crawl_result = app.crawl_url(crawl_url, params=params) -``` - -### Scraping a URL - -To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary. - -```python -url = 'https://example.com' -scraped_data = app.scrape_url(url) -``` -### Extracting structured data from a URL - -With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. Here is how you to use it: - -```python -class ArticleSchema(BaseModel): - title: str - points: int - by: str - commentsURL: str - -class TopArticlesSchema(BaseModel): - top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories") - -data = app.scrape_url('https://news.ycombinator.com', { - 'extractorOptions': { - 'extractionSchema': TopArticlesSchema.model_json_schema(), - 'mode': 'llm-extraction' - }, - 'pageOptions':{ - 'onlyMainContent': True - } -}) -print(data["llm_extraction"]) -``` - -### Search for a query - -Used to search the web, get the most relevant results, scrap each page and return the markdown. - -```python -query = 'what is mendable?' -search_result = app.search(query) -``` - -### Crawling a Website - -To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. - -The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method. - -```python -crawl_url = 'https://example.com' -params = { - 'crawlerOptions': { - 'excludes': ['blog/*'], - 'includes': [], # leave empty for all pages - 'limit': 1000, - }, - 'pageOptions': { - 'onlyMainContent': True - } -} -crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5) -``` - -If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised. - -### Checking Crawl Status - -To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job. - -```python -job_id = crawl_result['jobId'] -status = app.check_crawl_status(job_id) -``` - -## Error Handling - -The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. - -## Running the Tests with Pytest - -To ensure the functionality of the Firecrawl Python SDK, we have included end-to-end tests using `pytest`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling. - -### Running the Tests - -To run the tests, execute the following commands: - -Install pytest: -```bash -pip install pytest -``` - -Run: -```bash -pytest firecrawl/__tests__/e2e_withAuth/test.py -``` - - -## Contributing - -Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. - -## License - -The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). diff --git a/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt b/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt deleted file mode 100644 index c25567c5..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt +++ /dev/null @@ -1,9 +0,0 @@ -README.md -setup.py -firecrawl/__init__.py -firecrawl/firecrawl.py -firecrawl_py.egg-info/PKG-INFO -firecrawl_py.egg-info/SOURCES.txt -firecrawl_py.egg-info/dependency_links.txt -firecrawl_py.egg-info/requires.txt -firecrawl_py.egg-info/top_level.txt \ No newline at end of file diff --git a/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt b/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/apps/python-sdk/firecrawl_py.egg-info/requires.txt b/apps/python-sdk/firecrawl_py.egg-info/requires.txt deleted file mode 100644 index c8d341f5..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/requires.txt +++ /dev/null @@ -1,3 +0,0 @@ -requests -pytest -python-dotenv diff --git a/apps/python-sdk/firecrawl_py.egg-info/top_level.txt b/apps/python-sdk/firecrawl_py.egg-info/top_level.txt deleted file mode 100644 index 8bce1a1f..00000000 --- a/apps/python-sdk/firecrawl_py.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -firecrawl