diff --git a/api/core/tools/utils/web_reader_tool.py b/api/core/tools/utils/web_reader_tool.py
index 1e7eb129a7..e52082541a 100644
--- a/api/core/tools/utils/web_reader_tool.py
+++ b/api/core/tools/utils/web_reader_tool.py
@@ -10,6 +10,7 @@ import unicodedata
 from contextlib import contextmanager
 from urllib.parse import unquote
 
+import cloudscraper
 import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
 from newspaper import Article
@@ -46,29 +47,34 @@ def get_url(url: str, user_agent: str = None) -> str:
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
 
     response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    if response.status_code == 200:
+        # check content-type
+        content_type = response.headers.get('Content-Type')
+        if content_type:
+            main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+        else:
+            content_disposition = response.headers.get('Content-Disposition', '')
+            filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+            if filename_match:
+                filename = unquote(filename_match.group(1))
+                extension = re.search(r'\.(\w+)$', filename)
+                if extension:
+                    main_content_type = mimetypes.guess_type(filename)[0]
+
+        if main_content_type not in supported_content_types:
+            return "Unsupported content-type [{}] of URL.".format(main_content_type)
+
+        if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
+            return ExtractProcessor.load_from_url(url, return_text=True)
+
+        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+    elif response.status_code == 403:
+        scraper = cloudscraper.create_scraper()
+        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+
     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)
-    # check content-type
-    content_type = response.headers.get('Content-Type')
-    if content_type:
-        main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
-    else:
-        content_disposition = response.headers.get('Content-Disposition', '')
-        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
-        if filename_match:
-            filename = unquote(filename_match.group(1))
-            extension = re.search(r'\.(\w+)$', filename)
-            if extension:
-                main_content_type = mimetypes.guess_type(filename)[0]
-
-    if main_content_type not in supported_content_types:
-        return "Unsupported content-type [{}] of URL.".format(main_content_type)
-
-    if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
-        return ExtractProcessor.load_from_url(url, return_text=True)
-
-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
 
     a = extract_using_readabilipy(response.text)
 
     if not a['plain_text'] or not a['plain_text'].strip():
diff --git a/api/poetry.lock b/api/poetry.lock
index 1b0d41e72d..4b90b63e9f 100644
--- a/api/poetry.lock
+++ b/api/poetry.lock
@@ -1610,6 +1610,22 @@ lz4 = ["clickhouse-cityhash (>=1.0.2.1)", "lz4", "lz4 (<=3.0.1)"]
 numpy = ["numpy (>=1.12.0)", "pandas (>=0.24.0)"]
 zstd = ["clickhouse-cityhash (>=1.0.2.1)", "zstd"]
 
+[[package]]
+name = "cloudscraper"
+version = "1.2.71"
+description = "A Python module to bypass Cloudflare's anti-bot page."
+optional = false
+python-versions = "*"
+files = [
+    {file = "cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0"},
+    {file = "cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3"},
+]
+
+[package.dependencies]
+pyparsing = ">=2.4.7"
+requests = ">=2.9.2"
+requests-toolbelt = ">=0.9.1"
+
 [[package]]
 name = "cohere"
 version = "5.2.6"
@@ -7304,6 +7320,20 @@ requests = ">=2.0.0"
 [package.extras]
 rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
 
+[[package]]
+name = "requests-toolbelt"
+version = "1.0.0"
+description = "A utility belt for advanced users of python-requests"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"},
+    {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"},
+]
+
+[package.dependencies]
+requests = ">=2.0.1,<3.0.0"
+
 [[package]]
 name = "resend"
 version = "0.7.2"
@@ -9408,4 +9438,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "8d2a12543340f6f4fa6dcb27f93d8b3f5380e7a3e7eb5e399e76e6b8588b4611"
+content-hash = "9b1821b6e5d6d44947cc011c2d635a366557582b4540b99e0ff53a3078a989e5"
diff --git a/api/pyproject.toml b/api/pyproject.toml
index 7868ea4ab6..d37d4c21f0 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -193,6 +193,7 @@ twilio = "~9.0.4"
 vanna = { version = "0.5.5", extras = ["postgres", "mysql", "clickhouse", "duckdb"] }
 wikipedia = "1.4.0"
 yfinance = "~0.2.40"
+cloudscraper = "1.2.71"
 
 ############################################################
 # VDB dependencies required by vector store clients
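
The change makes the HEAD probe's status code gate the rest of `get_url`: on 200 the existing content-type checks run and the page is fetched with plain `requests`; on 403 (the typical response from Cloudflare-protected sites) the full GET is retried through `cloudscraper`, which transparently solves the JavaScript challenge. Below is a minimal standalone sketch of that control flow, not the PR's code: the function name `fetch` and the `SUPPORTED_CONTENT_TYPES` list are illustrative stand-ins for `get_url` and `extract_processor.SUPPORT_URL_CONTENT_TYPES`, and the Content-Disposition fallback and `ExtractProcessor` branch are elided.

```python
import cloudscraper
import requests

# Illustrative stand-in for extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"].
SUPPORTED_CONTENT_TYPES = ["application/pdf", "text/html"]


def fetch(url: str, headers: dict | None = None) -> str:
    headers = headers or {"User-Agent": "Mozilla/5.0"}

    # Cheap HEAD probe first; its status code decides which client does the real GET.
    head = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))

    if head.status_code == 200:
        # Ordinary site: validate the content type, then fetch with plain requests.
        main_content_type = head.headers.get("Content-Type", "").split(";")[0].strip()
        if main_content_type not in SUPPORTED_CONTENT_TYPES:
            return "Unsupported content-type [{}] of URL.".format(main_content_type)
        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
    elif head.status_code == 403:
        # Cloudflare-guarded sites commonly reject HEAD with 403; retry the full
        # GET through cloudscraper, which works through the anti-bot challenge.
        scraper = cloudscraper.create_scraper()
        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
    else:
        return "URL returned status code {}.".format(head.status_code)

    if response.status_code != 200:
        return "URL returned status code {}.".format(response.status_code)
    return response.text
```

Routing only the 403 path through `cloudscraper` keeps the common case on plain `requests` and avoids the scraper's extra challenge-solving overhead. The lockfile and `pyproject.toml` hunks just pin `cloudscraper = "1.2.71"` and record its transitive dependencies (`pyparsing`, `requests`, `requests-toolbelt`), which is why the `content-hash` changes.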