WebscraperTool: bypass Cloudflare-protected sites via cloudscraper (#6337)

Weaxs 2024-07-17 14:13:57 +08:00 committed by GitHub
parent 7943f7f697
commit 4e2fba404d
3 changed files with 58 additions and 21 deletions

api/core/tools/utils/web_reader_tool.py

@@ -10,6 +10,7 @@ import unicodedata
 from contextlib import contextmanager
 from urllib.parse import unquote
 
+import cloudscraper
 import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
 from newspaper import Article
@@ -46,29 +47,34 @@ def get_url(url: str, user_agent: str = None) -> str:
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
 
     response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
 
+    if response.status_code == 200:
+        # check content-type
+        content_type = response.headers.get('Content-Type')
+        if content_type:
+            main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+        else:
+            content_disposition = response.headers.get('Content-Disposition', '')
+            filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+            if filename_match:
+                filename = unquote(filename_match.group(1))
+                extension = re.search(r'\.(\w+)$', filename)
+                if extension:
+                    main_content_type = mimetypes.guess_type(filename)[0]
+
+        if main_content_type not in supported_content_types:
+            return "Unsupported content-type [{}] of URL.".format(main_content_type)
+
+        if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
+            return ExtractProcessor.load_from_url(url, return_text=True)
+
+        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+    elif response.status_code == 403:
+        scraper = cloudscraper.create_scraper()
+        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+
     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)
 
-    # check content-type
-    content_type = response.headers.get('Content-Type')
-    if content_type:
-        main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
-    else:
-        content_disposition = response.headers.get('Content-Disposition', '')
-        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
-        if filename_match:
-            filename = unquote(filename_match.group(1))
-            extension = re.search(r'\.(\w+)$', filename)
-            if extension:
-                main_content_type = mimetypes.guess_type(filename)[0]
-
-    if main_content_type not in supported_content_types:
-        return "Unsupported content-type [{}] of URL.".format(main_content_type)
-
-    if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
-        return ExtractProcessor.load_from_url(url, return_text=True)
-
-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
     a = extract_using_readabilipy(response.text)
     if not a['plain_text'] or not a['plain_text'].strip():
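
The fallback reads cleanly on its own: probe with HEAD first, and only reach for cloudscraper when the probe comes back 403, Cloudflare's usual answer to non-browser clients. Below is a minimal standalone sketch of the same pattern; fetch_html is a hypothetical helper name, and the real get_url additionally routes non-HTML content types through ExtractProcessor:

import cloudscraper
import requests

def fetch_html(url: str, headers: dict | None = None) -> str:
    # Cheap probe: HEAD tells us whether the site answers normally (200)
    # or hides behind Cloudflare's anti-bot interstitial (403).
    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))

    if response.status_code == 200:
        # Ordinary site: a plain GET suffices.
        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
    elif response.status_code == 403:
        # Probable Cloudflare challenge: cloudscraper solves the JS challenge
        # and exposes the same interface as requests.Session.
        scraper = cloudscraper.create_scraper()
        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))

    if response.status_code != 200:
        raise RuntimeError("URL returned status code {}.".format(response.status_code))
    return response.text

Because cloudscraper's scraper object subclasses requests.Session, the headers, allow_redirects, and timeout arguments carry over unchanged, which is what keeps the diff above so small.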

api/poetry.lock (generated)

@@ -1610,6 +1610,22 @@ lz4 = ["clickhouse-cityhash (>=1.0.2.1)", "lz4", "lz4 (<=3.0.1)"]
 numpy = ["numpy (>=1.12.0)", "pandas (>=0.24.0)"]
 zstd = ["clickhouse-cityhash (>=1.0.2.1)", "zstd"]
 
+[[package]]
+name = "cloudscraper"
+version = "1.2.71"
+description = "A Python module to bypass Cloudflare's anti-bot page."
+optional = false
+python-versions = "*"
+files = [
+    {file = "cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0"},
+    {file = "cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3"},
+]
+
+[package.dependencies]
+pyparsing = ">=2.4.7"
+requests = ">=2.9.2"
+requests-toolbelt = ">=0.9.1"
+
 [[package]]
 name = "cohere"
 version = "5.2.6"
@@ -7304,6 +7320,20 @@ requests = ">=2.0.0"
 [package.extras]
 rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
 
+[[package]]
+name = "requests-toolbelt"
+version = "1.0.0"
+description = "A utility belt for advanced users of python-requests"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"},
+    {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"},
+]
+
+[package.dependencies]
+requests = ">=2.0.1,<3.0.0"
+
 [[package]]
 name = "resend"
 version = "0.7.2"
@@ -9408,4 +9438,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "8d2a12543340f6f4fa6dcb27f93d8b3f5380e7a3e7eb5e399e76e6b8588b4611"
+content-hash = "9b1821b6e5d6d44947cc011c2d635a366557582b4540b99e0ff53a3078a989e5"

api/pyproject.toml

@@ -193,6 +193,7 @@ twilio = "~9.0.4"
 vanna = { version = "0.5.5", extras = ["postgres", "mysql", "clickhouse", "duckdb"] }
 wikipedia = "1.4.0"
 yfinance = "~0.2.40"
+cloudscraper = "1.2.71"
 
 ############################################################
 # VDB dependencies required by vector store clients
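
Since pyproject.toml pins cloudscraper to an exact version, a post-install sanity check is easy to write if you want one. A throwaway sketch, not part of the commit:

from importlib.metadata import version

import cloudscraper  # fails fast if `poetry install` skipped the new dependency

# pyproject.toml pins cloudscraper = "1.2.71"; the refreshed lock file should agree.
assert version("cloudscraper") == "1.2.71"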