WebscraperTool: bypass Cloudflare-protected sites using cloudscraper (#6337)

2025-08-11 13:59:04 +08:00 · 2024-07-17 14:13:57 +08:00 · 2024-07-17 14:13:57 +08:00 · 4e2fba404d
commit 4e2fba404d
parent 7943f7f697
3 changed files with 58 additions and 21 deletions
--- a/api/core/tools/utils/web_reader_tool.py
+++ b/api/core/tools/utils/web_reader_tool.py
@@ -10,6 +10,7 @@ import unicodedata
 from contextlib import contextmanager
 from urllib.parse import unquote

+import cloudscraper
 import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
 from newspaper import Article
@@ -46,29 +47,34 @@ def get_url(url: str, user_agent: str = None) -> str:
    supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))

+    if response.status_code == 200:
+        # check content-type
+        content_type = response.headers.get('Content-Type')
+        if content_type:
+            main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+        else:
+            content_disposition = response.headers.get('Content-Disposition', '')
+            filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+            if filename_match:
+                filename = unquote(filename_match.group(1))
+                extension = re.search(r'\.(\w+)$', filename)
+                if extension:
+                    main_content_type = mimetypes.guess_type(filename)[0]
+
+        if main_content_type not in supported_content_types:
+            return "Unsupported content-type [{}] of URL.".format(main_content_type)
+
+        if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
+            return ExtractProcessor.load_from_url(url, return_text=True)
+
+        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+    elif response.status_code == 403:
+        scraper = cloudscraper.create_scraper()
+        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+
    if response.status_code != 200:
        return "URL returned status code {}.".format(response.status_code)

-    # check content-type
-    content_type = response.headers.get('Content-Type')
-    if content_type:
-        main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
-    else:
-        content_disposition = response.headers.get('Content-Disposition', '')
-        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
-        if filename_match:
-            filename = unquote(filename_match.group(1))
-            extension = re.search(r'\.(\w+)$', filename)
-            if extension:
-                main_content_type = mimetypes.guess_type(filename)[0]
-
-    if main_content_type not in supported_content_types:
-        return "Unsupported content-type [{}] of URL.".format(main_content_type)
-
-    if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
-        return ExtractProcessor.load_from_url(url, return_text=True)
-
-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
    a = extract_using_readabilipy(response.text)

    if not a['plain_text'] or not a['plain_text'].strip():
--- a/api/poetry.lock
+++ b/api/poetry.lock
@@ -1610,6 +1610,22 @@ lz4 = ["clickhouse-cityhash (>=1.0.2.1)", "lz4", "lz4 (<=3.0.1)"]
 numpy = ["numpy (>=1.12.0)", "pandas (>=0.24.0)"]
 zstd = ["clickhouse-cityhash (>=1.0.2.1)", "zstd"]

+[[package]]
+name = "cloudscraper"
+version = "1.2.71"
+description = "A Python module to bypass Cloudflare's anti-bot page."
+optional = false
+python-versions = "*"
+files = [
+    {file = "cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0"},
+    {file = "cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3"},
+]
+
+[package.dependencies]
+pyparsing = ">=2.4.7"
+requests = ">=2.9.2"
+requests-toolbelt = ">=0.9.1"
+
 [[package]]
 name = "cohere"
 version = "5.2.6"
@@ -7304,6 +7320,20 @@ requests = ">=2.0.0"
 [package.extras]
 rsa = ["oauthlib[signedtoken] (>=3.0.0)"]

+[[package]]
+name = "requests-toolbelt"
+version = "1.0.0"
+description = "A utility belt for advanced users of python-requests"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"},
+    {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"},
+]
+
+[package.dependencies]
+requests = ">=2.0.1,<3.0.0"
+
 [[package]]
 name = "resend"
 version = "0.7.2"
@@ -9408,4 +9438,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "8d2a12543340f6f4fa6dcb27f93d8b3f5380e7a3e7eb5e399e76e6b8588b4611"
+content-hash = "9b1821b6e5d6d44947cc011c2d635a366557582b4540b99e0ff53a3078a989e5"
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -193,6 +193,7 @@ twilio = "~9.0.4"
 vanna = { version = "0.5.5", extras = ["postgres", "mysql", "clickhouse", "duckdb"] }
 wikipedia = "1.4.0"
 yfinance = "~0.2.40"
+cloudscraper = "1.2.71"

 ############################################################
 # VDB dependencies required by vector store clients