WebscraperTool: bypass Cloudflare-protected sites using cloudscraper (#6337)

This commit is contained in:
Weaxs 2024-07-17 14:13:57 +08:00 committed by GitHub
parent 7943f7f697
commit 4e2fba404d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 58 additions and 21 deletions

View File

@ -10,6 +10,7 @@ import unicodedata
from contextlib import contextmanager
from urllib.parse import unquote
import cloudscraper
import requests
from bs4 import BeautifulSoup, CData, Comment, NavigableString
from newspaper import Article
@ -46,9 +47,7 @@ def get_url(url: str, user_agent: str = None) -> str:
supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
if response.status_code != 200:
return "URL returned status code {}.".format(response.status_code)
if response.status_code == 200:
# check content-type
content_type = response.headers.get('Content-Type')
if content_type:
@ -69,6 +68,13 @@ def get_url(url: str, user_agent: str = None) -> str:
return ExtractProcessor.load_from_url(url, return_text=True)
response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
elif response.status_code == 403:
scraper = cloudscraper.create_scraper()
response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
if response.status_code != 200:
return "URL returned status code {}.".format(response.status_code)
a = extract_using_readabilipy(response.text)
if not a['plain_text'] or not a['plain_text'].strip():

32
api/poetry.lock generated
View File

@ -1610,6 +1610,22 @@ lz4 = ["clickhouse-cityhash (>=1.0.2.1)", "lz4", "lz4 (<=3.0.1)"]
numpy = ["numpy (>=1.12.0)", "pandas (>=0.24.0)"]
zstd = ["clickhouse-cityhash (>=1.0.2.1)", "zstd"]
[[package]]
name = "cloudscraper"
version = "1.2.71"
description = "A Python module to bypass Cloudflare's anti-bot page."
optional = false
python-versions = "*"
files = [
{file = "cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0"},
{file = "cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3"},
]
[package.dependencies]
pyparsing = ">=2.4.7"
requests = ">=2.9.2"
requests-toolbelt = ">=0.9.1"
[[package]]
name = "cohere"
version = "5.2.6"
@ -7304,6 +7320,20 @@ requests = ">=2.0.0"
[package.extras]
rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
[[package]]
name = "requests-toolbelt"
version = "1.0.0"
description = "A utility belt for advanced users of python-requests"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
{file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"},
{file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"},
]
[package.dependencies]
requests = ">=2.0.1,<3.0.0"
[[package]]
name = "resend"
version = "0.7.2"
@ -9408,4 +9438,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "8d2a12543340f6f4fa6dcb27f93d8b3f5380e7a3e7eb5e399e76e6b8588b4611"
content-hash = "9b1821b6e5d6d44947cc011c2d635a366557582b4540b99e0ff53a3078a989e5"

View File

@ -193,6 +193,7 @@ twilio = "~9.0.4"
vanna = { version = "0.5.5", extras = ["postgres", "mysql", "clickhouse", "duckdb"] }
wikipedia = "1.4.0"
yfinance = "~0.2.40"
cloudscraper = "1.2.71"
############################################################
# VDB dependencies required by vector store clients