WebscraperTool bypass cloudflare site by cloudscraper (#6337)

This commit is contained in:
Weaxs 2024-07-17 14:13:57 +08:00 committed by GitHub
parent 7943f7f697
commit 4e2fba404d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 58 additions and 21 deletions

View File

@ -10,6 +10,7 @@ import unicodedata
from contextlib import contextmanager from contextlib import contextmanager
from urllib.parse import unquote from urllib.parse import unquote
import cloudscraper
import requests import requests
from bs4 import BeautifulSoup, CData, Comment, NavigableString from bs4 import BeautifulSoup, CData, Comment, NavigableString
from newspaper import Article from newspaper import Article
@ -46,9 +47,7 @@ def get_url(url: str, user_agent: str = None) -> str:
supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"] supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10)) response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
if response.status_code != 200: if response.status_code == 200:
return "URL returned status code {}.".format(response.status_code)
# check content-type # check content-type
content_type = response.headers.get('Content-Type') content_type = response.headers.get('Content-Type')
if content_type: if content_type:
@ -69,6 +68,13 @@ def get_url(url: str, user_agent: str = None) -> str:
return ExtractProcessor.load_from_url(url, return_text=True) return ExtractProcessor.load_from_url(url, return_text=True)
response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300)) response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
elif response.status_code == 403:
scraper = cloudscraper.create_scraper()
response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
if response.status_code != 200:
return "URL returned status code {}.".format(response.status_code)
a = extract_using_readabilipy(response.text) a = extract_using_readabilipy(response.text)
if not a['plain_text'] or not a['plain_text'].strip(): if not a['plain_text'] or not a['plain_text'].strip():

32
api/poetry.lock generated
View File

@ -1610,6 +1610,22 @@ lz4 = ["clickhouse-cityhash (>=1.0.2.1)", "lz4", "lz4 (<=3.0.1)"]
numpy = ["numpy (>=1.12.0)", "pandas (>=0.24.0)"] numpy = ["numpy (>=1.12.0)", "pandas (>=0.24.0)"]
zstd = ["clickhouse-cityhash (>=1.0.2.1)", "zstd"] zstd = ["clickhouse-cityhash (>=1.0.2.1)", "zstd"]
[[package]]
name = "cloudscraper"
version = "1.2.71"
description = "A Python module to bypass Cloudflare's anti-bot page."
optional = false
python-versions = "*"
files = [
{file = "cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0"},
{file = "cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3"},
]
[package.dependencies]
pyparsing = ">=2.4.7"
requests = ">=2.9.2"
requests-toolbelt = ">=0.9.1"
[[package]] [[package]]
name = "cohere" name = "cohere"
version = "5.2.6" version = "5.2.6"
@ -7304,6 +7320,20 @@ requests = ">=2.0.0"
[package.extras] [package.extras]
rsa = ["oauthlib[signedtoken] (>=3.0.0)"] rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
[[package]]
name = "requests-toolbelt"
version = "1.0.0"
description = "A utility belt for advanced users of python-requests"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
{file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"},
{file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"},
]
[package.dependencies]
requests = ">=2.0.1,<3.0.0"
[[package]] [[package]]
name = "resend" name = "resend"
version = "0.7.2" version = "0.7.2"
@ -9408,4 +9438,4 @@ cffi = ["cffi (>=1.11)"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "8d2a12543340f6f4fa6dcb27f93d8b3f5380e7a3e7eb5e399e76e6b8588b4611" content-hash = "9b1821b6e5d6d44947cc011c2d635a366557582b4540b99e0ff53a3078a989e5"

View File

@ -193,6 +193,7 @@ twilio = "~9.0.4"
vanna = { version = "0.5.5", extras = ["postgres", "mysql", "clickhouse", "duckdb"] } vanna = { version = "0.5.5", extras = ["postgres", "mysql", "clickhouse", "duckdb"] }
wikipedia = "1.4.0" wikipedia = "1.4.0"
yfinance = "~0.2.40" yfinance = "~0.2.40"
cloudscraper = "1.2.71"
############################################################ ############################################################
# VDB dependencies required by vector store clients # VDB dependencies required by vector store clients