mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-11 13:59:04 +08:00
WebscraperTool: bypass Cloudflare-protected sites using cloudscraper (#6337)
This commit is contained in:
parent
7943f7f697
commit
4e2fba404d
@ -10,6 +10,7 @@ import unicodedata
|
||||
from contextlib import contextmanager
|
||||
from urllib.parse import unquote
|
||||
|
||||
import cloudscraper
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, CData, Comment, NavigableString
|
||||
from newspaper import Article
|
||||
@ -46,29 +47,34 @@ def get_url(url: str, user_agent: str = None) -> str:
|
||||
supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
|
||||
response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
|
||||
|
||||
if response.status_code == 200:
|
||||
# check content-type
|
||||
content_type = response.headers.get('Content-Type')
|
||||
if content_type:
|
||||
main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
|
||||
else:
|
||||
content_disposition = response.headers.get('Content-Disposition', '')
|
||||
filename_match = re.search(r'filename="([^"]+)"', content_disposition)
|
||||
if filename_match:
|
||||
filename = unquote(filename_match.group(1))
|
||||
extension = re.search(r'\.(\w+)$', filename)
|
||||
if extension:
|
||||
main_content_type = mimetypes.guess_type(filename)[0]
|
||||
|
||||
if main_content_type not in supported_content_types:
|
||||
return "Unsupported content-type [{}] of URL.".format(main_content_type)
|
||||
|
||||
if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
|
||||
return ExtractProcessor.load_from_url(url, return_text=True)
|
||||
|
||||
response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
|
||||
elif response.status_code == 403:
|
||||
scraper = cloudscraper.create_scraper()
|
||||
response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
|
||||
|
||||
if response.status_code != 200:
|
||||
return "URL returned status code {}.".format(response.status_code)
|
||||
|
||||
# check content-type
|
||||
content_type = response.headers.get('Content-Type')
|
||||
if content_type:
|
||||
main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
|
||||
else:
|
||||
content_disposition = response.headers.get('Content-Disposition', '')
|
||||
filename_match = re.search(r'filename="([^"]+)"', content_disposition)
|
||||
if filename_match:
|
||||
filename = unquote(filename_match.group(1))
|
||||
extension = re.search(r'\.(\w+)$', filename)
|
||||
if extension:
|
||||
main_content_type = mimetypes.guess_type(filename)[0]
|
||||
|
||||
if main_content_type not in supported_content_types:
|
||||
return "Unsupported content-type [{}] of URL.".format(main_content_type)
|
||||
|
||||
if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
|
||||
return ExtractProcessor.load_from_url(url, return_text=True)
|
||||
|
||||
response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
|
||||
a = extract_using_readabilipy(response.text)
|
||||
|
||||
if not a['plain_text'] or not a['plain_text'].strip():
|
||||
|
32
api/poetry.lock
generated
32
api/poetry.lock
generated
@ -1610,6 +1610,22 @@ lz4 = ["clickhouse-cityhash (>=1.0.2.1)", "lz4", "lz4 (<=3.0.1)"]
|
||||
numpy = ["numpy (>=1.12.0)", "pandas (>=0.24.0)"]
|
||||
zstd = ["clickhouse-cityhash (>=1.0.2.1)", "zstd"]
|
||||
|
||||
[[package]]
|
||||
name = "cloudscraper"
|
||||
version = "1.2.71"
|
||||
description = "A Python module to bypass Cloudflare's anti-bot page."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0"},
|
||||
{file = "cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyparsing = ">=2.4.7"
|
||||
requests = ">=2.9.2"
|
||||
requests-toolbelt = ">=0.9.1"
|
||||
|
||||
[[package]]
|
||||
name = "cohere"
|
||||
version = "5.2.6"
|
||||
@ -7304,6 +7320,20 @@ requests = ">=2.0.0"
|
||||
[package.extras]
|
||||
rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "requests-toolbelt"
|
||||
version = "1.0.0"
|
||||
description = "A utility belt for advanced users of python-requests"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||
files = [
|
||||
{file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"},
|
||||
{file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
requests = ">=2.0.1,<3.0.0"
|
||||
|
||||
[[package]]
|
||||
name = "resend"
|
||||
version = "0.7.2"
|
||||
@ -9408,4 +9438,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "8d2a12543340f6f4fa6dcb27f93d8b3f5380e7a3e7eb5e399e76e6b8588b4611"
|
||||
content-hash = "9b1821b6e5d6d44947cc011c2d635a366557582b4540b99e0ff53a3078a989e5"
|
||||
|
@ -193,6 +193,7 @@ twilio = "~9.0.4"
|
||||
vanna = { version = "0.5.5", extras = ["postgres", "mysql", "clickhouse", "duckdb"] }
|
||||
wikipedia = "1.4.0"
|
||||
yfinance = "~0.2.40"
|
||||
cloudscraper = "1.2.71"
|
||||
|
||||
############################################################
|
||||
# VDB dependencies required by vector store clients
|
||||
|
Loading…
x
Reference in New Issue
Block a user