mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-11 22:09:02 +08:00
fix wrong charset when decoding Chinese content (#6774)
Co-authored-by: zhangwb <zhangwb@zts.com.cn>
This commit is contained in:
parent
53a89bbbc7
commit
98d9837fbc
@ -10,6 +10,7 @@ import unicodedata
|
||||
from contextlib import contextmanager
|
||||
from urllib.parse import unquote
|
||||
|
||||
import chardet
|
||||
import cloudscraper
|
||||
from bs4 import BeautifulSoup, CData, Comment, NavigableString
|
||||
from regex import regex
|
||||
@ -75,7 +76,18 @@ def get_url(url: str, user_agent: str = None) -> str:
|
||||
if response.status_code != 200:
|
||||
return "URL returned status code {}.".format(response.status_code)
|
||||
|
||||
a = extract_using_readabilipy(response.text)
|
||||
# Detect encoding using chardet
|
||||
detected_encoding = chardet.detect(response.content)
|
||||
encoding = detected_encoding['encoding']
|
||||
if encoding:
|
||||
try:
|
||||
content = response.content.decode(encoding)
|
||||
except (UnicodeDecodeError, TypeError):
|
||||
content = response.text
|
||||
else:
|
||||
content = response.text
|
||||
|
||||
a = extract_using_readabilipy(content)
|
||||
|
||||
if not a['plain_text'] or not a['plain_text'].strip():
|
||||
return ''
|
||||
|
Loading…
x
Reference in New Issue
Block a user