mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 01:39:04 +08:00
fix wrong charset when decoding Chinese content (#6774)
Co-authored-by: zhangwb <zhangwb@zts.com.cn>
This commit is contained in:
parent
53a89bbbc7
commit
98d9837fbc
@@ -10,6 +10,7 @@ import unicodedata
|
|||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from urllib.parse import unquote
|
from urllib.parse import unquote
|
||||||
|
|
||||||
|
import chardet
|
||||||
import cloudscraper
|
import cloudscraper
|
||||||
from bs4 import BeautifulSoup, CData, Comment, NavigableString
|
from bs4 import BeautifulSoup, CData, Comment, NavigableString
|
||||||
from regex import regex
|
from regex import regex
|
||||||
@@ -75,7 +76,18 @@ def get_url(url: str, user_agent: str = None) -> str:
|
|||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
return "URL returned status code {}.".format(response.status_code)
|
return "URL returned status code {}.".format(response.status_code)
|
||||||
|
|
||||||
a = extract_using_readabilipy(response.text)
|
# Detect encoding using chardet
|
||||||
|
detected_encoding = chardet.detect(response.content)
|
||||||
|
encoding = detected_encoding['encoding']
|
||||||
|
if encoding:
|
||||||
|
try:
|
||||||
|
content = response.content.decode(encoding)
|
||||||
|
except (UnicodeDecodeError, TypeError):
|
||||||
|
content = response.text
|
||||||
|
else:
|
||||||
|
content = response.text
|
||||||
|
|
||||||
|
a = extract_using_readabilipy(content)
|
||||||
|
|
||||||
if not a['plain_text'] or not a['plain_text'].strip():
|
if not a['plain_text'] or not a['plain_text'].strip():
|
||||||
return ''
|
return ''
|
||||||
|
Loading…
x
Reference in New Issue
Block a user