mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-11 22:38:59 +08:00
fix: tool webscraper - too many redirects in case target url does not… (#3831)
Co-authored-by: miendinh <miendinh@users.noreply.github.com>
This commit is contained in:
parent
ffc60bb917
commit
2a8881d0e8
@@ -42,20 +42,19 @@ def get_url(url: str, user_agent: str = None) -> str:
|
||||
|
||||
supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
|
||||
|
||||
head_response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
|
||||
response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 10))
|
||||
|
||||
if head_response.status_code != 200:
|
||||
return "URL returned status code {}.".format(head_response.status_code)
|
||||
if response.status_code != 200:
|
||||
return "URL returned status code {}.".format(response.status_code)
|
||||
|
||||
# check content-type
|
||||
main_content_type = head_response.headers.get('Content-Type').split(';')[0].strip()
|
||||
main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
|
||||
if main_content_type not in supported_content_types:
|
||||
return "Unsupported content-type [{}] of URL.".format(main_content_type)
|
||||
|
||||
if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
|
||||
return ExtractProcessor.load_from_url(url, return_text=True)
|
||||
|
||||
response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 30))
|
||||
a = extract_using_readabilipy(response.text)
|
||||
|
||||
if not a['plain_text'] or not a['plain_text'].strip():
|
||||
|
Loading…
x
Reference in New Issue
Block a user