From 2a8881d0e80628986b8c8fc8f637d066fbdfb655 Mon Sep 17 00:00:00 2001
From: miendinh <22139872+miendinh@users.noreply.github.com>
Date: Fri, 26 Apr 2024 16:58:46 +0700
Subject: [PATCH] =?UTF-8?q?fix:=20tool=20webscraper=20-=20too=20many=20red?=
 =?UTF-8?q?irects=20in=20case=20target=20url=20does=20not=E2=80=A6=20(#383?=
 =?UTF-8?q?1)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: miendinh <miendinh@users.noreply.github.com>
---
 api/core/tools/utils/web_reader_tool.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/api/core/tools/utils/web_reader_tool.py b/api/core/tools/utils/web_reader_tool.py
index ba10b318dc..4c6fbb2780 100644
--- a/api/core/tools/utils/web_reader_tool.py
+++ b/api/core/tools/utils/web_reader_tool.py
@@ -42,20 +42,19 @@ def get_url(url: str, user_agent: str = None) -> str:
     
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
 
-    head_response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 10))
 
-    if head_response.status_code != 200:
-        return "URL returned status code {}.".format(head_response.status_code)
+    if response.status_code != 200:
+        return "URL returned status code {}.".format(response.status_code)
 
     # check content-type
-    main_content_type = head_response.headers.get('Content-Type').split(';')[0].strip()
+    main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
     if main_content_type not in supported_content_types:
         return "Unsupported content-type [{}] of URL.".format(main_content_type)
 
     if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
         return ExtractProcessor.load_from_url(url, return_text=True)
 
-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 30))
     a = extract_using_readabilipy(response.text)
 
     if not a['plain_text'] or not a['plain_text'].strip():