From 97b65f9b4b9255fa69af1795249a7a9842d29146 Mon Sep 17 00:00:00 2001
From: "Charlie.Wei"
Date: Wed, 15 May 2024 15:23:16 +0800
Subject: [PATCH] Optimize webscraper (#4392)

Co-authored-by: luowei
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
---
 api/core/rag/extractor/extract_processor.py  | 13 +++++++++++
 .../builtin/webscraper/tools/webscraper.py   | 17 ++++++++------
 .../builtin/webscraper/tools/webscraper.yaml | 20 +++++++++++++++++
 api/core/tools/utils/web_reader_tool.py      | 22 +++++++++++++++----
 4 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py
index 1136e11f76..a7adea8a05 100644
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -1,6 +1,8 @@
+import re
 import tempfile
 from pathlib import Path
 from typing import Union
+from urllib.parse import unquote
 
 import requests
 from flask import current_app
@@ -55,6 +57,17 @@ class ExtractProcessor:
 
         with tempfile.TemporaryDirectory() as temp_dir:
             suffix = Path(url).suffix
+            if not suffix and suffix != '.':
+                # get content-type
+                if response.headers.get('Content-Type'):
+                    suffix = '.' + response.headers.get('Content-Type').split('/')[-1]
+                else:
+                    content_disposition = response.headers.get('Content-Disposition')
+                    filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+                    if filename_match:
+                        filename = unquote(filename_match.group(1))
+                        suffix = '.' + re.search(r'\.(\w+)$', filename).group(1)
+
             file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
             with open(file_path, 'wb') as file:
                 file.write(response.content)
diff --git a/api/core/tools/provider/builtin/webscraper/tools/webscraper.py b/api/core/tools/provider/builtin/webscraper/tools/webscraper.py
index 5e8c405b47..3d098e6768 100644
--- a/api/core/tools/provider/builtin/webscraper/tools/webscraper.py
+++ b/api/core/tools/provider/builtin/webscraper/tools/webscraper.py
@@ -7,9 +7,9 @@ from core.tools.tool.builtin_tool import BuiltinTool
 
 class WebscraperTool(BuiltinTool):
     def _invoke(self,
-                user_id: str,
-                tool_parameters: dict[str, Any],
-                ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+               user_id: str,
+               tool_parameters: dict[str, Any],
+               ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
         """
             invoke tools
         """
@@ -18,12 +18,15 @@ class WebscraperTool(BuiltinTool):
             user_agent = tool_parameters.get('user_agent', '')
             if not url:
                 return self.create_text_message('Please input url')
-
+
             # get webpage
             result = self.get_url(url, user_agent=user_agent)
 
-            # summarize and return
-            return self.create_text_message(self.summary(user_id=user_id, content=result))
+            if tool_parameters.get('generate_summary'):
+                # summarize and return
+                return self.create_text_message(self.summary(user_id=user_id, content=result))
+            else:
+                # return full webpage
+                return self.create_text_message(result)
         except Exception as e:
             raise ToolInvokeError(str(e))
-
\ No newline at end of file
diff --git a/api/core/tools/provider/builtin/webscraper/tools/webscraper.yaml b/api/core/tools/provider/builtin/webscraper/tools/webscraper.yaml
index 5782dbb0c7..180cfec6fc 100644
--- a/api/core/tools/provider/builtin/webscraper/tools/webscraper.yaml
+++ b/api/core/tools/provider/builtin/webscraper/tools/webscraper.yaml
@@ -38,3 +38,23 @@ parameters:
       pt_BR: used for identifying the browser.
     form: form
     default: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.1000.0 Safari/537.36
+  - name: generate_summary
+    type: boolean
+    required: false
+    label:
+      en_US: Whether to generate summary
+      zh_Hans: 是否生成摘要
+    human_description:
+      en_US: If true, the crawler will only return the page summary content.
+      zh_Hans: 如果启用，爬虫将仅返回页面摘要内容。
+    form: form
+    options:
+      - value: true
+        label:
+          en_US: Yes
+          zh_Hans: 是
+      - value: false
+        label:
+          en_US: No
+          zh_Hans: 否
+    default: false
diff --git a/api/core/tools/utils/web_reader_tool.py b/api/core/tools/utils/web_reader_tool.py
index 4c6fbb2780..96e4824940 100644
--- a/api/core/tools/utils/web_reader_tool.py
+++ b/api/core/tools/utils/web_reader_tool.py
@@ -1,5 +1,6 @@
 import hashlib
 import json
+import mimetypes
 import os
 import re
 import site
@@ -7,6 +8,7 @@ import subprocess
 import tempfile
 import unicodedata
 from contextlib import contextmanager
+from urllib.parse import unquote
 
 import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
@@ -39,22 +41,34 @@ def get_url(url: str, user_agent: str = None) -> str:
     }
     if user_agent:
         headers["User-Agent"] = user_agent
-
-    supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
 
-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    main_content_type = None
+    supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
+    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
 
     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)
 
     # check content-type
-    main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+    content_type = response.headers.get('Content-Type')
+    if content_type:
+        main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+    else:
+        content_disposition = response.headers.get('Content-Disposition')
+        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+        if filename_match:
+            filename = unquote(filename_match.group(1))
+            extension = re.search(r'\.(\w+)$', filename)
+            if extension:
+                main_content_type = mimetypes.guess_type(filename)[0]
+
     if main_content_type not in supported_content_types:
         return "Unsupported content-type [{}] of URL.".format(main_content_type)
 
     if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
         return ExtractProcessor.load_from_url(url, return_text=True)
 
+    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
     a = extract_using_readabilipy(response.text)
 
     if not a['plain_text'] or not a['plain_text'].strip():
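
Note on the core behavior change in web_reader_tool.py: the patch probes the URL with a cheap HEAD request first, derives the MIME type from Content-Type (falling back to the Content-Disposition filename via mimetypes.guess_type), and only then fetches the full body with a much longer timeout, so large unsupported files are never downloaded. Below is a minimal standalone sketch of that detection flow, not Dify's actual module; the helper name detect_main_content_type and the example URL are illustrative only.

    import mimetypes
    import re
    from urllib.parse import unquote

    import requests


    def detect_main_content_type(url: str):
        """Probe a URL with HEAD and guess its main MIME type.

        Mirrors the patch's fallback order: prefer the Content-Type
        header; if it is absent, parse the filename out of
        Content-Disposition and ask mimetypes to guess from its
        extension. Returns None when neither source is usable.
        """
        response = requests.head(url, allow_redirects=True, timeout=(5, 10))

        content_type = response.headers.get('Content-Type')
        if content_type:
            # strip parameters such as "; charset=utf-8"
            return content_type.split(';')[0].strip()

        content_disposition = response.headers.get('Content-Disposition', '')
        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
        if filename_match:
            filename = unquote(filename_match.group(1))
            return mimetypes.guess_type(filename)[0]
        return None


    if __name__ == '__main__':
        # hypothetical URL, for illustration only
        print(detect_main_content_type('https://example.com/report.pdf'))

Only after this check passes would a caller issue the expensive GET with the long (120, 300) timeout, as the patched get_url does.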