From 97b65f9b4b9255fa69af1795249a7a9842d29146 Mon Sep 17 00:00:00 2001
From: "Charlie.Wei"
Date: Wed, 15 May 2024 15:23:16 +0800
Subject: [PATCH] Optimize webscraper (#4392)

Co-authored-by: luowei
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
---
 api/core/rag/extractor/extract_processor.py  | 13 +++++++++++
 .../builtin/webscraper/tools/webscraper.py   | 17 ++++++++------
 .../builtin/webscraper/tools/webscraper.yaml | 20 +++++++++++++++++
 api/core/tools/utils/web_reader_tool.py      | 22 +++++++++++++++----
 4 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py
index 1136e11f76..a7adea8a05 100644
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -1,6 +1,8 @@
+import re
 import tempfile
 from pathlib import Path
 from typing import Union
+from urllib.parse import unquote
 
 import requests
 from flask import current_app
@@ -55,6 +57,17 @@ class ExtractProcessor:
 
         with tempfile.TemporaryDirectory() as temp_dir:
             suffix = Path(url).suffix
+            if not suffix and suffix != '.':
+                # get content-type
+                if response.headers.get('Content-Type'):
+                    suffix = '.' + response.headers.get('Content-Type').split('/')[-1]
+                else:
+                    content_disposition = response.headers.get('Content-Disposition')
+                    filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+                    if filename_match:
+                        filename = unquote(filename_match.group(1))
+                        suffix = '.' + re.search(r'\.(\w+)$', filename).group(1)
+
             file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
             with open(file_path, 'wb') as file:
                 file.write(response.content)
diff --git a/api/core/tools/provider/builtin/webscraper/tools/webscraper.py b/api/core/tools/provider/builtin/webscraper/tools/webscraper.py
index 5e8c405b47..3d098e6768 100644
--- a/api/core/tools/provider/builtin/webscraper/tools/webscraper.py
+++ b/api/core/tools/provider/builtin/webscraper/tools/webscraper.py
@@ -7,9 +7,9 @@ from core.tools.tool.builtin_tool import BuiltinTool
 
 class WebscraperTool(BuiltinTool):
     def _invoke(self,
-                user_id: str,
-                tool_parameters: dict[str, Any],
-                ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+               user_id: str,
+               tool_parameters: dict[str, Any],
+               ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
         """
             invoke tools
         """
@@ -18,12 +18,15 @@ class WebscraperTool(BuiltinTool):
             user_agent = tool_parameters.get('user_agent', '')
             if not url:
                 return self.create_text_message('Please input url')
-
+
             # get webpage
             result = self.get_url(url, user_agent=user_agent)
 
-            # summarize and return
-            return self.create_text_message(self.summary(user_id=user_id, content=result))
+            if tool_parameters.get('generate_summary'):
+                # summarize and return
+                return self.create_text_message(self.summary(user_id=user_id, content=result))
+            else:
+                # return full webpage
+                return self.create_text_message(result)
         except Exception as e:
             raise ToolInvokeError(str(e))
-
\ No newline at end of file
diff --git a/api/core/tools/provider/builtin/webscraper/tools/webscraper.yaml b/api/core/tools/provider/builtin/webscraper/tools/webscraper.yaml
index 5782dbb0c7..180cfec6fc 100644
--- a/api/core/tools/provider/builtin/webscraper/tools/webscraper.yaml
+++ b/api/core/tools/provider/builtin/webscraper/tools/webscraper.yaml
@@ -38,3 +38,23 @@ parameters:
       pt_BR: used for identifying the browser.
     form: form
     default: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.1000.0 Safari/537.36
+  - name: generate_summary
+    type: boolean
+    required: false
+    label:
+      en_US: Whether to generate summary
+      zh_Hans: 是否生成摘要
+    human_description:
+      en_US: If true, the crawler will only return the page summary content.
+      zh_Hans: 如果启用，爬虫将仅返回页面摘要内容。
+    form: form
+    options:
+      - value: true
+        label:
+          en_US: Yes
+          zh_Hans: 是
+      - value: false
+        label:
+          en_US: No
+          zh_Hans: 否
+    default: false
diff --git a/api/core/tools/utils/web_reader_tool.py b/api/core/tools/utils/web_reader_tool.py
index 4c6fbb2780..96e4824940 100644
--- a/api/core/tools/utils/web_reader_tool.py
+++ b/api/core/tools/utils/web_reader_tool.py
@@ -1,5 +1,6 @@
 import hashlib
 import json
+import mimetypes
 import os
 import re
 import site
@@ -7,6 +8,7 @@ import subprocess
 import tempfile
 import unicodedata
 from contextlib import contextmanager
+from urllib.parse import unquote
 
 import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
@@ -39,22 +41,34 @@ def get_url(url: str, user_agent: str = None) -> str:
     }
     if user_agent:
         headers["User-Agent"] = user_agent
-
-    supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
 
-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    main_content_type = None
+    supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
+    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
 
     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)
 
     # check content-type
-    main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+    content_type = response.headers.get('Content-Type')
+    if content_type:
+        main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+    else:
+        content_disposition = response.headers.get('Content-Disposition')
+        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+        if filename_match:
+            filename = unquote(filename_match.group(1))
+            extension = re.search(r'\.(\w+)$', filename)
+            if extension:
+                main_content_type = mimetypes.guess_type(filename)[0]
+
     if main_content_type not in supported_content_types:
         return "Unsupported content-type [{}] of URL.".format(main_content_type)
 
     if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
         return ExtractProcessor.load_from_url(url, return_text=True)
 
+    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
     a = extract_using_readabilipy(response.text)
 
     if not a['plain_text'] or not a['plain_text'].strip():
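
Note on the core behavior change in web_reader_tool.py: the patch probes the URL with a cheap HEAD request first, derives the MIME type from Content-Type (falling back to the Content-Disposition filename via mimetypes.guess_type), and only then fetches the full body with a much longer timeout, so large unsupported files are never downloaded. Below is a minimal standalone sketch of that detection flow, not Dify's actual module; the helper name detect_main_content_type and the example URL are illustrative only.

    import mimetypes
    import re
    from urllib.parse import unquote

    import requests


    def detect_main_content_type(url: str):
        """Probe a URL with HEAD and guess its main MIME type.

        Mirrors the patch's fallback order: prefer the Content-Type
        header; if it is absent, parse the filename out of
        Content-Disposition and ask mimetypes to guess from its
        extension. Returns None when neither source is usable.
        """
        response = requests.head(url, allow_redirects=True, timeout=(5, 10))

        content_type = response.headers.get('Content-Type')
        if content_type:
            # strip parameters such as "; charset=utf-8"
            return content_type.split(';')[0].strip()

        content_disposition = response.headers.get('Content-Disposition', '')
        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
        if filename_match:
            filename = unquote(filename_match.group(1))
            return mimetypes.guess_type(filename)[0]
        return None


    if __name__ == '__main__':
        # hypothetical URL, for illustration only
        print(detect_main_content_type('https://example.com/report.pdf'))

Only after this check passes would a caller issue the expensive GET with the long (120, 300) timeout, as the patched get_url does.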