Optimize webscraper (#4392)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Charlie.Wei 2024-05-15 15:23:16 +08:00 committed by GitHub
parent c0fe414e0a
commit 97b65f9b4b
4 changed files with 61 additions and 11 deletions

View File

@@ -1,6 +1,8 @@
+import re
 import tempfile
 from pathlib import Path
 from typing import Union
+from urllib.parse import unquote
 
 import requests
 from flask import current_app
@@ -55,6 +57,17 @@ class ExtractProcessor:
         with tempfile.TemporaryDirectory() as temp_dir:
             suffix = Path(url).suffix
+            if not suffix and suffix != '.':
+                # get content-type
+                if response.headers.get('Content-Type'):
+                    suffix = '.' + response.headers.get('Content-Type').split('/')[-1]
+                else:
+                    content_disposition = response.headers.get('Content-Disposition')
+                    filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+                    if filename_match:
+                        filename = unquote(filename_match.group(1))
+                        suffix = '.' + re.search(r'\.(\w+)$', filename).group(1)
             file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
             with open(file_path, 'wb') as file:
                 file.write(response.content)

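The new block above only fires when the URL path itself carries no usable extension; it then falls back to the response headers to pick a file suffix. A standalone sketch of that fallback, for context (the infer_suffix helper name is illustrative and not part of the commit):

import re
from urllib.parse import unquote

import requests


def infer_suffix(response: requests.Response) -> str:
    # Prefer the Content-Type header, e.g. "application/pdf; charset=binary" -> ".pdf".
    content_type = response.headers.get('Content-Type')
    if content_type:
        return '.' + content_type.split(';')[0].split('/')[-1]
    # Otherwise fall back to the filename advertised in Content-Disposition,
    # e.g. 'attachment; filename="report%20v2.pdf"' -> ".pdf".
    content_disposition = response.headers.get('Content-Disposition', '')
    filename_match = re.search(r'filename="([^"]+)"', content_disposition)
    if filename_match:
        filename = unquote(filename_match.group(1))
        extension_match = re.search(r'\.(\w+)$', filename)
        if extension_match:
            return '.' + extension_match.group(1)
    return ''  # caller keeps whatever Path(url).suffix produced


if __name__ == '__main__':
    print(infer_suffix(requests.get('https://example.com/', timeout=(5, 10))))  # typically ".html"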
View File

@@ -7,9 +7,9 @@ from core.tools.tool.builtin_tool import BuiltinTool
 class WebscraperTool(BuiltinTool):
     def _invoke(self,
                 user_id: str,
                 tool_parameters: dict[str, Any],
                 ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
         """
             invoke tools
         """
@@ -18,12 +18,15 @@ class WebscraperTool(BuiltinTool):
             user_agent = tool_parameters.get('user_agent', '')
             if not url:
                 return self.create_text_message('Please input url')
 
             # get webpage
             result = self.get_url(url, user_agent=user_agent)
 
-            # summarize and return
-            return self.create_text_message(self.summary(user_id=user_id, content=result))
+            if tool_parameters.get('generate_summary'):
+                # summarize and return
+                return self.create_text_message(self.summary(user_id=user_id, content=result))
+            else:
+                # return full webpage
+                return self.create_text_message(result)
         except Exception as e:
             raise ToolInvokeError(str(e))

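The behavioural change above is easiest to see in isolation: the tool previously always summarized, and now returns either a summary or the full page text depending on one flag. A minimal sketch of the branching, with stub functions standing in for the get_url and summary helpers the real tool inherits:

from typing import Any


def invoke_webscraper(url: str, tool_parameters: dict[str, Any]) -> str:
    # Illustrative stand-in for the updated WebscraperTool._invoke logic.
    def get_url(u: str) -> str:        # stub: the real tool fetches and cleans the page
        return f"<entire readable text of {u}>"

    def summary(content: str) -> str:  # stub: the real tool asks the model to summarize
        return content[:30] + "..."

    if not url:
        return 'Please input url'
    result = get_url(url)
    if tool_parameters.get('generate_summary'):
        # summarize and return (the old behaviour, now opt-in)
        return summary(result)
    # return full webpage (the new default, since generate_summary defaults to false)
    return result


print(invoke_webscraper("https://example.com", {}))
print(invoke_webscraper("https://example.com", {"generate_summary": True}))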
View File

@@ -38,3 +38,23 @@ parameters:
       pt_BR: used for identifying the browser.
     form: form
     default: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.1000.0 Safari/537.36
+  - name: generate_summary
+    type: boolean
+    required: false
+    label:
+      en_US: Whether to generate summary
+      zh_Hans: 是否生成摘要
+    human_description:
+      en_US: If true, the crawler will only return the page summary content.
+      zh_Hans: 如果启用,爬虫将仅返回页面摘要内容。
+    form: form
+    options:
+      - value: true
+        label:
+          en_US: Yes
+          zh_Hans: 是
+      - value: false
+        label:
+          en_US: No
+          zh_Hans: 否
+    default: false

View File

@@ -1,5 +1,6 @@
 import hashlib
 import json
+import mimetypes
 import os
 import re
 import site
@@ -7,6 +8,7 @@ import subprocess
 import tempfile
 import unicodedata
 from contextlib import contextmanager
+from urllib.parse import unquote
 
 import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
@@ -39,22 +41,34 @@ def get_url(url: str, user_agent: str = None) -> str:
     }
     if user_agent:
         headers["User-Agent"] = user_agent
 
-    supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
-
-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    main_content_type = None
+    supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
+    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
 
     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)
 
     # check content-type
-    main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+    content_type = response.headers.get('Content-Type')
+    if content_type:
+        main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+    else:
+        content_disposition = response.headers.get('Content-Disposition')
+        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+        if filename_match:
+            filename = unquote(filename_match.group(1))
+            extension = re.search(r'\.(\w+)$', filename)
+            if extension:
+                main_content_type = mimetypes.guess_type(filename)[0]
     if main_content_type not in supported_content_types:
         return "Unsupported content-type [{}] of URL.".format(main_content_type)
 
     if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
         return ExtractProcessor.load_from_url(url, return_text=True)
 
+    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
     a = extract_using_readabilipy(response.text)
     if not a['plain_text'] or not a['plain_text'].strip():
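Taken together with the import changes, get_url now probes the URL with a cheap HEAD request before committing to a full download, and only falls back to header sniffing when Content-Type is missing. A condensed, standalone sketch of that flow (SUPPORTED_TYPES stands in for SUPPORT_URL_CONTENT_TYPES + ["text/html"], error handling is trimmed, and the ExtractProcessor hand-off is reduced to a placeholder string):

import mimetypes
import re
from typing import Optional
from urllib.parse import unquote

import requests

SUPPORTED_TYPES = ["application/pdf", "text/html"]  # stand-in for the real supported list


def probe_content_type(url: str) -> Optional[str]:
    # Cheap HEAD request: decide how to handle the URL before downloading it.
    response = requests.head(url, allow_redirects=True, timeout=(5, 10))
    content_type = response.headers.get('Content-Type')
    if content_type:
        return content_type.split(';')[0].strip()
    # Some servers omit Content-Type but advertise a filename instead.
    content_disposition = response.headers.get('Content-Disposition', '')
    match = re.search(r'filename="([^"]+)"', content_disposition)
    if match:
        return mimetypes.guess_type(unquote(match.group(1)))[0]
    return None


def fetch(url: str) -> str:
    main_type = probe_content_type(url)
    if main_type not in SUPPORTED_TYPES:
        return "Unsupported content-type [{}] of URL.".format(main_type)
    if main_type != "text/html":
        return "(would hand {} to ExtractProcessor.load_from_url)".format(url)
    # Only now pay for the full download, with the long read timeout.
    response = requests.get(url, allow_redirects=True, timeout=(120, 300))
    return response.text


if __name__ == '__main__':
    print(fetch("https://example.com/")[:80])

The design point is that the initial request no longer downloads the body at all: unsupported or non-HTML URLs are rejected or handed off after headers alone, and the full GET, with its much longer read timeout, is reserved for HTML pages that readabilipy will parse.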