Optimize webscraper (#4392)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Charlie.Wei 2024-05-15 15:23:16 +08:00 committed by GitHub
parent c0fe414e0a
commit 97b65f9b4b
4 changed files with 61 additions and 11 deletions


@@ -1,6 +1,8 @@
+import re
 import tempfile
 from pathlib import Path
 from typing import Union
+from urllib.parse import unquote

 import requests
 from flask import current_app
@@ -55,6 +57,17 @@ class ExtractProcessor:
         with tempfile.TemporaryDirectory() as temp_dir:
             suffix = Path(url).suffix
+            if not suffix and suffix != '.':
+                # get content-type
+                if response.headers.get('Content-Type'):
+                    suffix = '.' + response.headers.get('Content-Type').split('/')[-1]
+                else:
+                    content_disposition = response.headers.get('Content-Disposition')
+                    filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+                    if filename_match:
+                        filename = unquote(filename_match.group(1))
+                        suffix = '.' + re.search(r'\.(\w+)$', filename).group(1)
             file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
             with open(file_path, 'wb') as file:
                 file.write(response.content)
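
For illustration, the suffix inference added above can be read as this standalone sketch; infer_suffix and the sample headers are hypothetical, not part of the commit (the sketch also guards the final regex, which the committed code leaves unguarded):

import re
from urllib.parse import unquote

def infer_suffix(headers: dict) -> str:
    # Prefer the Content-Type header, e.g. 'application/pdf' -> '.pdf'.
    content_type = headers.get('Content-Type')
    if content_type:
        return '.' + content_type.split('/')[-1]
    # Otherwise fall back to the attachment filename in Content-Disposition.
    filename_match = re.search(r'filename="([^"]+)"', headers.get('Content-Disposition', ''))
    if filename_match:
        filename = unquote(filename_match.group(1))  # undo percent-encoding
        extension_match = re.search(r'\.(\w+)$', filename)
        if extension_match:
            return '.' + extension_match.group(1)
    return ''

assert infer_suffix({'Content-Type': 'application/pdf'}) == '.pdf'
assert infer_suffix({'Content-Disposition': 'attachment; filename="report.docx"'}) == '.docx'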


@@ -22,8 +22,11 @@ class WebscraperTool(BuiltinTool):
             # get webpage
             result = self.get_url(url, user_agent=user_agent)

-            # summarize and return
-            return self.create_text_message(self.summary(user_id=user_id, content=result))
+            if tool_parameters.get('generate_summary'):
+                # summarize and return
+                return self.create_text_message(self.summary(user_id=user_id, content=result))
+            else:
+                # return full webpage
+                return self.create_text_message(result)
         except Exception as e:
             raise ToolInvokeError(str(e))
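
In plain terms, summarization in _invoke is now opt-in rather than the only behavior. A minimal sketch of the new branch, using hypothetical fetch/summarize stand-ins rather than the tool's real methods:

def scrape(url: str, tool_parameters: dict, fetch, summarize) -> str:
    result = fetch(url)                          # get webpage text
    if tool_parameters.get('generate_summary'):
        return summarize(result)                 # summary only
    return result                                # full page (the new default)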


@@ -38,3 +38,23 @@ parameters:
       pt_BR: used for identifying the browser.
     form: form
     default: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.1000.0 Safari/537.36
+  - name: generate_summary
+    type: boolean
+    required: false
+    label:
+      en_US: Whether to generate summary
+      zh_Hans: 是否生成摘要
+    human_description:
+      en_US: If true, the crawler will only return the page summary content.
+      zh_Hans: 如果启用,爬虫将仅返回页面摘要内容。
+    form: form
+    options:
+      - value: true
+        label:
+          en_US: Yes
+          zh_Hans: 是
+      - value: false
+        label:
+          en_US: No
+          zh_Hans: 否
+    default: false


@@ -1,5 +1,6 @@
 import hashlib
 import json
+import mimetypes
 import os
 import re
 import site
@@ -7,6 +8,7 @@ import subprocess
 import tempfile
 import unicodedata
 from contextlib import contextmanager
+from urllib.parse import unquote

 import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
@@ -40,21 +42,33 @@ def get_url(url: str, user_agent: str = None) -> str:
     if user_agent:
         headers["User-Agent"] = user_agent

+    main_content_type = None
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))

     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)

     # check content-type
-    main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+    content_type = response.headers.get('Content-Type')
+    if content_type:
+        main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+    else:
+        content_disposition = response.headers.get('Content-Disposition')
+        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+        if filename_match:
+            filename = unquote(filename_match.group(1))
+            extension = re.search(r'\.(\w+)$', filename)
+            if extension:
+                main_content_type = mimetypes.guess_type(filename)[0]
+
     if main_content_type not in supported_content_types:
         return "Unsupported content-type [{}] of URL.".format(main_content_type)

     if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
         return ExtractProcessor.load_from_url(url, return_text=True)

     response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
     a = extract_using_readabilipy(response.text)

     if not a['plain_text'] or not a['plain_text'].strip():
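
Taken together, get_url now probes the URL with a lightweight HEAD request to pick a handler before downloading the body, and falls back to the Content-Disposition filename when no Content-Type header is present. A condensed sketch of that header-inspection step (resolve_content_type is a hypothetical helper, not part of the commit):

import mimetypes
import re
from typing import Optional
from urllib.parse import unquote

import requests

def resolve_content_type(url: str, headers: dict) -> Optional[str]:
    # HEAD fetches headers only, so unsupported or huge files are rejected cheaply.
    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
    content_type = response.headers.get('Content-Type')
    if content_type:
        return content_type.split(';')[0].strip()    # 'text/html; charset=utf-8' -> 'text/html'
    # No Content-Type: guess the MIME type from the attachment filename, if any.
    match = re.search(r'filename="([^"]+)"', response.headers.get('Content-Disposition', ''))
    if match:
        return mimetypes.guess_type(unquote(match.group(1)))[0]
    return None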