Mirror of https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git (synced 2025-08-14 19:05:55 +08:00)
Optimize webscraper (#4392)
Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>

parent c0fe414e0a
commit 97b65f9b4b
@@ -1,6 +1,8 @@
+import re
 import tempfile
 from pathlib import Path
 from typing import Union
+from urllib.parse import unquote

 import requests
 from flask import current_app
@@ -55,6 +57,17 @@ class ExtractProcessor:
         with tempfile.TemporaryDirectory() as temp_dir:
             suffix = Path(url).suffix
+            if not suffix and suffix != '.':
+                # get content-type
+                if response.headers.get('Content-Type'):
+                    suffix = '.' + response.headers.get('Content-Type').split('/')[-1]
+                else:
+                    content_disposition = response.headers.get('Content-Disposition')
+                    filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+                    if filename_match:
+                        filename = unquote(filename_match.group(1))
+                        suffix = '.' + re.search(r'\.(\w+)$', filename).group(1)
+
             file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
             with open(file_path, 'wb') as file:
                 file.write(response.content)
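A note on the hunk above: when the URL path carries no usable file extension, the new code derives a suffix from the HTTP response headers, preferring Content-Type and falling back to the filename in Content-Disposition. Below is a minimal standalone sketch of that fallback, with a hypothetical guess_suffix helper and an already-fetched requests response assumed purely for illustration:

    import re
    from urllib.parse import unquote

    def guess_suffix(response) -> str:
        # Prefer the Content-Type header, e.g. "application/pdf" -> ".pdf".
        content_type = response.headers.get('Content-Type')
        if content_type:
            return '.' + content_type.split('/')[-1]
        # Fall back to the filename in Content-Disposition,
        # e.g. 'attachment; filename="report.pdf"' -> ".pdf".
        content_disposition = response.headers.get('Content-Disposition', '')
        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
        if filename_match:
            filename = unquote(filename_match.group(1))
            extension_match = re.search(r'\.(\w+)$', filename)
            if extension_match:
                return '.' + extension_match.group(1)
        return ''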
@@ -7,9 +7,9 @@ from core.tools.tool.builtin_tool import BuiltinTool

 class WebscraperTool(BuiltinTool):
     def _invoke(self,
                 user_id: str,
                 tool_parameters: dict[str, Any],
                 ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
         """
             invoke tools
         """
@@ -18,12 +18,15 @@ class WebscraperTool(BuiltinTool):
             user_agent = tool_parameters.get('user_agent', '')
             if not url:
                 return self.create_text_message('Please input url')

             # get webpage
             result = self.get_url(url, user_agent=user_agent)

-            # summarize and return
-            return self.create_text_message(self.summary(user_id=user_id, content=result))
+            if tool_parameters.get('generate_summary'):
+                # summarize and return
+                return self.create_text_message(self.summary(user_id=user_id, content=result))
+            else:
+                # return full webpage
+                return self.create_text_message(result)
         except Exception as e:
             raise ToolInvokeError(str(e))
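The change above makes summarization opt-in: when generate_summary is truthy the tool returns self.summary(...) over the scraped content, otherwise it returns the full page text from get_url. A rough sketch of the two cases from the caller's side, using a hypothetical tool instance, illustrative parameter values, and a direct _invoke call only for demonstration:

    # Default behaviour: no generate_summary flag, the full webpage text is returned.
    full_page = tool._invoke(
        user_id='user-1',
        tool_parameters={'url': 'https://example.com'},
    )

    # With the new flag enabled: only the generated summary is returned.
    summary_only = tool._invoke(
        user_id='user-1',
        tool_parameters={'url': 'https://example.com', 'generate_summary': True},
    )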
@@ -38,3 +38,23 @@ parameters:
       pt_BR: used for identifying the browser.
     form: form
     default: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.1000.0 Safari/537.36
+  - name: generate_summary
+    type: boolean
+    required: false
+    label:
+      en_US: Whether to generate summary
+      zh_Hans: 是否生成摘要
+    human_description:
+      en_US: If true, the crawler will only return the page summary content.
+      zh_Hans: 如果启用,爬虫将仅返回页面摘要内容。
+    form: form
+    options:
+      - value: true
+        label:
+          en_US: Yes
+          zh_Hans: 是
+      - value: false
+        label:
+          en_US: No
+          zh_Hans: 否
+    default: false
@@ -1,5 +1,6 @@
 import hashlib
 import json
+import mimetypes
 import os
 import re
 import site
@@ -7,6 +8,7 @@ import subprocess
 import tempfile
 import unicodedata
 from contextlib import contextmanager
+from urllib.parse import unquote

 import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
@@ -39,22 +41,34 @@ def get_url(url: str, user_agent: str = None) -> str:
     }
     if user_agent:
         headers["User-Agent"] = user_agent

-    supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
-
-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    main_content_type = None
+    supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
+    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))

     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)

     # check content-type
-    main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+    content_type = response.headers.get('Content-Type')
+    if content_type:
+        main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+    else:
+        content_disposition = response.headers.get('Content-Disposition')
+        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+        if filename_match:
+            filename = unquote(filename_match.group(1))
+            extension = re.search(r'\.(\w+)$', filename)
+            if extension:
+                main_content_type = mimetypes.guess_type(filename)[0]

     if main_content_type not in supported_content_types:
         return "Unsupported content-type [{}] of URL.".format(main_content_type)

     if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
         return ExtractProcessor.load_from_url(url, return_text=True)

+    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
     a = extract_using_readabilipy(response.text)

     if not a['plain_text'] or not a['plain_text'].strip():
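The reworked get_url above probes the URL with a short-timeout HEAD request first, resolves the content type (from Content-Type, or from the Content-Disposition filename via mimetypes), and only issues the long-timeout GET once the type is known to be supported. A condensed sketch of that probe-then-download pattern, with a hypothetical fetch_if_supported helper standing in for the real function:

    import requests

    def fetch_if_supported(url: str, supported_content_types: list[str]) -> str:
        # Cheap metadata probe: HEAD transfers headers only, no body.
        head = requests.head(url, allow_redirects=True, timeout=(5, 10))
        if head.status_code != 200:
            return "URL returned status code {}.".format(head.status_code)

        content_type = (head.headers.get('Content-Type') or '').split(';')[0].strip()
        if content_type not in supported_content_types:
            return "Unsupported content-type [{}] of URL.".format(content_type)

        # Only now pay for the full download, with a more generous timeout.
        response = requests.get(url, allow_redirects=True, timeout=(120, 300))
        return response.text

Probing before downloading avoids fetching bodies that would be rejected anyway, which appears to be the optimization the commit title refers to.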