# URL validation helpers for api/utils/web_utils.py.
#
# NOTE: the companion change in agent/component/crawler.py makes Crawler
# return Crawler.be_output("URL not valid") (instead of an empty output)
# whenever is_valid_url() rejects the input.
import base64
import ipaddress
import json
import re
import socket
from urllib.parse import urlparse


def is_private_ip(ip: str) -> bool:
    """Return True when *ip* is a literal IP address in a private-use range.

    Args:
        ip: Textual IPv4 or IPv6 address.

    Returns:
        ``ipaddress.ip_address(ip).is_private``; ``False`` when *ip* cannot
        be parsed as an IP address at all.
    """
    try:
        return ipaddress.ip_address(ip).is_private
    except ValueError:
        return False


def _is_disallowed_address(address: str) -> bool:
    """Return True when *address* must not be fetched on a user's behalf.

    Fails closed: text that does not parse as an IP address is disallowed.
    """
    try:
        # getaddrinfo may append a zone id ("fe80::1%eth0"); older
        # ipaddress versions refuse to parse it, so strip it first.
        ip_obj = ipaddress.ip_address(address.split("%", 1)[0])
    except ValueError:
        return True

    # Judge IPv4-mapped IPv6 addresses (::ffff:a.b.c.d) by the embedded
    # IPv4 address so the verdict matches the plain IPv4 form.
    mapped = getattr(ip_obj, "ipv4_mapped", None)
    if mapped is not None:
        ip_obj = mapped

    return (
        ip_obj.is_private
        or ip_obj.is_loopback
        or ip_obj.is_link_local
        or ip_obj.is_multicast
        or ip_obj.is_unspecified
        # Catches non-global space that is_private misses (e.g. 100.64.0.0/10).
        or not ip_obj.is_global
    )


def is_valid_url(url: str) -> bool:
    """Return True when *url* is an http(s) URL whose host is publicly routable.

    The URL must start with text matching the http/https pattern, contain a
    hostname, and that hostname must resolve. Every address it resolves to
    (IPv4 and IPv6) must be a global unicast address; a single private,
    loopback, link-local, multicast, unspecified or otherwise non-global
    address rejects the URL.

    Args:
        url: Candidate URL.

    Returns:
        ``True`` if the URL passes all checks, ``False`` otherwise. Malformed
        URLs and unresolvable hostnames return ``False`` rather than raising.

    NOTE: this is a point-in-time check. The host is resolved again when the
    page is actually fetched, so DNS rebinding and HTTP redirects are not
    covered here and have to be handled by the fetching code.
    """
    # re.match only anchors the start, exactly as before: the pattern is a
    # prefix check, so anything may still follow the matched part.
    if not re.match(r"(https?)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url):
        return False

    try:
        hostname = urlparse(url).hostname
    except ValueError:
        # e.g. an unbalanced "[" after the matched prefix ("Invalid IPv6 URL").
        return False
    if not hostname:
        return False

    try:
        # getaddrinfo returns every A and AAAA record; gethostbyname would
        # only see the first IPv4 address.
        addr_infos = socket.getaddrinfo(hostname, None)
    except (socket.gaierror, ValueError):
        # ValueError also covers UnicodeError, raised by the idna codec for
        # hostnames with empty or over-long labels such as "a..b".
        return False

    addresses = {info[4][0] for info in addr_infos}
    if not addresses:
        return False
    return not any(_is_disallowed_address(addr) for addr in addresses)