From 8674156d1c704fea1369964cc874b01d3a102160 Mon Sep 17 00:00:00 2001 From: liuhua <10215101452@stu.ecnu.edu.cn> Date: Thu, 2 Jan 2025 18:45:45 +0800 Subject: [PATCH] Fix potential SSRF attack vulnerability (#4334) ### What problem does this PR solve? Fix potential SSRF attack vulnerability ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn> --- agent/component/crawler.py | 2 +- api/utils/web_utils.py | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/agent/component/crawler.py b/agent/component/crawler.py index b500d7852..d8c5381b1 100644 --- a/agent/component/crawler.py +++ b/agent/component/crawler.py @@ -41,7 +41,7 @@ class Crawler(ComponentBase, ABC): ans = self.get_input() ans = " - ".join(ans["content"]) if "content" in ans else "" if not is_valid_url(ans): - return Crawler.be_output("") + return Crawler.be_output("URL not valid") try: result = asyncio.run(self.get_web(ans)) diff --git a/api/utils/web_utils.py b/api/utils/web_utils.py index 29c39601b..687d683ac 100644 --- a/api/utils/web_utils.py +++ b/api/utils/web_utils.py @@ -1,4 +1,7 @@ import re +import socket +from urllib.parse import urlparse +import ipaddress import json import base64 @@ -76,5 +79,25 @@ def __get_pdf_from_html( return base64.b64decode(result["data"]) +def is_private_ip(ip: str) -> bool: + try: + ip_obj = ipaddress.ip_address(ip) + return ip_obj.is_private + except ValueError: + return False + def is_valid_url(url: str) -> bool: - return bool(re.match(r"(https?)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url)) + if not re.match(r"(https?)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url): + return False + parsed_url = urlparse(url) + hostname = parsed_url.hostname + + if not hostname: + return False + try: + ip = socket.gethostbyname(hostname) + if is_private_ip(ip): + return False + except socket.gaierror: + return False + return True \ No newline at end of file