Fix potential SSRF attack vulnerability (#4334)

### What problem does this PR solve?

Fix potential SSRF attack vulnerability

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
This commit is contained in:
liuhua 2025-01-02 18:45:45 +08:00 committed by GitHub
parent 5083d92998
commit 8674156d1c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 25 additions and 2 deletions

View File

@ -41,7 +41,7 @@ class Crawler(ComponentBase, ABC):
ans = self.get_input()
ans = " - ".join(ans["content"]) if "content" in ans else ""
if not is_valid_url(ans):
return Crawler.be_output("")
return Crawler.be_output("URL not valid")
try:
result = asyncio.run(self.get_web(ans))

View File

@ -1,4 +1,7 @@
import re
import socket
from urllib.parse import urlparse
import ipaddress
import json
import base64
@ -76,5 +79,25 @@ def __get_pdf_from_html(
return base64.b64decode(result["data"])
def is_private_ip(ip: str) -> bool:
try:
ip_obj = ipaddress.ip_address(ip)
return ip_obj.is_private
except ValueError:
return False
def is_valid_url(url: str) -> bool:
return bool(re.match(r"(https?)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url))
if not re.match(r"(https?)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url):
return False
parsed_url = urlparse(url)
hostname = parsed_url.hostname
if not hostname:
return False
try:
ip = socket.gethostbyname(hostname)
if is_private_ip(ip):
return False
except socket.gaierror:
return False
return True