Fix Baidu request error (#7799)

### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: xiaohzho <xiaohzho@cisco.com>
This commit is contained in:
Hayden Zhou 2025-05-23 09:48:55 +08:00 committed by GitHub
parent 1fd92e6bee
commit bdc2b74e8f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -17,6 +17,7 @@ import logging
from abc import ABC from abc import ABC
import pandas as pd import pandas as pd
import requests import requests
from bs4 import BeautifulSoup
import re import re
from agent.component.base import ComponentBase, ComponentParamBase from agent.component.base import ComponentBase, ComponentParamBase
@ -44,14 +45,25 @@ class Baidu(ComponentBase, ABC):
return Baidu.be_output("") return Baidu.be_output("")
try: try:
url = 'http://www.baidu.com/s?wd=' + ans + '&rn=' + str(self._param.top_n) url = 'https://www.baidu.com/s?wd=' + ans + '&rn=' + str(self._param.top_n)
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'} 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Connection': 'keep-alive',
}
response = requests.get(url=url, headers=headers) response = requests.get(url=url, headers=headers)
# check whether the request succeeded
url_res = re.findall(r"'url': \\\"(.*?)\\\"}", response.text) if response.status_code == 200:
title_res = re.findall(r"'title': \\\"(.*?)\\\",\\n", response.text) soup = BeautifulSoup(response.text, 'html.parser')
body_res = re.findall(r"\"contentText\":\"(.*?)\"", response.text) url_res = []
title_res = []
body_res = []
for item in soup.select('.result.c-container'):
# extract the title, link URL, and abstract text of each result
title_res.append(item.select_one('h3 a').get_text(strip=True))
url_res.append(item.select_one('h3 a')['href'])
body_res.append(item.select_one('.c-abstract').get_text(strip=True) if item.select_one('.c-abstract') else '')
baidu_res = [{"content": re.sub('<em>|</em>', '', '<a href="' + url + '">' + title + '</a> ' + body)} for baidu_res = [{"content": re.sub('<em>|</em>', '', '<a href="' + url + '">' + title + '</a> ' + body)} for
url, title, body in zip(url_res, title_res, body_res)] url, title, body in zip(url_res, title_res, body_res)]
del body_res, url_res, title_res del body_res, url_res, title_res