Duckduckgosearch (#1388)

### What problem does this PR solve?

#918 

Add components: Baidu, DuckDuckGo

### Type of change
- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
H 2024-07-05 16:14:32 +08:00 committed by GitHub
parent edc61e9b4c
commit a2eb0df875
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 68 additions and 5 deletions

View File

@ -10,6 +10,7 @@ from .message import Message, MessageParam
from .rewrite import RewriteQuestion, RewriteQuestionParam from .rewrite import RewriteQuestion, RewriteQuestionParam
from .keyword import KeywordExtract, KeywordExtractParam from .keyword import KeywordExtract, KeywordExtractParam
from .baidu import Baidu, BaiduParam from .baidu import Baidu, BaiduParam
from .duckduckgosearch import DuckDuckGoSearch, DuckDuckGoSearchParam
def component_class(class_name): def component_class(class_name):

View File

@ -50,12 +50,12 @@ class Baidu(ComponentBase, ABC):
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'} 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
response = requests.get(url=url, headers=headers) response = requests.get(url=url, headers=headers)
baidu_res = re.findall(r'"contentText":"(.*?)"', response.text)
url_res = re.findall(r"'url': \\\"(.*?)\\\"}", response.text) url_res = re.findall(r"'url': \\\"(.*?)\\\"}", response.text)
for i in range(min(len(baidu_res), len(url_res))): title_res = re.findall(r"'title': \\\"(.*?)\\\",\\n", response.text)
baidu_res[i] += '<a>' + url_res[i] + '</a>' body_res = re.findall(r"\"contentText\":\"(.*?)\"", response.text)
baidu_res = [re.sub('<em>|</em>', '', '<a href="' + url + '">' + title + '</a> ' + body) for url, title, body
del url_res in zip(url_res, title_res, body_res)]
del body_res, url_res, title_res
br = pd.DataFrame(baidu_res, columns=['content']) br = pd.DataFrame(baidu_res, columns=['content'])
print(">>>>>>>>>>>>>>>>>>>>>>>>>>\n", br) print(">>>>>>>>>>>>>>>>>>>>>>>>>>\n", br)

View File

@ -0,0 +1,62 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import random
from abc import ABC
from functools import partial
from duckduckgosearch import DDGS
import pandas as pd
from graph.component.base import ComponentBase, ComponentParamBase
class DuckDuckGoSearchParam(ComponentParamBase):
    """
    Parameters for the DuckDuckGoSearch component.

    Attributes set in ``__init__``:
        top_n: maximum number of search results to request (default 10).
        channel: which search channel to query, "text" or "news"
            (default "text").
    """

    def __init__(self):
        super().__init__()
        # Defaults: ten results from the plain web-search channel.
        self.top_n = 10
        self.channel = "text"

    def check(self):
        """Validate the parameters using the checker helpers called on self."""
        allowed_channels = ["text", "news"]
        self.check_positive_integer(self.top_n, "Top N")
        self.check_valid_value(self.channel, "Web Search or News", allowed_channels)
class DuckDuckGoSearch(ComponentBase, ABC):
    """
    Component that searches DuckDuckGo for the text received as input.

    The input's "content" entries are joined into a single query string, the
    query is sent to the channel selected by ``self._param.channel`` ("text"
    or "news"), and every hit is rendered as an HTML anchor followed by its
    body text.
    """
    # NOTE(review): DDGS is imported at file level from `duckduckgosearch`;
    # the PyPI package exposes it as `duckduckgo_search` - confirm the import.
    component_name = "DuckDuckGoSearch"

    @staticmethod
    def _format_hit(link, title, body):
        """Render one search hit as '<a href="link">title</a> body'."""
        return '<a href="' + link + '">' + title + '</a> ' + body

    def _run(self, history, **kwargs):
        """
        Run the search and return the formatted hits.

        Args:
            history: not used by this method.
            **kwargs: not used by this method.

        Returns:
            The result of ``be_output(...)`` when there is no query text,
            otherwise a pandas DataFrame with a single 'content' column
            holding one formatted hit per row.
        """
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            # Previously this went through `Baidu.be_output`, but `Baidu` is
            # not imported in this module, so the empty-input path raised
            # NameError. `no` is not declared by DuckDuckGoSearchParam, so
            # fall back to an empty string if it is missing.
            return DuckDuckGoSearch.be_output(getattr(self._param, "no", ""))

        # `channel` and `top_n` both live on the parameter object.
        channel = self._param.channel
        top_n = self._param.top_n

        # Start from an empty result so an unexpected channel value yields an
        # empty frame instead of an unbound-variable error.
        duck_res = []
        if channel == "text":
            with DDGS() as ddgs:
                # {'title': '', 'href': '', 'body': ''}
                duck_res = [self._format_hit(i["href"], i["title"], i["body"])
                            for i in ddgs.text(ans, max_results=top_n)]
        elif channel == "news":
            with DDGS() as ddgs:
                # {'date': '', 'title': '', 'body': '', 'url': '', 'image': '', 'source': ''}
                duck_res = [self._format_hit(i["url"], i["title"], i["body"])
                            for i in ddgs.news(ans, max_results=top_n)]

        dr = pd.DataFrame(duck_res, columns=['content'])
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>\n", dr)
        return dr