Add component google scholar (#1790)

### What problem does this PR solve?

#1739 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
H 2024-08-02 17:34:38 +08:00 committed by GitHub
parent 418700b455
commit 5d55e6a049
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 84 additions and 4 deletions

View File

@ -16,6 +16,7 @@ from .pubmed import PubMed, PubMedParam
from .arxiv import ArXiv, ArXivParam
from .google import Google, GoogleParam
from .bing import Bing, BingParam
from .googlescholar import GoogleScholar, GoogleScholarParam
def component_class(class_name):

View File

@ -0,0 +1,70 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABC
import pandas as pd
from graph.settings import DEBUG
from graph.component.base import ComponentBase, ComponentParamBase
from scholarly import scholarly
class GoogleScholarParam(ComponentParamBase):
    """
    Configuration options for the GoogleScholar component.

    Attributes:
        top_n: upper bound on how many publications the component collects.
        sort_by: result ordering, either 'date' or 'relevance'.
        year_low: forwarded to the scholarly search as ``year_low``;
            None leaves it unset.
        year_high: forwarded to the scholarly search as ``year_high``;
            None leaves it unset.
        patents: forwarded to the scholarly search as ``patents``.
    """

    def __init__(self):
        super().__init__()
        self.top_n = 6
        self.sort_by = "relevance"
        self.year_low = None
        self.year_high = None
        self.patents = True

    def check(self):
        """Validate top_n, sort_by and patents with the base-class checkers."""
        # NOTE(review): year_low / year_high are not validated here.
        allowed_orderings = ["date", "relevance"]
        self.check_positive_integer(self.top_n, "Top N")
        self.check_valid_value(
            self.sort_by, "GoogleScholar Sort_by", allowed_orderings
        )
        self.check_boolean(
            self.patents, "Whether or not to include patents, defaults to True"
        )
class GoogleScholar(ComponentBase, ABC):
    """Component that searches Google Scholar for the upstream input text."""

    component_name = "GoogleScholar"

    @staticmethod
    def _format_pub(pub):
        """Render one scholarly publication record as the component's text.

        Missing fields are replaced by empty values instead of raising, so a
        single incomplete record does not abort the whole search.
        """
        bib = pub.get('bib', {})
        authors = bib.get('author', [])
        # Presumably a list of names; tolerate a plain string so it is not
        # split into single characters by join().
        if not isinstance(authors, str):
            authors = ",".join(authors)
        return ('Title: ' + bib.get('title', '')
                + '\n_Url: <a href="' + pub.get('pub_url', '') + '"></a> '
                + "\n author: " + authors
                + '\n Abstract: ' + bib.get('abstract', 'no abstract'))

    def _run(self, history, **kwargs):
        """Query Google Scholar and return up to ``top_n`` formatted results.

        Args:
            history: unused by this component.
            **kwargs: unused by this component.

        Returns:
            A pandas DataFrame with a single "content" column, one row per
            publication, or ``GoogleScholar.be_output("")`` when the input is
            empty or no publication could be collected.
        """
        ans = self.get_input()
        # The query is every upstream "content" entry joined with " - ".
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return GoogleScholar.be_output("")

        scholar_client = scholarly.search_pubs(
            ans,
            patents=self._param.patents,
            year_low=self._param.year_low,
            year_high=self._param.year_high,
            sort_by=self._param.sort_by,
        )

        scholar_res = []
        for _ in range(self._param.top_n):
            try:
                pub = next(scholar_client)
                scholar_res.append({"content": self._format_pub(pub)})
            except StopIteration:
                # Fewer than top_n results exist; this is not an error.
                break
            except Exception as e:
                # `except StopIteration or Exception` only ever caught
                # StopIteration; catch real failures explicitly and keep
                # whatever was collected so far.
                print("**ERROR** " + str(e))
                break

        if not scholar_res:
            return GoogleScholar.be_output("")

        df = pd.DataFrame(scholar_res)
        if DEBUG:
            print(df, ":::::::::::::::::::::::::::::::::")
        return df

View File

@ -12,6 +12,7 @@ datrie==0.8.2
demjson3==3.0.6
discord.py==2.3.2
duckduckgo_search==6.1.9
editdistance==0.8.1
elastic_transport==8.12.0
elasticsearch==8.12.1
elasticsearch_dsl==8.12.0
@ -31,7 +32,9 @@ httpx==0.27.0
huggingface_hub==0.20.3
infinity_emb==0.0.51
itsdangerous==2.1.2
jina==3.27.2
Markdown==3.6
markdown_to_json==2.1.1
minio==7.2.4
mistralai==0.4.2
nltk==3.8.1
@ -51,6 +54,7 @@ pipreqs==0.5.0
protobuf==5.27.2
pyclipper==1.3.0.post5
pycryptodomex==3.20.0
pypdf==4.3.0
PyPDF2==3.0.1
pytest==8.2.2
python-dotenv==1.0.1
@ -61,6 +65,7 @@ redis==5.0.3
Requests==2.32.2
roman_numbers==1.0.2
ruamel.base==1.0.0
scholarly==1.7.11
scikit_learn==1.5.0
selenium==4.22.0
setuptools==70.0.0
@ -80,5 +85,3 @@ word2number==1.1
xgboost==2.1.0
xpinyin==0.7.6
zhipuai==2.0.1
pypdf==4.3.0
jina==3.27.2

View File

@ -155,4 +155,7 @@ Bio==1.7.1
arxiv==2.1.3
pypdf==4.3.0
google_search_results==2.4.2
jina==3.27.2
jina==3.27.2
editdistance==0.8.1
markdown_to_json==2.1.1
scholarly==1.7.11

View File

@ -140,4 +140,7 @@ Bio==1.7.1
arxiv==2.1.3
pypdf==4.3.0
google_search_results==2.4.2
jina==3.27.2
jina==3.27.2
editdistance==0.8.1
markdown_to_json==2.1.1
scholarly==1.7.11