From 5d55e6a049f87358f1d9acaad0a9188ee17f7078 Mon Sep 17 00:00:00 2001 From: H <43509927+guoyuhao2330@users.noreply.github.com> Date: Fri, 2 Aug 2024 17:34:38 +0800 Subject: [PATCH] Add component google scholar (#1790) ### What problem does this PR solve? #1739 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- graph/component/__init__.py | 1 + graph/component/googlescholar.py | 70 ++++++++++++++++++++++++++++++++ requirements.txt | 7 +++- requirements_arm.txt | 5 ++- requirements_dev.txt | 5 ++- 5 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 graph/component/googlescholar.py diff --git a/graph/component/__init__.py b/graph/component/__init__.py index 8257701b4..ea9e745be 100644 --- a/graph/component/__init__.py +++ b/graph/component/__init__.py @@ -16,6 +16,7 @@ from .pubmed import PubMed, PubMedParam from .arxiv import ArXiv, ArXivParam from .google import Google, GoogleParam from .bing import Bing, BingParam +from .googlescholar import GoogleScholar, GoogleScholarParam def component_class(class_name): diff --git a/graph/component/googlescholar.py b/graph/component/googlescholar.py new file mode 100644 index 000000000..8da7ba55f --- /dev/null +++ b/graph/component/googlescholar.py @@ -0,0 +1,70 @@ +# +# Copyright 2024 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+from abc import ABC
+from itertools import islice
+import pandas as pd
+from graph.settings import DEBUG
+from graph.component.base import ComponentBase, ComponentParamBase
+from scholarly import scholarly
+
+
+class GoogleScholarParam(ComponentParamBase):
+    """
+    Define the GoogleScholar component parameters.
+
+    top_n: maximum number of publications collected per query.
+    sort_by: result ordering, either 'relevance' or 'date'.
+    year_low / year_high: publication-year bounds handed unchanged to
+        scholarly.search_pubs (None by default).
+    patents: whether or not to include patents, defaults to True.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.top_n = 6
+        self.sort_by = 'relevance'
+        self.year_low = None
+        self.year_high = None
+        self.patents = True
+
+    def check(self):
+        """Validate top_n, sort_by and patents (the year bounds are not checked here)."""
+        self.check_positive_integer(self.top_n, "Top N")
+        self.check_valid_value(self.sort_by, "GoogleScholar Sort_by", ['date', 'relevance'])
+        self.check_boolean(self.patents, "Whether or not to include patents, defaults to True")
+
+
+class GoogleScholar(ComponentBase, ABC):
+    """Component that queries Google Scholar through the `scholarly` package."""
+    component_name = "GoogleScholar"
+
+    def _run(self, history, **kwargs):
+        """
+        Search Google Scholar for the component input and return the hits.
+
+        The "content" entries of the upstream input are joined with " - " to
+        build the query. Returns a DataFrame with one "content" row per
+        publication (at most top_n rows), or the result of be_output("") when
+        the query is empty or nothing could be fetched.
+        """
+        ans = self.get_input()
+        ans = " - ".join(ans["content"]) if "content" in ans else ""
+        if not ans:
+            return GoogleScholar.be_output("")
+
+        scholar_res = []
+        try:
+            scholar_client = scholarly.search_pubs(ans, patents=self._param.patents, year_low=self._param.year_low,
+                                                   year_high=self._param.year_high, sort_by=self._param.sort_by)
+            # islice ends the loop quietly when fewer than top_n results exist,
+            # so running out of results is no longer reported as an error.
+            for pub in islice(scholar_client, self._param.top_n):
+                bib = pub['bib']
+                scholar_res.append({"content": 'Title: ' + bib['title']
+                                               + '\n_Url: ' + pub.get('pub_url', '')
+                                               + "\n author: " + ",".join(bib['author'])
+                                               + '\n Abstract: ' + bib.get('abstract', 'no abstract')})
+        except Exception as e:
+            # `except StopIteration or Exception` evaluates to `except StopIteration`
+            # and let every real failure escape; catch Exception itself instead.
+            # Results gathered before the failure are kept.
+            print("**ERROR** " + str(e))
+
+        if not scholar_res:
+            return GoogleScholar.be_output("")
+
+        df = pd.DataFrame(scholar_res)
+        if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
+        return df
diff --git a/requirements.txt b/requirements.txt
index 690d3745e..a4065d175 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,7 @@ datrie==0.8.2
 demjson3==3.0.6
 discord.py==2.3.2
 duckduckgo_search==6.1.9
+editdistance==0.8.1
 elastic_transport==8.12.0
 elasticsearch==8.12.1
 elasticsearch_dsl==8.12.0
@@ -31,7 +32,9
@@ httpx==0.27.0 huggingface_hub==0.20.3 infinity_emb==0.0.51 itsdangerous==2.1.2 +jina==3.27.2 Markdown==3.6 +markdown_to_json==2.1.1 minio==7.2.4 mistralai==0.4.2 nltk==3.8.1 @@ -51,6 +54,7 @@ pipreqs==0.5.0 protobuf==5.27.2 pyclipper==1.3.0.post5 pycryptodomex==3.20.0 +pypdf==4.3.0 PyPDF2==3.0.1 pytest==8.2.2 python-dotenv==1.0.1 @@ -61,6 +65,7 @@ redis==5.0.3 Requests==2.32.2 roman_numbers==1.0.2 ruamel.base==1.0.0 +scholarly==1.7.11 scikit_learn==1.5.0 selenium==4.22.0 setuptools==70.0.0 @@ -80,5 +85,3 @@ word2number==1.1 xgboost==2.1.0 xpinyin==0.7.6 zhipuai==2.0.1 -pypdf==4.3.0 -jina==3.27.2 \ No newline at end of file diff --git a/requirements_arm.txt b/requirements_arm.txt index 777165ddb..1c9431662 100644 --- a/requirements_arm.txt +++ b/requirements_arm.txt @@ -155,4 +155,7 @@ Bio==1.7.1 arxiv==2.1.3 pypdf==4.3.0 google_search_results==2.4.2 -jina==3.27.2 \ No newline at end of file +jina==3.27.2 +editdistance==0.8.1 +markdown_to_json==2.1.1 +scholarly==1.7.11 diff --git a/requirements_dev.txt b/requirements_dev.txt index bc7287ee8..92a015c98 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -140,4 +140,7 @@ Bio==1.7.1 arxiv==2.1.3 pypdf==4.3.0 google_search_results==2.4.2 -jina==3.27.2 \ No newline at end of file +jina==3.27.2 +editdistance==0.8.1 +markdown_to_json==2.1.1 +scholarly==1.7.11