diff --git a/agent/component/pubmed.py b/agent/component/pubmed.py index 19cbdecc0..ff97ec88b 100644 --- a/agent/component/pubmed.py +++ b/agent/component/pubmed.py @@ -15,6 +15,7 @@ # from abc import ABC from Bio import Entrez +import re import pandas as pd import xml.etree.ElementTree as ET from agent.settings import DEBUG @@ -47,21 +48,16 @@ class PubMed(ComponentBase, ABC): try: Entrez.email = self._param.email pubmedids = Entrez.read(Entrez.esearch(db='pubmed', retmax=self._param.top_n, term=ans))['IdList'] - pubmedcnt = ET.fromstring( - Entrez.efetch(db='pubmed', id=",".join(pubmedids), retmode="xml").read().decode("utf-8")) - pubmed_res = [] - for child in pubmedcnt.findall("PubmedArticle"): - if child.find("MedlineCitation").find("Article").find("ArticleTitle").text: - title_tmp = 'Title:' + child.find("MedlineCitation").find("Article").find("ArticleTitle").text - else: - title_tmp = 'Title:' + "".join( - [childtitle.text for childtitle in - child.find("MedlineCitation").find("Article").find("ArticleTitle")]) - url_tmp = '\nUrl:' + '' - abstract_tmp = '\nAbstract:' + child.find("MedlineCitation").find("Article").find("Abstract").find( - "AbstractText").text - pubmed_res.append({"content": title_tmp + url_tmp + abstract_tmp}) + pubmedcnt = ET.fromstring(re.sub(r'<(/?)b>|<(/?)i>', '', Entrez.efetch(db='pubmed', id=",".join(pubmedids), + retmode="xml").read().decode( + "utf-8"))) + pubmed_res = [{"content": 'Title:' + child.find("MedlineCitation").find("Article").find( + "ArticleTitle").text + '\nUrl:' + '\n' + 'Abstract:' + ( + child.find("MedlineCitation").find("Article").find("Abstract").find( + "AbstractText").text if child.find("MedlineCitation").find( + "Article").find("Abstract") else "No abstract available")} for child in + pubmedcnt.findall("PubmedArticle")] except Exception as e: return PubMed.be_output("**ERROR**: " + str(e))