From 3a342a62badf6e8dfdbb3e0ce1ad336cf6a8acb6 Mon Sep 17 00:00:00 2001 From: He Tao Date: Fri, 11 Apr 2025 15:37:55 +0800 Subject: [PATCH] feat: support arxiv & brave search --- .env.example | 3 ++- README.md | 31 ++++++++++++++++++++++++++++++- pyproject.toml | 1 + src/config/tools.py | 2 ++ src/tools/__init__.py | 9 ++++++++- src/tools/search.py | 23 +++++++++++++++++++++++ uv.lock | 33 +++++++++++++++++++++++++++++++++ 7 files changed, 99 insertions(+), 3 deletions(-) diff --git a/.env.example b/.env.example index ee0b590..8227804 100644 --- a/.env.example +++ b/.env.example @@ -3,8 +3,9 @@ DEBUG=True APP_ENV=development # Add other environment variables as needed -# tavily, duckduckgo +# tavily, duckduckgo, brave_search, arxiv SEARCH_API=tavily TAVILY_API_KEY=tvly-xxx +BRAVE_SEARCH_API_KEY=brave-xxx # JINA_API_KEY=jina_xxx # Optional, default is None diff --git a/README.md b/README.md index d9ce5a7..ae96cd4 100644 --- a/README.md +++ b/README.md @@ -14,13 +14,16 @@ lite-deep-researcher is a community-driven AI automation framework that builds u git clone https://github.com/hetaoBackend/lite-deep-researcher.git cd lite-deep-researcher -# Install dependencies, uv will take care of the python interpreter and venv creation +# Install dependencies, uv will take care of the python interpreter and venv creation, and install the required packages uv sync # Configure .env with your Search Engine API keys # Tavily: https://app.tavily.com/home +# Brave Search: https://brave.com/search/api/ cp .env.example .env +# See the 'Supported Search Engines' section below for all available options + # Configure conf.yaml for your LLM model and API keys # Gemini: https://ai.google.dev/gemini-api/docs/openai cp conf.yaml.example conf.yaml @@ -29,6 +32,32 @@ cp conf.yaml.example conf.yaml uv run main.py ``` +## Supported Search Engines + +Lite-deep-researcher supports multiple search engines that can be configured in your `.env` file using the `SEARCH_API` variable: + +- **Tavily** 
(default): A specialized search API for AI applications + - Requires `TAVILY_API_KEY` in your `.env` file + - Sign up at: https://app.tavily.com/home + +- **DuckDuckGo**: Privacy-focused search engine + - No API key required + +- **Brave Search**: Privacy-focused search engine with advanced features + - Requires `BRAVE_SEARCH_API_KEY` in your `.env` file + - Sign up at: https://brave.com/search/api/ + +- **arXiv**: Scientific paper search for academic research + - No API key required + - Specialized for scientific and academic papers + +To configure your preferred search engine, set the `SEARCH_API` variable in your `.env` file: + +```bash +# Choose one: tavily, duckduckgo, brave_search, arxiv +SEARCH_API=tavily +``` + ## Development ### Testing diff --git a/pyproject.toml b/pyproject.toml index 01e7ab9..94b6339 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "jinja2>=3.1.3", "duckduckgo-search>=8.0.0", "inquirerpy>=0.3.4", + "arxiv>=2.2.0", ] [project.optional-dependencies] diff --git a/src/config/tools.py b/src/config/tools.py index 555e1b1..8b3e526 100644 --- a/src/config/tools.py +++ b/src/config/tools.py @@ -8,6 +8,8 @@ load_dotenv() class SearchEngine(enum.Enum): TAVILY = "tavily" DUCKDUCKGO = "duckduckgo" + BRAVE_SEARCH = "brave_search" + ARXIV = "arxiv" # Tool configuration diff --git a/src/tools/__init__.py b/src/tools/__init__.py index 5dc73c4..1d34af8 100644 --- a/src/tools/__init__.py +++ b/src/tools/__init__.py @@ -1,12 +1,19 @@ from .crawl import crawl_tool from .python_repl import python_repl_tool -from .search import tavily_search_tool, duckduckgo_search_tool +from .search import ( + tavily_search_tool, + duckduckgo_search_tool, + brave_search_tool, + arxiv_search_tool, +) from src.config import SELECTED_SEARCH_ENGINE, SearchEngine # Map search engine names to their respective tools search_tool_mappings = { SearchEngine.TAVILY.value: tavily_search_tool, SearchEngine.DUCKDUCKGO.value: duckduckgo_search_tool, + 
SearchEngine.BRAVE_SEARCH.value: brave_search_tool, + SearchEngine.ARXIV.value: arxiv_search_tool, } web_search_tool = search_tool_mappings.get(SELECTED_SEARCH_ENGINE, tavily_search_tool) diff --git a/src/tools/search.py b/src/tools/search.py index 0978fab..6d8367e 100644 --- a/src/tools/search.py +++ b/src/tools/search.py @@ -1,6 +1,10 @@ import logging +import os from langchain_community.tools.tavily_search import TavilySearchResults from langchain_community.tools import DuckDuckGoSearchResults +from langchain_community.tools import BraveSearch +from langchain_community.tools.arxiv import ArxivQueryRun +from langchain_community.utilities import ArxivAPIWrapper, BraveSearchWrapper from src.config import SEARCH_MAX_RESULTS from .decorators import create_logged_tool @@ -16,3 +20,22 @@ LoggedDuckDuckGoSearch = create_logged_tool(DuckDuckGoSearchResults) duckduckgo_search_tool = LoggedDuckDuckGoSearch( name="web_search", max_results=SEARCH_MAX_RESULTS ) + +LoggedBraveSearch = create_logged_tool(BraveSearch) +brave_search_tool = LoggedBraveSearch( + name="web_search", + search_wrapper=BraveSearchWrapper( + api_key=os.getenv("BRAVE_SEARCH_API_KEY", ""), + search_kwargs={"count": SEARCH_MAX_RESULTS}, + ), +) + +LoggedArxivSearch = create_logged_tool(ArxivQueryRun) +arxiv_search_tool = LoggedArxivSearch( + name="web_search", + api_wrapper=ArxivAPIWrapper( + top_k_results=SEARCH_MAX_RESULTS, + load_max_docs=SEARCH_MAX_RESULTS, + load_all_available_meta=True, + ), +) diff --git a/uv.lock b/uv.lock index fe5a6ba..5eb7957 100644 --- a/uv.lock +++ b/uv.lock @@ -100,6 +100,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/46/eb/e7f063ad1fec6b3178a3cd82d1a3c4de82cccf283fc42746168188e1cdd5/anyio-4.8.0-py3-none-any.whl", hash = "sha256:b5011f270ab5eb0abf13385f851315585cc37ef330dd88e27ec3d34d651fd47a", size = 96041 }, ] +[[package]] +name = "arxiv" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "feedparser" }, + { 
name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0b/16/3d72446400a59d1fbda24fed2289661398994164e07d72cfa85e43ce5e36/arxiv-2.2.0.tar.gz", hash = "sha256:6072a2211e95697092ef32acde0144d7de2cfa71208e2751724316c9df322cc0", size = 16910 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/71/1e/e7f0393e836b5347605fc356c24d9f9ae9b26e0f7e52573b80e3d28335eb/arxiv-2.2.0-py3-none-any.whl", hash = "sha256:545b8af5ab301efff7697cd112b5189e631b80521ccbc33fbc1e1f9cff63ca4d", size = 11696 }, +] + [[package]] name = "attrs" version = "25.1.0" @@ -333,6 +346,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/5d/4d8bbb94f0dbc22732350c06965e40740f4a92ca560e90bb566f4f73af41/fastapi-0.115.11-py3-none-any.whl", hash = "sha256:32e1541b7b74602e4ef4a0260ecaf3aadf9d4f19590bba3e1bf2ac4666aa2c64", size = 94926 }, ] +[[package]] +name = "feedparser" +version = "6.0.11" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sgmllib3k" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ff/aa/7af346ebeb42a76bf108027fe7f3328bb4e57a3a96e53e21fd9ef9dd6dd0/feedparser-6.0.11.tar.gz", hash = "sha256:c9d0407b64c6f2a065d0ebb292c2b35c01050cc0dc33757461aaabdc4c4184d5", size = 286197 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/d4/8c31aad9cc18f451c49f7f9cfb5799dadffc88177f7917bc90a66459b1d7/feedparser-6.0.11-py3-none-any.whl", hash = "sha256:0be7ee7b395572b19ebeb1d6aafb0028dee11169f1c934e0ed67d54992f4ad45", size = 81343 }, +] + [[package]] name = "filelock" version = "3.18.0" @@ -833,6 +858,7 @@ name = "lite-deep-researcher" version = "0.1.0" source = { editable = "." 
} dependencies = [ + { name = "arxiv" }, { name = "duckduckgo-search" }, { name = "fastapi" }, { name = "httpx" }, @@ -866,6 +892,7 @@ test = [ [package.metadata] requires-dist = [ + { name = "arxiv", specifier = ">=2.2.0" }, { name = "black", marker = "extra == 'dev'", specifier = ">=24.2.0" }, { name = "duckduckgo-search", specifier = ">=8.0.0" }, { name = "fastapi", specifier = ">=0.110.0" }, @@ -1667,6 +1694,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/15/6d22d07e063ce5e9bfbd96db9ec2fbb4693591b4503e3a76996639474d02/rpds_py-0.23.1-cp313-cp313t-win_amd64.whl", hash = "sha256:d6f6512a90bd5cd9030a6237f5346f046c6f0e40af98657568fa45695d4de59d", size = 235415 }, ] +[[package]] +name = "sgmllib3k" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9", size = 5750 } + [[package]] name = "six" version = "1.17.0"