feat: support arxiv & brave search

This commit is contained in:
He Tao 2025-04-11 15:37:55 +08:00
parent 23298abd14
commit 3a342a62ba
7 changed files with 99 additions and 3 deletions

View File

@ -3,8 +3,9 @@ DEBUG=True
APP_ENV=development
# Add other environment variables as needed
# tavily, duckduckgo
# Supported values for SEARCH_API: tavily, duckduckgo, brave_search, arxiv
SEARCH_API=tavily
TAVILY_API_KEY=tvly-xxx
BRAVE_SEARCH_API_KEY=brave-xxx
# JINA_API_KEY=jina_xxx # Optional, default is None

View File

@ -14,13 +14,16 @@ lite-deep-researcher is a community-driven AI automation framework that builds u
git clone https://github.com/hetaoBackend/lite-deep-researcher.git
cd lite-deep-researcher
# Install dependencies, uv will take care of the python interpreter and venv creation
# Install dependencies; uv takes care of the Python interpreter, creates the venv, and installs the required packages
uv sync
# Configure .env with your Search Engine API keys
# Tavily: https://app.tavily.com/home
# Brave Search: https://brave.com/search/api/
cp .env.example .env
# See the 'Supported Search Engines' section below for all available options
# Configure conf.yaml for your LLM model and API keys
# Gemini: https://ai.google.dev/gemini-api/docs/openai
cp conf.yaml.example conf.yaml
@ -29,6 +32,32 @@ cp conf.yaml.example conf.yaml
uv run main.py
```
## Supported Search Engines
lite-deep-researcher supports multiple search engines. Select one by setting the `SEARCH_API` variable in your `.env` file:
- **Tavily** (default): A specialized search API for AI applications
- Requires `TAVILY_API_KEY` in your `.env` file
- Sign up at: https://app.tavily.com/home
- **DuckDuckGo**: Privacy-focused search engine
- No API key required
- **Brave Search**: Privacy-focused search engine with advanced features
- Requires `BRAVE_SEARCH_API_KEY` in your `.env` file
- Sign up at: https://brave.com/search/api/
- **arXiv**: Scientific paper search for academic research
- No API key required
- Specialized for scientific and academic papers
To configure your preferred search engine, set the `SEARCH_API` variable in your `.env` file:
```bash
# Choose one: tavily, duckduckgo, brave_search, arxiv
SEARCH_API=tavily
```
## Development
### Testing

View File

@ -29,6 +29,7 @@ dependencies = [
"jinja2>=3.1.3",
"duckduckgo-search>=8.0.0",
"inquirerpy>=0.3.4",
"arxiv>=2.2.0",
]
[project.optional-dependencies]

View File

@ -8,6 +8,8 @@ load_dotenv()
@enum.unique
class SearchEngine(enum.Enum):
    """Search backends that can be selected for the web search tool.

    Each member's value is the lowercase identifier string of that backend.
    ``enum.unique`` makes an accidental duplicate identifier fail at import
    time instead of silently turning one member into an alias of another.
    """

    TAVILY = "tavily"
    DUCKDUCKGO = "duckduckgo"
    BRAVE_SEARCH = "brave_search"
    ARXIV = "arxiv"
# Tool configuration

View File

@ -1,12 +1,19 @@
from .crawl import crawl_tool
from .python_repl import python_repl_tool
from .search import tavily_search_tool, duckduckgo_search_tool
from .search import (
tavily_search_tool,
duckduckgo_search_tool,
brave_search_tool,
arxiv_search_tool,
)
from src.config import SELECTED_SEARCH_ENGINE, SearchEngine
# Registry of the available search tools, keyed by each engine's string value.
search_tool_mappings = {
    engine.value: tool
    for engine, tool in (
        (SearchEngine.TAVILY, tavily_search_tool),
        (SearchEngine.DUCKDUCKGO, duckduckgo_search_tool),
        (SearchEngine.BRAVE_SEARCH, brave_search_tool),
        (SearchEngine.ARXIV, arxiv_search_tool),
    )
}

# Pick the tool for the configured engine; a value that is not in the
# registry falls back to the Tavily tool.
web_search_tool = (
    search_tool_mappings[SELECTED_SEARCH_ENGINE]
    if SELECTED_SEARCH_ENGINE in search_tool_mappings
    else tavily_search_tool
)

View File

@ -1,6 +1,10 @@
import logging
import os
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_community.tools import BraveSearch
from langchain_community.tools.arxiv import ArxivQueryRun
from langchain_community.utilities import ArxivAPIWrapper, BraveSearchWrapper
from src.config import SEARCH_MAX_RESULTS
from .decorators import create_logged_tool
@ -16,3 +20,22 @@ LoggedDuckDuckGoSearch = create_logged_tool(DuckDuckGoSearchResults)
# DuckDuckGo tool, registered under the shared "web_search" tool name and
# given SEARCH_MAX_RESULTS as its max_results setting.
_duckduckgo_options = {"name": "web_search", "max_results": SEARCH_MAX_RESULTS}
duckduckgo_search_tool = LoggedDuckDuckGoSearch(**_duckduckgo_options)
# Brave Search tool, registered under the shared "web_search" tool name.
LoggedBraveSearch = create_logged_tool(BraveSearch)

# The API key comes from the BRAVE_SEARCH_API_KEY environment variable and is
# an empty string when that variable is unset; SEARCH_MAX_RESULTS is passed
# through as the "count" search kwarg.
_brave_search_wrapper = BraveSearchWrapper(
    api_key=os.environ.get("BRAVE_SEARCH_API_KEY", ""),
    search_kwargs={"count": SEARCH_MAX_RESULTS},
)
brave_search_tool = LoggedBraveSearch(
    name="web_search",
    search_wrapper=_brave_search_wrapper,
)
# arXiv tool, registered under the shared "web_search" tool name.
LoggedArxivSearch = create_logged_tool(ArxivQueryRun)

# SEARCH_MAX_RESULTS is used for both the top-k and the max-docs settings.
# NOTE(review): load_max_docs and load_all_available_meta look like they only
# affect the wrapper's document-loading path rather than the query run used by
# ArxivQueryRun; confirm against the langchain_community version in use.
_arxiv_api_wrapper = ArxivAPIWrapper(
    top_k_results=SEARCH_MAX_RESULTS,
    load_max_docs=SEARCH_MAX_RESULTS,
    load_all_available_meta=True,
)
arxiv_search_tool = LoggedArxivSearch(
    name="web_search",
    api_wrapper=_arxiv_api_wrapper,
)

33
uv.lock generated
View File

@ -100,6 +100,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/46/eb/e7f063ad1fec6b3178a3cd82d1a3c4de82cccf283fc42746168188e1cdd5/anyio-4.8.0-py3-none-any.whl", hash = "sha256:b5011f270ab5eb0abf13385f851315585cc37ef330dd88e27ec3d34d651fd47a", size = 96041 },
]
[[package]]
name = "arxiv"
version = "2.2.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "feedparser" },
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/0b/16/3d72446400a59d1fbda24fed2289661398994164e07d72cfa85e43ce5e36/arxiv-2.2.0.tar.gz", hash = "sha256:6072a2211e95697092ef32acde0144d7de2cfa71208e2751724316c9df322cc0", size = 16910 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/71/1e/e7f0393e836b5347605fc356c24d9f9ae9b26e0f7e52573b80e3d28335eb/arxiv-2.2.0-py3-none-any.whl", hash = "sha256:545b8af5ab301efff7697cd112b5189e631b80521ccbc33fbc1e1f9cff63ca4d", size = 11696 },
]
[[package]]
name = "attrs"
version = "25.1.0"
@ -333,6 +346,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b3/5d/4d8bbb94f0dbc22732350c06965e40740f4a92ca560e90bb566f4f73af41/fastapi-0.115.11-py3-none-any.whl", hash = "sha256:32e1541b7b74602e4ef4a0260ecaf3aadf9d4f19590bba3e1bf2ac4666aa2c64", size = 94926 },
]
[[package]]
name = "feedparser"
version = "6.0.11"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "sgmllib3k" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ff/aa/7af346ebeb42a76bf108027fe7f3328bb4e57a3a96e53e21fd9ef9dd6dd0/feedparser-6.0.11.tar.gz", hash = "sha256:c9d0407b64c6f2a065d0ebb292c2b35c01050cc0dc33757461aaabdc4c4184d5", size = 286197 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/7c/d4/8c31aad9cc18f451c49f7f9cfb5799dadffc88177f7917bc90a66459b1d7/feedparser-6.0.11-py3-none-any.whl", hash = "sha256:0be7ee7b395572b19ebeb1d6aafb0028dee11169f1c934e0ed67d54992f4ad45", size = 81343 },
]
[[package]]
name = "filelock"
version = "3.18.0"
@ -833,6 +858,7 @@ name = "lite-deep-researcher"
version = "0.1.0"
source = { editable = "." }
dependencies = [
{ name = "arxiv" },
{ name = "duckduckgo-search" },
{ name = "fastapi" },
{ name = "httpx" },
@ -866,6 +892,7 @@ test = [
[package.metadata]
requires-dist = [
{ name = "arxiv", specifier = ">=2.2.0" },
{ name = "black", marker = "extra == 'dev'", specifier = ">=24.2.0" },
{ name = "duckduckgo-search", specifier = ">=8.0.0" },
{ name = "fastapi", specifier = ">=0.110.0" },
@ -1667,6 +1694,12 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/68/15/6d22d07e063ce5e9bfbd96db9ec2fbb4693591b4503e3a76996639474d02/rpds_py-0.23.1-cp313-cp313t-win_amd64.whl", hash = "sha256:d6f6512a90bd5cd9030a6237f5346f046c6f0e40af98657568fa45695d4de59d", size = 235415 },
]
[[package]]
name = "sgmllib3k"
version = "1.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9", size = 5750 }
[[package]]
name = "six"
version = "1.17.0"