Add examples/deepseek-v3-crawler

This commit is contained in:
Aparup Ganguly 2025-03-28 16:05:16 +05:30
parent be43598071
commit da76524771
4 changed files with 285 additions and 0 deletions

50
examples/deepseek-v3-crawler/.gitignore vendored Normal file
View File

@ -0,0 +1,50 @@
# Environment variables
.env
.env.*
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# Virtual environments
venv/
ENV/
env/
# Editor files
.idea/
.vscode/
*.swp
*.swo
*~
# OS specific files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
# Logs
*.log
logs/

View File

@ -0,0 +1,68 @@
# DeepSeek V3 Web Crawler
This script uses the DeepSeek V3 large language model (via OpenRouter's API) and FireCrawl to crawl websites based on specific objectives.
## Prerequisites
- Python 3.8+
- A FireCrawl API key (get one at [FireCrawl's website](https://www.firecrawl.dev))
- An OpenRouter API key (get one at [OpenRouter](https://openrouter.ai)) with access to the DeepSeek V3 model
## Installation
1. Clone this repository:
```bash
git clone <repository-url>
cd <repository-directory>
```
2. Install the required packages:
```bash
pip install -r requirements.txt
```
3. Create a `.env` file in the root directory with your API keys:
```
FIRECRAWL_API_KEY=your_firecrawl_api_key
OPENROUTER_API_KEY=your_openrouter_api_key
```
## Usage
Run the script:
```bash
python deepseek-v3-crawler.py
```
The script will prompt you to:
1. Enter a website URL to crawl
2. Enter your objective (what information you're looking for)
The script will then:
- Use DeepSeek V3 to generate optimal search parameters for the website
- Map the website to find relevant pages
- Crawl the most relevant pages to extract information based on your objective
- Output the results in JSON format if successful
## Example
Input:
- Website: https://www.example.com
- Objective: Find information about their pricing plans
Output:
- The script will output structured JSON data containing the pricing information found on the website.
## Notes
- The script uses DeepSeek V3, an advanced language model, to analyze web content.
- The model is accessed via OpenRouter's API (the free `deepseek/deepseek-chat-v3-0324:free` tier).
- You may need to adjust the temperature or max_tokens parameters in the script based on your needs.

View File

@ -0,0 +1,164 @@
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
from openai import OpenAI
# ANSI color codes
class Colors:
    """ANSI escape sequences used to color terminal output."""
    RESET = "\033[0m"     # clears all formatting
    RED = "\033[91m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    BLUE = "\033[94m"
    MAGENTA = "\033[95m"
    CYAN = "\033[96m"
# Load environment variables from a local .env file (see README).
load_dotenv()
# Retrieve API keys from environment variables.
# NOTE(review): neither value is validated here — a missing key surfaces
# later as an API error from FirecrawlApp / the OpenAI client, not at startup.
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
# Initialize the FirecrawlApp and OpenRouter client.
# The OpenAI SDK is pointed at OpenRouter's OpenAI-compatible endpoint,
# so chat.completions.create calls below go to OpenRouter, not OpenAI.
app = FirecrawlApp(api_key=firecrawl_api_key)
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=openrouter_api_key
)
def main():
    """Interactive entry point for the crawler.

    Probes the language model for availability, prompts the user for a
    website URL and an objective, then maps the site and scrapes the most
    relevant pages, printing any extracted result as JSON.
    """
    try:
        # Cheap availability probe: if this call fails, the model or the
        # API key is unusable, so bail out before asking the user for input.
        # (The response itself is intentionally discarded.)
        client.chat.completions.create(
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=[{"role": "user", "content": "test"}]
        )
    except Exception as e:
        print(f"{Colors.RED}Error: Could not connect to the language model. Please try again later.{Colors.RESET}")
        print(f"{Colors.RED}Details: {str(e)}{Colors.RESET}")
        return
    url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}").strip()
    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}").strip()
    # Guard against empty input — both values are interpolated into prompts
    # and API calls below, where an empty string only fails later and less clearly.
    if not url or not objective:
        print(f"{Colors.RED}Both a website URL and an objective are required. Exiting...{Colors.RESET}")
        return
    print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
    relevant_pages = find_relevant_page_via_map(objective, url, app, client)
    if not relevant_pages:
        print(f"{Colors.RED}No relevant pages found. Exiting...{Colors.RESET}")
        return
    result = find_objective_in_top_pages(relevant_pages, objective, app, client)
    if result:
        print(f"{Colors.GREEN}Objective successfully found! Extracted information:{Colors.RESET}")
        print(json.dumps(result, indent=2))
    else:
        print(f"{Colors.RED}Objective could not be fulfilled.{Colors.RESET}")
def find_relevant_page_via_map(objective, url, app, client):
    """Map *url* with FireCrawl, steered by a model-chosen search keyword.

    Asks the model for a 1-2 word search term derived from *objective*,
    passes it to FireCrawl's map endpoint, and returns the list of matching
    page URLs, or None when mapping fails or yields nothing.
    """
    try:
        print(f"{Colors.CYAN}Understood. Objective: {objective}{Colors.RESET}")
        print(f"{Colors.CYAN}Searching website: {url}{Colors.RESET}")
        # Ask the model for a short keyword to feed FireCrawl's map search.
        param_prompt = f"""
        The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
        """
        completion = client.chat.completions.create(
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=[{"role": "user", "content": param_prompt}],
        )
        search_term = completion.choices[0].message.content.strip()
        print(f"{Colors.GREEN}Optimal search parameter identified: {search_term}{Colors.RESET}")
        site_map = app.map_url(url, params={"search": search_term})
        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
        # Different SDK versions have returned the links under either key.
        page_links = site_map.get('urls') or site_map.get('links') or []
        if not page_links:
            print(f"{Colors.RED}No links found in map response.{Colors.RESET}")
            return None
        return page_links
    except Exception as e:
        print(f"{Colors.RED}Error encountered: {str(e)}{Colors.RESET}")
        return None
def find_objective_in_top_pages(pages, objective, app, client):
    """Scrape up to the first three candidate pages and check each against
    the objective using the language model.

    Args:
        pages: List of candidate page URLs (only the first three are used).
        objective: Free-text description of the information sought.
        app: FirecrawlApp instance used for scraping.
        client: OpenAI-compatible client pointed at OpenRouter.

    Returns:
        The extracted ``data`` payload from the first page where the model
        reports the objective as met, or None if no page satisfies it.
    """
    try:
        for link in pages[:3]:
            print(f"{Colors.YELLOW}Scraping page: {link}{Colors.RESET}")
            # A single failed scrape should not abort the whole search
            # (previously the broad outer except swallowed it and returned
            # None immediately) — handle it per page and move on.
            try:
                scrape_result = app.scrape_url(link, params={'formats': ['markdown']})
            except Exception as e:
                print(f"{Colors.RED}Error scraping page, skipping: {str(e)}{Colors.RESET}")
                continue
            # Previously indexed scrape_result['markdown'] directly, which
            # raised KeyError whenever the scrape returned no markdown.
            markdown = scrape_result.get('markdown', '')
            if not markdown:
                print(f"{Colors.YELLOW}No markdown content returned, continuing search...{Colors.RESET}")
                continue
            check_prompt = f"""
            Given the following scraped content and objective, determine if the objective is met.
            If it is, extract the relevant information in a simple JSON format.
            If the objective is not met, respond with exactly 'Objective not met'.
            The JSON format should be:
            {{
                "found": true,
                "data": {{
                    // extracted information here
                }}
            }}
            Important: Do not wrap the JSON in markdown code blocks. Just return the raw JSON.
            Objective: {objective}
            Scraped content: {markdown}
            """
            # Using OpenRouter's API to analyze the content
            response = client.chat.completions.create(
                model="deepseek/deepseek-chat-v3-0324:free",
                messages=[{
                    "role": "system",
                    "content": "You are a helpful assistant that extracts information from web pages. Always respond in valid JSON format when information is found. Do not wrap the JSON in markdown code blocks."
                }, {
                    "role": "user",
                    "content": check_prompt
                }]
            )
            result = response.choices[0].message.content.strip()
            print(f"{Colors.CYAN}Model response: {result}{Colors.RESET}")  # Debug output
            # Models rarely reproduce sentinel strings exactly; the previous
            # exact-equality check missed variants like "Objective not met."
            # so match case-insensitively on the phrase instead.
            if "objective not met" in result.lower():
                print(f"{Colors.YELLOW}Objective not met in this page, continuing search...{Colors.RESET}")
                continue
            try:
                # Clean up the response if the model wrapped it in code
                # fences despite the instructions.
                if result.startswith('```'):
                    result = result.split('```')[1]
                    if result.startswith('json'):
                        result = result[4:]
                    result = result.strip()
                parsed_result = json.loads(result)
                if isinstance(parsed_result, dict) and parsed_result.get('found'):
                    return parsed_result.get('data')
                print(f"{Colors.YELLOW}Invalid response format, continuing search...{Colors.RESET}")
            except json.JSONDecodeError as e:
                print(f"{Colors.RED}Error parsing JSON response: {str(e)}{Colors.RESET}")
                print(f"{Colors.RED}Raw response: {result}{Colors.RESET}")
                continue
        return None
    except Exception as e:
        print(f"{Colors.RED}Error encountered: {str(e)}{Colors.RESET}")
        return None
# Run the interactive crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,3 @@
firecrawl==1.13.5
openai>=1.0.0
python-dotenv==1.0.1