Add examples/deepseek-v3-crawler

This commit is contained in:
Aparup Ganguly 2025-03-28 16:05:16 +05:30
parent be43598071
commit da76524771
4 changed files with 285 additions and 0 deletions

50
examples/deepseek-v3-crawler/.gitignore vendored Normal file
View File

@ -0,0 +1,50 @@
# Environment variables
.env
.env.*
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# Virtual environments
venv/
ENV/
env/
# Editor files
.idea/
.vscode/
*.swp
*.swo
*~
# OS specific files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
# Logs
*.log
logs/

View File

@ -0,0 +1,68 @@
# DeepSeek V3 Web Crawler
This script uses the DeepSeek V3 large language model (via OpenRouter's API) and FireCrawl to crawl websites based on specific objectives.
## Prerequisites
- Python 3.8+
- A FireCrawl API key (get one at [FireCrawl's website](https://www.firecrawl.dev))
- An OpenRouter API key (get one at [OpenRouter](https://openrouter.ai)) with access to the DeepSeek V3 model
## Installation
1. Clone this repository:
```bash
git clone <repository-url>
cd <repository-directory>
```
2. Install the required packages:
```bash
pip install -r requirements.txt
```
3. Create a `.env` file in the root directory with your API keys:
```
FIRECRAWL_API_KEY=your_firecrawl_api_key
OPENROUTER_API_KEY=your_openrouter_api_key
```
## Usage
Run the script:
```bash
python deepseek-v3-crawler.py
```
The script will prompt you to:
1. Enter a website URL to crawl
2. Enter your objective (what information you're looking for)
The script will then:
- Use DeepSeek V3 to generate optimal search parameters for the website
- Map the website to find relevant pages
- Crawl the most relevant pages to extract information based on your objective
- Output the results in JSON format if successful
## Example
Input:
- Website: https://www.example.com
- Objective: Find information about their pricing plans
Output:
- The script will output structured JSON data containing the pricing information found on the website.
## Notes
- The script uses DeepSeek V3, an advanced language model, to analyze web content.
- The model is accessed via OpenRouter's API (the free `deepseek/deepseek-chat-v3-0324:free` tier).
- You may need to adjust the temperature or max_tokens parameters in the script based on your needs.

View File

@ -0,0 +1,164 @@
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
from openai import OpenAI
# ANSI color codes
class Colors:
    """ANSI escape sequences used to color terminal output."""
    RESET = "\033[0m"     # clears all formatting
    RED = "\033[91m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    BLUE = "\033[94m"
    MAGENTA = "\033[95m"
    CYAN = "\033[96m"
# Load environment variables from a local .env file (see README).
load_dotenv()
# Retrieve API keys from environment variables.
# NOTE(review): neither value is validated here — a missing key surfaces
# later as an API error from FirecrawlApp / the OpenAI client, not at startup.
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
# Initialize the FirecrawlApp and OpenRouter client.
# The OpenAI SDK is pointed at OpenRouter's OpenAI-compatible endpoint,
# so chat.completions.create calls below go to OpenRouter, not OpenAI.
app = FirecrawlApp(api_key=firecrawl_api_key)
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=openrouter_api_key
)
def main():
    """Interactive entry point for the crawler.

    Probes the language model for availability, prompts the user for a
    website URL and an objective, then maps the site and scrapes the most
    relevant pages, printing any extracted result as JSON.
    """
    try:
        # Cheap availability probe: if this call fails, the model or the
        # API key is unusable, so bail out before asking the user for input.
        # (The response itself is intentionally discarded.)
        client.chat.completions.create(
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=[{"role": "user", "content": "test"}]
        )
    except Exception as e:
        print(f"{Colors.RED}Error: Could not connect to the language model. Please try again later.{Colors.RESET}")
        print(f"{Colors.RED}Details: {str(e)}{Colors.RESET}")
        return
    url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}").strip()
    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}").strip()
    # Guard against empty input — both values are interpolated into prompts
    # and API calls below, where an empty string only fails later and less clearly.
    if not url or not objective:
        print(f"{Colors.RED}Both a website URL and an objective are required. Exiting...{Colors.RESET}")
        return
    print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
    relevant_pages = find_relevant_page_via_map(objective, url, app, client)
    if not relevant_pages:
        print(f"{Colors.RED}No relevant pages found. Exiting...{Colors.RESET}")
        return
    result = find_objective_in_top_pages(relevant_pages, objective, app, client)
    if result:
        print(f"{Colors.GREEN}Objective successfully found! Extracted information:{Colors.RESET}")
        print(json.dumps(result, indent=2))
    else:
        print(f"{Colors.RED}Objective could not be fulfilled.{Colors.RESET}")
def find_relevant_page_via_map(objective, url, app, client):
    """Map *url* with FireCrawl, steered by a model-chosen search keyword.

    Asks the model for a 1-2 word search term derived from *objective*,
    passes it to FireCrawl's map endpoint, and returns the list of matching
    page URLs, or None when mapping fails or yields nothing.
    """
    try:
        print(f"{Colors.CYAN}Understood. Objective: {objective}{Colors.RESET}")
        print(f"{Colors.CYAN}Searching website: {url}{Colors.RESET}")
        # Ask the model for a short keyword to feed FireCrawl's map search.
        param_prompt = f"""
        The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
        """
        completion = client.chat.completions.create(
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=[{"role": "user", "content": param_prompt}],
        )
        search_term = completion.choices[0].message.content.strip()
        print(f"{Colors.GREEN}Optimal search parameter identified: {search_term}{Colors.RESET}")
        site_map = app.map_url(url, params={"search": search_term})
        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
        # Different SDK versions have returned the links under either key.
        page_links = site_map.get('urls') or site_map.get('links') or []
        if not page_links:
            print(f"{Colors.RED}No links found in map response.{Colors.RESET}")
            return None
        return page_links
    except Exception as e:
        print(f"{Colors.RED}Error encountered: {str(e)}{Colors.RESET}")
        return None
def find_objective_in_top_pages(pages, objective, app, client):
    """Scrape up to the first three candidate pages and check each against
    the objective using the language model.

    Args:
        pages: List of candidate page URLs (only the first three are used).
        objective: Free-text description of the information sought.
        app: FirecrawlApp instance used for scraping.
        client: OpenAI-compatible client pointed at OpenRouter.

    Returns:
        The extracted ``data`` payload from the first page where the model
        reports the objective as met, or None if no page satisfies it.
    """
    try:
        for link in pages[:3]:
            print(f"{Colors.YELLOW}Scraping page: {link}{Colors.RESET}")
            # A single failed scrape should not abort the whole search
            # (previously the broad outer except swallowed it and returned
            # None immediately) — handle it per page and move on.
            try:
                scrape_result = app.scrape_url(link, params={'formats': ['markdown']})
            except Exception as e:
                print(f"{Colors.RED}Error scraping page, skipping: {str(e)}{Colors.RESET}")
                continue
            # Previously indexed scrape_result['markdown'] directly, which
            # raised KeyError whenever the scrape returned no markdown.
            markdown = scrape_result.get('markdown', '')
            if not markdown:
                print(f"{Colors.YELLOW}No markdown content returned, continuing search...{Colors.RESET}")
                continue
            check_prompt = f"""
            Given the following scraped content and objective, determine if the objective is met.
            If it is, extract the relevant information in a simple JSON format.
            If the objective is not met, respond with exactly 'Objective not met'.
            The JSON format should be:
            {{
                "found": true,
                "data": {{
                    // extracted information here
                }}
            }}
            Important: Do not wrap the JSON in markdown code blocks. Just return the raw JSON.
            Objective: {objective}
            Scraped content: {markdown}
            """
            # Using OpenRouter's API to analyze the content
            response = client.chat.completions.create(
                model="deepseek/deepseek-chat-v3-0324:free",
                messages=[{
                    "role": "system",
                    "content": "You are a helpful assistant that extracts information from web pages. Always respond in valid JSON format when information is found. Do not wrap the JSON in markdown code blocks."
                }, {
                    "role": "user",
                    "content": check_prompt
                }]
            )
            result = response.choices[0].message.content.strip()
            print(f"{Colors.CYAN}Model response: {result}{Colors.RESET}")  # Debug output
            # Models rarely reproduce sentinel strings exactly; the previous
            # exact-equality check missed variants like "Objective not met."
            # so match case-insensitively on the phrase instead.
            if "objective not met" in result.lower():
                print(f"{Colors.YELLOW}Objective not met in this page, continuing search...{Colors.RESET}")
                continue
            try:
                # Clean up the response if the model wrapped it in code
                # fences despite the instructions.
                if result.startswith('```'):
                    result = result.split('```')[1]
                    if result.startswith('json'):
                        result = result[4:]
                    result = result.strip()
                parsed_result = json.loads(result)
                if isinstance(parsed_result, dict) and parsed_result.get('found'):
                    return parsed_result.get('data')
                print(f"{Colors.YELLOW}Invalid response format, continuing search...{Colors.RESET}")
            except json.JSONDecodeError as e:
                print(f"{Colors.RED}Error parsing JSON response: {str(e)}{Colors.RESET}")
                print(f"{Colors.RED}Raw response: {result}{Colors.RESET}")
                continue
        return None
    except Exception as e:
        print(f"{Colors.RED}Error encountered: {str(e)}{Colors.RESET}")
        return None
# Run the interactive crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,3 @@
firecrawl==1.13.5
openai>=1.0.0
python-dotenv==1.0.1