From da76524771ba702d80b9c5fc869256d929740cf1 Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Fri, 28 Mar 2025 16:05:16 +0530 Subject: [PATCH] Add examples/deepseek-v3-crawler --- examples/deepseek-v3-crawler/.gitignore | 50 ++++++ examples/deepseek-v3-crawler/README.md | 68 ++++++++ .../deepseek-v3-crawler.py | 164 ++++++++++++++++++ examples/deepseek-v3-crawler/requirements.txt | 3 + 4 files changed, 285 insertions(+) create mode 100644 examples/deepseek-v3-crawler/.gitignore create mode 100644 examples/deepseek-v3-crawler/README.md create mode 100644 examples/deepseek-v3-crawler/deepseek-v3-crawler.py create mode 100644 examples/deepseek-v3-crawler/requirements.txt diff --git a/examples/deepseek-v3-crawler/.gitignore b/examples/deepseek-v3-crawler/.gitignore new file mode 100644 index 00000000..7a36ce52 --- /dev/null +++ b/examples/deepseek-v3-crawler/.gitignore @@ -0,0 +1,50 @@ +# Environment variables +.env +.env.* + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +venv/ +ENV/ +env/ + +# Editor files +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# OS specific files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Logs +*.log +logs/ \ No newline at end of file diff --git a/examples/deepseek-v3-crawler/README.md b/examples/deepseek-v3-crawler/README.md new file mode 100644 index 00000000..8f5ffd43 --- /dev/null +++ b/examples/deepseek-v3-crawler/README.md @@ -0,0 +1,68 @@ +# DeepSeek V3 Web Crawler + +This script uses the DeepSeek V3 large language model (via OpenRouter's API) and FireCrawl to crawl websites based on specific objectives. + +## Prerequisites + +- Python 3.8+ +- A FireCrawl API key (get one at [FireCrawl's website](https://firecrawl.dev)) +- An OpenRouter API key (get one at [OpenRouter](https://openrouter.ai)) + +## Installation + +1. 
Clone this repository: + +```bash +git clone +cd +``` + +2. Install the required packages: + +```bash +pip install -r requirements.txt +``` + +3. Create a `.env` file in the root directory with your API keys: + +``` +FIRECRAWL_API_KEY=your_firecrawl_api_key +OPENROUTER_API_KEY=your_openrouter_api_key +``` + +## Usage + +Run the script: + +```bash +python deepseek-v3-crawler.py +``` + +The script will prompt you to: + +1. Enter a website URL to crawl +2. Enter your objective (what information you're looking for) + +The script will then: + +- Use DeepSeek V3 to generate optimal search parameters for the website +- Map the website to find relevant pages +- Crawl the most relevant pages to extract information based on your objective +- Output the results in JSON format if successful + +## Example + +Input: + +- Website: https://www.example.com +- Objective: Find information about their pricing plans + +Output: + +- The script will output structured JSON data containing the pricing information found on the website. + +## Notes + +- The script uses DeepSeek V3, an advanced language model, to analyze web content. +- The model is accessed via OpenRouter's API. +- You may need to adjust the prompts or the number of pages scraped in the script based on your needs. 
diff --git a/examples/deepseek-v3-crawler/deepseek-v3-crawler.py b/examples/deepseek-v3-crawler/deepseek-v3-crawler.py new file mode 100644 index 00000000..3e4075b1 --- /dev/null +++ b/examples/deepseek-v3-crawler/deepseek-v3-crawler.py @@ -0,0 +1,164 @@ +import os +from firecrawl import FirecrawlApp +import json +from dotenv import load_dotenv +from openai import OpenAI + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Retrieve API keys from environment variables +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +openrouter_api_key = os.getenv("OPENROUTER_API_KEY") + +# Initialize the FirecrawlApp and OpenRouter client +app = FirecrawlApp(api_key=firecrawl_api_key) +client = OpenAI( + base_url="https://openrouter.ai/api/v1", + api_key=openrouter_api_key +) + +def main(): + try: + # Test the model availability first + test_response = client.chat.completions.create( + model="deepseek/deepseek-chat-v3-0324:free", + messages=[{"role": "user", "content": "test"}] + ) + except Exception as e: + print(f"{Colors.RED}Error: Could not connect to the language model. Please try again later.{Colors.RESET}") + print(f"{Colors.RED}Details: {str(e)}{Colors.RESET}") + return + + url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}") + objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}") + + print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}") + + relevant_pages = find_relevant_page_via_map(objective, url, app, client) + + if not relevant_pages: + print(f"{Colors.RED}No relevant pages found. Exiting...{Colors.RESET}") + return + + result = find_objective_in_top_pages(relevant_pages, objective, app, client) + + if result: + print(f"{Colors.GREEN}Objective successfully found! 
Extracted information:{Colors.RESET}") + print(json.dumps(result, indent=2)) + else: + print(f"{Colors.RED}Objective could not be fulfilled.{Colors.RESET}") + +def find_relevant_page_via_map(objective, url, app, client): + try: + print(f"{Colors.CYAN}Understood. Objective: {objective}{Colors.RESET}") + print(f"{Colors.CYAN}Searching website: {url}{Colors.RESET}") + + map_prompt = f""" + The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else. + """ + + + response = client.chat.completions.create( + model="deepseek/deepseek-chat-v3-0324:free", + messages=[{"role": "user", "content": map_prompt}] + ) + map_search_parameter = response.choices[0].message.content.strip() + + print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}") + + map_website = app.map_url(url, params={"search": map_search_parameter}) + print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}") + + links = map_website.get('urls', []) or map_website.get('links', []) + + if not links: + print(f"{Colors.RED}No links found in map response.{Colors.RESET}") + return None + + return links + + except Exception as e: + print(f"{Colors.RED}Error encountered: {str(e)}{Colors.RESET}") + return None + +def find_objective_in_top_pages(pages, objective, app, client): + try: + for link in pages[:3]: + print(f"{Colors.YELLOW}Scraping page: {link}{Colors.RESET}") + scrape_result = app.scrape_url(link, params={'formats': ['markdown']}) + + check_prompt = f""" + Given the following scraped content and objective, determine if the objective is met. + If it is, extract the relevant information in a simple JSON format. + If the objective is not met, respond with exactly 'Objective not met'. 
+ + The JSON format should be: + {{ + "found": true, + "data": {{ + // extracted information here + }} + }} + + Important: Do not wrap the JSON in markdown code blocks. Just return the raw JSON. + + Objective: {objective} + Scraped content: {scrape_result['markdown']} + """ + + # Using OpenRouter's API to analyze the content + response = client.chat.completions.create( + model="deepseek/deepseek-chat-v3-0324:free", + messages=[{ + "role": "system", + "content": "You are a helpful assistant that extracts information from web pages. Always respond in valid JSON format when information is found. Do not wrap the JSON in markdown code blocks." + }, { + "role": "user", + "content": check_prompt + }] + ) + result = response.choices[0].message.content.strip() + + print(f"{Colors.CYAN}Model response: {result}{Colors.RESET}") # Debug output + + if result == "Objective not met": + print(f"{Colors.YELLOW}Objective not met in this page, continuing search...{Colors.RESET}") + continue + + try: + # Clean up the response if it's wrapped in code blocks + if result.startswith('```'): + result = result.split('```')[1] + if result.startswith('json'): + result = result[4:] + result = result.strip() + + parsed_result = json.loads(result) + if isinstance(parsed_result, dict) and parsed_result.get('found'): + return parsed_result.get('data') + else: + print(f"{Colors.YELLOW}Invalid response format, continuing search...{Colors.RESET}") + except json.JSONDecodeError as e: + print(f"{Colors.RED}Error parsing JSON response: {str(e)}{Colors.RESET}") + print(f"{Colors.RED}Raw response: {result}{Colors.RESET}") + continue + + return None + + except Exception as e: + print(f"{Colors.RED}Error encountered: {str(e)}{Colors.RESET}") + return None + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/deepseek-v3-crawler/requirements.txt b/examples/deepseek-v3-crawler/requirements.txt new file mode 100644 index 00000000..e179518c --- /dev/null +++ 
b/examples/deepseek-v3-crawler/requirements.txt @@ -0,0 +1,3 @@ +firecrawl==1.13.5 +python-dotenv==1.0.1 +openai>=1.0.0 \ No newline at end of file