Add examples/o4-mini web crawler

Aparup Ganguly 2025-04-22 22:07:39 +05:30
parent cf84324534
commit 6920b85ee1
5 changed files with 422 additions and 0 deletions

examples/o4-mini-web-crawler/.env.example

@@ -0,0 +1,2 @@
FIRECRAWL_API_KEY=your_firecrawl_api_key_here
OPENAI_API_KEY=your_openai_api_key_here

examples/o4-mini-web-crawler/.gitignore

@@ -0,0 +1,116 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# IDE files
.idea/
.vscode/
*.swp
*.swo
# OS generated files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

examples/o4-mini-web-crawler/README.md

@@ -0,0 +1,61 @@
# O4 Mini Web Crawler
A simple web crawler that uses Firecrawl and OpenAI's o4-mini model to search websites based on user objectives.
## Features
- Maps websites to find relevant URLs
- Uses AI to rank URLs by relevance to the objective
- Scrapes content and analyzes it with o4-mini
- Returns structured data when objectives are met
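Under the hood, these steps reduce to a few Firecrawl and OpenAI calls. A minimal sketch of the flow, with a placeholder site, search term, and objective (the full script picks the search term and ranks candidate pages with o4-mini before scraping):
```
import os
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from openai import OpenAI

load_dotenv()
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# 1. Map the site for candidate URLs matching a search term
pages = app.map_url("https://example.com", params={"search": "contact"})

# 2. Scrape one candidate page as markdown
scraped = app.scrape_url("https://example.com/contact", params={"formats": ["markdown"]})

# 3. Ask o4-mini whether the scraped content satisfies the objective
completion = client.chat.completions.create(
    model="o4-mini",
    messages=[{
        "role": "user",
        "content": f"Objective: find the HQ address.\n\nContent:\n{scraped['markdown']}",
    }],
)
print(completion.choices[0].message.content)
```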
## Prerequisites
- Python 3.6+
- Firecrawl API key
- OpenAI API key
## Installation
1. Clone this repository
2. Install the required packages:
```
pip install -r requirements.txt
```
3. Copy `.env.example` to `.env` and fill in your API keys:
```
cp .env.example .env
```
## Usage
Run the script:
```
python o4-mini-web-crawler.py
```
You will be prompted to:
1. Enter a website URL to crawl
2. Define your objective (what information you're looking for)
The crawler will then:
- Map the website to find relevant URLs
- Rank the most relevant pages
- Scrape and analyze the content
- Return structured data if the objective is met
## Example
```
Enter the website to crawl: https://example.com
Enter your objective: Find the company's headquarters address
```
The crawler will search for pages likely to contain this information, analyze them, and return the address in a structured format.
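If the objective is met, the extracted data is printed as flat JSON. An illustrative result for the objective above (the field names are hypothetical; the model chooses them to fit the objective):
```
{
  "company": "Example Corp",
  "headquarters_address": "123 Example Street, Springfield, USA"
}
```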
## License
[MIT](LICENSE)

examples/o4-mini-web-crawler/o4-mini-web-crawler.py

@@ -0,0 +1,240 @@
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
from openai import OpenAI
# ANSI color codes
class Colors:
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'
# Load environment variables
load_dotenv()
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")
# Initialize the FirecrawlApp and OpenAI client
app = FirecrawlApp(api_key=firecrawl_api_key)
client = OpenAI(api_key=openai_api_key)
# Find the pages that most likely contain the objective
def find_relevant_page_via_map(objective, url, app, client):
    try:
        print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
        print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")

        map_prompt = f"""
        The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words, nothing else.
        """

        # Ask o4-mini for a short search term to pass to Firecrawl's map endpoint
        print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
        completion = client.chat.completions.create(
            model="o4-mini",
            messages=[
                {
                    "role": "user",
                    "content": [{"type": "text", "text": map_prompt}],
                }
            ],
        )

        map_search_parameter = completion.choices[0].message.content
        print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}")

        print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
        map_website = app.map_url(url, params={"search": map_search_parameter})
        print(f"{Colors.MAGENTA}Debug - Map response structure: {json.dumps(map_website, indent=2)}{Colors.RESET}")
        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")

        # Handle the response based on its structure (dict, JSON string, or bare list)
        if isinstance(map_website, dict):
            links = map_website.get('urls', []) or map_website.get('links', [])
        elif isinstance(map_website, str):
            try:
                parsed = json.loads(map_website)
                links = parsed.get('urls', []) or parsed.get('links', [])
            except json.JSONDecodeError:
                links = []
        else:
            links = map_website if isinstance(map_website, list) else []

        if not links:
            print(f"{Colors.RED}No links found in map response.{Colors.RESET}")
            return None

        rank_prompt = f"""
        Given this list of URLs and the objective: {objective}
        Analyze each URL and rank the top 3 most relevant ones that are most likely to contain the information we need.

        Return your response as a JSON array with exactly 3 objects, each containing:
        - "url": the full URL
        - "relevance_score": number between 0-100 indicating relevance to the objective
        - "reason": brief explanation of why this URL is relevant

        Example output:
        [
            {{
                "url": "https://example.com/about",
                "relevance_score": 95,
                "reason": "Main about page containing company information"
            }},
            {{
                "url": "https://example.com/team",
                "relevance_score": 80,
                "reason": "Team page with leadership details"
            }},
            {{
                "url": "https://example.com/contact",
                "relevance_score": 70,
                "reason": "Contact page with location information"
            }}
        ]

        URLs to analyze:
        {json.dumps(links, indent=2)}
        """

        print(f"{Colors.YELLOW}Ranking URLs by relevance to objective...{Colors.RESET}")
        completion = client.chat.completions.create(
            model="o4-mini",
            messages=[
                {
                    "role": "user",
                    "content": [{"type": "text", "text": rank_prompt}],
                }
            ],
        )

        try:
            ranked_results = json.loads(completion.choices[0].message.content)
            links = [result["url"] for result in ranked_results]

            # Print detailed ranking info
            print(f"{Colors.CYAN}Top 3 ranked URLs:{Colors.RESET}")
            for result in ranked_results:
                print(f"{Colors.GREEN}URL: {result['url']}{Colors.RESET}")
                print(f"{Colors.YELLOW}Relevance Score: {result['relevance_score']}{Colors.RESET}")
                print(f"{Colors.BLUE}Reason: {result['reason']}{Colors.RESET}")
                print("---")

            if not links:
                print(f"{Colors.RED}No relevant links identified.{Colors.RESET}")
                return None
        except (json.JSONDecodeError, KeyError) as e:
            print(f"{Colors.RED}Error parsing ranked results: {str(e)}{Colors.RESET}")
            return None

        print(f"{Colors.GREEN}Located {len(links)} relevant links.{Colors.RESET}")
        return links

    except Exception as e:
        print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
        return None
# Scrape the top-ranked pages and check whether the objective is met
def find_objective_in_top_pages(map_website, objective, app, client):
    try:
        # Get the top 3 links from the map result
        if not map_website:
            print(f"{Colors.RED}No links found to analyze.{Colors.RESET}")
            return None

        top_links = map_website[:3]
        print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")

        for link in top_links:
            print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}")
            # Scrape the page as markdown
            scrape_result = app.scrape_url(link, params={'formats': ['markdown']})
            print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}")

            check_prompt = f"""
            Given the following scraped content and objective, determine if the objective is met.
            If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
            If the objective is not met with confidence, respond with 'Objective not met'.

            Objective: {objective}
            Scraped content: {scrape_result['markdown']}

            Remember:
            1. Only return JSON if you are confident the objective is fully met.
            2. Keep the JSON structure as simple and flat as possible.
            3. Do not include any explanations or markdown formatting in your response.
            """

            completion = client.chat.completions.create(
                model="o4-mini",
                messages=[
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": check_prompt}],
                    }
                ],
            )

            result = completion.choices[0].message.content
            if result.strip() != "Objective not met":
                print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
                try:
                    return json.loads(result)
                except json.JSONDecodeError:
                    print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
            else:
                print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")

        print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
        return None

    except Exception as e:
        print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
        return None
# Main function to execute the process
def main():
    url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")

    print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
    # Find the relevant pages
    map_website = find_relevant_page_via_map(objective, url, app, client)

    if map_website:
        print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis using o4-mini...{Colors.RESET}")
        # Find objective in top pages
        result = find_objective_in_top_pages(map_website, objective, app, client)

        if result:
            print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
            print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}")
        else:
            print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
    else:
        print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")

if __name__ == "__main__":
    main()

examples/o4-mini-web-crawler/requirements.txt

@@ -0,0 +1,3 @@
firecrawl==1.0.0
openai==1.16.0
python-dotenv==1.0.0