Nick: resolved conflicts

2025-08-02 14:50:38 +08:00 · 2024-10-01 14:39:57 -03:00 · 2024-10-01 14:39:57 -03:00 · 4d5477f357
commit 4d5477f357
parent 3621e191bd 96245e387d
21 changed files with 388 additions and 36 deletions
--- a/apps/api/Dockerfile
+++ b/apps/api/Dockerfile
@ -37,4 +37,6 @@ COPY --from=go-base /app/src/lib/go-html-to-md/html-to-markdown.so /app/dist/src

 # Start the server by default, this can be overwritten at runtime
 EXPOSE 8080
-ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium"
+ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium"
+
+ENTRYPOINT "/app/docker-entrypoint.sh"
--- a/apps/api/docker-entrypoint.sh
+++ b/apps/api/docker-entrypoint.sh
@ -0,0 +1,19 @@
+#!/bin/bash -e
+# Container entrypoint: raise the open-file limit when running as root, then start the Node process.
+if [ "$UID" -eq 0 ]; then
+  ulimit -n 65535
+  echo "NEW ULIMIT: $(ulimit -n)"
+else
+  echo ENTRYPOINT DID NOT RUN AS ROOT
+fi
+# FLY_PROCESS_GROUP is quoted so an unset or empty value reaches the default branch without a `[` syntax error.
+if [ "$FLY_PROCESS_GROUP" = "app" ]; then
+  echo "RUNNING app"
+  node --max-old-space-size=8192 dist/src/index.js
+elif [ "$FLY_PROCESS_GROUP" = "worker" ]; then
+  echo "RUNNING worker"
+  node --max-old-space-size=8192 dist/src/services/queue-worker.js
+else
+  echo "NO FLY PROCESS GROUP"
+  node --max-old-space-size=8192 dist/src/index.js
+fi
--- a/apps/api/package.json
+++ b/apps/api/package.json
@ -67,6 +67,7 @@
    "async": "^3.2.5",
    "async-mutex": "^0.5.0",
    "axios": "^1.3.4",
+    "axios-retry": "^4.5.0",
    "bottleneck": "^2.19.5",
    "bullmq": "^5.11.0",
    "cacheable-lookup": "^6.1.0",
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@ -65,6 +65,9 @@ importers:
      axios:
        specifier: ^1.3.4
        version: 1.7.2
+      axios-retry:
+        specifier: ^4.5.0
+        version: 4.5.0(axios@1.7.2)
      bottleneck:
        specifier: ^2.19.5
        version: 2.19.5
@ -1903,6 +1906,11 @@ packages:
  axios-retry@3.9.1:
    resolution: {integrity: sha512-8PJDLJv7qTTMMwdnbMvrLYuvB47M81wRtxQmEdV5w4rgbTXTt+vtPkXwajOfOdSyv/wZICJOC+/UhXH4aQ/R+w==}

+  axios-retry@4.5.0:
+    resolution: {integrity: sha512-aR99oXhpEDGo0UuAlYcn2iGRds30k366Zfa05XWScR9QaQD4JYiP3/1Qt1u7YlefUOK+cn0CcwoL1oefavQUlQ==}
+    peerDependencies:
+      axios: 0.x || 1.x
+
  axios@0.26.1:
    resolution: {integrity: sha512-fPwcX4EvnSHuInCMItEhAGnaSEXRBjtzh9fOtsE6E1G6p7vl7edEeZe11QHf18+6+9gR5PbKV/sGKNaD8YaMeA==}

@ -4518,8 +4526,8 @@ packages:
    engines: {node: '>=14.17'}
    hasBin: true

-  typescript@5.5.4:
-    resolution: {integrity: sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==}
+  typescript@5.6.2:
+    resolution: {integrity: sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==}
    engines: {node: '>=14.17'}
    hasBin: true

@ -6950,6 +6958,11 @@ snapshots:
      '@babel/runtime': 7.24.6
      is-retry-allowed: 2.2.0

+  axios-retry@4.5.0(axios@1.7.2):
+    dependencies:
+      axios: 1.7.2
+      is-retry-allowed: 2.2.0
+
  axios@0.26.1:
    dependencies:
      follow-redirects: 1.15.6
@ -9195,7 +9208,7 @@ snapshots:
      csv-parse: 5.5.6
      gpt3-tokenizer: 1.1.5
      openai: 3.3.0
-      typescript: 5.5.4
+      typescript: 5.6.2
      uuid: 9.0.1
      zod: 3.23.8
    transitivePeerDependencies:
@ -9793,7 +9806,7 @@ snapshots:

  typescript@5.4.5: {}

-  typescript@5.5.4: {}
+  typescript@5.6.2: {}

  typesense@1.8.2(@babel/runtime@7.24.6):
    dependencies:
--- a/apps/api/src/controllers/auth.ts
+++ b/apps/api/src/controllers/auth.ts
@ -61,11 +61,10 @@ export async function setCachedACUC(api_key: string, acuc: AuthCreditUsageChunk

      // Cache for 10 minutes. This means that changing subscription tier could have
      // a maximum of 10 minutes of a delay. - mogery
-      await setValue(cacheKeyACUC, JSON.stringify(acuc), 600);
+      await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
    });
  } catch (error) {
-    Logger.error(`Error updating cached ACUC: ${error}`);
-    Sentry.captureException(error);
+    Logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
  }
 }

--- a/apps/api/src/controllers/v0/scrape.ts
+++ b/apps/api/src/controllers/v0/scrape.ts
@ -60,8 +60,8 @@ export async function scrapeHelper(
      mode: "single_urls",
      crawlerOptions,
      team_id,
-      plan,
      pageOptions,
+      plan,
      extractorOptions,
      origin: req.body.origin ?? defaultOrigin,
      is_scrape: true,
@ -197,7 +197,7 @@ export async function scrapeController(req: Request, res: Response) {
        await checkTeamCredits(chunk, team_id, 1);
      if (!creditsCheckSuccess) {
        earlyReturn = true;
-        return res.status(402).json({ error: "Insufficient credits" });
+        return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" });
      }
    } catch (error) {
      Logger.error(error);
--- a/apps/api/src/controllers/v0/search.ts
+++ b/apps/api/src/controllers/v0/search.ts
@ -37,7 +37,12 @@ export async function searchHelper(

  const tbs = searchOptions.tbs ?? null;
  const filter = searchOptions.filter ?? null;
-  const num_results = searchOptions.limit ?? 7;
+  let num_results = Math.min(searchOptions.limit ?? 7, 10);
+
+  if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
+    num_results = 1;
+  }
+
  const num_results_buffer = Math.floor(num_results * 1.5);

  let res = await search({
@ -98,7 +103,7 @@ export async function searchHelper(
  if (Sentry.isInitialized()) {
    for (const job of jobDatas) {
      // add with sentry instrumentation
-      jobs.push(await addScrapeJob(job.data as any, {}, job.opts.jobId));
+      jobs.push(await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority));
    }
  } else {
    jobs = await getScrapeQueue().addBulk(jobDatas);
@ -170,7 +175,7 @@ export async function searchController(req: Request, res: Response) {
      jobId,
      req,
      team_id,
-      chunk.sub_id,
+      chunk?.sub_id,
      crawlerOptions,
      pageOptions,
      searchOptions,
--- a/apps/api/src/controllers/v1/crawl-status.ts
+++ b/apps/api/src/controllers/v1/crawl-status.ts
@ -86,8 +86,8 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
      }
    }

-    // if we ran over the bytes limit, remove the last document
-    if (bytes > bytesLimit) {
+    // if we ran over the bytes limit, remove the last document, except if it's the only document
+    if (bytes > bytesLimit && doneJobs.length !== 1) {
      doneJobs.splice(doneJobs.length - 1, 1);
    }
  } else {
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@ -152,7 +152,7 @@ export async function mapController(
  // remove duplicates that could be due to http/https or www
  links = removeDuplicateUrls(links);

-  billTeam(req.auth.team_id, req.acuc.sub_id, 1).catch((error) => {
+  billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
    Logger.error(
      `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
    );
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@ -109,7 +109,7 @@ export async function scrapeController(
    creditsToBeBilled = 5;
  }

-  billTeam(req.auth.team_id, req.acuc.sub_id, creditsToBeBilled).catch(error => {
+  billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
    Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
    // Optionally, you could notify an admin or add to a retry queue here
  });
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@ -390,6 +390,7 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
    generateImgAltText: false,
    allowBackwardCrawling: x.allowBackwardLinks,
    allowExternalContentLinks: x.allowExternalLinks,
+    ignoreSitemap: x.ignoreSitemap,
  };
 }

--- a/apps/api/src/lib/job-priority.ts
+++ b/apps/api/src/lib/job-priority.ts
@ -37,6 +37,10 @@ export async function getJobPriority({
  team_id: string;
  basePriority?: number;
 }): Promise<number> {
+  if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
+    return 50;
+  }
+
  try {
    const setKey = SET_KEY_PREFIX + team_id;

--- a/apps/api/src/lib/logger.ts
+++ b/apps/api/src/lib/logger.ts
@ -19,7 +19,7 @@ export class Logger {
  };

  static log (message: string, level: LogLevel) {
-    const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.INFO;
+    const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.TRACE;
    const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
    const currentLevelIndex = levels.indexOf(logLevel);
    const messageLevelIndex = levels.indexOf(level);
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@ -35,7 +35,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
            if (!success) {
                Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
                if (!res.headersSent) {
-                    return res.status(402).json({ success: false, error: "Insufficient credits" });
+                    return res.status(402).json({ success: false, error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing." });
                }
            }
            req.account = { remainingCredits };
--- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
@ -6,7 +6,11 @@ import { fetchAndProcessPdf } from "../utils/pdfProcessor";
 import { universalTimeout } from "../global";
 import { Logger } from "../../../lib/logger";
 import * as Sentry from "@sentry/node";
+import axiosRetry from 'axios-retry';

+axiosRetry(axios, { retries: 3 , onRetry:()=>{
+  console.log("Retrying (fire-engine)...");
+}, retryDelay: axiosRetry.exponentialDelay});
 /**
 * Scrapes a URL with Fire-Engine
 * @param url The URL to scrape
@ -203,10 +207,10 @@ export async function scrapWithFireEngine({
    }
  } catch (error) {
    if (error.code === "ECONNABORTED") {
-      Logger.debug(`⛏️ Fire-Engine: Request timed out for ${url}`);
+      Logger.debug(`⛏️ Fire-Engine (catch block): Request timed out for ${url}`);
      logParams.error_message = "Request timed out";
    } else {
-      Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
+      Logger.debug(`⛏️ Fire-Engine(catch block): Failed to fetch url: ${url} | Error: ${error}`);
      logParams.error_message = error.message || error;
    }
    return { html: "", pageStatusCode: null, pageError: logParams.error_message };
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -29,8 +29,8 @@ const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIR

 export const baseScrapers = [
  useFireEngine ? "fire-engine;chrome-cdp" : undefined,
-  useScrapingBee ? "scrapingBee" : undefined,
  useFireEngine ? "fire-engine" : undefined,
+  useScrapingBee ? "scrapingBee" : undefined,
  useFireEngine ? undefined : "playwright",
  useScrapingBee ? "scrapingBeeLoad" : undefined,
  "fetch",
@ -95,8 +95,8 @@ function getScrapingFallbackOrder(

  let defaultOrder = [
    useFireEngine ? "fire-engine;chrome-cdp" : undefined,
-    useScrapingBee ? "scrapingBee" : undefined,
    useFireEngine ? "fire-engine" : undefined,
+    useScrapingBee ? "scrapingBee" : undefined,
    useScrapingBee ? "scrapingBeeLoad" : undefined,
    useFireEngine ? undefined : "playwright",
    "fetch",
@ -424,7 +424,7 @@ export async function scrapSingleUrl(
        Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
        break;
      }
-      if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
+      if (pageStatusCode && (pageStatusCode == 404)) {
        Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`);
        break;
      }
--- a/apps/api/src/scraper/WebScraper/utils/tests/removeUnwantedElements.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/tests/removeUnwantedElements.test.ts
--- a/apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts
+++ b/apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts
@ -1,30 +1,31 @@
-import cheerio, { AnyNode, Cheerio } from "cheerio";
+import { AnyNode, Cheerio, load } from "cheerio";
 import { PageOptions } from "../../../lib/entities";
 import { excludeNonMainTags } from "./excludeTags";

 export const removeUnwantedElements = (
  html: string,
-  pageOptions: PageOptions
+  pageOptions: PageOptions,
 ) => {
-  const soup = cheerio.load(html);
+  let soup = load(html);

  if (
    pageOptions.onlyIncludeTags &&
    pageOptions.onlyIncludeTags.length > 0 &&
-    pageOptions.onlyIncludeTags[0] !== ''
+    pageOptions.onlyIncludeTags[0] !== ""
  ) {
    if (typeof pageOptions.onlyIncludeTags === "string") {
      pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
    }
    if (pageOptions.onlyIncludeTags.length !== 0) {
      // Create a new root element to hold the tags to keep
-      const newRoot = cheerio.load("<div></div>")("div");
+      const newRoot = load("<div></div>")("div");
      pageOptions.onlyIncludeTags.forEach((tag) => {
        soup(tag).each((index, element) => {
          newRoot.append(soup(element).clone());
        });
      });
-      return newRoot.html();
+
+      soup = load(newRoot.html());
    }
  }

@ -33,7 +34,7 @@ export const removeUnwantedElements = (
  if (
    pageOptions.removeTags &&
    pageOptions.removeTags.length > 0 &&
-    pageOptions.removeTags[0] !== ''
+    pageOptions.removeTags[0] !== ""
  ) {
    if (typeof pageOptions.removeTags === "string") {
      pageOptions.removeTags = [pageOptions.removeTags];
@ -51,11 +52,11 @@ export const removeUnwantedElements = (
              const attributes = element.attribs;
              const tagNameMatches = regexPattern.test(element.name);
              const attributesMatch = Object.keys(attributes).some((attr) =>
-                regexPattern.test(`${attr}="${attributes[attr]}"`)
+                regexPattern.test(`${attr}="${attributes[attr]}"`),
              );
              if (tag.startsWith("*.")) {
                classMatch = Object.keys(attributes).some((attr) =>
-                  regexPattern.test(`class="${attributes[attr]}"`)
+                  regexPattern.test(`class="${attributes[attr]}"`),
                );
              }
              return tagNameMatches || attributesMatch || classMatch;
--- a/apps/api/src/services/billing/credit_billing.ts
+++ b/apps/api/src/services/billing/credit_billing.ts
@ -66,7 +66,7 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id:
      chunk.sub_current_period_start,
      chunk.sub_current_period_end
    );
-    return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: chunk.remaining_credits, chunk };
+    return { success: false, message: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", remainingCredits: chunk.remaining_credits, chunk };
  } else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
    // Send email notification for approaching credit limit
    sendNotification(
--- a/apps/api/src/services/redis.ts
+++ b/apps/api/src/services/redis.ts
@ -35,12 +35,15 @@ redisRateLimitClient.on("connect", (err) => {
 * @param {string} value The value to store.
 * @param {number} [expire] Optional expiration time in seconds.
 */
-const setValue = async (key: string, value: string, expire?: number) => {
-  if (expire) {
+const setValue = async (key: string, value: string, expire?: number, nx = false) => {
+  if (expire && !nx) {
    await redisRateLimitClient.set(key, value, "EX", expire);
  } else {
    await redisRateLimitClient.set(key, value);
  }
+  if (expire && nx) {
+    await redisRateLimitClient.expire(key, expire, "NX");
+  }
 };

 /**
--- a/examples/o1_job_recommender/o1_job_recommender.py
+++ b/examples/o1_job_recommender/o1_job_recommender.py
@ -0,0 +1,283 @@
+# %%
+# %%
+import os
+import requests
+import json
+from dotenv import load_dotenv
+from openai import OpenAI
+
+class Colors:
+    """ANSI escape sequences for coloured terminal output; RESET is printed after each coloured message."""
+    RED = '\033[91m'
+    GREEN = '\033[92m'
+    YELLOW = '\033[93m'
+    BLUE = '\033[94m'
+    MAGENTA = '\033[95m'
+    CYAN = '\033[96m'
+    RESET = '\033[0m'
+# Load environment variables (load_dotenv is imported from the dotenv package above)
+load_dotenv()
+
+# Read the Firecrawl API key and create the OpenAI client, both from environment variables
+firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+# Set the jobs page URL (the first page this script scrapes)
+jobs_page_url = "https://openai.com/careers/search"
+
+# Resume text, interpolated verbatim into the final recommendation prompt
+resume_paste = """
+Eric Ciarla
+Co-Founder @ Firecrawl
+San Francisco, California, United States
+Summary
+Building…
+Experience
+Firecrawl
+Co-Founder
+April 2024 - Present (6 months)
+San Francisco, California, United States
+Firecrawl by Mendable. Building data extraction infrastructure for AI. Used by
+Amazon, Zapier, and Nvidia (YC S22)
+Mendable
+2 years 7 months
+Co-Founder @ Mendable.ai
+March 2022 - Present (2 years 7 months)
+San Francisco, California, United States
+- Built an AI powered search platform that that served millions of queries for
+hundreds of customers (YC S22)
+- We were one of the first LLM powered apps adopted by industry leaders like
+Coinbase, Snap, DoorDash, and MongoDB
+Co-Founder @ SideGuide
+March 2022 - Present (2 years 7 months)
+San Francisco, California, United States
+- Built and scaled an online course platform with a community of over 50,000
+developers
+- Selected for Y Combinator S22 batch, 2% acceptance rate
+Fracta
+Data Engineer
+2022 - 2022 (less than a year)
+Palo Alto, California, United States
+- Demoed tool during sales calls and provided technical support during the
+entire customer lifecycle
+Page 1 of 2
+- Mined, wrangled, & visualized geospatial and water utility data for predictive
+analytics & ML workflows (Python, QGIS)
+Ford Motor Company
+Data Scientist
+2021 - 2021 (less than a year)
+Dearborn, Michigan, United States
+- Extracted, cleaned, and joined data from multiple sources using SQL,
+Hadoop, and Alteryx
+- Used Bayesian Network Structure Learning (BNLearn, R) to uncover the
+relationships between survey free response verbatim topics (derived from
+natural language processing models) and numerical customer experience
+scores
+MDRemindME
+Co-Founder
+2018 - 2020 (2 years)
+Durham, New Hampshire, United States
+- Founded and led a healthtech startup aimed at improving patient adherence
+to treatment plans through an innovative engagement and retention tool
+- Piloted the product with healthcare providers and patients, gathering critical
+insights to refine functionality and enhance user experience
+- Secured funding through National Science Foundation I-CORPS Grant and
+UNH Entrepreneurship Center Seed Grant
+Education
+Y Combinator
+S22
+University of New Hampshire
+Economics and Philosophy
+"""
+
+# First, scrape the jobs page using Firecrawl.
+# html_content stays "" unless the request succeeds and the prompt is built.
+html_content = ""
+scrape_headers = {
+    "Content-Type": "application/json",
+    "Authorization": f"Bearer {firecrawl_api_key}"
+}
+scrape_payload = {
+    "url": jobs_page_url,
+    "formats": ["markdown"]
+}
+try:
+    response = requests.post(
+        "https://api.firecrawl.dev/v1/scrape",
+        headers=scrape_headers,
+        json=scrape_payload
+    )
+    if response.status_code != 200:
+        print(f"{Colors.RED}Error {response.status_code}: {response.text}{Colors.RESET}")
+    else:
+        result = response.json()
+        if not result.get('success'):
+            print(f"{Colors.RED}Failed to scrape the jobs page: {result.get('message', 'Unknown error')}{Colors.RESET}")
+        else:
+            page_markdown = result['data']['markdown']
+            # Prompt for the link-extraction step; only the first 100000 characters of markdown are included
+            prompt = f"""
+            Extract up to 30 job application links from the given markdown content.
+            Return the result as a JSON object with a single key 'apply_links' containing an array of strings (the links).
+            The output should be a valid JSON object, with no additional text.
+            Do not include any JSON markdown formatting or code block indicators.
+            Provide only the raw JSON object as the response.
+
+            Example of the expected format:
+            {{"apply_links": ["https://example.com/job1", "https://example.com/job2", ...]}}
+
+            Markdown content:
+            {page_markdown[:100000]}
+            """
+            print(f"{Colors.GREEN}Successfully scraped the jobs page{Colors.RESET}")
+            # Publish the markdown only once the prompt has been built without error
+            html_content = page_markdown
+except requests.RequestException as e:
+    print(f"{Colors.RED}An error occurred while scraping: {str(e)}{Colors.RESET}")
+except json.JSONDecodeError as e:
+    print(f"{Colors.RED}Error decoding JSON response: {str(e)}{Colors.RESET}")
+except Exception as e:
+    print(f"{Colors.RED}An unexpected error occurred while scraping: {str(e)}{Colors.RESET}")
+
+# Ask the gpt-4o chat model to extract the apply links from the scraped markdown
+apply_links = []
+if not html_content:
+    print(f"{Colors.RED}No HTML content to process{Colors.RESET}")
+else:
+    try:
+        completion = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ]
+        )
+        if not completion.choices:
+            print(f"{Colors.RED}No apply links extracted{Colors.RESET}")
+        else:
+            reply_text = completion.choices[0].message.content
+            print(reply_text)
+            # The prompt asks for a bare JSON object with an 'apply_links' array
+            result = json.loads(reply_text.strip())
+            apply_links = result['apply_links']
+            print(f"{Colors.GREEN}Successfully extracted {len(apply_links)} apply links{Colors.RESET}")
+    except json.JSONDecodeError as e:
+        print(f"{Colors.RED}Error decoding JSON from OpenAI response: {str(e)}{Colors.RESET}")
+    except KeyError as e:
+        print(f"{Colors.RED}Expected key not found in OpenAI response: {str(e)}{Colors.RESET}")
+    except Exception as e:
+        print(f"{Colors.RED}An unexpected error occurred during extraction: {str(e)}{Colors.RESET}")
+
+# Collects the structured data extracted from each job page
+extracted_data = []
+
+
+# %%
+print(f"{Colors.CYAN}Apply links:{Colors.RESET}")
+for apply_link in apply_links:
+    print(f"{Colors.YELLOW}{apply_link}{Colors.RESET}")
+
+# %%
+# Scrape each apply link with a structured-extract schema; failures are reported and the loop continues
+for index, link in enumerate(apply_links):
+    try:
+        response = requests.post(
+            "https://api.firecrawl.dev/v1/scrape",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {firecrawl_api_key}"
+            },
+            json={
+                "url": link,
+                "formats": ["extract"],
+                "actions": [{
+                    "type": "click",
+                    "selector": "#job-overview"
+                }],
+                "extract": {
+                    "schema": {
+                        "type": "object",
+                        "properties": {
+                            "job_title": {"type": "string"},
+                            "sub_division_of_organization": {"type": "string"},
+                            "key_skills": {"type": "array", "items": {"type": "string"}},
+                            "compensation": {"type": "string"},
+                            "location": {"type": "string"},
+                            "apply_link": {"type": "string"}
+                        },
+                        "required": ["job_title", "sub_division_of_organization", "key_skills", "compensation", "location", "apply_link"]
+                    }
+                }
+            }
+        )
+        
+        if response.status_code == 200:
+            result = response.json()
+            if result.get('success'):
+                extracted_data.append(result['data']['extract'])
+                print(f"{Colors.GREEN}Data extracted for job {index}{Colors.RESET}")
+            else:
+                print(f"{Colors.RED}Failed to extract data for job {index}: {result.get('error', 'Unknown error')}{Colors.RESET}")
+        else:
+            print(f"{Colors.RED}Error {response.status_code} for job {index}: {response.text}{Colors.RESET}")
+    except Exception as e:
+        print(f"{Colors.RED}An error occurred while processing job {index}: {str(e)}{Colors.RESET}")
+
+
+# %%
+# %%
+# Dump every extracted job as indented JSON, each followed by a 50-dash magenta rule
+print(f"{Colors.CYAN}Extracted data:{Colors.RESET}")
+for job_record in extracted_data:
+    rule = f"{Colors.MAGENTA}{'-' * 50}{Colors.RESET}"
+    print(json.dumps(job_record, indent=2), rule, sep="\n")
+
+
+# %%
+# Use o1-preview to choose which jobs should be applied to based on the resume.
+# Failures are reported in the same style as the earlier steps instead of ending in a traceback;
+# recommended_jobs is then left as an empty list.
+prompt = f"""
+Please analyze the resume and job listings, and return a JSON list of the top 3 roles that best fit the candidate's experience and skills. Include only the job title, compensation, and apply link for each recommended role. The output should be a valid JSON array of objects in the following format, with no additional text:
+
+[
+  {{
+    "job_title": "Job Title",
+    "compensation": "Compensation (if available, otherwise empty string)",
+    "apply_link": "Application URL"
+  }},
+  ...
+]
+
+Based on the following resume:
+{resume_paste}
+
+And the following job listings:
+{json.dumps(extracted_data, indent=2)}
+"""
+
+recommended_jobs = []
+try:
+    completion = client.chat.completions.create(
+        model="o1-preview",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt}
+                ]
+            }
+        ]
+    )
+    recommended_jobs = json.loads(completion.choices[0].message.content.strip())
+except json.JSONDecodeError as e:
+    print(f"{Colors.RED}Error decoding JSON from OpenAI response: {str(e)}{Colors.RESET}")
+except Exception as e:
+    print(f"{Colors.RED}An unexpected error occurred during recommendation: {str(e)}{Colors.RESET}")
+
+print(f"{Colors.CYAN}Recommended jobs:{Colors.RESET}")
+print(json.dumps(recommended_jobs, indent=2))
+
+