Nick: resolved conflicts
commit 4d5477f357
@@ -37,4 +37,6 @@ COPY --from=go-base /app/src/lib/go-html-to-md/html-to-markdown.so /app/dist/src
 
 # Start the server by default, this can be overwritten at runtime
 EXPOSE 8080
 ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium"
+
+ENTRYPOINT "/app/docker-entrypoint.sh"
19  apps/api/docker-entrypoint.sh  Executable file
@@ -0,0 +1,19 @@
+#!/bin/bash -e
+
+if [ $UID -eq 0 ]; then
+    ulimit -n 65535
+    echo "NEW ULIMIT: $(ulimit -n)"
+else
+    echo ENTRYPOINT DID NOT RUN AS ROOT
+fi
+
+if [ $FLY_PROCESS_GROUP = "app" ]; then
+    echo "RUNNING app"
+    node --max-old-space-size=8192 dist/src/index.js
+elif [ $FLY_PROCESS_GROUP = "worker" ]; then
+    echo "RUNNING worker"
+    node --max-old-space-size=8192 dist/src/services/queue-worker.js
+else
+    echo "NO FLY PROCESS GROUP"
+    node --max-old-space-size=8192 dist/src/index.js
+fi
@@ -67,6 +67,7 @@
     "async": "^3.2.5",
     "async-mutex": "^0.5.0",
     "axios": "^1.3.4",
+    "axios-retry": "^4.5.0",
     "bottleneck": "^2.19.5",
     "bullmq": "^5.11.0",
     "cacheable-lookup": "^6.1.0",
21  apps/api/pnpm-lock.yaml  generated
@@ -65,6 +65,9 @@ importers:
       axios:
         specifier: ^1.3.4
         version: 1.7.2
+      axios-retry:
+        specifier: ^4.5.0
+        version: 4.5.0(axios@1.7.2)
       bottleneck:
         specifier: ^2.19.5
         version: 2.19.5
@@ -1903,6 +1906,11 @@ packages:
   axios-retry@3.9.1:
     resolution: {integrity: sha512-8PJDLJv7qTTMMwdnbMvrLYuvB47M81wRtxQmEdV5w4rgbTXTt+vtPkXwajOfOdSyv/wZICJOC+/UhXH4aQ/R+w==}
 
+  axios-retry@4.5.0:
+    resolution: {integrity: sha512-aR99oXhpEDGo0UuAlYcn2iGRds30k366Zfa05XWScR9QaQD4JYiP3/1Qt1u7YlefUOK+cn0CcwoL1oefavQUlQ==}
+    peerDependencies:
+      axios: 0.x || 1.x
+
   axios@0.26.1:
     resolution: {integrity: sha512-fPwcX4EvnSHuInCMItEhAGnaSEXRBjtzh9fOtsE6E1G6p7vl7edEeZe11QHf18+6+9gR5PbKV/sGKNaD8YaMeA==}
 
@@ -4518,8 +4526,8 @@ packages:
     engines: {node: '>=14.17'}
     hasBin: true
 
-  typescript@5.5.4:
-    resolution: {integrity: sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==}
+  typescript@5.6.2:
+    resolution: {integrity: sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==}
     engines: {node: '>=14.17'}
     hasBin: true
 
@@ -6950,6 +6958,11 @@ snapshots:
       '@babel/runtime': 7.24.6
       is-retry-allowed: 2.2.0
 
+  axios-retry@4.5.0(axios@1.7.2):
+    dependencies:
+      axios: 1.7.2
+      is-retry-allowed: 2.2.0
+
   axios@0.26.1:
     dependencies:
       follow-redirects: 1.15.6
@@ -9195,7 +9208,7 @@ snapshots:
       csv-parse: 5.5.6
       gpt3-tokenizer: 1.1.5
       openai: 3.3.0
-      typescript: 5.5.4
+      typescript: 5.6.2
       uuid: 9.0.1
       zod: 3.23.8
     transitivePeerDependencies:
@@ -9793,7 +9806,7 @@ snapshots:
 
   typescript@5.4.5: {}
 
-  typescript@5.5.4: {}
+  typescript@5.6.2: {}
 
   typesense@1.8.2(@babel/runtime@7.24.6):
     dependencies:
@@ -61,11 +61,10 @@ export async function setCachedACUC(api_key: string, acuc: AuthCreditUsageChunk
 
       // Cache for 10 minutes. This means that changing subscription tier could have
       // a maximum of 10 minutes of a delay. - mogery
-      await setValue(cacheKeyACUC, JSON.stringify(acuc), 600);
+      await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
     });
   } catch (error) {
-    Logger.error(`Error updating cached ACUC: ${error}`);
-    Sentry.captureException(error);
+    Logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
   }
 }
 
@@ -60,8 +60,8 @@ export async function scrapeHelper(
     mode: "single_urls",
     crawlerOptions,
     team_id,
-    plan,
     pageOptions,
+    plan,
     extractorOptions,
     origin: req.body.origin ?? defaultOrigin,
     is_scrape: true,
@@ -197,7 +197,7 @@ export async function scrapeController(req: Request, res: Response) {
       await checkTeamCredits(chunk, team_id, 1);
       if (!creditsCheckSuccess) {
         earlyReturn = true;
-        return res.status(402).json({ error: "Insufficient credits" });
+        return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" });
       }
     } catch (error) {
       Logger.error(error);
@@ -37,7 +37,12 @@ export async function searchHelper(
 
   const tbs = searchOptions.tbs ?? null;
   const filter = searchOptions.filter ?? null;
-  const num_results = searchOptions.limit ?? 7;
+  let num_results = Math.min(searchOptions.limit ?? 7, 10);
+
+  if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
+    num_results = 1;
+  }
+
   const num_results_buffer = Math.floor(num_results * 1.5);
 
   let res = await search({
@@ -98,7 +103,7 @@ export async function searchHelper(
   if (Sentry.isInitialized()) {
     for (const job of jobDatas) {
       // add with sentry instrumentation
-      jobs.push(await addScrapeJob(job.data as any, {}, job.opts.jobId));
+      jobs.push(await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority));
     }
   } else {
     jobs = await getScrapeQueue().addBulk(jobDatas);
@@ -170,7 +175,7 @@ export async function searchController(req: Request, res: Response) {
       jobId,
       req,
       team_id,
-      chunk.sub_id,
+      chunk?.sub_id,
       crawlerOptions,
       pageOptions,
       searchOptions,
@@ -86,8 +86,8 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
       }
     }
 
-    // if we ran over the bytes limit, remove the last document
-    if (bytes > bytesLimit) {
+    // if we ran over the bytes limit, remove the last document, except if it's the only document
+    if (bytes > bytesLimit && doneJobs.length !== 1) {
       doneJobs.splice(doneJobs.length - 1, 1);
     }
   } else {
@@ -152,7 +152,7 @@ export async function mapController(
   // remove duplicates that could be due to http/https or www
   links = removeDuplicateUrls(links);
 
-  billTeam(req.auth.team_id, req.acuc.sub_id, 1).catch((error) => {
+  billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
     Logger.error(
       `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
     );
@@ -109,7 +109,7 @@ export async function scrapeController(
     creditsToBeBilled = 5;
   }
 
-  billTeam(req.auth.team_id, req.acuc.sub_id, creditsToBeBilled).catch(error => {
+  billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
     Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
     // Optionally, you could notify an admin or add to a retry queue here
   });
@@ -390,6 +390,7 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
     generateImgAltText: false,
     allowBackwardCrawling: x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
+    ignoreSitemap: x.ignoreSitemap,
   };
 }
 
@@ -37,6 +37,10 @@ export async function getJobPriority({
   team_id: string;
   basePriority?: number;
 }): Promise<number> {
+  if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
+    return 50;
+  }
+
   try {
     const setKey = SET_KEY_PREFIX + team_id;
 
@@ -19,7 +19,7 @@ export class Logger {
   };
 
   static log (message: string, level: LogLevel) {
-    const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.INFO;
+    const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.TRACE;
     const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
     const currentLevelIndex = levels.indexOf(logLevel);
     const messageLevelIndex = levels.indexOf(level);
@@ -35,7 +35,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
     if (!success) {
       Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
       if (!res.headersSent) {
-        return res.status(402).json({ success: false, error: "Insufficient credits" });
+        return res.status(402).json({ success: false, error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing." });
       }
     }
     req.account = { remainingCredits };
@@ -6,7 +6,11 @@ import { fetchAndProcessPdf } from "../utils/pdfProcessor";
 import { universalTimeout } from "../global";
 import { Logger } from "../../../lib/logger";
 import * as Sentry from "@sentry/node";
+import axiosRetry from 'axios-retry';
 
+axiosRetry(axios, { retries: 3 , onRetry:()=>{
+    console.log("Retrying (fire-engine)...");
+}, retryDelay: axiosRetry.exponentialDelay});
 /**
  * Scrapes a URL with Fire-Engine
  * @param url The URL to scrape
@@ -203,10 +207,10 @@ export async function scrapWithFireEngine({
     }
   } catch (error) {
     if (error.code === "ECONNABORTED") {
-      Logger.debug(`⛏️ Fire-Engine: Request timed out for ${url}`);
+      Logger.debug(`⛏️ Fire-Engine (catch block): Request timed out for ${url}`);
       logParams.error_message = "Request timed out";
     } else {
-      Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
+      Logger.debug(`⛏️ Fire-Engine(catch block): Failed to fetch url: ${url} | Error: ${error}`);
       logParams.error_message = error.message || error;
     }
     return { html: "", pageStatusCode: null, pageError: logParams.error_message };
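Side note (not part of the commit): the axios-retry wiring added above attaches the interceptor to the global axios object. A minimal TypeScript sketch of the same idea on a dedicated instance follows, assuming axios and axios-retry are installed; the fireEngineClient name and timeout value are illustrative only, not code from this repository.

import axios from "axios";
import axiosRetry from "axios-retry";

// Hypothetical dedicated client, so the retry policy does not affect other axios callers.
const fireEngineClient = axios.create({ timeout: 30_000 });

axiosRetry(fireEngineClient, {
  retries: 3,                               // same retry count as the diff above
  retryDelay: axiosRetry.exponentialDelay,  // exponential backoff between attempts
  onRetry: (retryCount) => {
    console.log(`Retrying (fire-engine)... attempt ${retryCount}`);
  },
});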
@@ -29,8 +29,8 @@ const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIR
 
 export const baseScrapers = [
   useFireEngine ? "fire-engine;chrome-cdp" : undefined,
-  useScrapingBee ? "scrapingBee" : undefined,
   useFireEngine ? "fire-engine" : undefined,
+  useScrapingBee ? "scrapingBee" : undefined,
   useFireEngine ? undefined : "playwright",
   useScrapingBee ? "scrapingBeeLoad" : undefined,
   "fetch",
@@ -95,8 +95,8 @@ function getScrapingFallbackOrder(
 
   let defaultOrder = [
     useFireEngine ? "fire-engine;chrome-cdp" : undefined,
-    useScrapingBee ? "scrapingBee" : undefined,
     useFireEngine ? "fire-engine" : undefined,
+    useScrapingBee ? "scrapingBee" : undefined,
     useScrapingBee ? "scrapingBeeLoad" : undefined,
     useFireEngine ? undefined : "playwright",
     "fetch",
@@ -424,7 +424,7 @@ export async function scrapSingleUrl(
         Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
         break;
       }
-      if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
+      if (pageStatusCode && (pageStatusCode == 404)) {
        Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`);
         break;
       }
File diff suppressed because one or more lines are too long
@@ -1,30 +1,31 @@
-import cheerio, { AnyNode, Cheerio } from "cheerio";
+import { AnyNode, Cheerio, load } from "cheerio";
 import { PageOptions } from "../../../lib/entities";
 import { excludeNonMainTags } from "./excludeTags";
 
 export const removeUnwantedElements = (
   html: string,
-  pageOptions: PageOptions
+  pageOptions: PageOptions,
 ) => {
-  const soup = cheerio.load(html);
+  let soup = load(html);
 
   if (
     pageOptions.onlyIncludeTags &&
     pageOptions.onlyIncludeTags.length > 0 &&
-    pageOptions.onlyIncludeTags[0] !== ''
+    pageOptions.onlyIncludeTags[0] !== ""
   ) {
     if (typeof pageOptions.onlyIncludeTags === "string") {
       pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
     }
     if (pageOptions.onlyIncludeTags.length !== 0) {
       // Create a new root element to hold the tags to keep
-      const newRoot = cheerio.load("<div></div>")("div");
+      const newRoot = load("<div></div>")("div");
       pageOptions.onlyIncludeTags.forEach((tag) => {
         soup(tag).each((index, element) => {
           newRoot.append(soup(element).clone());
         });
       });
-      return newRoot.html();
+
+      soup = load(newRoot.html());
     }
   }
 
@@ -33,7 +34,7 @@ export const removeUnwantedElements = (
   if (
     pageOptions.removeTags &&
     pageOptions.removeTags.length > 0 &&
-    pageOptions.removeTags[0] !== ''
+    pageOptions.removeTags[0] !== ""
   ) {
     if (typeof pageOptions.removeTags === "string") {
       pageOptions.removeTags = [pageOptions.removeTags];
@@ -51,11 +52,11 @@ export const removeUnwantedElements = (
         const attributes = element.attribs;
         const tagNameMatches = regexPattern.test(element.name);
         const attributesMatch = Object.keys(attributes).some((attr) =>
-          regexPattern.test(`${attr}="${attributes[attr]}"`)
+          regexPattern.test(`${attr}="${attributes[attr]}"`),
         );
         if (tag.startsWith("*.")) {
           classMatch = Object.keys(attributes).some((attr) =>
-            regexPattern.test(`class="${attributes[attr]}"`)
+            regexPattern.test(`class="${attributes[attr]}"`),
           );
         }
         return tagNameMatches || attributesMatch || classMatch;
@@ -66,7 +66,7 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id:
       chunk.sub_current_period_start,
       chunk.sub_current_period_end
     );
-    return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: chunk.remaining_credits, chunk };
+    return { success: false, message: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", remainingCredits: chunk.remaining_credits, chunk };
   } else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
     // Send email notification for approaching credit limit
     sendNotification(
@@ -35,12 +35,15 @@ redisRateLimitClient.on("connect", (err) => {
  * @param {string} value The value to store.
  * @param {number} [expire] Optional expiration time in seconds.
  */
-const setValue = async (key: string, value: string, expire?: number) => {
-  if (expire) {
+const setValue = async (key: string, value: string, expire?: number, nx = false) => {
+  if (expire && !nx) {
     await redisRateLimitClient.set(key, value, "EX", expire);
   } else {
     await redisRateLimitClient.set(key, value);
   }
+  if (expire && nx) {
+    await redisRateLimitClient.expire(key, expire, "NX");
+  }
 };
 
 /**
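Side note (not part of the commit): a minimal TypeScript sketch of how the new nx flag on setValue behaves at the setCachedACUC call site above (setValue(cacheKeyACUC, ..., 600, true)). It assumes ioredis and Redis 7.0+, where EXPIRE accepts an NX option that only sets a TTL when the key has none; the key, value, and connection URL here are illustrative only.

import Redis from "ioredis";

const redis = new Redis(process.env.REDIS_RATE_LIMIT_URL ?? "redis://localhost:6379");

async function demo() {
  // With nx = true, the value is written with a plain SET (no EX argument)...
  await redis.set("example:acuc:key", JSON.stringify({ remaining_credits: 100 }));
  // ...and the 10-minute TTL is attached separately with EXPIRE ... NX,
  // which per the Redis docs only applies when the key has no expiry yet (Redis 7.0+).
  await redis.expire("example:acuc:key", 600, "NX");
}

demo().finally(() => redis.disconnect());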
283  examples/o1_job_recommender/o1_job_recommender.py  Normal file
@@ -0,0 +1,283 @@
+# %%
+# %%
+import os
+import requests
+import json
+from dotenv import load_dotenv
+from openai import OpenAI
+
+# ANSI color codes
+class Colors:
+    CYAN = '\033[96m'
+    YELLOW = '\033[93m'
+    GREEN = '\033[92m'
+    RED = '\033[91m'
+    MAGENTA = '\033[95m'
+    BLUE = '\033[94m'
+    RESET = '\033[0m'
+# Load environment variables
+load_dotenv()
+
+# Initialize the FirecrawlApp with your API key
+firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+# Set the jobs page URL
+jobs_page_url = "https://openai.com/careers/search"
+
+# Resume
+resume_paste = """"
+Eric Ciarla
+Co-Founder @ Firecrawl
+San Francisco, California, United States
+Summary
+Building…
+Experience
+Firecrawl
+Co-Founder
+April 2024 - Present (6 months)
+San Francisco, California, United States
+Firecrawl by Mendable. Building data extraction infrastructure for AI. Used by
+Amazon, Zapier, and Nvidia (YC S22)
+Mendable
+2 years 7 months
+Co-Founder @ Mendable.ai
+March 2022 - Present (2 years 7 months)
+San Francisco, California, United States
+- Built an AI powered search platform that that served millions of queries for
+hundreds of customers (YC S22)
+- We were one of the first LLM powered apps adopted by industry leaders like
+Coinbase, Snap, DoorDash, and MongoDB
+Co-Founder @ SideGuide
+March 2022 - Present (2 years 7 months)
+San Francisco, California, United States
+- Built and scaled an online course platform with a community of over 50,000
+developers
+- Selected for Y Combinator S22 batch, 2% acceptance rate
+Fracta
+Data Engineer
+2022 - 2022 (less than a year)
+Palo Alto, California, United States
+- Demoed tool during sales calls and provided technical support during the
+entire customer lifecycle
+Page 1 of 2
+- Mined, wrangled, & visualized geospatial and water utility data for predictive
+analytics & ML workflows (Python, QGIS)
+Ford Motor Company
+Data Scientist
+2021 - 2021 (less than a year)
+Dearborn, Michigan, United States
+- Extracted, cleaned, and joined data from multiple sources using SQL,
+Hadoop, and Alteryx
+- Used Bayesian Network Structure Learning (BNLearn, R) to uncover the
+relationships between survey free response verbatim topics (derived from
+natural language processing models) and numerical customer experience
+scores
+MDRemindME
+Co-Founder
+2018 - 2020 (2 years)
+Durham, New Hampshire, United States
+- Founded and led a healthtech startup aimed at improving patient adherence
+to treatment plans through an innovative engagement and retention tool
+- Piloted the product with healthcare providers and patients, gathering critical
+insights to refine functionality and enhance user experience
+- Secured funding through National Science Foundation I-CORPS Grant and
+UNH Entrepreneurship Center Seed Grant
+Education
+Y Combinator
+S22
+University of New Hampshire
+Economics and Philosophy
+"""
+
+# First, scrape the jobs page using Firecrawl
+try:
+    response = requests.post(
+        "https://api.firecrawl.dev/v1/scrape",
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {firecrawl_api_key}"
+        },
+        json={
+            "url": jobs_page_url,
+            "formats": ["markdown"]
+        }
+    )
+
+    if response.status_code == 200:
+        result = response.json()
+        if result.get('success'):
+            html_content = result['data']['markdown']
+            # Define the O1 prompt for extracting apply links
+            prompt = f"""
+            Extract up to 30 job application links from the given markdown content.
+            Return the result as a JSON object with a single key 'apply_links' containing an array of strings (the links).
+            The output should be a valid JSON object, with no additional text.
+            Do not include any JSON markdown formatting or code block indicators.
+            Provide only the raw JSON object as the response.
+
+            Example of the expected format:
+            {{"apply_links": ["https://example.com/job1", "https://example.com/job2", ...]}}
+
+            Markdown content:
+            {html_content[:100000]}
+            """
+            print(f"{Colors.GREEN}Successfully scraped the jobs page{Colors.RESET}")
+        else:
+            print(f"{Colors.RED}Failed to scrape the jobs page: {result.get('message', 'Unknown error')}{Colors.RESET}")
+            html_content = ""
+    else:
+        print(f"{Colors.RED}Error {response.status_code}: {response.text}{Colors.RESET}")
+        html_content = ""
+except requests.RequestException as e:
+    print(f"{Colors.RED}An error occurred while scraping: {str(e)}{Colors.RESET}")
+    html_content = ""
+except json.JSONDecodeError as e:
+    print(f"{Colors.RED}Error decoding JSON response: {str(e)}{Colors.RESET}")
+    html_content = ""
+except Exception as e:
+    print(f"{Colors.RED}An unexpected error occurred while scraping: {str(e)}{Colors.RESET}")
+    html_content = ""
+
+# Extract apply links from the scraped HTML using O1
+apply_links = []
+if html_content:
+    try:
+        completion = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ]
+        )
+
+        if completion.choices:
+            print(completion.choices[0].message.content)
+            result = json.loads(completion.choices[0].message.content.strip())
+
+            apply_links = result['apply_links']
+            print(f"{Colors.GREEN}Successfully extracted {len(apply_links)} apply links{Colors.RESET}")
+        else:
+            print(f"{Colors.RED}No apply links extracted{Colors.RESET}")
+    except json.JSONDecodeError as e:
+        print(f"{Colors.RED}Error decoding JSON from OpenAI response: {str(e)}{Colors.RESET}")
+    except KeyError as e:
+        print(f"{Colors.RED}Expected key not found in OpenAI response: {str(e)}{Colors.RESET}")
+    except Exception as e:
+        print(f"{Colors.RED}An unexpected error occurred during extraction: {str(e)}{Colors.RESET}")
+else:
+    print(f"{Colors.RED}No HTML content to process{Colors.RESET}")
+
+# Initialize a list to store the extracted data
+extracted_data = []
+
+
+# %%
+print(f"{Colors.CYAN}Apply links:{Colors.RESET}")
+for link in apply_links:
+    print(f"{Colors.YELLOW}{link}{Colors.RESET}")
+
+# %%
+# Process each apply link
+for index, link in enumerate(apply_links):
+    try:
+        response = requests.post(
+            "https://api.firecrawl.dev/v1/scrape",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {firecrawl_api_key}"
+            },
+            json={
+                "url": link,
+                "formats": ["extract"],
+                "actions": [{
+                    "type": "click",
+                    "selector": "#job-overview"
+                }],
+                "extract": {
+                    "schema": {
+                        "type": "object",
+                        "properties": {
+                            "job_title": {"type": "string"},
+                            "sub_division_of_organization": {"type": "string"},
+                            "key_skills": {"type": "array", "items": {"type": "string"}},
+                            "compensation": {"type": "string"},
+                            "location": {"type": "string"},
+                            "apply_link": {"type": "string"}
+                        },
+                        "required": ["job_title", "sub_division_of_organization", "key_skills", "compensation", "location", "apply_link"]
+                    }
+                }
+            }
+        )
+
+        if response.status_code == 200:
+            result = response.json()
+            if result.get('success'):
+                extracted_data.append(result['data']['extract'])
+                print(f"{Colors.GREEN}Data extracted for job {index}{Colors.RESET}")
+            else:
+                print(f"")
+        else:
+            print(f"")
+    except Exception as e:
+        print(f"")
+
+
+# %%
+# %%
+# Print the extracted data
+print(f"{Colors.CYAN}Extracted data:{Colors.RESET}")
+for job in extracted_data:
+    print(json.dumps(job, indent=2))
+    print(f"{Colors.MAGENTA}{'-' * 50}{Colors.RESET}")
+
+
+# %%
+
+
+
+
+# Use o1-preview to choose which jobs should be applied to based on the resume
+prompt = f"""
+Please analyze the resume and job listings, and return a JSON list of the top 3 roles that best fit the candidate's experience and skills. Include only the job title, compensation, and apply link for each recommended role. The output should be a valid JSON array of objects in the following format, with no additional text:
+
+[
+  {{
+    "job_title": "Job Title",
+    "compensation": "Compensation (if available, otherwise empty string)",
+    "apply_link": "Application URL"
+  }},
+  ...
+]
+
+Based on the following resume:
+{resume_paste}
+
+And the following job listings:
+{json.dumps(extracted_data, indent=2)}
+"""
+
+completion = client.chat.completions.create(
+    model="o1-preview",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": prompt
+                }
+            ]
+        }
+    ]
+)
+
+recommended_jobs = json.loads(completion.choices[0].message.content.strip())
+
+print(f"{Colors.CYAN}Recommended jobs:{Colors.RESET}")
+print(json.dumps(recommended_jobs, indent=2))