From 14fa75cae6a6da480bb8d8edec8fc7d76e65e7cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 13:08:54 +0200 Subject: [PATCH 01/43] fix(crawl): send error if url is not a string Fixes FIRECRAWL-SCRAPER-JS-1E and FIRECRAWL-SCRAPER-JS-Z --- apps/api/src/controllers/crawl.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index d40f2a9e..2e31c257 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -72,6 +72,9 @@ export async function crawlController(req: Request, res: Response) { if (!url) { return res.status(400).json({ error: "Url is required" }); } + if (typeof url !== "string") { + return res.status(400).json({ error: "URL must be a string" }); + } try { url = checkAndUpdateURL(url).url; } catch (e) { @@ -87,8 +90,6 @@ export async function crawlController(req: Request, res: Response) { }); } - const mode = req.body.mode ?? "crawl"; - // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? // try { // const a = new WebScraperDataProvider(); From 508568f9438166d0fe564f6fac0aec1753968c50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 13:10:58 +0200 Subject: [PATCH 02/43] fix(search): handle scrape timeouts on search Fixes FIRECRAWL-SCRAPER-JS-15 --- apps/api/src/controllers/search.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 21a8e390..63820aba 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -186,6 +186,10 @@ export async function searchController(req: Request, res: Response) { }); return res.status(result.returnCode).json(result); } catch (error) { + if (error instanceof Error && error.message.startsWith("Job wait")) { + return res.status(408).json({ error: "Request timed out" }); + } + Sentry.captureException(error); Logger.error(error); return res.status(500).json({ error: error.message }); From fbbc3878f189b661e4096833fcf0517dd052049a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 13:18:26 +0200 Subject: [PATCH 03/43] fix(crawler): make sure includes/excludes is an array --- apps/api/src/scraper/WebScraper/crawler.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 02894cfc..67f1c22e 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -53,8 +53,8 @@ export class WebCrawler { this.jobId = jobId; this.initialUrl = initialUrl; this.baseUrl = new URL(initialUrl).origin; - this.includes = includes ?? []; - this.excludes = excludes ?? []; + this.includes = Array.isArray(includes) ? includes : []; + this.excludes = Array.isArray(excludes) ? 
excludes : [];
     this.limit = limit;
     this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
     this.robots = robotsParser(this.robotsTxtUrl, "");

From 1f580deefc073faecf95b43fe5e77820ff421ad1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Thu, 22 Aug 2024 13:29:11 +0200
Subject: [PATCH 04/43] fix(crawl): validate includes/excludes regexes

---
 apps/api/src/controllers/crawl.ts | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 2e31c257..4335334a 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -57,6 +57,26 @@ export async function crawlController(req: Request, res: Response) {
   };
   const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
 
+  if (Array.isArray(crawlerOptions.includes)) {
+    for (const x of crawlerOptions.includes) {
+      try {
+        new RegExp(x);
+      } catch (e) {
+        return res.status(400).json({ error: e.message });
+      }
+    }
+  }
+
+  if (Array.isArray(crawlerOptions.excludes)) {
+    for (const x of crawlerOptions.excludes) {
+      try {
+        new RegExp(x);
+      } catch (e) {
+        return res.status(400).json({ error: e.message });
+      }
+    }
+  }
+
   const limitCheck = req.body?.crawlerOptions?.limit ?? 1;
   const { success: creditsCheckSuccess, message: creditsCheckMessage, remainingCredits } = await checkTeamCredits(team_id, limitCheck);

From 7d9f5bf8b1a616fc2dbc51b7e54b4c3b79770bec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Thu, 22 Aug 2024 13:40:55 +0200
Subject: [PATCH 05/43] fix(crawl): don't use sitemap if it's empty

Fixes FIRECRAWL-SCRAPER-JS-11
---
 apps/api/src/controllers/crawl.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 4335334a..c299dc01 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -163,7 +163,7 @@ export async function crawlController(req: Request, res: Response) {
     ? null
     : await crawler.tryGetSitemap();
 
-  if (sitemap !== null) {
+  if (sitemap !== null && sitemap.length > 0) {
     const jobs = sitemap.map((x) => {
       const url = x.url;
       const uuid = uuidv4();

From 670d253a8cfed25042d3e075b944fbef4a7e7bce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Thu, 22 Aug 2024 14:08:09 +0200
Subject: [PATCH 06/43] fix(auth): fix error reporting

---
 apps/api/src/controllers/auth.ts | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts
index 467d09fc..ac60dc53 100644
--- a/apps/api/src/controllers/auth.ts
+++ b/apps/api/src/controllers/auth.ts
@@ -61,8 +61,10 @@ async function getKeyAndPriceId(normalizedApi: string): Promise<{
     };
   }
   if (!data || data.length === 0) {
-    Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
-    Sentry.captureException(error);
+    if (error) {
+      Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
+      Sentry.captureException(error);
+    }
     // TODO: change this error code ?
return {
       success: false,
@@ -309,8 +311,8 @@ export async function supaAuthenticateUser(
   if (error || !data || data.length === 0) {
     if (error) {
       Sentry.captureException(error);
+      Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
     }
-    Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
     return {
       success: false,
       error: "Unauthorized: Invalid token",

From e4adbaa88eaa6e56985df8b1e6087ded95c4fc8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Thu, 22 Aug 2024 14:12:52 +0200
Subject: [PATCH 07/43] fix(llm-extract): handle llm-extract if scrape failed

---
 apps/api/src/lib/LLM-extraction/models.ts | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts
index e696a8cd..8ca6bbd4 100644
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@@ -15,7 +15,7 @@ const defaultPrompt =
 function prepareOpenAIDoc(
   document: Document,
   mode: "markdown" | "raw-html"
-): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] {
+): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
 
   let markdown = document.markdown;
 
@@ -27,9 +27,10 @@ function prepareOpenAIDoc(
 
   // Check if the markdown content exists in the document
   if (!extractionTarget) {
-    throw new Error(
-      `${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
-    );
+    return null;
+    // throw new Error(
+    //   `${mode} content is missing in the document. This is likely due to an error in the scraping process. Please try again or reach out to help@mendable.ai`
+    // );
   }
 
 
@@ -64,7 +65,16 @@ export async function generateOpenAICompletions({
   mode: "markdown" | "raw-html";
 }): Promise<Document> {
   const openai = client as OpenAI;
-  const [content, numTokens] = prepareOpenAIDoc(document, mode);
+  const preparedDoc = prepareOpenAIDoc(document, mode);
+
+  if (preparedDoc === null) {
+    return {
+      ...document,
+      warning: "LLM extraction was not performed since the document's content is empty or missing.",
+    };
+  }
+
+  const [content, numTokens] = preparedDoc;
 
   const completion = await openai.chat.completions.create({
     model,

From 4bd2ff26d308d096f703b5b308d9660d6bbaf0fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Thu, 22 Aug 2024 14:37:09 +0200
Subject: [PATCH 08/43] fix(llm-extract): pass stacktrace properly

---
 apps/api/src/lib/LLM-extraction/index.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts
index 85a7e995..af8b0bb1 100644
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@@ -46,7 +46,7 @@ export async function generateCompletions(
         return completionResult;
       } catch (error) {
         Logger.error(`Error generating completions: ${error}`);
-        throw new Error(`Error generating completions: ${error.message}`);
+        throw error;
       }
     default:
       throw new Error("Invalid client");

From 0e8fd6ce7089c7dda62540b6c2ddda2071593246 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Thu, 22 Aug 2024 14:50:51 +0200
Subject: [PATCH 09/43] fix(scrape): ensure extractionSchema is an object if llm-extraction is specified

---
 apps/api/src/controllers/scrape.ts | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/apps/api/src/controllers/scrape.ts
b/apps/api/src/controllers/scrape.ts index b2d1db34..959cc546 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -105,6 +105,10 @@ export async function scrapeController(req: Request, res: Response) { let timeout = req.body.timeout ?? defaultTimeout; if (extractorOptions.mode.includes("llm-extraction")) { + if (typeof extractorOptions.extractionSchema !== "object" || extractorOptions.extractionSchema === null) { + return res.status(400).json({ error: "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified" }); + } + pageOptions.onlyMainContent = true; timeout = req.body.timeout ?? 90000; } From 5ca36fe9fcf6e18acfb42150a2110f2f1b3c722d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 15:49:16 +0200 Subject: [PATCH 10/43] feat(api): add more captureExceptions --- apps/api/src/index.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 6a6437b3..0674a46f 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -119,6 +119,7 @@ if (cluster.isMaster) { waitingJobs, }); } catch (error) { + Sentry.captureException(error); Logger.error(error); return res.status(500).json({ error: error.message }); } @@ -170,6 +171,7 @@ if (cluster.isMaster) { }, timeout); } } catch (error) { + Sentry.captureException(error); Logger.debug(error); } }; From 6d92b8524d19a900de2a3145ac6a8c0aa19a77f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 16:00:13 +0200 Subject: [PATCH 11/43] feat(scrape): record job result in span --- apps/api/src/controllers/scrape.ts | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 959cc546..b0004276 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -49,18 +49,28 @@ export async function scrapeHelper( }, {}, jobId); let doc; - try { - doc = (await job.waitUntilFinished(scrapeQueueEvents, timeout))[0]; //60 seconds timeout - } catch (e) { - if (e instanceof Error && e.message.startsWith("Job wait")) { - return { - success: false, - error: "Request timed out", - returnCode: 408, + + const err = await Sentry.startSpanManual({ name: "Wait for job to finish", op: "bullmq.wait", attributes: { job: jobId } }, async (span) => { + try { + doc = (await job.waitUntilFinished(scrapeQueueEvents, timeout))[0] + } catch (e) { + if (e instanceof Error && e.message.startsWith("Job wait")) { + span.setAttribute("timedOut", true).end(); + return { + success: false, + error: "Request timed out", + returnCode: 408, + } + } else { + throw e; } - } else { - throw e; } + span.setAttribute("result", JSON.stringify(doc)).end(); + return null; + }); + + if (err !== null) { + return err; } await job.remove(); From 6d48dbcd38a5a8173b6917a38338bf296dfc23e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 16:47:38 +0200 Subject: [PATCH 12/43] feat(sentry): add trace continuity for queue --- apps/api/src/controllers/crawl.ts | 9 +++- apps/api/src/controllers/scrape.ts | 6 +-- apps/api/src/controllers/status.ts | 2 - apps/api/src/main/runWebScraper.ts | 1 - apps/api/src/scraper/WebScraper/index.ts | 1 - apps/api/src/services/queue-jobs.ts | 39 ++++++++++++++-- apps/api/src/services/queue-worker.ts | 59 +++++++++++++++++++----- apps/api/src/services/sentry.ts | 3 +- 8 files changed, 95 
insertions(+), 25 deletions(-) diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index c299dc01..c5f440e2 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -194,7 +194,14 @@ export async function crawlController(req: Request, res: Response) { id, jobs.map((x) => x.opts.jobId) ); - await getScrapeQueue().addBulk(jobs); + if (Sentry.isInitialized()) { + for (const job of jobs) { + // add with sentry instrumentation + await addScrapeJob(job.data as any, {}, job.opts.jobId); + } + } else { + await getScrapeQueue().addBulk(jobs); + } } else { await lockURL(id, sc, url); const job = await addScrapeJob( diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index b0004276..3666fc1a 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -50,12 +50,12 @@ export async function scrapeHelper( let doc; - const err = await Sentry.startSpanManual({ name: "Wait for job to finish", op: "bullmq.wait", attributes: { job: jobId } }, async (span) => { + const err = await Sentry.startSpan({ name: "Wait for job to finish", op: "bullmq.wait", attributes: { job: jobId } }, async (span) => { try { doc = (await job.waitUntilFinished(scrapeQueueEvents, timeout))[0] } catch (e) { if (e instanceof Error && e.message.startsWith("Job wait")) { - span.setAttribute("timedOut", true).end(); + span.setAttribute("timedOut", true); return { success: false, error: "Request timed out", @@ -65,7 +65,7 @@ export async function scrapeHelper( throw e; } } - span.setAttribute("result", JSON.stringify(doc)).end(); + span.setAttribute("result", JSON.stringify(doc)); return null; }); diff --git a/apps/api/src/controllers/status.ts b/apps/api/src/controllers/status.ts index c3ca906f..362f1f24 100644 --- a/apps/api/src/controllers/status.ts +++ b/apps/api/src/controllers/status.ts @@ -1,8 +1,6 @@ import { Request, Response } from "express"; import { Logger } from "../../src/lib/logger"; import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis"; -import { getScrapeQueue } from "../../src/services/queue-service"; -import { supabaseGetJobById } from "../../src/lib/supabase-jobs"; import { getJobs } from "./crawl-status"; import * as Sentry from "@sentry/node"; diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 2be05bd5..aea7876e 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -12,7 +12,6 @@ import { Document } from "../lib/entities"; import { supabase_service } from "../services/supabase"; import { Logger } from "../lib/logger"; import { ScrapeEvents } from "../lib/scrape-events"; -import { getScrapeQueue } from "../services/queue-service"; export async function startWebScraperPipeline({ job, diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 65247df1..38d0cc32 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -16,7 +16,6 @@ import { replacePathsWithAbsolutePaths, } from "./utils/replacePaths"; import { generateCompletions } from "../../lib/LLM-extraction"; -import { getScrapeQueue } from "../../../src/services/queue-service"; import { fetchAndProcessDocx } from "./utils/docxProcessor"; import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils"; import { Logger } from "../../lib/logger"; diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 
3099da68..33997890 100644
--- a/apps/api/src/services/queue-jobs.ts
+++ b/apps/api/src/services/queue-jobs.ts
@@ -2,11 +2,12 @@ import { Job, Queue } from "bullmq";
 import { getScrapeQueue } from "./queue-service";
 import { v4 as uuidv4 } from "uuid";
 import { WebScraperOptions } from "../types";
+import * as Sentry from "@sentry/node";
 
-export async function addScrapeJob(
-  webScraperOptions: WebScraperOptions,
-  options: any = {},
-  jobId: string = uuidv4(),
+async function addScrapeJobRaw(
+  webScraperOptions: any,
+  options: any,
+  jobId: string,
 ): Promise<Job> {
   return await getScrapeQueue().add(jobId, webScraperOptions, {
     priority: webScraperOptions.crawl_id ? 20 : 10,
@@ -15,3 +16,33 @@ export async function addScrapeJob(
   });
 }
 
+export async function addScrapeJob(
+  webScraperOptions: WebScraperOptions,
+  options: any = {},
+  jobId: string = uuidv4(),
+): Promise<Job> {
+  if (Sentry.isInitialized()) {
+    const size = JSON.stringify(webScraperOptions).length;
+    return await Sentry.startSpan({
+      name: "Add scrape job",
+      op: "queue.publish",
+      attributes: {
+        "messaging.message.id": jobId,
+        "messaging.destination.name": getScrapeQueue().name,
+        "messaging.message.body.size": size,
+      },
+    }, async (span) => {
+      return await addScrapeJobRaw({
+        ...webScraperOptions,
+        sentry: {
+          trace: Sentry.spanToTraceHeader(span),
+          baggage: Sentry.spanToBaggageHeader(span),
+          size,
+        },
+      }, options, jobId);
+    });
+  } else {
+    return await addScrapeJobRaw(webScraperOptions, options, jobId);
+  }
+}
+
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 2086d0a6..a7d20383 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -50,6 +50,7 @@ const processJobInternal = async (token: string, job: Job) => {
     await job.extendLock(token, jobLockExtensionTime);
   }, jobLockExtendInterval);
 
+  let err = null;
   try {
     const result = await processJob(job, token);
     try{
@@ -62,11 +63,14 @@ const processJobInternal = async (token: string, job: Job) => {
     }
   } catch (error) {
     console.log("Job failed, error:", error);
-
+    Sentry.captureException(error);
+    err = error;
     await job.moveToFailed(error, token, false);
   } finally {
     clearInterval(extendLockInterval);
   }
+
+  return err;
 };
 
 let isShuttingDown = false;
@@ -76,7 +80,7 @@ process.on("SIGINT", () => {
   isShuttingDown = true;
 });
 
-const workerFun = async (queueName: string, processJobInternal: (token: string, job: Job) => Promise<void>) => {
+const workerFun = async (queueName: string, processJobInternal: (token: string, job: Job) => Promise<any>) => {
   const worker = new Worker(queueName, null, {
     connection: redisConnection,
     lockDuration: 1 * 60 * 1000, // 1 minute
@@ -104,16 +108,47 @@ const workerFun = async (queueName: string, processJobInternal: (token: string,
     const job = await worker.getNextJob(token);
 
     if (job) {
-      Sentry.startSpan({
-        name: "Scrape job",
-        op: "bullmq.job",
-        attributes: {
-          job: job.id,
-          worker: process.env.FLY_MACHINE_ID ?? worker.id,
-        },
-      }, async () => {
-        await processJobInternal(token, job);
-      });
+      if (job.data && job.data.sentry && Sentry.isInitialized()) {
+        Sentry.continueTrace({ sentryTrace: job.data.sentry.trace, baggage: job.data.sentry.baggage }, () => {
+          Sentry.startSpan({
+            name: "Scrape job",
+            attributes: {
+              job: job.id,
+              worker: process.env.FLY_MACHINE_ID ?? 
worker.id, + }, + }, async (span) => { + await Sentry.startSpan({ + name: "Process scrape job", + op: "queue.process", + attributes: { + "messaging.message.id": job.id, + "messaging.destination.name": getScrapeQueue().name, + "messaging.message.body.size": job.data.sentry.size, + "messaging.message.receive.latency": Date.now() - (job.processedOn ?? job.timestamp), + "messaging.message.retry.count": job.attemptsMade, + } + }, async () => { + const res = await processJobInternal(token, job); + if (res !== null) { + span.setStatus({ code: 2 }); // ERROR + } else { + span.setStatus({ code: 1 }); // OK + } + }); + }); + }); + } else { + Sentry.startSpan({ + name: "Scrape job", + attributes: { + job: job.id, + worker: process.env.FLY_MACHINE_ID ?? worker.id, + }, + }, () => { + processJobInternal(token, job); + }); + } + await sleep(gotJobInterval); } else { await sleep(connectionMonitorInterval); diff --git a/apps/api/src/services/sentry.ts b/apps/api/src/services/sentry.ts index 1292773a..176d3d4b 100644 --- a/apps/api/src/services/sentry.ts +++ b/apps/api/src/services/sentry.ts @@ -10,8 +10,9 @@ if (process.env.SENTRY_DSN) { integrations: [ nodeProfilingIntegration(), ], - tracesSampleRate: 0.045, + tracesSampleRate: process.env.SENTRY_ENVIRONMENT === "dev" ? 1.0 : 0.045, profilesSampleRate: 1.0, serverName: process.env.FLY_MACHINE_ID, + environment: process.env.SENTRY_ENVIRONMENT ?? "production", }); } From d036738da05b5d85dbcafdad3508ff4fc1aa44b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 18:04:09 +0200 Subject: [PATCH 13/43] fix(bullmq): duplicate redis connection for QueueEvents --- apps/api/src/services/queue-service.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index b13489a6..2e6d7562 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -37,4 +37,4 @@ export function getScrapeQueue() { import { QueueEvents } from 'bullmq'; -export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection }); \ No newline at end of file +export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection.duplicate() }); \ No newline at end of file From 7265ab7c67457013bae626493957061f95c0e761 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 18:46:56 +0200 Subject: [PATCH 14/43] fix(search): filter docs properly --- apps/api/src/controllers/search.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 63820aba..8a04a978 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -108,7 +108,7 @@ export async function searchHelper( // make sure doc.content is not empty const filteredDocs = docs.filter( - (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 + (doc: { content?: string }) => doc && doc.content && doc.content.trim().length > 0 ); if (filteredDocs.length === 0) { From dd737f1235fad97d602d1a1ac37c7c453cdfa4a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 19:17:51 +0200 Subject: [PATCH 15/43] feat(sentry): add queue instrumentation to --- apps/api/src/controllers/search.ts | 14 ++++++++++++-- apps/api/src/services/queue-jobs.ts | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git 
a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 8a04a978..304176a3 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -11,6 +11,7 @@ import { v4 as uuidv4 } from "uuid"; import { Logger } from "../lib/logger"; import { getScrapeQueue, scrapeQueueEvents } from "../services/queue-service"; import * as Sentry from "@sentry/node"; +import { addScrapeJob } from "../services/queue-jobs"; export async function searchHelper( jobId: string, @@ -95,8 +96,17 @@ export async function searchHelper( } }; }) - - const jobs = await getScrapeQueue().addBulk(jobDatas); + + let jobs = []; + if (Sentry.isInitialized()) { + for (const job of jobDatas) { + // add with sentry instrumentation + jobs.push(await addScrapeJob(job.data as any, {}, job.opts.jobId)); + } + } else { + jobs = await getScrapeQueue().addBulk(jobDatas); + await getScrapeQueue().addBulk(jobs); + } const docs = (await Promise.all(jobs.map(x => x.waitUntilFinished(scrapeQueueEvents, 60000)))).map(x => x[0]); diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 33997890..888cdefc 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -10,8 +10,8 @@ async function addScrapeJobRaw( jobId: string, ): Promise { return await getScrapeQueue().add(jobId, webScraperOptions, { - priority: webScraperOptions.crawl_id ? 20 : 10, ...options, + priority: webScraperOptions.crawl_id ? 20 : 10, jobId, }); } From ad82175fb8d390167e4dc6799a6ee7b2d197db19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 22:12:02 +0200 Subject: [PATCH 16/43] fix(scrape): poll --- apps/api/src/controllers/scrape.ts | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 3666fc1a..e9bd33b8 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -9,7 +9,7 @@ import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import import { numTokensFromString } from '../lib/LLM-extraction/helpers'; import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values'; import { addScrapeJob } from '../services/queue-jobs'; -import { scrapeQueueEvents } from '../services/queue-service'; +import { getScrapeQueue, scrapeQueueEvents } from '../services/queue-service'; import { v4 as uuidv4 } from "uuid"; import { Logger } from '../lib/logger'; import * as Sentry from "@sentry/node"; @@ -52,7 +52,19 @@ export async function scrapeHelper( const err = await Sentry.startSpan({ name: "Wait for job to finish", op: "bullmq.wait", attributes: { job: jobId } }, async (span) => { try { - doc = (await job.waitUntilFinished(scrapeQueueEvents, timeout))[0] + doc = (await new Promise((resolve, reject) => { + const start = Date.now(); + const int = setInterval(async () => { + if (Date.now() >= start + timeout) { + clearInterval(int); + reject(new Error("Job wait ")); + } else if (await job.getState() === "completed") { + clearInterval(int); + resolve((await getScrapeQueue().getJob(job.id)).returnvalue); + } + }, 1000); + job.waitUntilFinished(scrapeQueueEvents, timeout) + }))[0] } catch (e) { if (e instanceof Error && e.message.startsWith("Job wait")) { span.setAttribute("timedOut", true); From 76c8e9f996a2ca01c0cb2c25ff137fd7665902e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 
22:24:24 +0200 Subject: [PATCH 17/43] fix --- apps/api/src/controllers/scrape.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index e9bd33b8..4f992891 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -63,7 +63,6 @@ export async function scrapeHelper( resolve((await getScrapeQueue().getJob(job.id)).returnvalue); } }, 1000); - job.waitUntilFinished(scrapeQueueEvents, timeout) }))[0] } catch (e) { if (e instanceof Error && e.message.startsWith("Job wait")) { From e690a6fda7d0b600880fbd1f988282b8c8fa5459 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 22:38:39 +0200 Subject: [PATCH 18/43] fix: remove QueueEvents --- apps/api/src/controllers/scrape.ts | 2 +- apps/api/src/controllers/search.ts | 15 +++++++++++++-- apps/api/src/services/queue-service.ts | 6 +++--- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 4f992891..3ffbc92b 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -9,7 +9,7 @@ import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import import { numTokensFromString } from '../lib/LLM-extraction/helpers'; import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values'; import { addScrapeJob } from '../services/queue-jobs'; -import { getScrapeQueue, scrapeQueueEvents } from '../services/queue-service'; +import { getScrapeQueue } from '../services/queue-service'; import { v4 as uuidv4 } from "uuid"; import { Logger } from '../lib/logger'; import * as Sentry from "@sentry/node"; diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 304176a3..d86862b1 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -9,7 +9,7 @@ import { search } from "../search"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { v4 as uuidv4 } from "uuid"; import { Logger } from "../lib/logger"; -import { getScrapeQueue, scrapeQueueEvents } from "../services/queue-service"; +import { getScrapeQueue } from "../services/queue-service"; import * as Sentry from "@sentry/node"; import { addScrapeJob } from "../services/queue-jobs"; @@ -108,7 +108,18 @@ export async function searchHelper( await getScrapeQueue().addBulk(jobs); } - const docs = (await Promise.all(jobs.map(x => x.waitUntilFinished(scrapeQueueEvents, 60000)))).map(x => x[0]); + const docs = (await Promise.all(jobs.map(x => new Promise((resolve, reject) => { + const start = Date.now(); + const int = setInterval(async () => { + if (Date.now() >= start + 60000) { + clearInterval(int); + reject(new Error("Job wait ")); + } else if (await x.getState() === "completed") { + clearInterval(int); + resolve((await getScrapeQueue().getJob(x.id)).returnvalue); + } + }, 1000); + })))).map(x => x[0]); if (docs.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index 2e6d7562..113b3fa3 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -35,6 +35,6 @@ export function getScrapeQueue() { } -import { QueueEvents } from 'bullmq'; - -export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: 
redisConnection.duplicate() }); \ No newline at end of file +// === REMOVED IN FAVOR OF POLLING -- NOT RELIABLE +// import { QueueEvents } from 'bullmq'; +// export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection.duplicate() }); \ No newline at end of file From 8e3c2b28550aafcdf4627724940a3b951672c496 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 23:30:19 +0200 Subject: [PATCH 19/43] fix(crawler): verify URL --- apps/api/src/scraper/WebScraper/crawler.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 67f1c22e..92b9ae40 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -108,7 +108,12 @@ export class WebCrawler { // Normalize the initial URL and the link to account for www and non-www versions const normalizedInitialUrl = new URL(this.initialUrl); - const normalizedLink = new URL(link); + let normalizedLink; + try { + normalizedLink = new URL(link); + } catch (_) { + return false; + } const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, ''); const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); From 1f779e261a3260964488cc896fa24c2248a09bfb Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 22 Aug 2024 18:30:45 -0300 Subject: [PATCH 20/43] Update rate-limiter.ts --- apps/api/src/services/rate-limiter.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 05fb102c..cd923c4c 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -90,7 +90,7 @@ export function getRateLimiter( plan?: string ) { - if (token.includes("a01ccae") || token.includes("6254cf9")) { + if (token.includes("a01ccae") || token.includes("6254cf9") || token.includes("0f96e673")) { return testSuiteRateLimiter; } From 8d9ff90bcb6d25f2c0d9592c6a5e9d03dab199ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 23:37:23 +0200 Subject: [PATCH 21/43] feat(fire-engine): propagate sentry trace --- .../scraper/WebScraper/scrapers/fireEngine.ts | 49 +++++++++++-------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index b520bfe2..aa86ad5e 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -5,6 +5,7 @@ import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; import { Logger } from "../../../lib/logger"; +import * as Sentry from "@sentry/node"; /** * Scrapes a URL with Fire-Engine @@ -92,27 +93,35 @@ export async function scrapWithFireEngine({ }); const startTime = Date.now(); - const _response = await axiosInstance.post( - process.env.FIRE_ENGINE_BETA_URL + endpoint, - { - url: url, - wait: waitParam, - screenshot: screenshotParam, - fullPageScreenshot: fullPageScreenshotParam, - headers: headers, - pageOptions: pageOptions, - disableJsDom: pageOptions?.disableJsDom ?? 
false, - priority, - engine, - instantReturn: true, - ...fireEngineOptionsParam, - }, - { - headers: { - "Content-Type": "application/json", + const _response = await Sentry.startSpan({ + name: "Call to fire-engine" + }, async span => { + return await axiosInstance.post( + process.env.FIRE_ENGINE_BETA_URL + endpoint, + { + url: url, + wait: waitParam, + screenshot: screenshotParam, + fullPageScreenshot: fullPageScreenshotParam, + headers: headers, + pageOptions: pageOptions, + disableJsDom: pageOptions?.disableJsDom ?? false, + priority, + engine, + instantReturn: true, + ...fireEngineOptionsParam, + }, + { + headers: { + "Content-Type": "application/json", + ...(Sentry.isInitialized() ? ({ + "sentry-trace": Sentry.spanToTraceHeader(span), + "baggage": Sentry.spanToBaggageHeader(span), + }) : {}), + } } - } - ); + ); + }); let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) { From 64e9be0cd4044c89b56c6e4e017184893e7ad694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 22 Aug 2024 23:37:52 +0200 Subject: [PATCH 22/43] feat(redis): use bitnami image --- apps/dragonfly/.dockerignore | 2 + apps/dragonfly/Dockerfile | 6 + apps/dragonfly/Procfile | 2 + apps/dragonfly/README.md | 48 ++++++ apps/dragonfly/fly.toml | 7 +- apps/dragonfly/scripts/bump_version.sh | 91 +++++++++++ apps/dragonfly/scripts/semver | 200 +++++++++++++++++++++++++ apps/dragonfly/scripts/version.sh | 5 + apps/dragonfly/start-redis-server.sh | 30 ++++ 9 files changed, 385 insertions(+), 6 deletions(-) create mode 100644 apps/dragonfly/.dockerignore create mode 100644 apps/dragonfly/Dockerfile create mode 100644 apps/dragonfly/Procfile create mode 100644 apps/dragonfly/README.md create mode 100755 apps/dragonfly/scripts/bump_version.sh create mode 100755 apps/dragonfly/scripts/semver create mode 100755 apps/dragonfly/scripts/version.sh create mode 100755 apps/dragonfly/start-redis-server.sh diff --git a/apps/dragonfly/.dockerignore b/apps/dragonfly/.dockerignore new file mode 100644 index 00000000..860aa7ad --- /dev/null +++ b/apps/dragonfly/.dockerignore @@ -0,0 +1,2 @@ +.git +fly.toml diff --git a/apps/dragonfly/Dockerfile b/apps/dragonfly/Dockerfile new file mode 100644 index 00000000..77ea66ae --- /dev/null +++ b/apps/dragonfly/Dockerfile @@ -0,0 +1,6 @@ +ARG REDIS_VERSION=7.2.5 +FROM bitnami/redis:${REDIS_VERSION} + +COPY start-redis-server.sh /usr/bin/start-redis-server.sh + +CMD ["/usr/bin/start-redis-server.sh"] diff --git a/apps/dragonfly/Procfile b/apps/dragonfly/Procfile new file mode 100644 index 00000000..8f661345 --- /dev/null +++ b/apps/dragonfly/Procfile @@ -0,0 +1,2 @@ +redis: /usr/bin/start-redis-server.sh +metrics: /usr/local/bin/redis_exporter -redis.addr localhost:6379 -web.listen-address ":9091" diff --git a/apps/dragonfly/README.md b/apps/dragonfly/README.md new file mode 100644 index 00000000..7d2bcabd --- /dev/null +++ b/apps/dragonfly/README.md @@ -0,0 +1,48 @@ +The official repository for Running Redis on Fly.io. Find the accompanying Docker image at [flyio/redis](https://hub.docker.com/repository/docker/flyio/redis). + +## Usage + +This installation requires setting a password on Redis. To do that, run `fly secrets set REDIS_PASSWORD=mypassword` before deploying. Keep +track of this password - it won't be visible again after deployment! 
+ +If you need no customizations, you can deploy using the official Docker image. See `fly.toml` in this repository for an example to get started with. +## Runtime requirements + +By default, this Redis installation will only accept connections on the private IPv6 network, on the standard port 6379. + +If you want to access it from the public internet, add a `[[services]]` section to your `fly.toml`. An example is included in this repo for accessing Redis on port 10000. + + +We recommend adding persistent storage for Redis data. If you skip this step, data will be lost across deploys or restarts. For Fly apps, the volume needs to be in the same region as the app instances. For example: + +```cmd +flyctl volumes create redis_server --region ord +``` +```out + Name: redis_server + Region: ord + Size GB: 10 +Created at: 02 Nov 20 19:55 UTC +``` + +To connect this volume to the app, `fly.toml` includes a `[mounts]` entry. + +``` +[mounts] +source = "redis_server" +destination = "/data" +``` + +When the app starts, that volume will be mounted on /data. + +## Cutting a release + +If you have write access to this repo, you can ship a prerelease or full release with: + +``` +scripts/bump_version.sh +``` +or +``` +scripts/bump_version.sh prerel +``` diff --git a/apps/dragonfly/fly.toml b/apps/dragonfly/fly.toml index 14bdbd96..1bcd05fb 100644 --- a/apps/dragonfly/fly.toml +++ b/apps/dragonfly/fly.toml @@ -1,13 +1,8 @@ app = 'firecrawl-dragonfly' primary_region = 'iad' -[experimental] - cmd = ['dragonfly','--logtostderr', '--cluster_mode=emulated', '--lock_on_hashtags', "--bind","::"] -[build] - image = 'ghcr.io/dragonflydb/dragonfly' - [[mounts]] - source = 'firecrawl_dragonfly' + source = 'firecrawl_redis' destination = '/data' [[services]] diff --git a/apps/dragonfly/scripts/bump_version.sh b/apps/dragonfly/scripts/bump_version.sh new file mode 100755 index 00000000..4a82c00d --- /dev/null +++ b/apps/dragonfly/scripts/bump_version.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash + +set -euo pipefail + +ORIGIN=${ORIGIN:-origin} + +bump=${1:-patch} + +prerel=${2:-none} + +if [[ $bump == "prerel" ]]; then + bump="patch" + prerel="prerel" +fi + +if [[ $(git status --porcelain) != "" ]]; then + echo "Error: repo is dirty. Run git status, clean repo and try again." + exit 1 +elif [[ $(git status --porcelain -b | grep -e "ahead" -e "behind") != "" ]]; then + echo "Error: repo has unpushed commits. Push commits to remote and try again." + exit 1 +fi + +BRANCH="$(git rev-parse --abbrev-ref HEAD)" +if [[ "$prerel" == "prerel" && "$BRANCH" != "prerelease" ]]; then +# echo "❌ Sorry, you can only cut a pre-release from the 'prelease' branch" +# echo "Run 'git checkout prerelease && git pull origin prerelease' and try again." +# exit 1 + echo "⚠️ Pre-releases should be cut from the 'prerelease' branch" + echo "Please make sure you're not overwriting someone else's prerelease!" + echo + read -p "Release anyway? " -n 1 -r + echo + if [[ $REPLY =~ ^[^Yy]$ ]]; then + echo Aborting. + exit 1 + fi +fi + +if [[ "$prerel" != "prerel" && "$BRANCH" != "main" ]]; then + echo "❌ Sorry, you can only cut a release from the 'main' branch" + echo "Run 'git checkout main && git pull origin main' and try again." + exit 1 +fi + +git fetch +if [[ "$(git rev-parse HEAD 2>&1)" != "$(git rev-parse '@{u}' 2>&1)" ]]; then + echo "There are upstream commits that won't be included in this release." + echo "You probably want to exit, run 'git pull', then release." + echo + read -p "Release anyway? 
" -n 1 -r + echo + if [[ $REPLY =~ ^[^Yy]$ ]]; then + echo Aborting. + exit 1 + fi +fi + +dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +previous_version="$("$dir"/../scripts/version.sh -s)" + +if [[ $prerel == "prerel" ]]; then + prerelversion=$("$dir"/../scripts/semver get prerel "$previous_version") + if [[ $prerelversion == "" ]]; then + new_version=$("$dir"/../scripts/semver bump "$bump" "$previous_version") + new_version=$("$dir"/../scripts/semver bump prerel pre-1 "$new_version") + else + prerel=pre-$((${prerelversion#pre-} + 1)) + new_version=$("$dir"/../scripts/semver bump prerel "$prerel" "$previous_version") + fi +else + prerelversion=$("$dir"/../scripts/semver get prerel "$previous_version") + if [[ $prerelversion == "" ]]; then + new_version=$("$dir"/../scripts/semver bump "$bump" "$previous_version") + else + new_version=${previous_version//-$prerelversion/} + fi +fi + +new_version="v$new_version" + +echo "Bumping version from v${previous_version} to ${new_version}" + +read -p "Are you sure? " -n 1 -r +echo +if [[ $REPLY =~ ^[Yy]$ ]] +then + git tag -m "release ${new_version}" -a "$new_version" && git push "${ORIGIN}" tag "$new_version" + echo "done" +fi diff --git a/apps/dragonfly/scripts/semver b/apps/dragonfly/scripts/semver new file mode 100755 index 00000000..674229e0 --- /dev/null +++ b/apps/dragonfly/scripts/semver @@ -0,0 +1,200 @@ +#!/usr/bin/env bash + +set -o errexit -o nounset -o pipefail + +SEMVER_REGEX="^[vV]?(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)(\\-[0-9A-Za-z-]+(\\.[0-9A-Za-z-]+)*)?(\\+[0-9A-Za-z-]+(\\.[0-9A-Za-z-]+)*)?$" + +PROG=semver +PROG_VERSION=2.1.0 + +USAGE="\ +Usage: + $PROG bump (major|minor|patch|release|prerel |build ) + $PROG compare + $PROG get (major|minor|patch|release|prerel|build) + $PROG --help + $PROG --version + +Arguments: + A version must match the following regex pattern: + \"${SEMVER_REGEX}\". + In english, the version must match X.Y.Z(-PRERELEASE)(+BUILD) + where X, Y and Z are positive integers, PRERELEASE is an optional + string composed of alphanumeric characters and hyphens and + BUILD is also an optional string composed of alphanumeric + characters and hyphens. + + See definition. + + String that must be composed of alphanumeric characters and hyphens. + + String that must be composed of alphanumeric characters and hyphens. + +Options: + -v, --version Print the version of this tool. + -h, --help Print this help message. + +Commands: + bump Bump by one of major, minor, patch, prerel, build + or a forced potentially conflicting version. The bumped version is + shown to stdout. + + compare Compare with , output to stdout the + following values: -1 if is newer, 0 if equal, 1 if + older. + + get Extract given part of , where part is one of major, minor, + patch, prerel, build." + +function error { + echo -e "$1" >&2 + exit 1 +} + +function usage-help { + error "$USAGE" +} + +function usage-version { + echo -e "${PROG}: $PROG_VERSION" + exit 0 +} + +function validate-version { + local version=$1 + if [[ "$version" =~ $SEMVER_REGEX ]]; then + # if a second argument is passed, store the result in var named by $2 + if [ "$#" -eq "2" ]; then + local major=${BASH_REMATCH[1]} + local minor=${BASH_REMATCH[2]} + local patch=${BASH_REMATCH[3]} + local prere=${BASH_REMATCH[4]} + local build=${BASH_REMATCH[6]} + eval "$2=(\"$major\" \"$minor\" \"$patch\" \"$prere\" \"$build\")" + else + echo "$version" + fi + else + error "version $version does not match the semver scheme 'X.Y.Z(-PRERELEASE)(+BUILD)'. 
See help for more information." + fi +} + +function compare-version { + validate-version "$1" V + validate-version "$2" V_ + + # MAJOR, MINOR and PATCH should compare numerically + for i in 0 1 2; do + local diff=$((${V[$i]} - ${V_[$i]})) + if [[ $diff -lt 0 ]]; then + echo -1; return 0 + elif [[ $diff -gt 0 ]]; then + echo 1; return 0 + fi + done + + # PREREL should compare with the ASCII order. + if [[ -z "${V[3]}" ]] && [[ -n "${V_[3]}" ]]; then + echo 1; return 0; + elif [[ -n "${V[3]}" ]] && [[ -z "${V_[3]}" ]]; then + echo -1; return 0; + elif [[ -n "${V[3]}" ]] && [[ -n "${V_[3]}" ]]; then + if [[ "${V[3]}" > "${V_[3]}" ]]; then + echo 1; return 0; + elif [[ "${V[3]}" < "${V_[3]}" ]]; then + echo -1; return 0; + fi + fi + + echo 0 +} + +function command-bump { + local new; local version; local sub_version; local command; + + case $# in + 2) case $1 in + major|minor|patch|release) command=$1; version=$2;; + *) usage-help;; + esac ;; + 3) case $1 in + prerel|build) command=$1; sub_version=$2 version=$3 ;; + *) usage-help;; + esac ;; + *) usage-help;; + esac + + validate-version "$version" parts + # shellcheck disable=SC2154 + local major="${parts[0]}" + local minor="${parts[1]}" + local patch="${parts[2]}" + local prere="${parts[3]}" + local build="${parts[4]}" + + case "$command" in + major) new="$((major + 1)).0.0";; + minor) new="${major}.$((minor + 1)).0";; + patch) new="${major}.${minor}.$((patch + 1))";; + release) new="${major}.${minor}.${patch}";; + prerel) new=$(validate-version "${major}.${minor}.${patch}-${sub_version}");; + build) new=$(validate-version "${major}.${minor}.${patch}${prere}+${sub_version}");; + *) usage-help ;; + esac + + echo "$new" + exit 0 +} + +function command-compare { + local v; local v_; + + case $# in + 2) v=$(validate-version "$1"); v_=$(validate-version "$2") ;; + *) usage-help ;; + esac + + compare-version "$v" "$v_" + exit 0 +} + + +# shellcheck disable=SC2034 +function command-get { + local part version + + if [[ "$#" -ne "2" ]] || [[ -z "$1" ]] || [[ -z "$2" ]]; then + usage-help + exit 0 + fi + + part="$1" + version="$2" + + validate-version "$version" parts + local major="${parts[0]}" + local minor="${parts[1]}" + local patch="${parts[2]}" + local prerel="${parts[3]:1}" + local build="${parts[4]:1}" + + case "$part" in + major|minor|patch|release|prerel|build) echo "${!part}" ;; + *) usage-help ;; + esac + + exit 0 +} + +case $# in + 0) echo "Unknown command: $*"; usage-help;; +esac + +case $1 in + --help|-h) echo -e "$USAGE"; exit 0;; + --version|-v) usage-version ;; + bump) shift; command-bump "$@";; + get) shift; command-get "$@";; + compare) shift; command-compare "$@";; + *) echo "Unknown arguments: $*"; usage-help;; +esac diff --git a/apps/dragonfly/scripts/version.sh b/apps/dragonfly/scripts/version.sh new file mode 100755 index 00000000..0d3d9875 --- /dev/null +++ b/apps/dragonfly/scripts/version.sh @@ -0,0 +1,5 @@ +ORIGIN=${ORIGIN:-origin} + +version=$(git fetch --tags "${ORIGIN}" &>/dev/null | git -c "versionsort.prereleasesuffix=-pre" tag -l --sort=version:refname | grep -v dev | grep -vE '^v2$' | grep -vE '^v1$' | tail -n1 | cut -c 2-) + +echo "$version" diff --git a/apps/dragonfly/start-redis-server.sh b/apps/dragonfly/start-redis-server.sh new file mode 100755 index 00000000..ed252fde --- /dev/null +++ b/apps/dragonfly/start-redis-server.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e + +sysctl vm.overcommit_memory=1 || true +sysctl net.core.somaxconn=1024 || true + +PW_ARG="" +if [[ ! 
-z "${REDIS_PASSWORD}" ]]; then + PW_ARG="--requirepass $REDIS_PASSWORD" +fi + +# Set maxmemory-policy to 'allkeys-lru' for caching servers that should always evict old keys +: ${MAXMEMORY_POLICY:="volatile-lru"} +: ${APPENDONLY:="no"} +: ${FLY_VM_MEMORY_MB:=512} +if [ "${NOSAVE}" = "" ] ; then + : ${SAVE:="3600 1 300 100 60 10000"} +fi +# Set maxmemory to 10% of available memory +MAXMEMORY=$(($FLY_VM_MEMORY_MB*80/100)) + +mkdir /data/redis + +redis-server $PW_ARG \ + --dir /data/redis \ + --maxmemory "${MAXMEMORY}mb" \ + --maxmemory-policy $MAXMEMORY_POLICY \ + --appendonly $APPENDONLY \ + --save "$SAVE" From 52a05b8c6ea5e04243daad229ed960f4428c5833 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 23 Aug 2024 17:05:59 +0200 Subject: [PATCH 23/43] rename "dragonfly" to "redis" --- apps/{dragonfly => redis}/.dockerignore | 0 apps/{dragonfly => redis}/Dockerfile | 0 apps/{dragonfly => redis}/Procfile | 0 apps/{dragonfly => redis}/README.md | 0 apps/{dragonfly => redis}/fly.toml | 0 apps/{dragonfly => redis}/scripts/bump_version.sh | 0 apps/{dragonfly => redis}/scripts/semver | 0 apps/{dragonfly => redis}/scripts/version.sh | 0 apps/{dragonfly => redis}/start-redis-server.sh | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename apps/{dragonfly => redis}/.dockerignore (100%) rename apps/{dragonfly => redis}/Dockerfile (100%) rename apps/{dragonfly => redis}/Procfile (100%) rename apps/{dragonfly => redis}/README.md (100%) rename apps/{dragonfly => redis}/fly.toml (100%) rename apps/{dragonfly => redis}/scripts/bump_version.sh (100%) rename apps/{dragonfly => redis}/scripts/semver (100%) rename apps/{dragonfly => redis}/scripts/version.sh (100%) rename apps/{dragonfly => redis}/start-redis-server.sh (100%) diff --git a/apps/dragonfly/.dockerignore b/apps/redis/.dockerignore similarity index 100% rename from apps/dragonfly/.dockerignore rename to apps/redis/.dockerignore diff --git a/apps/dragonfly/Dockerfile b/apps/redis/Dockerfile similarity index 100% rename from apps/dragonfly/Dockerfile rename to apps/redis/Dockerfile diff --git a/apps/dragonfly/Procfile b/apps/redis/Procfile similarity index 100% rename from apps/dragonfly/Procfile rename to apps/redis/Procfile diff --git a/apps/dragonfly/README.md b/apps/redis/README.md similarity index 100% rename from apps/dragonfly/README.md rename to apps/redis/README.md diff --git a/apps/dragonfly/fly.toml b/apps/redis/fly.toml similarity index 100% rename from apps/dragonfly/fly.toml rename to apps/redis/fly.toml diff --git a/apps/dragonfly/scripts/bump_version.sh b/apps/redis/scripts/bump_version.sh similarity index 100% rename from apps/dragonfly/scripts/bump_version.sh rename to apps/redis/scripts/bump_version.sh diff --git a/apps/dragonfly/scripts/semver b/apps/redis/scripts/semver similarity index 100% rename from apps/dragonfly/scripts/semver rename to apps/redis/scripts/semver diff --git a/apps/dragonfly/scripts/version.sh b/apps/redis/scripts/version.sh similarity index 100% rename from apps/dragonfly/scripts/version.sh rename to apps/redis/scripts/version.sh diff --git a/apps/dragonfly/start-redis-server.sh b/apps/redis/start-redis-server.sh similarity index 100% rename from apps/dragonfly/start-redis-server.sh rename to apps/redis/start-redis-server.sh From 5ef3926d2af23631baa5ff1fef484f17253853eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 23 Aug 2024 18:47:56 +0200 Subject: [PATCH 24/43] fix(scrape,search): handle failed jobs --- apps/api/src/controllers/scrape.ts | 
20 ++++++++++++++++---- apps/api/src/controllers/search.ts | 12 +++++++++--- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 3ffbc92b..880cbbfa 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -58,9 +58,15 @@ export async function scrapeHelper( if (Date.now() >= start + timeout) { clearInterval(int); reject(new Error("Job wait ")); - } else if (await job.getState() === "completed") { - clearInterval(int); - resolve((await getScrapeQueue().getJob(job.id)).returnvalue); + } else { + const state = await job.getState(); + if (state === "completed") { + clearInterval(int); + resolve((await getScrapeQueue().getJob(job.id)).returnvalue); + } else if (state === "failed") { + clearInterval(int); + reject((await getScrapeQueue().getJob(job.id)).failedReason); + } } }, 1000); }))[0] @@ -72,6 +78,12 @@ export async function scrapeHelper( error: "Request timed out", returnCode: 408, } + } else if (typeof e === "string" && (e.includes("Error generating completions: ") || e.includes("Invalid schema for function"))) { + return { + success: false, + error: e, + returnCode: 500, + }; } else { throw e; } @@ -214,6 +226,6 @@ export async function scrapeController(req: Request, res: Response) { } catch (error) { Sentry.captureException(error); Logger.error(error); - return res.status(500).json({ error: error.message }); + return res.status(500).json({ error: typeof error === "string" ? error : (error?.message ?? "Internal Server Error") }); } } diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index d86862b1..aeb044d8 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -114,9 +114,15 @@ export async function searchHelper( if (Date.now() >= start + 60000) { clearInterval(int); reject(new Error("Job wait ")); - } else if (await x.getState() === "completed") { - clearInterval(int); - resolve((await getScrapeQueue().getJob(x.id)).returnvalue); + } else { + const state = await x.getState(); + if (state === "completed") { + clearInterval(int); + resolve((await getScrapeQueue().getJob(x.id)).returnvalue); + } else if (state === "failed") { + clearInterval(int); + reject((await getScrapeQueue().getJob(x.id)).failedReason); + } } }, 1000); })))).map(x => x[0]); From 3d53f4e213436b62e3905cfdc80f80a49f108cb4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 23 Aug 2024 13:56:05 -0300 Subject: [PATCH 25/43] Nick: unblocking pin --- .../api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts | 1 - .../scraper/WebScraper/utils/__tests__/socialBlockList.test.ts | 1 - apps/api/src/scraper/WebScraper/utils/blocklist.ts | 1 - 3 files changed, 3 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts index 42525257..77411b00 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts @@ -8,7 +8,6 @@ describe('Blocklist Functionality', () => { 'https://twitter.com/home', 'https://instagram.com/explore', 'https://linkedin.com/in/johndoe', - 'https://pinterest.com/pin/create', 'https://snapchat.com/add/johndoe', 'https://tiktok.com/@johndoe', 'https://reddit.com/r/funny', diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts 
index c09cc5b3..3d98fedf 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts @@ -8,7 +8,6 @@ describe('isUrlBlocked', () => { 'https://twitter.com/someuser', 'https://instagram.com/someuser', 'https://www.linkedin.com/in/someuser', - 'https://pinterest.com/someuser', 'https://snapchat.com/someuser', 'https://tiktok.com/@someuser', 'https://reddit.com/r/somesubreddit', diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index 7f1602e1..7b1ee19c 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -6,7 +6,6 @@ const socialMediaBlocklist = [ 'twitter.com', 'instagram.com', 'linkedin.com', - 'pinterest.com', 'snapchat.com', 'tiktok.com', 'reddit.com', From 2ab0dd2e150da1020fb6d30b7aa99118f4d884c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 23 Aug 2024 19:20:17 +0200 Subject: [PATCH 26/43] fix(scrape): add further llm extraction catch --- apps/api/src/controllers/scrape.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index 880cbbfa..0b4df13c 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -1,4 +1,4 @@ -import { ExtractorOptions, PageOptions } from './../lib/entities'; + import { ExtractorOptions, PageOptions } from './../lib/entities'; import { Request, Response } from "express"; import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; import { authenticateUser } from "./auth"; @@ -78,7 +78,7 @@ export async function scrapeHelper( error: "Request timed out", returnCode: 408, } - } else if (typeof e === "string" && (e.includes("Error generating completions: ") || e.includes("Invalid schema for function"))) { + } else if (typeof e === "string" && (e.includes("Error generating completions: ") || e.includes("Invalid schema for function") || e.includes("LLM extraction did not match the extraction schema you provided."))) { return { success: false, error: e, From 732e6af8b9ed9734f68e5d58ac8525ecd0c75acc Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Fri, 23 Aug 2024 15:49:41 -0400 Subject: [PATCH 27/43] Add internal link opportunities example --- .../find_internal_link_opportunites.ipynb | 509 ++++++++++++++++++ 1 file changed, 509 insertions(+) create mode 100644 examples/find_internal_link_opportunites/find_internal_link_opportunites.ipynb diff --git a/examples/find_internal_link_opportunites/find_internal_link_opportunites.ipynb b/examples/find_internal_link_opportunites/find_internal_link_opportunites.ipynb new file mode 100644 index 00000000..d6168878 --- /dev/null +++ b/examples/find_internal_link_opportunites/find_internal_link_opportunites.ipynb @@ -0,0 +1,509 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import datetime\n", + "import time\n", + "from firecrawl import FirecrawlApp\n", + "import json\n", + "import anthropic\n", + "from dotenv import load_dotenv\n", + "\n", + "# Load environment variables\n", + "load_dotenv()\n", + "\n", + "# Retrieve API keys from environment variables\n", + "anthropic_api_key = os.getenv(\"ANTHROPIC_API_KEY\") or \"\"\n", + "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\") or \"\"\n", + "# Set variables\n", + 
"blog_url=\"https://mendable.ai/blog\"\n", + "\n", + "# Set up anthropic client\n", + "client = anthropic.Anthropic(\n", + " api_key=anthropic_api_key,\n", + ")\n", + "\n", + "# Initialize the FirecrawlApp with your API key\n", + "app = FirecrawlApp(api_key=firecrawl_api_key)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Crawl a website\n", + "params = {\n", + " 'crawlOptions': {\n", + " 'limit': 100\n", + " },\n", + " \"pageOptions\": {\n", + " \"onlyMainContent\": True\n", + " }\n", + "}\n", + "crawl_result = app.crawl_url(blog_url, params=params)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting potential links from crawl_result:\n", + "Collected 36 potential links:\n", + "URL: https://mendable.ai/blog/coachgtm-mongodb, Title: Meet MongoDBs CoachGTM.ai\n", + "URL: https://mendable.ai/blog/building-safe-rag, Title: Building Safe RAG systems with the LLM OWASP top 10\n", + "URL: https://mendable.ai/blog/gdpr-repository-pattern, Title: Navigating the Maze of GDPR Compliance: A Codebase Transformation\n", + "URL: https://mendable.ai/blog/how-mendable-leverages-langsmith-to-debug-tools-and-actions, Title: How Mendable leverages Langsmith to debug Tools & Actions\n", + "URL: https://mendable.ai/blog/european-data-storage, Title: Launching European Data Storage powered by MongoDB\n", + "URL: https://mendable.ai/blog/tools, Title: Introducing Tools and Actions\n", + "URL: https://mendable.ai/blog/december_update, Title: Mendable.ai December Recap\n", + "URL: https://mendable.ai/blog/november_update, Title: Mendable.ai November Update\n", + "URL: https://mendable.ai/blog/october-recap, Title: Mendable.ai October Recap\n", + "URL: https://mendable.ai/blog/midseptemberupdate, Title: Mendable.ai Mid September 2023 Update\n", + "URL: https://mendable.ai/blog/getting-started, Title: Everything you need to know about Mendable: Build and deploy AI Chat Search\n", + "URL: https://mendable.ai/blog/building-copilots, Title: Building context-aware AI copilots with Mendable\n", + "URL: https://mendable.ai/blog/august2023update, Title: Mendable.ai August 2023 Updates\n", + "URL: https://mendable.ai/blog/finetuning-gpt35, Title: Early Insights Fine-Tuning GPT 3.5 from Mendable.ai\n", + "URL: https://mendable.ai/blog/gpt35prompting, Title: Improving GPT-3.5, Insights from Mendable.ai\n", + "URL: https://mendable.ai/blog/precisemode, Title: Introducing Precise Mode for Mendable.ai\n", + "URL: https://mendable.ai/blog/customprompt, Title: Customizing Your LLM Model on Mendable.ai\n", + "URL: https://mendable.ai/blog/mendable-launch, Title: Introducing Mendable.ai\n", + "URL: https://mendable.ai/blog/european-data-storage, Title: Launching European Data Storage powered by MongoDB\n", + "URL: https://mendable.ai/blog/customprompt, Title: Customizing Your LLM Model on Mendable.ai\n", + "URL: https://mendable.ai/blog/precisemode, Title: Introducing Precise Mode for Mendable.ai\n", + "URL: https://mendable.ai/blog/building-copilots, Title: Building context-aware AI copilots with Mendable\n", + "URL: https://mendable.ai/blog/coachgtm-mongodb, Title: Meet MongoDBs CoachGTM.ai\n", + "URL: https://mendable.ai/blog/building-safe-rag, Title: Building Safe RAG systems with the LLM OWASP top 10\n", + "URL: https://mendable.ai/blog/gdpr-repository-pattern, Title: Navigating the Maze of GDPR Compliance: A Codebase Transformation\n", + 
"URL: https://mendable.ai/blog/how-mendable-leverages-langsmith-to-debug-tools-and-actions, Title: How Mendable leverages Langsmith to debug Tools & Actions\n", + "URL: https://mendable.ai/blog/tools, Title: Introducing Tools and Actions\n", + "URL: https://mendable.ai/blog/december_update, Title: Mendable.ai December Recap\n", + "URL: https://mendable.ai/blog/november_update, Title: Mendable.ai November Update\n", + "URL: https://mendable.ai/blog/october-recap, Title: Mendable.ai October Recap\n", + "URL: https://mendable.ai/blog/midseptemberupdate, Title: Mendable.ai Mid September 2023 Update\n", + "URL: https://mendable.ai/blog/getting-started, Title: Everything you need to know about Mendable: Build and deploy AI Chat Search\n", + "URL: https://mendable.ai/blog/august2023update, Title: Mendable.ai August 2023 Updates\n", + "URL: https://mendable.ai/blog/finetuning-gpt35, Title: Early Insights Fine-Tuning GPT 3.5 from Mendable.ai\n", + "URL: https://mendable.ai/blog/gpt35prompting, Title: Improving GPT-3.5, Insights from Mendable.ai\n", + "URL: https://mendable.ai/blog/mendable-launch, Title: Introducing Mendable.ai\n" + ] + } + ], + "source": [ + "potential_links = []\n", + "\n", + "if crawl_result:\n", + " print(\"Collecting potential links from crawl_result:\")\n", + " \n", + " for item in crawl_result:\n", + " metadata = item[\"metadata\"]\n", + " og_url = metadata.get(\"ogUrl\")\n", + " title = metadata.get(\"title\")\n", + " if og_url and title and og_url != blog_url:\n", + " potential_links.append({\"url\": og_url, \"title\": title})\n", + " \n", + " print(f\"Collected {len(potential_links)} potential links:\")\n", + " for link in potential_links:\n", + " print(f\"URL: {link['url']}, Title: {link['title']}\")\n", + " \n", + "else:\n", + " print(\"crawl_result is empty or None\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Suggestion for: Meet MongoDBs CoachGTM.ai\n", + "Blog phrase: Mendable also provides a Tools\n", + "Internal Link: https://mendable.ai/blog/tools\n", + "---\n", + "\n", + "\n", + "Suggestion for: Meet MongoDBs CoachGTM.ai\n", + "Blog phrase: MongoDB Atlas Vector Search to\n", + "Internal Link: https://mendable.ai/blog/european-data-storage\n", + "---\n", + "\n", + "\n", + "Suggestion for: Meet MongoDBs CoachGTM.ai\n", + "Blog phrase: By harnessing the power of\n", + "Internal Link: https://mendable.ai/blog/building-copilots\n", + "---\n", + "\n", + "\n", + "Suggestion for: Building Safe RAG systems with the LLM OWASP top 10\n", + "Blog phrase: Advantages of RAG\n", + "Internal Link: https://mendable.ai/blog/building-copilots\n", + "---\n", + "\n", + "\n", + "Suggestion for: Building Safe RAG systems with the LLM OWASP top 10\n", + "Blog phrase: Bring Your Model\n", + "Internal Link: https://mendable.ai/blog/customprompt\n", + "---\n", + "\n", + "\n", + "Suggestion for: Building Safe RAG systems with the LLM OWASP top 10\n", + "Blog phrase: Garbage in, Garbage out\n", + "Internal Link: https://mendable.ai/blog/precisemode\n", + "---\n", + "\n", + "\n", + "Suggestion for: Navigating the Maze of GDPR Compliance: A Codebase Transformation\n", + "Blog phrase: European data storage\n", + "Internal Link: https://mendable.ai/blog/european-data-storage\n", + "---\n", + "\n", + "\n", + "Suggestion for: Navigating the Maze of GDPR Compliance: A Codebase Transformation\n", + "Blog phrase: delivering value\n", + "Internal Link: 
https://mendable.ai/blog/getting-started\n", + "---\n", + "\n", + "\n", + "Suggestion for: How Mendable leverages Langsmith to debug Tools & Actions\n", + "Blog phrase: introduction of Tools & Actions\n", + "Internal Link: https://mendable.ai/blog/tools\n", + "---\n", + "\n", + "\n", + "Suggestion for: How Mendable leverages Langsmith to debug Tools & Actions\n", + "Blog phrase: Mendable Tools & Actions\n", + "Internal Link: https://mendable.ai/blog/tools\n", + "---\n", + "\n", + "\n", + "Suggestion for: Launching European Data Storage powered by MongoDB\n", + "Blog phrase: Clean Architecture and Repository pattern\n", + "Internal Link: https://mendable.ai/blog/gdpr-repository-pattern\n", + "---\n", + "\n", + "\n", + "Suggestion for: Launching European Data Storage powered by MongoDB\n", + "Blog phrase: building the best AI Chat\n", + "Internal Link: https://mendable.ai/blog/building-copilots\n", + "---\n", + "\n", + "\n", + "Suggestion for: Launching European Data Storage powered by MongoDB\n", + "Blog phrase: European RAG pipeline, powered by\n", + "Internal Link: https://mendable.ai/blog/building-safe-rag\n", + "---\n", + "\n", + "\n", + "Suggestion for: Introducing Tools and Actions\n", + "Blog phrase: augmentation and actions for automation\n", + "Internal Link: https://mendable.ai/blog/building-copilots\n", + "---\n", + "\n", + "\n", + "Suggestion for: Introducing Tools and Actions\n", + "Blog phrase: Mendable provides an API request\n", + "Internal Link: https://mendable.ai/blog/getting-started\n", + "---\n", + "\n", + "\n", + "Suggestion for: Introducing Tools and Actions\n", + "Blog phrase: AI use it when it\n", + "Internal Link: https://mendable.ai/blog/how-mendable-leverages-langsmith-to-debug-tools-and-actions\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai December Recap\n", + "Blog phrase: customizing the model\n", + "Internal Link: https://mendable.ai/blog/customprompt\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai December Recap\n", + "Blog phrase: AI sales copilot\n", + "Internal Link: https://mendable.ai/blog/building-copilots\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai December Recap\n", + "Blog phrase: Introducing Tools and Actions\n", + "Internal Link: https://mendable.ai/blog/tools\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai November Update\n", + "Blog phrase: Auto syncing data sources\n", + "Internal Link: https://mendable.ai/blog/european-data-storage\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai November Update\n", + "Blog phrase: Chat insights feature\n", + "Internal Link: https://mendable.ai/blog/tools\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai November Update\n", + "Blog phrase: Github private repo support\n", + "Internal Link: https://mendable.ai/blog/getting-started\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai October Recap\n", + "Blog phrase: Full Prompt Customization\n", + "Internal Link: https://mendable.ai/blog/customprompt\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai October Recap\n", + "Blog phrase: Expanded Model Support\n", + "Internal Link: https://mendable.ai/blog/gpt35prompting\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai October Recap\n", + "Blog phrase: AI-Powered Documentation Management\n", + "Internal Link: https://mendable.ai/blog/building-copilots\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai Mid September 2023 Update\n", + "Blog phrase: new integration templates\n", + "Internal 
Link: https://mendable.ai/blog/tools\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai Mid September 2023 Update\n", + "Blog phrase: Product Copilot feature\n", + "Internal Link: https://mendable.ai/blog/building-copilots\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai Mid September 2023 Update\n", + "Blog phrase: Data Exporting\n", + "Internal Link: https://mendable.ai/blog/getting-started\n", + "---\n", + "\n", + "\n", + "Suggestion for: Everything you need to know about Mendable: Build and deploy AI Chat Search\n", + "Blog phrase: robust API\n", + "Internal Link: https://mendable.ai/blog/tools\n", + "---\n", + "\n", + "\n", + "Suggestion for: Everything you need to know about Mendable: Build and deploy AI Chat Search\n", + "Blog phrase: pre-built components\n", + "Internal Link: https://mendable.ai/blog/building-copilots\n", + "---\n", + "\n", + "\n", + "Suggestion for: Everything you need to know about Mendable: Build and deploy AI Chat Search\n", + "Blog phrase: Customizing Your LLM Model\n", + "Internal Link: https://mendable.ai/blog/customprompt\n", + "---\n", + "\n", + "\n", + "Suggestion for: Building context-aware AI copilots with Mendable\n", + "Blog phrase: registered on our platform\n", + "Internal Link: https://mendable.ai/blog/getting-started\n", + "---\n", + "\n", + "\n", + "Suggestion for: Building context-aware AI copilots with Mendable\n", + "Blog phrase: dynamic context to the AI\n", + "Internal Link: https://mendable.ai/blog/customprompt\n", + "---\n", + "\n", + "\n", + "Suggestion for: Building context-aware AI copilots with Mendable\n", + "Blog phrase: personalized answers to your users\n", + "Internal Link: https://mendable.ai/blog/precisemode\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai August 2023 Updates\n", + "Blog phrase: Learn more about how to\n", + "Internal Link: https://mendable.ai/blog/precisemode\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai August 2023 Updates\n", + "Blog phrase: Building context-aware AI copilots with\n", + "Internal Link: https://mendable.ai/blog/building-copilots\n", + "---\n", + "\n", + "\n", + "Suggestion for: Mendable.ai August 2023 Updates\n", + "Blog phrase: customizable AI chat components\n", + "Internal Link: https://mendable.ai/blog/getting-started\n", + "---\n", + "\n", + "\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[15], line 57\u001b[0m\n\u001b[1;32m 27\u001b[0m prompt_instructions \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;124mGiven this blog post from \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcurrent_blog_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m called \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcurrent_blog_title\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, analyze the following blog content. Identify 0 to 3 of phrases (5 words max) from the inside of the middle of the article that could be linked to other blog posts from the list of potential links provided inside of . 
Return a JSON object structured as follows: ...\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n",
+    "... [traceback truncated: ~30 frames of anthropic / httpx / httpcore / ssl client internals, interrupted while waiting on client.messages.create] ...\n",
+    "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+   ]
+  }
+ ],
+ "source": [
+    "import json\n",
+    "import csv\n",
+    "\n",
+    "# Assuming we have the following variables from the previous code:\n",
+    "# crawl_result, client, potential_links\n",
+    "\n",
+    "# Convert potential_links to a JSON string\n",
+    "potential_links_json = json.dumps(potential_links, indent=2)\n",
+    "\n",
+    "# Prepare CSV file\n",
+    "csv_filename = \"link_suggestions.csv\"\n",
+    "csv_headers = [\"Source Blog Title\", \"Source Blog URL\", \"Target Phrase\", \"Suggested Link URL\"]\n",
+    "\n",
+    "# Write headers to the CSV file\n",
+    "with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:\n",
+    "    csvwriter = csv.writer(csvfile)\n",
+    "    csvwriter.writerow(csv_headers)\n",
+    "\n",
+    "# Loop through each blog post's content\n",
+    "for item in crawl_result:\n",
+    "    current_blog_url = item[\"metadata\"].get(\"ogUrl\", \"\")\n",
+    "    if current_blog_url == blog_url:\n",
+    "        continue\n",
+    "    current_blog_content = item[\"content\"]\n",
+    "    current_blog_title = item[\"metadata\"].get(\"title\", \"\")\n",
+    "\n",
+    "    prompt_instructions = f\"\"\"Given this blog post from {current_blog_url} called '{current_blog_title}', analyze the following blog content. Identify 0 to 3 phrases (5 words max) from the middle of the article that could be linked to other blog posts from the list of potential links provided inside of <potential_links>. Return a JSON object structured as follows:\n",
+    "\n",
+    "    {{\n",
+    "      \"link_suggestions\": [\n",
+    "        {{\n",
+    "          \"target_phrase\": \"the EXACT phrase from the <blog_content> to be linked to one of the links in <potential_links> (5 words max)\",\n",
+    "          \"suggested_link_url\": \"url of the suggested internal link from <potential_links>\",\n",
+    "        }}\n",
+    "      ],\n",
+    "      \"metadata\": {{\n",
+    "        \"source_blog_url\": \"{current_blog_url}\",\n",
+    "        \"source_blog_title\": \"{current_blog_title}\",\n",
+    "      }}\n",
+    "    }}\n",
+    "\n",
+    "    Ensure that you provide the EXACT phrase from <blog_content> in target_phrase (5 words max) to locate each suggestion in the blog content without using character positions. Your target phrases must NOT be a title!\n",
+    "\n",
+    "    Blog Content:\n",
+    "    <blog_content>\n",
+    "    {current_blog_content}\n",
+    "    </blog_content>\n",
+    "\n",
+    "    Potential Links:\n",
+    "    <potential_links>\n",
+    "    {potential_links_json}\n",
+    "    </potential_links>\n",
+    "\n",
+    "    GO AND ONLY RETURN THE JSON NOTHING ELSE:\"\"\"\n",
+    "\n",
+    "    try:\n",
+    "        message = client.messages.create(\n",
+    "            model=\"claude-3-5-sonnet-20240620\",\n",
+    "            max_tokens=1024,\n",
+    "            messages=[\n",
+    "                {\"role\": \"user\", \"content\": prompt_instructions}\n",
+    "            ]\n",
+    "        )\n",
+    "\n",
+    "        # Extract the JSON string from the TextBlock\n",
+    "        json_string = message.content[0].text\n",
+    "\n",
+    "        # Parse the JSON response\n",
+    "        response_json = json.loads(json_string)\n",
+    "\n",
+    "        # Print and record each suggestion\n",
+    "        for suggestion in response_json['link_suggestions']:\n",
+    "            print(\"Suggestion for: \" + current_blog_title)\n",
+    "            print(\"Blog phrase: \" + suggestion['target_phrase'])\n",
+    "            print(\"Internal Link: \" + suggestion['suggested_link_url'])\n",
+    "            print(\"---\\n\\n\")\n",
+    "\n",
+    "            # Open the CSV file in append mode and write the new row\n",
+    "            with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:\n",
+    "                csvwriter = csv.writer(csvfile)\n",
+    "                csvwriter.writerow([\n",
+    "                    response_json['metadata']['source_blog_title'],\n",
+    "                    response_json['metadata']['source_blog_url'],\n",
+    "                    suggestion['target_phrase'],\n",
+    "                    suggestion['suggested_link_url'],\n",
+    "                ])\n",
+    "\n",
+    "    except json.JSONDecodeError:\n",
+    "        print(f\"Error parsing JSON response for blog {current_blog_title}\")\n",
+    "        print(\"Raw response:\", message.content)\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error processing blog {current_blog_title}: {str(e)}\")\n",
+    "\n",
+    "\n",
+    "print(f\"Finished processing all blog posts. Results saved to {csv_filename}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 173f4ee1bf0b75ff7582a0fa1756aa35eba27051 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 23 Aug 2024 20:09:59 -0300
Subject: [PATCH 28/43] Nick: chrome cdp main | simple autoscaler

---
 .github/workflows/autoscale.yml               |  36 ++++++
 apps/api/src/controllers/admin/queue.ts       | 115 ++++++++++++++----
 apps/api/src/routes/admin.ts                  |   6 +
 apps/api/src/scraper/WebScraper/single_url.ts |   4 +-
 4 files changed, 135 insertions(+), 26 deletions(-)
 create mode 100644 .github/workflows/autoscale.yml

diff --git a/.github/workflows/autoscale.yml b/.github/workflows/autoscale.yml
new file mode 100644
index 00000000..6ab7ca74
--- /dev/null
+++ b/.github/workflows/autoscale.yml
@@ -0,0 +1,36 @@
+name: Simple Autoscaler
+on:
+  schedule:
+    - cron: '*/5 * * * *' # GitHub Actions cron has minute granularity; every 5 minutes is the shortest supported interval
+
+env:
+  BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
+  FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
+
+jobs:
+  scale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: superfly/flyctl-actions/setup-flyctl@master
+      - name: Send GET request to check queues
+        run: |
+          response=$(curl --silent --max-time 180 https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/autoscaler)
+          http_code=$(echo "$response" | jq -r '.status_code')
+
+          mode=$(echo "$response" | jq -r '.mode')
+          count=$(echo "$response" | jq -r '.count')
+
+          echo "Mode: $mode"
+          echo "Count: $count"
+
+          if [ "$mode" = "scale-descale" ]; then
+            flyctl scale count $count -c fly.staging.toml --process-group=worker --yes
+            echo "Scaled to $count machines."
+          else
+            echo "No scaling needed. 
Mode: $mode" + fi + env: + FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} + BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} + working-directory: apps/api diff --git a/apps/api/src/controllers/admin/queue.ts b/apps/api/src/controllers/admin/queue.ts index 095e7ca7..729ea004 100644 --- a/apps/api/src/controllers/admin/queue.ts +++ b/apps/api/src/controllers/admin/queue.ts @@ -4,6 +4,7 @@ import { Job } from "bullmq"; import { Logger } from "../../lib/logger"; import { getScrapeQueue } from "../../services/queue-service"; import { checkAlerts } from "../../services/alerts"; +import { exec } from "node:child_process"; export async function cleanBefore24hCompleteJobsController( req: Request, @@ -54,34 +55,100 @@ export async function cleanBefore24hCompleteJobsController( } } - export async function checkQueuesController(req: Request, res: Response) { - try { - await checkAlerts(); - return res.status(200).send("Alerts initialized"); - } catch (error) { - Logger.debug(`Failed to initialize alerts: ${error}`); - return res.status(500).send("Failed to initialize alerts"); - } + try { + await checkAlerts(); + return res.status(200).send("Alerts initialized"); + } catch (error) { + Logger.debug(`Failed to initialize alerts: ${error}`); + return res.status(500).send("Failed to initialize alerts"); } +} - // Use this as a "health check" that way we dont destroy the server +// Use this as a "health check" that way we dont destroy the server export async function queuesController(req: Request, res: Response) { - try { - const scrapeQueue = getScrapeQueue(); + try { + const scrapeQueue = getScrapeQueue(); - const [webScraperActive] = await Promise.all([ - scrapeQueue.getActiveCount(), - ]); + const [webScraperActive] = await Promise.all([ + scrapeQueue.getActiveCount(), + ]); - const noActiveJobs = webScraperActive === 0; - // 200 if no active jobs, 503 if there are active jobs - return res.status(noActiveJobs ? 200 : 500).json({ - webScraperActive, - noActiveJobs, - }); - } catch (error) { - Logger.error(error); - return res.status(500).json({ error: error.message }); + const noActiveJobs = webScraperActive === 0; + // 200 if no active jobs, 503 if there are active jobs + return res.status(noActiveJobs ? 
+      webScraperActive,
+      noActiveJobs,
+    });
+  } catch (error) {
+    Logger.error(error);
+    return res.status(500).json({ error: error.message });
+  }
+}
+
+export async function autoscalerController(req: Request, res: Response) {
+  try {
+    const maxNumberOfMachines = 100;
+    const minNumberOfMachines = 20;
+
+    const scrapeQueue = getScrapeQueue();
+
+    const [webScraperActive, webScraperWaiting, webScraperPriority] = await Promise.all([
+      scrapeQueue.getActiveCount(),
+      scrapeQueue.getWaitingCount(),
+      scrapeQueue.getPrioritizedCount(),
+    ]);
+
+    let waitingAndPriorityCount = webScraperWaiting + webScraperPriority;
+
+    // get number of machines active
+    const request = await fetch('https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines',
+      {
+        headers: {
+          'Authorization': `Bearer ${process.env.FLY_API_TOKEN}`
+        }
+      }
+    )
+    const machines = await request.json();
+    const activeMachines = machines.filter(machine => machine.state === 'started' || machine.state === "starting").length;
+
+    let targetMachineCount = activeMachines;
+
+    const baseScaleUp = 10;
+    const baseScaleDown = 5;
+
+    // Scale up logic
+    if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) {
+      targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + (baseScaleUp * 3));
+    } else if (webScraperActive > 5000 || waitingAndPriorityCount > 1000) {
+      targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + (baseScaleUp * 2));
+    } else if (webScraperActive > 1000 || waitingAndPriorityCount > 500) {
+      targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + baseScaleUp);
     }
-  }
\ No newline at end of file
+
+    // Scale down logic
+    if (webScraperActive < 100 && waitingAndPriorityCount < 50) {
+      targetMachineCount = Math.max(minNumberOfMachines, activeMachines - (baseScaleDown * 3));
+    } else if (webScraperActive < 500 && waitingAndPriorityCount < 200) {
+      targetMachineCount = Math.max(minNumberOfMachines, activeMachines - (baseScaleDown * 2));
+    } else if (webScraperActive < 1000 && waitingAndPriorityCount < 500) {
+      targetMachineCount = Math.max(minNumberOfMachines, activeMachines - baseScaleDown);
+    }
+
+    if (targetMachineCount !== activeMachines) {
+      Logger.info(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`);
+      return res.status(200).json({
+        mode: "scale-descale",
+        count: targetMachineCount,
+      });
+    }
+
+    return res.status(200).json({
+      mode: "normal",
+      count: activeMachines,
+    });
+  } catch (error) {
+    Logger.error(error);
+    return res.status(500).send("Failed to initialize autoscaler");
+  }
+}
diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts
index 77d1bf46..d32808ce 100644
--- a/apps/api/src/routes/admin.ts
+++ b/apps/api/src/routes/admin.ts
@@ -1,6 +1,7 @@
 import express from "express";
 import { redisHealthController } from "../controllers/admin/redis-health";
 import {
+  autoscalerController,
   checkQueuesController,
   cleanBefore24hCompleteJobsController,
   queuesController,
@@ -27,3 +28,8 @@ adminRouter.get(
   `/admin/${process.env.BULL_AUTH_KEY}/queues`,
   queuesController
 );
+
+adminRouter.get(
+  `/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
+  autoscalerController
+);
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 1f2a62de..6998a665 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -24,8 +24,8 @@ import { clientSideError } from "../../strings";
 dotenv.config();
 
 export const baseScrapers = [
-  "fire-engine",
   "fire-engine;chrome-cdp",
+  "fire-engine",
   "scrapingBee",
   process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
   "scrapingBeeLoad",
@@ -85,8 +85,8 @@ function getScrapingFallbackOrder(
   });
 
   let defaultOrder = [
-    !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
     !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp",
+    !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
     "scrapingBee",
     process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
     "scrapingBeeLoad",

From 0dc592b3e420f4979f173fadfd4f3e01769ce53a Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 23 Aug 2024 20:11:30 -0300
Subject: [PATCH 29/43] Update autoscale.yml

---
 .github/workflows/autoscale.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/autoscale.yml b/.github/workflows/autoscale.yml
index 6ab7ca74..189a1755 100644
--- a/.github/workflows/autoscale.yml
+++ b/.github/workflows/autoscale.yml
@@ -1,7 +1,7 @@
 name: Simple Autoscaler
 on:
   schedule:
-    - cron: '*/0.5 * * * *'
+    - cron: '*/1 * * * *'
 
 env:
   BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}

From b23bf2eef9a00673d22b01273a00383ccd235ee4 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 23 Aug 2024 20:14:35 -0300
Subject: [PATCH 30/43] Update autoscale.yml

---
 .github/workflows/autoscale.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/autoscale.yml b/.github/workflows/autoscale.yml
index 189a1755..694c8cac 100644
--- a/.github/workflows/autoscale.yml
+++ b/.github/workflows/autoscale.yml
@@ -1,7 +1,7 @@
 name: Simple Autoscaler
 on:
   schedule:
-    - cron: '*/1 * * * *'
+    - cron: '* * * * *'
 
 env:
   BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}

From 28d7a637c21fd58908da37aba7c961e562e46923 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 23 Aug 2024 22:07:49 -0300
Subject: [PATCH 31/43] Update queue.ts

---
 apps/api/src/controllers/admin/queue.ts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/apps/api/src/controllers/admin/queue.ts b/apps/api/src/controllers/admin/queue.ts
index 729ea004..8ce12942 100644
--- a/apps/api/src/controllers/admin/queue.ts
+++ b/apps/api/src/controllers/admin/queue.ts
@@ -115,7 +115,8 @@ export async function autoscalerController(req: Request, res: Response) {
     let targetMachineCount = activeMachines;
 
     const baseScaleUp = 10;
-    const baseScaleDown = 5;
+    // Slow scale down
+    const baseScaleDown = 2;
 
     // Scale up logic
     if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) {

From 8e78511ed49a1b5a69c52117501943a3f53d1179 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 23 Aug 2024 22:15:47 -0300
Subject: [PATCH 32/43] Update queue.ts

---
 apps/api/src/controllers/admin/queue.ts | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/apps/api/src/controllers/admin/queue.ts b/apps/api/src/controllers/admin/queue.ts
index 8ce12942..43bf2e98 100644
--- a/apps/api/src/controllers/admin/queue.ts
+++ b/apps/api/src/controllers/admin/queue.ts
@@ -110,7 +110,9 @@ export async function autoscalerController(req: Request, res: Response) {
       }
     )
     const machines = await request.json();
-    const activeMachines = machines.filter(machine => machine.state === 'started' || machine.state === "starting").length;
+
+    // Only worker machines
+    const activeMachines = machines.filter(machine => (machine.state === 'started' || machine.state === "starting") && machine.config.env["FLY_PROCESS_GROUP"] === "worker").length;
 
     let targetMachineCount = activeMachines;

From b9e06e27f400d2f7fdfdec6996398897722f24ad Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 23 Aug 2024 22:17:27 -0300
Subject: [PATCH 33/43] Update queue.ts

---
 apps/api/src/controllers/admin/queue.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/controllers/admin/queue.ts b/apps/api/src/controllers/admin/queue.ts
index 43bf2e98..f3244d2c 100644
--- a/apps/api/src/controllers/admin/queue.ts
+++ b/apps/api/src/controllers/admin/queue.ts
@@ -88,7 +88,7 @@ export async function queuesController(req: Request, res: Response) {
 
 export async function autoscalerController(req: Request, res: Response) {
   try {
-    const maxNumberOfMachines = 100;
+    const maxNumberOfMachines = 80;
     const minNumberOfMachines = 20;
 
     const scrapeQueue = getScrapeQueue();
@@ -110,7 +110,7 @@ export async function autoscalerController(req: Request, res: Response) {
       }
     )
     const machines = await request.json();
-    
+
     // Only worker machines
     const activeMachines = machines.filter(machine => (machine.state === 'started' || machine.state === "starting") && machine.config.env["FLY_PROCESS_GROUP"] === "worker").length;
 
     let targetMachineCount = activeMachines;

From d87b62fed903d3c7642bf353c1e7026d3d004d8d Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 23 Aug 2024 22:33:17 -0300
Subject: [PATCH 34/43] Nick:

---
 apps/api/src/controllers/admin/queue.ts | 7 +++++++
 apps/api/src/services/alerts/slack.ts   | 4 ++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/controllers/admin/queue.ts b/apps/api/src/controllers/admin/queue.ts
index f3244d2c..2923ebba 100644
--- a/apps/api/src/controllers/admin/queue.ts
+++ b/apps/api/src/controllers/admin/queue.ts
@@ -5,6 +5,7 @@ import { Logger } from "../../lib/logger";
 import { getScrapeQueue } from "../../services/queue-service";
 import { checkAlerts } from "../../services/alerts";
 import { exec } from "node:child_process";
+import { sendSlackWebhook } from "../../services/alerts/slack";
 
 export async function cleanBefore24hCompleteJobsController(
   req: Request,
@@ -140,6 +141,12 @@ export async function autoscalerController(req: Request, res: Response) {
 
     if (targetMachineCount !== activeMachines) {
       Logger.info(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`);
+
+      if(targetMachineCount > activeMachines) {
+        sendSlackWebhook("🐂 Scaling up to " + targetMachineCount + " machines", false, process.env.SLACK_AUTOSCALER ?? "");
+      } else {
+        sendSlackWebhook("🐂 Scaling down to " + targetMachineCount + " machines", false, process.env.SLACK_AUTOSCALER ?? "");
+      }
       return res.status(200).json({
         mode: "scale-descale",
         count: targetMachineCount,
diff --git a/apps/api/src/services/alerts/slack.ts b/apps/api/src/services/alerts/slack.ts
index 96bf1c09..0fa75693 100644
--- a/apps/api/src/services/alerts/slack.ts
+++ b/apps/api/src/services/alerts/slack.ts
@@ -3,9 +3,9 @@ import { Logger } from "../../../src/lib/logger";
 
 export async function sendSlackWebhook(
   message: string,
-  alertEveryone: boolean = false
+  alertEveryone: boolean = false,
+  webhookUrl: string = process.env.SLACK_WEBHOOK_URL ?? ""
 ) {
-  const webhookUrl = process.env.SLACK_WEBHOOK_URL;
   const messagePrefix = alertEveryone ? "<!channel> " : "";
" " : ""; const payload = { text: `${messagePrefix} ${message}`, From b80277d4de1e9101482d4ec3856cfed05d1ed95e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 23 Aug 2024 22:46:44 -0300 Subject: [PATCH 35/43] Update queue.ts --- apps/api/src/controllers/admin/queue.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/controllers/admin/queue.ts b/apps/api/src/controllers/admin/queue.ts index 2923ebba..6a46cfec 100644 --- a/apps/api/src/controllers/admin/queue.ts +++ b/apps/api/src/controllers/admin/queue.ts @@ -113,7 +113,7 @@ export async function autoscalerController(req: Request, res: Response) { const machines = await request.json(); // Only worker machines - const activeMachines = machines.filter(machine => (machine.state === 'started' || machine.state === "starting") && machine.config.env["FLY_PROCESS_GROUP"] === "worker").length; + const activeMachines = machines.filter(machine => (machine.state === 'started' || machine.state === "starting" || machine.state === "replacing") && machine.config.env["FLY_PROCESS_GROUP"] === "worker").length; let targetMachineCount = activeMachines; @@ -143,9 +143,9 @@ export async function autoscalerController(req: Request, res: Response) { Logger.info(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`); if(targetMachineCount > activeMachines) { - sendSlackWebhook("🐂 Scaling up to " + targetMachineCount + " machines", false, process.env.SLACK_AUTOSCALER ?? ""); + sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`, false, process.env.SLACK_AUTOSCALER ?? ""); } else { - sendSlackWebhook("🐂 Scaling down to " + targetMachineCount + " machines", false, process.env.SLACK_AUTOSCALER ?? ""); + sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`, false, process.env.SLACK_AUTOSCALER ?? ""); } return res.status(200).json({ mode: "scale-descale", From 1f99bfd3c80df752a6ae0f0d2e505341e71b2255 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 23 Aug 2024 22:47:12 -0300 Subject: [PATCH 36/43] Update queue.ts --- apps/api/src/controllers/admin/queue.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/admin/queue.ts b/apps/api/src/controllers/admin/queue.ts index 6a46cfec..06844bea 100644 --- a/apps/api/src/controllers/admin/queue.ts +++ b/apps/api/src/controllers/admin/queue.ts @@ -143,9 +143,9 @@ export async function autoscalerController(req: Request, res: Response) { Logger.info(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`); if(targetMachineCount > activeMachines) { - sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`, false, process.env.SLACK_AUTOSCALER ?? ""); + sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, process.env.SLACK_AUTOSCALER ?? ""); } else { - sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`, false, process.env.SLACK_AUTOSCALER ?? 
""); + sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, process.env.SLACK_AUTOSCALER ?? ""); } return res.status(200).json({ mode: "scale-descale", From 4e196a9146b214b80476f7d143158b4bc4a2fe94 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 25 Aug 2024 01:48:51 -0300 Subject: [PATCH 37/43] Delete autoscale.yml --- .github/workflows/autoscale.yml | 36 --------------------------------- 1 file changed, 36 deletions(-) delete mode 100644 .github/workflows/autoscale.yml diff --git a/.github/workflows/autoscale.yml b/.github/workflows/autoscale.yml deleted file mode 100644 index 694c8cac..00000000 --- a/.github/workflows/autoscale.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: Simple Autoscaler -on: - schedule: - - cron: '* * * * *' - -env: - BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} - FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} - -jobs: - scale: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: superfly/flyctl-actions/setup-flyctl@master - - name: Send GET request to check queues - run: | - response=$(curl --silent --max-time 180 https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/autoscaler) - http_code=$(echo "$response" | jq -r '.status_code') - - mode=$(echo "$response" | jq -r '.mode') - count=$(echo "$response" | jq -r '.count') - - echo "Mode: $mode" - echo "Count: $count" - - if [ "$mode" = "scale-descale" ]; then - flyctl scale count $count -c fly.staging.toml --process-group=worker --yes - echo "Scaled to $count machines." - else - echo "No scaling needed. Mode: $mode" - fi - env: - FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} - BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} - working-directory: apps/api From 6f9a2687ae995d7ab43469bf3f74cf9008af7c80 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Sun, 25 Aug 2024 15:04:32 -0300 Subject: [PATCH 38/43] fixed turndown bug --- apps/api/src/lib/html-to-markdown.ts | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 233da921..002cb7be 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -1,5 +1,5 @@ -export function parseMarkdown(html: string) { +export async function parseMarkdown(html: string) { var TurndownService = require("turndown"); var turndownPluginGfm = require('joplin-turndown-plugin-gfm') @@ -21,7 +21,27 @@ export function parseMarkdown(html: string) { }); var gfm = turndownPluginGfm.gfm; turndownService.use(gfm); - let markdownContent = turndownService.turndown(html); + let markdownContent = ""; + const turndownPromise = new Promise((resolve, reject) => { + try { + const result = turndownService.turndown(html); + resolve(result); + } catch (error) { + reject("Error converting HTML to Markdown: " + error); + } + }); + + const timeoutPromise = new Promise((resolve, reject) => { + const timeout = 5000; // Timeout in milliseconds + setTimeout(() => reject("Conversion timed out after " + timeout + "ms"), timeout); + }); + + try { + markdownContent = await Promise.race([turndownPromise, timeoutPromise]); + } catch (error) { + console.error(error); + return ""; // Optionally return an empty string or handle the error as needed + } // multiple line links let insideLinkContent = false; From d591e0f51c6a2bb7b58b59618e0d75a435ce5d05 Mon Sep 17 
From: Gergo Moricz
Date: Sun, 25 Aug 2024 20:05:17 +0200
Subject: [PATCH 39/43] block corterix.com for performance issues

---
 apps/api/src/scraper/WebScraper/utils/blocklist.ts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
index 7b1ee19c..99eb6bd2 100644
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@@ -15,7 +15,8 @@ const socialMediaBlocklist = [
   'wechat.com',
   'telegram.org',
   'researchhub.com',
-  'youtube.com'
+  'youtube.com',
+  'corterix.com',
 ];
 
 const allowedKeywords = [

From 5606fe587068090bf1c7e55368b1e044e34f0b09 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 26 Aug 2024 16:05:11 -0300
Subject: [PATCH 40/43] Nick:

---
 apps/api/src/controllers/auth.ts      |  3 ++-
 apps/api/src/services/rate-limiter.ts | 14 +++++++++++++-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts
index ac60dc53..151733c0 100644
--- a/apps/api/src/controllers/auth.ts
+++ b/apps/api/src/controllers/auth.ts
@@ -222,7 +222,8 @@ export async function supaAuthenticateUser(
       rateLimiter = getRateLimiter(
         RateLimiterMode.Scrape,
         token,
-        subscriptionData.plan
+        subscriptionData.plan,
+        teamId
       );
       break;
     case RateLimiterMode.Search:
diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts
index cd923c4c..d96da069 100644
--- a/apps/api/src/services/rate-limiter.ts
+++ b/apps/api/src/services/rate-limiter.ts
@@ -84,16 +84,28 @@ export const testSuiteRateLimiter = new RateLimiterRedis({
   duration: 60, // Duration in seconds
 });
 
+export const devBRateLimiter = new RateLimiterRedis({
+  storeClient: redisRateLimitClient,
+  keyPrefix: "dev-b",
+  points: 1200,
+  duration: 60, // Duration in seconds
+});
+
 export function getRateLimiter(
   mode: RateLimiterMode,
   token: string,
-  plan?: string
+  plan?: string,
+  teamId?: string
 ) {
-
   if (token.includes("a01ccae") || token.includes("6254cf9") || token.includes("0f96e673")) {
     return testSuiteRateLimiter;
   }
 
+  if(teamId === process.env.DEV_B_TEAM_ID) {
+    return devBRateLimiter;
+  }
+
   const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5}
   if (!rateLimitConfig) return serverRateLimiter;

From fa7dc5b10bab443b4114af612505b4a178bfb0ad Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 26 Aug 2024 16:33:34 -0300
Subject: [PATCH 41/43] Update rate-limiter.ts

---
 apps/api/src/services/rate-limiter.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts
index d96da069..84078894 100644
--- a/apps/api/src/services/rate-limiter.ts
+++ b/apps/api/src/services/rate-limiter.ts
@@ -98,7 +98,7 @@ export function getRateLimiter(
   teamId?: string
 ) {
-  if (token.includes("a01ccae") || token.includes("6254cf9") || token.includes("0f96e673")) {
+  if (token.includes("a01ccae") || token.includes("6254cf9") || token.includes("0f96e673") || token.includes("23befa1b")) {
     return testSuiteRateLimiter;
   }

From 3850b000f06baf5302a2a16a4b99448c0c48f90b Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 27 Aug 2024 11:14:04 -0300
Subject: [PATCH 42/43] Nick: removing credit notification for now

---
 .../src/services/billing/credit_billing.ts | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts
index 2ad07318..19c17b48 100644
--- a/apps/api/src/services/billing/credit_billing.ts
+++ b/apps/api/src/services/billing/credit_billing.ts
@@ -317,21 +317,21 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
 
   // Compare the adjusted total credits used with the credits allowed by the plan
   if (adjustedCreditsUsed + credits > price.credits) {
-    await sendNotification(
-      team_id,
-      NotificationType.LIMIT_REACHED,
-      subscription.current_period_start,
-      subscription.current_period_end
-    );
+    // await sendNotification(
+    //   team_id,
+    //   NotificationType.LIMIT_REACHED,
+    //   subscription.current_period_start,
+    //   subscription.current_period_end
+    // );
     return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed };
   } else if (creditUsagePercentage >= 0.8) {
     // Send email notification for approaching credit limit
-    await sendNotification(
-      team_id,
-      NotificationType.APPROACHING_LIMIT,
-      subscription.current_period_start,
-      subscription.current_period_end
-    );
+    // await sendNotification(
+    //   team_id,
+    //   NotificationType.APPROACHING_LIMIT,
+    //   subscription.current_period_start,
+    //   subscription.current_period_end
+    // );
   }
 
   return { success: true, message: "Sufficient credits available", remainingCredits: creditLimit - adjustedCreditsUsed };

From 861e2ebdf1f4f38cec6c2f3b1b6e599647469128 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 27 Aug 2024 12:08:50 -0300
Subject: [PATCH 43/43] Nick: 2x rate limits

---
 apps/api/src/services/rate-limiter.ts | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts
index 84078894..8e4e9fc9 100644
--- a/apps/api/src/services/rate-limiter.ts
+++ b/apps/api/src/services/rate-limiter.ts
@@ -17,10 +17,23 @@ const RATE_LIMITS = {
     growthdouble: 50,
   },
   scrape: {
+    default: 20,
+    free: 10,
+    starter: 20,
+    standard: 100,
+    standardOld: 40,
+    scale: 500,
+    hobby: 20,
+    standardNew: 100,
+    standardnew: 100,
+    growth: 1000,
+    growthdouble: 1000,
+  },
+  search: {
     default: 20,
     free: 5,
     starter: 20,
-    standard: 50,
+    standard: 40,
     standardOld: 40,
     scale: 500,
     hobby: 10,
@@ -29,7 +42,7 @@ const RATE_LIMITS = {
     growth: 500,
     growthdouble: 500,
   },
-  search: {
+  map:{
     default: 20,
     free: 5,
     starter: 20,