From 68828a5b5ca74474a57737f5bcdcd596e7535547 Mon Sep 17 00:00:00 2001 From: jhoseph88 Date: Thu, 11 Jul 2024 19:30:18 -0400 Subject: [PATCH 01/23] Pass along current, total, current_step, and current_url in js sdk --- apps/js-sdk/firecrawl/build/index.js | 14 ++++++++++++-- apps/js-sdk/firecrawl/package-lock.json | 4 ++-- apps/js-sdk/firecrawl/package.json | 2 +- .../src/__tests__/e2e_withAuth/index.test.ts | 8 ++++++-- apps/js-sdk/firecrawl/src/index.ts | 12 ++++++++++++ 5 files changed, 33 insertions(+), 7 deletions(-) diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js index e54e5323..99de5e2b 100644 --- a/apps/js-sdk/firecrawl/build/index.js +++ b/apps/js-sdk/firecrawl/build/index.js @@ -157,8 +157,14 @@ export default class FirecrawlApp { return { success: true, status: response.data.status, + current: response.data.current, + current_url: response.data.current_url, + current_step: response.data.current_step, + total: response.data.total, data: response.data.data, - partial_data: !response.data.data ? response.data.partial_data : undefined, + partial_data: !response.data.data + ? response.data.partial_data + : undefined, }; } else { @@ -171,6 +177,10 @@ export default class FirecrawlApp { return { success: false, status: "unknown", + current: 0, + current_url: "", + current_step: "", + total: 0, error: "Internal server error.", }; }); @@ -180,7 +190,7 @@ export default class FirecrawlApp { * @returns {AxiosRequestHeaders} The prepared headers. */ prepareHeaders(idempotencyKey) { - return Object.assign({ 'Content-Type': 'application/json', 'Authorization': `Bearer ${this.apiKey}` }, (idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {})); + return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); } /** * Sends a POST request to the specified URL. 
diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index 906ab47b..f3045f33 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.28", + "version": "0.0.29", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "0.0.28", + "version": "0.0.29", "license": "MIT", "dependencies": { "axios": "^1.6.8", diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 71cf91a3..0fef67b0 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.28", + "version": "0.0.29", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", "types": "types/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts index af6aa84b..ad917de4 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts @@ -1,7 +1,7 @@ import FirecrawlApp from '../../index'; import { v4 as uuidv4 } from 'uuid'; import dotenv from 'dotenv'; - +import { describe, test, expect } from '@jest/globals'; dotenv.config(); @@ -9,7 +9,7 @@ const TEST_API_KEY = process.env.TEST_API_KEY; const API_URL = "http://127.0.0.1:3002"; describe('FirecrawlApp E2E Tests', () => { - test.concurrent('should throw error for no API key', () => { + test.concurrent('should throw error for no API key', async () => { expect(() => { new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); }).toThrow("No API key provided"); @@ -107,12 +107,16 @@ describe('FirecrawlApp E2E Tests', () => { while (statusResponse.status === 'active' && checks < maxChecks) { await new Promise(resolve => setTimeout(resolve, 1000)); 
expect(statusResponse.partial_data).not.toBeNull(); + expect(statusResponse.current).toBeGreaterThanOrEqual(1); statusResponse = await app.checkCrawlStatus(response.jobId); checks++; } expect(statusResponse).not.toBeNull(); + expect(statusResponse.success).toBe(true); expect(statusResponse.status).toBe('completed'); + expect(statusResponse.total).toEqual(statusResponse.current); + expect(statusResponse.current_step).not.toBeNull(); expect(statusResponse?.data?.length).toBeGreaterThan(0); }, 35000); // 35 seconds timeout diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 5ee90434..a42d4618 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -100,6 +100,10 @@ export interface CrawlResponse { export interface JobStatusResponse { success: boolean; status: string; + current?: number; + current_url?: string; + current_step?: string; + total?: number; jobId?: string; data?: FirecrawlDocument[]; partial_data?: FirecrawlDocument[]; @@ -287,6 +291,10 @@ export default class FirecrawlApp { return { success: true, status: response.data.status, + current: response.data.current, + current_url: response.data.current_url, + current_step: response.data.current_step, + total: response.data.total, data: response.data.data, partial_data: !response.data.data ? 
response.data.partial_data @@ -301,6 +309,10 @@ export default class FirecrawlApp { return { success: false, status: "unknown", + current: 0, + current_url: "", + current_step: "", + total: 0, error: "Internal server error.", }; } From 7f596e7a5513a9b357b0eafa21c321456e745f0a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 15 Jul 2024 23:01:40 -0400 Subject: [PATCH 02/23] Update README.md From 92202de12b994f8113bbe2aae09672fb86780e06 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 16 Jul 2024 10:09:49 -0400 Subject: [PATCH 03/23] Update rate-limiter.ts --- apps/api/src/services/rate-limiter.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index f02e6d06..3a1d353e 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -58,7 +58,7 @@ const RATE_LIMITS = { }; export const redisClient = redis.createClient({ - url: process.env.REDIS_URL, + url: process.env.REDIS_RATE_LIMIT_URL, legacyMode: true, }); From dba1fb2dc818c38c8b6bed9be56ad5b700476087 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 16 Jul 2024 18:22:56 -0300 Subject: [PATCH 04/23] Update removeUnwantedElements.ts --- .../WebScraper/utils/removeUnwantedElements.ts | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts b/apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts index 38e4c5a0..7962a4a0 100644 --- a/apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts +++ b/apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts @@ -8,7 +8,11 @@ export const removeUnwantedElements = ( ) => { const soup = cheerio.load(html); - if (pageOptions.onlyIncludeTags) { + if ( + pageOptions.onlyIncludeTags && + pageOptions.onlyIncludeTags.length > 0 && + pageOptions.onlyIncludeTags[0] !== '' + ) { if 
(typeof pageOptions.onlyIncludeTags === "string") { pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags]; } @@ -26,7 +30,11 @@ export const removeUnwantedElements = ( soup("script, style, iframe, noscript, meta, head").remove(); - if (pageOptions.removeTags) { + if ( + pageOptions.removeTags && + pageOptions.removeTags.length > 0 && + pageOptions.removeTags[0] !== '' + ) { if (typeof pageOptions.removeTags === "string") { pageOptions.removeTags = [pageOptions.removeTags]; } From d39d3be64938082b6fb19e367b1d852f7844c442 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Tue, 16 Jul 2024 18:38:03 -0700 Subject: [PATCH 05/23] Caleb: now extracting and returning a list of all links on the page for a customer --- apps/api/src/lib/entities.ts | 4 +- apps/api/src/scraper/WebScraper/single_url.ts | 44 +++++++++++++++++-- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 089d373c..f60e197f 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -89,7 +89,8 @@ export class Document { warning?: string; index?: number; - + linksOnPage?: string[]; // Add this new field as a separate property + constructor(data: Partial) { if (!data.content) { throw new Error("Missing required fields"); @@ -102,6 +103,7 @@ export class Document { this.markdown = data.markdown || ""; this.childrenLinks = data.childrenLinks || undefined; this.provider = data.provider || undefined; + this.linksOnPage = data.linksOnPage; // Assign linksOnPage if provided } } diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index d24e5c2e..0aef2577 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -109,6 +109,38 @@ function getScrapingFallbackOrder( return scrapersInOrder as (typeof baseScrapers)[number][]; } +function extractLinks(html: 
string, baseUrl: string): string[] { + const $ = cheerio.load(html); + const links: string[] = []; + + // Parse the base URL to get the origin + const urlObject = new URL(baseUrl); + const origin = urlObject.origin; + + $('a').each((_, element) => { + const href = $(element).attr('href'); + if (href) { + if (href.startsWith('http://') || href.startsWith('https://')) { + // Absolute URL, add as is + links.push(href); + } else if (href.startsWith('/')) { + // Relative URL starting with '/', append to origin + links.push(`${origin}${href}`); + } else if (!href.startsWith('#') && !href.startsWith('mailto:')) { + // Relative URL not starting with '/', append to base URL + links.push(`${baseUrl}/${href}`); + } else if (href.startsWith('mailto:')) { + // mailto: links, add as is + links.push(href); + } + // Fragment-only links (#) are ignored + } + }); + + // Remove duplicates and return + return [...new Set(links)]; +} + export async function scrapSingleUrl( urlToScrap: string, pageOptions: PageOptions = { @@ -234,7 +266,6 @@ export async function scrapSingleUrl( scraperResponse.text = customScrapedContent.html; screenshot = customScrapedContent.screenshot; } - //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); return { @@ -309,6 +340,10 @@ export async function scrapSingleUrl( const soup = cheerio.load(rawHtml); const metadata = extractMetadata(soup, urlToScrap); + let linksOnPage: string[] | undefined; + + linksOnPage = extractLinks(rawHtml, urlToScrap); + let document: Document; if (screenshot && screenshot.length > 0) { document = { @@ -317,9 +352,10 @@ export async function scrapSingleUrl( html: pageOptions.includeHtml ? html : undefined, rawHtml: pageOptions.includeRawHtml || - extractorOptions.mode === "llm-extraction-from-raw-html" + extractorOptions.mode === "llm-extraction-from-raw-html" ? 
rawHtml : undefined, + linksOnPage, metadata: { ...metadata, screenshot: screenshot, @@ -335,7 +371,7 @@ export async function scrapSingleUrl( html: pageOptions.includeHtml ? html : undefined, rawHtml: pageOptions.includeRawHtml || - extractorOptions.mode === "llm-extraction-from-raw-html" + extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined, metadata: { @@ -344,6 +380,7 @@ export async function scrapSingleUrl( pageStatusCode: pageStatusCode, pageError: pageError, }, + linksOnPage, }; } @@ -354,6 +391,7 @@ export async function scrapSingleUrl( content: "", markdown: "", html: "", + linksOnPage: [], metadata: { sourceURL: urlToScrap, pageStatusCode: pageStatusCode, From c9073a747c3c1eb328dc983748693a181759d658 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 16 Jul 2024 22:41:13 -0400 Subject: [PATCH 06/23] Nick: --- apps/api/package.json | 3 +- apps/api/pnpm-lock.yaml | 5 +- apps/api/src/index.ts | 147 +++++++++++++++++++------- apps/api/src/services/rate-limiter.ts | 13 ++- apps/api/src/services/redis.ts | 20 ++-- 5 files changed, 127 insertions(+), 61 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 230667a8..183ddaa3 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -73,7 +73,7 @@ "form-data": "^4.0.0", "glob": "^10.4.2", "gpt3-tokenizer": "^1.1.5", - "ioredis": "^5.3.2", + "ioredis": "^5.4.1", "joplin-turndown-plugin-gfm": "^1.0.12", "json-schema-to-zod": "^2.3.0", "keyword-extractor": "^0.0.28", @@ -92,7 +92,6 @@ "promptable": "^0.0.10", "puppeteer": "^22.12.1", "rate-limiter-flexible": "2.4.2", - "redis": "^4.6.7", "resend": "^3.4.0", "robots-parser": "^3.0.1", "scrapingbee": "^1.7.4", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 80b12936..02d8363b 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -90,7 +90,7 @@ importers: specifier: ^1.1.5 version: 1.1.5 ioredis: - specifier: ^5.3.2 + specifier: ^5.4.1 version: 5.4.1 
joplin-turndown-plugin-gfm: specifier: ^1.0.12 @@ -146,9 +146,6 @@ importers: rate-limiter-flexible: specifier: 2.4.2 version: 2.4.2 - redis: - specifier: ^4.6.7 - version: 4.6.14 resend: specifier: ^3.4.0 version: 3.4.0 diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 198288f6..c5e6a438 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -3,7 +3,6 @@ import bodyParser from "body-parser"; import cors from "cors"; import "dotenv/config"; import { getWebScraperQueue } from "./services/queue-service"; -import { redisClient } from "./services/rate-limiter"; import { v0Router } from "./routes/v0"; import { initSDK } from "@hyperdx/node-opentelemetry"; import cluster from "cluster"; @@ -11,6 +10,8 @@ import os from "os"; import { Job } from "bull"; import { sendSlackWebhook } from "./services/alerts/slack"; import { checkAlerts } from "./services/alerts"; +import Redis from "ioredis"; +import { redisRateLimitClient } from "./services/rate-limiter"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -34,11 +35,9 @@ if (cluster.isMaster) { cluster.fork(); } }); - } else { const app = express(); - global.isProduction = process.env.IS_PRODUCTION === "true"; app.use(bodyParser.urlencoded({ extended: true })); @@ -46,6 +45,8 @@ if (cluster.isMaster) { app.use(cors()); // Add this line to enable CORS + const queueRedis = new Redis(process.env.REDIS_URL); + const serverAdapter = new ExpressAdapter(); serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); @@ -73,7 +74,6 @@ if (cluster.isMaster) { const DEFAULT_PORT = process.env.PORT ?? 3002; const HOST = process.env.HOST ?? 
"localhost"; - redisClient.connect(); // HyperDX OpenTelemetry if (process.env.ENV === "production") { @@ -121,7 +121,6 @@ if (cluster.isMaster) { }); app.post(`/admin/${process.env.BULL_AUTH_KEY}/shutdown`, async (req, res) => { - // return res.status(200).json({ ok: true }); try { console.log("Gracefully shutting down..."); @@ -138,34 +137,38 @@ if (cluster.isMaster) { const wsq = getWebScraperQueue(); const jobs = await wsq.getActive(); - + console.log("Requeueing", jobs.length, "jobs..."); if (jobs.length > 0) { console.log(" Removing", jobs.length, "jobs..."); - await Promise.all(jobs.map(async x => { - try { - await wsq.client.del(await x.lockKey()); - await x.takeLock(); - await x.moveToFailed({ message: "interrupted" }); - await x.remove(); - } catch (e) { - console.warn("Failed to remove job", x.id, e); - } - })); + await Promise.all( + jobs.map(async (x) => { + try { + await wsq.client.del(await x.lockKey()); + await x.takeLock(); + await x.moveToFailed({ message: "interrupted" }); + await x.remove(); + } catch (e) { + console.warn("Failed to remove job", x.id, e); + } + }) + ); console.log(" Re-adding", jobs.length, "jobs..."); - await wsq.addBulk(jobs.map(x => ({ - data: x.data, - opts: { - jobId: x.id, - }, - }))); + await wsq.addBulk( + jobs.map((x) => ({ + data: x.data, + opts: { + jobId: x.id, + }, + })) + ); console.log(" Done!"); } - + await getWebScraperQueue().resume(false); res.json({ ok: true }); } catch (error) { @@ -268,27 +271,32 @@ if (cluster.isMaster) { const numberOfBatches = 9; // Adjust based on your needs const completedJobsPromises: Promise[] = []; for (let i = 0; i < numberOfBatches; i++) { - completedJobsPromises.push(webScraperQueue.getJobs( - ["completed"], - i * batchSize, - i * batchSize + batchSize, - true - )); + completedJobsPromises.push( + webScraperQueue.getJobs( + ["completed"], + i * batchSize, + i * batchSize + batchSize, + true + ) + ); } - const completedJobs: Job[] = (await 
Promise.all(completedJobsPromises)).flat(); - const before24hJobs = completedJobs.filter( - (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 - ) || []; - + const completedJobs: Job[] = ( + await Promise.all(completedJobsPromises) + ).flat(); + const before24hJobs = + completedJobs.filter( + (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 + ) || []; + let count = 0; - + if (!before24hJobs) { return res.status(200).send(`No jobs to remove.`); } for (const job of before24hJobs) { try { - await job.remove() + await job.remove(); count++; } catch (jobError) { console.error(`Failed to remove job with ID ${job.id}:`, jobError); @@ -306,8 +314,73 @@ if (cluster.isMaster) { res.send({ isProduction: global.isProduction }); }); + app.get( + `/admin/${process.env.BULL_AUTH_KEY}/redis-health`, + async (req, res) => { + try { + const testKey = "test"; + const testValue = "test"; - + // Test queueRedis + let queueRedisHealth; + try { + await queueRedis.set(testKey, testValue); + queueRedisHealth = await queueRedis.get(testKey); + await queueRedis.del(testKey); + } catch (error) { + console.error("queueRedis health check failed:", error); + queueRedisHealth = null; + } + + // Test redisRateLimitClient + let redisRateLimitHealth; + try { + await redisRateLimitClient.set(testKey, testValue); + redisRateLimitHealth = await redisRateLimitClient.get(testKey); + await redisRateLimitClient.del(testKey); + } catch (error) { + console.error("redisRateLimitClient health check failed:", error); + redisRateLimitHealth = null; + } + + const healthStatus = { + queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy", + redisRateLimitClient: + redisRateLimitHealth === testValue ? 
"healthy" : "unhealthy", + }; + + if ( + healthStatus.queueRedis === "healthy" && + healthStatus.redisRateLimitClient === "healthy" + ) { + console.log("Both Redis instances are healthy"); + return res + .status(200) + .json({ status: "healthy", details: healthStatus }); + } else { + console.log("Redis instances health check:", healthStatus); + await sendSlackWebhook( + `[REDIS DOWN] Redis instances health check: ${JSON.stringify( + healthStatus + )}`, + true + ); + return res + .status(500) + .json({ status: "unhealthy", details: healthStatus }); + } + } catch (error) { + console.error("Redis health check failed:", error); + await sendSlackWebhook( + `[REDIS DOWN] Redis instances health check: ${error.message}`, + true + ); + return res + .status(500) + .json({ status: "unhealthy", message: error.message }); + } + } + ); console.log(`Worker ${process.pid} started`); } diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 3a1d353e..2c7dd963 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -1,6 +1,6 @@ import { RateLimiterRedis } from "rate-limiter-flexible"; -import * as redis from "redis"; import { RateLimiterMode } from "../../src/types"; +import Redis from "ioredis"; const RATE_LIMITS = { crawl: { @@ -57,14 +57,13 @@ const RATE_LIMITS = { }, }; -export const redisClient = redis.createClient({ - url: process.env.REDIS_RATE_LIMIT_URL, - legacyMode: true, -}); +export const redisRateLimitClient = new Redis( + process.env.REDIS_RATE_LIMIT_URL +) const createRateLimiter = (keyPrefix, points) => new RateLimiterRedis({ - storeClient: redisClient, + storeClient: redisRateLimitClient, keyPrefix, points, duration: 60, // Duration in seconds @@ -76,7 +75,7 @@ export const serverRateLimiter = createRateLimiter( ); export const testSuiteRateLimiter = new RateLimiterRedis({ - storeClient: redisClient, + storeClient: redisRateLimitClient, keyPrefix: "test-suite", points: 10000, 
duration: 60, // Duration in seconds diff --git a/apps/api/src/services/redis.ts b/apps/api/src/services/redis.ts index 491eeb11..b720a330 100644 --- a/apps/api/src/services/redis.ts +++ b/apps/api/src/services/redis.ts @@ -1,10 +1,8 @@ import Redis from "ioredis"; - -// Initialize Redis client -const redis = new Redis(process.env.REDIS_URL); +import { redisRateLimitClient } from "./rate-limiter"; // Listen to 'error' events to the Redis connection -redis.on("error", (error) => { +redisRateLimitClient.on("error", (error) => { try { if (error.message === "ECONNRESET") { console.log("Connection to Redis Session Store timed out."); @@ -15,16 +13,16 @@ redis.on("error", (error) => { }); // Listen to 'reconnecting' event to Redis -redis.on("reconnecting", (err) => { +redisRateLimitClient.on("reconnecting", (err) => { try { - if (redis.status === "reconnecting") + if (redisRateLimitClient.status === "reconnecting") console.log("Reconnecting to Redis Session Store..."); else console.log("Error reconnecting to Redis Session Store."); } catch (error) {} }); // Listen to the 'connect' event to Redis -redis.on("connect", (err) => { +redisRateLimitClient.on("connect", (err) => { try { if (!err) console.log("Connected to Redis Session Store!"); } catch (error) {} @@ -38,9 +36,9 @@ redis.on("connect", (err) => { */ const setValue = async (key: string, value: string, expire?: number) => { if (expire) { - await redis.set(key, value, "EX", expire); + await redisRateLimitClient.set(key, value, "EX", expire); } else { - await redis.set(key, value); + await redisRateLimitClient.set(key, value); } }; @@ -50,7 +48,7 @@ const setValue = async (key: string, value: string, expire?: number) => { * @returns {Promise} The value, if found, otherwise null. 
*/ const getValue = async (key: string): Promise => { - const value = await redis.get(key); + const value = await redisRateLimitClient.get(key); return value; }; @@ -59,7 +57,7 @@ const getValue = async (key: string): Promise => { * @param {string} key The key to delete. */ const deleteKey = async (key: string) => { - await redis.del(key); + await redisRateLimitClient.del(key); }; export { setValue, getValue, deleteKey }; From ffc3b7c5fbe97de4c5633b59811c27d9cd277bc0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 16 Jul 2024 22:42:40 -0400 Subject: [PATCH 07/23] Update index.ts --- apps/api/src/index.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index c5e6a438..3942e937 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -45,7 +45,6 @@ if (cluster.isMaster) { app.use(cors()); // Add this line to enable CORS - const queueRedis = new Redis(process.env.REDIS_URL); const serverAdapter = new ExpressAdapter(); serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); @@ -318,6 +317,8 @@ if (cluster.isMaster) { `/admin/${process.env.BULL_AUTH_KEY}/redis-health`, async (req, res) => { try { + const queueRedis = new Redis(process.env.REDIS_URL); + const testKey = "test"; const testValue = "test"; From 3c3412e893d68c089dd2a0d1d4ed69c8f40ba061 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 16 Jul 2024 22:45:12 -0400 Subject: [PATCH 08/23] Update rate-limiter.test.ts --- apps/api/src/services/rate-limiter.test.ts | 315 +++++++++++++++++++-- 1 file changed, 296 insertions(+), 19 deletions(-) diff --git a/apps/api/src/services/rate-limiter.test.ts b/apps/api/src/services/rate-limiter.test.ts index 47a24a7e..c49c85d9 100644 --- a/apps/api/src/services/rate-limiter.test.ts +++ b/apps/api/src/services/rate-limiter.test.ts @@ -1,48 +1,98 @@ -import { getRateLimiter, serverRateLimiter, testSuiteRateLimiter, redisClient } from "./rate-limiter"; +import { + getRateLimiter, + 
serverRateLimiter, + testSuiteRateLimiter, + redisRateLimitClient, +} from "./rate-limiter"; import { RateLimiterMode } from "../../src/types"; import { RateLimiterRedis } from "rate-limiter-flexible"; describe("Rate Limiter Service", () => { beforeAll(async () => { - await redisClient.connect(); + try { + await redisRateLimitClient.connect(); + // if (process.env.REDIS_RATE_LIMIT_URL === "redis://localhost:6379") { + // console.log("Erasing all keys"); + // // erase all the keys that start with "test-prefix" + // const keys = await redisRateLimitClient.keys("test-prefix:*"); + // if (keys.length > 0) { + // await redisRateLimitClient.del(...keys); + // } + // } + } catch (error) {} }); afterAll(async () => { - await redisClient.disconnect(); + try { + // if (process.env.REDIS_RATE_LIMIT_URL === "redis://localhost:6379") { + await redisRateLimitClient.disconnect(); + // } + } catch (error) {} }); it("should return the testSuiteRateLimiter for specific tokens", () => { - const limiter = getRateLimiter("crawl" as RateLimiterMode, "a01ccae"); + const limiter = getRateLimiter( + "crawl" as RateLimiterMode, + "test-prefix:a01ccae" + ); expect(limiter).toBe(testSuiteRateLimiter); - const limiter2 = getRateLimiter("scrape" as RateLimiterMode, "6254cf9"); + const limiter2 = getRateLimiter( + "scrape" as RateLimiterMode, + "test-prefix:6254cf9" + ); expect(limiter2).toBe(testSuiteRateLimiter); }); it("should return the serverRateLimiter if mode is not found", () => { - const limiter = getRateLimiter("nonexistent" as RateLimiterMode, "someToken"); + const limiter = getRateLimiter( + "nonexistent" as RateLimiterMode, + "test-prefix:someToken" + ); expect(limiter).toBe(serverRateLimiter); }); it("should return the correct rate limiter based on mode and plan", () => { - const limiter = getRateLimiter("crawl" as RateLimiterMode, "someToken", "free"); + const limiter = getRateLimiter( + "crawl" as RateLimiterMode, + "test-prefix:someToken", + "free" + ); 
expect(limiter.points).toBe(2); - const limiter2 = getRateLimiter("scrape" as RateLimiterMode, "someToken", "standard"); + const limiter2 = getRateLimiter( + "scrape" as RateLimiterMode, + "test-prefix:someToken", + "standard" + ); expect(limiter2.points).toBe(50); - const limiter3 = getRateLimiter("search" as RateLimiterMode, "someToken", "growth"); + const limiter3 = getRateLimiter( + "search" as RateLimiterMode, + "test-prefix:someToken", + "growth" + ); expect(limiter3.points).toBe(500); - const limiter4 = getRateLimiter("crawlStatus" as RateLimiterMode, "someToken", "growth"); + const limiter4 = getRateLimiter( + "crawlStatus" as RateLimiterMode, + "test-prefix:someToken", + "growth" + ); expect(limiter4.points).toBe(150); }); it("should return the default rate limiter if plan is not provided", () => { - const limiter = getRateLimiter("crawl" as RateLimiterMode, "someToken"); + const limiter = getRateLimiter( + "crawl" as RateLimiterMode, + "test-prefix:someToken" + ); expect(limiter.points).toBe(3); - const limiter2 = getRateLimiter("scrape" as RateLimiterMode, "someToken"); + const limiter2 = getRateLimiter( + "scrape" as RateLimiterMode, + "test-prefix:someToken" + ); expect(limiter2.points).toBe(20); }); @@ -50,7 +100,7 @@ describe("Rate Limiter Service", () => { const keyPrefix = "test-prefix"; const points = 10; const limiter = new RateLimiterRedis({ - storeClient: redisClient, + storeClient: redisRateLimitClient, keyPrefix, points, duration: 60, @@ -62,26 +112,253 @@ describe("Rate Limiter Service", () => { }); it("should return the correct rate limiter for 'preview' mode", () => { - const limiter = getRateLimiter("preview" as RateLimiterMode, "someToken", "free"); + const limiter = getRateLimiter( + "preview" as RateLimiterMode, + "test-prefix:someToken", + "free" + ); expect(limiter.points).toBe(5); - const limiter2 = getRateLimiter("preview" as RateLimiterMode, "someToken"); + const limiter2 = getRateLimiter( + "preview" as RateLimiterMode, + 
"test-prefix:someToken" + ); expect(limiter2.points).toBe(5); }); it("should return the correct rate limiter for 'account' mode", () => { - const limiter = getRateLimiter("account" as RateLimiterMode, "someToken", "free"); + const limiter = getRateLimiter( + "account" as RateLimiterMode, + "test-prefix:someToken", + "free" + ); expect(limiter.points).toBe(100); - const limiter2 = getRateLimiter("account" as RateLimiterMode, "someToken"); + const limiter2 = getRateLimiter( + "account" as RateLimiterMode, + "test-prefix:someToken" + ); expect(limiter2.points).toBe(100); }); it("should return the correct rate limiter for 'crawlStatus' mode", () => { - const limiter = getRateLimiter("crawlStatus" as RateLimiterMode, "someToken", "free"); + const limiter = getRateLimiter( + "crawlStatus" as RateLimiterMode, + "test-prefix:someToken", + "free" + ); expect(limiter.points).toBe(150); - const limiter2 = getRateLimiter("crawlStatus" as RateLimiterMode, "someToken"); + const limiter2 = getRateLimiter( + "crawlStatus" as RateLimiterMode, + "test-prefix:someToken" + ); expect(limiter2.points).toBe(150); }); + + it("should consume points correctly for 'crawl' mode", async () => { + const limiter = getRateLimiter( + "crawl" as RateLimiterMode, + "test-prefix:someTokenCRAWL", + "free" + ); + const consumePoints = 1; + + const res = await limiter.consume( + "test-prefix:someTokenCRAWL", + consumePoints + ); + expect(res.remainingPoints).toBe(1); + }); + + it("should consume points correctly for 'scrape' mode (DEFAULT)", async () => { + const limiter = getRateLimiter( + "scrape" as RateLimiterMode, + "test-prefix:someTokenX" + ); + const consumePoints = 4; + + const res = await limiter.consume("test-prefix:someTokenX", consumePoints); + expect(res.remainingPoints).toBe(16); + }); + + it("should consume points correctly for 'scrape' mode (HOBBY)", async () => { + const limiter = getRateLimiter( + "scrape" as RateLimiterMode, + "test-prefix:someTokenXY", + "hobby" + ); + // expect 
hobby to have 100 points + expect(limiter.points).toBe(10); + + const consumePoints = 5; + + const res = await limiter.consume("test-prefix:someTokenXY", consumePoints); + expect(res.consumedPoints).toBe(5); + expect(res.remainingPoints).toBe(5); + }); + + it("should return the correct rate limiter for 'crawl' mode", () => { + const limiter = getRateLimiter( + "crawl" as RateLimiterMode, + "test-prefix:someToken", + "free" + ); + expect(limiter.points).toBe(2); + + const limiter2 = getRateLimiter( + "crawl" as RateLimiterMode, + "test-prefix:someToken", + "starter" + ); + expect(limiter2.points).toBe(3); + + const limiter3 = getRateLimiter( + "crawl" as RateLimiterMode, + "test-prefix:someToken", + "standard" + ); + expect(limiter3.points).toBe(5); + }); + + it("should return the correct rate limiter for 'scrape' mode", () => { + const limiter = getRateLimiter( + "scrape" as RateLimiterMode, + "test-prefix:someToken", + "free" + ); + expect(limiter.points).toBe(5); + + const limiter2 = getRateLimiter( + "scrape" as RateLimiterMode, + "test-prefix:someToken", + "starter" + ); + expect(limiter2.points).toBe(20); + + const limiter3 = getRateLimiter( + "scrape" as RateLimiterMode, + "test-prefix:someToken", + "standard" + ); + expect(limiter3.points).toBe(50); + }); + + it("should return the correct rate limiter for 'search' mode", () => { + const limiter = getRateLimiter( + "search" as RateLimiterMode, + "test-prefix:someToken", + "free" + ); + expect(limiter.points).toBe(5); + + const limiter2 = getRateLimiter( + "search" as RateLimiterMode, + "test-prefix:someToken", + "starter" + ); + expect(limiter2.points).toBe(20); + + const limiter3 = getRateLimiter( + "search" as RateLimiterMode, + "test-prefix:someToken", + "standard" + ); + expect(limiter3.points).toBe(40); + }); + + it("should return the correct rate limiter for 'preview' mode", () => { + const limiter = getRateLimiter( + "preview" as RateLimiterMode, + "test-prefix:someToken", + "free" + ); + 
expect(limiter.points).toBe(5); + + const limiter2 = getRateLimiter( + "preview" as RateLimiterMode, + "test-prefix:someToken" + ); + expect(limiter2.points).toBe(5); + }); + + it("should return the correct rate limiter for 'account' mode", () => { + const limiter = getRateLimiter( + "account" as RateLimiterMode, + "test-prefix:someToken", + "free" + ); + expect(limiter.points).toBe(100); + + const limiter2 = getRateLimiter( + "account" as RateLimiterMode, + "test-prefix:someToken" + ); + expect(limiter2.points).toBe(100); + }); + + it("should return the correct rate limiter for 'crawlStatus' mode", () => { + const limiter = getRateLimiter( + "crawlStatus" as RateLimiterMode, + "test-prefix:someToken", + "free" + ); + expect(limiter.points).toBe(150); + + const limiter2 = getRateLimiter( + "crawlStatus" as RateLimiterMode, + "test-prefix:someToken" + ); + expect(limiter2.points).toBe(150); + }); + + it("should return the correct rate limiter for 'testSuite' mode", () => { + const limiter = getRateLimiter( + "testSuite" as RateLimiterMode, + "test-prefix:someToken", + "free" + ); + expect(limiter.points).toBe(10000); + + const limiter2 = getRateLimiter( + "testSuite" as RateLimiterMode, + "test-prefix:someToken" + ); + expect(limiter2.points).toBe(10000); + }); + + it("should throw an error when consuming more points than available", async () => { + const limiter = getRateLimiter( + "crawl" as RateLimiterMode, + "test-prefix:someToken" + ); + const consumePoints = limiter.points + 1; + + try { + await limiter.consume("test-prefix:someToken", consumePoints); + } catch (error) { + // expect remaining points to be 0 + const res = await limiter.get("test-prefix:someToken"); + expect(res.remainingPoints).toBe(0); + } + }); + + it("should reset points after duration", async () => { + const keyPrefix = "test-prefix"; + const points = 10; + const duration = 1; // 1 second + const limiter = new RateLimiterRedis({ + storeClient: redisRateLimitClient, + keyPrefix, + points, + 
duration, + }); + + const consumePoints = 5; + await limiter.consume("test-prefix:someToken", consumePoints); + await new Promise((resolve) => setTimeout(resolve, duration * 1000 + 100)); // Wait for duration + 100ms + + const res = await limiter.consume("test-prefix:someToken", consumePoints); + expect(res.remainingPoints).toBe(points - consumePoints); + }); }); From 2bcbe4f35362cd505c913aa6858ad0cc05378e30 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 16 Jul 2024 22:47:34 -0400 Subject: [PATCH 09/23] Create check-redis.yml --- .github/workflows/check-redis.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/workflows/check-redis.yml diff --git a/.github/workflows/check-redis.yml b/.github/workflows/check-redis.yml new file mode 100644 index 00000000..e5e9ff0d --- /dev/null +++ b/.github/workflows/check-redis.yml @@ -0,0 +1,20 @@ +name: Check Redis +on: + schedule: + - cron: '*/5 * * * *' + +env: + BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} + +jobs: + clean-jobs: + runs-on: ubuntu-latest + steps: + - name: Send GET request to check queues + run: | + response=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 180 https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/redis-health) + if [ "$response" -ne 200 ]; then + echo "Failed to check queues. Response: $response" + exit 1 + fi + echo "Successfully checked queues. 
Response: $response" From 98c788ca7a0a27f1c9da5a94971f59647634f0f3 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:13:52 -0700 Subject: [PATCH 10/23] Caleb: added a test to ensure links on page exists and isn't zero on mendable --- .../WebScraper/__tests__/single_url.test.ts | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 7966648b..63408eaf 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -22,3 +22,17 @@ describe('scrapSingleUrl', () => { }, 10000); }); + +it('should return a list of links on the mendable.ai page', async () => { + const url = 'https://mendable.ai'; + const pageOptions: PageOptions = { includeHtml: true }; + + const result = await scrapSingleUrl(url, pageOptions); + + // Check if the result contains a list of links + expect(result.linksOnPage).toBeDefined(); + expect(Array.isArray(result.linksOnPage)).toBe(true); + expect(result.linksOnPage.length).toBeGreaterThan(0); +}, 10000); + + From 0b3c0ede49a77689082acb708c110aaae2fca327 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:15:59 -0700 Subject: [PATCH 11/23] Added tests per @nicks request --- .../WebScraper/__tests__/single_url.test.ts | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 63408eaf..30a836ba 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -22,6 +22,81 @@ describe('scrapSingleUrl', () => { }, 10000); }); +import { scrapSingleUrl } from '../single_url'; +import { 
PageOptions } from '../../../lib/entities'; + +// Mock the fetchHtmlContent function +jest.mock('../single_url', () => { + const originalModule = jest.requireActual('../single_url'); + originalModule.fetchHtmlContent = jest.fn().mockResolvedValue(` + + Test Page + + Absolute Link + Relative Link + Page Link + Fragment Link + Email Link + + + `); + return originalModule; +}); + +describe('scrapSingleUrl with linksOnPage', () => { + const baseUrl = 'https://test.com'; + + it('should not include linksOnPage when option is false', async () => { + const pageOptions: PageOptions = {}; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).toBeUndefined(); + }); + + it('should include linksOnPage when option is true', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).toBeDefined(); + expect(Array.isArray(result.linksOnPage)).toBe(true); + }); + + it('should correctly handle absolute URLs', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).toContain('https://example.com'); + }); + + it('should correctly handle relative URLs', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).toContain('https://test.com/relative'); + }); + + it('should correctly handle page URLs', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).toContain('https://test.com/page'); + }); + + it('should not include fragment-only links', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).not.toContain('#fragment'); + expect(result.linksOnPage).not.toContain('https://test.com/#fragment'); + }); + + it('should 
include mailto links', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + expect(result.linksOnPage).toContain('mailto:test@example.com'); + }); + + it('should return unique links', async () => { + const pageOptions: PageOptions = { }; + const result = await scrapSingleUrl(baseUrl, pageOptions); + const uniqueLinks = new Set(result.linksOnPage); + expect(result.linksOnPage?.length).toBe(uniqueLinks.size); + }); +}); it('should return a list of links on the mendable.ai page', async () => { const url = 'https://mendable.ai'; @@ -36,3 +111,5 @@ it('should return a list of links on the mendable.ai page', async () => { }, 10000); + + From da3c6bca374c9d51a21ede7812730b04465b315a Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:23:22 -0700 Subject: [PATCH 12/23] Caleb: added a simple test --- .../WebScraper/__tests__/single_url.test.ts | 87 +------------------ 1 file changed, 4 insertions(+), 83 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 30a836ba..0ee3493b 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -1,3 +1,7 @@ +import { scrapSingleUrl } from '../single_url'; +import { PageOptions } from '../../../lib/entities'; + + jest.mock('../single_url', () => { const originalModule = jest.requireActual('../single_url'); originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('Test

Roast

'); @@ -5,9 +9,6 @@ jest.mock('../single_url', () => { return originalModule; }); -import { scrapSingleUrl } from '../single_url'; -import { PageOptions } from '../../../lib/entities'; - describe('scrapSingleUrl', () => { it('should handle includeHtml option correctly', async () => { const url = 'https://roastmywebsite.ai'; @@ -22,82 +23,6 @@ describe('scrapSingleUrl', () => { }, 10000); }); -import { scrapSingleUrl } from '../single_url'; -import { PageOptions } from '../../../lib/entities'; - -// Mock the fetchHtmlContent function -jest.mock('../single_url', () => { - const originalModule = jest.requireActual('../single_url'); - originalModule.fetchHtmlContent = jest.fn().mockResolvedValue(` - - Test Page - - Absolute Link - Relative Link - Page Link - Fragment Link - Email Link - - - `); - return originalModule; -}); - -describe('scrapSingleUrl with linksOnPage', () => { - const baseUrl = 'https://test.com'; - - it('should not include linksOnPage when option is false', async () => { - const pageOptions: PageOptions = {}; - const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).toBeUndefined(); - }); - - it('should include linksOnPage when option is true', async () => { - const pageOptions: PageOptions = { }; - const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).toBeDefined(); - expect(Array.isArray(result.linksOnPage)).toBe(true); - }); - - it('should correctly handle absolute URLs', async () => { - const pageOptions: PageOptions = { }; - const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).toContain('https://example.com'); - }); - - it('should correctly handle relative URLs', async () => { - const pageOptions: PageOptions = { }; - const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).toContain('https://test.com/relative'); - }); - - it('should correctly handle page URLs', async () => { - const pageOptions: PageOptions = { }; 
- const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).toContain('https://test.com/page'); - }); - - it('should not include fragment-only links', async () => { - const pageOptions: PageOptions = { }; - const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).not.toContain('#fragment'); - expect(result.linksOnPage).not.toContain('https://test.com/#fragment'); - }); - - it('should include mailto links', async () => { - const pageOptions: PageOptions = { }; - const result = await scrapSingleUrl(baseUrl, pageOptions); - expect(result.linksOnPage).toContain('mailto:test@example.com'); - }); - - it('should return unique links', async () => { - const pageOptions: PageOptions = { }; - const result = await scrapSingleUrl(baseUrl, pageOptions); - const uniqueLinks = new Set(result.linksOnPage); - expect(result.linksOnPage?.length).toBe(uniqueLinks.size); - }); -}); - it('should return a list of links on the mendable.ai page', async () => { const url = 'https://mendable.ai'; const pageOptions: PageOptions = { includeHtml: true }; @@ -109,7 +34,3 @@ it('should return a list of links on the mendable.ai page', async () => { expect(Array.isArray(result.linksOnPage)).toBe(true); expect(result.linksOnPage.length).toBeGreaterThan(0); }, 10000); - - - - From 205cd63c2f664995a19012d27972d08296ff5ee1 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 17 Jul 2024 15:07:06 -0300 Subject: [PATCH 13/23] Update openapi.json --- apps/api/openapi.json | 91 +++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 37 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index bb271976..81481ef6 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -41,14 +41,37 @@ "pageOptions": { "type": "object", "properties": { + "headers": { + "type": "object", + "description": "Headers to send with the request. 
Can be used to send cookies, user-agent, etc." + }, + "includeHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a html key in the response.", + "default": false + }, + "onlyIncludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" + }, "onlyMainContent": { "type": "boolean", "description": "Only return the main content of the page excluding headers, navs, footers, etc.", "default": false }, - "includeHtml": { + "removeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" + }, + "replaceAllPathsWithAbsolutePaths": { "type": "boolean", - "description": "Include the raw HTML content of the page. Will output a html key in the response.", + "description": "Replace all relative paths with absolute paths for images and links", "default": false }, "screenshot": { @@ -60,29 +83,6 @@ "type": "integer", "description": "Wait x amount of milliseconds for the page to load to fetch content", "default": 0 - }, - "removeTags": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" - }, - "onlyIncludeTags": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" - }, - "headers": { - "type": "object", - "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." 
- }, - "replaceAllPathsWithAbsolutePaths": { - "type": "boolean", - "description": "Replace all relative paths with absolute paths for images and links", - "default": false } } }, @@ -216,7 +216,12 @@ }, "allowBackwardCrawling": { "type": "boolean", - "description": "Allow backward crawling (crawl from the base URL to the previous URLs)", + "description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'", + "default": false + }, + "allowExternalContentLinks": { + "type": "boolean", + "description": "Allows the crawler to follow links to external websites.", "default": false } } @@ -224,24 +229,26 @@ "pageOptions": { "type": "object", "properties": { - "onlyMainContent": { - "type": "boolean", - "description": "Only return the main content of the page excluding headers, navs, footers, etc.", - "default": false + "headers": { + "type": "object", + "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." }, "includeHtml": { "type": "boolean", "description": "Include the raw HTML content of the page. Will output a html key in the response.", "default": false }, - "screenshot": { - "type": "boolean", - "description": "Include a screenshot of the top of the page that you are scraping.", - "default": false + "onlyIncludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" }, - "headers": { - "type": "object", - "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc." 
+ "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": false }, "removeTags": { "type": "array", @@ -254,6 +261,16 @@ "type": "boolean", "description": "Replace all relative paths with absolute paths for images and links", "default": false + }, + "screenshot": { + "type": "boolean", + "description": "Include a screenshot of the top of the page that you are scraping.", + "default": false + }, + "waitFor": { + "type": "integer", + "description": "Wait x amount of milliseconds for the page to load to fetch content", + "default": 0 } } } From c5d1e7260d95d60b1369eab82ce7e5c0af28acff Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Wed, 17 Jul 2024 11:29:05 -0700 Subject: [PATCH 14/23] Caleb: made changes per Rafaels requests --- .../WebScraper/__tests__/single_url.test.ts | 1 + apps/api/src/scraper/WebScraper/single_url.ts | 31 +---------------- .../api/src/scraper/WebScraper/utils/utils.ts | 34 +++++++++++++++++++ 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 0ee3493b..3ef138a5 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -33,4 +33,5 @@ it('should return a list of links on the mendable.ai page', async () => { expect(result.linksOnPage).toBeDefined(); expect(Array.isArray(result.linksOnPage)).toBe(true); expect(result.linksOnPage.length).toBeGreaterThan(0); + expect(result.linksOnPage).toContain('https://www.mendable.ai/blog') }, 10000); diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 0aef2577..f66a7c06 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ 
-16,6 +16,7 @@ import { scrapWithFetch } from "./scrapers/fetch"; import { scrapWithFireEngine } from "./scrapers/fireEngine"; import { scrapWithPlaywright } from "./scrapers/playwright"; import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; +import { extractLinks } from "./utils/utils"; dotenv.config(); @@ -109,37 +110,7 @@ function getScrapingFallbackOrder( return scrapersInOrder as (typeof baseScrapers)[number][]; } -function extractLinks(html: string, baseUrl: string): string[] { - const $ = cheerio.load(html); - const links: string[] = []; - // Parse the base URL to get the origin - const urlObject = new URL(baseUrl); - const origin = urlObject.origin; - - $('a').each((_, element) => { - const href = $(element).attr('href'); - if (href) { - if (href.startsWith('http://') || href.startsWith('https://')) { - // Absolute URL, add as is - links.push(href); - } else if (href.startsWith('/')) { - // Relative URL starting with '/', append to origin - links.push(`${origin}${href}`); - } else if (!href.startsWith('#') && !href.startsWith('mailto:')) { - // Relative URL not starting with '/', append to base URL - links.push(`${baseUrl}/${href}`); - } else if (href.startsWith('mailto:')) { - // mailto: links, add as is - links.push(href); - } - // Fragment-only links (#) are ignored - } - }); - - // Remove duplicates and return - return [...new Set(links)]; -} export async function scrapSingleUrl( urlToScrap: string, diff --git a/apps/api/src/scraper/WebScraper/utils/utils.ts b/apps/api/src/scraper/WebScraper/utils/utils.ts index f9ce9b3c..3aa021a6 100644 --- a/apps/api/src/scraper/WebScraper/utils/utils.ts +++ b/apps/api/src/scraper/WebScraper/utils/utils.ts @@ -1,4 +1,6 @@ import axios from "axios"; +import * as cheerio from "cheerio"; + export async function attemptScrapWithRequests( urlToScrap: string @@ -21,3 +23,35 @@ export async function attemptScrapWithRequests( export function sanitizeText(text: string): string { return text.replace("\u0000", ""); } + 
+export function extractLinks(html: string, baseUrl: string): string[] { + const $ = cheerio.load(html); + const links: string[] = []; + + // Parse the base URL to get the origin + const urlObject = new URL(baseUrl); + const origin = urlObject.origin; + + $('a').each((_, element) => { + const href = $(element).attr('href'); + if (href) { + if (href.startsWith('http://') || href.startsWith('https://')) { + // Absolute URL, add as is + links.push(href); + } else if (href.startsWith('/')) { + // Relative URL starting with '/', append to origin + links.push(`${origin}${href}`); + } else if (!href.startsWith('#') && !href.startsWith('mailto:')) { + // Relative URL not starting with '/', append to base URL + links.push(`${baseUrl}/${href}`); + } else if (href.startsWith('mailto:')) { + // mailto: links, add as is + links.push(href); + } + // Fragment-only links (#) are ignored + } + }); + + // Remove duplicates and return + return [...new Set(links)]; +} \ No newline at end of file From 5b24d26c84ca68301af50199994f57021a15e424 Mon Sep 17 00:00:00 2001 From: Caleb Peffer <44934913+calebpeffer@users.noreply.github.com> Date: Wed, 17 Jul 2024 11:33:12 -0700 Subject: [PATCH 15/23] Caleb; fixed test --- apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 3ef138a5..8a9df227 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -33,5 +33,5 @@ it('should return a list of links on the mendable.ai page', async () => { expect(result.linksOnPage).toBeDefined(); expect(Array.isArray(result.linksOnPage)).toBe(true); expect(result.linksOnPage.length).toBeGreaterThan(0); - expect(result.linksOnPage).toContain('https://www.mendable.ai/blog') + 
expect(result.linksOnPage).toContain('https://mendable.ai/blog') }, 10000); From 8160c311c0cf415f10c19d78d04fb47afe2f27b8 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 17 Jul 2024 21:30:56 +0200 Subject: [PATCH 16/23] fix queue stuck bug via lock setting changes --- apps/api/fly.staging.toml | 3 - apps/api/fly.toml | 3 - apps/api/package.json | 4 +- apps/api/postdeploy.js | 11 -- apps/api/src/index.ts | 57 --------- apps/api/src/services/queue-service.ts | 6 +- apps/api/src/services/queue-worker.ts | 165 +++++++++++++------------ apps/api/src/trigger-shutdown.ts | 9 -- 8 files changed, 92 insertions(+), 166 deletions(-) delete mode 100644 apps/api/postdeploy.js delete mode 100644 apps/api/src/trigger-shutdown.ts diff --git a/apps/api/fly.staging.toml b/apps/api/fly.staging.toml index 09fa135e..7a5e0848 100644 --- a/apps/api/fly.staging.toml +++ b/apps/api/fly.staging.toml @@ -8,9 +8,6 @@ primary_region = 'mia' kill_signal = 'SIGINT' kill_timeout = '30s' -[deploy] - release_command = 'node dist/src/trigger-shutdown.js https://staging-firecrawl-scraper-js.fly.dev' - [build] [processes] diff --git a/apps/api/fly.toml b/apps/api/fly.toml index 481290f0..f7ef786f 100644 --- a/apps/api/fly.toml +++ b/apps/api/fly.toml @@ -8,9 +8,6 @@ primary_region = 'mia' kill_signal = 'SIGINT' kill_timeout = '30s' -[deploy] - release_command = 'node dist/src/trigger-shutdown.js https://api.firecrawl.dev' - [build] [processes] diff --git a/apps/api/package.json b/apps/api/package.json index 183ddaa3..da1b2b33 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -19,8 +19,8 @@ "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest", "mongo-docker-console": "docker exec -it mongodb mongosh", "run-example": "npx ts-node src/example.ts", - "deploy:fly": "flyctl deploy && node postdeploy.js https://api.firecrawl.dev", - "deploy:fly:staging": "fly deploy -c fly.staging.toml && node postdeploy.js 
https://staging-firecrawl-scraper-js.fly.dev" + "deploy:fly": "flyctl deploy", + "deploy:fly:staging": "fly deploy -c fly.staging.toml" }, "author": "", "license": "ISC", diff --git a/apps/api/postdeploy.js b/apps/api/postdeploy.js deleted file mode 100644 index c1b94d70..00000000 --- a/apps/api/postdeploy.js +++ /dev/null @@ -1,11 +0,0 @@ -require("dotenv").config(); - -fetch(process.argv[2] + "/admin/" + process.env.BULL_AUTH_KEY + "/unpause", { - method: "POST" -}).then(async x => { - console.log(await x.text()); - process.exit(0); -}).catch(e => { - console.error(e); - process.exit(1); -}); diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 3942e937..88ec4418 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -119,63 +119,6 @@ if (cluster.isMaster) { } }); - app.post(`/admin/${process.env.BULL_AUTH_KEY}/shutdown`, async (req, res) => { - // return res.status(200).json({ ok: true }); - try { - console.log("Gracefully shutting down..."); - await getWebScraperQueue().pause(false, true); - res.json({ ok: true }); - } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } - }); - - app.post(`/admin/${process.env.BULL_AUTH_KEY}/unpause`, async (req, res) => { - try { - const wsq = getWebScraperQueue(); - - const jobs = await wsq.getActive(); - - console.log("Requeueing", jobs.length, "jobs..."); - - if (jobs.length > 0) { - console.log(" Removing", jobs.length, "jobs..."); - - await Promise.all( - jobs.map(async (x) => { - try { - await wsq.client.del(await x.lockKey()); - await x.takeLock(); - await x.moveToFailed({ message: "interrupted" }); - await x.remove(); - } catch (e) { - console.warn("Failed to remove job", x.id, e); - } - }) - ); - - console.log(" Re-adding", jobs.length, "jobs..."); - await wsq.addBulk( - jobs.map((x) => ({ - data: x.data, - opts: { - jobId: x.id, - }, - })) - ); - - console.log(" Done!"); - } - - await getWebScraperQueue().resume(false); - res.json({ ok: true }); 
- } catch (error) { - console.error(error); - return res.status(500).json({ error: error.message }); - } - }); - app.get(`/serverHealthCheck`, async (req, res) => { try { const webScraperQueue = getWebScraperQueue(); diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index a32b78f0..f93c3504 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -7,8 +7,10 @@ export function getWebScraperQueue() { if (!webScraperQueue) { webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, { settings: { - lockDuration: 2 * 60 * 60 * 1000, // 2 hours in milliseconds, - lockRenewTime: 30 * 60 * 1000, // 30 minutes in milliseconds + lockDuration: 2 * 60 * 1000, // 2 minutes in milliseconds, + lockRenewTime: 15 * 1000, // 15 seconds in milliseconds + stalledInterval: 30 * 1000, + maxStalledCount: 10, }, }); console.log("Web scraper queue created"); diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 24343487..be2a4c70 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -6,6 +6,7 @@ import { startWebScraperPipeline } from "../main/runWebScraper"; import { callWebhook } from "./webhook"; import { logJob } from "./logging/log_job"; import { initSDK } from '@hyperdx/node-opentelemetry'; +import { Job } from "bull"; if(process.env.ENV === 'production') { initSDK({ @@ -16,93 +17,99 @@ if(process.env.ENV === 'production') { const wsq = getWebScraperQueue(); -wsq.process( - Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ??
8)), - async function (job, done) { - try { - job.progress({ - current: 1, - total: 100, - current_step: "SCRAPING", - current_url: "", - }); - const start = Date.now(); - const { success, message, docs } = await startWebScraperPipeline({ job }); - const end = Date.now(); - const timeTakenInSeconds = (end - start) / 1000; +async function processJob(job: Job, done) { + console.log("taking job", job.id); + try { + job.progress({ + current: 1, + total: 100, + current_step: "SCRAPING", + current_url: "", + }); + const start = Date.now(); + const { success, message, docs } = await startWebScraperPipeline({ job }); + const end = Date.now(); + const timeTakenInSeconds = (end - start) / 1000; - const data = { - success: success, - result: { - links: docs.map((doc) => { - return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" }; - }), - }, - project_id: job.data.project_id, - error: message /* etc... */, - }; + const data = { + success: success, + result: { + links: docs.map((doc) => { + return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" }; + }), + }, + project_id: job.data.project_id, + error: message /* etc... 
*/, + }; - await callWebhook(job.data.team_id, job.id as string, data); + await callWebhook(job.data.team_id, job.id as string, data); - await logJob({ - job_id: job.id as string, - success: success, - message: message, - num_docs: docs.length, - docs: docs, - time_taken: timeTakenInSeconds, - team_id: job.data.team_id, - mode: "crawl", - url: job.data.url, - crawlerOptions: job.data.crawlerOptions, - pageOptions: job.data.pageOptions, - origin: job.data.origin, - }); - done(null, data); - } catch (error) { - if (await getWebScraperQueue().isPaused(false)) { - return; - } + await logJob({ + job_id: job.id as string, + success: success, + message: message, + num_docs: docs.length, + docs: docs, + time_taken: timeTakenInSeconds, + team_id: job.data.team_id, + mode: "crawl", + url: job.data.url, + crawlerOptions: job.data.crawlerOptions, + pageOptions: job.data.pageOptions, + origin: job.data.origin, + }); + console.log("job done", job.id); + done(null, data); + } catch (error) { + console.log("job errored", job.id, error); + if (await getWebScraperQueue().isPaused(false)) { + console.log("queue is paused, ignoring"); + return; + } - if (error instanceof CustomError) { - // Here we handle the error, then save the failed job - console.error(error.message); // or any other error handling + if (error instanceof CustomError) { + // Here we handle the error, then save the failed job + console.error(error.message); // or any other error handling - logtail.error("Custom error while ingesting", { - job_id: job.id, - error: error.message, - dataIngestionJob: error.dataIngestionJob, - }); - } - console.log(error); - - logtail.error("Overall error ingesting", { + logtail.error("Custom error while ingesting", { job_id: job.id, error: error.message, + dataIngestionJob: error.dataIngestionJob, }); - - const data = { - success: false, - project_id: job.data.project_id, - error: - "Something went wrong... Contact help@mendable.ai or try again." /* etc... 
*/, - }; - await callWebhook(job.data.team_id, job.id as string, data); - await logJob({ - job_id: job.id as string, - success: false, - message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"), - num_docs: 0, - docs: [], - time_taken: 0, - team_id: job.data.team_id, - mode: "crawl", - url: job.data.url, - crawlerOptions: job.data.crawlerOptions, - pageOptions: job.data.pageOptions, - origin: job.data.origin, - }); - done(null, data); } + console.log(error); + + logtail.error("Overall error ingesting", { + job_id: job.id, + error: error.message, + }); + + const data = { + success: false, + project_id: job.data.project_id, + error: + "Something went wrong... Contact help@mendable.ai or try again." /* etc... */, + }; + await callWebhook(job.data.team_id, job.id as string, data); + await logJob({ + job_id: job.id as string, + success: false, + message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"), + num_docs: 0, + docs: [], + time_taken: 0, + team_id: job.data.team_id, + mode: "crawl", + url: job.data.url, + crawlerOptions: job.data.crawlerOptions, + pageOptions: job.data.pageOptions, + origin: job.data.origin, + }); + done(null, data); } +} + +wsq.process( + Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 
8)), + processJob ); diff --git a/apps/api/src/trigger-shutdown.ts b/apps/api/src/trigger-shutdown.ts deleted file mode 100644 index 5b36f81f..00000000 --- a/apps/api/src/trigger-shutdown.ts +++ /dev/null @@ -1,9 +0,0 @@ -fetch(process.argv[2] + "/admin/" + process.env.BULL_AUTH_KEY + "/shutdown", { - method: "POST" -}).then(async x => { - console.log(await x.text()); - process.exit(0); -}).catch(e => { - console.error(e); - process.exit(1); -}); From 2b4ce12097415a2bf68ef62335720751ba8aa364 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:43:22 -0300 Subject: [PATCH 17/23] Update openapi.json --- apps/api/openapi.json | 45 +++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 81481ef6..d12a0ac5 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -47,7 +47,12 @@ }, "includeHtml": { "type": "boolean", - "description": "Include the raw HTML content of the page. Will output a html key in the response.", + "description": "Include the HTML version of the content on page. Will output a html key in the response.", + "default": false + }, + "includeRawHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", "default": false }, "onlyIncludeTags": { @@ -235,7 +240,12 @@ }, "includeHtml": { "type": "boolean", - "description": "Include the raw HTML content of the page. Will output a html key in the response.", + "description": "Include the HTML version of the content on page. Will output a html key in the response.", + "default": false + }, + "includeRawHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. 
Will output a rawHtml key in the response.", "default": false }, "onlyIncludeTags": { @@ -340,7 +350,12 @@ }, "includeHtml": { "type": "boolean", - "description": "Include the raw HTML content of the page. Will output a html key in the response.", + "description": "Include the HTML version of the content on page. Will output a html key in the response.", + "default": false + }, + "includeRawHtml": { + "type": "boolean", + "description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.", "default": false } } @@ -420,14 +435,6 @@ "type": "integer", "description": "Current page number" }, - "current_url": { - "type": "string", - "description": "Current URL being scraped" - }, - "current_step": { - "type": "string", - "description": "Current step in the process" - }, "total": { "type": "integer", "description": "Total number of pages" @@ -444,7 +451,7 @@ "items": { "$ref": "#/components/schemas/CrawlStatusResponseObj" }, - "description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array." + "description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array." 
} } } @@ -540,7 +547,12 @@ "html": { "type": "string", "nullable": true, - "description": "Raw HTML content of the page if `includeHtml` is true" + "description": "HTML version of the content on page if `includeHtml` is true" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeRawHtml` is true" }, "metadata": { "type": "object", @@ -600,7 +612,12 @@ "html": { "type": "string", "nullable": true, - "description": "Raw HTML content of the page if `includeHtml` is true" + "description": "HTML version of the content on page if `includeHtml` is true" + }, + "rawHtml": { + "type": "string", + "nullable": true, + "description": "Raw HTML content of the page if `includeRawHtml` is true" }, "index": { "type": "integer", From 17a1f9b55fa85d0b88414c2a8d7d752d4ef96965 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Jul 2024 16:22:04 -0400 Subject: [PATCH 18/23] Update .env.example --- apps/api/.env.example | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/.env.example b/apps/api/.env.example index c39c8fa3..5d8e746d 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -3,6 +3,7 @@ NUM_WORKERS_PER_QUEUE=8 PORT=3002 HOST=0.0.0.0 REDIS_URL=redis://localhost:6379 +REDIS_RATE_LIMIT_URL=redis://localhost:6379 PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html ## To turn on DB authentication, you need to set up supabase. 
From 6609c1b6e5367a5d55495ccd6aa77a6df7fd3898 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Jul 2024 16:22:27 -0400 Subject: [PATCH 19/23] Update .env.local --- apps/api/.env.local | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/.env.local b/apps/api/.env.local index 88133b76..17f85935 100644 --- a/apps/api/.env.local +++ b/apps/api/.env.local @@ -5,6 +5,7 @@ SUPABASE_ANON_TOKEN= SUPABASE_URL= SUPABASE_SERVICE_TOKEN= REDIS_URL= +REDIS_RATE_LIMIT_URL= SCRAPING_BEE_API_KEY= OPENAI_API_KEY= ANTHROPIC_API_KEY= From 2fab2d8d29c9c458a2461a7e3da4cd5510199008 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Jul 2024 20:44:34 -0400 Subject: [PATCH 20/23] Update scrape.ts --- apps/api/src/controllers/scrape.ts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index f5e2c322..f076425f 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -116,11 +116,14 @@ export async function scrapeController(req: Request, res: Response) { const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions }; + const origin = req.body.origin ?? defaultOrigin; + let timeout = req.body.timeout ?? defaultTimeout; + if (extractorOptions.mode === "llm-extraction") { pageOptions.onlyMainContent = true; + timeout = req.body.timeout ?? 90000; } - const origin = req.body.origin ?? defaultOrigin; - const timeout = req.body.timeout ?? 
defaultTimeout; + try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = From 12ec519f9b3ee106d16013c66f7892e32873a16e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Jul 2024 22:44:23 -0400 Subject: [PATCH 21/23] Update docker-compose.yaml --- docker-compose.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-compose.yaml b/docker-compose.yaml index b88f3ed8..98b00041 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -7,6 +7,7 @@ x-common-service: &common-service - backend environment: - REDIS_URL=${REDIS_URL:-redis://redis:6379} + - REDIS_RATE_LIMIT_URL=${REDIS_URL:-redis://redis:6379} - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000} - USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION} - PORT=${PORT:-3002} From a23b125471709374a9d07984aaaefb76eb40a648 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Thu, 18 Jul 2024 14:20:51 +0200 Subject: [PATCH 22/23] fix(js-sdk): transform tests with ts-jest and configure node --- apps/js-sdk/firecrawl/jest.config.cjs | 5 - apps/js-sdk/firecrawl/jest.config.js | 16 ++ apps/js-sdk/firecrawl/package-lock.json | 315 +++++++++++++++--------- apps/js-sdk/firecrawl/package.json | 4 +- 4 files changed, 215 insertions(+), 125 deletions(-) delete mode 100644 apps/js-sdk/firecrawl/jest.config.cjs create mode 100644 apps/js-sdk/firecrawl/jest.config.js diff --git a/apps/js-sdk/firecrawl/jest.config.cjs b/apps/js-sdk/firecrawl/jest.config.cjs deleted file mode 100644 index b413e106..00000000 --- a/apps/js-sdk/firecrawl/jest.config.cjs +++ /dev/null @@ -1,5 +0,0 @@ -/** @type {import('ts-jest').JestConfigWithTsJest} */ -module.exports = { - preset: 'ts-jest', - testEnvironment: 'node', -}; \ No newline at end of file diff --git a/apps/js-sdk/firecrawl/jest.config.js b/apps/js-sdk/firecrawl/jest.config.js new file mode 100644 index 00000000..df49ad11 --- /dev/null +++ b/apps/js-sdk/firecrawl/jest.config.js @@ -0,0 +1,16 @@ +/** @type 
{import('ts-jest').JestConfigWithTsJest} **/ +export default { + testEnvironment: "node", + "moduleNameMapper": { + "^(\\.{1,2}/.*)\\.js$": "$1", + }, + "extensionsToTreatAsEsm": [".ts"], + "transform": { + "^.+\\.(mt|t|cj|j)s$": [ + "ts-jest", + { + "useESM": true + } + ] + }, +}; \ No newline at end of file diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index f3045f33..25b0e305 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -24,7 +24,7 @@ "@types/node": "^20.12.12", "@types/uuid": "^9.0.8", "jest": "^29.7.0", - "ts-jest": "^29.1.2", + "ts-jest": "^29.2.2", "typescript": "^5.4.5" } }, @@ -42,12 +42,12 @@ } }, "node_modules/@babel/code-frame": { - "version": "7.24.2", - "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.24.2.tgz", - "integrity": "sha512-y5+tLQyV8pg3fsiln67BVLD1P13Eg4lh5RW9mF0zUuvLrv9uIQ4MCL+CRT+FTsBlBjcIan6PGsLcBN0m3ClUyQ==", + "version": "7.24.7", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.24.7.tgz", + "integrity": "sha512-BcYH1CVJBO9tvyIZ2jVeXgSIMvGZ2FDRvDdOIVQyuklNKSsx+eppDEBq/g47Ayw+RqNFE+URvOShmf+f/qwAlA==", "dev": true, "dependencies": { - "@babel/highlight": "^7.24.2", + "@babel/highlight": "^7.24.7", "picocolors": "^1.0.0" }, "engines": { @@ -55,9 +55,9 @@ } }, "node_modules/@babel/compat-data": { - "version": "7.24.4", - "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.24.4.tgz", - "integrity": "sha512-vg8Gih2MLK+kOkHJp4gBEIkyaIi00jgWot2D9QOmmfLC8jINSOzmCLta6Bvz/JSBCqnegV0L80jhxkol5GWNfQ==", + "version": "7.24.9", + "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.24.9.tgz", + "integrity": "sha512-e701mcfApCJqMMueQI0Fb68Amflj83+dvAvHawoBpAz+GDjCIyGHzNwnefjsWJ3xiYAqqiQFoWbspGYBdb2/ng==", "dev": true, "engines": { "node": ">=6.9.0" @@ -94,12 +94,12 @@ } }, "node_modules/@babel/generator": { - "version": "7.24.4", - "resolved": 
"https://registry.npmjs.org/@babel/generator/-/generator-7.24.4.tgz", - "integrity": "sha512-Xd6+v6SnjWVx/nus+y0l1sxMOTOMBkyL4+BIdbALyatQnAe/SRVjANeDPSCYaX+i1iJmuGSKf3Z+E+V/va1Hvw==", + "version": "7.24.10", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.24.10.tgz", + "integrity": "sha512-o9HBZL1G2129luEUlG1hB4N/nlYNWHnpwlND9eOMclRqqu1YDy2sSYVCFUZwl8I1Gxh+QSRrP2vD7EpUmFVXxg==", "dev": true, "dependencies": { - "@babel/types": "^7.24.0", + "@babel/types": "^7.24.9", "@jridgewell/gen-mapping": "^0.3.5", "@jridgewell/trace-mapping": "^0.3.25", "jsesc": "^2.5.1" @@ -109,14 +109,14 @@ } }, "node_modules/@babel/helper-compilation-targets": { - "version": "7.23.6", - "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.23.6.tgz", - "integrity": "sha512-9JB548GZoQVmzrFgp8o7KxdgkTGm6xs9DW0o/Pim72UDjzr5ObUQ6ZzYPqA+g9OTS2bBQoctLJrky0RDCAWRgQ==", + "version": "7.24.8", + "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.24.8.tgz", + "integrity": "sha512-oU+UoqCHdp+nWVDkpldqIQL/i/bvAv53tRqLG/s+cOXxe66zOYLU7ar/Xs3LdmBihrUMEUhwu6dMZwbNOYDwvw==", "dev": true, "dependencies": { - "@babel/compat-data": "^7.23.5", - "@babel/helper-validator-option": "^7.23.5", - "browserslist": "^4.22.2", + "@babel/compat-data": "^7.24.8", + "@babel/helper-validator-option": "^7.24.8", + "browserslist": "^4.23.1", "lru-cache": "^5.1.1", "semver": "^6.3.1" }, @@ -125,62 +125,66 @@ } }, "node_modules/@babel/helper-environment-visitor": { - "version": "7.22.20", - "resolved": "https://registry.npmjs.org/@babel/helper-environment-visitor/-/helper-environment-visitor-7.22.20.tgz", - "integrity": "sha512-zfedSIzFhat/gFhWfHtgWvlec0nqB9YEIVrpuwjruLlXfUSnA8cJB0miHKwqDnQ7d32aKo2xt88/xZptwxbfhA==", + "version": "7.24.7", + "resolved": "https://registry.npmjs.org/@babel/helper-environment-visitor/-/helper-environment-visitor-7.24.7.tgz", + "integrity": 
"sha512-DoiN84+4Gnd0ncbBOM9AZENV4a5ZiL39HYMyZJGZ/AZEykHYdJw0wW3kdcsh9/Kn+BRXHLkkklZ51ecPKmI1CQ==", "dev": true, + "dependencies": { + "@babel/types": "^7.24.7" + }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-function-name": { - "version": "7.23.0", - "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.23.0.tgz", - "integrity": "sha512-OErEqsrxjZTJciZ4Oo+eoZqeW9UIiOcuYKRJA4ZAgV9myA+pOXhhmpfNCKjEH/auVfEYVFJ6y1Tc4r0eIApqiw==", + "version": "7.24.7", + "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.24.7.tgz", + "integrity": "sha512-FyoJTsj/PEUWu1/TYRiXTIHc8lbw+TDYkZuoE43opPS5TrI7MyONBE1oNvfguEXAD9yhQRrVBnXdXzSLQl9XnA==", "dev": true, "dependencies": { - "@babel/template": "^7.22.15", - "@babel/types": "^7.23.0" + "@babel/template": "^7.24.7", + "@babel/types": "^7.24.7" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-hoist-variables": { - "version": "7.22.5", - "resolved": "https://registry.npmjs.org/@babel/helper-hoist-variables/-/helper-hoist-variables-7.22.5.tgz", - "integrity": "sha512-wGjk9QZVzvknA6yKIUURb8zY3grXCcOZt+/7Wcy8O2uctxhplmUPkOdlgoNhmdVee2c92JXbf1xpMtVNbfoxRw==", + "version": "7.24.7", + "resolved": "https://registry.npmjs.org/@babel/helper-hoist-variables/-/helper-hoist-variables-7.24.7.tgz", + "integrity": "sha512-MJJwhkoGy5c4ehfoRyrJ/owKeMl19U54h27YYftT0o2teQ3FJ3nQUf/I3LlJsX4l3qlw7WRXUmiyajvHXoTubQ==", "dev": true, "dependencies": { - "@babel/types": "^7.22.5" + "@babel/types": "^7.24.7" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-module-imports": { - "version": "7.24.3", - "resolved": "https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.24.3.tgz", - "integrity": "sha512-viKb0F9f2s0BCS22QSF308z/+1YWKV/76mwt61NBzS5izMzDPwdq1pTrzf+Li3npBWX9KdQbkeCt1jSAM7lZqg==", + "version": "7.24.7", + "resolved": 
"https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.24.7.tgz", + "integrity": "sha512-8AyH3C+74cgCVVXow/myrynrAGv+nTVg5vKu2nZph9x7RcRwzmh0VFallJuFTZ9mx6u4eSdXZfcOzSqTUm0HCA==", "dev": true, "dependencies": { - "@babel/types": "^7.24.0" + "@babel/traverse": "^7.24.7", + "@babel/types": "^7.24.7" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-module-transforms": { - "version": "7.23.3", - "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.23.3.tgz", - "integrity": "sha512-7bBs4ED9OmswdfDzpz4MpWgSrV7FXlc3zIagvLFjS5H+Mk7Snr21vQ6QwrsoCGMfNC4e4LQPdoULEt4ykz0SRQ==", + "version": "7.24.9", + "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.24.9.tgz", + "integrity": "sha512-oYbh+rtFKj/HwBQkFlUzvcybzklmVdVV3UU+mN7n2t/q3yGHbuVdNxyFvSBO1tfvjyArpHNcWMAzsSPdyI46hw==", "dev": true, "dependencies": { - "@babel/helper-environment-visitor": "^7.22.20", - "@babel/helper-module-imports": "^7.22.15", - "@babel/helper-simple-access": "^7.22.5", - "@babel/helper-split-export-declaration": "^7.22.6", - "@babel/helper-validator-identifier": "^7.22.20" + "@babel/helper-environment-visitor": "^7.24.7", + "@babel/helper-module-imports": "^7.24.7", + "@babel/helper-simple-access": "^7.24.7", + "@babel/helper-split-export-declaration": "^7.24.7", + "@babel/helper-validator-identifier": "^7.24.7" }, "engines": { "node": ">=6.9.0" @@ -190,60 +194,61 @@ } }, "node_modules/@babel/helper-plugin-utils": { - "version": "7.24.0", - "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.24.0.tgz", - "integrity": "sha512-9cUznXMG0+FxRuJfvL82QlTqIzhVW9sL0KjMPHhAOOvpQGL8QtdxnBKILjBqxlHyliz0yCa1G903ZXI/FuHy2w==", + "version": "7.24.8", + "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.24.8.tgz", + "integrity": 
"sha512-FFWx5142D8h2Mgr/iPVGH5G7w6jDn4jUSpZTyDnQO0Yn7Ks2Kuz6Pci8H6MPCoUJegd/UZQ3tAvfLCxQSnWWwg==", "dev": true, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-simple-access": { - "version": "7.22.5", - "resolved": "https://registry.npmjs.org/@babel/helper-simple-access/-/helper-simple-access-7.22.5.tgz", - "integrity": "sha512-n0H99E/K+Bika3++WNL17POvo4rKWZ7lZEp1Q+fStVbUi8nxPQEBOlTmCOxW/0JsS56SKKQ+ojAe2pHKJHN35w==", + "version": "7.24.7", + "resolved": "https://registry.npmjs.org/@babel/helper-simple-access/-/helper-simple-access-7.24.7.tgz", + "integrity": "sha512-zBAIvbCMh5Ts+b86r/CjU+4XGYIs+R1j951gxI3KmmxBMhCg4oQMsv6ZXQ64XOm/cvzfU1FmoCyt6+owc5QMYg==", "dev": true, "dependencies": { - "@babel/types": "^7.22.5" + "@babel/traverse": "^7.24.7", + "@babel/types": "^7.24.7" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-split-export-declaration": { - "version": "7.22.6", - "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.22.6.tgz", - "integrity": "sha512-AsUnxuLhRYsisFiaJwvp1QF+I3KjD5FOxut14q/GzovUe6orHLesW2C7d754kRm53h5gqrz6sFl6sxc4BVtE/g==", + "version": "7.24.7", + "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.24.7.tgz", + "integrity": "sha512-oy5V7pD+UvfkEATUKvIjvIAH/xCzfsFVw7ygW2SI6NClZzquT+mwdTfgfdbUiceh6iQO0CHtCPsyze/MZ2YbAA==", "dev": true, "dependencies": { - "@babel/types": "^7.22.5" + "@babel/types": "^7.24.7" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-string-parser": { - "version": "7.24.1", - "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.24.1.tgz", - "integrity": "sha512-2ofRCjnnA9y+wk8b9IAREroeUP02KHp431N2mhKniy2yKIDKpbrHv9eXwm8cBeWQYcJmzv5qKCu65P47eCF7CQ==", + "version": "7.24.8", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.24.8.tgz", + "integrity": 
"sha512-pO9KhhRcuUyGnJWwyEgnRJTSIZHiT+vMD0kPeD+so0l7mxkMT19g3pjY9GTnHySck/hDzq+dtW/4VgnMkippsQ==", "dev": true, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-validator-identifier": { - "version": "7.22.20", - "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz", - "integrity": "sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A==", + "version": "7.24.7", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.24.7.tgz", + "integrity": "sha512-rR+PBcQ1SMQDDyF6X0wxtG8QyLCgUB0eRAGguqRLfkCA87l7yAP7ehq8SNj96OOGTO8OBV70KhuFYcIkHXOg0w==", "dev": true, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-validator-option": { - "version": "7.23.5", - "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.23.5.tgz", - "integrity": "sha512-85ttAOMLsr53VgXkTbkx8oA6YTfT4q7/HzXSLEYmjcSTJPMPQtvq1BD79Byep5xMUYbGRzEpDsjUf3dyp54IKw==", + "version": "7.24.8", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.24.8.tgz", + "integrity": "sha512-xb8t9tD1MHLungh/AIoWYN+gVHaB9kwlu8gffXGSt3FFEIT7RjS+xWbc2vUD1UTZdIpKj/ab3rdqJ7ufngyi2Q==", "dev": true, "engines": { "node": ">=6.9.0" @@ -264,12 +269,12 @@ } }, "node_modules/@babel/highlight": { - "version": "7.24.2", - "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.24.2.tgz", - "integrity": "sha512-Yac1ao4flkTxTteCDZLEvdxg2fZfz1v8M4QpaGypq/WPDqg3ijHYbDfs+LG5hvzSoqaSZ9/Z9lKSP3CjZjv+pA==", + "version": "7.24.7", + "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.24.7.tgz", + "integrity": "sha512-EStJpq4OuY8xYfhGVXngigBJRWxftKX9ksiGDnmlY3o7B/V7KIAc9X4oiK87uPJSc/vs5L869bem5fhZa8caZw==", "dev": true, "dependencies": { - "@babel/helper-validator-identifier": "^7.22.20", + "@babel/helper-validator-identifier": "^7.24.7", 
"chalk": "^2.4.2", "js-tokens": "^4.0.0", "picocolors": "^1.0.0" @@ -350,9 +355,9 @@ } }, "node_modules/@babel/parser": { - "version": "7.24.4", - "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.24.4.tgz", - "integrity": "sha512-zTvEBcghmeBma9QIGunWevvBAp4/Qu9Bdq+2k0Ot4fVMD6v3dsC9WOcRSKk7tRRyBM/53yKMJko9xOatGQAwSg==", + "version": "7.24.8", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.24.8.tgz", + "integrity": "sha512-WzfbgXOkGzZiXXCqk43kKwZjzwx4oulxZi3nq2TYL9mOjQv6kYwul9mz6ID36njuL7Xkp6nJEfok848Zj10j/w==", "dev": true, "bin": { "parser": "bin/babel-parser.js" @@ -539,33 +544,33 @@ } }, "node_modules/@babel/template": { - "version": "7.24.0", - "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.24.0.tgz", - "integrity": "sha512-Bkf2q8lMB0AFpX0NFEqSbx1OkTHf0f+0j82mkw+ZpzBnkk7e9Ql0891vlfgi+kHwOk8tQjiQHpqh4LaSa0fKEA==", + "version": "7.24.7", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.24.7.tgz", + "integrity": "sha512-jYqfPrU9JTF0PmPy1tLYHW4Mp4KlgxJD9l2nP9fD6yT/ICi554DmrWBAEYpIelzjHf1msDP3PxJIRt/nFNfBig==", "dev": true, "dependencies": { - "@babel/code-frame": "^7.23.5", - "@babel/parser": "^7.24.0", - "@babel/types": "^7.24.0" + "@babel/code-frame": "^7.24.7", + "@babel/parser": "^7.24.7", + "@babel/types": "^7.24.7" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/traverse": { - "version": "7.24.1", - "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.24.1.tgz", - "integrity": "sha512-xuU6o9m68KeqZbQuDt2TcKSxUw/mrsvavlEqQ1leZ/B+C9tk6E4sRWy97WaXgvq5E+nU3cXMxv3WKOCanVMCmQ==", + "version": "7.24.8", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.24.8.tgz", + "integrity": "sha512-t0P1xxAPzEDcEPmjprAQq19NWum4K0EQPjMwZQZbHt+GiZqvjCHjj755Weq1YRPVzBI+3zSfvScfpnuIecVFJQ==", "dev": true, "dependencies": { - "@babel/code-frame": "^7.24.1", - "@babel/generator": "^7.24.1", - "@babel/helper-environment-visitor": "^7.22.20", - 
"@babel/helper-function-name": "^7.23.0", - "@babel/helper-hoist-variables": "^7.22.5", - "@babel/helper-split-export-declaration": "^7.22.6", - "@babel/parser": "^7.24.1", - "@babel/types": "^7.24.0", + "@babel/code-frame": "^7.24.7", + "@babel/generator": "^7.24.8", + "@babel/helper-environment-visitor": "^7.24.7", + "@babel/helper-function-name": "^7.24.7", + "@babel/helper-hoist-variables": "^7.24.7", + "@babel/helper-split-export-declaration": "^7.24.7", + "@babel/parser": "^7.24.8", + "@babel/types": "^7.24.8", "debug": "^4.3.1", "globals": "^11.1.0" }, @@ -574,13 +579,13 @@ } }, "node_modules/@babel/types": { - "version": "7.24.0", - "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.24.0.tgz", - "integrity": "sha512-+j7a5c253RfKh8iABBhywc8NSfP5LURe7Uh4qpsh6jc+aLJguvmIUBdjSdEMQv2bENrCR5MfRdjGo7vzS/ob7w==", + "version": "7.24.9", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.24.9.tgz", + "integrity": "sha512-xm8XrMKz0IlUdocVbYJe0Z9xEgidU7msskG8BbhnTPK/HZ2z/7FP7ykqPgrUH+C+r414mNfNWam1f2vqOjqjYQ==", "dev": true, "dependencies": { - "@babel/helper-string-parser": "^7.23.4", - "@babel/helper-validator-identifier": "^7.22.20", + "@babel/helper-string-parser": "^7.24.8", + "@babel/helper-validator-identifier": "^7.24.7", "to-fast-properties": "^2.0.0" }, "engines": { @@ -1175,6 +1180,12 @@ "sprintf-js": "~1.0.2" } }, + "node_modules/async": { + "version": "3.2.5", + "resolved": "https://registry.npmjs.org/async/-/async-3.2.5.tgz", + "integrity": "sha512-baNZyqaaLhyLVKm/DlvdW051MSgO6b8eVfIezl9E5PqWxFgzLm/wQntEW4zOytVburDEr0JlALEpdOFwvErLsg==", + "dev": true + }, "node_modules/asynckit": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", @@ -1326,9 +1337,9 @@ } }, "node_modules/browserslist": { - "version": "4.23.0", - "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.23.0.tgz", - "integrity": 
"sha512-QW8HiM1shhT2GuzkvklfjcKDiWFXHOeFCIA/huJPwHsslwcydgk7X+z2zXpEijP98UCY7HbubZt5J2Zgvf0CaQ==", + "version": "4.23.2", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.23.2.tgz", + "integrity": "sha512-qkqSyistMYdxAcw+CzbZwlBy8AGmS/eEWs+sEV5TnLRGDOL+C5M2EnH6tlZyg0YoAxGJAFKh61En9BR941GnHA==", "dev": true, "funding": [ { @@ -1345,10 +1356,10 @@ } ], "dependencies": { - "caniuse-lite": "^1.0.30001587", - "electron-to-chromium": "^1.4.668", + "caniuse-lite": "^1.0.30001640", + "electron-to-chromium": "^1.4.820", "node-releases": "^2.0.14", - "update-browserslist-db": "^1.0.13" + "update-browserslist-db": "^1.1.0" }, "bin": { "browserslist": "cli.js" @@ -1403,9 +1414,9 @@ } }, "node_modules/caniuse-lite": { - "version": "1.0.30001612", - "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001612.tgz", - "integrity": "sha512-lFgnZ07UhaCcsSZgWW0K5j4e69dK1u/ltrL9lTUiFOwNHs12S3UMIEYgBV0Z6C6hRDev7iRnMzzYmKabYdXF9g==", + "version": "1.0.30001642", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001642.tgz", + "integrity": "sha512-3XQ0DoRgLijXJErLSl+bLnJ+Et4KqV1PY6JJBGAFlsNsz31zeAIncyeZfLCabHK/jtSh+671RM9YMldxjUPZtA==", "dev": true, "funding": [ { @@ -1651,10 +1662,25 @@ "url": "https://dotenvx.com" } }, + "node_modules/ejs": { + "version": "3.1.10", + "resolved": "https://registry.npmjs.org/ejs/-/ejs-3.1.10.tgz", + "integrity": "sha512-UeJmFfOrAQS8OJWPZ4qtgHyWExa088/MtK5UEyoJGFH67cDEXkZSviOiKRCZ4Xij0zxI3JECgYs3oKx+AizQBA==", + "dev": true, + "dependencies": { + "jake": "^10.8.5" + }, + "bin": { + "ejs": "bin/cli.js" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/electron-to-chromium": { - "version": "1.4.748", - "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.748.tgz", - "integrity": "sha512-VWqjOlPZn70UZ8FTKUOkUvBLeTQ0xpty66qV0yJcAGY2/CthI4xyW9aEozRVtuwv3Kpf5xTesmJUcPwuJmgP4A==", + "version": "1.4.829", + "resolved": 
"https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.829.tgz", + "integrity": "sha512-5qp1N2POAfW0u1qGAxXEtz6P7bO1m6gpZr5hdf5ve6lxpLM7MpiM4jIPz7xcrNlClQMafbyUDDWjlIQZ1Mw0Rw==", "dev": true }, "node_modules/emittery": { @@ -1778,6 +1804,36 @@ "bser": "2.1.1" } }, + "node_modules/filelist": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/filelist/-/filelist-1.0.4.tgz", + "integrity": "sha512-w1cEuf3S+DrLCQL7ET6kz+gmlJdbq9J7yXCSjK/OZCPA+qEN1WyF4ZAf0YYJa4/shHJra2t/d/r8SV4Ji+x+8Q==", + "dev": true, + "dependencies": { + "minimatch": "^5.0.1" + } + }, + "node_modules/filelist/node_modules/brace-expansion": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", + "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", + "dev": true, + "dependencies": { + "balanced-match": "^1.0.0" + } + }, + "node_modules/filelist/node_modules/minimatch": { + "version": "5.1.6", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz", + "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==", + "dev": true, + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/fill-range": { "version": "7.0.1", "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", @@ -2180,6 +2236,24 @@ "node": ">=8" } }, + "node_modules/jake": { + "version": "10.9.1", + "resolved": "https://registry.npmjs.org/jake/-/jake-10.9.1.tgz", + "integrity": "sha512-61btcOHNnLnsOdtLgA5efqQWjnSi/vow5HbI7HMdKKWqvrKR1bLK3BPlJn9gcSaP2ewuamUSMB5XEy76KUIS2w==", + "dev": true, + "dependencies": { + "async": "^3.2.3", + "chalk": "^4.0.2", + "filelist": "^1.0.4", + "minimatch": "^3.1.2" + }, + "bin": { + "jake": "bin/cli.js" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/jest": { "version": "29.7.0", "resolved": 
"https://registry.npmjs.org/jest/-/jest-29.7.0.tgz", @@ -3009,9 +3083,9 @@ "dev": true }, "node_modules/node-releases": { - "version": "2.0.14", - "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.14.tgz", - "integrity": "sha512-y10wOWt8yZpqXmOgRo77WaHEmhYQYGNA6y421PKsKYWEK8aW+cqAphborZDhqfyKrbZEN92CN1X2KbafY2s7Yw==", + "version": "2.0.17", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.17.tgz", + "integrity": "sha512-Ww6ZlOiEQfPfXM45v17oabk77Z7mg5bOt7AjDyzy7RjK9OrLrLC8dyZQoAPEOtFX9SaNf1Tdvr5gRJWdTJj7GA==", "dev": true }, "node_modules/normalize-path": { @@ -3162,9 +3236,9 @@ "dev": true }, "node_modules/picocolors": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.0.tgz", - "integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.1.tgz", + "integrity": "sha512-anP1Z8qwhkbmu7MFP5iTt+wQKXgwzf7zTyGlcdzabySa9vd0Xt392U0rVmz9poOaBj0uHJKyyo9/upk0HrEQew==", "dev": true }, "node_modules/picomatch": { @@ -3545,12 +3619,13 @@ } }, "node_modules/ts-jest": { - "version": "29.1.2", - "resolved": "https://registry.npmjs.org/ts-jest/-/ts-jest-29.1.2.tgz", - "integrity": "sha512-br6GJoH/WUX4pu7FbZXuWGKGNDuU7b8Uj77g/Sp7puZV6EXzuByl6JrECvm0MzVzSTkSHWTihsXt+5XYER5b+g==", + "version": "29.2.2", + "resolved": "https://registry.npmjs.org/ts-jest/-/ts-jest-29.2.2.tgz", + "integrity": "sha512-sSW7OooaKT34AAngP6k1VS669a0HdLxkQZnlC7T76sckGCokXFnvJ3yRlQZGRTAoV5K19HfSgCiSwWOSIfcYlg==", "dev": true, "dependencies": { "bs-logger": "0.x", + "ejs": "^3.0.0", "fast-json-stable-stringify": "2.x", "jest-util": "^29.0.0", "json5": "^2.2.3", @@ -3563,10 +3638,11 @@ "ts-jest": "cli.js" }, "engines": { - "node": "^16.10.0 || ^18.0.0 || >=20.0.0" + "node": "^14.15.0 || ^16.10.0 || ^18.0.0 || >=20.0.0" }, "peerDependencies": { "@babel/core": ">=7.0.0-beta.0 <8", + 
"@jest/transform": "^29.0.0", "@jest/types": "^29.0.0", "babel-jest": "^29.0.0", "jest": "^29.0.0", @@ -3576,6 +3652,9 @@ "@babel/core": { "optional": true }, + "@jest/transform": { + "optional": true + }, "@jest/types": { "optional": true }, @@ -3661,9 +3740,9 @@ "dev": true }, "node_modules/update-browserslist-db": { - "version": "1.0.13", - "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.0.13.tgz", - "integrity": "sha512-xebP81SNcPuNpPP3uzeW1NYXxI3rxyJzF3pD6sH4jE7o/IX+WtSpwnVU+qIsDPyk0d3hmFQ7mjqc6AtV604hbg==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.0.tgz", + "integrity": "sha512-EdRAaAyk2cUE1wOf2DkEhzxqOQvFOoRJFNS6NeyJ01Gp2beMRpBAINjM2iDXE3KCuKhwnvHIQCJm6ThL2Z+HzQ==", "dev": true, "funding": [ { @@ -3680,8 +3759,8 @@ } ], "dependencies": { - "escalade": "^3.1.1", - "picocolors": "^1.0.0" + "escalade": "^3.1.2", + "picocolors": "^1.0.1" }, "bin": { "update-browserslist-db": "cli.js" diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 0fef67b0..71d2362e 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -9,7 +9,7 @@ "build": "tsc", "build-and-publish": "npm run build && npm publish --access public", "publish-beta": "npm run build && npm publish --access public --tag beta", - "test": "jest src/__tests__/**/*.test.ts" + "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/**/*.test.ts" }, "repository": { "type": "git", @@ -37,7 +37,7 @@ "@types/node": "^20.12.12", "@types/uuid": "^9.0.8", "jest": "^29.7.0", - "ts-jest": "^29.1.2", + "ts-jest": "^29.2.2", "typescript": "^5.4.5" }, "keywords": [ From f13ef02a08f2d7fc237a48e1309cc4157fe1aa2f Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 18 Jul 2024 10:34:03 -0300 Subject: [PATCH 23/23] Update openapi.json --- apps/api/openapi.json | 236 
++++++++++++++++++++++++++++++++++++++---- 1 file changed, 216 insertions(+), 20 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index d12a0ac5..e0b583f0 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -93,21 +93,22 @@ }, "extractorOptions": { "type": "object", - "description": "Options for LLM-based extraction of structured information from the page content", + "description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.", + "default": {}, "properties": { "mode": { "type": "string", - "enum": ["llm-extraction", "llm-extraction-from-raw-html"], - "description": "The extraction mode to use. llm-extraction: Extracts information from the cleaned and parsed content. llm-extraction-from-raw-html: Extracts information directly from the raw HTML." + "enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"], + "description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM." }, "extractionPrompt": { "type": "string", - "description": "A prompt describing what information to extract from the page" + "description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes." 
}, "extractionSchema": { "type": "object", "additionalProperties": true, - "description": "The schema for the data to be extracted", + "description": "The schema for the data to be extracted, required only for LLM extraction modes.", "required": [ "company_mission", "supports_sso", @@ -139,13 +140,52 @@ } }, "402": { - "description": "Payment required" + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } }, "429": { - "description": "Too many requests" + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } }, "500": { - "description": "Server error" + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } } } } @@ -302,13 +342,52 @@ } }, "402": { - "description": "Payment required" + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } }, "429": { - "description": "Too many requests" + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." 
+ } + } + } + } + } }, "500": { - "description": "Server error" + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } } } } @@ -387,13 +466,52 @@ } }, "402": { - "description": "Payment required" + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } }, "429": { - "description": "Too many requests" + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } }, "500": { - "description": "Server error" + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } } } } @@ -459,13 +577,52 @@ } }, "402": { - "description": "Payment required" + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } }, "429": { - "description": "Too many requests" + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." 
+ } + } + } + } + } }, "500": { - "description": "Server error" + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } } } } @@ -509,13 +666,52 @@ } }, "402": { - "description": "Payment required" + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } }, "429": { - "description": "Too many requests" + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } }, "500": { - "description": "Server error" + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } } } }