From bd928b1512ff08ba435f8ec23581f44c489ce84a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 13 Nov 2024 20:27:20 -0500 Subject: [PATCH 01/26] Nick: changed email from hello to help --- apps/api/src/controllers/v0/crawl.ts | 2 +- apps/api/src/controllers/v0/scrape.ts | 2 +- apps/api/src/controllers/v1/crawl-status-ws.ts | 2 +- apps/api/src/index.ts | 2 +- apps/api/src/services/notification/email_notification.ts | 6 +++--- apps/api/src/services/queue-worker.ts | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index fa7627da..5de5eccf 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -75,7 +75,7 @@ export async function crawlController(req: Request, res: Response) { await checkTeamCredits(chunk, team_id, limitCheck); if (!creditsCheckSuccess) { - return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at hello@firecrawl.com" }); + return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" }); } // TODO: need to do this to v1 diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 9bc41fc1..02b9400e 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -209,7 +209,7 @@ export async function scrapeController(req: Request, res: Response) { earlyReturn = true; return res.status(500).json({ error: - "Error checking team credits. Please contact hello@firecrawl.com for help.", + "Error checking team credits. Please contact help@firecrawl.com for help.", }); } diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts index 18222edc..f552492f 100644 --- a/apps/api/src/controllers/v1/crawl-status-ws.ts +++ b/apps/api/src/controllers/v1/crawl-status-ws.ts @@ -175,7 +175,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose); return close(ws, 1011, { type: "error", - error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id + error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id }); } } diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 049b37a9..e32bf97f 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -207,7 +207,7 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response } logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); - res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id }); + res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact help@firecrawl.com for help. 
Your exception ID is " + id }); }); logger.info(`Worker ${process.pid} started`); diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index e451e0c0..e4575b32 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -23,7 +23,7 @@ const emailTemplates: Record< }, [NotificationType.RATE_LIMIT_REACHED]: { subject: "Rate Limit Reached - Firecrawl", - html: "Hey there,
You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our pricing page for more info. If you have any questions, feel free to reach out to us at hello@firecrawl.com Thanks, Firecrawl Team Ps. this email is only sent once every 7 days if you reach a rate limit.",
+ html: "Hey there, You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our pricing page for more info. If you have any questions, feel free to reach out to us at help@firecrawl.com Thanks, Firecrawl Team
Ps. this email is only sent once every 7 days if you reach a rate limit.", }, [NotificationType.AUTO_RECHARGE_SUCCESS]: { subject: "Auto recharge successful - Firecrawl", @@ -31,7 +31,7 @@ const emailTemplates: Record< }, [NotificationType.AUTO_RECHARGE_FAILED]: { subject: "Auto recharge failed - Firecrawl", - html: "Hey there,
Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at hello@firecrawl.com Thanks, Firecrawl Team",
+ html: "Hey there, Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at help@firecrawl.com Thanks, Firecrawl Team
", }, }; @@ -63,7 +63,7 @@ export async function sendEmailNotification( const { data, error } = await resend.emails.send({ from: "Firecrawl ", to: [email], - reply_to: "hello@firecrawl.com", + reply_to: "help@firecrawl.com", subject: emailTemplates[notificationType].subject, html: emailTemplates[notificationType].html, }); diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 33b2ca9a..54841061 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -262,7 +262,7 @@ async function processJob(job: Job & { id: string }, token: string) { document: null, project_id: job.data.project_id, error: - "URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.", + "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.", }; return data; } From 0a1c99074f13715fefe00143501313e63b0b12c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= Date: Thu, 14 Nov 2024 08:58:00 +0100 Subject: [PATCH 02/26] fix(html-to-markdown): make error reporting less intrusive --- apps/api/src/lib/html-to-markdown.ts | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 8800d916..92bcd4cd 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -6,22 +6,28 @@ import * as Sentry from "@sentry/node"; import dotenv from 'dotenv'; import { logger } from './logger'; +import { stat } from 'fs/promises'; dotenv.config(); // TODO: add a timeout to the Go parser +const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so'); class GoMarkdownConverter { private static instance: GoMarkdownConverter; private convert: any; private constructor() { - const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so'); const lib = koffi.load(goExecutablePath); this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']); } - public static getInstance(): GoMarkdownConverter { + public static async getInstance(): Promise { if (!GoMarkdownConverter.instance) { + try { + await stat(goExecutablePath); + } catch (_) { + throw Error("Go shared library not found"); + } GoMarkdownConverter.instance = new GoMarkdownConverter(); } return GoMarkdownConverter.instance; @@ -47,7 +53,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise Date: Thu, 14 Nov 2024 10:13:48 +0100 Subject: [PATCH 03/26] fix(scrapeURL): adjust error message for clarity --- apps/api/src/scraper/scrapeURL/error.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts index 56ab290d..c78c812d 100644 --- a/apps/api/src/scraper/scrapeURL/error.ts +++ b/apps/api/src/scraper/scrapeURL/error.ts @@ -18,7 +18,7 @@ export class NoEnginesLeftError extends Error { public results: EngineResultsTracker; constructor(fallbackList: Engine[], results: EngineResultsTracker) { - super("All scraping engines failed!"); + super("All scraping engines failed! -- Double check the URL to make sure it's not broken. 
If the issue persists, contact us at help@firecrawl.com."); this.fallbackList = fallbackList; this.results = results; } From 62c8b63b8442a684cfe288bd43aa501c6650a91d Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Thu, 14 Nov 2024 11:55:00 -0500 Subject: [PATCH 04/26] Create README.md --- examples/aginews-ai-newsletter/README.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 examples/aginews-ai-newsletter/README.md diff --git a/examples/aginews-ai-newsletter/README.md b/examples/aginews-ai-newsletter/README.md new file mode 100644 index 00000000..12dc6ada --- /dev/null +++ b/examples/aginews-ai-newsletter/README.md @@ -0,0 +1,6 @@ +# AGI News ✨ +AGI News is a daily AI newsletter that's completely sourced by autonomous AI agents. It is live at [https://www.aginews.io/](https://www.aginews.io/) + +Here is a link to the repo: + +[https://github.com/ericciarla/aginews](https://github.com/ericciarla/aginews) \ No newline at end of file From 86a78a03cb839e67136e4a506a2f497fc83f6edd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 11 Nov 2024 19:44:32 +0100 Subject: [PATCH 05/26] fix(sitemap): scrape with tlsclient --- apps/api/src/scraper/WebScraper/sitemap.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 05b3d00d..51f90b18 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -24,7 +24,7 @@ export async function getLinksFromSitemap( const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); content = response.data; } else if (mode === 'fire-engine') { - const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;playwright" });; + const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }); if (!response.success) { throw response.error; } From df05124ef56dd1953bda36ce8c82e964cd885ee2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= Date: Thu, 14 Nov 2024 22:36:28 +0100 Subject: [PATCH 06/26] feat(v1/batch/scrape): webhooks --- apps/api/src/controllers/v1/batch-scrape.ts | 1 + apps/api/src/controllers/v1/types.ts | 23 +++++++++++---------- apps/js-sdk/firecrawl/src/index.ts | 6 ++++-- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index b018dc99..9fd5cc50 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -66,6 +66,7 @@ export async function batchScrapeController( crawl_id: id, sitemapped: true, v1: true, + webhook: req.body.webhook, }, opts: { jobId: uuidv4(), diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index b2edd6e7..d885e128 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -175,9 +175,21 @@ export const scrapeRequestSchema = scrapeOptions.extend({ export type ScrapeRequest = z.infer; export type ScrapeRequestInput = z.input; +export const webhookSchema = z.preprocess(x => { + if (typeof x === "string") { + return { url: x }; + } else { + return x; + } +}, z.object({ + url: z.string().url(), + headers: z.record(z.string(), z.string()).default({}), +}).strict(strictMessage)) + export const batchScrapeRequestSchema = 
scrapeOptions.extend({ urls: url.array(), origin: z.string().optional().default("api"), + webhook: webhookSchema.optional(), }).strict(strictMessage).refine( (obj) => { const hasExtractFormat = obj.formats?.includes("extract"); @@ -220,17 +232,6 @@ const crawlerOptions = z.object({ export type CrawlerOptions = z.infer; -export const webhookSchema = z.preprocess(x => { - if (typeof x === "string") { - return { url: x }; - } else { - return x; - } -}, z.object({ - url: z.string().url(), - headers: z.record(z.string(), z.string()).default({}), -}).strict(strictMessage)) - export const crawlRequestSchema = crawlerOptions.extend({ url, origin: z.string().optional().default("api"), diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 45e19197..18038945 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -543,16 +543,18 @@ export default class FirecrawlApp { * @param params - Additional parameters for the scrape request. * @param pollInterval - Time in seconds for job status checks. * @param idempotencyKey - Optional idempotency key for the request. + * @param webhook - Optional webhook for the batch scrape. * @returns The response from the crawl operation. */ async batchScrapeUrls( urls: string[], params?: ScrapeParams, pollInterval: number = 2, - idempotencyKey?: string + idempotencyKey?: string, + webhook?: CrawlParams["webhook"], ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { urls, ...(params ?? {}) }; + let jsonData: any = { urls, ...(params ?? {}), webhook }; try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/batch/scrape`, From 7bca4486b4a94693ced2230456bbd93882ddf49f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 14 Nov 2024 16:37:53 -0500 Subject: [PATCH 07/26] Update package.json --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 5d0a7fc9..f5e43544 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.8.2", + "version": "1.8.3", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 431e64e752cc697a5c3afc57b4413f42ef8196a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= Date: Thu, 14 Nov 2024 22:39:41 +0100 Subject: [PATCH 08/26] fix(batch/scrape/webhook): add batch_scrape.started --- apps/api/src/controllers/v1/batch-scrape.ts | 5 +++++ apps/api/src/types.ts | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index 9fd5cc50..3830b1fe 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -16,6 +16,7 @@ import { logCrawl } from "../../services/logging/crawl_log"; import { getScrapeQueue } from "../../services/queue-service"; import { getJobPriority } from "../../lib/job-priority"; import { addScrapeJobs } from "../../services/queue-jobs"; +import { callWebhook } from "../../services/webhook"; export async function batchScrapeController( req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>, @@ -86,6 +87,10 @@ export async function batchScrapeController( ); await addScrapeJobs(jobs); + if(req.body.webhook) { + await callWebhook(req.auth.team_id, id, null, 
req.body.webhook, true, "batch_scrape.started"); + } + const protocol = process.env.ENV === "local" ? req.protocol : "https"; return res.status(200).json({ diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index d7821407..cc04ca18 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -166,4 +166,4 @@ export type PlanType = | ""; -export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed"; \ No newline at end of file +export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "batch_scrape.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed"; \ No newline at end of file From f155449458cc097057bb489dbbab0bff4c53eb8f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 14 Nov 2024 17:29:53 -0500 Subject: [PATCH 09/26] Nick: sitemap only --- apps/api/src/controllers/v1/map.ts | 156 ++++++++++++++------------- apps/api/src/controllers/v1/types.ts | 1 + apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 1 + 4 files changed, 87 insertions(+), 73 deletions(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 45856543..af97c6f1 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -1,10 +1,6 @@ import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; -import { - mapRequestSchema, - RequestWithAuth, - scrapeOptions, -} from "./types"; +import { mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types"; import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis"; import { MapResponse, MapRequest } from "./types"; import { configDotenv } from "dotenv"; @@ -46,6 +42,7 @@ export async function mapController( originUrl: req.body.url, crawlerOptions: { ...req.body, + limit: req.body.sitemapOnly ? 10000000 : limit, scrapeOptions: undefined, }, scrapeOptions: scrapeOptions.parse({}), @@ -57,77 +54,92 @@ export async function mapController( const crawler = crawlToCrawler(id, sc); - let urlWithoutWww = req.body.url.replace("www.", ""); - - let mapUrl = req.body.search - ? `"${req.body.search}" site:${urlWithoutWww}` - : `site:${req.body.url}`; - - const resultsPerPage = 100; - const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage); - - const cacheKey = `fireEngineMap:${mapUrl}`; - const cachedResult = null; - - let allResults: any[] = []; - let pagePromises: Promise[] = []; - - if (cachedResult) { - allResults = JSON.parse(cachedResult); - } else { - const fetchPage = async (page: number) => { - return fireEngineMap(mapUrl, { - numResults: resultsPerPage, - page: page, - }); - }; - - pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1)); - allResults = await Promise.all(pagePromises); - - await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours - } - - // Parallelize sitemap fetch with serper search - const [sitemap, ...searchResults] = await Promise.all([ - req.body.ignoreSitemap ? null : crawler.tryGetSitemap(), - ...(cachedResult ? 
[] : pagePromises), - ]); - - if (!cachedResult) { - allResults = searchResults; - } - - if (sitemap !== null) { - sitemap.forEach((x) => { - links.push(x.url); - }); - } - - let mapResults = allResults - .flat() - .filter((result) => result !== null && result !== undefined); - - const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit); - if (mapResults.length > minumumCutoff) { - mapResults = mapResults.slice(0, minumumCutoff); - } - - if (mapResults.length > 0) { - if (req.body.search) { - // Ensure all map results are first, maintaining their order - links = [ - mapResults[0].url, - ...mapResults.slice(1).map((x) => x.url), - ...links, - ]; - } else { - mapResults.map((x) => { + // If sitemapOnly is true, only get links from sitemap + if (req.body.sitemapOnly) { + const sitemap = await crawler.tryGetSitemap(); + if (sitemap !== null) { + sitemap.forEach((x) => { links.push(x.url); }); } - } + } else { + let urlWithoutWww = req.body.url.replace("www.", ""); + let mapUrl = req.body.search + ? `"${req.body.search}" site:${urlWithoutWww}` + : `site:${req.body.url}`; + + const resultsPerPage = 100; + const maxPages = Math.ceil( + Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage + ); + + const cacheKey = `fireEngineMap:${mapUrl}`; + const cachedResult = null; + + let allResults: any[] = []; + let pagePromises: Promise[] = []; + + if (cachedResult) { + allResults = JSON.parse(cachedResult); + } else { + const fetchPage = async (page: number) => { + return fireEngineMap(mapUrl, { + numResults: resultsPerPage, + page: page, + }); + }; + + pagePromises = Array.from({ length: maxPages }, (_, i) => + fetchPage(i + 1) + ); + allResults = await Promise.all(pagePromises); + + await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours + } + + // Parallelize sitemap fetch with serper search + const [sitemap, ...searchResults] = await Promise.all([ + req.body.ignoreSitemap ? null : crawler.tryGetSitemap(), + ...(cachedResult ? 
[] : pagePromises), + ]); + + if (!cachedResult) { + allResults = searchResults; + } + + if (sitemap !== null) { + sitemap.forEach((x) => { + links.push(x.url); + }); + } + + let mapResults = allResults + .flat() + .filter((result) => result !== null && result !== undefined); + + const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit); + if (mapResults.length > minumumCutoff) { + mapResults = mapResults.slice(0, minumumCutoff); + } + + if (mapResults.length > 0) { + if (req.body.search) { + // Ensure all map results are first, maintaining their order + links = [ + mapResults[0].url, + ...mapResults.slice(1).map((x) => x.url), + ...links, + ]; + } else { + mapResults.map((x) => { + links.push(x.url); + }); + } + } + + + } // Perform cosine similarity between the search query and the list of links if (req.body.search) { const searchQuery = req.body.search.toLowerCase(); diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index d885e128..e14087e1 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -261,6 +261,7 @@ export const mapRequestSchema = crawlerOptions.extend({ includeSubdomains: z.boolean().default(true), search: z.string().optional(), ignoreSitemap: z.boolean().default(false), + sitemapOnly: z.boolean().default(false), limit: z.number().min(1).max(5000).default(5000), }).strict(strictMessage); diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index f5e43544..30b72d22 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.8.3", + "version": "1.8.4", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 18038945..3ea9d9e1 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -221,6 +221,7 @@ export interface MapParams { search?: string; ignoreSitemap?: boolean; includeSubdomains?: boolean; + sitemapOnly?: boolean; limit?: number; } From d62f12c9d996cbe2a014bc17f2a6c9d1b18484df Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 14 Nov 2024 17:31:23 -0500 Subject: [PATCH 10/26] Nick: moved away from axios --- apps/api/src/search/fireEngine.ts | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/apps/api/src/search/fireEngine.ts b/apps/api/src/search/fireEngine.ts index 361a9ebc..7439fcf0 100644 --- a/apps/api/src/search/fireEngine.ts +++ b/apps/api/src/search/fireEngine.ts @@ -1,4 +1,3 @@ -import axios from "axios"; import dotenv from "dotenv"; import { SearchResult } from "../../src/lib/entities"; import * as Sentry from "@sentry/node"; @@ -37,18 +36,18 @@ export async function fireEngineMap( return []; } - let config = { + const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, { method: "POST", - url: `${process.env.FIRE_ENGINE_BETA_URL}/search`, headers: { "Content-Type": "application/json", "X-Disable-Cache": "true" }, - data: data, - }; - const response = await axios(config); - if (response && response.data) { - return response.data; + body: data + }); + + if (response.ok) { + const responseData = await response.json(); + return responseData; } else { return []; } From 3fcdf57d2f0fd46e8706b45d8dc90a4a827a52e8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 14 Nov 2024 17:31:30 -0500 Subject: [PATCH 11/26] Update fireEngine.ts --- 
apps/api/src/search/fireEngine.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/apps/api/src/search/fireEngine.ts b/apps/api/src/search/fireEngine.ts index 7439fcf0..c1417af1 100644 --- a/apps/api/src/search/fireEngine.ts +++ b/apps/api/src/search/fireEngine.ts @@ -5,7 +5,6 @@ import { logger } from "../lib/logger"; dotenv.config(); - export async function fireEngineMap( q: string, options: { @@ -40,9 +39,9 @@ export async function fireEngineMap( method: "POST", headers: { "Content-Type": "application/json", - "X-Disable-Cache": "true" + "X-Disable-Cache": "true", }, - body: data + body: data, }); if (response.ok) { From 7f084c6c438158a052a7e0a4db53b7a591dc068d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 14 Nov 2024 17:44:32 -0500 Subject: [PATCH 12/26] Nick: --- apps/api/src/controllers/v1/map.ts | 4 ++-- apps/api/src/scraper/WebScraper/crawler.ts | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index af97c6f1..64e0025a 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -56,7 +56,7 @@ export async function mapController( // If sitemapOnly is true, only get links from sitemap if (req.body.sitemapOnly) { - const sitemap = await crawler.tryGetSitemap(); + const sitemap = await crawler.tryGetSitemap(true, true); if (sitemap !== null) { sitemap.forEach((x) => { links.push(x.url); @@ -100,7 +100,7 @@ export async function mapController( // Parallelize sitemap fetch with serper search const [sitemap, ...searchResults] = await Promise.all([ - req.body.ignoreSitemap ? null : crawler.tryGetSitemap(), + req.body.ignoreSitemap ? null : crawler.tryGetSitemap(true), ...(cachedResult ? [] : pagePromises), ]); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 7b4a97d9..3fe53e4d 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -65,7 +65,12 @@ export class WebCrawler { this.allowExternalContentLinks = allowExternalContentLinks ?? false; } - public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { + public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] { + // If the initial URL is a sitemap.xml, skip filtering + if (this.initialUrl.endsWith('sitemap.xml') && fromMap) { + return sitemapLinks.slice(0, limit); + } + return sitemapLinks .filter((link) => { let url: URL; @@ -159,11 +164,14 @@ export class WebCrawler { this.robots = robotsParser(this.robotsTxtUrl, txt); } - public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> { + public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> { logger.debug(`Fetching sitemap links from ${this.initialUrl}`); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); + if(fromMap && onlySitemap) { + return sitemapLinks.map(link => ({ url: link, html: "" })); + } if (sitemapLinks.length > 0) { - let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth); + let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap); return filteredLinks.map(link => ({ url: link, html: "" })); } return null; @@ -353,6 +361,7 @@ export class WebCrawler { return url; }; + const sitemapUrl = url.endsWith("/sitemap.xml") ? 
url : `${url}/sitemap.xml`; From 3c1b1909f8fb8f3157039ef27099fa97d78a521a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 14 Nov 2024 17:52:15 -0500 Subject: [PATCH 13/26] Update map.ts --- apps/api/src/controllers/v1/map.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 64e0025a..81739a0b 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -61,6 +61,7 @@ export async function mapController( sitemap.forEach((x) => { links.push(x.url); }); + links = links.slice(1, limit); } } else { let urlWithoutWww = req.body.url.replace("www.", ""); From 3a342bfbf06c130b5be6d04bacc580b95162ff9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= Date: Fri, 15 Nov 2024 15:18:40 +0100 Subject: [PATCH 14/26] fix(scrapeURL/playwright): JSON body fix --- apps/api/src/scraper/scrapeURL/engines/playwright/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts index c8e0fe59..887b8b64 100644 --- a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts @@ -13,12 +13,12 @@ export async function scrapeURLWithPlaywright(meta: Meta): Promise Date: Fri, 15 Nov 2024 18:58:03 +0100 Subject: [PATCH 15/26] fix(v1/batch/scrape): raise default timeout --- apps/api/src/controllers/v1/types.ts | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index e14087e1..3b4c8399 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -186,9 +186,10 @@ export const webhookSchema = z.preprocess(x => { headers: z.record(z.string(), z.string()).default({}), }).strict(strictMessage)) -export const batchScrapeRequestSchema = scrapeOptions.extend({ +export const batchScrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({ urls: url.array(), origin: z.string().optional().default("api"), + timeout: z.number().int().positive().finite().safe().default(60000), webhook: webhookSchema.optional(), }).strict(strictMessage).refine( (obj) => { @@ -199,12 +200,7 @@ export const batchScrapeRequestSchema = scrapeOptions.extend({ { message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa", } -).transform((obj) => { - if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) { - return { ...obj, timeout: 60000 }; - } - return obj; -}); +); export type BatchScrapeRequest = z.infer; From 7b02c45dd09e0dcab4e16f50ae5a115d69c15165 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 15 Nov 2024 19:35:54 +0100 Subject: [PATCH 16/26] fix(v1/types): better timeout primitives --- apps/api/src/controllers/v1/types.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 3b4c8399..2059ac8d 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -119,7 +119,7 @@ export const scrapeOptions = z.object({ includeTags: z.string().array().optional(), excludeTags: z.string().array().optional(), onlyMainContent: z.boolean().default(true), - timeout: z.number().int().positive().finite().safe().default(30000), + timeout: 
z.number().int().positive().finite().safe().optional(), waitFor: z.number().int().nonnegative().finite().safe().default(0), extract: extractOptions.optional(), mobile: z.boolean().default(false), @@ -153,9 +153,10 @@ export const scrapeOptions = z.object({ export type ScrapeOptions = z.infer; -export const scrapeRequestSchema = scrapeOptions.extend({ +export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({ url, origin: z.string().optional().default("api"), + timeout: z.number().int().positive().finite().safe().default(30000), }).strict(strictMessage).refine( (obj) => { const hasExtractFormat = obj.formats?.includes("extract"); @@ -186,10 +187,9 @@ export const webhookSchema = z.preprocess(x => { headers: z.record(z.string(), z.string()).default({}), }).strict(strictMessage)) -export const batchScrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({ +export const batchScrapeRequestSchema = scrapeOptions.extend({ urls: url.array(), origin: z.string().optional().default("api"), - timeout: z.number().int().positive().finite().safe().default(60000), webhook: webhookSchema.optional(), }).strict(strictMessage).refine( (obj) => { @@ -231,7 +231,7 @@ export type CrawlerOptions = z.infer; export const crawlRequestSchema = crawlerOptions.extend({ url, origin: z.string().optional().default("api"), - scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}), + scrapeOptions: scrapeOptions.default({}), webhook: webhookSchema.optional(), limit: z.number().default(10000), }).strict(strictMessage); From ca2e33db0ac5c138427d9ef0f0c74498d76efe2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 15 Nov 2024 19:55:23 +0100 Subject: [PATCH 17/26] fix(log_job): add force option to retry on supabase failure --- apps/api/src/services/logging/log_job.ts | 72 ++++++++++++++++-------- apps/api/src/services/queue-worker.ts | 6 +- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 2a813b21..642b769d 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -7,7 +7,7 @@ import { logger } from "../../lib/logger"; import { configDotenv } from "dotenv"; configDotenv(); -export async function logJob(job: FirecrawlJob) { +export async function logJob(job: FirecrawlJob, force: boolean = false) { try { const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; if (!useDbAuthentication) { @@ -23,28 +23,52 @@ export async function logJob(job: FirecrawlJob) { job.scrapeOptions.headers["Authorization"] = "REDACTED"; job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }]; } + const jobColumn = { + job_id: job.job_id ? job.job_id : null, + success: job.success, + message: job.message, + num_docs: job.num_docs, + docs: job.docs, + time_taken: job.time_taken, + team_id: job.team_id === "preview" ? null : job.team_id, + mode: job.mode, + url: job.url, + crawler_options: job.crawlerOptions, + page_options: job.scrapeOptions, + origin: job.origin, + num_tokens: job.num_tokens, + retry: !!job.retry, + crawl_id: job.crawl_id, + }; - const { data, error } = await supabase_service - .from("firecrawl_jobs") - .insert([ - { - job_id: job.job_id ? job.job_id : null, - success: job.success, - message: job.message, - num_docs: job.num_docs, - docs: job.docs, - time_taken: job.time_taken, - team_id: job.team_id === "preview" ? 
null : job.team_id, - mode: job.mode, - url: job.url, - crawler_options: job.crawlerOptions, - page_options: job.scrapeOptions, - origin: job.origin, - num_tokens: job.num_tokens, - retry: !!job.retry, - crawl_id: job.crawl_id, - }, - ]); + if (force) { + while (true) { + try { + const { error } = await supabase_service + .from("firecrawl_jobs") + .insert([jobColumn]); + if (error) { + logger.error("Failed to log job due to Supabase error -- trying again", { error, scrapeId: job.job_id }); + await new Promise((resolve) => setTimeout(() => resolve(), 75)); + } else { + break; + } + } catch (error) { + logger.error("Failed to log job due to thrown error -- trying again", { error, scrapeId: job.job_id }); + await new Promise((resolve) => setTimeout(() => resolve(), 75)); + } + } + logger.debug("Job logged successfully!", { scrapeId: job.job_id }); + } else { + const { error } = await supabase_service + .from("firecrawl_jobs") + .insert([jobColumn]); + if (error) { + logger.error(`Error logging job: ${error.message}`, { error, scrapeId: job.job_id }); + } else { + logger.debug("Job logged successfully!", { scrapeId: job.job_id }); + } + } if (process.env.POSTHOG_API_KEY && !job.crawl_id) { let phLog = { @@ -72,9 +96,7 @@ export async function logJob(job: FirecrawlJob) { posthog.capture(phLog); } } - if (error) { - logger.error(`Error logging job: ${error.message}`); - } + } catch (error) { logger.error(`Error logging job: ${error.message}`); } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 54841061..c25601ca 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -346,7 +346,7 @@ async function processJob(job: Job & { id: string }, token: string) { scrapeOptions: job.data.scrapeOptions, origin: job.data.origin, crawl_id: job.data.crawl_id, - }); + }, true); await addCrawlJobDone(job.data.crawl_id, job.id); @@ -486,7 +486,7 @@ async function processJob(job: Job & { id: string }, token: string) { url: sc?.originUrl ?? (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"), crawlerOptions: sc.crawlerOptions, origin: job.data.origin, - }); + }, true); } } } @@ -566,7 +566,7 @@ async function processJob(job: Job & { id: string }, token: string) { scrapeOptions: job.data.scrapeOptions, origin: job.data.origin, crawl_id: job.data.crawl_id, - }); + }, true); // await logJob({ // job_id: job.data.crawl_id, From 350d00d27a6cd598542b4d06d1d790d652e26696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 15 Nov 2024 20:09:20 +0100 Subject: [PATCH 18/26] fix(crawler): treat XML files as sitemaps (temporarily) --- apps/api/src/scraper/WebScraper/crawler.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 3fe53e4d..c00fed78 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -362,7 +362,7 @@ export class WebCrawler { }; - const sitemapUrl = url.endsWith("/sitemap.xml") + const sitemapUrl = url.endsWith(".xml") ? 
url : `${url}/sitemap.xml`; From 4cddcd5206b1d2c042dcde1e71f9f9bbc650bf13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 15 Nov 2024 20:15:25 +0100 Subject: [PATCH 19/26] fix(scrapeURL/fire-engine): timeout-less scrape support (initial) --- apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts | 5 +++++ apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts | 2 ++ 2 files changed, 7 insertions(+) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 0ea54382..7ad950ad 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -87,6 +87,7 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise Date: Fri, 15 Nov 2024 20:25:16 +0100 Subject: [PATCH 20/26] fix(scrapeURL/fire-engine): wait longer if timeout is not specified --- .../scraper/scrapeURL/engines/fire-engine/index.ts | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 7ad950ad..17bb40f2 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -96,7 +96,9 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise Date: Fri, 15 Nov 2024 21:03:20 +0100 Subject: [PATCH 21/26] fix(map): ignore limit when using sitemapOnly --- apps/api/src/controllers/v1/map.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 81739a0b..0d0449aa 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -61,7 +61,7 @@ export async function mapController( sitemap.forEach((x) => { links.push(x.url); }); - links = links.slice(1, limit); + // links = links.slice(1, limit); // don't slice, unnecessary } } else { let urlWithoutWww = req.body.url.replace("www.", ""); From 1b032b05fa183d5b4bb3bb02548dc410e561c948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 15 Nov 2024 21:14:32 +0100 Subject: [PATCH 22/26] fix(map): make sitemapOnly simpler --- apps/api/src/controllers/v1/map.ts | 73 ++++++++++++++++-------------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 0d0449aa..ab9a5ed7 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -61,6 +61,15 @@ export async function mapController( sitemap.forEach((x) => { links.push(x.url); }); + links = links.slice(1) + .map((x) => { + try { + return checkAndUpdateURLForMap(x).url.trim(); + } catch (_) { + return null; + } + }) + .filter((x) => x !== null) as string[]; // links = links.slice(1, limit); // don't slice, unnecessary } } else { @@ -139,35 +148,35 @@ export async function mapController( } } - + // Perform cosine similarity between the search query and the list of links + if (req.body.search) { + const searchQuery = req.body.search.toLowerCase(); + + links = performCosineSimilarity(links, searchQuery); + } + + links = links + .map((x) => { + try { + return checkAndUpdateURLForMap(x).url.trim(); + } catch (_) { + return null; + } + }) + .filter((x) => x !== null) as string[]; + + // allows for subdomains to be included + links = 
links.filter((x) => isSameDomain(x, req.body.url)); + + // if includeSubdomains is false, filter out subdomains + if (!req.body.includeSubdomains) { + links = links.filter((x) => isSameSubdomain(x, req.body.url)); + } + + // remove duplicates that could be due to http/https or www + links = removeDuplicateUrls(links); + links.slice(0, limit); } - // Perform cosine similarity between the search query and the list of links - if (req.body.search) { - const searchQuery = req.body.search.toLowerCase(); - - links = performCosineSimilarity(links, searchQuery); - } - - links = links - .map((x) => { - try { - return checkAndUpdateURLForMap(x).url.trim(); - } catch (_) { - return null; - } - }) - .filter((x) => x !== null) as string[]; - - // allows for subdomains to be included - links = links.filter((x) => isSameDomain(x, req.body.url)); - - // if includeSubdomains is false, filter out subdomains - if (!req.body.includeSubdomains) { - links = links.filter((x) => isSameSubdomain(x, req.body.url)); - } - - // remove duplicates that could be due to http/https or www - links = removeDuplicateUrls(links); billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => { logger.error( @@ -179,14 +188,12 @@ export async function mapController( const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; - const linksToReturn = links.slice(0, limit); - logJob({ job_id: id, success: links.length > 0, message: "Map completed", - num_docs: linksToReturn.length, - docs: linksToReturn, + num_docs: links.length, + docs: links, time_taken: timeTakenInSeconds, team_id: req.auth.team_id, mode: "map", @@ -199,7 +206,7 @@ export async function mapController( return res.status(200).json({ success: true, - links: linksToReturn, + links: links, scrape_id: req.body.origin?.includes("website") ? id : undefined, }); } From 1a0f13c0eb487dba51151d24c2285a214006a898 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 15 Nov 2024 21:43:02 +0100 Subject: [PATCH 23/26] fix(webhook): add logging --- apps/api/src/services/webhook.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index 1cc4db84..7840484d 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -46,6 +46,8 @@ export const callWebhook = async ( webhookUrl = webhooksData[0].url; } + logger.debug("Calling webhook...", { webhookUrl, teamId, specified, v1, eventType, awaitWebhook }); + if (!webhookUrl) { return null; } @@ -128,7 +130,6 @@ export const callWebhook = async ( "Content-Type": "application/json", ...webhookUrl.headers, }, - timeout: v1 ? 
10000 : 30000, // 10 seconds timeout (v1) } ) .catch((error) => { From 31a0471bfad73c280b23508b1a0260d2ef024c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 15 Nov 2024 21:56:15 +0100 Subject: [PATCH 24/26] fix(crawl-redis): ordered push to wrong side of list --- apps/api/src/lib/crawl-redis.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 2b255971..9bce160b 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -52,7 +52,7 @@ export async function addCrawlJobs(id: string, job_ids: string[]) { export async function addCrawlJobDone(id: string, job_id: string) { await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id); - await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id); + await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id); await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX"); await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX"); } From 77e152cba8f7503bf1506ad79a22c3df8192cdb7 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:02:00 -0300 Subject: [PATCH 25/26] added team_id to scrape-status endpoint --- apps/api/src/controllers/v1/scrape-status.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/scrape-status.ts b/apps/api/src/controllers/v1/scrape-status.ts index db50f7d3..b7f19a3b 100644 --- a/apps/api/src/controllers/v1/scrape-status.ts +++ b/apps/api/src/controllers/v1/scrape-status.ts @@ -11,8 +11,12 @@ export async function scrapeStatusController(req: any, res: any) { await rateLimiter.consume(iptoken); const job = await supabaseGetJobByIdOnlyData(req.params.jobId); + const allowedTeams = [ + "41bdbfe1-0579-4d9b-b6d5-809f16be12f5", + "511544f2-2fce-4183-9c59-6c29b02c69b5" + ]; - if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){ + if(!allowedTeams.includes(job?.team_id)){ return res.status(403).json({ success: false, error: "You are not allowed to access this resource.", From a31336752caa8167b754cf1f039f59687840343f Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Mon, 18 Nov 2024 14:04:29 -0500 Subject: [PATCH 26/26] Create README.md --- examples/ai-podcast-generator/README.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 examples/ai-podcast-generator/README.md diff --git a/examples/ai-podcast-generator/README.md b/examples/ai-podcast-generator/README.md new file mode 100644 index 00000000..f27cf084 --- /dev/null +++ b/examples/ai-podcast-generator/README.md @@ -0,0 +1,7 @@ +# Generate AI podcasts based on real time news 🎙️ + +This example crawls the web for interesting news stories then records a podcast with your own voice. + +Here is a link to the repo: + +[https://github.com/ericciarla/aginews-podcast](https://github.com/ericciarla/aginews-podcast) \ No newline at end of file