diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
index 50388dea..2e971139 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
@@ -117,7 +117,7 @@ export async function scrapWithFireEngine({
   } finally {
     const endTime = Date.now();
     logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
-    await logScrape(logParams);
+    await logScrape(logParams, pageOptions);
   }
 }
 
diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts
index 6cb6ec3b..7282296f 100644
--- a/apps/api/src/services/logging/log_job.ts
+++ b/apps/api/src/services/logging/log_job.ts
@@ -1,4 +1,4 @@
-import { ExtractorOptions } from './../../lib/entities';
+import { ExtractorOptions } from "./../../lib/entities";
 import { supabase_service } from "../supabase";
 import { FirecrawlJob } from "../../types";
 import { posthog } from "../posthog";
@@ -10,7 +10,17 @@ export async function logJob(job: FirecrawlJob) {
   if (process.env.ENV !== "production") {
     return;
   }
-
+
+  // Redact any pages that have an authorization header
+  if (
+    job.pageOptions &&
+    job.pageOptions.headers &&
+    job.pageOptions.headers["Authorization"]
+  ) {
+    job.pageOptions.headers["Authorization"] = "REDACTED";
+    job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }];
+  }
+
   const { data, error } = await supabase_service
     .from("firecrawl_jobs")
     .insert([
@@ -27,35 +37,34 @@ export async function logJob(job: FirecrawlJob) {
         page_options: job.pageOptions,
         origin: job.origin,
         extractor_options: job.extractor_options,
-        num_tokens: job.num_tokens
+        num_tokens: job.num_tokens,
       },
     ]);
-    if (process.env.POSTHOG_API_KEY) {
-
-      let phLog = {
-        distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
-        ...(job.team_id !== "preview" && {
-          groups: { team: job.team_id }
-        }), //* Identifying event on this team
-        event: "job-logged",
-        properties: {
-          success: job.success,
-          message: job.message,
-          num_docs: job.num_docs,
-          time_taken: job.time_taken,
-          team_id: job.team_id === "preview" ? null : job.team_id,
-          mode: job.mode,
-          url: job.url,
-          crawler_options: job.crawlerOptions,
-          page_options: job.pageOptions,
-          origin: job.origin,
-          extractor_options: job.extractor_options,
-          num_tokens: job.num_tokens
-        },
-      }
-      posthog.capture(phLog);
-    }
+  if (process.env.POSTHOG_API_KEY) {
+    let phLog = {
+      distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
+      ...(job.team_id !== "preview" && {
+        groups: { team: job.team_id },
+      }), //* Identifying event on this team
+      event: "job-logged",
+      properties: {
+        success: job.success,
+        message: job.message,
+        num_docs: job.num_docs,
+        time_taken: job.time_taken,
+        team_id: job.team_id === "preview" ? null : job.team_id,
+        mode: job.mode,
+        url: job.url,
+        crawler_options: job.crawlerOptions,
+        page_options: job.pageOptions,
+        origin: job.origin,
+        extractor_options: job.extractor_options,
+        num_tokens: job.num_tokens,
+      },
+    };
+    posthog.capture(phLog);
+  }
 
   if (error) {
     console.error("Error logging job:\n", error);
   }
diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts
index bb568242..4186c15c 100644
--- a/apps/api/src/services/logging/scrape_log.ts
+++ b/apps/api/src/services/logging/scrape_log.ts
@@ -1,32 +1,42 @@
 import "dotenv/config";
 import { ScrapeLog } from "../../types";
 import { supabase_service } from "../supabase";
+import { PageOptions } from "../../lib/entities";
 
-export async function logScrape(scrapeLog: ScrapeLog) {
+export async function logScrape(
+  scrapeLog: ScrapeLog,
+  pageOptions?: PageOptions
+) {
   try {
     // Only log jobs in production
     // if (process.env.ENV !== "production") {
     //   return;
     // }
+    // Redact any pages that have an authorization header
+    if (
+      pageOptions &&
+      pageOptions.headers &&
+      pageOptions.headers["Authorization"]
+    ) {
+      scrapeLog.html = "REDACTED DUE TO AUTHORIZATION HEADER";
+    }
 
-    const { data, error } = await supabase_service
-      .from("scrape_logs")
-      .insert([
-        {
-          url: scrapeLog.url,
-          scraper: scrapeLog.scraper,
-          success: scrapeLog.success,
-          response_code: scrapeLog.response_code,
-          time_taken_seconds: scrapeLog.time_taken_seconds,
-          proxy: scrapeLog.proxy,
-          retried: scrapeLog.retried,
-          error_message: scrapeLog.error_message,
-          date_added: new Date().toISOString(),
-          html: scrapeLog.html,
-          ipv4_support: scrapeLog.ipv4_support,
-          ipv6_support: scrapeLog.ipv6_support,
-        },
-      ]);
+    const { data, error } = await supabase_service.from("scrape_logs").insert([
+      {
+        url: scrapeLog.url,
+        scraper: scrapeLog.scraper,
+        success: scrapeLog.success,
+        response_code: scrapeLog.response_code,
+        time_taken_seconds: scrapeLog.time_taken_seconds,
+        proxy: scrapeLog.proxy,
+        retried: scrapeLog.retried,
+        error_message: scrapeLog.error_message,
+        date_added: new Date().toISOString(),
+        html: scrapeLog.html,
+        ipv4_support: scrapeLog.ipv4_support,
+        ipv6_support: scrapeLog.ipv6_support,
+      },
+    ]);
 
     if (error) {
       console.error("Error logging proxy:\n", error);
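For reviewers who want to poke at the new behavior outside the service, here is a minimal standalone sketch of the redaction these hunks introduce. The trimmed `PageOptions`/`ScrapeLog` shapes and the `redactScrapeLog` helper are illustrative stand-ins only: the real types live in `lib/entities` and `types`, and the PR inlines this check inside `logScrape` rather than extracting a helper.

```ts
// Trimmed stand-ins for the repo's PageOptions and ScrapeLog types — only the
// fields the redaction logic touches are modeled here (assumption).
type PageOptions = { headers?: Record<string, string> };
type ScrapeLog = { url: string; html?: string };

// Hypothetical extraction of the check added in scrape_log.ts: when the caller
// sent an Authorization header, drop the captured HTML before it is logged.
function redactScrapeLog(scrapeLog: ScrapeLog, pageOptions?: PageOptions): ScrapeLog {
  if (pageOptions && pageOptions.headers && pageOptions.headers["Authorization"]) {
    return { ...scrapeLog, html: "REDACTED DUE TO AUTHORIZATION HEADER" };
  }
  return scrapeLog;
}

// A scrape carrying credentials keeps its metadata but loses its HTML:
const redacted = redactScrapeLog(
  { url: "https://example.com", html: "<html>secret page</html>" },
  { headers: { Authorization: "Bearer token" } }
);
console.log(redacted.html); // "REDACTED DUE TO AUTHORIZATION HEADER"

// A scrape without the header passes through untouched:
const untouched = redactScrapeLog({ url: "https://example.com", html: "<html>ok</html>" });
console.log(untouched.html); // "<html>ok</html>"
```

Note the sketch returns a copy for clarity, whereas the patch mutates `scrapeLog.html` (and, in `log_job.ts`, `job.pageOptions.headers` and `job.docs`) in place; the observable effect on the inserted row is the same.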