Nicolas 2024-07-03 20:18:11 -03:00
parent 5ecd9cb6f5
commit 32849b017f
3 changed files with 67 additions and 48 deletions

View File

@@ -117,7 +117,7 @@ export async function scrapWithFireEngine({
   } finally {
     const endTime = Date.now();
     logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
-    await logScrape(logParams);
+    await logScrape(logParams, pageOptions);
   }
 }
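The change above threads the request's pageOptions into logScrape so the logger can see the headers the scrape was made with. A minimal sketch of the call-site pattern, using simplified stand-in types (the real ScrapeLog and PageOptions shapes live in this repo's types and lib/entities modules, and the real logParams carries more fields than timing):

// Sketch only: simplified stand-ins for the repo's real types.
type PageOptions = { headers?: Record<string, string> };
type ScrapeTiming = { startTime: number; time_taken_seconds: number };

// Stand-in for the logger defined in scrape_log.ts (see the third file below).
declare function logScrape(
  log: ScrapeTiming,
  pageOptions?: PageOptions
): Promise<void>;

async function scrapeWithTiming(pageOptions?: PageOptions): Promise<void> {
  const logParams: ScrapeTiming = { startTime: Date.now(), time_taken_seconds: 0 };
  try {
    // ... perform the scrape ...
  } finally {
    // finally guarantees the log write happens even if the scrape throws,
    // and the headers travel with it so the logger can redact them.
    logParams.time_taken_seconds = (Date.now() - logParams.startTime) / 1000;
    await logScrape(logParams, pageOptions);
  }
}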

View File

@@ -1,4 +1,4 @@
-import { ExtractorOptions } from './../../lib/entities';
+import { ExtractorOptions } from "./../../lib/entities";
 import { supabase_service } from "../supabase";
 import { FirecrawlJob } from "../../types";
 import { posthog } from "../posthog";
@@ -11,6 +11,16 @@ export async function logJob(job: FirecrawlJob) {
     return;
   }

+  // Redact any pages that have an authorization header
+  if (
+    job.pageOptions &&
+    job.pageOptions.headers &&
+    job.pageOptions.headers["Authorization"]
+  ) {
+    job.pageOptions.headers["Authorization"] = "REDACTED";
+    job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }];
+  }
+
   const { data, error } = await supabase_service
     .from("firecrawl_jobs")
     .insert([
@@ -27,35 +37,34 @@ export async function logJob(job: FirecrawlJob) {
         page_options: job.pageOptions,
         origin: job.origin,
         extractor_options: job.extractor_options,
-        num_tokens: job.num_tokens
+        num_tokens: job.num_tokens,
       },
     ]);

   if (process.env.POSTHOG_API_KEY) {
     let phLog = {
       distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
       ...(job.team_id !== "preview" && {
-        groups: { team: job.team_id }
+        groups: { team: job.team_id },
       }), //* Identifying event on this team
       event: "job-logged",
       properties: {
         success: job.success,
         message: job.message,
         num_docs: job.num_docs,
         time_taken: job.time_taken,
         team_id: job.team_id === "preview" ? null : job.team_id,
         mode: job.mode,
         url: job.url,
         crawler_options: job.crawlerOptions,
         page_options: job.pageOptions,
         origin: job.origin,
         extractor_options: job.extractor_options,
-        num_tokens: job.num_tokens
+        num_tokens: job.num_tokens,
       },
-    }
+    };
     posthog.capture(phLog);
   }
-
   if (error) {
     console.error("Error logging job:\n", error);
   }
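Both sinks in this file (the Supabase insert and the PostHog capture) now receive the job only after the new guard has run, so a credential sent as an Authorization header never reaches storage, and the documents fetched with it are dropped wholesale. A self-contained sketch of the same guard, extracted into a hypothetical helper for illustration (redactAuthorizedJob is not in the diff; the commit inlines the check):

// Simplified shapes for illustration; the real types live in ../../types.
type Doc = { content: string; html: string };
type FirecrawlJob = {
  pageOptions?: { headers?: Record<string, string> };
  docs?: Doc[];
};

// Overwrites the credential and the fetched content in place,
// mirroring the inline check added in logJob above.
function redactAuthorizedJob(job: FirecrawlJob): FirecrawlJob {
  if (job.pageOptions?.headers?.["Authorization"]) {
    job.pageOptions.headers["Authorization"] = "REDACTED";
    job.docs = [
      {
        content: "REDACTED DUE TO AUTHORIZATION HEADER",
        html: "REDACTED DUE TO AUTHORIZATION HEADER",
      },
    ];
  }
  return job;
}

// Example: the header value and docs are gone before any insert/capture call.
const job = redactAuthorizedJob({
  pageOptions: { headers: { Authorization: "Bearer secret-token" } },
  docs: [{ content: "<p>private</p>", html: "<p>private</p>" }],
});
console.log(job.pageOptions?.headers?.["Authorization"]); // "REDACTED"

One design note: the lookup is case-sensitive as written, so a lowercase authorization header key would pass through unredacted.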

View File

@@ -1,32 +1,42 @@
 import "dotenv/config";
 import { ScrapeLog } from "../../types";
 import { supabase_service } from "../supabase";
+import { PageOptions } from "../../lib/entities";

-export async function logScrape(scrapeLog: ScrapeLog) {
+export async function logScrape(
+  scrapeLog: ScrapeLog,
+  pageOptions?: PageOptions
+) {
   try {
     // Only log jobs in production
     // if (process.env.ENV !== "production") {
     //   return;
     // }

-    const { data, error } = await supabase_service
-      .from("scrape_logs")
-      .insert([
-        {
-          url: scrapeLog.url,
-          scraper: scrapeLog.scraper,
-          success: scrapeLog.success,
-          response_code: scrapeLog.response_code,
-          time_taken_seconds: scrapeLog.time_taken_seconds,
-          proxy: scrapeLog.proxy,
-          retried: scrapeLog.retried,
-          error_message: scrapeLog.error_message,
-          date_added: new Date().toISOString(),
-          html: scrapeLog.html,
-          ipv4_support: scrapeLog.ipv4_support,
-          ipv6_support: scrapeLog.ipv6_support,
-        },
-      ]);
+    // Redact any pages that have an authorization header
+    if (
+      pageOptions &&
+      pageOptions.headers &&
+      pageOptions.headers["Authorization"]
+    ) {
+      scrapeLog.html = "REDACTED DUE TO AUTHORIZATION HEADER";
+    }
+    const { data, error } = await supabase_service.from("scrape_logs").insert([
+      {
+        url: scrapeLog.url,
+        scraper: scrapeLog.scraper,
+        success: scrapeLog.success,
+        response_code: scrapeLog.response_code,
+        time_taken_seconds: scrapeLog.time_taken_seconds,
+        proxy: scrapeLog.proxy,
+        retried: scrapeLog.retried,
+        error_message: scrapeLog.error_message,
+        date_added: new Date().toISOString(),
+        html: scrapeLog.html,
+        ipv4_support: scrapeLog.ipv4_support,
+        ipv6_support: scrapeLog.ipv6_support,
+      },
+    ]);

     if (error) {
       console.error("Error logging proxy:\n", error);
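Because the new pageOptions parameter is optional, existing call sites that pass only a ScrapeLog keep compiling, while callers that forward headers get the HTML scrubbed before the row is written. A hedged usage sketch of the function above (the URL, token, and field values are hypothetical, and the real ScrapeLog type may mark additional fields as required):

// Hypothetical call: a scrape that was performed with an Authorization header.
await logScrape(
  {
    url: "https://example.com/private",
    scraper: "fire-engine",
    success: true,
    response_code: 200,
    time_taken_seconds: 1.2,
    html: "<html>fetched with the credential</html>",
  },
  { headers: { Authorization: "Bearer secret-token" } }
);
// The guard fires before the insert, so the scrape_logs row stores
// html = "REDACTED DUE TO AUTHORIZATION HEADER" instead of the markup.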