Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-16 00:55:57 +08:00)
Nick:

commit 32849b017f
parent 5ecd9cb6f5
@@ -117,7 +117,7 @@ export async function scrapWithFireEngine({
   } finally {
     const endTime = Date.now();
     logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
-    await logScrape(logParams);
+    await logScrape(logParams, pageOptions);
   }
 }
 
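The only change here is the second argument: the fire-engine scraper now hands its pageOptions to the scrape logger, so the logger can tell whether the request carried an Authorization header. A minimal sketch of the call shape, with the real types replaced by assumed, trimmed-down stand-ins:

// Assumed, trimmed-down stand-ins for the real ScrapeLog / PageOptions types.
type PageOptionsLike = { headers?: Record<string, string> };
type ScrapeLogLike = { url: string; startTime: number; time_taken_seconds?: number };

// No-op stand-in; the real logScrape (changed further down) does the insert and the redaction.
async function logScrape(scrapeLog: ScrapeLogLike, pageOptions?: PageOptionsLike) {}

async function scrapeWithLogging(url: string, pageOptions?: PageOptionsLike) {
  const logParams: ScrapeLogLike = { url, startTime: Date.now() };
  try {
    // ... perform the scrape ...
  } finally {
    const endTime = Date.now();
    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
    // pageOptions is forwarded so the logger can redact HTML for authorized requests.
    await logScrape(logParams, pageOptions);
  }
}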
@@ -1,4 +1,4 @@
-import { ExtractorOptions } from './../../lib/entities';
+import { ExtractorOptions } from "./../../lib/entities";
 import { supabase_service } from "../supabase";
 import { FirecrawlJob } from "../../types";
 import { posthog } from "../posthog";

@@ -10,7 +10,17 @@ export async function logJob(job: FirecrawlJob) {
     if (process.env.ENV !== "production") {
       return;
     }
 
+    // Redact any pages that have an authorization header
+    if (
+      job.pageOptions &&
+      job.pageOptions.headers &&
+      job.pageOptions.headers["Authorization"]
+    ) {
+      job.pageOptions.headers["Authorization"] = "REDACTED";
+      job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }];
+    }
+
     const { data, error } = await supabase_service
       .from("firecrawl_jobs")
       .insert([
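The inserted guard runs before the row is written to firecrawl_jobs and before the PostHog event below, so neither sink ever sees the header value or the scraped documents. The same logic as a standalone helper, with the FirecrawlJob type reduced to an assumed, illustration-only shape:

// Assumed, trimmed-down job shape for illustration only.
type JobLike = {
  pageOptions?: { headers?: Record<string, string> };
  docs?: { content: string; html: string }[];
};

function redactAuthorizedJob(job: JobLike): void {
  // Same guard as the inserted block: scrub the header value and the captured
  // documents when the request carried an Authorization header.
  if (job.pageOptions && job.pageOptions.headers && job.pageOptions.headers["Authorization"]) {
    job.pageOptions.headers["Authorization"] = "REDACTED";
    job.docs = [
      {
        content: "REDACTED DUE TO AUTHORIZATION HEADER",
        html: "REDACTED DUE TO AUTHORIZATION HEADER",
      },
    ];
  }
}

// Example: a job scraped with a bearer token keeps nothing sensitive in its log row.
const job: JobLike = {
  pageOptions: { headers: { Authorization: "Bearer secret-token" } },
  docs: [{ content: "private page text", html: "<html>private</html>" }],
};
redactAuthorizedJob(job);
// job.pageOptions.headers["Authorization"] === "REDACTED"
// job.docs[0].content === "REDACTED DUE TO AUTHORIZATION HEADER"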
@@ -27,35 +37,34 @@ export async function logJob(job: FirecrawlJob) {
           page_options: job.pageOptions,
           origin: job.origin,
           extractor_options: job.extractor_options,
-          num_tokens: job.num_tokens
+          num_tokens: job.num_tokens,
         },
       ]);
 
     if (process.env.POSTHOG_API_KEY) {
-
-      let phLog = {
-        distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
-        ...(job.team_id !== "preview" && {
-          groups: { team: job.team_id }
-        }), //* Identifying event on this team
-        event: "job-logged",
-        properties: {
-          success: job.success,
-          message: job.message,
-          num_docs: job.num_docs,
-          time_taken: job.time_taken,
-          team_id: job.team_id === "preview" ? null : job.team_id,
-          mode: job.mode,
-          url: job.url,
-          crawler_options: job.crawlerOptions,
-          page_options: job.pageOptions,
-          origin: job.origin,
-          extractor_options: job.extractor_options,
-          num_tokens: job.num_tokens
-        },
-      }
-      posthog.capture(phLog);
-    }
+      let phLog = {
+        distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
+        ...(job.team_id !== "preview" && {
+          groups: { team: job.team_id },
+        }), //* Identifying event on this team
+        event: "job-logged",
+        properties: {
+          success: job.success,
+          message: job.message,
+          num_docs: job.num_docs,
+          time_taken: job.time_taken,
+          team_id: job.team_id === "preview" ? null : job.team_id,
+          mode: job.mode,
+          url: job.url,
+          crawler_options: job.crawlerOptions,
+          page_options: job.pageOptions,
+          origin: job.origin,
+          extractor_options: job.extractor_options,
+          num_tokens: job.num_tokens,
+        },
+      };
+      posthog.capture(phLog);
+    }
     if (error) {
       console.error("Error logging job:\n", error);
     }
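This hunk is formatting only (trailing commas, a removed blank line, and a terminating semicolon on the phLog object); the PostHog call itself is unchanged. For reference, the group-analytics pattern it relies on looks roughly like this with the posthog-node client; the codebase imports a shared client from "../posthog", so the constructor call and sample values below are assumptions for a self-contained sketch:

import { PostHog } from "posthog-node";

// Assumption: a locally constructed client, only for this sketch.
const posthog = new PostHog(process.env.POSTHOG_API_KEY ?? "", {
  host: process.env.POSTHOG_HOST ?? "https://app.posthog.com",
});

// A static distinctId plus a groups entry attributes the event to the team
// rather than to an individual user, per the PostHog docs linked in the code.
posthog.capture({
  distinctId: "from-api",
  event: "job-logged",
  properties: { success: true, mode: "crawl", num_docs: 12 },
  groups: { team: "team_123" },
});

// Flush queued events before the process exits.
posthog.shutdown();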
@@ -1,32 +1,42 @@
 import "dotenv/config";
 import { ScrapeLog } from "../../types";
 import { supabase_service } from "../supabase";
+import { PageOptions } from "../../lib/entities";
 
-export async function logScrape(scrapeLog: ScrapeLog) {
+export async function logScrape(
+  scrapeLog: ScrapeLog,
+  pageOptions?: PageOptions
+) {
   try {
     // Only log jobs in production
     // if (process.env.ENV !== "production") {
     //   return;
     // }
+    // Redact any pages that have an authorization header
+    if (
+      pageOptions &&
+      pageOptions.headers &&
+      pageOptions.headers["Authorization"]
+    ) {
+      scrapeLog.html = "REDACTED DUE TO AUTHORIZATION HEADER";
+    }
 
-    const { data, error } = await supabase_service
-      .from("scrape_logs")
-      .insert([
-        {
-          url: scrapeLog.url,
-          scraper: scrapeLog.scraper,
-          success: scrapeLog.success,
-          response_code: scrapeLog.response_code,
-          time_taken_seconds: scrapeLog.time_taken_seconds,
-          proxy: scrapeLog.proxy,
-          retried: scrapeLog.retried,
-          error_message: scrapeLog.error_message,
-          date_added: new Date().toISOString(),
-          html: scrapeLog.html,
-          ipv4_support: scrapeLog.ipv4_support,
-          ipv6_support: scrapeLog.ipv6_support,
-        },
-      ]);
+    const { data, error } = await supabase_service.from("scrape_logs").insert([
+      {
+        url: scrapeLog.url,
+        scraper: scrapeLog.scraper,
+        success: scrapeLog.success,
+        response_code: scrapeLog.response_code,
+        time_taken_seconds: scrapeLog.time_taken_seconds,
+        proxy: scrapeLog.proxy,
+        retried: scrapeLog.retried,
+        error_message: scrapeLog.error_message,
+        date_added: new Date().toISOString(),
+        html: scrapeLog.html,
+        ipv4_support: scrapeLog.ipv4_support,
+        ipv6_support: scrapeLog.ipv6_support,
+      },
+    ]);
 
     if (error) {
       console.error("Error logging proxy:\n", error);
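The insert itself is only reformatted into a single chained call; the error handling is unchanged and, as before, only the error field of the result is checked. A self-contained sketch of that pattern with a locally created supabase-js client (the codebase imports its shared supabase_service from "../supabase" instead, and the environment variable names here are assumptions):

import { createClient } from "@supabase/supabase-js";

// Assumption: local client construction, only for this sketch.
const supabase = createClient(
  process.env.SUPABASE_URL ?? "",
  process.env.SUPABASE_SERVICE_TOKEN ?? ""
);

async function insertScrapeLogRow(row: Record<string, unknown>) {
  // Same chained shape as the reformatted call above: from(...).insert([...]),
  // then only the error field of the response is inspected.
  const { error } = await supabase.from("scrape_logs").insert([row]);
  if (error) {
    console.error("Error logging proxy:\n", error);
  }
}

insertScrapeLogRow({ url: "https://example.com", success: true }).catch(console.error);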