monitoring v0

This commit is contained in:
Gergő Móricz 2025-03-25 14:41:04 +01:00
parent b3b63486f1
commit 50db3e9e8a
17 changed files with 104 additions and 24 deletions

View File

@ -158,6 +158,7 @@ export async function crawlController(req: Request, res: Response) {
pageOptions, pageOptions,
undefined, undefined,
undefined, undefined,
team_id
); );
internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter

View File

@ -99,6 +99,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
pageOptions, pageOptions,
undefined, undefined,
undefined, undefined,
team_id
); );
const sc: StoredCrawl = { const sc: StoredCrawl = {

View File

@ -66,6 +66,7 @@ export async function scrapeHelper(
extractorOptions, extractorOptions,
timeout, timeout,
crawlerOptions, crawlerOptions,
team_id,
); );
await addScrapeJob( await addScrapeJob(
@ -297,6 +298,7 @@ export async function scrapeController(req: Request, res: Response) {
pageOptions, pageOptions,
extractorOptions, extractorOptions,
timeout, timeout,
team_id,
); );
logJob({ logJob({

View File

@ -72,6 +72,7 @@ export async function searchHelper(
undefined, undefined,
60000, 60000,
crawlerOptions, crawlerOptions,
team_id,
); );
if (justSearch) { if (justSearch) {

View File

@ -82,7 +82,7 @@ export async function batchScrapeController(
: { : {
crawlerOptions: null, crawlerOptions: null,
scrapeOptions: req.body, scrapeOptions: req.body,
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
team_id: req.auth.team_id, team_id: req.auth.team_id,
createdAt: Date.now(), createdAt: Date.now(),
plan: req.auth.plan, plan: req.auth.plan,

View File

@ -81,7 +81,7 @@ export async function crawlController(
originUrl: req.body.url, originUrl: req.body.url,
crawlerOptions: toLegacyCrawlerOptions(crawlerOptions), crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
scrapeOptions, scrapeOptions,
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
team_id: req.auth.team_id, team_id: req.auth.team_id,
createdAt: Date.now(), createdAt: Date.now(),
plan: req.auth.plan, plan: req.auth.plan,

View File

@ -85,7 +85,7 @@ export async function getMapResults({
scrapeOptions: undefined, scrapeOptions: undefined,
}, },
scrapeOptions: scrapeOptions.parse({}), scrapeOptions: scrapeOptions.parse({}),
internalOptions: {}, internalOptions: { teamId },
team_id: teamId, team_id: teamId,
createdAt: Date.now(), createdAt: Date.now(),
plan: plan, plan: plan,

View File

@ -50,7 +50,7 @@ export async function scrapeController(
mode: "single_urls", mode: "single_urls",
team_id: req.auth.team_id, team_id: req.auth.team_id,
scrapeOptions: req.body, scrapeOptions: req.body,
internalOptions: {}, internalOptions: { teamId: req.auth.team_id },
plan: req.auth.plan!, plan: req.auth.plan!,
origin: req.body.origin, origin: req.body.origin,
is_scrape: true, is_scrape: true,

View File

@ -83,7 +83,7 @@ async function scrapeSearchResult(
mode: "single_urls" as Mode, mode: "single_urls" as Mode,
team_id: options.teamId, team_id: options.teamId,
scrapeOptions: options.scrapeOptions, scrapeOptions: options.scrapeOptions,
internalOptions: {}, internalOptions: { teamId: options.teamId },
plan: options.plan || "free", plan: options.plan || "free",
origin: options.origin, origin: options.origin,
is_scrape: true, is_scrape: true,

View File

@ -20,7 +20,8 @@ export type Format =
| "links" | "links"
| "screenshot" | "screenshot"
| "screenshot@fullPage" | "screenshot@fullPage"
| "extract"; | "extract"
| "diff";
export const url = z.preprocess( export const url = z.preprocess(
(x) => { (x) => {
@ -165,6 +166,7 @@ const baseScrapeOptions = z
"screenshot@fullPage", "screenshot@fullPage",
"extract", "extract",
"json", "json",
"diff",
]) ])
.array() .array()
.optional() .optional()
@ -172,6 +174,10 @@ const baseScrapeOptions = z
.refine( .refine(
(x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")), (x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")),
"You may only specify either screenshot or screenshot@fullPage", "You may only specify either screenshot or screenshot@fullPage",
)
.refine(
(x) => !x.includes("diff") || x.includes("markdown"),
"The diff format requires the markdown format to be specified as well",
), ),
headers: z.record(z.string(), z.string()).optional(), headers: z.record(z.string(), z.string()).optional(),
includeTags: z.string().array().optional(), includeTags: z.string().array().optional(),
@ -546,6 +552,11 @@ export type Document = {
value: unknown value: unknown
}[]; }[];
}; };
diff?: {
previousScrapeAt: string | null;
changeStatus: "new" | "same" | "changed" | "removed";
visibility: "visible" | "hidden";
}
metadata: { metadata: {
title?: string; title?: string;
description?: string; description?: string;
@ -812,7 +823,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
}; };
} }
export function fromLegacyCrawlerOptions(x: any): { export function fromLegacyCrawlerOptions(x: any, teamId: string): {
crawlOptions: CrawlerOptions; crawlOptions: CrawlerOptions;
internalOptions: InternalOptions; internalOptions: InternalOptions;
} { } {
@ -834,6 +845,7 @@ export function fromLegacyCrawlerOptions(x: any): {
}), }),
internalOptions: { internalOptions: {
v0CrawlOnlyUrls: x.returnOnlyUrls, v0CrawlOnlyUrls: x.returnOnlyUrls,
teamId,
}, },
}; };
} }
@ -847,6 +859,7 @@ export function fromLegacyScrapeOptions(
pageOptions: PageOptions, pageOptions: PageOptions,
extractorOptions: ExtractorOptions | undefined, extractorOptions: ExtractorOptions | undefined,
timeout: number | undefined, timeout: number | undefined,
teamId: string,
): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } { ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
return { return {
scrapeOptions: scrapeOptions.parse({ scrapeOptions: scrapeOptions.parse({
@ -896,6 +909,7 @@ export function fromLegacyScrapeOptions(
internalOptions: { internalOptions: {
atsv: pageOptions.atsv, atsv: pageOptions.atsv,
v0DisableJsDom: pageOptions.disableJsDom, v0DisableJsDom: pageOptions.disableJsDom,
teamId,
}, },
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
}; };
@ -906,13 +920,15 @@ export function fromLegacyCombo(
extractorOptions: ExtractorOptions | undefined, extractorOptions: ExtractorOptions | undefined,
timeout: number | undefined, timeout: number | undefined,
crawlerOptions: any, crawlerOptions: any,
teamId: string,
): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } { ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions( const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(
pageOptions, pageOptions,
extractorOptions, extractorOptions,
timeout, timeout,
teamId,
); );
const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions); const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions, teamId);
return { scrapeOptions, internalOptions: Object.assign(i1, i2) }; return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
} }

View File

@ -44,6 +44,7 @@ export async function scrapeDocument(
scrapeOptions: scrapeOptions.parse({ ...internalScrapeOptions }), scrapeOptions: scrapeOptions.parse({ ...internalScrapeOptions }),
internalOptions: { internalOptions: {
useCache: true, useCache: true,
teamId: options.teamId,
}, },
plan: options.plan, plan: options.plan,
origin: options.origin, origin: options.origin,

View File

@ -97,6 +97,7 @@ export async function runWebScraper({
response = await scrapeURL(bull_job_id, url, scrapeOptions, { response = await scrapeURL(bull_job_id, url, scrapeOptions, {
priority, priority,
...internalOptions, ...internalOptions,
teamId: internalOptions?.teamId ?? team_id,
}); });
if (!response.success) { if (!response.success) {
if (response.error instanceof Error) { if (response.error instanceof Error) {

View File

@ -47,6 +47,7 @@ export async function getLinksFromSitemap(
], ],
v0DisableJsDom: true, v0DisableJsDom: true,
abort, abort,
teamId: "sitemap",
}, },
); );

View File

@ -173,6 +173,7 @@ export type InternalOptions = {
isBackgroundIndex?: boolean; isBackgroundIndex?: boolean;
fromCache?: boolean; // Indicates if the document was retrieved from cache fromCache?: boolean; // Indicates if the document was retrieved from cache
abort?: AbortSignal; abort?: AbortSignal;
teamId: string;
}; };
export type EngineResultsTracker = { export type EngineResultsTracker = {
@ -383,7 +384,7 @@ export async function scrapeURL(
id: string, id: string,
url: string, url: string,
options: ScrapeOptions, options: ScrapeOptions,
internalOptions: InternalOptions = {}, internalOptions: InternalOptions,
): Promise<ScrapeUrlResponse> { ): Promise<ScrapeUrlResponse> {
const meta = await buildMetaObject(id, url, options, internalOptions); const meta = await buildMetaObject(id, url, options, internalOptions);
try { try {

View File

@ -31,7 +31,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-basic", "test:scrape-basic",
"https://www.roastmywebsite.ai/", "https://www.roastmywebsite.ai/",
scrapeOptions.parse({}), scrapeOptions.parse({}),
{ forceEngine }, { forceEngine, teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -78,7 +78,7 @@ describe("Standalone scrapeURL tests", () => {
scrapeOptions.parse({ scrapeOptions.parse({
formats: ["markdown", "html"], formats: ["markdown", "html"],
}), }),
{ forceEngine }, { forceEngine, teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -102,7 +102,7 @@ describe("Standalone scrapeURL tests", () => {
scrapeOptions.parse({ scrapeOptions.parse({
onlyMainContent: false, onlyMainContent: false,
}), }),
{ forceEngine }, { forceEngine, teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -125,7 +125,7 @@ describe("Standalone scrapeURL tests", () => {
onlyMainContent: false, onlyMainContent: false,
excludeTags: [".nav", "#footer", "strong"], excludeTags: [".nav", "#footer", "strong"],
}), }),
{ forceEngine }, { forceEngine, teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -145,7 +145,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-400", "test:scrape-400",
"https://httpstat.us/400", "https://httpstat.us/400",
scrapeOptions.parse({}), scrapeOptions.parse({}),
{ forceEngine }, { forceEngine, teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -163,7 +163,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-401", "test:scrape-401",
"https://httpstat.us/401", "https://httpstat.us/401",
scrapeOptions.parse({}), scrapeOptions.parse({}),
{ forceEngine }, { forceEngine, teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -181,7 +181,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-403", "test:scrape-403",
"https://httpstat.us/403", "https://httpstat.us/403",
scrapeOptions.parse({}), scrapeOptions.parse({}),
{ forceEngine }, { forceEngine, teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -199,7 +199,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-404", "test:scrape-404",
"https://httpstat.us/404", "https://httpstat.us/404",
scrapeOptions.parse({}), scrapeOptions.parse({}),
{ forceEngine }, { forceEngine, teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -217,7 +217,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-405", "test:scrape-405",
"https://httpstat.us/405", "https://httpstat.us/405",
scrapeOptions.parse({}), scrapeOptions.parse({}),
{ forceEngine }, { forceEngine, teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -235,7 +235,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-500", "test:scrape-500",
"https://httpstat.us/500", "https://httpstat.us/500",
scrapeOptions.parse({}), scrapeOptions.parse({}),
{ forceEngine }, { forceEngine, teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -253,7 +253,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-redirect", "test:scrape-redirect",
"https://scrapethissite.com/", "https://scrapethissite.com/",
scrapeOptions.parse({}), scrapeOptions.parse({}),
{ forceEngine }, { forceEngine, teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -285,7 +285,7 @@ describe("Standalone scrapeURL tests", () => {
scrapeOptions.parse({ scrapeOptions.parse({
formats: ["screenshot"], formats: ["screenshot"],
}), }),
{ forceEngine }, { forceEngine, teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -313,7 +313,7 @@ describe("Standalone scrapeURL tests", () => {
scrapeOptions.parse({ scrapeOptions.parse({
formats: ["screenshot@fullPage"], formats: ["screenshot@fullPage"],
}), }),
{ forceEngine }, { forceEngine, teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -341,6 +341,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-pdf", "test:scrape-pdf",
"https://arxiv.org/pdf/astro-ph/9301001.pdf", "https://arxiv.org/pdf/astro-ph/9301001.pdf",
scrapeOptions.parse({}), scrapeOptions.parse({}),
{ teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -359,6 +360,7 @@ describe("Standalone scrapeURL tests", () => {
"test:scrape-docx", "test:scrape-docx",
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx",
scrapeOptions.parse({}), scrapeOptions.parse({}),
{ teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -395,6 +397,7 @@ describe("Standalone scrapeURL tests", () => {
}, },
}, },
}), }),
{ teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -430,6 +433,7 @@ describe("Standalone scrapeURL tests", () => {
}, },
}, },
}), }),
{ teamId: "test" },
); );
// expect(out.logs.length).toBeGreaterThan(0); // expect(out.logs.length).toBeGreaterThan(0);
@ -451,7 +455,7 @@ describe("Standalone scrapeURL tests", () => {
async (i) => { async (i) => {
const url = "https://www.scrapethissite.com/?i=" + i; const url = "https://www.scrapethissite.com/?i=" + i;
const id = "test:concurrent:" + url; const id = "test:concurrent:" + url;
const out = await scrapeURL(id, url, scrapeOptions.parse({})); const out = await scrapeURL(id, url, scrapeOptions.parse({}), { teamId: "test" });
const replacer = (key: string, value: any) => { const replacer = (key: string, value: any) => {
if (value instanceof Error) { if (value instanceof Error) {

View File

@ -0,0 +1,39 @@
import { supabase_rr_service } from "../../../services/supabase";
import { Document } from "../../../controllers/v1/types";
import { Meta } from "../index";
/**
 * Transformer that annotates a scraped document with change-tracking info.
 *
 * When the request's formats include "diff", the most recent matching row in
 * the `firecrawl_jobs` table (same team, same URL) is looked up and its
 * markdown is compared, ignoring all whitespace, against the markdown of the
 * current document. The outcome is stored in `document.diff`:
 *
 * - no previous row             -> changeStatus "new", previousScrapeAt null
 * - previous markdown identical -> changeStatus "same"
 * - previous markdown differs   -> changeStatus "changed"
 *
 * If the lookup fails, or either markdown is unavailable for comparison, no
 * `diff` field is set; the problem is logged and a user-facing warning is
 * prepended to `document.warning`. This function never throws for those cases.
 *
 * @param meta - Scrape context; reads `options.formats`,
 *   `internalOptions.teamId`, `url` and `logger`.
 * @param document - The document produced so far; mutated in place.
 * @returns The same `document` instance that was passed in.
 */
export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
  // Nothing to do unless the caller explicitly asked for the diff format.
  if (!meta.options.formats.includes("diff")) {
    return document;
  }

  // Prepends the user-facing failure notice while keeping any earlier warning.
  const markDiffFailed = (): Document => {
    document.warning =
      "Diffing failed, please try again later." +
      (document.warning ? ` ${document.warning}` : "");
    return document;
  };

  const { data, error } = await supabase_rr_service
    .from("firecrawl_jobs")
    .select()
    .eq("team_id", meta.internalOptions.teamId)
    .eq("url", document.metadata.url ?? document.metadata.sourceURL ?? meta.url)
    // NOTE(review): `->>` extracts text and PostgREST JSON paths are normally
    // written without quotes; confirm this filter matches rows as intended.
    .contains("page_options->>'formats'", "markdown")
    .order("date_added", { ascending: false })
    .limit(1)
    // maybeSingle() (unlike single()) resolves with `data: null, error: null`
    // when no row matches, which is how a never-before-seen page is detected.
    .maybeSingle();

  if (error) {
    meta.logger.error("Error fetching previous scrape", { error });
    return markDiffFailed();
  }

  if (!data) {
    // First time this team scrapes this URL.
    document.diff = {
      previousScrapeAt: null,
      changeStatus: "new",
      visibility: "visible",
    };
    return document;
  }

  const previousMarkdown: unknown = data.docs?.[0]?.markdown;
  const currentMarkdown: unknown = document.markdown;

  if (typeof previousMarkdown !== "string" || typeof currentMarkdown !== "string") {
    // A previous job exists but one side has no markdown to compare; report
    // a diff failure instead of crashing the whole scrape.
    meta.logger.warn("Unable to diff: markdown missing from previous or current scrape", {
      hasPreviousMarkdown: typeof previousMarkdown === "string",
      hasCurrentMarkdown: typeof currentMarkdown === "string",
    });
    return markDiffFailed();
  }

  // Whitespace-only differences are not considered a content change.
  const stripWhitespace = (text: string): string => text.replace(/\s+/g, "");

  document.diff = {
    previousScrapeAt: data.date_added,
    changeStatus:
      stripWhitespace(previousMarkdown) === stripWhitespace(currentMarkdown)
        ? "same"
        : "changed",
    visibility: "visible",
  };

  return document;
}

View File

@ -8,7 +8,7 @@ import { performLLMExtract } from "./llmExtract";
import { uploadScreenshot } from "./uploadScreenshot"; import { uploadScreenshot } from "./uploadScreenshot";
import { removeBase64Images } from "./removeBase64Images"; import { removeBase64Images } from "./removeBase64Images";
import { saveToCache } from "./cache"; import { saveToCache } from "./cache";
import { deriveDiff } from "./diff";
export type Transformer = ( export type Transformer = (
meta: Meta, meta: Meta,
document: Document, document: Document,
@ -148,6 +148,17 @@ export function coerceFieldsToFormats(
); );
} }
if (!formats.has("diff") && document.diff !== undefined) {
meta.logger.warn(
"Removed diff from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.",
);
delete document.diff;
} else if (formats.has("diff") && document.diff === undefined) {
meta.logger.warn(
"Request had format diff, but there was no diff field in the result.",
);
}
if (meta.options.actions === undefined || meta.options.actions.length === 0) { if (meta.options.actions === undefined || meta.options.actions.length === 0) {
delete document.actions; delete document.actions;
} }
@ -164,6 +175,7 @@ export const transformerStack: Transformer[] = [
deriveMetadataFromRawHTML, deriveMetadataFromRawHTML,
uploadScreenshot, uploadScreenshot,
performLLMExtract, performLLMExtract,
deriveDiff,
coerceFieldsToFormats, coerceFieldsToFormats,
removeBase64Images, removeBase64Images,
]; ];