From 50db3e9e8abb1943f466aaff4db6709933160482 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 25 Mar 2025 14:41:04 +0100 Subject: [PATCH] monitoring v0 --- apps/api/src/controllers/v0/crawl.ts | 1 + apps/api/src/controllers/v0/crawlPreview.ts | 1 + apps/api/src/controllers/v0/scrape.ts | 2 + apps/api/src/controllers/v0/search.ts | 1 + apps/api/src/controllers/v1/batch-scrape.ts | 2 +- apps/api/src/controllers/v1/crawl.ts | 2 +- apps/api/src/controllers/v1/map.ts | 2 +- apps/api/src/controllers/v1/scrape.ts | 2 +- apps/api/src/controllers/v1/search.ts | 2 +- apps/api/src/controllers/v1/types.ts | 22 +++++++++-- apps/api/src/lib/extract/document-scraper.ts | 1 + apps/api/src/main/runWebScraper.ts | 1 + apps/api/src/scraper/WebScraper/sitemap.ts | 1 + apps/api/src/scraper/scrapeURL/index.ts | 3 +- .../src/scraper/scrapeURL/scrapeURL.test.ts | 32 ++++++++------- .../scraper/scrapeURL/transformers/diff.ts | 39 +++++++++++++++++++ .../scraper/scrapeURL/transformers/index.ts | 14 ++++++- 17 files changed, 104 insertions(+), 24 deletions(-) create mode 100644 apps/api/src/scraper/scrapeURL/transformers/diff.ts diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index 2eba651d..c8b186b0 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -158,6 +158,7 @@ export async function crawlController(req: Request, res: Response) { pageOptions, undefined, undefined, + team_id ); internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index ffb8ebba..9153ea79 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -99,6 +99,7 @@ export async function crawlPreviewController(req: Request, res: Response) { pageOptions, undefined, undefined, + team_id ); const sc: StoredCrawl = { diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 62d62b09..0bdd197b 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -66,6 +66,7 @@ export async function scrapeHelper( extractorOptions, timeout, crawlerOptions, + team_id, ); await addScrapeJob( @@ -297,6 +298,7 @@ export async function scrapeController(req: Request, res: Response) { pageOptions, extractorOptions, timeout, + team_id, ); logJob({ diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index ac7d7f62..d8649a52 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -72,6 +72,7 @@ export async function searchHelper( undefined, 60000, crawlerOptions, + team_id, ); if (justSearch) { diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index d2c079bf..20fab47c 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -82,7 +82,7 @@ export async function batchScrapeController( : { crawlerOptions: null, scrapeOptions: req.body, - internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter + internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter team_id: req.auth.team_id, createdAt: Date.now(), plan: req.auth.plan, diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 51d373ee..31e39502 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -81,7 +81,7 @@ export async function crawlController( originUrl: req.body.url, crawlerOptions: toLegacyCrawlerOptions(crawlerOptions), scrapeOptions, - internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter + internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter team_id: req.auth.team_id, createdAt: Date.now(), plan: req.auth.plan, diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index ebb0b324..49890d90 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -85,7 +85,7 @@ export async function getMapResults({ scrapeOptions: undefined, }, scrapeOptions: scrapeOptions.parse({}), - internalOptions: {}, + internalOptions: { teamId }, team_id: teamId, createdAt: Date.now(), plan: plan, diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index ec11e2cb..44214ee2 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -50,7 +50,7 @@ export async function scrapeController( mode: "single_urls", team_id: req.auth.team_id, scrapeOptions: req.body, - internalOptions: {}, + internalOptions: { teamId: req.auth.team_id }, plan: req.auth.plan!, origin: req.body.origin, is_scrape: true, diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index 18ff9579..082cd8cd 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -83,7 +83,7 @@ async function scrapeSearchResult( mode: "single_urls" as Mode, team_id: options.teamId, scrapeOptions: options.scrapeOptions, - internalOptions: {}, + internalOptions: { teamId: options.teamId }, plan: options.plan || "free", origin: options.origin, is_scrape: true, diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 459e5e56..4d7a80ce 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -20,7 +20,8 @@ export type Format = | "links" | "screenshot" | "screenshot@fullPage" - | "extract"; + | "extract" + | "diff"; export const url = z.preprocess( (x) => { @@ -165,6 +166,7 @@ const baseScrapeOptions = z "screenshot@fullPage", "extract", "json", + "diff", ]) .array() .optional() @@ -172,6 +174,10 @@ const baseScrapeOptions = z .refine( (x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")), "You may only specify either screenshot or screenshot@fullPage", + ) + .refine( + (x) => !x.includes("diff") || x.includes("markdown"), + "The diff format requires the markdown format to be specified as well", ), headers: z.record(z.string(), z.string()).optional(), includeTags: z.string().array().optional(), @@ -546,6 +552,11 @@ export type Document = { value: unknown }[]; }; + diff?: { + previousScrapeAt: string | null; + changeStatus: "new" | "same" | "changed" | "removed"; + visibility: "visible" | "hidden"; + } metadata: { title?: string; description?: string; @@ -812,7 +823,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) { }; } -export function fromLegacyCrawlerOptions(x: any): { +export function fromLegacyCrawlerOptions(x: any, teamId: string): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions; } { @@ -834,6 +845,7 @@ export function fromLegacyCrawlerOptions(x: any): { }), internalOptions: { v0CrawlOnlyUrls: x.returnOnlyUrls, + teamId, }, }; } @@ -847,6 +859,7 @@ export function fromLegacyScrapeOptions( pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, + teamId: string, ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } { return { scrapeOptions: scrapeOptions.parse({ @@ -896,6 +909,7 @@ export function fromLegacyScrapeOptions( internalOptions: { atsv: pageOptions.atsv, v0DisableJsDom: pageOptions.disableJsDom, + teamId, }, // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks }; @@ -906,13 +920,15 @@ export function fromLegacyCombo( extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any, + teamId: string, ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } { const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions( pageOptions, extractorOptions, timeout, + teamId, ); - const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions); + const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions, teamId); return { scrapeOptions, internalOptions: Object.assign(i1, i2) }; } diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts index 8cbc75fd..e9bd729a 100644 --- a/apps/api/src/lib/extract/document-scraper.ts +++ b/apps/api/src/lib/extract/document-scraper.ts @@ -44,6 +44,7 @@ export async function scrapeDocument( scrapeOptions: scrapeOptions.parse({ ...internalScrapeOptions }), internalOptions: { useCache: true, + teamId: options.teamId, }, plan: options.plan, origin: options.origin, diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index c6751218..51e04354 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -97,6 +97,7 @@ export async function runWebScraper({ response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions, + teamId: internalOptions?.teamId ?? team_id, }); if (!response.success) { if (response.error instanceof Error) { diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index c2c60383..f945cd22 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -47,6 +47,7 @@ export async function getLinksFromSitemap( ], v0DisableJsDom: true, abort, + teamId: "sitemap", }, ); diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index eaf5497a..d6d528f7 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -173,6 +173,7 @@ export type InternalOptions = { isBackgroundIndex?: boolean; fromCache?: boolean; // Indicates if the document was retrieved from cache abort?: AbortSignal; + teamId: string; }; export type EngineResultsTracker = { @@ -383,7 +384,7 @@ export async function scrapeURL( id: string, url: string, options: ScrapeOptions, - internalOptions: InternalOptions = {}, + internalOptions: InternalOptions, ): Promise { const meta = await buildMetaObject(id, url, options, internalOptions); try { diff --git a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts index 8b783821..b545266f 100644 --- a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts +++ b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts @@ -31,7 +31,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-basic", "https://www.roastmywebsite.ai/", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -78,7 +78,7 @@ describe("Standalone scrapeURL tests", () => { scrapeOptions.parse({ formats: ["markdown", "html"], }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -102,7 +102,7 @@ describe("Standalone scrapeURL tests", () => { scrapeOptions.parse({ onlyMainContent: false, }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -125,7 +125,7 @@ describe("Standalone scrapeURL tests", () => { onlyMainContent: false, excludeTags: [".nav", "#footer", "strong"], }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -145,7 +145,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-400", "https://httpstat.us/400", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -163,7 +163,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-401", "https://httpstat.us/401", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -181,7 +181,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-403", "https://httpstat.us/403", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -199,7 +199,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-404", "https://httpstat.us/404", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -217,7 +217,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-405", "https://httpstat.us/405", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -235,7 +235,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-500", "https://httpstat.us/500", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -253,7 +253,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-redirect", "https://scrapethissite.com/", scrapeOptions.parse({}), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -285,7 +285,7 @@ describe("Standalone scrapeURL tests", () => { scrapeOptions.parse({ formats: ["screenshot"], }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -313,7 +313,7 @@ describe("Standalone scrapeURL tests", () => { scrapeOptions.parse({ formats: ["screenshot@fullPage"], }), - { forceEngine }, + { forceEngine, teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -341,6 +341,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-pdf", "https://arxiv.org/pdf/astro-ph/9301001.pdf", scrapeOptions.parse({}), + { teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -359,6 +360,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-docx", "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", scrapeOptions.parse({}), + { teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -395,6 +397,7 @@ describe("Standalone scrapeURL tests", () => { }, }, }), + { teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -430,6 +433,7 @@ describe("Standalone scrapeURL tests", () => { }, }, }), + { teamId: "test" }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -451,7 +455,7 @@ describe("Standalone scrapeURL tests", () => { async (i) => { const url = "https://www.scrapethissite.com/?i=" + i; const id = "test:concurrent:" + url; - const out = await scrapeURL(id, url, scrapeOptions.parse({})); + const out = await scrapeURL(id, url, scrapeOptions.parse({}), { teamId: "test" }); const replacer = (key: string, value: any) => { if (value instanceof Error) { diff --git a/apps/api/src/scraper/scrapeURL/transformers/diff.ts b/apps/api/src/scraper/scrapeURL/transformers/diff.ts new file mode 100644 index 00000000..9f1061ca --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/transformers/diff.ts @@ -0,0 +1,39 @@ +import { supabase_rr_service } from "../../../services/supabase"; +import { Document } from "../../../controllers/v1/types"; +import { Meta } from "../index"; + +export async function deriveDiff(meta: Meta, document: Document): Promise { + if (meta.options.formats.includes("diff")) { + const { data, error } = await supabase_rr_service + .from("firecrawl_jobs") + .select() + .eq("team_id", meta.internalOptions.teamId) + .eq("url", document.metadata.url ?? document.metadata.sourceURL ?? meta.url) + .contains("page_options->>'formats'", "markdown") + .order("date_added", { ascending: false }) + .limit(1) + .single(); + + if (data) { + const previousMarkdown = data.docs[0].markdown; + const currentMarkdown = document.markdown!; + + document.diff = { + previousScrapeAt: data.date_added, + changeStatus: previousMarkdown.replace(/\s+/g, "") === currentMarkdown.replace(/\s+/g, "") ? "same" : "changed", + visibility: "visible", + } + } else if (!error) { + document.diff = { + previousScrapeAt: null, + changeStatus: "new", + visibility: "visible", + } + } else { + meta.logger.error("Error fetching previous scrape", { error }); + document.warning = "Diffing failed, please try again later." + (document.warning ? ` ${document.warning}` : ""); + } + } + + return document; +} diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts index ea149dba..c01e260f 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/index.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts @@ -8,7 +8,7 @@ import { performLLMExtract } from "./llmExtract"; import { uploadScreenshot } from "./uploadScreenshot"; import { removeBase64Images } from "./removeBase64Images"; import { saveToCache } from "./cache"; - +import { deriveDiff } from "./diff"; export type Transformer = ( meta: Meta, document: Document, @@ -148,6 +148,17 @@ export function coerceFieldsToFormats( ); } + if (!formats.has("diff") && document.diff !== undefined) { + meta.logger.warn( + "Removed diff from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.", + ); + delete document.diff; + } else if (formats.has("diff") && document.diff === undefined) { + meta.logger.warn( + "Request had format diff, but there was no diff field in the result.", + ); + } + if (meta.options.actions === undefined || meta.options.actions.length === 0) { delete document.actions; } @@ -164,6 +175,7 @@ export const transformerStack: Transformer[] = [ deriveMetadataFromRawHTML, uploadScreenshot, performLLMExtract, + deriveDiff, coerceFieldsToFormats, removeBase64Images, ];