monitoring v0

2025-08-18 20:55:56 +08:00 · 2025-03-25 14:41:04 +01:00 · 2025-03-25 14:41:04 +01:00 · 50db3e9e8a
commit 50db3e9e8a
parent b3b63486f1
17 changed files with 104 additions and 24 deletions
--- a/apps/api/src/controllers/v0/crawl.ts
+++ b/apps/api/src/controllers/v0/crawl.ts
@ -158,6 +158,7 @@ export async function crawlController(req: Request, res: Response) {
      pageOptions,
      undefined,
      undefined,
+      team_id
    );
    internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter

--- a/apps/api/src/controllers/v0/crawlPreview.ts
+++ b/apps/api/src/controllers/v0/crawlPreview.ts
@ -99,6 +99,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
      pageOptions,
      undefined,
      undefined,
+      team_id
    );

    const sc: StoredCrawl = {
--- a/apps/api/src/controllers/v0/scrape.ts
+++ b/apps/api/src/controllers/v0/scrape.ts
@ -66,6 +66,7 @@ export async function scrapeHelper(
    extractorOptions,
    timeout,
    crawlerOptions,
+    team_id,
  );

  await addScrapeJob(
@ -297,6 +298,7 @@ export async function scrapeController(req: Request, res: Response) {
      pageOptions,
      extractorOptions,
      timeout,
+      team_id,
    );

    logJob({
--- a/apps/api/src/controllers/v0/search.ts
+++ b/apps/api/src/controllers/v0/search.ts
@ -72,6 +72,7 @@ export async function searchHelper(
    undefined,
    60000,
    crawlerOptions,
+    team_id,
  );

  if (justSearch) {
--- a/apps/api/src/controllers/v1/batch-scrape.ts
+++ b/apps/api/src/controllers/v1/batch-scrape.ts
@ -82,7 +82,7 @@ export async function batchScrapeController(
    : {
        crawlerOptions: null,
        scrapeOptions: req.body,
-        internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
+        internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
        team_id: req.auth.team_id,
        createdAt: Date.now(),
        plan: req.auth.plan,
--- a/apps/api/src/controllers/v1/crawl.ts
+++ b/apps/api/src/controllers/v1/crawl.ts
@ -81,7 +81,7 @@ export async function crawlController(
    originUrl: req.body.url,
    crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
    scrapeOptions,
-    internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
+    internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
    team_id: req.auth.team_id,
    createdAt: Date.now(),
    plan: req.auth.plan,
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@ -85,7 +85,7 @@ export async function getMapResults({
      scrapeOptions: undefined,
    },
    scrapeOptions: scrapeOptions.parse({}),
-    internalOptions: {},
+    internalOptions: { teamId },
    team_id: teamId,
    createdAt: Date.now(),
    plan: plan,
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@ -50,7 +50,7 @@ export async function scrapeController(
      mode: "single_urls",
      team_id: req.auth.team_id,
      scrapeOptions: req.body,
-      internalOptions: {},
+      internalOptions: { teamId: req.auth.team_id },
      plan: req.auth.plan!,
      origin: req.body.origin,
      is_scrape: true,
--- a/apps/api/src/controllers/v1/search.ts
+++ b/apps/api/src/controllers/v1/search.ts
@ -83,7 +83,7 @@ async function scrapeSearchResult(
        mode: "single_urls" as Mode,
        team_id: options.teamId,
        scrapeOptions: options.scrapeOptions,
-        internalOptions: {},
+        internalOptions: { teamId: options.teamId },
        plan: options.plan || "free",
        origin: options.origin,
        is_scrape: true,
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@ -20,7 +20,8 @@ export type Format =
  | "links"
  | "screenshot"
  | "screenshot@fullPage"
-  | "extract";
+  | "extract"
+  | "diff";

 export const url = z.preprocess(
  (x) => {
@ -165,6 +166,7 @@ const baseScrapeOptions = z
        "screenshot@fullPage",
        "extract",
        "json",
+        "diff",
      ])
      .array()
      .optional()
@ -172,6 +174,10 @@ const baseScrapeOptions = z
      .refine(
        (x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")),
        "You may only specify either screenshot or screenshot@fullPage",
+      )
+      .refine(
+        (x) => !x.includes("diff") || x.includes("markdown"),
+        "The diff format requires the markdown format to be specified as well",
      ),
    headers: z.record(z.string(), z.string()).optional(),
    includeTags: z.string().array().optional(),
@ -546,6 +552,11 @@ export type Document = {
      value: unknown
    }[];
  };
+  diff?: {
+    previousScrapeAt: string | null;
+    changeStatus: "new" | "same" | "changed" | "removed";
+    visibility: "visible" | "hidden";
+  }
  metadata: {
    title?: string;
    description?: string;
@ -812,7 +823,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
  };
 }

-export function fromLegacyCrawlerOptions(x: any): {
+export function fromLegacyCrawlerOptions(x: any, teamId: string): {
  crawlOptions: CrawlerOptions;
  internalOptions: InternalOptions;
 } {
@ -834,6 +845,7 @@ export function fromLegacyCrawlerOptions(x: any): {
   }),
    internalOptions: {
      v0CrawlOnlyUrls: x.returnOnlyUrls,
+      teamId,
    },
  };
 }
@ -847,6 +859,7 @@ export function fromLegacyScrapeOptions(
  pageOptions: PageOptions,
  extractorOptions: ExtractorOptions | undefined,
  timeout: number | undefined,
+  teamId: string,
 ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
  return {
    scrapeOptions: scrapeOptions.parse({
@ -896,6 +909,7 @@ export function fromLegacyScrapeOptions(
    internalOptions: {
      atsv: pageOptions.atsv,
      v0DisableJsDom: pageOptions.disableJsDom,
+      teamId,
    },
    // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
  };
@ -906,13 +920,15 @@ export function fromLegacyCombo(
  extractorOptions: ExtractorOptions | undefined,
  timeout: number | undefined,
  crawlerOptions: any,
+  teamId: string,
 ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
  const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(
    pageOptions,
    extractorOptions,
    timeout,
+    teamId,
  );
-  const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
+  const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions, teamId);
  return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
 }

--- a/apps/api/src/lib/extract/document-scraper.ts
+++ b/apps/api/src/lib/extract/document-scraper.ts
@ -44,6 +44,7 @@ export async function scrapeDocument(
        scrapeOptions: scrapeOptions.parse({ ...internalScrapeOptions }),
        internalOptions: {
          useCache: true,
+          teamId: options.teamId,
        },
        plan: options.plan,
        origin: options.origin,
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@ -97,6 +97,7 @@ export async function runWebScraper({
      response = await scrapeURL(bull_job_id, url, scrapeOptions, {
        priority,
        ...internalOptions,
+        teamId: internalOptions?.teamId ?? team_id,
      });
      if (!response.success) {
        if (response.error instanceof Error) {
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@ -47,6 +47,7 @@ export async function getLinksFromSitemap(
          ],
          v0DisableJsDom: true,
          abort,
+          teamId: "sitemap",
        },
      );

--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@ -173,6 +173,7 @@ export type InternalOptions = {
  isBackgroundIndex?: boolean;
  fromCache?: boolean; // Indicates if the document was retrieved from cache
  abort?: AbortSignal;
+  teamId: string;
 };

 export type EngineResultsTracker = {
@ -383,7 +384,7 @@ export async function scrapeURL(
  id: string,
  url: string,
  options: ScrapeOptions,
-  internalOptions: InternalOptions = {},
+  internalOptions: InternalOptions,
 ): Promise<ScrapeUrlResponse> {
  const meta = await buildMetaObject(id, url, options, internalOptions);
  try {
--- a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts
+++ b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts
@ -31,7 +31,7 @@ describe("Standalone scrapeURL tests", () => {
        "test:scrape-basic",
        "https://www.roastmywebsite.ai/",
        scrapeOptions.parse({}),
-        { forceEngine },
+        { forceEngine, teamId: "test" },
      );

      // expect(out.logs.length).toBeGreaterThan(0);
@ -78,7 +78,7 @@ describe("Standalone scrapeURL tests", () => {
        scrapeOptions.parse({
          formats: ["markdown", "html"],
        }),
-        { forceEngine },
+        { forceEngine, teamId: "test" },
      );

      // expect(out.logs.length).toBeGreaterThan(0);
@ -102,7 +102,7 @@ describe("Standalone scrapeURL tests", () => {
        scrapeOptions.parse({
          onlyMainContent: false,
        }),
-        { forceEngine },
+        { forceEngine, teamId: "test" },
      );

      // expect(out.logs.length).toBeGreaterThan(0);
@ -125,7 +125,7 @@ describe("Standalone scrapeURL tests", () => {
          onlyMainContent: false,
          excludeTags: [".nav", "#footer", "strong"],
        }),
-        { forceEngine },
+        { forceEngine, teamId: "test" },
      );

      // expect(out.logs.length).toBeGreaterThan(0);
@ -145,7 +145,7 @@ describe("Standalone scrapeURL tests", () => {
        "test:scrape-400",
        "https://httpstat.us/400",
        scrapeOptions.parse({}),
-        { forceEngine },
+        { forceEngine, teamId: "test" },
      );

      // expect(out.logs.length).toBeGreaterThan(0);
@ -163,7 +163,7 @@ describe("Standalone scrapeURL tests", () => {
        "test:scrape-401",
        "https://httpstat.us/401",
        scrapeOptions.parse({}),
-        { forceEngine },
+        { forceEngine, teamId: "test" },
      );

      // expect(out.logs.length).toBeGreaterThan(0);
@ -181,7 +181,7 @@ describe("Standalone scrapeURL tests", () => {
        "test:scrape-403",
        "https://httpstat.us/403",
        scrapeOptions.parse({}),
-        { forceEngine },
+        { forceEngine, teamId: "test" },
      );

      // expect(out.logs.length).toBeGreaterThan(0);
@ -199,7 +199,7 @@ describe("Standalone scrapeURL tests", () => {
        "test:scrape-404",
        "https://httpstat.us/404",
        scrapeOptions.parse({}),
-        { forceEngine },
+        { forceEngine, teamId: "test" },
      );

      // expect(out.logs.length).toBeGreaterThan(0);
@ -217,7 +217,7 @@ describe("Standalone scrapeURL tests", () => {
        "test:scrape-405",
        "https://httpstat.us/405",
        scrapeOptions.parse({}),
-        { forceEngine },
+        { forceEngine, teamId: "test" },
      );

      // expect(out.logs.length).toBeGreaterThan(0);
@ -235,7 +235,7 @@ describe("Standalone scrapeURL tests", () => {
        "test:scrape-500",
        "https://httpstat.us/500",
        scrapeOptions.parse({}),
-        { forceEngine },
+        { forceEngine, teamId: "test" },
      );

      // expect(out.logs.length).toBeGreaterThan(0);
@ -253,7 +253,7 @@ describe("Standalone scrapeURL tests", () => {
        "test:scrape-redirect",
        "https://scrapethissite.com/",
        scrapeOptions.parse({}),
-        { forceEngine },
+        { forceEngine, teamId: "test" },
      );

      // expect(out.logs.length).toBeGreaterThan(0);
@ -285,7 +285,7 @@ describe("Standalone scrapeURL tests", () => {
          scrapeOptions.parse({
            formats: ["screenshot"],
          }),
-          { forceEngine },
+          { forceEngine, teamId: "test" },
        );

        // expect(out.logs.length).toBeGreaterThan(0);
@ -313,7 +313,7 @@ describe("Standalone scrapeURL tests", () => {
          scrapeOptions.parse({
            formats: ["screenshot@fullPage"],
          }),
-          { forceEngine },
+          { forceEngine, teamId: "test" },
        );

        // expect(out.logs.length).toBeGreaterThan(0);
@ -341,6 +341,7 @@ describe("Standalone scrapeURL tests", () => {
      "test:scrape-pdf",
      "https://arxiv.org/pdf/astro-ph/9301001.pdf",
      scrapeOptions.parse({}),
+      { teamId: "test" },
    );

    // expect(out.logs.length).toBeGreaterThan(0);
@ -359,6 +360,7 @@ describe("Standalone scrapeURL tests", () => {
      "test:scrape-docx",
      "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx",
      scrapeOptions.parse({}),
+      { teamId: "test" },
    );

    // expect(out.logs.length).toBeGreaterThan(0);
@ -395,6 +397,7 @@ describe("Standalone scrapeURL tests", () => {
          },
        },
      }),
+      { teamId: "test" },
    );

    // expect(out.logs.length).toBeGreaterThan(0);
@ -430,6 +433,7 @@ describe("Standalone scrapeURL tests", () => {
          },
        },
      }),
+      { teamId: "test" },
    );

    // expect(out.logs.length).toBeGreaterThan(0);
@ -451,7 +455,7 @@ describe("Standalone scrapeURL tests", () => {
    async (i) => {
      const url = "https://www.scrapethissite.com/?i=" + i;
      const id = "test:concurrent:" + url;
-      const out = await scrapeURL(id, url, scrapeOptions.parse({}));
+      const out = await scrapeURL(id, url, scrapeOptions.parse({}), { teamId: "test" });

      const replacer = (key: string, value: any) => {
        if (value instanceof Error) {
--- a/apps/api/src/scraper/scrapeURL/transformers/diff.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/diff.ts
@ -0,0 +1,39 @@
+import { supabase_rr_service } from "../../../services/supabase";
+import { Document } from "../../../controllers/v1/types";
+import { Meta } from "../index";
+
+/**
+ * Transformer that populates `document.diff` when the "diff" format was requested.
+ *
+ * Looks up the most recent job row in `firecrawl_jobs` for the same team and URL
+ * and compares its stored markdown against the markdown of the current document,
+ * ignoring all whitespace:
+ *  - no previous row            -> changeStatus "new", previousScrapeAt null
+ *  - previous markdown equal    -> changeStatus "same"
+ *  - previous markdown differs  -> changeStatus "changed"
+ *
+ * If the lookup fails, or either side has no markdown to compare, `document.diff`
+ * is left unset and a warning is prepended to `document.warning` instead of throwing.
+ *
+ * @param meta     Scrape context; reads `options.formats`, `internalOptions.teamId`, `url` and `logger`.
+ * @param document Document being transformed; mutated in place.
+ * @returns The same `document` instance.
+ */
+export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
+  if (!meta.options.formats.includes("diff")) {
+    return document;
+  }
+
+  // Prepends the user-facing failure notice while keeping any existing warning.
+  const markDiffFailed = () => {
+    document.warning = "Diffing failed, please try again later." + (document.warning ? ` ${document.warning}` : "");
+  };
+
+  const { data, error } = await supabase_rr_service
+    .from("firecrawl_jobs")
+    .select()
+    .eq("team_id", meta.internalOptions.teamId)
+    .eq("url", document.metadata.url ?? document.metadata.sourceURL ?? meta.url)
+    // NOTE(review): this filter is applied to a text extraction (`->>`) of page_options;
+    // confirm it matches the stored rows as intended.
+    .contains("page_options->>'formats'", "markdown")
+    .order("date_added", { ascending: false })
+    .limit(1)
+    // maybeSingle() yields { data: null, error: null } for zero rows, whereas single()
+    // reports an error in that case and would make the "new" status unreachable.
+    .maybeSingle();
+
+  if (error) {
+    meta.logger.error("Error fetching previous scrape", { error });
+    markDiffFailed();
+    return document;
+  }
+
+  if (!data) {
+    // First time this team scrapes this URL.
+    document.diff = {
+      previousScrapeAt: null,
+      changeStatus: "new",
+      visibility: "visible",
+    };
+    return document;
+  }
+
+  const previousMarkdown: unknown = data.docs?.[0]?.markdown;
+  const currentMarkdown = document.markdown;
+
+  if (typeof previousMarkdown !== "string" || typeof currentMarkdown !== "string") {
+    // Nothing comparable on one side; degrade to a warning rather than crashing the scrape.
+    meta.logger.warn("Cannot derive diff because markdown is missing", {
+      hasPreviousMarkdown: typeof previousMarkdown === "string",
+      hasCurrentMarkdown: typeof currentMarkdown === "string",
+    });
+    markDiffFailed();
+    return document;
+  }
+
+  // Whitespace-insensitive comparison so pure formatting changes count as "same".
+  const stripWhitespace = (s: string) => s.replace(/\s+/g, "");
+
+  document.diff = {
+    previousScrapeAt: data.date_added,
+    changeStatus: stripWhitespace(previousMarkdown) === stripWhitespace(currentMarkdown) ? "same" : "changed",
+    visibility: "visible",
+  };
+
+  return document;
+}
--- a/apps/api/src/scraper/scrapeURL/transformers/index.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts
@ -8,7 +8,7 @@ import { performLLMExtract } from "./llmExtract";
 import { uploadScreenshot } from "./uploadScreenshot";
 import { removeBase64Images } from "./removeBase64Images";
 import { saveToCache } from "./cache";
-
+import { deriveDiff } from "./diff";
 export type Transformer = (
  meta: Meta,
  document: Document,
@ -148,6 +148,17 @@ export function coerceFieldsToFormats(
    );
  }

+  if (!formats.has("diff") && document.diff !== undefined) {
+    meta.logger.warn(
+      "Removed diff from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.",
+    );
+    delete document.diff;
+  } else if (formats.has("diff") && document.diff === undefined) {
+    meta.logger.warn(
+      "Request had format diff, but there was no diff field in the result.",
+    );
+  }
+
  if (meta.options.actions === undefined || meta.options.actions.length === 0) {
    delete document.actions;
  }
@ -164,6 +175,7 @@ export const transformerStack: Transformer[] = [
  deriveMetadataFromRawHTML,
  uploadScreenshot,
  performLLMExtract,
+  deriveDiff,
  coerceFieldsToFormats,
  removeBase64Images,
 ];