Nick: init

2025-06-22 14:26:44 +08:00 · 2024-12-26 12:21:46 -03:00 · 2024-12-26 12:21:46 -03:00 · f467a3ae6c
commit f467a3ae6c
parent c911aad228
2 changed files with 241 additions and 120 deletions
--- a/apps/api/src/controllers/v1/extract.ts
+++ b/apps/api/src/controllers/v1/extract.ts
@ -7,6 +7,7 @@ import {
  ExtractResponse,
  MapDocument,
  scrapeOptions,
+  URLTrace,
 } from "./types";
 // import { Document } from "../../lib/entities";
 import Redis from "ioredis";
@ -56,14 +57,22 @@ export async function extractController(
  let links: string[] = [];
  let docs: Document[] = [];
  const earlyReturn = false;
+  const urlTraces: URLTrace[] = [];

  // Process all URLs in parallel
  const urlPromises = req.body.urls.map(async (url) => {
+    const trace: URLTrace = {
+      url,
+      status: 'mapped',
+      timing: {
+        discoveredAt: new Date().toISOString(),
+      },
+    };
+    urlTraces.push(trace);
+
    if (url.includes("/*") || req.body.allowExternalLinks) {
      // Handle glob pattern URLs
      const baseUrl = url.replace("/*", "");
-      // const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
-
      const allowExternalLinks = req.body.allowExternalLinks;
      let urlWithoutWww = baseUrl.replace("www.", "");

@ -75,6 +84,7 @@ export async function extractController(
          )) ?? req.body.prompt;
      }

+      try {
        const mapResults = await getMapResults({
          url: baseUrl,
          search: rephrasedPrompt,
@ -83,7 +93,6 @@ export async function extractController(
          allowExternalLinks,
          origin: req.body.origin,
          limit: req.body.limit,
-        // If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
          ignoreSitemap: false,
          includeMetadata: true,
          includeSubdomains: req.body.includeSubdomains,
@ -95,6 +104,20 @@ export async function extractController(
        const allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
        const uniqueUrls = removeDuplicateUrls(allUrls);

+        // Track all discovered URLs
+        uniqueUrls.forEach(discoveredUrl => {
+          if (!urlTraces.some(t => t.url === discoveredUrl)) {
+            urlTraces.push({
+              url: discoveredUrl,
+              status: 'mapped',
+              timing: {
+                discoveredAt: new Date().toISOString(),
+              },
+              usedInCompletion: false, // Default to false, will update if used
+            });
+          }
+        });
+
        // Only add URLs from mapResults.links that aren't already in mappedLinks
        const existingUrls = new Set(mappedLinks.map((m) => m.url));
        const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url));
@ -166,22 +189,62 @@ export async function extractController(
            }
          }

+          // Update URL traces with relevance scores and mark filtered out URLs
+          linksAndScores.forEach((score) => {
+            const trace = urlTraces.find((t) => t.url === score.link);
+            if (trace) {
+              trace.relevanceScore = score.score;
+              // If URL didn't make it through filtering, mark it as filtered out
+              if (!filteredLinks.some(link => link.url === score.link)) {
+                trace.warning = `Relevance score ${score.score} below threshold`;
+                trace.usedInCompletion = false;
+              }
+            }
+          });
+
          mappedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
+          
+          // Mark URLs that will be used in completion
+          mappedLinks.forEach(link => {
+            const trace = urlTraces.find(t => t.url === link.url);
+            if (trace) {
+              trace.usedInCompletion = true;
+            }
+          });
+
+          // Mark URLs that were dropped due to ranking limit
+          filteredLinks.slice(MAX_RANKING_LIMIT).forEach(link => {
+            const trace = urlTraces.find(t => t.url === link.url);
+            if (trace) {
+              trace.warning = 'Excluded due to ranking limit';
+              trace.usedInCompletion = false;
+            }
+          });
        }

-      return mappedLinks.map((x) => x.url) as string[];
+        return mappedLinks.map((x) => x.url);
+      } catch (error) {
+        trace.status = 'error';
+        trace.error = error.message;
+        trace.usedInCompletion = false;
+        return [];
+      }
    } else {
      // Handle direct URLs without glob pattern
      if (!isUrlBlocked(url)) {
+        trace.usedInCompletion = true;
        return [url];
      }
+      trace.status = 'error';
+      trace.error = 'URL is blocked';
+      trace.usedInCompletion = false;
      return [];
    }
  });

  // Wait for all URL processing to complete and flatten results
  const processedUrls = await Promise.all(urlPromises);
-  const flattenedUrls = processedUrls.flat().filter((url) => url); // Filter out any null/undefined values
+  const flattenedUrls = processedUrls.flat().filter((url) => url);
  links.push(...flattenedUrls);

  if (links.length === 0) {
@ -189,13 +252,20 @@ export async function extractController(
      success: false,
      error:
        "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
+      urlTrace: urlTraces,
    });
  }

  // Scrape all links in parallel with retries
  const scrapePromises = links.map(async (url) => {
+    const trace = urlTraces.find((t) => t.url === url);
+    if (trace) {
+      trace.status = 'scraped';
+      trace.timing.scrapedAt = new Date().toISOString();
+    }
+
    const origin = req.body.origin || "api";
-    const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000; // Use 70% of total timeout for individual scrapes
+    const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000;
    const jobId = crypto.randomUUID();

    const jobPriority = await getJobPriority({
@ -204,6 +274,7 @@ export async function extractController(
      basePriority: 10,
    });

+    try {
      await addScrapeJob(
        {
          url,
@ -220,15 +291,28 @@ export async function extractController(
        jobPriority,
      );

-    try {
      const doc = await waitForJob<Document>(jobId, timeout);
      await getScrapeQueue().remove(jobId);
+
+      if (trace) {
+        trace.timing.completedAt = new Date().toISOString();
+        trace.contentStats = {
+          rawContentLength: doc.markdown?.length || 0,
+          processedContentLength: doc.markdown?.length || 0,
+          tokensUsed: 0, // Will be updated after LLM processing
+        };
+      }
+
      if (earlyReturn) {
        return null;
      }
      return doc;
    } catch (e) {
      logger.error(`Error in extractController: ${e}`);
+      if (trace) {
+        trace.status = 'error';
+        trace.error = e.message;
+      }
      return null;
    }
  });
@ -240,6 +324,7 @@ export async function extractController(
    return res.status(e.status).json({
      success: false,
      error: e.error,
+      urlTrace: urlTraces,
    });
  }

@ -256,9 +341,25 @@ export async function extractController(
    },
    docs.map((x) => buildDocument(x)).join("\n"),
    undefined,
-    true, // isExtractEndpoint
+    true,
  );

+  // Update token usage in URL traces
+  if (completions.numTokens) {
+    // Distribute tokens proportionally based on content length
+    const totalLength = docs.reduce((sum, doc) => sum + (doc.markdown?.length || 0), 0);
+    docs.forEach((doc) => {
+      if (doc.metadata?.sourceURL) {
+        const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL);
+        if (trace && trace.contentStats) {
+          trace.contentStats.tokensUsed = Math.floor(
+            ((doc.markdown?.length || 0) / totalLength) * completions.numTokens
+          );
+        }
+      }
+    });
+  }
+
  // TODO: change this later
  // While on beta, we're billing 5 credits per link discovered/scraped.
  billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(
@ -292,6 +393,7 @@ export async function extractController(
    data: data,
    scrape_id: id,
    warning: warning,
+    urlTrace: urlTraces,
  });
 }

--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@ -379,16 +379,16 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;

 export type Document = {
  markdown?: string;
-  extract?: any;
  html?: string;
  rawHtml?: string;
  links?: string[];
  screenshot?: string;
+  extract?: any;
+  warning?: string;
  actions?: {
    screenshots?: string[];
    scrapes?: ScrapeActionContent[];
  };
-  warning?: string;
  metadata: {
    title?: string;
    description?: string;
@ -425,7 +425,7 @@ export type Document = {
    error?: string;
    [key: string]: string | string[] | number | undefined;
  };
-};
+}

 export type ErrorResponse = {
  success: false;
@ -448,14 +448,33 @@ export interface ScrapeResponseRequestTest {
  error?: string;
 }

-export type ExtractResponse =
-  | ErrorResponse
-  | {
-      success: true;
-      warning?: string;
-      data: z.infer<typeof extractRequestSchema>;
-      scrape_id?: string;
+export interface URLTrace { // Per-URL record of what happened to one URL during an extract request
+  url: string; // Requested URL, or a URL discovered while mapping
+  status: 'mapped' | 'scraped' | 'error'; // 'mapped' when the trace is created; 'scraped' once its scrape is started; 'error' on mapping failure, blocked URL, or scrape failure
+  timing: { // Timestamps are produced with new Date().toISOString()
+    discoveredAt: string; // When the trace entry was created
+    scrapedAt?: string; // Set at the same moment status becomes 'scraped'
+    completedAt?: string; // Set after the scrape job result has been received
  };
+  error?: string; // Error message; the controller sets it together with status 'error'
+  warning?: string; // Why the URL was excluded (relevance score below threshold, or ranking limit)
+  contentStats?: { // Filled in only after a successful scrape
+    rawContentLength: number; // Length of the scraped document's markdown (0 when absent)
+    processedContentLength: number; // NOTE(review): currently assigned the same value as rawContentLength
+    tokensUsed: number; // Starts at 0; later set to a share of completion tokens proportional to markdown length
+  };
+  relevanceScore?: number; // Score copied from the link-ranking step, when the URL was ranked
+  usedInCompletion?: boolean; // True when the URL was selected as input for the completion
+}
+
+export interface ExtractResponse { // Response body of the extract endpoint. NOTE(review): replaces the former ErrorResponse | success union, so the type no longer ties `error`/`data` to the value of `success`
+  success: boolean; // false on the error responses, true otherwise
+  data?: any; // Extraction result returned on success
+  scrape_id?: string; // Identifier returned alongside the data on success
+  warning?: string; // Optional warning returned on success
+  error?: string; // Error message returned when success is false
+  urlTrace?: URLTrace[]; // Per-URL traces; the controller attaches them to both success and error responses
+}

 export interface ExtractResponseRequestTest {
  statusCode: number;