Nick: init

2025-06-22 14:26:44 +08:00 · 2024-12-26 12:21:46 -03:00 · 2024-12-26 12:21:46 -03:00 · f467a3ae6c
commit f467a3ae6c
parent c911aad228
2 changed files with 241 additions and 120 deletions
--- a/apps/api/src/controllers/v1/extract.ts
+++ b/apps/api/src/controllers/v1/extract.ts
@ -7,6 +7,7 @@ import {
  ExtractResponse,
  MapDocument,
  scrapeOptions,
+  URLTrace,
 } from "./types";
 // import { Document } from "../../lib/entities";
 import Redis from "ioredis";
@ -56,14 +57,22 @@ export async function extractController(
  let links: string[] = [];
  let docs: Document[] = [];
  const earlyReturn = false;
+  const urlTraces: URLTrace[] = [];

  // Process all URLs in parallel
  const urlPromises = req.body.urls.map(async (url) => {
+    const trace: URLTrace = {
+      url,
+      status: 'mapped',
+      timing: {
+        discoveredAt: new Date().toISOString(),
+      },
+    };
+    urlTraces.push(trace);
+
    if (url.includes("/*") || req.body.allowExternalLinks) {
      // Handle glob pattern URLs
      const baseUrl = url.replace("/*", "");
-      // const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
-
      const allowExternalLinks = req.body.allowExternalLinks;
      let urlWithoutWww = baseUrl.replace("www.", "");

@ -75,113 +84,167 @@ export async function extractController(
          )) ?? req.body.prompt;
      }

-      const mapResults = await getMapResults({
-        url: baseUrl,
-        search: rephrasedPrompt,
-        teamId: req.auth.team_id,
-        plan: req.auth.plan,
-        allowExternalLinks,
-        origin: req.body.origin,
-        limit: req.body.limit,
-        // If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
-        ignoreSitemap: false,
-        includeMetadata: true,
-        includeSubdomains: req.body.includeSubdomains,
-      });
+      try {
+        const mapResults = await getMapResults({
+          url: baseUrl,
+          search: rephrasedPrompt,
+          teamId: req.auth.team_id,
+          plan: req.auth.plan,
+          allowExternalLinks,
+          origin: req.body.origin,
+          limit: req.body.limit,
+          ignoreSitemap: false,
+          includeMetadata: true,
+          includeSubdomains: req.body.includeSubdomains,
+        });

-      let mappedLinks = mapResults.mapResults as MapDocument[];
+        let mappedLinks = mapResults.mapResults as MapDocument[];

-      // Remove duplicates between mapResults.links and mappedLinks
-      const allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
-      const uniqueUrls = removeDuplicateUrls(allUrls);
+        // Remove duplicates between mapResults.links and mappedLinks
+        const allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
+        const uniqueUrls = removeDuplicateUrls(allUrls);

-      // Only add URLs from mapResults.links that aren't already in mappedLinks
-      const existingUrls = new Set(mappedLinks.map((m) => m.url));
-      const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url));
-
-      mappedLinks = [
-        ...mappedLinks,
-        ...newUrls.map((url) => ({ url, title: "", description: "" })),
-      ];
-
-      if (mappedLinks.length === 0) {
-        mappedLinks = [{ url: baseUrl, title: "", description: "" }];
-      }
-
-      // Limit number of links to MAX_EXTRACT_LIMIT
-      mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
-
-      let mappedLinksRerank = mappedLinks.map(
-        (x) =>
-          `url: ${x.url}, title: ${x.title}, description: ${x.description}`,
-      );
-
-      if (req.body.prompt) {
-        let searchQuery =
-          req.body.prompt && allowExternalLinks
-            ? `${req.body.prompt} ${urlWithoutWww}`
-            : req.body.prompt
-              ? `${req.body.prompt} site:${urlWithoutWww}`
-              : `site:${urlWithoutWww}`;
-        // Get similarity scores between the search query and each link's context
-        const linksAndScores = await performRanking(
-          mappedLinksRerank,
-          mappedLinks.map((l) => l.url),
-          searchQuery,
-        );
-
-        // First try with high threshold
-        let filteredLinks = filterAndProcessLinks(
-          mappedLinks,
-          linksAndScores,
-          INITIAL_SCORE_THRESHOLD,
-        );
-
-        // If we don't have enough high-quality links, try with lower threshold
-        if (filteredLinks.length < MIN_REQUIRED_LINKS) {
-          logger.info(
-            `Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
-          );
-          filteredLinks = filterAndProcessLinks(
-            mappedLinks,
-            linksAndScores,
-            FALLBACK_SCORE_THRESHOLD,
-          );
-
-          if (filteredLinks.length === 0) {
-            // If still no results, take top N results regardless of score
-            logger.warn(
-              `No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`,
-            );
-            filteredLinks = linksAndScores
-              .sort((a, b) => b.score - a.score)
-              .slice(0, MIN_REQUIRED_LINKS)
-              .map((x) => mappedLinks.find((link) => link.url === x.link))
-              .filter(
-                (x): x is MapDocument =>
-                  x !== undefined &&
-                  x.url !== undefined &&
-                  !isUrlBlocked(x.url),
-              );
+        // Track all discovered URLs
+        uniqueUrls.forEach(discoveredUrl => {
+          if (!urlTraces.some(t => t.url === discoveredUrl)) {
+            urlTraces.push({
+              url: discoveredUrl,
+              status: 'mapped',
+              timing: {
+                discoveredAt: new Date().toISOString(),
+              },
+              usedInCompletion: false, // Default to false, will update if used
+            });
          }
+        });
+
+        // Only add URLs from mapResults.links that aren't already in mappedLinks
+        const existingUrls = new Set(mappedLinks.map((m) => m.url));
+        const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url));
+
+        mappedLinks = [
+          ...mappedLinks,
+          ...newUrls.map((url) => ({ url, title: "", description: "" })),
+        ];
+
+        if (mappedLinks.length === 0) {
+          mappedLinks = [{ url: baseUrl, title: "", description: "" }];
        }

-        mappedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
-      }
+        // Limit number of links to MAX_EXTRACT_LIMIT
+        mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);

-      return mappedLinks.map((x) => x.url) as string[];
+        let mappedLinksRerank = mappedLinks.map(
+          (x) =>
+            `url: ${x.url}, title: ${x.title}, description: ${x.description}`,
+        );
+
+        if (req.body.prompt) {
+          let searchQuery =
+            req.body.prompt && allowExternalLinks
+              ? `${req.body.prompt} ${urlWithoutWww}`
+              : req.body.prompt
+                ? `${req.body.prompt} site:${urlWithoutWww}`
+                : `site:${urlWithoutWww}`;
+          // Get similarity scores between the search query and each link's context
+          const linksAndScores = await performRanking(
+            mappedLinksRerank,
+            mappedLinks.map((l) => l.url),
+            searchQuery,
+          );
+
+          // First try with high threshold
+          let filteredLinks = filterAndProcessLinks(
+            mappedLinks,
+            linksAndScores,
+            INITIAL_SCORE_THRESHOLD,
+          );
+
+          // If we don't have enough high-quality links, try with lower threshold
+          if (filteredLinks.length < MIN_REQUIRED_LINKS) {
+            logger.info(
+              `Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
+            );
+            filteredLinks = filterAndProcessLinks(
+              mappedLinks,
+              linksAndScores,
+              FALLBACK_SCORE_THRESHOLD,
+            );
+
+            if (filteredLinks.length === 0) {
+              // If still no results, take top N results regardless of score
+              logger.warn(
+                `No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`,
+              );
+              filteredLinks = linksAndScores
+                .sort((a, b) => b.score - a.score)
+                .slice(0, MIN_REQUIRED_LINKS)
+                .map((x) => mappedLinks.find((link) => link.url === x.link))
+                .filter(
+                  (x): x is MapDocument =>
+                    x !== undefined &&
+                    x.url !== undefined &&
+                    !isUrlBlocked(x.url),
+                );
+            }
+          }
+
+          // Update URL traces with relevance scores and mark filtered out URLs
+          linksAndScores.forEach((score) => {
+            const trace = urlTraces.find((t) => t.url === score.link);
+            if (trace) {
+              trace.relevanceScore = score.score;
+              // If URL didn't make it through filtering, mark it as filtered out
+              if (!filteredLinks.some(link => link.url === score.link)) {
+                trace.warning = `Relevance score ${score.score} below threshold`;
+                trace.usedInCompletion = false;
+              }
+            }
+          });
+
+          mappedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
+          
+          // Mark URLs that will be used in completion
+          mappedLinks.forEach(link => {
+            const trace = urlTraces.find(t => t.url === link.url);
+            if (trace) {
+              trace.usedInCompletion = true;
+            }
+          });
+
+          // Mark URLs that were dropped due to ranking limit
+          filteredLinks.slice(MAX_RANKING_LIMIT).forEach(link => {
+            const trace = urlTraces.find(t => t.url === link.url);
+            if (trace) {
+              trace.warning = 'Excluded due to ranking limit';
+              trace.usedInCompletion = false;
+            }
+          });
+        }
+
+        return mappedLinks.map((x) => x.url);
+      } catch (error) {
+        trace.status = 'error';
+        trace.error = error.message;
+        trace.usedInCompletion = false;
+        return [];
+      }
    } else {
      // Handle direct URLs without glob pattern
      if (!isUrlBlocked(url)) {
+        trace.usedInCompletion = true;
        return [url];
      }
+      trace.status = 'error';
+      trace.error = 'URL is blocked';
+      trace.usedInCompletion = false;
      return [];
    }
  });

  // Wait for all URL processing to complete and flatten results
  const processedUrls = await Promise.all(urlPromises);
-  const flattenedUrls = processedUrls.flat().filter((url) => url); // Filter out any null/undefined values
+  const flattenedUrls = processedUrls.flat().filter((url) => url);
  links.push(...flattenedUrls);

  if (links.length === 0) {
@ -189,13 +252,20 @@ export async function extractController(
      success: false,
      error:
        "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
+      urlTrace: urlTraces,
    });
  }

  // Scrape all links in parallel with retries
  const scrapePromises = links.map(async (url) => {
+    const trace = urlTraces.find((t) => t.url === url);
+    if (trace) {
+      trace.status = 'scraped';
+      trace.timing.scrapedAt = new Date().toISOString();
+    }
+
    const origin = req.body.origin || "api";
-    const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000; // Use 70% of total timeout for individual scrapes
+    const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000;
    const jobId = crypto.randomUUID();

    const jobPriority = await getJobPriority({
@ -204,31 +274,45 @@ export async function extractController(
      basePriority: 10,
    });

-    await addScrapeJob(
-      {
-        url,
-        mode: "single_urls",
-        team_id: req.auth.team_id,
-        scrapeOptions: scrapeOptions.parse({}),
-        internalOptions: {},
-        plan: req.auth.plan!,
-        origin,
-        is_scrape: true,
-      },
-      {},
-      jobId,
-      jobPriority,
-    );
-
    try {
+      await addScrapeJob(
+        {
+          url,
+          mode: "single_urls",
+          team_id: req.auth.team_id,
+          scrapeOptions: scrapeOptions.parse({}),
+          internalOptions: {},
+          plan: req.auth.plan!,
+          origin,
+          is_scrape: true,
+        },
+        {},
+        jobId,
+        jobPriority,
+      );
+
      const doc = await waitForJob<Document>(jobId, timeout);
      await getScrapeQueue().remove(jobId);
+
+      if (trace) {
+        trace.timing.completedAt = new Date().toISOString();
+        trace.contentStats = {
+          rawContentLength: doc.markdown?.length || 0,
+          processedContentLength: doc.markdown?.length || 0,
+          tokensUsed: 0, // Will be updated after LLM processing
+        };
+      }
+
      if (earlyReturn) {
        return null;
      }
      return doc;
    } catch (e) {
      logger.error(`Error in extractController: ${e}`);
+      if (trace) {
+        trace.status = 'error';
+        trace.error = e.message;
+      }
      return null;
    }
  });
@ -240,6 +324,7 @@ export async function extractController(
    return res.status(e.status).json({
      success: false,
      error: e.error,
+      urlTrace: urlTraces,
    });
  }

@ -256,9 +341,25 @@ export async function extractController(
    },
    docs.map((x) => buildDocument(x)).join("\n"),
    undefined,
-    true, // isExtractEndpoint
+    true,
  );

+  // Update token usage in URL traces
+  if (completions.numTokens) {
+    // Distribute tokens proportionally based on content length
+    const totalLength = docs.reduce((sum, doc) => sum + (doc.markdown?.length || 0), 0);
+    docs.forEach((doc) => {
+      if (doc.metadata?.sourceURL) {
+        const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL);
+        if (trace && trace.contentStats) {
+          trace.contentStats.tokensUsed = Math.floor(
+            ((doc.markdown?.length || 0) / totalLength) * completions.numTokens
+          );
+        }
+      }
+    });
+  }
+
  // TODO: change this later
  // While on beta, we're billing 5 credits per link discovered/scraped.
  billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(
@ -292,6 +393,7 @@ export async function extractController(
    data: data,
    scrape_id: id,
    warning: warning,
+    urlTrace: urlTraces,
  });
 }

--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@ -379,16 +379,16 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;

 export type Document = {
  markdown?: string;
-  extract?: any;
  html?: string;
  rawHtml?: string;
  links?: string[];
  screenshot?: string;
+  extract?: any;
+  warning?: string;
  actions?: {
    screenshots?: string[];
    scrapes?: ScrapeActionContent[];
  };
-  warning?: string;
  metadata: {
    title?: string;
    description?: string;
@ -425,7 +425,7 @@ export type Document = {
    error?: string;
    [key: string]: string | string[] | number | undefined;
  };
-};
+}

 export type ErrorResponse = {
  success: false;
@ -448,14 +448,33 @@ export interface ScrapeResponseRequestTest {
  error?: string;
 }

-export type ExtractResponse =
-  | ErrorResponse
-  | {
-      success: true;
-      warning?: string;
-      data: z.infer<typeof extractRequestSchema>;
-      scrape_id?: string;
-    };
+export interface URLTrace { // Per-URL record of what happened to one URL during an extract request.
+  url: string; // The URL this trace describes.
+  status: 'mapped' | 'scraped' | 'error'; // extractController sets 'mapped' at discovery, 'scraped' when the scrape starts, 'error' on failure.
+  timing: { // ISO-8601 strings produced with Date#toISOString.
+    discoveredAt: string; // Taken when the trace object is created.
+    scrapedAt?: string; // Set just before the scrape job for this URL is enqueued.
+    completedAt?: string; // Set once the scrape job has returned a document.
+  };
+  error?: string; // Failure message; written together with status 'error'.
+  warning?: string; // Why the URL was excluded after ranking (score below threshold, or ranking limit).
+  contentStats?: { // Filled in after a successful scrape.
+    rawContentLength: number; // Length of the scraped markdown, 0 when there is none.
+    processedContentLength: number; // Currently assigned the same value as rawContentLength.
+    tokensUsed: number; // Starts at 0; may later receive a share of completion tokens proportional to content length.
+  };
+  relevanceScore?: number; // Score from performRanking; only assigned when the request carries a prompt.
+  usedInCompletion?: boolean; // Set by extractController: true for URLs kept for the completion, false for excluded ones.
+}
+
+export interface ExtractResponse { // JSON body of the extract endpoint; one shape covers success and failure.
+  success: boolean; // extractController sends false on its error responses.
+  data?: any; // Payload sent in the final response.
+  scrape_id?: string; // Identifier sent alongside data in the final response.
+  warning?: string; // Optional warning sent alongside data in the final response.
+  error?: string; // Error description sent together with success: false.
+  urlTrace?: URLTrace[]; // Per-URL traces; attached to both the error responses and the final response.
+}

 export interface ExtractResponseRequestTest {
  statusCode: number;