mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl, synced 2025-08-08 19:29:02 +08:00
Reapply "Nick:"

This reverts commit 4b4385c520c7223cf79ebba981dded8ffaefde11.

parent 4b4385c520
commit 56f048aeff
@@ -14,3 +14,26 @@ Provide a rephrased search query that:
 Return only the rephrased search query, without any explanation or additional text.`;
 }
 
+export function buildPreRerankPrompt(
+  prompt: string | undefined,
+  schema: any,
+  url: string,
+): string {
+  const schemaString = JSON.stringify(schema, null, 2);
+  return `Create a concise search query that combines the key data points from both the schema and prompt. Focus on the core information needed while keeping it general enough to find relevant matches.
+
+Schema: ${schemaString}
+Prompt: ${prompt}
+Website to get content from: ${url}
+
+Return only a concise sentence or two focused on the essential data points that the user wants to extract. This will be used by an LLM to determine how relevant the links that are present are to the user's request.`;
+}
+
+export function buildRerankerSystemPrompt(): string {
+  return "You are a relevance expert. Analyze the provided URLs and their content to determine their relevance to the user's query and intent. For each URL, assign a relevance score between 0 and 1, where 1 means highly relevant and 0 means not relevant at all. Only include URLs that are actually relevant to the query.";
+}
+
+export function buildRerankerUserPrompt(searchQuery: string): string {
+  return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`;
+}
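For orientation, a minimal sketch of how these new builders compose downstream; the sample schema, prompt, and URL are illustrative, not taken from the commit:

import { buildPreRerankPrompt, buildRerankerUserPrompt } from "./build-prompts";

// Hypothetical inputs for illustration only.
const schema = { type: "object", properties: { price: { type: "number" } } };
const prompt = "Get the price of each product";

// The pre-rerank prompt is sent to an LLM to produce a concise search query...
const preRerankPrompt = buildPreRerankPrompt(prompt, schema, "https://example.com");
// ...and that LLM answer is then embedded into the reranker's user prompt.
const rerankerPrompt = buildRerankerUserPrompt("product prices on example.com");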
@@ -1,10 +1,13 @@
 export const extractConfig = {
-  RERANKING:{
+  RERANKING: {
     MAX_INITIAL_RANKING_LIMIT: 1000,
     MAX_RANKING_LIMIT_FOR_RELEVANCE: 100,
     INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE: 0.75,
     FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE: 0.5,
     MIN_REQUIRED_LINKS: 1,
-  }
+  },
+  DEDUPLICATION: {
+    MAX_TOKENS: 4096,
+  },
 };
 export const CUSTOM_U_TEAMS = ["874d40cc-a5c0-4e93-b661-9ddfbad5e51e"];
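To make the RERANKING knobs concrete, a hedged sketch of how scored links are typically gated by these thresholds; the ScoredLink shape and the filterByRelevance helper are assumptions for illustration, not code from this commit:

import { extractConfig } from "./config";

// Assumed link shape for illustration; the real objects live elsewhere in the repo.
interface ScoredLink {
  url: string;
  relevanceScore: number;
}

function filterByRelevance(links: ScoredLink[]): ScoredLink[] {
  const {
    INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
    FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE,
    MIN_REQUIRED_LINKS,
  } = extractConfig.RERANKING;
  // Try the strict threshold first...
  let kept = links.filter((l) => l.relevanceScore >= INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE);
  // ...then relax to the fallback threshold if too few links survive.
  if (kept.length < MIN_REQUIRED_LINKS) {
    kept = links.filter((l) => l.relevanceScore >= FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE);
  }
  return kept;
}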
@@ -31,8 +31,9 @@ const openai = new OpenAI();
 import { ExtractStep, updateExtract } from "./extract-redis";
 import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array";
 import { mergeNullValObjs } from "./helpers/merge-null-val-objs";
-import { CUSTOM_U_TEAMS } from "./config";
+import { CUSTOM_U_TEAMS, extractConfig } from "./config";
 import { calculateFinalResultCost, estimateCost, estimateTotalCost } from "./usage/llm-cost";
+import { numTokensFromString } from "../LLM-extraction/helpers";
 
 interface ExtractServiceOptions {
   request: ExtractRequest;
@@ -673,11 +674,60 @@ export async function performExtraction(
   // }
   }
 
-  const finalResult = reqSchema
+  let finalResult = reqSchema
     ? await mixSchemaObjects(reqSchema, singleAnswerResult, multiEntityResult)
     : singleAnswerResult || multiEntityResult;
 
+  // Tokenize final result to get token count
+  let finalResultTokens = 0;
+  if (finalResult) {
+    const finalResultStr = JSON.stringify(finalResult);
+    finalResultTokens = numTokensFromString(finalResultStr, "gpt-4o");
+  }
+
+  // // Deduplicate and validate final result against schema
+  // if (reqSchema && finalResult && finalResult.length <= extractConfig.DEDUPLICATION.MAX_TOKENS) {
+  //   const schemaValidation = await generateOpenAICompletions(
+  //     logger.child({ method: "extractService/validateAndDeduplicate" }),
+  //     {
+  //       mode: "llm",
+  //       systemPrompt: `You are a data validator and deduplicator. Your task is to:
+  //       1. Remove any duplicate entries in the data extracted by merging that into a single object according to the provided schema
+  //       2. Ensure all data matches the provided schema
+  //       3. Keep only the highest quality and most complete entries when duplicates are found.
+
+  //       Do not change anything else. If data is null keep it null. If the schema is not provided, return the data as is.`,
+  //       prompt: `Please validate and merge the duplicate entries in this data according to the schema provided:\n
+
+  //       <start of extract data>
+
+  //       ${JSON.stringify(finalResult)}
+
+  //       <end of extract data>
+
+  //       <start of schema>
+
+  //       ${JSON.stringify(reqSchema)}
+
+  //       <end of schema>
+  //       `,
+  //       schema: reqSchema,
+  //     },
+  //     undefined,
+  //     undefined,
+  //     true,
+  //     "gpt-4o"
+  //   );
+  //   console.log("schemaValidation", schemaValidation);
+
+  //   console.log("schemaValidation", finalResult);
+
+  //   if (schemaValidation?.extract) {
+  //     tokenUsage.push(schemaValidation.totalUsage);
+  //     finalResult = schemaValidation.extract;
+  //   }
+  // }
+
   const totalTokensUsed = tokenUsage.reduce((a, b) => a + b.totalTokens, 0);
   const llmUsage = estimateTotalCost(tokenUsage);
   let tokensToBill = calculateFinalResultCost(finalResult);
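The numTokensFromString helper imported above isn't shown in this diff. A plausible implementation along these lines, using the js-tiktoken package (an assumption — the repo's actual helper may differ):

import { encodingForModel, TiktokenModel } from "js-tiktoken";

// Sketch of a numTokensFromString-style helper: encode the string with the
// tokenizer matching the target model and count the resulting tokens.
export function numTokensFromString(text: string, model: TiktokenModel): number {
  const encoder = encodingForModel(model);
  return encoder.encode(text).length;
}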
@@ -686,6 +736,8 @@ export async function performExtraction(
   if (CUSTOM_U_TEAMS.includes(teamId)) {
     tokensToBill = 1;
   }
+
+
   // Bill team for usage
   billTeam(teamId, subId, tokensToBill, logger, true).catch((error) => {
     logger.error(
@@ -6,6 +6,8 @@ import { CohereClient } from "cohere-ai";
 import { extractConfig } from "./config";
 import { searchSimilarPages } from "./index/pinecone";
 import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
+import { buildRerankerUserPrompt } from "./build-prompts";
+import { buildRerankerSystemPrompt } from "./build-prompts";
 
 const cohere = new CohereClient({
   token: process.env.COHERE_API_KEY,
@@ -191,6 +193,7 @@ export async function rerankLinksWithLLM(
     required: ["relevantLinks"]
   };
 
+
   const results = await Promise.all(
     chunks.map(async (chunk, chunkIndex) => {
       // console.log(`Processing chunk ${chunkIndex + 1}/${chunks.length} with ${chunk.length} links`);
@@ -205,12 +208,13 @@ export async function rerankLinksWithLLM(
         setTimeout(() => resolve(null), TIMEOUT_MS);
       });
 
+
       const completionPromise = generateOpenAICompletions(
         logger.child({ method: "rerankLinksWithLLM", chunk: chunkIndex + 1, retry }),
         {
           mode: "llm",
-          systemPrompt: "You are a search relevance expert. Analyze the provided URLs and their content to determine their relevance to the search query. For each URL, assign a relevance score between 0 and 1, where 1 means highly relevant and 0 means not relevant at all. Only include URLs that are actually relevant to the query.",
-          prompt: `Given these URLs and their content, identify which ones are relevant to the search query: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the search query. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relvancy score of 0.6+.`,
+          systemPrompt: buildRerankerSystemPrompt(),
+          prompt: buildRerankerUserPrompt(searchQuery),
           schema: schema
         },
         linksContent,
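The timeout promise at the top of this hunk is one half of a race against the LLM call. For clarity, a sketch of how the pattern presumably completes just below the excerpt (TIMEOUT_MS, completionPromise, and chunkIndex come from the diff; the race and the warn call are assumptions about the elided code):

// Resolve to null if the LLM call takes longer than TIMEOUT_MS.
const timeoutPromise = new Promise<null>((resolve) => {
  setTimeout(() => resolve(null), TIMEOUT_MS);
});

// Whichever settles first wins; a null result means the chunk timed out
// and can be retried or skipped without failing the whole rerank.
const completion = await Promise.race([completionPromise, timeoutPromise]);
if (completion === null) {
  logger.warn("Reranker chunk timed out", { chunk: chunkIndex + 1 });
}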
@@ -4,7 +4,7 @@ import { PlanType } from "../../types";
 import { removeDuplicateUrls } from "../validateUrl";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { generateBasicCompletion } from "../LLM-extraction";
-import { buildRefrasedPrompt } from "./build-prompts";
+import { buildPreRerankPrompt, buildRefrasedPrompt } from "./build-prompts";
 import { rerankLinksWithLLM } from "./reranker";
 import { extractConfig } from "./config";
 import { updateExtract } from "./extract-redis";
@@ -50,9 +50,9 @@ export async function processUrl(
   const baseUrl = options.url.replace("/*", "");
   let urlWithoutWww = baseUrl.replace("www.", "");
 
-  let rephrasedPrompt = options.prompt;
+  let searchQuery = options.prompt;
   if (options.prompt) {
-    rephrasedPrompt =
+    searchQuery =
       (
         await generateBasicCompletion(
           buildRefrasedPrompt(options.prompt, baseUrl),
@@ -65,7 +65,7 @@ export async function processUrl(
   try {
     const mapResults = await getMapResults({
       url: baseUrl,
-      search: rephrasedPrompt,
+      search: searchQuery,
       teamId: options.teamId,
       plan: options.plan,
       allowExternalLinks: options.allowExternalLinks,
@@ -160,46 +160,38 @@ export async function processUrl(
       extractConfig.RERANKING.MAX_INITIAL_RANKING_LIMIT,
     );
 
     updateExtractCallback(mappedLinks.map((x) => x.url));
 
-    // Perform reranking using either prompt or schema
-    let searchQuery = "";
-    if (options.prompt) {
-      searchQuery = options.allowExternalLinks
-        ? `${options.prompt} ${urlWithoutWww}`
-        : `${options.prompt} site:${urlWithoutWww}`;
-    } else if (options.schema) {
-      // Generate search query from schema using basic completion
-      try {
-        const schemaString = JSON.stringify(options.schema, null, 2);
-        const prompt = `Given this JSON schema, generate a natural language search query that would help find relevant pages containing this type of data. Focus on the key properties and their descriptions and keep it very concise. Schema: ${schemaString}`;
-
-        searchQuery =
-          (await generateBasicCompletion(prompt)) ??
-          "Extract the data according to the schema: " + schemaString;
-
-        if (options.allowExternalLinks) {
-          searchQuery = `${searchQuery} ${urlWithoutWww}`;
-        } else {
-          searchQuery = `${searchQuery} site:${urlWithoutWww}`;
-        }
-      } catch (error) {
-        console.error("Error generating search query from schema:", error);
-        searchQuery = urlWithoutWww; // Fallback to just the domain
-      }
-    } else {
-      searchQuery = urlWithoutWww;
-    }
+    let rephrasedPrompt = options.prompt ?? searchQuery;
+    try {
+      rephrasedPrompt =
+        (await generateBasicCompletion(
+          buildPreRerankPrompt(rephrasedPrompt, options.schema, baseUrl),
+        )) ??
+        "Extract the data according to the schema: " +
+          JSON.stringify(options.schema, null, 2);
+    } catch (error) {
+      console.error("Error generating search query from schema:", error);
+      rephrasedPrompt =
+        "Extract the data according to the schema: " +
+        JSON.stringify(options.schema, null, 2) +
+        " " +
+        options?.prompt; // Fallback to just the domain
+    }
 
     // dumpToFile(
     //   "mapped-links.txt",
     //   mappedLinks,
     //   (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
     // );
 
-    const rerankerResult = await rerankLinksWithLLM(mappedLinks, searchQuery, urlTraces);
+    console.log("search query: ", rephrasedPrompt);
+
+    const rerankerResult = await rerankLinksWithLLM(
+      mappedLinks,
+      rephrasedPrompt,
+      urlTraces,
+    );
     mappedLinks = rerankerResult.mapDocument;
     let tokensUsed = rerankerResult.tokensUsed;
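In short, the commit collapses the old prompt-vs-schema branching into a single pre-rerank completion. A hedged sketch of the resulting control flow, using the helper names from the diff (generateBasicCompletion's exact return type is assumed to be string | null, inferred from the ?? fallback):

// Simplified view of the new flow: one LLM call builds the rerank query,
// with a deterministic schema-string fallback on a null result or an error.
async function buildRerankQuery(
  prompt: string | undefined,
  schema: any,
  baseUrl: string,
): Promise<string> {
  const fallback =
    "Extract the data according to the schema: " + JSON.stringify(schema, null, 2);
  try {
    // A null completion falls through to the schema string via ??.
    return (
      (await generateBasicCompletion(buildPreRerankPrompt(prompt, schema, baseUrl))) ??
      fallback
    );
  } catch {
    // A thrown error lands here, mirroring the diff's catch branch.
    return fallback + " " + prompt;
  }
}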
@@ -207,7 +199,7 @@ export async function processUrl(
     if (mappedLinks.length > 100) {
       const rerankerResult = await rerankLinksWithLLM(
         mappedLinks,
-        searchQuery,
+        rephrasedPrompt,
         urlTraces,
       );
       mappedLinks = rerankerResult.mapDocument;
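One observation on this second pass: the hard-coded 100 duplicates MAX_RANKING_LIMIT_FOR_RELEVANCE from the config shown earlier, so the guard could presumably read the constant instead. A sketch of that variant (a suggestion, not part of the commit):

import { extractConfig } from "./config";

// Re-rerank only when the first pass still leaves more links than the
// relevance-ranking limit, using the config constant instead of a literal 100.
if (mappedLinks.length > extractConfig.RERANKING.MAX_RANKING_LIMIT_FOR_RELEVANCE) {
  const rerankerResult = await rerankLinksWithLLM(mappedLinks, rephrasedPrompt, urlTraces);
  mappedLinks = rerankerResult.mapDocument;
}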
@@ -72,13 +72,13 @@ export async function generateOpenAICompletions(
   markdown?: string,
   previousWarning?: string,
   isExtractEndpoint?: boolean,
-): Promise<{ extract: any; numTokens: number; warning: string | undefined; totalUsage: TokenUsage }> {
+  model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini",
+): Promise<{ extract: any; numTokens: number; warning: string | undefined; totalUsage: TokenUsage, model: string }> {
   let extract: any;
   let warning: string | undefined;
 
   const openai = new OpenAI();
-  const model: TiktokenModel =
-    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
 
 
   if (markdown === undefined) {
     throw new Error("document.markdown is undefined -- this is unexpected");
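The practical effect is that callers can now pin a model per call instead of relying on the MODEL_NAME env default. A minimal sketch of both call styles, with argument positions taken from the commented-out validation call earlier in this commit (logger, options, and markdown stand in for real values):

// Env default ("gpt-4o-mini" unless MODEL_NAME is set) when model is omitted:
const withDefault = await generateOpenAICompletions(logger, options, markdown, undefined, true);

// Explicit per-call override, as in the validation call in this commit:
const withOverride = await generateOpenAICompletions(logger, options, undefined, undefined, true, "gpt-4o");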
@@ -197,6 +197,7 @@ export async function generateOpenAICompletions(
           jsonCompletion.choices[0].message.content,
         );
         extract = options.schema ? extractData.data.extract : extractData;
+        console.log("extract", extract);
       }
     } catch (e) {
       logger.error("Failed to parse returned JSON, no schema specified.", {
@@ -222,7 +223,7 @@ export async function generateOpenAICompletions(
   }
   // num tokens (just user prompt tokenized) | deprecated
   // totalTokens = promptTokens + completionTokens
-  return { extract, warning, numTokens, totalUsage: { promptTokens, completionTokens, totalTokens: promptTokens + completionTokens, model: model } };
+  return { extract, warning, numTokens, totalUsage: { promptTokens, completionTokens, totalTokens: promptTokens + completionTokens }, model };
 }
 
 export async function performLLMExtract(
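The return-shape change moves model from inside totalUsage to the top level of the result. A sketch of the before/after shapes; the TokenUsage definition is assumed, since it isn't in the diff:

// Assumed shape of the usage record; the real definition lives elsewhere.
interface TokenUsage {
  promptTokens: number;
  completionTokens: number;
  totalTokens: number;
  model?: string; // before: populated here; after: surfaced on the result instead
}

// After this commit, callers read the model alongside the usage record:
// const { extract, totalUsage, model } = await generateOpenAICompletions(...);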
@@ -333,8 +333,12 @@ const processExtractJobInternal = async (
       },
     });
 
-    // Move job to failed state in Redis
-    await job.moveToFailed(error, token, false);
+    try {
+      // Move job to failed state in Redis
+      await job.moveToFailed(error, token, false);
+    } catch (e) {
+      logger.log("Failed to move job to failed state in Redis", { error });
+    }
 
     await updateExtract(job.data.extractId, {
       status: "failed",
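The guard matters because BullMQ's job.moveToFailed can itself throw (for example, when the job's lock has already been lost), which previously propagated out of the error handler before the extract record was updated. A minimal sketch of the pattern in isolation (the BullMQ Job type is real; the helper and logger shape are illustrative):

import { Job } from "bullmq";

// Never let the failure-bookkeeping path throw: if moving the job to the
// failed state errors out, log it and continue so the extract record
// can still be marked as failed afterwards.
async function safeMoveToFailed(
  job: Job,
  error: Error,
  token: string,
  logger: { log: (msg: string, meta?: object) => void },
): Promise<void> {
  try {
    await job.moveToFailed(error, token, false);
  } catch (e) {
    logger.log("Failed to move job to failed state in Redis", { error });
  }
}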