(feat/extract) Refactor and Reranker improvements (#1100)

* Reapply "Nick: extract api reference"

This reverts commit 61d7ba76f76ce74e0d230f89a93436f29dc8d9df.

* Nick: refactor analyzer

* Nick: formatting

* Nick:

* Update extraction-service.ts

* Nick: fixes

* Nick:

* Nick: wip

* Nick: reverted to the old re-ranker

* Nick:

* Update extract-status.ts
This commit is contained in:
Nicolas 2025-01-27 20:07:01 -03:00 committed by GitHub
parent ad06cde422
commit 6b9e65c4f6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 588 additions and 239 deletions

View File

@ -30,6 +30,7 @@ export async function extractStatusController(
data = jobData[0].docs;
}
// console.log(extract.sources);
return res.status(200).json({
success: extract.status === "failed" ? false : true,
data: data,
@ -38,5 +39,6 @@ export async function extractStatusController(
expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(),
steps: extract.showSteps ? extract.steps : undefined,
llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
// sources: extract.sources,
});
}

View File

@ -537,6 +537,7 @@ export interface URLTrace {
};
relevanceScore?: number;
usedInCompletion?: boolean;
extractedFields?: string[];
}
export interface ExtractResponse {
@ -547,6 +548,9 @@ export interface ExtractResponse {
id?: string;
warning?: string;
urlTrace?: URLTrace[];
sources?: {
[key: string]: string[];
};
}
export interface ExtractResponseRequestTest {

View File

@ -1023,4 +1023,160 @@ describe("mixSchemaObjects function", () => {
expect(finalResult).toEqual(singleAnswerResult);
});
// Verifies mixSchemaObjects keeps the single-answer payload intact when the
// multi-entity side of the extraction produced an empty object.
it("should handle empty objects correctly (id: 30)", async () => {
  const originalSchema = {
    type: "object",
    properties: {
      business_details: {
        type: "object",
        properties: {
          name: { type: "string" },
          years_in_operation: { type: "string" },
          services_offered: {
            type: "array",
            items: { type: "string" },
          },
          experience_highlights: { type: "string" },
        },
        required: ["name"],
      },
      management: {
        type: "object",
        properties: {
          owner_name: { type: "string" },
          credentials: {
            type: "array",
            items: { type: "string" },
          },
        },
      },
      contact_information: {
        type: "object",
        properties: {
          address: { type: "string" },
          phone: { type: "string" },
        },
      },
      reputation: {
        type: "object",
        properties: {
          client_feedback: { type: "string" },
          operational_quality: { type: "string" },
        },
      },
    },
    required: ["business_details"],
  };
  const singleAnswerResult = {
    business_details: {
      name: "Red Hill Mobility Group",
      years_in_operation: "12 years",
      services_offered: [
        "Recovery equipment for military",
        "Vehicle mobility solutions",
        "Product development for military vehicles",
      ],
      experience_highlights:
        "More than 12 years of combined experience overseas on over 25 active combat deployments.",
    },
    management: {
      owner_name: "",
      credentials: [],
    },
    contact_information: {
      address: "659 Shell Drive, Spring Lake, NC 28390",
      phone: "910-638-7836",
    },
    reputation: {
      client_feedback: "",
      operational_quality: "",
    },
  };
  // Fix: `multiEntityResult` was declared but an inline `{}` literal was
  // passed to mixSchemaObjects, leaving the variable unused. Pass the
  // declared (identical) value so intent and usage agree.
  const multiEntityResult = {};
  const finalResult = await mixSchemaObjects(
    originalSchema,
    singleAnswerResult,
    multiEntityResult,
  );
  expect(finalResult).toEqual(singleAnswerResult);
});
// Same fixture as the previous case: the single-answer result must pass
// through mixSchemaObjects unchanged when there is no multi-entity data.
// NOTE(review): the test name says "multi entity is undefined" but an empty
// object `{}` is passed — confirm whether `undefined` was the intended input.
it("should return single answer result when multi entity is undefined", async () => {
const originalSchema = {
type: "object",
properties: {
business_details: {
type: "object",
properties: {
name: { type: "string" },
years_in_operation: { type: "string" },
services_offered: {
type: "array",
items: { type: "string" }
},
experience_highlights: { type: "string" }
},
required: ["name"]
},
management: {
type: "object",
properties: {
owner_name: { type: "string" },
credentials: {
type: "array",
items: { type: "string" }
}
}
},
contact_information: {
type: "object",
properties: {
address: { type: "string" },
phone: { type: "string" }
}
},
reputation: {
type: "object",
properties: {
client_feedback: { type: "string" },
operational_quality: { type: "string" }
}
}
},
required: ["business_details"]
};
// Fully-populated single-answer extraction result matching the schema above.
const singleAnswerResult = {
business_details: {
name: "Red Hill Mobility Group",
years_in_operation: "12 years",
services_offered: [
"Recovery equipment for military",
"Vehicle mobility solutions",
"Product development for military vehicles"
],
experience_highlights: "More than 12 years of combined experience overseas on over 25 active combat deployments."
},
management: {
owner_name: "",
credentials: []
},
contact_information: {
address: "659 Shell Drive, Spring Lake, NC 28390",
phone: "910-638-7836"
},
reputation: {
client_feedback: "",
operational_quality: ""
}
};
// `{}` stands in for the absent multi-entity result.
const finalResult = await mixSchemaObjects(originalSchema, singleAnswerResult, {});
expect(finalResult).toEqual(singleAnswerResult);
});
});

View File

@ -40,5 +40,68 @@ to determine their relevance to the user's query and intent.
}
/**
 * Builds the user prompt for the LLM link reranker.
 *
 * Fix: the previous revision's shorter prompt was left in place as a first
 * `return`, making the improved detailed prompt below it unreachable dead
 * code (a merge/refactor artifact). Only the detailed prompt is kept.
 *
 * @param searchQuery - The user's (possibly rephrased) extraction intent.
 * @returns The prompt instructing the model to score links 0-1 and keep
 *          only those scoring 0.6 or higher.
 */
export function buildRerankerUserPrompt(searchQuery: string): string {
  return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relvancy score of 0.6+.`;
}
// Multi entity schema analyzer
/**
 * System prompt for the schema/prompt classifier used by
 * analyzeSchemaAndPrompt: the model must label an extraction request as
 * Single-Answer (one result across few pages) or Multi-Entity (many items
 * across many pages) and report which schema keys hold large arrays.
 */
export function buildAnalyzeSchemaPrompt(): string {
return `You are a query classifier for a web scraping system. Classify the data extraction query as either:
A) Single-Answer: One answer across a few pages, possibly containing small arrays.
B) Multi-Entity: Many items across many pages, often involving large arrays.
Consider:
1. Answer Cardinality: Single or multiple items?
2. Page Distribution: Found on 1-3 pages or many?
3. Verification Needs: Cross-page verification or independent extraction?
Provide:
- Method: [Single-Answer/Multi-Entity]
- Confidence: [0-100%]
- Reasoning: Why this classification?
- Key Indicators: Specific aspects leading to this decision.
Examples:
- "Is this company a non-profit?" -> Single-Answer
- "Extract all product prices" -> Multi-Entity
For Single-Answer, arrays may be present but are typically small. For Multi-Entity, if arrays have multiple items not from a single page, return keys with large arrays. If nested, return the full key (e.g., 'ecommerce.products').`;
}
/**
 * User prompt for the schema classifier: bundles the serialized schema, the
 * user's extraction prompt, and the candidate URLs.
 *
 * @param schemaString - JSON-stringified extraction schema.
 * @param prompt - The user's extraction prompt.
 * @param urls - Candidate URLs; interpolated directly, so the array renders
 *               via its default comma-separated string form.
 */
export function buildAnalyzeSchemaUserPrompt(
schemaString: string,
prompt: string,
urls: string[],
): string {
return `Classify the query as Single-Answer or Multi-Entity. For Multi-Entity, return keys with large arrays; otherwise, return none:
Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`;
}
// Should Extract
/**
 * System prompt for the yes/no relevance gate: the model must decide whether
 * a page's content is worth passing to the extractor at all.
 */
export function buildShouldExtractSystemPrompt(): string {
return `You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.`;
}
/**
 * User prompt for the relevance gate, asking for a bare true/false answer.
 *
 * @param prompt - The user's extraction prompt.
 * @param schema - The extraction schema; serialized inline for context.
 */
export function buildShouldExtractUserPrompt(
prompt: string,
schema: any,
): string {
return `Should the following content be used to extract information for this prompt: "${prompt}" User schema is: ${JSON.stringify(schema)}\nReturn only true or false.`;
}
// Batch extract
/**
 * System prompt for multi-entity (batch) extraction. Prepends any caller
 * system prompt, instructs the model to return null for irrelevant
 * documents, and lists the user-provided URLs for context.
 *
 * @param systemPrompt - Optional caller-supplied system prompt; prefixed
 *                       with a newline when present.
 * @param multiEntitySchema - Target schema; serialized inline so the model
 *                            can judge relevance against it.
 * @param links - The URLs the user wants information extracted from.
 */
export function buildBatchExtractSystemPrompt(
systemPrompt: string,
multiEntitySchema: any,
links: string[],
): string {
return (
(systemPrompt ? `${systemPrompt}\n` : "") +
`Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. Here are the urls the user provided of which he wants to extract information from: ` +
links.join(", ")
);
}
/**
 * Prefixes the user's extraction prompt with the current ISO-8601 timestamp
 * so the model can resolve relative dates ("today", "this year", ...).
 */
export function buildBatchExtractPrompt(prompt: string): string {
  const today = new Date().toISOString();
  return `Today is: ${today}\n${prompt}`;
}

View File

@ -0,0 +1,94 @@
import { generateSchemaFromPrompt } from "../../../scraper/scrapeURL/transformers/llmExtract";
import { TokenUsage } from "../../../controllers/v1/types";
import { z } from "zod";
import {
buildAnalyzeSchemaPrompt,
buildAnalyzeSchemaUserPrompt,
} from "../build-prompts";
import OpenAI from "openai";
const openai = new OpenAI();
/**
 * Classifies an extraction request as Single-Answer vs Multi-Entity via an
 * OpenAI structured-output call, and reports which schema keys hold large
 * arrays (full dotted paths for nested keys).
 *
 * If no schema is provided, one is first generated from the prompt with
 * generateSchemaFromPrompt. The model response is validated with zod before
 * being returned.
 *
 * @param urls - Candidate URLs, included in the user prompt for context.
 * @param schema - Extraction schema; generated from `prompt` when falsy.
 * @param prompt - The user's extraction prompt.
 * @returns Classification flags, multi-entity keys, model reasoning, key
 *          indicators, and the token usage of the classification call.
 */
export async function analyzeSchemaAndPrompt(
urls: string[],
schema: any,
prompt: string,
): Promise<{
isMultiEntity: boolean;
multiEntityKeys: string[];
reasoning?: string;
keyIndicators?: string[];
tokenUsage: TokenUsage;
}> {
// Fall back to an LLM-generated schema when the caller supplied none.
if (!schema) {
schema = await generateSchemaFromPrompt(prompt);
}
const schemaString = JSON.stringify(schema);
// Runtime validation of the model output; the refine enforces that a
// Multi-Entity classification always names at least one key.
const checkSchema = z
.object({
isMultiEntity: z.boolean(),
multiEntityKeys: z.array(z.string()).optional().default([]),
reasoning: z.string(),
keyIndicators: z.array(z.string()),
})
.refine(
(x) => !x.isMultiEntity || x.multiEntityKeys.length > 0,
"isMultiEntity was true, but no multiEntityKeys",
);
const model = "gpt-4o";
const result = await openai.beta.chat.completions.parse({
model: model,
messages: [
{
role: "system",
content: buildAnalyzeSchemaPrompt(),
},
{
role: "user",
content: buildAnalyzeSchemaUserPrompt(schemaString, prompt, urls),
},
],
// JSON-schema response format mirroring checkSchema so the API returns a
// parseable structured object.
response_format: {
type: "json_schema",
json_schema: {
schema: {
type: "object",
properties: {
isMultiEntity: { type: "boolean" },
multiEntityKeys: { type: "array", items: { type: "string" } },
reasoning: { type: "string" },
keyIndicators: { type: "array", items: { type: "string" } },
},
required: [
"isMultiEntity",
"multiEntityKeys",
"reasoning",
"keyIndicators",
],
additionalProperties: false,
},
name: "checkSchema",
},
},
});
// NOTE(review): message.parsed may be null (e.g. on refusal), in which case
// checkSchema.parse throws a ZodError — confirm this is the intended
// failure mode.
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
checkSchema.parse(result.choices[0].message.parsed);
// Token accounting for this call; zeros when the API omits usage data.
const tokenUsage: TokenUsage = {
promptTokens: result.usage?.prompt_tokens ?? 0,
completionTokens: result.usage?.completion_tokens ?? 0,
totalTokens: result.usage?.total_tokens ?? 0,
model: model,
};
return {
isMultiEntity,
multiEntityKeys,
reasoning,
keyIndicators,
tokenUsage,
};
}

View File

@ -0,0 +1,58 @@
import { logger } from "../../../lib/logger";
import { generateOpenAICompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "../build-document";
import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types";
import { Document } from "../../../controllers/v1/types";
import {
buildBatchExtractPrompt,
buildBatchExtractSystemPrompt,
} from "../build-prompts";
/**
 * Extracts information from a single document against a multi-entity schema,
 * as one unit of a batched multi-entity extraction.
 *
 * @param multiEntitySchema - The schema for the multi-entity extraction.
 * @param links - The user-provided URLs, listed in the system prompt for context.
 * @param prompt - The user's extraction prompt.
 * @param systemPrompt - Optional caller system prompt, prepended when present.
 * @param doc - The scraped document whose content is extracted from.
 * @returns The extracted object, token counts/usage, and the document's URL
 *          as the single source entry.
 */
export async function batchExtractPromise(
multiEntitySchema: any,
links: string[],
prompt: string,
systemPrompt: string,
doc: Document,
): Promise<{
extract: any;
numTokens: number;
totalUsage: TokenUsage;
warning?: string;
sources: string[];
}> {
const completion = await generateOpenAICompletions(
logger.child({
method: "extractService/generateOpenAICompletions",
}),
{
mode: "llm",
systemPrompt: buildBatchExtractSystemPrompt(
systemPrompt,
multiEntitySchema,
links,
),
prompt: buildBatchExtractPrompt(prompt),
schema: multiEntitySchema,
},
buildDocument(doc),
undefined,
true,
);
// NOTE(review): `warning` is declared in the return type but never
// populated here — confirm whether the completion's warning should be
// forwarded.
return {
extract: completion.extract,
numTokens: completion.numTokens,
totalUsage: completion.totalUsage,
// Source attribution: the URL (or sourceURL fallback) of this document.
sources: [doc.metadata.url || doc.metadata.sourceURL || ""]
};
}

View File

@ -0,0 +1,40 @@
import { logger } from "../../../lib/logger";
import { buildDocument } from "../build-document";
import { Document, TokenUsage } from "../../../controllers/v1/types";
import { generateOpenAICompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
import {
buildShouldExtractSystemPrompt,
buildShouldExtractUserPrompt,
} from "../build-prompts";
/**
 * Asks the LLM whether a document's content is relevant enough to run the
 * actual extraction on.
 *
 * @param prompt - The user's extraction prompt.
 * @param multiEntitySchema - The target schema, shown to the model for context.
 * @param doc - The scraped document to judge.
 * @returns The boolean verdict and the token usage of the relevance check.
 */
export async function checkShouldExtract(
  prompt: string,
  multiEntitySchema: any,
  doc: Document,
): Promise<{ tokenUsage: TokenUsage; extract: boolean }> {
  // Minimal boolean-only response schema so the model must answer yes/no.
  const verdictSchema = {
    type: "object",
    properties: {
      extract: {
        type: "boolean",
      },
    },
    required: ["extract"],
  };

  const completion = await generateOpenAICompletions(
    logger.child({ method: "extractService/checkShouldExtract" }),
    {
      mode: "llm",
      systemPrompt: buildShouldExtractSystemPrompt(),
      prompt: buildShouldExtractUserPrompt(prompt, multiEntitySchema),
      schema: verdictSchema,
    },
    buildDocument(doc),
    undefined,
    true,
  );

  return {
    tokenUsage: completion.totalUsage,
    extract: completion.extract["extract"],
  };
}

View File

@ -0,0 +1,43 @@
import { logger } from "../../../lib/logger";
import { generateOpenAICompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "../build-document";
import { Document, TokenUsage } from "../../../controllers/v1/types";
/**
 * Runs a single-answer extraction over the concatenated content of all
 * candidate documents in one LLM call.
 *
 * @returns The extracted object, the token usage of the completion, and the
 *          source URL of every document fed to the model.
 */
export async function singleAnswerCompletion({
  singleAnswerDocs,
  rSchema,
  links,
  prompt,
  systemPrompt,
}: {
  singleAnswerDocs: Document[];
  rSchema: any;
  links: string[];
  prompt: string;
  systemPrompt: string;
}): Promise<{
  extract: any;
  tokenUsage: TokenUsage;
  sources: string[];
}> {
  // All documents are concatenated into a single context window.
  const combinedContent = singleAnswerDocs
    .map((d) => buildDocument(d))
    .join("\n");

  // Caller system prompt (if any) followed by the fixed extraction rules and
  // the list of user-provided URLs.
  const fullSystemPrompt =
    (systemPrompt ? `${systemPrompt}\n` : "") +
    "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Return 'null' the property that you don't find the information. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
    links.join(", ");

  const response = await generateOpenAICompletions(
    logger.child({ module: "extract", method: "generateOpenAICompletions" }),
    {
      mode: "llm",
      systemPrompt: fullSystemPrompt,
      prompt: `Today is: ${new Date().toISOString()}\n${prompt}`,
      schema: rSchema,
    },
    combinedContent,
    undefined,
    true,
  );

  // Every input document counts as a source for the single answer.
  const sourceUrls = singleAnswerDocs.map(
    (d) => d.metadata.url || d.metadata.sourceURL || "",
  );

  return {
    extract: response.extract,
    tokenUsage: response.totalUsage,
    sources: sourceUrls,
  };
}

View File

@ -32,6 +32,9 @@ export type StoredExtract = {
steps?: ExtractedStep[];
showLLMUsage?: boolean;
llmUsage?: number;
sources?: {
[key: string]: string[];
};
};
// Reduce TTL to 6 hours instead of 24

View File

@ -2,7 +2,6 @@ import {
Document,
ExtractRequest,
TokenUsage,
toLegacyCrawlerOptions,
URLTrace,
} from "../../controllers/v1/types";
import { PlanType } from "../../types";
@ -13,31 +12,28 @@ import {
generateOpenAICompletions,
generateSchemaFromPrompt,
} from "../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "./build-document";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
import { saveCrawl, StoredCrawl } from "../crawl-redis";
import { dereferenceSchema } from "./helpers/dereference-schema";
import { z } from "zod";
import OpenAI from "openai";
import { spreadSchemas } from "./helpers/spread-schemas";
import { transformArrayToObject } from "./helpers/transform-array-to-obj";
import { mixSchemaObjects } from "./helpers/mix-schema-objs";
import Ajv from "ajv";
const ajv = new Ajv();
const openai = new OpenAI();
import { ExtractStep, updateExtract } from "./extract-redis";
import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array";
import { mergeNullValObjs } from "./helpers/merge-null-val-objs";
import { CUSTOM_U_TEAMS, extractConfig } from "./config";
import { CUSTOM_U_TEAMS } from "./config";
import {
calculateFinalResultCost,
estimateCost,
estimateTotalCost,
} from "./usage/llm-cost";
import { numTokensFromString } from "../LLM-extraction/helpers";
import { analyzeSchemaAndPrompt } from "./completions/analyzeSchemaAndPrompt";
import { checkShouldExtract } from "./completions/checkShouldExtract";
import { batchExtractPromise } from "./completions/batchExtract";
import { singleAnswerCompletion } from "./completions/singleAnswer";
interface ExtractServiceOptions {
request: ExtractRequest;
@ -56,108 +52,7 @@ interface ExtractResult {
tokenUsageBreakdown?: TokenUsage[];
llmUsage?: number;
totalUrlsScraped?: number;
}
async function analyzeSchemaAndPrompt(
urls: string[],
schema: any,
prompt: string,
): Promise<{
isMultiEntity: boolean;
multiEntityKeys: string[];
reasoning?: string;
keyIndicators?: string[];
tokenUsage: TokenUsage;
}> {
if (!schema) {
schema = await generateSchemaFromPrompt(prompt);
}
const schemaString = JSON.stringify(schema);
const checkSchema = z.object({
isMultiEntity: z.boolean(),
multiEntityKeys: z.array(z.string()).optional().default([]),
reasoning: z.string(),
keyIndicators: z.array(z.string()),
}).refine(x => !x.isMultiEntity || x.multiEntityKeys.length > 0, "isMultiEntity was true, but no multiEntityKeys");
const model = "gpt-4o";
const result = await openai.beta.chat.completions.parse({
model: model,
messages: [
{
role: "system",
content: `
You are a query classifier for a web scraping system. Classify the data extraction query as either:
A) Single-Answer: One answer across a few pages, possibly containing small arrays.
B) Multi-Entity: Many items across many pages, often involving large arrays.
Consider:
1. Answer Cardinality: Single or multiple items?
2. Page Distribution: Found on 1-3 pages or many?
3. Verification Needs: Cross-page verification or independent extraction?
Provide:
- Method: [Single-Answer/Multi-Entity]
- Confidence: [0-100%]
- Reasoning: Why this classification?
- Key Indicators: Specific aspects leading to this decision.
Examples:
- "Is this company a non-profit?" -> Single-Answer
- "Extract all product prices" -> Multi-Entity
For Single-Answer, arrays may be present but are typically small. For Multi-Entity, if arrays have multiple items not from a single page, return keys with large arrays. If nested, return the full key (e.g., 'ecommerce.products').
`,
},
{
role: "user",
content: `Classify the query as Single-Answer or Multi-Entity. For Multi-Entity, return keys with large arrays; otherwise, return none:
Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`,
},
],
response_format: {
type: "json_schema",
json_schema: {
schema: {
type: "object",
properties: {
isMultiEntity: { type: "boolean" },
multiEntityKeys: { type: "array", items: { type: "string" } },
reasoning: { type: "string" },
keyIndicators: { type: "array", items: { type: "string" } },
},
required: [
"isMultiEntity",
"multiEntityKeys",
"reasoning",
"keyIndicators",
],
additionalProperties: false,
},
name: "checkSchema",
},
},
});
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
checkSchema.parse(result.choices[0].message.parsed);
const tokenUsage: TokenUsage = {
promptTokens: result.usage?.prompt_tokens ?? 0,
completionTokens: result.usage?.completion_tokens ?? 0,
totalTokens: result.usage?.total_tokens ?? 0,
model: model,
};
return {
isMultiEntity,
multiEntityKeys,
reasoning,
keyIndicators,
tokenUsage,
};
sources?: Record<string, string[]>;
}
type completions = {
@ -165,19 +60,9 @@ type completions = {
numTokens: number;
totalUsage: TokenUsage;
warning?: string;
sources?: string[];
};
// Strips a trailing "/*" wildcard and reduces a URL to protocol + hostname
// (port and path dropped). Returns the (trimmed) input unchanged when it
// cannot be parsed as a URL.
function getRootDomain(url: string): string {
  const trimmed = url.endsWith("/*") ? url.slice(0, -2) : url;
  try {
    const parsed = new URL(trimmed);
    return parsed.protocol + "//" + parsed.hostname;
  } catch {
    return trimmed;
  }
}
export async function performExtraction(
extractId: string,
@ -191,6 +76,7 @@ export async function performExtraction(
let multiEntityResult: any = {};
let singleAnswerResult: any = {};
let totalUrlsScraped = 0;
let sources: Record<string, string[]> = {};
const logger = _logger.child({
module: "extract",
@ -218,6 +104,7 @@ export async function performExtraction(
logger.debug("Processing URLs...", {
urlCount: request.urls.length,
});
// Process URLs
const urlPromises = request.urls.map((url) =>
processUrl(
@ -258,7 +145,7 @@ export async function performExtraction(
if (links.length === 0) {
logger.error("0 links! Bailing.", {
linkCount: links.length
linkCount: links.length,
});
return {
success: false,
@ -285,14 +172,20 @@ export async function performExtraction(
let reqSchema = request.schema;
if (!reqSchema && request.prompt) {
reqSchema = await generateSchemaFromPrompt(request.prompt);
logger.debug("Generated request schema.", { originalSchema: request.schema, schema: reqSchema });
logger.debug("Generated request schema.", {
originalSchema: request.schema,
schema: reqSchema,
});
}
if (reqSchema) {
reqSchema = await dereferenceSchema(reqSchema);
}
logger.debug("Transformed schema.", { originalSchema: request.schema, schema: reqSchema });
logger.debug("Transformed schema.", {
originalSchema: request.schema,
schema: reqSchema,
});
// agent evaluates if the schema or the prompt has an array with big amount of items
// also it checks if the schema has any other properties that are not arrays
@ -308,7 +201,12 @@ export async function performExtraction(
tokenUsage: schemaAnalysisTokenUsage,
} = await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
logger.debug("Analyzed schema.", { isMultiEntity, multiEntityKeys, reasoning, keyIndicators });
logger.debug("Analyzed schema.", {
isMultiEntity,
multiEntityKeys,
reasoning,
keyIndicators,
});
// Track schema analysis tokens
tokenUsage.push(schemaAnalysisTokenUsage);
@ -368,7 +266,12 @@ export async function performExtraction(
timeout,
},
urlTraces,
logger.child({ module: "extract", method: "scrapeDocument", url, isMultiEntity: true }),
logger.child({
module: "extract",
method: "scrapeDocument",
url,
isMultiEntity: true,
}),
);
}
return docsMap.get(url);
@ -427,33 +330,17 @@ export async function performExtraction(
setTimeout(() => resolve(null), timeoutCompletion);
});
// // Check if page should be extracted before proceeding
const shouldExtractCheck = await generateOpenAICompletions(
logger.child({ method: "extractService/checkShouldExtract" }),
{
mode: "llm",
systemPrompt:
"You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.",
prompt: `Should the following content be used to extract information for this prompt: "${request.prompt}" User schema is: ${JSON.stringify(multiEntitySchema)}\nReturn only true or false.`,
schema: {
type: "object",
properties: {
extract: {
type: "boolean",
},
},
required: ["extract"],
},
},
buildDocument(doc),
undefined,
true,
// Check if page should be extracted before proceeding
const { extract, tokenUsage: shouldExtractCheckTokenUsage } = await checkShouldExtract(
request.prompt ?? "",
multiEntitySchema,
doc,
);
tokenUsage.push(shouldExtractCheck.totalUsage);
tokenUsage.push(shouldExtractCheckTokenUsage);
if (!shouldExtractCheck.extract["extract"]) {
console.log(
if (!extract) {
logger.info(
`Skipping extraction for ${doc.metadata.url} as content is irrelevant`,
);
return null;
@ -490,23 +377,7 @@ export async function performExtraction(
],
});
const completionPromise = generateOpenAICompletions(
logger.child({
method: "extractService/generateOpenAICompletions",
}),
{
mode: "llm",
systemPrompt:
(request.systemPrompt ? `${request.systemPrompt}\n` : "") +
`Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. Here are the urls the user provided of which he wants to extract information from: ` +
links.join(", "),
prompt: "Today is: " + new Date().toISOString() + "\n" + request.prompt,
schema: multiEntitySchema,
},
buildDocument(doc),
undefined,
true,
);
const completionPromise = batchExtractPromise(multiEntitySchema, links, request.prompt ?? "", request.systemPrompt ?? "", doc);
// Race between timeout and completion
const multiEntityCompletion = (await Promise.race([
@ -517,6 +388,23 @@ export async function performExtraction(
// Track multi-entity extraction tokens
if (multiEntityCompletion) {
tokenUsage.push(multiEntityCompletion.totalUsage);
// Track sources for multi-entity items
if (multiEntityCompletion.extract) {
// For each multi-entity key, track the source URL
multiEntityKeys.forEach(key => {
const items = multiEntityCompletion.extract[key];
if (Array.isArray(items)) {
items.forEach((item, index) => {
const sourcePath = `${key}[${index}]`;
if (!sources[sourcePath]) {
sources[sourcePath] = [];
}
sources[sourcePath].push(doc.metadata.url || doc.metadata.sourceURL || "");
});
}
});
}
}
// console.log(multiEntityCompletion.extract)
@ -553,7 +441,10 @@ export async function performExtraction(
return multiEntityCompletion.extract;
} catch (error) {
logger.error(`Failed to process document.`, { error, url: doc.metadata.url ?? doc.metadata.sourceURL! });
logger.error(`Failed to process document.`, {
error,
url: doc.metadata.url ?? doc.metadata.sourceURL!,
});
return null;
}
});
@ -563,7 +454,9 @@ export async function performExtraction(
multiEntityCompletions.push(
...chunkResults.filter((result) => result !== null),
);
logger.debug("All multi-entity completion chunks finished.", { completionCount: multiEntityCompletions.length });
logger.debug("All multi-entity completion chunks finished.", {
completionCount: multiEntityCompletions.length,
});
}
try {
@ -625,7 +518,12 @@ export async function performExtraction(
timeout,
},
urlTraces,
logger.child({ module: "extract", method: "scrapeDocument", url, isMultiEntity: false })
logger.child({
module: "extract",
method: "scrapeDocument",
url,
isMultiEntity: false,
}),
);
}
return docsMap.get(url);
@ -685,29 +583,31 @@ export async function performExtraction(
// Generate completions
logger.debug("Generating singleAnswer completions...");
singleAnswerCompletions = await generateOpenAICompletions(
logger.child({ module: "extract", method: "generateOpenAICompletions" }),
{
mode: "llm",
systemPrompt:
(request.systemPrompt ? `${request.systemPrompt}\n` : "") +
"Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Return 'null' the property that you don't find the information. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " +
links.join(", "),
prompt: "Today is: " + new Date().toISOString() + "\n" + request.prompt,
schema: rSchema,
},
singleAnswerDocs.map((x) => buildDocument(x)).join("\n"),
undefined,
true,
);
let { extract: completionResult, tokenUsage: singleAnswerTokenUsage, sources: singleAnswerSources } = await singleAnswerCompletion({
singleAnswerDocs,
rSchema,
links,
prompt: request.prompt ?? "",
systemPrompt: request.systemPrompt ?? "",
});
logger.debug("Done generating singleAnswer completions.");
// Track single answer extraction tokens
if (singleAnswerCompletions) {
tokenUsage.push(singleAnswerCompletions.totalUsage);
// Track single answer extraction tokens and sources
if (completionResult) {
tokenUsage.push(singleAnswerTokenUsage);
// Add sources for top-level properties in single answer
if (rSchema?.properties) {
Object.keys(rSchema.properties).forEach(key => {
if (completionResult[key] !== undefined) {
sources[key] = singleAnswerSources || singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || "");
}
});
}
}
singleAnswerResult = singleAnswerCompletions.extract;
singleAnswerResult = completionResult;
singleAnswerCompletions = singleAnswerResult;
// Update token usage in traces
// if (completions && completions.numTokens) {
@ -730,7 +630,12 @@ export async function performExtraction(
}
let finalResult = reqSchema
? await mixSchemaObjects(reqSchema, singleAnswerResult, multiEntityResult, logger.child({ method: "mixSchemaObjects" }))
? await mixSchemaObjects(
reqSchema,
singleAnswerResult,
multiEntityResult,
logger.child({ method: "mixSchemaObjects" }),
)
: singleAnswerResult || multiEntityResult;
// Tokenize final result to get token count
@ -798,7 +703,7 @@ export async function performExtraction(
);
});
// Log job with token usage
// Log job with token usage and sources
logJob({
job_id: extractId,
success: true,
@ -813,10 +718,12 @@ export async function performExtraction(
origin: request.origin ?? "api",
num_tokens: totalTokensUsed,
tokens_billed: tokensToBill,
sources,
}).then(() => {
updateExtract(extractId, {
status: "completed",
llmUsage,
sources,
}).catch((error) => {
logger.error(
`Failed to update extract ${extractId} status to completed: ${error}`,
@ -830,9 +737,10 @@ export async function performExtraction(
success: true,
data: finalResult ?? {},
extractId,
warning: undefined, // TODO FIX
warning: undefined,
urlTrace: request.urlTrace ? urlTraces : undefined,
llmUsage,
totalUrlsScraped,
// sources,
};
}

View File

@ -68,7 +68,7 @@ export async function processUrl(
try {
logger.debug("Running map...", {
search: searchQuery,
})
});
const mapResults = await getMapResults({
url: baseUrl,
search: searchQuery,
@ -200,65 +200,41 @@ export async function processUrl(
// );
logger.info("Generated rephrased prompt.", {
rephrasedPrompt
rephrasedPrompt,
});
let rerankedLinks = mappedLinks;
logger.info("Reranking pass 1 (threshold 0.8)...");
const rerankerResult = await rerankLinksWithLLM({
links: rerankedLinks,
links: mappedLinks,
searchQuery: rephrasedPrompt,
urlTraces
urlTraces,
});
rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.8);
mappedLinks = rerankerResult.mapDocument;
let tokensUsed = rerankerResult.tokensUsed;
logger.info("Reranked! (threshold 0.8)", {
linkCount: rerankedLinks.length,
logger.info("Reranked! (pass 1)", {
linkCount: mappedLinks.length,
});
// lower threshold to 0.6 if no links are found
if (rerankedLinks.length === 0) {
logger.info("No links found. Reranking with threshold 0.6");
rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6);
logger.info("Reranked! (threshold 0.6)", {
linkCount: rerankedLinks.length,
});
}
// lower threshold to 0.3 if no links are found
if (rerankedLinks.length === 0) {
logger.info("No links found. Reranking with threshold 0.3");
rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.3);
logger.info("Reranked! (threshold 0.3)", {
linkCount: rerankedLinks.length,
});
}
// 2nd Pass, useful for when the first pass returns too many links
if (rerankedLinks.length > 100) {
logger.info("Reranking pass 2 (> 100 links - threshold 0.6)...");
const secondPassRerankerResult = await rerankLinksWithLLM({
links: rerankedLinks,
if (mappedLinks.length > 100) {
logger.info("Reranking (pass 2)...");
const rerankerResult = await rerankLinksWithLLM({
links: mappedLinks,
searchQuery: rephrasedPrompt,
urlTraces,
});
// why 0.6? average? experimental results?
if (secondPassRerankerResult.mapDocument.length > 0) {
rerankedLinks = secondPassRerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6);
logger.info("Reranked! (threshold 0.6)", {
linkCount: rerankedLinks.length,
});
}
}
// If no relevant links are found, return the original mapped links
if (rerankedLinks.length === 0) {
logger.info("No links found. Not reranking.");
rerankedLinks = mappedLinks;
mappedLinks = rerankerResult.mapDocument;
tokensUsed += rerankerResult.tokensUsed;
logger.info("Reranked! (pass 2)", {
linkCount: mappedLinks.length,
});
}
// dumpToFile(
// "llm-links.txt",
// mappedLinks,
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
// );
// Remove title and description from mappedLinks
mappedLinks = mappedLinks.map((link) => ({ url: link.url }));
return mappedLinks.map((x) => x.url);

View File

@ -227,6 +227,7 @@ export function getRateLimiterPoints(
const points: number =
rateLimitConfig[makePlanKey(plan)] || rateLimitConfig.default; // 5
return points;
}

View File

@ -88,6 +88,7 @@ export interface FirecrawlJob {
retry?: boolean;
crawl_id?: string;
tokens_billed?: number;
sources?: Record<string, string[]>;
}
export interface FirecrawlScrapeResponse {