new cost tracking

This commit is contained in:
parent ba4df67de7
commit 8546bcacc0
@@ -22,6 +22,7 @@ import { getJobPriority } from "../../lib/job-priority";
 import { addScrapeJobs } from "../../services/queue-jobs";
 import { callWebhook } from "../../services/webhook";
 import { logger as _logger } from "../../lib/logger";
+import { CostTracking } from "../../lib/extract/extraction-service";
 
 export async function batchScrapeController(
   req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,

@@ -15,6 +15,7 @@ import { getJobPriority } from "../../lib/job-priority";
 import { getScrapeQueue } from "../../services/queue-service";
 import { getJob } from "./crawl-status";
 import { getJobFromGCS } from "../../lib/gcs-jobs";
+import { CostTracking } from "src/lib/extract/extraction-service";
 
 export async function scrapeController(
   req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
@@ -128,12 +129,6 @@ export async function scrapeController(
     }
   }
 
-  const cost_tracking = doc?.metadata?.costTracking;
-
-  if (doc && doc.metadata) {
-    delete doc.metadata.costTracking;
-  }
-
   return res.status(200).json({
     success: true,
     data: doc,

@@ -21,6 +21,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
 import { logger as _logger } from "../../lib/logger";
 import type { Logger } from "winston";
 import { getJobFromGCS } from "../../lib/gcs-jobs";
+import { CostTracking } from "../../lib/extract/extraction-service";
 
 // Used for deep research
 export async function searchAndScrapeSearchResult(
@@ -32,6 +33,7 @@ export async function searchAndScrapeSearchResult(
     scrapeOptions: ScrapeOptions;
   },
   logger: Logger,
+  costTracking: CostTracking,
 ): Promise<Document[]> {
   try {
     const searchResults = await search({
@@ -48,7 +50,8 @@ export async function searchAndScrapeSearchResult(
             description: result.description
           },
           options,
-          logger
+          logger,
+          costTracking
         )
       )
     );
@@ -68,6 +71,7 @@ async function scrapeSearchResult(
     scrapeOptions: ScrapeOptions;
   },
   logger: Logger,
+  costTracking: CostTracking,
 ): Promise<Document> {
   const jobId = uuidv4();
   const jobPriority = await getJobPriority({
@@ -220,6 +224,8 @@ export async function searchController(
       });
     }
 
+    const costTracking = new CostTracking();
+
     // Scrape each non-blocked result, handling timeouts individually
     logger.info("Scraping search results");
     const scrapePromises = searchResults.map((result) =>
@@ -228,7 +234,7 @@ export async function searchController(
           origin: req.body.origin,
           timeout: req.body.timeout,
           scrapeOptions: req.body.scrapeOptions,
-        }, logger),
+        }, logger, costTracking),
       );
 
     const docs = await Promise.all(scrapePromises);
@@ -279,6 +285,7 @@ export async function searchController(
       mode: "search",
      url: req.body.query,
       origin: req.body.origin,
+      cost_tracking: costTracking,
     });
 
     return res.status(200).json({

@@ -739,7 +739,6 @@ export type Document = {
     statusCode: number;
     scrapeId?: string;
     error?: string;
-    costTracking?: CostTracking;
     // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
   };
   serpResults?: {

@@ -5,6 +5,7 @@ import { ResearchLLMService, ResearchStateManager } from "./research-manager";
 import { logJob } from "../../services/logging/log_job";
 import { billTeam } from "../../services/billing/credit_billing";
 import { ExtractOptions } from "../../controllers/v1/types";
+import { CostTracking } from "../extract/extraction-service";
 
 interface DeepResearchServiceOptions {
   researchId: string;
@@ -21,6 +22,7 @@ interface DeepResearchServiceOptions {
 }
 
 export async function performDeepResearch(options: DeepResearchServiceOptions) {
+  const costTracking = new CostTracking();
   const { researchId, teamId, timeLimit, subId, maxUrls } = options;
   const startTime = Date.now();
   let currentTopic = options.query;
@@ -70,6 +72,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
         await llmService.generateSearchQueries(
           nextSearchTopic,
           state.getFindings(),
+          costTracking,
         )
       ).slice(0, 3);
 
@@ -109,7 +112,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
           fastMode: false,
           blockAds: false,
         },
-      }, logger);
+      }, logger, costTracking);
       return response.length > 0 ? response : [];
     });
 
@@ -205,6 +208,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       currentTopic,
       timeRemaining,
       options.systemPrompt ?? "",
+      costTracking,
     );
 
     if (!analysis) {
@@ -268,6 +272,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       state.getFindings(),
       state.getSummaries(),
       options.analysisPrompt,
+      costTracking,
       options.formats,
       options.jsonOptions,
     );
@@ -278,6 +283,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       state.getFindings(),
       state.getSummaries(),
       options.analysisPrompt,
+      costTracking,
     );
   }
 
@@ -307,6 +313,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
     origin: "api",
     num_tokens: 0,
     tokens_billed: 0,
+    cost_tracking: costTracking,
   });
   await updateDeepResearch(researchId, {
     status: "completed",

@@ -12,6 +12,7 @@ import {
 import { ExtractOptions } from "../../controllers/v1/types";
 
 import { getModel } from "../generic-ai";
+import { CostTracking } from "../extract/extraction-service";
 interface AnalysisResult {
   gaps: string[];
   nextSteps: string[];
@@ -152,6 +153,7 @@ export class ResearchLLMService {
   async generateSearchQueries(
     topic: string,
     findings: DeepResearchFinding[] = [],
+    costTracking: CostTracking,
   ): Promise<{ query: string; researchGoal: string }[]> {
     const { extract } = await generateCompletions({
       logger: this.logger.child({
@@ -194,6 +196,13 @@ export class ResearchLLMService {
           The first SERP query you generate should be a very concise, simple version of the topic. `,
       },
       markdown: "",
+      costTrackingOptions: {
+        costTracking,
+        metadata: {
+          module: "deep-research",
+          method: "generateSearchQueries",
+        },
+      },
     });
 
     return extract.queries;
@@ -204,6 +213,7 @@ export class ResearchLLMService {
     currentTopic: string,
     timeRemaining: number,
     systemPrompt: string,
+    costTracking: CostTracking,
   ): Promise<AnalysisResult | null> {
     try {
       const timeRemainingMinutes =
@@ -246,6 +256,13 @@ export class ResearchLLMService {
           ).text,
         },
         markdown: "",
+        costTrackingOptions: {
+          costTracking,
+          metadata: {
+            module: "deep-research",
+            method: "analyzeAndPlan",
+          },
+        },
       });
 
       return extract.analysis;
@@ -260,6 +277,7 @@ export class ResearchLLMService {
     findings: DeepResearchFinding[],
     summaries: string[],
     analysisPrompt: string,
+    costTracking: CostTracking,
     formats?: string[],
     jsonOptions?: ExtractOptions,
   ): Promise<any> {
@@ -312,6 +330,13 @@ export class ResearchLLMService {
       },
       markdown: "",
       model: getModel("o3-mini"),
+      costTrackingOptions: {
+        costTracking,
+        metadata: {
+          module: "deep-research",
+          method: "generateFinalAnalysis",
+        },
+      },
     });
 
     return extract;
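Most generateCompletions call sites in this commit, like the ones in the hunks above, now pass a costTrackingOptions argument instead of returning a per-call cost. Below is a rough, self-contained sketch of the shape those call sites appear to assume; it is inferred from the call sites in this diff, not copied from the actual type definition (which lives next to generateCompletions and may differ).

// Assumed shape only, inferred from the call sites in this diff.
interface CostTrackingLike {
  addCall(call: {
    type: "smartScrape" | "other";
    metadata: Record<string, any>;
    cost: number;
    tokens?: { input: number; output: number };
  }): void;
}

type CostTrackingOptions = {
  // Shared accumulator, created once per request and threaded through every LLM call.
  costTracking: CostTrackingLike;
  metadata: {
    module: string;       // e.g. "deep-research", "extract", "scrapeURL"
    method: string;       // e.g. "generateSearchQueries", "analyzeAndPlan"
    description?: string; // optional free-form note, used by extractData further down
  };
};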
@@ -11,25 +11,23 @@ import {
 import { jsonSchema } from "ai";
 import { getModel } from "../../../lib/generic-ai";
 import { Logger } from "winston";
+import { CostTracking } from "../extraction-service";
 export async function analyzeSchemaAndPrompt(
   urls: string[],
   schema: any,
   prompt: string,
   logger: Logger,
+  costTracking: CostTracking,
 ): Promise<{
   isMultiEntity: boolean;
   multiEntityKeys: string[];
   reasoning: string;
   keyIndicators: string[];
   tokenUsage: TokenUsage;
-  cost: number;
 }> {
-  let cost = 0;
   if (!schema) {
-    const genRes = await generateSchemaFromPrompt(prompt, logger);
+    const genRes = await generateSchemaFromPrompt(prompt, logger, costTracking);
     schema = genRes.extract;
-    cost = genRes.cost;
   }
 
   const schemaString = JSON.stringify(schema);
@@ -49,7 +47,7 @@ export async function analyzeSchemaAndPrompt(
   );
 
   try {
-    const { extract: result, totalUsage, cost: cost2 } = await generateCompletions({
+    const { extract: result, totalUsage } = await generateCompletions({
      logger,
      options: {
        mode: "llm",
@@ -59,8 +57,14 @@ export async function analyzeSchemaAndPrompt(
      },
      markdown: "",
      model,
+      costTrackingOptions: {
+        costTracking,
+        metadata: {
+          module: "extract",
+          method: "analyzeSchemaAndPrompt",
+        },
+      },
    });
-    cost += cost2;
 
    const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
      checkSchema.parse(result);
@@ -71,7 +75,6 @@ export async function analyzeSchemaAndPrompt(
      reasoning,
      keyIndicators,
      tokenUsage: totalUsage,
-      cost,
    };
  } catch (e) {
    logger.warn("(analyzeSchemaAndPrompt) Error parsing schema analysis", {
@@ -90,6 +93,5 @@ export async function analyzeSchemaAndPrompt(
      totalTokens: 0,
      model: model.modelId,
    },
-    cost: 0,
  };
 }

@@ -10,7 +10,7 @@ import {
   buildBatchExtractSystemPrompt,
 } from "../build-prompts";
 import { getModel } from "../../generic-ai";
+import { CostTracking } from "../extraction-service";
 import fs from "fs/promises";
 import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape";
 import type { Logger } from "winston";
@@ -24,6 +24,7 @@ type BatchExtractOptions = {
   useAgent: boolean;
   extractId?: string;
   sessionId?: string;
+  costTracking: CostTracking;
 };
 
 /**
@@ -75,6 +76,13 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger:
     isExtractEndpoint: true,
     model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
     retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"),
+    costTrackingOptions: {
+      costTracking: options.costTracking,
+      metadata: {
+        module: "extract",
+        method: "batchExtractPromise",
+      },
+    },
   };
 
   let extractedDataArray: any[] = [];
@@ -84,23 +92,15 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger:
     const {
       extractedDataArray: e,
       warning: w,
-      smartScrapeCost,
-      otherCost,
-      smartScrapeCallCount,
-      otherCallCount
     } = await extractData({
       extractOptions: generationOptions,
       urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
       useAgent,
       extractId,
-      sessionId
+      sessionId,
     });
     extractedDataArray = e;
     warning = w;
-    smCost = smartScrapeCost;
-    oCost = otherCost;
-    smCallCount = smartScrapeCallCount;
-    oCallCount = otherCallCount;
   } catch (error) {
     logger.error("extractData failed", { error });
   }

@@ -7,12 +7,14 @@ import {
   buildShouldExtractUserPrompt,
 } from "../build-prompts";
 import { getModel } from "../../../lib/generic-ai";
+import { CostTracking } from "../extraction-service";
 
 export async function checkShouldExtract(
   prompt: string,
   multiEntitySchema: any,
   doc: Document,
-): Promise<{ tokenUsage: TokenUsage; extract: boolean; cost: number }> {
+  costTracking: CostTracking,
+): Promise<{ tokenUsage: TokenUsage; extract: boolean; }> {
   const shouldExtractCheck = await generateCompletions({
     logger: logger.child({ method: "extractService/checkShouldExtract" }),
     options: {
@@ -32,11 +34,17 @@ export async function checkShouldExtract(
     markdown: buildDocument(doc),
     isExtractEndpoint: true,
     model: getModel("gpt-4o-mini"),
+    costTrackingOptions: {
+      costTracking,
+      metadata: {
+        module: "extract",
+        method: "checkShouldExtract",
+      },
+    },
   });
 
   return {
     tokenUsage: shouldExtractCheck.totalUsage,
     extract: shouldExtractCheck.extract["extract"],
-    cost: shouldExtractCheck.cost,
   };
 }

@@ -7,6 +7,7 @@ import { buildDocument } from "../build-document";
 import { Document, TokenUsage } from "../../../controllers/v1/types";
 import { getModel } from "../../../lib/generic-ai";
 import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape";
+import { CostTracking } from "../extraction-service";
 
 export async function singleAnswerCompletion({
   singleAnswerDocs,
@@ -17,6 +18,7 @@ export async function singleAnswerCompletion({
   useAgent,
   extractId,
   sessionId,
+  costTracking,
 }: {
   singleAnswerDocs: Document[];
   rSchema: any;
@@ -26,14 +28,11 @@ export async function singleAnswerCompletion({
   useAgent: boolean;
   extractId: string;
   sessionId: string;
+  costTracking: CostTracking;
 }): Promise<{
   extract: any;
   tokenUsage: TokenUsage;
   sources: string[];
-  smartScrapeCallCount: number;
-  smartScrapeCost: number;
-  otherCallCount: number;
-  otherCost: number;
 }> {
   const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt;
   const generationOptions: GenerateCompletionsOptions = {
@@ -53,9 +52,16 @@ export async function singleAnswerCompletion({
     markdown: `${singleAnswerDocs.map((x, i) => `[START_PAGE (ID: ${i})]` + buildDocument(x)).join("\n")} [END_PAGE]\n`,
     isExtractEndpoint: true,
     model: getModel("gemini-2.0-flash", "google"),
+    costTrackingOptions: {
+      costTracking,
+      metadata: {
+        module: "extract",
+        method: "singleAnswerCompletion",
+      },
+    },
   };
 
-  const { extractedDataArray, warning, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({
+  const { extractedDataArray, warning } = await extractData({
     extractOptions: generationOptions,
     urls: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""),
     useAgent,
@@ -100,9 +106,5 @@ export async function singleAnswerCompletion({
     sources: singleAnswerDocs.map(
       (doc) => doc.metadata.url || doc.metadata.sourceURL || "",
     ),
-    smartScrapeCost,
-    otherCost,
-    smartScrapeCallCount,
-    otherCallCount,
   };
 }

@@ -67,14 +67,39 @@ type completions = {
   sources?: string[];
 };
 
-export type CostTracking = {
-  smartScrapeCallCount: number;
-  smartScrapeCost: number;
-  otherCallCount: number;
-  otherCost: number;
-  totalCost: number;
-  costLimitExceededTokenUsage?: number;
-};
+export class CostTracking {
+  calls: {
+    type: "smartScrape" | "other",
+    metadata: Record<string, any>,
+    cost: number,
+    tokens?: {
+      input: number,
+      output: number,
+    },
+    stack: string,
+  }[] = [];
+
+  constructor() {}
+
+  public addCall(call: Omit<typeof this.calls[number], "stack">) {
+    this.calls.push({
+      ...call,
+      stack: new Error().stack!.split("\n").slice(2).join("\n"),
+    });
+  }
+
+  public toJSON() {
+    return {
+      calls: this.calls,
+
+      smartScrapeCallCount: this.calls.filter(c => c.type === "smartScrape").length,
+      smartScrapeCost: this.calls.filter(c => c.type === "smartScrape").reduce((acc, c) => acc + c.cost, 0),
+      otherCallCount: this.calls.filter(c => c.type === "other").length,
+      otherCost: this.calls.filter(c => c.type === "other").reduce((acc, c) => acc + c.cost, 0),
+      totalCost: this.calls.reduce((acc, c) => acc + c.cost, 0),
+    }
+  }
+}
 
 export async function performExtraction(
   extractId: string,
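For orientation, here is the new CostTracking accumulator from the hunk above, restated as a minimal standalone sketch with an illustrative usage example. The class body follows the diff; the named CostCall helper type and the example metadata and numbers are editorial assumptions added for readability.

// Sketch of the CostTracking accumulator introduced above (CostCall is a
// named helper type added here for clarity; it is not in the diff itself).
type CostCall = {
  type: "smartScrape" | "other";
  metadata: Record<string, any>;
  cost: number;
  tokens?: { input: number; output: number };
  stack: string;
};

class CostTracking {
  calls: CostCall[] = [];

  public addCall(call: Omit<CostCall, "stack">) {
    this.calls.push({
      ...call,
      // Capture the caller's stack so every recorded cost can be traced back.
      stack: new Error().stack!.split("\n").slice(2).join("\n"),
    });
  }

  public toJSON() {
    return {
      calls: this.calls,
      smartScrapeCallCount: this.calls.filter((c) => c.type === "smartScrape").length,
      smartScrapeCost: this.calls
        .filter((c) => c.type === "smartScrape")
        .reduce((acc, c) => acc + c.cost, 0),
      otherCallCount: this.calls.filter((c) => c.type === "other").length,
      otherCost: this.calls
        .filter((c) => c.type === "other")
        .reduce((acc, c) => acc + c.cost, 0),
      totalCost: this.calls.reduce((acc, c) => acc + c.cost, 0),
    };
  }
}

// Illustrative usage: one tracker per request, threaded through each LLM call
// and serialized into the job log at the end. Metadata and numbers are examples.
const tracking = new CostTracking();
tracking.addCall({
  type: "other",
  metadata: { module: "extract", method: "generateBasicCompletion" },
  cost: 0.0012,
  tokens: { input: 850, output: 120 },
});
console.log(tracking.toJSON().totalCost); // 0.0012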
@ -89,13 +114,7 @@ export async function performExtraction(
|
|||||||
let singleAnswerResult: any = {};
|
let singleAnswerResult: any = {};
|
||||||
let totalUrlsScraped = 0;
|
let totalUrlsScraped = 0;
|
||||||
let sources: Record<string, string[]> = {};
|
let sources: Record<string, string[]> = {};
|
||||||
let costTracking: CostTracking = {
|
let costTracking: CostTracking = new CostTracking();
|
||||||
smartScrapeCallCount: 0,
|
|
||||||
smartScrapeCost: 0,
|
|
||||||
otherCallCount: 0,
|
|
||||||
otherCost: 0,
|
|
||||||
totalCost: 0,
|
|
||||||
};
|
|
||||||
|
|
||||||
let log = {
|
let log = {
|
||||||
extractId,
|
extractId,
|
||||||
@ -118,13 +137,9 @@ export async function performExtraction(
|
|||||||
});
|
});
|
||||||
const rephrasedPrompt = await generateBasicCompletion(
|
const rephrasedPrompt = await generateBasicCompletion(
|
||||||
buildRephraseToSerpPrompt(request.prompt),
|
buildRephraseToSerpPrompt(request.prompt),
|
||||||
|
costTracking,
|
||||||
);
|
);
|
||||||
let rptxt = rephrasedPrompt?.text.replace('"', "").replace("'", "") || "";
|
let rptxt = rephrasedPrompt?.text.replace('"', "").replace("'", "") || "";
|
||||||
if (rephrasedPrompt) {
|
|
||||||
costTracking.otherCallCount++;
|
|
||||||
costTracking.otherCost += rephrasedPrompt.cost;
|
|
||||||
costTracking.totalCost += rephrasedPrompt.cost;
|
|
||||||
}
|
|
||||||
const searchResults = await search({
|
const searchResults = await search({
|
||||||
query: rptxt,
|
query: rptxt,
|
||||||
num_results: 10,
|
num_results: 10,
|
||||||
@ -197,11 +212,9 @@ export async function performExtraction(
|
|||||||
|
|
||||||
let reqSchema = request.schema;
|
let reqSchema = request.schema;
|
||||||
if (!reqSchema && request.prompt) {
|
if (!reqSchema && request.prompt) {
|
||||||
const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger);
|
const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger, costTracking);
|
||||||
reqSchema = schemaGenRes.extract;
|
reqSchema = schemaGenRes.extract;
|
||||||
costTracking.otherCallCount++;
|
|
||||||
costTracking.otherCost += schemaGenRes.cost;
|
|
||||||
costTracking.totalCost += schemaGenRes.cost;
|
|
||||||
|
|
||||||
logger.debug("Generated request schema.", {
|
logger.debug("Generated request schema.", {
|
||||||
originalSchema: request.schema,
|
originalSchema: request.schema,
|
||||||
@ -232,8 +245,7 @@ export async function performExtraction(
|
|||||||
reasoning,
|
reasoning,
|
||||||
keyIndicators,
|
keyIndicators,
|
||||||
tokenUsage: schemaAnalysisTokenUsage,
|
tokenUsage: schemaAnalysisTokenUsage,
|
||||||
cost: schemaAnalysisCost,
|
} = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger, costTracking);
|
||||||
} = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger);
|
|
||||||
|
|
||||||
logger.debug("Analyzed schema.", {
|
logger.debug("Analyzed schema.", {
|
||||||
isMultiEntity,
|
isMultiEntity,
|
||||||
@ -242,11 +254,6 @@ export async function performExtraction(
|
|||||||
keyIndicators,
|
keyIndicators,
|
||||||
});
|
});
|
||||||
|
|
||||||
costTracking.otherCallCount++;
|
|
||||||
costTracking.otherCost += schemaAnalysisCost;
|
|
||||||
costTracking.totalCost += schemaAnalysisCost;
|
|
||||||
|
|
||||||
// Track schema analysis tokens
|
|
||||||
tokenUsage.push(schemaAnalysisTokenUsage);
|
tokenUsage.push(schemaAnalysisTokenUsage);
|
||||||
|
|
||||||
let startMap = Date.now();
|
let startMap = Date.now();
|
||||||
@ -467,7 +474,8 @@ export async function performExtraction(
|
|||||||
doc,
|
doc,
|
||||||
useAgent: isAgentExtractModelValid(request.agent?.model),
|
useAgent: isAgentExtractModelValid(request.agent?.model),
|
||||||
extractId,
|
extractId,
|
||||||
sessionId
|
sessionId,
|
||||||
|
costTracking,
|
||||||
}, logger);
|
}, logger);
|
||||||
|
|
||||||
// Race between timeout and completion
|
// Race between timeout and completion
|
||||||
@ -481,12 +489,6 @@ export async function performExtraction(
|
|||||||
if (multiEntityCompletion) {
|
if (multiEntityCompletion) {
|
||||||
tokenUsage.push(multiEntityCompletion.totalUsage);
|
tokenUsage.push(multiEntityCompletion.totalUsage);
|
||||||
|
|
||||||
costTracking.smartScrapeCallCount += multiEntityCompletion.smartScrapeCallCount;
|
|
||||||
costTracking.smartScrapeCost += multiEntityCompletion.smartScrapeCost;
|
|
||||||
costTracking.otherCallCount += multiEntityCompletion.otherCallCount;
|
|
||||||
costTracking.otherCost += multiEntityCompletion.otherCost;
|
|
||||||
costTracking.totalCost += multiEntityCompletion.smartScrapeCost + multiEntityCompletion.otherCost;
|
|
||||||
|
|
||||||
if (multiEntityCompletion.extract) {
|
if (multiEntityCompletion.extract) {
|
||||||
return {
|
return {
|
||||||
extract: multiEntityCompletion.extract,
|
extract: multiEntityCompletion.extract,
|
||||||
@ -776,10 +778,6 @@ export async function performExtraction(
|
|||||||
extract: completionResult,
|
extract: completionResult,
|
||||||
tokenUsage: singleAnswerTokenUsage,
|
tokenUsage: singleAnswerTokenUsage,
|
||||||
sources: singleAnswerSources,
|
sources: singleAnswerSources,
|
||||||
smartScrapeCost: singleAnswerSmartScrapeCost,
|
|
||||||
otherCost: singleAnswerOtherCost,
|
|
||||||
smartScrapeCallCount: singleAnswerSmartScrapeCallCount,
|
|
||||||
otherCallCount: singleAnswerOtherCallCount,
|
|
||||||
} = await singleAnswerCompletion({
|
} = await singleAnswerCompletion({
|
||||||
singleAnswerDocs,
|
singleAnswerDocs,
|
||||||
rSchema,
|
rSchema,
|
||||||
@ -789,12 +787,8 @@ export async function performExtraction(
|
|||||||
useAgent: isAgentExtractModelValid(request.agent?.model),
|
useAgent: isAgentExtractModelValid(request.agent?.model),
|
||||||
extractId,
|
extractId,
|
||||||
sessionId: thisSessionId,
|
sessionId: thisSessionId,
|
||||||
|
costTracking,
|
||||||
});
|
});
|
||||||
costTracking.smartScrapeCost += singleAnswerSmartScrapeCost;
|
|
||||||
costTracking.smartScrapeCallCount += singleAnswerSmartScrapeCallCount;
|
|
||||||
costTracking.otherCost += singleAnswerOtherCost;
|
|
||||||
costTracking.otherCallCount += singleAnswerOtherCallCount;
|
|
||||||
costTracking.totalCost += singleAnswerSmartScrapeCost + singleAnswerOtherCost;
|
|
||||||
logger.debug("Done generating singleAnswer completions.");
|
logger.debug("Done generating singleAnswer completions.");
|
||||||
|
|
||||||
singleAnswerResult = transformArrayToObject(rSchema, completionResult);
|
singleAnswerResult = transformArrayToObject(rSchema, completionResult);
|
||||||
|
@ -6,7 +6,7 @@ import { extractConfig } from "../config";
|
|||||||
import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
|
import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
|
||||||
import { performRanking_F0 } from "./ranker-f0";
|
import { performRanking_F0 } from "./ranker-f0";
|
||||||
import { buildRerankerSystemPrompt_F0, buildRerankerUserPrompt_F0 } from "./build-prompts-f0";
|
import { buildRerankerSystemPrompt_F0, buildRerankerUserPrompt_F0 } from "./build-prompts-f0";
|
||||||
|
import { CostTracking } from "../extraction-service";
|
||||||
const cohere = new CohereClient({
|
const cohere = new CohereClient({
|
||||||
token: process.env.COHERE_API_KEY,
|
token: process.env.COHERE_API_KEY,
|
||||||
});
|
});
|
||||||
@ -166,7 +166,7 @@ export type RerankerOptions = {
|
|||||||
urlTraces: URLTrace[];
|
urlTraces: URLTrace[];
|
||||||
};
|
};
|
||||||
|
|
||||||
export async function rerankLinksWithLLM_F0(options: RerankerOptions): Promise<RerankerResult> {
|
export async function rerankLinksWithLLM_F0(options: RerankerOptions, costTracking: CostTracking): Promise<RerankerResult> {
|
||||||
const { links, searchQuery, urlTraces } = options;
|
const { links, searchQuery, urlTraces } = options;
|
||||||
const chunkSize = 100;
|
const chunkSize = 100;
|
||||||
const chunks: MapDocument[][] = [];
|
const chunks: MapDocument[][] = [];
|
||||||
@ -231,7 +231,14 @@ export async function rerankLinksWithLLM_F0(options: RerankerOptions): Promise<R
|
|||||||
schema: schema,
|
schema: schema,
|
||||||
},
|
},
|
||||||
markdown: linksContent,
|
markdown: linksContent,
|
||||||
isExtractEndpoint: true
|
isExtractEndpoint: true,
|
||||||
|
costTrackingOptions: {
|
||||||
|
costTracking: new CostTracking(),
|
||||||
|
metadata: {
|
||||||
|
module: "extract",
|
||||||
|
method: "rerankLinksWithLLM",
|
||||||
|
},
|
||||||
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
const completion = await Promise.race([
|
const completion = await Promise.race([
|
||||||
|
@ -8,6 +8,7 @@ import { extractConfig } from "../config";
|
|||||||
import type { Logger } from "winston";
|
import type { Logger } from "winston";
|
||||||
import { generateText } from "ai";
|
import { generateText } from "ai";
|
||||||
import { getModel } from "../../generic-ai";
|
import { getModel } from "../../generic-ai";
|
||||||
|
import { CostTracking } from "../extraction-service";
|
||||||
|
|
||||||
export async function generateBasicCompletion_FO(prompt: string) {
|
export async function generateBasicCompletion_FO(prompt: string) {
|
||||||
const { text } = await generateText({
|
const { text } = await generateText({
|
||||||
@ -211,7 +212,7 @@ export async function processUrl_F0(
|
|||||||
links: mappedLinks,
|
links: mappedLinks,
|
||||||
searchQuery: rephrasedPrompt,
|
searchQuery: rephrasedPrompt,
|
||||||
urlTraces,
|
urlTraces,
|
||||||
});
|
}, new CostTracking());
|
||||||
mappedLinks = rerankerResult.mapDocument;
|
mappedLinks = rerankerResult.mapDocument;
|
||||||
let tokensUsed = rerankerResult.tokensUsed;
|
let tokensUsed = rerankerResult.tokensUsed;
|
||||||
logger.info("Reranked! (pass 1)", {
|
logger.info("Reranked! (pass 1)", {
|
||||||
@ -225,7 +226,7 @@ export async function processUrl_F0(
|
|||||||
links: mappedLinks,
|
links: mappedLinks,
|
||||||
searchQuery: rephrasedPrompt,
|
searchQuery: rephrasedPrompt,
|
||||||
urlTraces,
|
urlTraces,
|
||||||
});
|
}, new CostTracking());
|
||||||
mappedLinks = rerankerResult.mapDocument;
|
mappedLinks = rerankerResult.mapDocument;
|
||||||
tokensUsed += rerankerResult.tokensUsed;
|
tokensUsed += rerankerResult.tokensUsed;
|
||||||
logger.info("Reranked! (pass 2)", {
|
logger.info("Reranked! (pass 2)", {
|
||||||
|
@ -11,6 +11,7 @@ import { buildRerankerSystemPrompt } from "./build-prompts";
|
|||||||
import { dumpToFile } from "./helpers/dump-to-file";
|
import { dumpToFile } from "./helpers/dump-to-file";
|
||||||
import { getModel } from "../generic-ai";
|
import { getModel } from "../generic-ai";
|
||||||
import fs from "fs/promises";
|
import fs from "fs/promises";
|
||||||
|
import { CostTracking } from "./extraction-service";
|
||||||
|
|
||||||
const THRESHOLD_FOR_SINGLEPAGE = 0.6;
|
const THRESHOLD_FOR_SINGLEPAGE = 0.6;
|
||||||
const THRESHOLD_FOR_MULTIENTITY = 0.45;
|
const THRESHOLD_FOR_MULTIENTITY = 0.45;
|
||||||
@ -177,6 +178,7 @@ export type RerankerOptions = {
|
|||||||
reasoning: string;
|
reasoning: string;
|
||||||
multiEntityKeys: string[];
|
multiEntityKeys: string[];
|
||||||
keyIndicators: string[];
|
keyIndicators: string[];
|
||||||
|
costTracking: CostTracking;
|
||||||
};
|
};
|
||||||
|
|
||||||
export async function rerankLinksWithLLM(
|
export async function rerankLinksWithLLM(
|
||||||
@ -315,6 +317,13 @@ export async function rerankLinksWithLLM(
|
|||||||
// },
|
// },
|
||||||
markdown: linksContent,
|
markdown: linksContent,
|
||||||
isExtractEndpoint: true,
|
isExtractEndpoint: true,
|
||||||
|
costTrackingOptions: {
|
||||||
|
costTracking: options.costTracking,
|
||||||
|
metadata: {
|
||||||
|
module: "extract",
|
||||||
|
method: "rerankLinksWithLLM",
|
||||||
|
},
|
||||||
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
completion = await completionPromise;
|
completion = await completionPromise;
|
||||||
|
@ -11,7 +11,7 @@ import { getModel } from "../generic-ai";
|
|||||||
import { calculateCost } from "../../scraper/scrapeURL/transformers/llmExtract";
|
import { calculateCost } from "../../scraper/scrapeURL/transformers/llmExtract";
|
||||||
import type { CostTracking } from "./extraction-service";
|
import type { CostTracking } from "./extraction-service";
|
||||||
|
|
||||||
export async function generateBasicCompletion(prompt: string): Promise<{ text: string, cost: number } | null> {
|
export async function generateBasicCompletion(prompt: string, costTracking: CostTracking): Promise<{ text: string } | null> {
|
||||||
try {
|
try {
|
||||||
const result = await generateText({
|
const result = await generateText({
|
||||||
model: getModel("gpt-4o", "openai"),
|
model: getModel("gpt-4o", "openai"),
|
||||||
@ -22,7 +22,19 @@ export async function generateBasicCompletion(prompt: string): Promise<{ text: s
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return { text: result.text, cost: calculateCost("openai/gpt-4o", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0) };
|
costTracking.addCall({
|
||||||
|
type: "other",
|
||||||
|
metadata: {
|
||||||
|
module: "extract",
|
||||||
|
method: "generateBasicCompletion",
|
||||||
|
},
|
||||||
|
cost: calculateCost("openai/gpt-4o", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0),
|
||||||
|
tokens: {
|
||||||
|
input: result.usage?.promptTokens ?? 0,
|
||||||
|
output: result.usage?.completionTokens ?? 0,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
return { text: result.text };
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error("Error generating basic completion:", error);
|
console.error("Error generating basic completion:", error);
|
||||||
if (error?.type == "rate_limit_error") {
|
if (error?.type == "rate_limit_error") {
|
||||||
@ -36,7 +48,19 @@ export async function generateBasicCompletion(prompt: string): Promise<{ text: s
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return { text: result.text, cost: calculateCost("openai/gpt-4o-mini", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0) };
|
costTracking.addCall({
|
||||||
|
type: "other",
|
||||||
|
metadata: {
|
||||||
|
module: "extract",
|
||||||
|
method: "generateBasicCompletion",
|
||||||
|
},
|
||||||
|
cost: calculateCost("openai/gpt-4o-mini", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0),
|
||||||
|
tokens: {
|
||||||
|
input: result.usage?.promptTokens ?? 0,
|
||||||
|
output: result.usage?.completionTokens ?? 0,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
return { text: result.text };
|
||||||
} catch (fallbackError) {
|
} catch (fallbackError) {
|
||||||
console.error("Error generating basic completion with fallback model:", fallbackError);
|
console.error("Error generating basic completion with fallback model:", fallbackError);
|
||||||
return null;
|
return null;
|
||||||
@ -96,13 +120,11 @@ export async function processUrl(
|
|||||||
if (options.prompt) {
|
if (options.prompt) {
|
||||||
const res = await generateBasicCompletion(
|
const res = await generateBasicCompletion(
|
||||||
buildRefrasedPrompt(options.prompt, baseUrl),
|
buildRefrasedPrompt(options.prompt, baseUrl),
|
||||||
|
costTracking,
|
||||||
);
|
);
|
||||||
|
|
||||||
if (res) {
|
if (res) {
|
||||||
searchQuery = res.text.replace('"', "").replace("/", "") ?? options.prompt;
|
searchQuery = res.text.replace('"', "").replace("/", "") ?? options.prompt;
|
||||||
costTracking.otherCallCount++;
|
|
||||||
costTracking.otherCost += res.cost;
|
|
||||||
costTracking.totalCost += res.cost;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -223,13 +245,11 @@ export async function processUrl(
|
|||||||
try {
|
try {
|
||||||
const res = await generateBasicCompletion(
|
const res = await generateBasicCompletion(
|
||||||
buildPreRerankPrompt(rephrasedPrompt, options.schema, baseUrl),
|
buildPreRerankPrompt(rephrasedPrompt, options.schema, baseUrl),
|
||||||
|
costTracking,
|
||||||
);
|
);
|
||||||
|
|
||||||
if (res) {
|
if (res) {
|
||||||
rephrasedPrompt = res.text;
|
rephrasedPrompt = res.text;
|
||||||
costTracking.otherCallCount++;
|
|
||||||
costTracking.otherCost += res.cost;
|
|
||||||
costTracking.totalCost += res.cost;
|
|
||||||
} else {
|
} else {
|
||||||
rephrasedPrompt =
|
rephrasedPrompt =
|
||||||
"Extract the data according to the schema: " +
|
"Extract the data according to the schema: " +
|
||||||
@ -262,10 +282,8 @@ export async function processUrl(
|
|||||||
reasoning: options.reasoning,
|
reasoning: options.reasoning,
|
||||||
multiEntityKeys: options.multiEntityKeys,
|
multiEntityKeys: options.multiEntityKeys,
|
||||||
keyIndicators: options.keyIndicators,
|
keyIndicators: options.keyIndicators,
|
||||||
|
costTracking,
|
||||||
});
|
});
|
||||||
costTracking.otherCallCount++;
|
|
||||||
costTracking.otherCost += rerankerResult.cost;
|
|
||||||
costTracking.totalCost += rerankerResult.cost;
|
|
||||||
mappedLinks = rerankerResult.mapDocument;
|
mappedLinks = rerankerResult.mapDocument;
|
||||||
let tokensUsed = rerankerResult.tokensUsed;
|
let tokensUsed = rerankerResult.tokensUsed;
|
||||||
logger.info("Reranked! (pass 1)", {
|
logger.info("Reranked! (pass 1)", {
|
||||||
@ -283,10 +301,8 @@ export async function processUrl(
|
|||||||
reasoning: options.reasoning,
|
reasoning: options.reasoning,
|
||||||
multiEntityKeys: options.multiEntityKeys,
|
multiEntityKeys: options.multiEntityKeys,
|
||||||
keyIndicators: options.keyIndicators,
|
keyIndicators: options.keyIndicators,
|
||||||
|
costTracking,
|
||||||
});
|
});
|
||||||
costTracking.otherCallCount++;
|
|
||||||
costTracking.otherCost += rerankerResult.cost;
|
|
||||||
costTracking.totalCost += rerankerResult.cost;
|
|
||||||
mappedLinks = rerankerResult.mapDocument;
|
mappedLinks = rerankerResult.mapDocument;
|
||||||
tokensUsed += rerankerResult.tokensUsed;
|
tokensUsed += rerankerResult.tokensUsed;
|
||||||
logger.info("Reranked! (pass 2)", {
|
logger.info("Reranked! (pass 2)", {
|
||||||
|
@ -11,7 +11,7 @@ import { billTeam } from "../../services/billing/credit_billing";
|
|||||||
import { logJob } from "../../services/logging/log_job";
|
import { logJob } from "../../services/logging/log_job";
|
||||||
import { getModel } from "../generic-ai";
|
import { getModel } from "../generic-ai";
|
||||||
import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
|
import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
|
||||||
|
import { CostTracking } from "../extract/extraction-service";
|
||||||
interface GenerateLLMsTextServiceOptions {
|
interface GenerateLLMsTextServiceOptions {
|
||||||
generationId: string;
|
generationId: string;
|
||||||
teamId: string;
|
teamId: string;
|
||||||
@ -71,6 +71,7 @@ export async function performGenerateLlmsTxt(
|
|||||||
generationId,
|
generationId,
|
||||||
teamId,
|
teamId,
|
||||||
});
|
});
|
||||||
|
const costTracking = new CostTracking();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Enforce max URL limit
|
// Enforce max URL limit
|
||||||
@ -167,6 +168,13 @@ export async function performGenerateLlmsTxt(
|
|||||||
prompt: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${document.metadata?.url}. This will help in a user finding the page for its intended purpose.`,
|
prompt: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${document.metadata?.url}. This will help in a user finding the page for its intended purpose.`,
|
||||||
},
|
},
|
||||||
markdown: document.markdown,
|
markdown: document.markdown,
|
||||||
|
costTrackingOptions: {
|
||||||
|
costTracking,
|
||||||
|
metadata: {
|
||||||
|
module: "generate-llmstxt",
|
||||||
|
method: "generateDescription",
|
||||||
|
},
|
||||||
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@ -229,6 +237,7 @@ export async function performGenerateLlmsTxt(
|
|||||||
num_tokens: 0,
|
num_tokens: 0,
|
||||||
tokens_billed: 0,
|
tokens_billed: 0,
|
||||||
sources: {},
|
sources: {},
|
||||||
|
cost_tracking: costTracking,
|
||||||
});
|
});
|
||||||
|
|
||||||
// Bill team for usage
|
// Bill team for usage
|
||||||
|
@ -17,14 +17,17 @@ import {
|
|||||||
} from "../scraper/scrapeURL";
|
} from "../scraper/scrapeURL";
|
||||||
import { Engine } from "../scraper/scrapeURL/engines";
|
import { Engine } from "../scraper/scrapeURL/engines";
|
||||||
import { indexPage } from "../lib/extract/index/pinecone";
|
import { indexPage } from "../lib/extract/index/pinecone";
|
||||||
|
import { CostTracking } from "../lib/extract/extraction-service";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
export async function startWebScraperPipeline({
|
export async function startWebScraperPipeline({
|
||||||
job,
|
job,
|
||||||
token,
|
token,
|
||||||
|
costTracking,
|
||||||
}: {
|
}: {
|
||||||
job: Job<WebScraperOptions> & { id: string };
|
job: Job<WebScraperOptions> & { id: string };
|
||||||
token: string;
|
token: string;
|
||||||
|
costTracking: CostTracking;
|
||||||
}) {
|
}) {
|
||||||
return await runWebScraper({
|
return await runWebScraper({
|
||||||
url: job.data.url,
|
url: job.data.url,
|
||||||
@ -52,6 +55,7 @@ export async function startWebScraperPipeline({
|
|||||||
is_scrape: job.data.is_scrape ?? false,
|
is_scrape: job.data.is_scrape ?? false,
|
||||||
is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null),
|
is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null),
|
||||||
urlInvisibleInCurrentCrawl: job.data.crawlerOptions?.urlInvisibleInCurrentCrawl ?? false,
|
urlInvisibleInCurrentCrawl: job.data.crawlerOptions?.urlInvisibleInCurrentCrawl ?? false,
|
||||||
|
costTracking,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -68,6 +72,7 @@ export async function runWebScraper({
|
|||||||
is_scrape = false,
|
is_scrape = false,
|
||||||
is_crawl = false,
|
is_crawl = false,
|
||||||
urlInvisibleInCurrentCrawl = false,
|
urlInvisibleInCurrentCrawl = false,
|
||||||
|
costTracking,
|
||||||
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
|
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
|
||||||
const logger = _logger.child({
|
const logger = _logger.child({
|
||||||
method: "runWebScraper",
|
method: "runWebScraper",
|
||||||
@ -101,7 +106,7 @@ export async function runWebScraper({
|
|||||||
...internalOptions,
|
...internalOptions,
|
||||||
urlInvisibleInCurrentCrawl,
|
urlInvisibleInCurrentCrawl,
|
||||||
teamId: internalOptions?.teamId ?? team_id,
|
teamId: internalOptions?.teamId ?? team_id,
|
||||||
});
|
}, costTracking);
|
||||||
if (!response.success) {
|
if (!response.success) {
|
||||||
if (response.error instanceof Error) {
|
if (response.error instanceof Error) {
|
||||||
throw response.error;
|
throw response.error;
|
||||||
|
@ -3,6 +3,7 @@ import { WebCrawler } from "./crawler";
|
|||||||
import { scrapeURL } from "../scrapeURL";
|
import { scrapeURL } from "../scrapeURL";
|
||||||
import { scrapeOptions, TimeoutSignal } from "../../controllers/v1/types";
|
import { scrapeOptions, TimeoutSignal } from "../../controllers/v1/types";
|
||||||
import type { Logger } from "winston";
|
import type { Logger } from "winston";
|
||||||
|
import { CostTracking } from "../../lib/extract/extraction-service";
|
||||||
const useFireEngine =
|
const useFireEngine =
|
||||||
process.env.FIRE_ENGINE_BETA_URL !== "" &&
|
process.env.FIRE_ENGINE_BETA_URL !== "" &&
|
||||||
process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
||||||
@ -49,6 +50,7 @@ export async function getLinksFromSitemap(
|
|||||||
abort,
|
abort,
|
||||||
teamId: "sitemap",
|
teamId: "sitemap",
|
||||||
},
|
},
|
||||||
|
new CostTracking(),
|
||||||
);
|
);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
|
@ -26,6 +26,7 @@ import { executeTransformers } from "./transformers";
|
|||||||
import { LLMRefusalError } from "./transformers/llmExtract";
|
import { LLMRefusalError } from "./transformers/llmExtract";
|
||||||
import { urlSpecificParams } from "./lib/urlSpecificParams";
|
import { urlSpecificParams } from "./lib/urlSpecificParams";
|
||||||
import { loadMock, MockState } from "./lib/mock";
|
import { loadMock, MockState } from "./lib/mock";
|
||||||
|
import { CostTracking } from "../../lib/extract/extraction-service";
|
||||||
|
|
||||||
export type ScrapeUrlResponse = (
|
export type ScrapeUrlResponse = (
|
||||||
| {
|
| {
|
||||||
@ -55,6 +56,7 @@ export type Meta = {
|
|||||||
url?: string;
|
url?: string;
|
||||||
status: number;
|
status: number;
|
||||||
} | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
|
} | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
|
||||||
|
costTracking: CostTracking;
|
||||||
};
|
};
|
||||||
|
|
||||||
function buildFeatureFlags(
|
function buildFeatureFlags(
|
||||||
@ -127,6 +129,7 @@ async function buildMetaObject(
|
|||||||
url: string,
|
url: string,
|
||||||
options: ScrapeOptions,
|
options: ScrapeOptions,
|
||||||
internalOptions: InternalOptions,
|
internalOptions: InternalOptions,
|
||||||
|
costTracking: CostTracking,
|
||||||
): Promise<Meta> {
|
): Promise<Meta> {
|
||||||
const specParams =
|
const specParams =
|
||||||
urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
|
urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
|
||||||
@ -158,6 +161,7 @@ async function buildMetaObject(
|
|||||||
? await loadMock(options.useMock, _logger)
|
? await loadMock(options.useMock, _logger)
|
||||||
: null,
|
: null,
|
||||||
pdfPrefetch: undefined,
|
pdfPrefetch: undefined,
|
||||||
|
costTracking,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -389,8 +393,9 @@ export async function scrapeURL(
|
|||||||
url: string,
|
url: string,
|
||||||
options: ScrapeOptions,
|
options: ScrapeOptions,
|
||||||
internalOptions: InternalOptions,
|
internalOptions: InternalOptions,
|
||||||
|
costTracking: CostTracking,
|
||||||
): Promise<ScrapeUrlResponse> {
|
): Promise<ScrapeUrlResponse> {
|
||||||
const meta = await buildMetaObject(id, url, options, internalOptions);
|
const meta = await buildMetaObject(id, url, options, internalOptions, costTracking);
|
||||||
try {
|
try {
|
||||||
while (true) {
|
while (true) {
|
||||||
try {
|
try {
|
||||||
|
@ -10,8 +10,7 @@ import { parseMarkdown } from "../../../lib/html-to-markdown";
|
|||||||
import { getModel } from "../../../lib/generic-ai";
|
import { getModel } from "../../../lib/generic-ai";
|
||||||
import { TokenUsage } from "../../../controllers/v1/types";
|
import { TokenUsage } from "../../../controllers/v1/types";
|
||||||
import type { SmartScrapeResult } from "./smartScrape";
|
import type { SmartScrapeResult } from "./smartScrape";
|
||||||
import { ExtractStep } from "src/lib/extract/extract-redis";
|
import { CostTracking } from "../../../lib/extract/extraction-service";
|
||||||
|
|
||||||
const commonSmartScrapeProperties = {
|
const commonSmartScrapeProperties = {
|
||||||
       shouldUseSmartscrape: {
         type: "boolean",
@@ -225,26 +224,16 @@ export async function extractData({
 }): Promise<{
   extractedDataArray: any[];
   warning: any;
-  smartScrapeCallCount: number;
-  otherCallCount: number;
-  smartScrapeCost: number;
-  otherCost: number;
   costLimitExceededTokenUsage: number | null;
 }> {
   let schema = extractOptions.options.schema;
   const logger = extractOptions.logger;
   const isSingleUrl = urls.length === 1;
-  let smartScrapeCost = 0;
-  let otherCost = 0;
-  let smartScrapeCallCount = 0;
-  let otherCallCount = 0;
   let costLimitExceededTokenUsage: number | null = null;
   // TODO: remove the "required" fields here!! it breaks o3-mini

   if (!schema && extractOptions.options.prompt) {
-    const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger);
-    otherCallCount++;
-    otherCost += genRes.cost;
+    const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger, extractOptions.costTrackingOptions.costTracking);
     schema = genRes.extract;
   }

@@ -278,17 +267,22 @@ export async function extractData({
       extract: e,
       warning: w,
       totalUsage: t,
-      cost: c,
     } = await generateCompletions({
       ...extractOptionsNewSchema,
       model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
       retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"),
+      costTrackingOptions: {
+        costTracking: extractOptions.costTrackingOptions.costTracking,
+        metadata: {
+          module: "scrapeURL",
+          method: "extractData",
+          description: "Check if using smartScrape is needed for this case"
+        },
+      },
     });
     extract = e;
     warning = w;
     totalUsage = t;
-    otherCost += c;
-    otherCallCount++;
   } catch (error) {
     logger.error(
       "failed during extractSmartScrape.ts:generateCompletions",
@@ -321,10 +315,9 @@ export async function extractData({
          sessionId,
          extractId,
          scrapeId,
+         costTracking: extractOptions.costTrackingOptions.costTracking,
        }),
      ];
-     smartScrapeCost += smartscrapeResults[0].tokenUsage;
-     smartScrapeCallCount++;
    } else {
      const pages = extract?.smartscrapePages ?? [];
      //do it async promiseall instead
@@ -344,14 +337,10 @@ export async function extractData({
            sessionId,
            extractId,
            scrapeId,
+           costTracking: extractOptions.costTrackingOptions.costTracking,
          });
        }),
      );
-     smartScrapeCost += smartscrapeResults.reduce(
-       (acc, result) => acc + result.tokenUsage,
-       0,
-     );
-     smartScrapeCallCount += smartscrapeResults.length;
    }
    // console.log("smartscrapeResults", smartscrapeResults);

@@ -372,11 +361,17 @@ export async function extractData({
        markdown: markdown,
        model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
        retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"),
+       costTrackingOptions: {
+         costTracking: extractOptions.costTrackingOptions.costTracking,
+         metadata: {
+           module: "scrapeURL",
+           method: "extractData",
+           description: "Extract data from markdown (smart-scape results)",
+         },
+       },
      };
-     const { extract, warning, totalUsage, model, cost } =
+     const { extract } =
        await generateCompletions(newExtractOptions);
-     otherCost += cost;
-     otherCallCount++;
      return extract;
    }),
  );
@@ -399,10 +394,6 @@ export async function extractData({
   return {
     extractedDataArray: extractedData,
     warning: warning,
-    smartScrapeCallCount: smartScrapeCallCount,
-    otherCallCount: otherCallCount,
-    smartScrapeCost: smartScrapeCost,
-    otherCost: otherCost,
     costLimitExceededTokenUsage: costLimitExceededTokenUsage,
   };
 }
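
Note on the hunks above and below: rather than returning per-call cost and call-count fields, every LLM and smart-scrape site now records itself on a shared CostTracking instance imported from lib/extract/extraction-service. That class is not part of this diff, so the following is only a hedged sketch of the interface these call sites assume, inferred from the new CostTracking() and addCall({ type, cost, tokens, metadata }) usages in this commit; the real implementation may differ.

// Hypothetical sketch only; the actual class lives in lib/extract/extraction-service
// and is not shown in this commit.
type CostTrackingCall = {
  type: "smartScrape" | "other";
  cost: number;                                // cost of this single call
  tokens?: { input: number; output: number };  // optional token counts
  metadata?: Record<string, any>;              // e.g. { module, method, gcDetails }
};

class CostTrackingSketch {
  private calls: CostTrackingCall[] = [];

  // Every call site in this diff appends one entry instead of summing costs by hand.
  addCall(call: CostTrackingCall): void {
    this.calls.push(call);
  }

  // Roll-up comparable to the old { smartScrapeCost, otherCost, totalCost } shape.
  toJSON() {
    const costOf = (type: CostTrackingCall["type"]) =>
      this.calls.filter((c) => c.type === type).reduce((acc, c) => acc + c.cost, 0);
    return {
      calls: this.calls,
      smartScrapeCallCount: this.calls.filter((c) => c.type === "smartScrape").length,
      otherCallCount: this.calls.filter((c) => c.type === "other").length,
      smartScrapeCost: costOf("smartScrape"),
      otherCost: costOf("other"),
      totalCost: costOf("smartScrape") + costOf("other"),
    };
  }
}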

@@ -3,7 +3,7 @@ import { logger as _logger } from "../../../lib/logger";
 import { robustFetch } from "./fetch";
 import fs from "fs/promises";
 import { configDotenv } from "dotenv";
+import { CostTracking } from "../../../lib/extract/extraction-service";
 configDotenv();

 // Define schemas outside the function scope
@@ -52,6 +52,7 @@ export async function smartScrape({
   extractId,
   scrapeId,
   beforeSubmission,
+  costTracking,
 }: {
   url: string,
   prompt: string,
@@ -59,6 +60,7 @@ export async function smartScrape({
   extractId?: string,
   scrapeId?: string,
   beforeSubmission?: () => unknown,
+  costTracking: CostTracking,
 }): Promise<SmartScrapeResult> {
   let logger = _logger.child({
     method: "smartScrape",
@@ -139,6 +141,16 @@ export async function smartScrape({
     });

     logger.info("Smart scrape cost $" + response.tokenUsage);
+    costTracking.addCall({
+      type: "smartScrape",
+      cost: response.tokenUsage,
+      metadata: {
+        module: "smartScrape",
+        method: "smartScrape",
+        url,
+        sessionId,
+      },
+    });

     return response; // The response type now matches SmartScrapeResult
   } catch (error) {
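
A minimal usage sketch of the updated smartScrape signature, using only parameters visible in the hunks above; the URL and prompt values are placeholders, not taken from the codebase.

// Illustrative only: one shared CostTracking instance threaded through a smart scrape.
const costTracking = new CostTracking();

const result = await smartScrape({
  url: "https://example.com/pricing",  // placeholder URL
  prompt: "Expand the FAQ section",    // placeholder prompt
  costTracking,                        // now required; smartScrape calls costTracking.addCall itself
});

// costTracking now contains a call of type "smartScrape" with cost = result.tokenUsage.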

@@ -5,6 +5,7 @@ process.env.ENV = "test";
 import { scrapeURL } from ".";
 import { scrapeOptions } from "../../controllers/v1/types";
 import { Engine } from "./engines";
+import { CostTracking } from "../../lib/extract/extraction-service";

 const testEngines: (Engine | undefined)[] = [
   undefined,
@@ -32,6 +33,7 @@ describe("Standalone scrapeURL tests", () => {
         "https://www.roastmywebsite.ai/",
         scrapeOptions.parse({}),
         { forceEngine, teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -79,6 +81,7 @@ describe("Standalone scrapeURL tests", () => {
           formats: ["markdown", "html"],
         }),
         { forceEngine, teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -103,6 +106,7 @@ describe("Standalone scrapeURL tests", () => {
           onlyMainContent: false,
         }),
         { forceEngine, teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -126,6 +130,7 @@ describe("Standalone scrapeURL tests", () => {
           excludeTags: [".nav", "#footer", "strong"],
         }),
         { forceEngine, teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -146,6 +151,7 @@ describe("Standalone scrapeURL tests", () => {
         "https://httpstat.us/400",
         scrapeOptions.parse({}),
         { forceEngine, teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -164,6 +170,7 @@ describe("Standalone scrapeURL tests", () => {
         "https://httpstat.us/401",
         scrapeOptions.parse({}),
         { forceEngine, teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -182,6 +189,7 @@ describe("Standalone scrapeURL tests", () => {
         "https://httpstat.us/403",
         scrapeOptions.parse({}),
         { forceEngine, teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -200,6 +208,7 @@ describe("Standalone scrapeURL tests", () => {
         "https://httpstat.us/404",
         scrapeOptions.parse({}),
         { forceEngine, teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -218,6 +227,7 @@ describe("Standalone scrapeURL tests", () => {
         "https://httpstat.us/405",
         scrapeOptions.parse({}),
         { forceEngine, teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -236,6 +246,7 @@ describe("Standalone scrapeURL tests", () => {
         "https://httpstat.us/500",
         scrapeOptions.parse({}),
         { forceEngine, teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -254,6 +265,7 @@ describe("Standalone scrapeURL tests", () => {
         "https://scrapethissite.com/",
         scrapeOptions.parse({}),
         { forceEngine, teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -286,6 +298,7 @@ describe("Standalone scrapeURL tests", () => {
           formats: ["screenshot"],
         }),
         { forceEngine, teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -314,6 +327,7 @@ describe("Standalone scrapeURL tests", () => {
           formats: ["screenshot@fullPage"],
         }),
         { forceEngine, teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -342,6 +356,7 @@ describe("Standalone scrapeURL tests", () => {
         "https://arxiv.org/pdf/astro-ph/9301001.pdf",
         scrapeOptions.parse({}),
         { teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -361,6 +376,7 @@ describe("Standalone scrapeURL tests", () => {
         "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx",
         scrapeOptions.parse({}),
         { teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -398,6 +414,7 @@ describe("Standalone scrapeURL tests", () => {
           },
         }),
         { teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -434,6 +451,7 @@ describe("Standalone scrapeURL tests", () => {
           },
         }),
         { teamId: "test" },
+        new CostTracking(),
       );

       // expect(out.logs.length).toBeGreaterThan(0);
@@ -455,7 +473,7 @@ describe("Standalone scrapeURL tests", () => {
       async (i) => {
         const url = "https://www.scrapethissite.com/?i=" + i;
         const id = "test:concurrent:" + url;
-        const out = await scrapeURL(id, url, scrapeOptions.parse({}), { teamId: "test" });
+        const out = await scrapeURL(id, url, scrapeOptions.parse({}), { teamId: "test" }, new CostTracking());

         const replacer = (key: string, value: any) => {
           if (value instanceof Error) {

@@ -30,6 +30,7 @@ export async function performAgent(
       prompt,
       sessionId,
       scrapeId: meta.id,
+      costTracking: meta.costTracking,
     })
   } catch (error) {
     if (error instanceof Error && error.message === "Cost limit exceeded") {
@@ -50,20 +51,6 @@ export async function performAgent(
     if (meta.options.formats.includes("html")) {
       document.html = html
     }
-
-    if (document.metadata.costTracking) {
-      document.metadata.costTracking.smartScrapeCallCount++;
-      document.metadata.costTracking.smartScrapeCost = document.metadata.costTracking.smartScrapeCost + smartscrapeResults.tokenUsage;
-      document.metadata.costTracking.totalCost = document.metadata.costTracking.totalCost + smartscrapeResults.tokenUsage;
-    } else {
-      document.metadata.costTracking = {
-        smartScrapeCallCount: 1,
-        smartScrapeCost: smartscrapeResults.tokenUsage,
-        otherCallCount: 0,
-        otherCost: 0,
-        totalCost: smartscrapeResults.tokenUsage,
-      }
-    }
   }

   return document;

@@ -6,9 +6,9 @@ import gitDiff from 'git-diff';
 import parseDiff from 'parse-diff';
 import { generateCompletions } from "./llmExtract";

-async function extractDataWithSchema(content: string, meta: Meta): Promise<{ extract: any, cost: number } | null> {
+async function extractDataWithSchema(content: string, meta: Meta): Promise<{ extract: any } | null> {
   try {
-    const { extract, cost } = await generateCompletions({
+    const { extract } = await generateCompletions({
       logger: meta.logger.child({
         method: "extractDataWithSchema/generateCompletions",
       }),
@@ -18,9 +18,16 @@ async function extractDataWithSchema(content: string, meta: Meta): Promise<{ ext
         systemPrompt: "Extract the requested information from the content based on the provided schema.",
         temperature: 0
       },
-      markdown: content
+      markdown: content,
+      costTrackingOptions: {
+        costTracking: meta.costTracking,
+        metadata: {
+          module: "extract",
+          method: "extractDataWithSchema",
+        },
+      },
     });
-    return { extract, cost };
+    return { extract };
   } catch (error) {
     meta.logger.error("Error extracting data with schema", { error });
     return null;
@@ -145,19 +152,6 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Docume

     if (previousData && currentData) {
       document.changeTracking.json = compareExtractedData(previousData.extract, currentData.extract);
-
-      if (document.metadata.costTracking) {
-        document.metadata.costTracking.otherCallCount += 2;
-        document.metadata.costTracking.otherCost = document.metadata.costTracking.otherCost + previousData.cost + currentData.cost;
-      } else {
-        document.metadata.costTracking = {
-          smartScrapeCallCount: 0,
-          smartScrapeCost: 0,
-          otherCallCount: 2,
-          otherCost: previousData.cost + currentData.cost,
-          totalCost: previousData.cost + currentData.cost
-        }
-      }
     } else {
       const { extract } = await generateCompletions({
         logger: meta.logger.child({
@@ -171,7 +165,14 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Docume
           temperature: 0
         },
         markdown: `Previous Content:\n${previousMarkdown}\n\nCurrent Content:\n${currentMarkdown}`,
-        previousWarning: document.warning
+        previousWarning: document.warning,
+        costTrackingOptions: {
+          costTracking: meta.costTracking,
+          metadata: {
+            module: "diff",
+            method: "deriveDiff",
+          },
+        },
       });

       document.changeTracking.json = extract;

@@ -11,6 +11,7 @@ import { EngineResultsTracker, Meta } from "..";
 import { logger } from "../../../lib/logger";
 import { modelPrices } from "../../../lib/extract/usage/model-prices";
 import {
+  AISDKError,
   generateObject,
   generateText,
   LanguageModel,
@@ -22,7 +23,7 @@ import { z } from "zod";
 import fs from "fs/promises";
 import Ajv from "ajv";
 import { extractData } from "../lib/extractSmartScrape";
+import { CostTracking } from "../../../lib/extract/extraction-service";
 // TODO: fix this, it's horrible
 type LanguageModelV1ProviderMetadata = {
   anthropic?: {
@@ -231,6 +232,10 @@ export type GenerateCompletionsOptions = {
   mode?: "object" | "no-object";
   providerOptions?: LanguageModelV1ProviderMetadata;
   retryModel?: LanguageModel;
+  costTrackingOptions: {
+    costTracking: CostTracking;
+    metadata: Record<string, any>;
+  };
 };
 export async function generateCompletions({
   logger,
@@ -242,13 +247,13 @@ export async function generateCompletions({
   mode = "object",
   providerOptions,
   retryModel = getModel("claude-3-5-sonnet-20240620", "anthropic"),
+  costTrackingOptions,
 }: GenerateCompletionsOptions): Promise<{
   extract: any;
   numTokens: number;
   warning: string | undefined;
   totalUsage: TokenUsage;
   model: string;
-  cost: number;
 }> {
   let extract: any;
   let warning: string | undefined;
@@ -278,6 +283,19 @@ export async function generateCompletions({
         },
       });

+      costTrackingOptions.costTracking.addCall({
+        type: "other",
+        metadata: {
+          ...costTrackingOptions.metadata,
+          gcDetails: "no-object",
+        },
+        cost: calculateCost(
+          currentModel.modelId,
+          result.usage?.promptTokens ?? 0,
+          result.usage?.completionTokens ?? 0,
+        ),
+      });
+
       extract = result.text;

       return {
@@ -290,11 +308,6 @@ export async function generateCompletions({
           totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0),
         },
         model: currentModel.modelId,
-        cost: calculateCost(
-          currentModel.modelId,
-          result.usage?.promptTokens ?? 0,
-          result.usage?.completionTokens ?? 0,
-        ),
       };
     } catch (error) {
       lastError = error as Error;
@@ -321,6 +334,19 @@ export async function generateCompletions({

         extract = result.text;

+        costTrackingOptions.costTracking.addCall({
+          type: "other",
+          metadata: {
+            ...costTrackingOptions.metadata,
+            gcDetails: "no-object fallback",
+          },
+          cost: calculateCost(
+            currentModel.modelId,
+            result.usage?.promptTokens ?? 0,
+            result.usage?.completionTokens ?? 0,
+          ),
+        });
+
         return {
           extract,
           warning,
@@ -331,11 +357,6 @@ export async function generateCompletions({
             totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0),
           },
           model: currentModel.modelId,
-          cost: calculateCost(
-            currentModel.modelId,
-            result.usage?.promptTokens ?? 0,
-            result.usage?.completionTokens ?? 0,
-          ),
         };
       } catch (retryError) {
         lastError = retryError as Error;
@@ -410,7 +431,7 @@ export async function generateCompletions({
       }

       try {
-        const { text: fixedText } = await generateText({
+        const { text: fixedText, usage: repairUsage } = await generateText({
           model: currentModel,
           prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`,
           system:
@@ -421,6 +442,23 @@ export async function generateCompletions({
             },
           },
         });
+
+        costTrackingOptions.costTracking.addCall({
+          type: "other",
+          metadata: {
+            ...costTrackingOptions.metadata,
+            gcDetails: "repairConfig",
+          },
+          cost: calculateCost(
+            currentModel.modelId,
+            repairUsage?.promptTokens ?? 0,
+            repairUsage?.completionTokens ?? 0,
+          ),
+          tokens: {
+            input: repairUsage?.promptTokens ?? 0,
+            output: repairUsage?.completionTokens ?? 0,
+          },
+        });
         logger.debug("Repaired text with LLM");
         return fixedText;
       } catch (repairError) {
@@ -464,6 +502,23 @@ export async function generateCompletions({
   let result: { object: any; usage: TokenUsage } | undefined;
   try {
     result = await generateObject(generateObjectConfig);
+    costTrackingOptions.costTracking.addCall({
+      type: "other",
+      metadata: {
+        ...costTrackingOptions.metadata,
+        gcDetails: "generateObject",
+        gcModel: generateObjectConfig.model.modelId,
+      },
+      tokens: {
+        input: result.usage?.promptTokens ?? 0,
+        output: result.usage?.completionTokens ?? 0,
+      },
+      cost: calculateCost(
+        currentModel.modelId,
+        result.usage?.promptTokens ?? 0,
+        result.usage?.completionTokens ?? 0,
+      ),
+    });
   } catch (error) {
     lastError = error as Error;
     if (
@@ -481,6 +536,23 @@ export async function generateCompletions({
         model: currentModel,
       };
       result = await generateObject(retryConfig);
+      costTrackingOptions.costTracking.addCall({
+        type: "other",
+        metadata: {
+          ...costTrackingOptions.metadata,
+          gcDetails: "generateObject fallback",
+          gcModel: retryConfig.model.modelId,
+        },
+        tokens: {
+          input: result.usage?.promptTokens ?? 0,
+          output: result.usage?.completionTokens ?? 0,
+        },
+        cost: calculateCost(
+          currentModel.modelId,
+          result.usage?.promptTokens ?? 0,
+          result.usage?.completionTokens ?? 0,
+        ),
+      });
     } catch (retryError) {
       lastError = retryError as Error;
       logger.error("Failed with fallback model", {
@@ -549,7 +621,6 @@ export async function generateCompletions({
       totalTokens: promptTokens + completionTokens,
     },
     model: currentModel.modelId,
-    cost: calculateCost(currentModel.modelId, promptTokens, completionTokens),
   };
 } catch (error) {
   lastError = error as Error;
@@ -589,9 +660,16 @@ export async function performLLMExtract(
     // model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
     model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
     retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"),
+    costTrackingOptions: {
+      costTracking: meta.costTracking,
+      metadata: {
+        module: "scrapeURL",
+        method: "performLLMExtract",
+      },
+    },
   };

-  const { extractedDataArray, warning, smartScrapeCost, otherCost, costLimitExceededTokenUsage } =
+  const { extractedDataArray, warning, costLimitExceededTokenUsage } =
     await extractData({
       extractOptions: generationOptions,
       urls: [meta.url],
@@ -603,25 +681,6 @@ export async function performLLMExtract(
     document.warning = warning + (document.warning ? " " + document.warning : "");
   }

-  if (document.metadata.costTracking) {
-    document.metadata.costTracking.smartScrapeCallCount++;
-    document.metadata.costTracking.smartScrapeCost += smartScrapeCost;
-    document.metadata.costTracking.otherCallCount++;
-    document.metadata.costTracking.otherCost += otherCost;
-    document.metadata.costTracking.totalCost += smartScrapeCost + otherCost;
-    if (costLimitExceededTokenUsage) {
-      document.metadata.costTracking.costLimitExceededTokenUsage = costLimitExceededTokenUsage;
-    }
-  } else {
-    document.metadata.costTracking = {
-      smartScrapeCallCount: 1,
-      smartScrapeCost: smartScrapeCost,
-      otherCallCount: 1,
-      otherCost: otherCost,
-      totalCost: smartScrapeCost + otherCost,
-    };
-  }
-
   // IMPORTANT: here it only get's the last page!!!
   const extractedData =
     extractedDataArray[extractedDataArray.length - 1] ?? undefined;
@@ -758,7 +817,8 @@ export function removeDefaultProperty(schema: any): any {
 export async function generateSchemaFromPrompt(
   prompt: string,
   logger: Logger,
-): Promise<{ extract: any; cost: number }> {
+  costTracking: CostTracking,
+): Promise<{ extract: any }> {
   const model = getModel("gpt-4o", "openai");
   const retryModel = getModel("gpt-4o-mini", "openai");
   const temperatures = [0, 0.1, 0.3]; // Different temperatures to try
@@ -766,7 +826,7 @@ export async function generateSchemaFromPrompt(

   for (const temp of temperatures) {
     try {
-      const { extract, cost } = await generateCompletions({
+      const { extract } = await generateCompletions({
         logger: logger.child({
           method: "generateSchemaFromPrompt/generateCompletions",
         }),
@@ -802,10 +862,16 @@ Return a valid JSON schema object with properties that would capture the informa
           prompt: `Generate a JSON schema for extracting the following information: ${prompt}`,
           // temperature: temp,
         },
-        markdown: prompt,
+        costTrackingOptions: {
+          costTracking,
+          metadata: {
+            module: "scrapeURL",
+            method: "generateSchemaFromPrompt",
+          },
+        },
       });

-      return { extract, cost };
+      return { extract };
     } catch (error) {
       lastError = error as Error;
       logger.warn(`Failed attempt with temperature ${temp}: ${error.message}`);
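
Since costTrackingOptions is a required field of GenerateCompletionsOptions after this change, every caller has to pass an accumulator plus some metadata. Below is a hedged sketch of the minimal call shape; the prompt, schema, and markdown values are placeholders, and the exact shape of the options object may differ from the real ExtractOptions type.

// Illustrative call shape only; values are placeholders.
const costTracking = new CostTracking();

const { extract, totalUsage, model } = await generateCompletions({
  logger: logger.child({ method: "example/generateCompletions" }),
  options: {
    systemPrompt: "Extract the page title.",                                // placeholder
    schema: { type: "object", properties: { title: { type: "string" } } },  // placeholder
    temperature: 0,
  },
  markdown: "# Example page",  // placeholder input
  costTrackingOptions: {
    costTracking,                                              // shared accumulator for the whole job
    metadata: { module: "example", method: "exampleCaller" },  // merged into every addCall entry
  },
});

// The model cost is recorded on costTracking via addCall; the return value no
// longer carries a `cost` field, since it was removed from the Promise type above.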

@@ -75,6 +75,7 @@ import { performDeepResearch } from "../lib/deep-research/deep-research-service"
 import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-service";
 import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis";
 import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f0";
+import { CostTracking } from "../lib/extract/extraction-service";

 configDotenv();

@@ -1010,6 +1011,7 @@ async function processJob(job: Job & { id: string }, token: string) {
   // };
   // return data;
   // }
+  const costTracking = new CostTracking();

   try {
     job.updateProgress({
@@ -1030,6 +1032,7 @@ async function processJob(job: Job & { id: string }, token: string) {
       startWebScraperPipeline({
         job,
         token,
+        costTracking,
       }),
       ...(job.data.scrapeOptions.timeout !== undefined
         ? [
@@ -1171,6 +1174,7 @@ async function processJob(job: Job & { id: string }, token: string) {
          scrapeOptions: job.data.scrapeOptions,
          origin: job.data.origin,
          crawl_id: job.data.crawl_id,
+         cost_tracking: costTracking,
        },
        true,
      );
@@ -1276,10 +1280,6 @@ async function processJob(job: Job & { id: string }, token: string) {

       await finishCrawlIfNeeded(job, sc);
     } else {
-      const cost_tracking = doc?.metadata?.costTracking;
-
-      delete doc.metadata.costTracking;
-
       await logJob({
         job_id: job.id,
         success: true,
@@ -1293,7 +1293,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         scrapeOptions: job.data.scrapeOptions,
         origin: job.data.origin,
         num_tokens: 0, // TODO: fix
-        cost_tracking,
+        cost_tracking: costTracking,
       });

       indexJob(job, doc);
@@ -1442,6 +1442,7 @@ async function processJob(job: Job & { id: string }, token: string) {
        scrapeOptions: job.data.scrapeOptions,
        origin: job.data.origin,
        crawl_id: job.data.crawl_id,
+       cost_tracking: costTracking,
      },
      true,
    );

@@ -62,6 +62,7 @@ export interface RunWebScraperParams {
   is_scrape?: boolean;
   is_crawl?: boolean;
   urlInvisibleInCurrentCrawl?: boolean;
+  costTracking: CostTracking;
 }

 export type RunWebScraperResult =
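
Putting the worker-side hunks together, the lifecycle in this commit is roughly: create one CostTracking per job, thread it through the scrape pipeline (and, via RunWebScraperParams, into scrapeURL), then persist the whole object with logJob. The following is a condensed sketch using only identifiers that appear above; error handling and the other logJob fields are omitted.

// Condensed sketch of the queue-worker flow shown in this diff; not the full function.
async function processJobSketch(job: Job & { id: string }, token: string) {
  const costTracking = new CostTracking();  // one accumulator per job

  // costTracking is handed to the pipeline, which forwards it to scrapeURL and
  // from there to every generateCompletions / smartScrape call.
  const result = await startWebScraperPipeline({ job, token, costTracking });

  await logJob({
    job_id: job.id,
    success: true,
    scrapeOptions: job.data.scrapeOptions,
    origin: job.data.origin,
    num_tokens: 0, // TODO: fix (as in the original)
    cost_tracking: costTracking,  // the full call log is persisted, not a hand-rolled summary
  });

  return result;
}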