mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-07-31 05:32:01 +08:00

new cost tracking

Replaces the flat CostTracking type and the ad-hoc smartScrapeCost/otherCost counters with a CostTracking class that is threaded through the scrape, search, extract, deep-research, and generate-llmstxt pipelines; each LLM call is recorded via addCall() and the aggregate totals are derived in toJSON() for job logging.

This commit is contained in:
parent ba4df67de7
commit 8546bcacc0
@@ -22,6 +22,7 @@ import { getJobPriority } from "../../lib/job-priority";
 import { addScrapeJobs } from "../../services/queue-jobs";
 import { callWebhook } from "../../services/webhook";
 import { logger as _logger } from "../../lib/logger";
+import { CostTracking } from "../../lib/extract/extraction-service";
 
 export async function batchScrapeController(
   req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
@@ -15,6 +15,7 @@ import { getJobPriority } from "../../lib/job-priority";
 import { getScrapeQueue } from "../../services/queue-service";
 import { getJob } from "./crawl-status";
 import { getJobFromGCS } from "../../lib/gcs-jobs";
+import { CostTracking } from "src/lib/extract/extraction-service";
 
 export async function scrapeController(
   req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
@@ -128,12 +129,6 @@ export async function scrapeController(
     }
   }
 
-  const cost_tracking = doc?.metadata?.costTracking;
-
-  if (doc && doc.metadata) {
-    delete doc.metadata.costTracking;
-  }
-
   return res.status(200).json({
     success: true,
     data: doc,
@@ -21,6 +21,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
 import { logger as _logger } from "../../lib/logger";
 import type { Logger } from "winston";
 import { getJobFromGCS } from "../../lib/gcs-jobs";
+import { CostTracking } from "../../lib/extract/extraction-service";
 
 // Used for deep research
 export async function searchAndScrapeSearchResult(
@@ -32,6 +33,7 @@ export async function searchAndScrapeSearchResult(
     scrapeOptions: ScrapeOptions;
   },
   logger: Logger,
+  costTracking: CostTracking,
 ): Promise<Document[]> {
   try {
     const searchResults = await search({
@@ -48,7 +50,8 @@ export async function searchAndScrapeSearchResult(
           description: result.description
         },
         options,
-        logger
+        logger,
+        costTracking
       )
     )
   );
@@ -68,6 +71,7 @@ async function scrapeSearchResult(
     scrapeOptions: ScrapeOptions;
   },
   logger: Logger,
+  costTracking: CostTracking,
 ): Promise<Document> {
   const jobId = uuidv4();
   const jobPriority = await getJobPriority({
@@ -220,6 +224,8 @@ export async function searchController(
     });
   }
 
+  const costTracking = new CostTracking();
+
   // Scrape each non-blocked result, handling timeouts individually
   logger.info("Scraping search results");
   const scrapePromises = searchResults.map((result) =>
@@ -228,7 +234,7 @@ export async function searchController(
       origin: req.body.origin,
       timeout: req.body.timeout,
       scrapeOptions: req.body.scrapeOptions,
-    }, logger),
+    }, logger, costTracking),
   );
 
   const docs = await Promise.all(scrapePromises);
@@ -279,6 +285,7 @@ export async function searchController(
     mode: "search",
     url: req.body.query,
     origin: req.body.origin,
+    cost_tracking: costTracking,
   });
 
   return res.status(200).json({
@@ -739,7 +739,6 @@ export type Document = {
     statusCode: number;
     scrapeId?: string;
     error?: string;
-    costTracking?: CostTracking;
     // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
   };
   serpResults?: {
@@ -5,6 +5,7 @@ import { ResearchLLMService, ResearchStateManager } from "./research-manager";
 import { logJob } from "../../services/logging/log_job";
 import { billTeam } from "../../services/billing/credit_billing";
 import { ExtractOptions } from "../../controllers/v1/types";
+import { CostTracking } from "../extract/extraction-service";
 
 interface DeepResearchServiceOptions {
   researchId: string;
@@ -21,6 +22,7 @@ interface DeepResearchServiceOptions {
 }
 
 export async function performDeepResearch(options: DeepResearchServiceOptions) {
+  const costTracking = new CostTracking();
   const { researchId, teamId, timeLimit, subId, maxUrls } = options;
   const startTime = Date.now();
   let currentTopic = options.query;
@@ -70,6 +72,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
     await llmService.generateSearchQueries(
       nextSearchTopic,
       state.getFindings(),
+      costTracking,
     )
   ).slice(0, 3);
 
@@ -109,7 +112,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       fastMode: false,
       blockAds: false,
     },
-  }, logger);
+  }, logger, costTracking);
   return response.length > 0 ? response : [];
 });
 
@@ -205,6 +208,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
   currentTopic,
   timeRemaining,
   options.systemPrompt ?? "",
+  costTracking,
 );
 
 if (!analysis) {
@@ -268,6 +272,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
   state.getFindings(),
   state.getSummaries(),
   options.analysisPrompt,
+  costTracking,
   options.formats,
   options.jsonOptions,
 );
@@ -278,6 +283,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
   state.getFindings(),
   state.getSummaries(),
   options.analysisPrompt,
+  costTracking,
 );
 }
 
@@ -307,6 +313,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
   origin: "api",
   num_tokens: 0,
   tokens_billed: 0,
+  cost_tracking: costTracking,
 });
 await updateDeepResearch(researchId, {
   status: "completed",
@@ -12,6 +12,7 @@ import {
 import { ExtractOptions } from "../../controllers/v1/types";
 
 import { getModel } from "../generic-ai";
+import { CostTracking } from "../extract/extraction-service";
 interface AnalysisResult {
   gaps: string[];
   nextSteps: string[];
@@ -152,6 +153,7 @@ export class ResearchLLMService {
   async generateSearchQueries(
     topic: string,
     findings: DeepResearchFinding[] = [],
+    costTracking: CostTracking,
   ): Promise<{ query: string; researchGoal: string }[]> {
     const { extract } = await generateCompletions({
       logger: this.logger.child({
@@ -194,6 +196,13 @@ export class ResearchLLMService {
         The first SERP query you generate should be a very concise, simple version of the topic. `,
       },
       markdown: "",
+      costTrackingOptions: {
+        costTracking,
+        metadata: {
+          module: "deep-research",
+          method: "generateSearchQueries",
+        },
+      },
     });
 
     return extract.queries;
@@ -204,6 +213,7 @@ export class ResearchLLMService {
     currentTopic: string,
     timeRemaining: number,
     systemPrompt: string,
+    costTracking: CostTracking,
   ): Promise<AnalysisResult | null> {
     try {
       const timeRemainingMinutes =
@@ -246,6 +256,13 @@ export class ResearchLLMService {
         ).text,
       },
       markdown: "",
+      costTrackingOptions: {
+        costTracking,
+        metadata: {
+          module: "deep-research",
+          method: "analyzeAndPlan",
+        },
+      },
     });
 
     return extract.analysis;
@@ -260,6 +277,7 @@ export class ResearchLLMService {
     findings: DeepResearchFinding[],
     summaries: string[],
     analysisPrompt: string,
+    costTracking: CostTracking,
     formats?: string[],
     jsonOptions?: ExtractOptions,
   ): Promise<any> {
@@ -312,6 +330,13 @@ export class ResearchLLMService {
     },
     markdown: "",
     model: getModel("o3-mini"),
+    costTrackingOptions: {
+      costTracking,
+      metadata: {
+        module: "deep-research",
+        method: "generateFinalAnalysis",
+      },
+    },
   });
 
   return extract;
@@ -11,25 +11,23 @@ import {
 import { jsonSchema } from "ai";
 import { getModel } from "../../../lib/generic-ai";
 import { Logger } from "winston";
 
+import { CostTracking } from "../extraction-service";
 export async function analyzeSchemaAndPrompt(
   urls: string[],
   schema: any,
   prompt: string,
   logger: Logger,
+  costTracking: CostTracking,
 ): Promise<{
   isMultiEntity: boolean;
   multiEntityKeys: string[];
   reasoning: string;
   keyIndicators: string[];
   tokenUsage: TokenUsage;
-  cost: number;
 }> {
-  let cost = 0;
   if (!schema) {
-    const genRes = await generateSchemaFromPrompt(prompt, logger);
+    const genRes = await generateSchemaFromPrompt(prompt, logger, costTracking);
     schema = genRes.extract;
-    cost = genRes.cost;
   }
 
   const schemaString = JSON.stringify(schema);
@@ -49,7 +47,7 @@ export async function analyzeSchemaAndPrompt(
   );
 
   try {
-    const { extract: result, totalUsage, cost: cost2 } = await generateCompletions({
+    const { extract: result, totalUsage } = await generateCompletions({
       logger,
       options: {
         mode: "llm",
@@ -59,8 +57,14 @@ export async function analyzeSchemaAndPrompt(
       },
       markdown: "",
       model,
+      costTrackingOptions: {
+        costTracking,
+        metadata: {
+          module: "extract",
+          method: "analyzeSchemaAndPrompt",
+        },
+      },
     });
-    cost += cost2;
 
     const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
       checkSchema.parse(result);
@@ -71,7 +75,6 @@ export async function analyzeSchemaAndPrompt(
       reasoning,
       keyIndicators,
       tokenUsage: totalUsage,
-      cost,
     };
   } catch (e) {
     logger.warn("(analyzeSchemaAndPrompt) Error parsing schema analysis", {
@@ -90,6 +93,5 @@ export async function analyzeSchemaAndPrompt(
       totalTokens: 0,
       model: model.modelId,
     },
-    cost: 0,
   };
 }
@@ -10,7 +10,7 @@ import {
   buildBatchExtractSystemPrompt,
 } from "../build-prompts";
 import { getModel } from "../../generic-ai";
-
+import { CostTracking } from "../extraction-service";
 import fs from "fs/promises";
 import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape";
 import type { Logger } from "winston";
@@ -24,6 +24,7 @@ type BatchExtractOptions = {
   useAgent: boolean;
   extractId?: string;
   sessionId?: string;
+  costTracking: CostTracking;
 };
 
 /**
@@ -75,6 +76,13 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger:
   isExtractEndpoint: true,
   model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
   retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"),
+  costTrackingOptions: {
+    costTracking: options.costTracking,
+    metadata: {
+      module: "extract",
+      method: "batchExtractPromise",
+    },
+  },
 };
 
 let extractedDataArray: any[] = [];
@@ -84,23 +92,15 @@ export async function batchExtractPromise(options: BatchExtractOptions, logger:
   const {
     extractedDataArray: e,
     warning: w,
-    smartScrapeCost,
-    otherCost,
-    smartScrapeCallCount,
-    otherCallCount
   } = await extractData({
     extractOptions: generationOptions,
     urls: [doc.metadata.sourceURL || doc.metadata.url || ""],
     useAgent,
     extractId,
-    sessionId
+    sessionId,
   });
   extractedDataArray = e;
   warning = w;
-  smCost = smartScrapeCost;
-  oCost = otherCost;
-  smCallCount = smartScrapeCallCount;
-  oCallCount = otherCallCount;
 } catch (error) {
   logger.error("extractData failed", { error });
 }
@@ -7,12 +7,14 @@ import {
   buildShouldExtractUserPrompt,
 } from "../build-prompts";
 import { getModel } from "../../../lib/generic-ai";
+import { CostTracking } from "../extraction-service";
 
 export async function checkShouldExtract(
   prompt: string,
   multiEntitySchema: any,
   doc: Document,
-): Promise<{ tokenUsage: TokenUsage; extract: boolean; cost: number }> {
+  costTracking: CostTracking,
+): Promise<{ tokenUsage: TokenUsage; extract: boolean; }> {
   const shouldExtractCheck = await generateCompletions({
     logger: logger.child({ method: "extractService/checkShouldExtract" }),
     options: {
@@ -32,11 +34,17 @@ export async function checkShouldExtract(
     markdown: buildDocument(doc),
     isExtractEndpoint: true,
     model: getModel("gpt-4o-mini"),
+    costTrackingOptions: {
+      costTracking,
+      metadata: {
+        module: "extract",
+        method: "checkShouldExtract",
+      },
+    },
   });
 
   return {
     tokenUsage: shouldExtractCheck.totalUsage,
     extract: shouldExtractCheck.extract["extract"],
-    cost: shouldExtractCheck.cost,
   };
 }
@@ -7,6 +7,7 @@ import { buildDocument } from "../build-document";
 import { Document, TokenUsage } from "../../../controllers/v1/types";
 import { getModel } from "../../../lib/generic-ai";
 import { extractData } from "../../../scraper/scrapeURL/lib/extractSmartScrape";
+import { CostTracking } from "../extraction-service";
 
 export async function singleAnswerCompletion({
   singleAnswerDocs,
@@ -17,6 +18,7 @@ export async function singleAnswerCompletion({
   useAgent,
   extractId,
   sessionId,
+  costTracking,
 }: {
   singleAnswerDocs: Document[];
   rSchema: any;
@@ -26,14 +28,11 @@ export async function singleAnswerCompletion({
   useAgent: boolean;
   extractId: string;
   sessionId: string;
+  costTracking: CostTracking;
 }): Promise<{
   extract: any;
   tokenUsage: TokenUsage;
   sources: string[];
-  smartScrapeCallCount: number;
-  smartScrapeCost: number;
-  otherCallCount: number;
-  otherCost: number;
 }> {
   const docsPrompt = `Today is: ` + new Date().toISOString() + `.\n` + prompt;
   const generationOptions: GenerateCompletionsOptions = {
@@ -53,9 +52,16 @@ export async function singleAnswerCompletion({
     markdown: `${singleAnswerDocs.map((x, i) => `[START_PAGE (ID: ${i})]` + buildDocument(x)).join("\n")} [END_PAGE]\n`,
     isExtractEndpoint: true,
     model: getModel("gemini-2.0-flash", "google"),
+    costTrackingOptions: {
+      costTracking,
+      metadata: {
+        module: "extract",
+        method: "singleAnswerCompletion",
+      },
+    },
   };
 
-  const { extractedDataArray, warning, smartScrapeCost, otherCost, smartScrapeCallCount, otherCallCount } = await extractData({
+  const { extractedDataArray, warning } = await extractData({
     extractOptions: generationOptions,
     urls: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""),
     useAgent,
@@ -100,9 +106,5 @@ export async function singleAnswerCompletion({
   sources: singleAnswerDocs.map(
     (doc) => doc.metadata.url || doc.metadata.sourceURL || "",
   ),
-  smartScrapeCost,
-  otherCost,
-  smartScrapeCallCount,
-  otherCallCount,
 };
}
@@ -67,14 +67,39 @@ type completions = {
   sources?: string[];
 };
 
-export type CostTracking = {
-  smartScrapeCallCount: number;
-  smartScrapeCost: number;
-  otherCallCount: number;
-  otherCost: number;
-  totalCost: number;
-  costLimitExceededTokenUsage?: number;
-};
+export class CostTracking {
+  calls: {
+    type: "smartScrape" | "other",
+    metadata: Record<string, any>,
+    cost: number,
+    tokens?: {
+      input: number,
+      output: number,
+    },
+    stack: string,
+  }[] = [];
+
+  constructor() {}
+
+  public addCall(call: Omit<typeof this.calls[number], "stack">) {
+    this.calls.push({
+      ...call,
+      stack: new Error().stack!.split("\n").slice(2).join("\n"),
+    });
+  }
+
+  public toJSON() {
+    return {
+      calls: this.calls,
+
+      smartScrapeCallCount: this.calls.filter(c => c.type === "smartScrape").length,
+      smartScrapeCost: this.calls.filter(c => c.type === "smartScrape").reduce((acc, c) => acc + c.cost, 0),
+      otherCallCount: this.calls.filter(c => c.type === "other").length,
+      otherCost: this.calls.filter(c => c.type === "other").reduce((acc, c) => acc + c.cost, 0),
+      totalCost: this.calls.reduce((acc, c) => acc + c.cost, 0),
+    }
+  }
+}
 
 export async function performExtraction(
   extractId: string,
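The hunk above replaces the flat CostTracking type with a class that records one entry per LLM call and derives the old aggregate fields on demand. A minimal TypeScript usage sketch, based only on the call sites visible in this diff (the cost figure and metadata values are illustrative placeholders, and the import path is the one used above):

    import { CostTracking } from "../../lib/extract/extraction-service";

    // One tracker is created per job and threaded through the pipeline.
    const costTracking = new CostTracking();

    // Every LLM call records its type, attribution metadata, and cost.
    costTracking.addCall({
      type: "other", // or "smartScrape"
      metadata: { module: "extract", method: "exampleCaller" }, // hypothetical caller
      cost: 0.0042, // USD, computed by the caller (e.g. via calculateCost)
      tokens: { input: 1200, output: 350 },
    });

    // toJSON() recomputes the aggregates the old type stored as mutable fields.
    const { totalCost, otherCallCount, smartScrapeCost } = costTracking.toJSON();

Because the aggregates are derived from the recorded calls, the scattered otherCost += ... / totalCost += ... bookkeeping that this commit deletes below can no longer drift out of sync with totalCost.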
@@ -89,13 +114,7 @@ export async function performExtraction(
   let singleAnswerResult: any = {};
   let totalUrlsScraped = 0;
   let sources: Record<string, string[]> = {};
-  let costTracking: CostTracking = {
-    smartScrapeCallCount: 0,
-    smartScrapeCost: 0,
-    otherCallCount: 0,
-    otherCost: 0,
-    totalCost: 0,
-  };
+  let costTracking: CostTracking = new CostTracking();
 
   let log = {
     extractId,
@@ -118,13 +137,9 @@ export async function performExtraction(
   });
   const rephrasedPrompt = await generateBasicCompletion(
     buildRephraseToSerpPrompt(request.prompt),
+    costTracking,
   );
   let rptxt = rephrasedPrompt?.text.replace('"', "").replace("'", "") || "";
-  if (rephrasedPrompt) {
-    costTracking.otherCallCount++;
-    costTracking.otherCost += rephrasedPrompt.cost;
-    costTracking.totalCost += rephrasedPrompt.cost;
-  }
   const searchResults = await search({
     query: rptxt,
     num_results: 10,
@@ -197,11 +212,9 @@ export async function performExtraction(
 
   let reqSchema = request.schema;
   if (!reqSchema && request.prompt) {
-    const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger);
+    const schemaGenRes = await generateSchemaFromPrompt(request.prompt, logger, costTracking);
     reqSchema = schemaGenRes.extract;
-    costTracking.otherCallCount++;
-    costTracking.otherCost += schemaGenRes.cost;
-    costTracking.totalCost += schemaGenRes.cost;
+
 
     logger.debug("Generated request schema.", {
       originalSchema: request.schema,
@@ -232,8 +245,7 @@ export async function performExtraction(
     reasoning,
     keyIndicators,
     tokenUsage: schemaAnalysisTokenUsage,
-    cost: schemaAnalysisCost,
-  } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger);
+  } = await analyzeSchemaAndPrompt(urls, reqSchema, request.prompt ?? "", logger, costTracking);
 
   logger.debug("Analyzed schema.", {
     isMultiEntity,
@@ -242,11 +254,6 @@ export async function performExtraction(
     keyIndicators,
   });
 
-  costTracking.otherCallCount++;
-  costTracking.otherCost += schemaAnalysisCost;
-  costTracking.totalCost += schemaAnalysisCost;
-
   // Track schema analysis tokens
   tokenUsage.push(schemaAnalysisTokenUsage);
 
   let startMap = Date.now();
@@ -467,7 +474,8 @@ export async function performExtraction(
     doc,
     useAgent: isAgentExtractModelValid(request.agent?.model),
     extractId,
-    sessionId
+    sessionId,
+    costTracking,
   }, logger);
 
   // Race between timeout and completion
@@ -481,12 +489,6 @@ export async function performExtraction(
   if (multiEntityCompletion) {
     tokenUsage.push(multiEntityCompletion.totalUsage);
 
-    costTracking.smartScrapeCallCount += multiEntityCompletion.smartScrapeCallCount;
-    costTracking.smartScrapeCost += multiEntityCompletion.smartScrapeCost;
-    costTracking.otherCallCount += multiEntityCompletion.otherCallCount;
-    costTracking.otherCost += multiEntityCompletion.otherCost;
-    costTracking.totalCost += multiEntityCompletion.smartScrapeCost + multiEntityCompletion.otherCost;
-
     if (multiEntityCompletion.extract) {
       return {
         extract: multiEntityCompletion.extract,
@@ -776,10 +778,6 @@ export async function performExtraction(
     extract: completionResult,
     tokenUsage: singleAnswerTokenUsage,
     sources: singleAnswerSources,
-    smartScrapeCost: singleAnswerSmartScrapeCost,
-    otherCost: singleAnswerOtherCost,
-    smartScrapeCallCount: singleAnswerSmartScrapeCallCount,
-    otherCallCount: singleAnswerOtherCallCount,
   } = await singleAnswerCompletion({
     singleAnswerDocs,
     rSchema,
@@ -789,12 +787,8 @@ export async function performExtraction(
     useAgent: isAgentExtractModelValid(request.agent?.model),
     extractId,
     sessionId: thisSessionId,
+    costTracking,
   });
-  costTracking.smartScrapeCost += singleAnswerSmartScrapeCost;
-  costTracking.smartScrapeCallCount += singleAnswerSmartScrapeCallCount;
-  costTracking.otherCost += singleAnswerOtherCost;
-  costTracking.otherCallCount += singleAnswerOtherCallCount;
-  costTracking.totalCost += singleAnswerSmartScrapeCost + singleAnswerOtherCost;
   logger.debug("Done generating singleAnswer completions.");
 
   singleAnswerResult = transformArrayToObject(rSchema, completionResult);
@@ -6,7 +6,7 @@ import { extractConfig } from "../config";
 import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
 import { performRanking_F0 } from "./ranker-f0";
 import { buildRerankerSystemPrompt_F0, buildRerankerUserPrompt_F0 } from "./build-prompts-f0";
-
+import { CostTracking } from "../extraction-service";
 const cohere = new CohereClient({
   token: process.env.COHERE_API_KEY,
 });
@@ -166,7 +166,7 @@ export type RerankerOptions = {
   urlTraces: URLTrace[];
 };
 
-export async function rerankLinksWithLLM_F0(options: RerankerOptions): Promise<RerankerResult> {
+export async function rerankLinksWithLLM_F0(options: RerankerOptions, costTracking: CostTracking): Promise<RerankerResult> {
   const { links, searchQuery, urlTraces } = options;
   const chunkSize = 100;
   const chunks: MapDocument[][] = [];
@@ -231,7 +231,14 @@ export async function rerankLinksWithLLM_F0(options: RerankerOptions): Promise<R
     schema: schema,
   },
   markdown: linksContent,
-  isExtractEndpoint: true
+  isExtractEndpoint: true,
+  costTrackingOptions: {
+    costTracking: new CostTracking(),
+    metadata: {
+      module: "extract",
+      method: "rerankLinksWithLLM",
+    },
+  },
 });
 
 const completion = await Promise.race([
@@ -8,6 +8,7 @@ import { extractConfig } from "../config";
 import type { Logger } from "winston";
 import { generateText } from "ai";
 import { getModel } from "../../generic-ai";
+import { CostTracking } from "../extraction-service";
 
 export async function generateBasicCompletion_FO(prompt: string) {
   const { text } = await generateText({
@@ -211,7 +212,7 @@ export async function processUrl_F0(
     links: mappedLinks,
     searchQuery: rephrasedPrompt,
     urlTraces,
-  });
+  }, new CostTracking());
   mappedLinks = rerankerResult.mapDocument;
   let tokensUsed = rerankerResult.tokensUsed;
   logger.info("Reranked! (pass 1)", {
@@ -225,7 +226,7 @@ export async function processUrl_F0(
     links: mappedLinks,
     searchQuery: rephrasedPrompt,
     urlTraces,
-  });
+  }, new CostTracking());
   mappedLinks = rerankerResult.mapDocument;
   tokensUsed += rerankerResult.tokensUsed;
   logger.info("Reranked! (pass 2)", {
@@ -11,6 +11,7 @@ import { buildRerankerSystemPrompt } from "./build-prompts";
 import { dumpToFile } from "./helpers/dump-to-file";
 import { getModel } from "../generic-ai";
 import fs from "fs/promises";
+import { CostTracking } from "./extraction-service";
 
 const THRESHOLD_FOR_SINGLEPAGE = 0.6;
 const THRESHOLD_FOR_MULTIENTITY = 0.45;
@@ -177,6 +178,7 @@ export type RerankerOptions = {
   reasoning: string;
   multiEntityKeys: string[];
   keyIndicators: string[];
+  costTracking: CostTracking;
 };
 
 export async function rerankLinksWithLLM(
@@ -315,6 +317,13 @@ export async function rerankLinksWithLLM(
   // },
   markdown: linksContent,
   isExtractEndpoint: true,
+  costTrackingOptions: {
+    costTracking: options.costTracking,
+    metadata: {
+      module: "extract",
+      method: "rerankLinksWithLLM",
+    },
+  },
 });
 
 completion = await completionPromise;
@@ -11,7 +11,7 @@ import { getModel } from "../generic-ai";
 import { calculateCost } from "../../scraper/scrapeURL/transformers/llmExtract";
 import type { CostTracking } from "./extraction-service";
 
-export async function generateBasicCompletion(prompt: string): Promise<{ text: string, cost: number } | null> {
+export async function generateBasicCompletion(prompt: string, costTracking: CostTracking): Promise<{ text: string } | null> {
   try {
     const result = await generateText({
       model: getModel("gpt-4o", "openai"),
@@ -22,7 +22,19 @@ export async function generateBasicCompletion(prompt: string): Promise<{ text: s
       },
     }
   });
-  return { text: result.text, cost: calculateCost("openai/gpt-4o", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0) };
+  costTracking.addCall({
+    type: "other",
+    metadata: {
+      module: "extract",
+      method: "generateBasicCompletion",
+    },
+    cost: calculateCost("openai/gpt-4o", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0),
+    tokens: {
+      input: result.usage?.promptTokens ?? 0,
+      output: result.usage?.completionTokens ?? 0,
+    },
+  });
+  return { text: result.text };
 } catch (error) {
   console.error("Error generating basic completion:", error);
   if (error?.type == "rate_limit_error") {
@@ -36,7 +48,19 @@ export async function generateBasicCompletion(prompt: string): Promise<{ text: s
       },
     }
   });
-  return { text: result.text, cost: calculateCost("openai/gpt-4o-mini", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0) };
+  costTracking.addCall({
+    type: "other",
+    metadata: {
+      module: "extract",
+      method: "generateBasicCompletion",
+    },
+    cost: calculateCost("openai/gpt-4o-mini", result.usage?.promptTokens ?? 0, result.usage?.completionTokens ?? 0),
+    tokens: {
+      input: result.usage?.promptTokens ?? 0,
+      output: result.usage?.completionTokens ?? 0,
+    },
+  });
+  return { text: result.text };
 } catch (fallbackError) {
   console.error("Error generating basic completion with fallback model:", fallbackError);
   return null;
@@ -96,13 +120,11 @@ export async function processUrl(
 if (options.prompt) {
   const res = await generateBasicCompletion(
     buildRefrasedPrompt(options.prompt, baseUrl),
+    costTracking,
   );
 
   if (res) {
     searchQuery = res.text.replace('"', "").replace("/", "") ?? options.prompt;
-    costTracking.otherCallCount++;
-    costTracking.otherCost += res.cost;
-    costTracking.totalCost += res.cost;
   }
 }
 
@@ -223,13 +245,11 @@ export async function processUrl(
 try {
   const res = await generateBasicCompletion(
     buildPreRerankPrompt(rephrasedPrompt, options.schema, baseUrl),
+    costTracking,
   );
 
   if (res) {
     rephrasedPrompt = res.text;
-    costTracking.otherCallCount++;
-    costTracking.otherCost += res.cost;
-    costTracking.totalCost += res.cost;
   } else {
     rephrasedPrompt =
       "Extract the data according to the schema: " +
@@ -262,10 +282,8 @@ export async function processUrl(
   reasoning: options.reasoning,
   multiEntityKeys: options.multiEntityKeys,
   keyIndicators: options.keyIndicators,
+  costTracking,
 });
-costTracking.otherCallCount++;
-costTracking.otherCost += rerankerResult.cost;
-costTracking.totalCost += rerankerResult.cost;
 mappedLinks = rerankerResult.mapDocument;
 let tokensUsed = rerankerResult.tokensUsed;
 logger.info("Reranked! (pass 1)", {
@@ -283,10 +301,8 @@ export async function processUrl(
   reasoning: options.reasoning,
   multiEntityKeys: options.multiEntityKeys,
   keyIndicators: options.keyIndicators,
+  costTracking,
 });
-costTracking.otherCallCount++;
-costTracking.otherCost += rerankerResult.cost;
-costTracking.totalCost += rerankerResult.cost;
 mappedLinks = rerankerResult.mapDocument;
 tokensUsed += rerankerResult.tokensUsed;
 logger.info("Reranked! (pass 2)", {
@@ -11,7 +11,7 @@ import { billTeam } from "../../services/billing/credit_billing";
 import { logJob } from "../../services/logging/log_job";
 import { getModel } from "../generic-ai";
 import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
-
+import { CostTracking } from "../extract/extraction-service";
 interface GenerateLLMsTextServiceOptions {
   generationId: string;
   teamId: string;
@@ -71,6 +71,7 @@ export async function performGenerateLlmsTxt(
   generationId,
   teamId,
 });
+const costTracking = new CostTracking();
 
 try {
   // Enforce max URL limit
@@ -167,6 +168,13 @@ export async function performGenerateLlmsTxt(
     prompt: `Generate a 9-10 word description and a 3-4 word title of the entire page based on ALL the content one will find on the page for this url: ${document.metadata?.url}. This will help in a user finding the page for its intended purpose.`,
   },
   markdown: document.markdown,
+  costTrackingOptions: {
+    costTracking,
+    metadata: {
+      module: "generate-llmstxt",
+      method: "generateDescription",
+    },
+  },
 });
 
 return {
@@ -229,6 +237,7 @@ export async function performGenerateLlmsTxt(
   num_tokens: 0,
   tokens_billed: 0,
   sources: {},
+  cost_tracking: costTracking,
 });
 
 // Bill team for usage
@@ -17,14 +17,17 @@ import {
 } from "../scraper/scrapeURL";
 import { Engine } from "../scraper/scrapeURL/engines";
 import { indexPage } from "../lib/extract/index/pinecone";
+import { CostTracking } from "../lib/extract/extraction-service";
 configDotenv();
 
 export async function startWebScraperPipeline({
   job,
   token,
+  costTracking,
 }: {
   job: Job<WebScraperOptions> & { id: string };
   token: string;
+  costTracking: CostTracking;
 }) {
   return await runWebScraper({
     url: job.data.url,
@@ -52,6 +55,7 @@ export async function startWebScraperPipeline({
     is_scrape: job.data.is_scrape ?? false,
     is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null),
     urlInvisibleInCurrentCrawl: job.data.crawlerOptions?.urlInvisibleInCurrentCrawl ?? false,
+    costTracking,
   });
 }
 
@@ -68,6 +72,7 @@ export async function runWebScraper({
   is_scrape = false,
   is_crawl = false,
   urlInvisibleInCurrentCrawl = false,
+  costTracking,
 }: RunWebScraperParams): Promise<ScrapeUrlResponse> {
   const logger = _logger.child({
     method: "runWebScraper",
@@ -101,7 +106,7 @@ export async function runWebScraper({
     ...internalOptions,
     urlInvisibleInCurrentCrawl,
     teamId: internalOptions?.teamId ?? team_id,
-  });
+  }, costTracking);
   if (!response.success) {
     if (response.error instanceof Error) {
       throw response.error;
@@ -3,6 +3,7 @@ import { WebCrawler } from "./crawler";
 import { scrapeURL } from "../scrapeURL";
 import { scrapeOptions, TimeoutSignal } from "../../controllers/v1/types";
 import type { Logger } from "winston";
+import { CostTracking } from "../../lib/extract/extraction-service";
 const useFireEngine =
   process.env.FIRE_ENGINE_BETA_URL !== "" &&
   process.env.FIRE_ENGINE_BETA_URL !== undefined;
@@ -49,6 +50,7 @@ export async function getLinksFromSitemap(
     abort,
     teamId: "sitemap",
   },
+  new CostTracking(),
 );
 
 if (
@@ -26,6 +26,7 @@ import { executeTransformers } from "./transformers";
 import { LLMRefusalError } from "./transformers/llmExtract";
 import { urlSpecificParams } from "./lib/urlSpecificParams";
 import { loadMock, MockState } from "./lib/mock";
+import { CostTracking } from "../../lib/extract/extraction-service";
 
 export type ScrapeUrlResponse = (
   | {
@@ -55,6 +56,7 @@ export type Meta = {
     url?: string;
     status: number;
   } | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
+  costTracking: CostTracking;
 };
 
 function buildFeatureFlags(
@@ -127,6 +129,7 @@ async function buildMetaObject(
   url: string,
   options: ScrapeOptions,
   internalOptions: InternalOptions,
+  costTracking: CostTracking,
 ): Promise<Meta> {
   const specParams =
     urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
@@ -158,6 +161,7 @@ async function buildMetaObject(
     ? await loadMock(options.useMock, _logger)
     : null,
   pdfPrefetch: undefined,
+  costTracking,
 };
}
 
@@ -389,8 +393,9 @@ export async function scrapeURL(
   url: string,
   options: ScrapeOptions,
   internalOptions: InternalOptions,
+  costTracking: CostTracking,
 ): Promise<ScrapeUrlResponse> {
-  const meta = await buildMetaObject(id, url, options, internalOptions);
+  const meta = await buildMetaObject(id, url, options, internalOptions, costTracking);
   try {
     while (true) {
       try {
@@ -10,8 +10,7 @@ import { parseMarkdown } from "../../../lib/html-to-markdown";
 import { getModel } from "../../../lib/generic-ai";
 import { TokenUsage } from "../../../controllers/v1/types";
 import type { SmartScrapeResult } from "./smartScrape";
 import { ExtractStep } from "src/lib/extract/extract-redis";
-
+import { CostTracking } from "../../../lib/extract/extraction-service";
 const commonSmartScrapeProperties = {
   shouldUseSmartscrape: {
     type: "boolean",
@@ -225,26 +224,16 @@ export async function extractData({
 }): Promise<{
   extractedDataArray: any[];
   warning: any;
-  smartScrapeCallCount: number;
-  otherCallCount: number;
-  smartScrapeCost: number;
-  otherCost: number;
   costLimitExceededTokenUsage: number | null;
 }> {
   let schema = extractOptions.options.schema;
   const logger = extractOptions.logger;
   const isSingleUrl = urls.length === 1;
-  let smartScrapeCost = 0;
-  let otherCost = 0;
-  let smartScrapeCallCount = 0;
-  let otherCallCount = 0;
   let costLimitExceededTokenUsage: number | null = null;
   // TODO: remove the "required" fields here!! it breaks o3-mini
 
   if (!schema && extractOptions.options.prompt) {
-    const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger);
-    otherCallCount++;
-    otherCost += genRes.cost;
+    const genRes = await generateSchemaFromPrompt(extractOptions.options.prompt, logger, extractOptions.costTrackingOptions.costTracking);
     schema = genRes.extract;
   }
 
@@ -278,17 +267,22 @@ export async function extractData({
   extract: e,
   warning: w,
   totalUsage: t,
-  cost: c,
 } = await generateCompletions({
   ...extractOptionsNewSchema,
   model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
   retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"),
+  costTrackingOptions: {
+    costTracking: extractOptions.costTrackingOptions.costTracking,
+    metadata: {
+      module: "scrapeURL",
+      method: "extractData",
+      description: "Check if using smartScrape is needed for this case"
+    },
+  },
 });
 extract = e;
 warning = w;
 totalUsage = t;
-otherCost += c;
-otherCallCount++;
} catch (error) {
  logger.error(
    "failed during extractSmartScrape.ts:generateCompletions",
@@ -321,10 +315,9 @@ export async function extractData({
   sessionId,
   extractId,
   scrapeId,
+  costTracking: extractOptions.costTrackingOptions.costTracking,
 }),
];
-smartScrapeCost += smartscrapeResults[0].tokenUsage;
-smartScrapeCallCount++;
} else {
  const pages = extract?.smartscrapePages ?? [];
  //do it async promiseall instead
@@ -344,14 +337,10 @@ export async function extractData({
     sessionId,
     extractId,
     scrapeId,
+    costTracking: extractOptions.costTrackingOptions.costTracking,
   });
 }),
);
-smartScrapeCost += smartscrapeResults.reduce(
-  (acc, result) => acc + result.tokenUsage,
-  0,
-);
-smartScrapeCallCount += smartscrapeResults.length;
}
// console.log("smartscrapeResults", smartscrapeResults);
 
@@ -372,11 +361,17 @@ export async function extractData({
   markdown: markdown,
   model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
   retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"),
+  costTrackingOptions: {
+    costTracking: extractOptions.costTrackingOptions.costTracking,
+    metadata: {
+      module: "scrapeURL",
+      method: "extractData",
+      description: "Extract data from markdown (smart-scape results)",
+    },
+  },
 };
-const { extract, warning, totalUsage, model, cost } =
+const { extract } =
   await generateCompletions(newExtractOptions);
-otherCost += cost;
-otherCallCount++;
 return extract;
}),
);
@@ -399,10 +394,6 @@ export async function extractData({
 return {
   extractedDataArray: extractedData,
   warning: warning,
-  smartScrapeCallCount: smartScrapeCallCount,
-  otherCallCount: otherCallCount,
-  smartScrapeCost: smartScrapeCost,
-  otherCost: otherCost,
   costLimitExceededTokenUsage: costLimitExceededTokenUsage,
 };
}
@@ -3,7 +3,7 @@ import { logger as _logger } from "../../../lib/logger";
 import { robustFetch } from "./fetch";
 import fs from "fs/promises";
 import { configDotenv } from "dotenv";
-
+import { CostTracking } from "../../../lib/extract/extraction-service";
 configDotenv();
 
 // Define schemas outside the function scope
@@ -52,6 +52,7 @@ export async function smartScrape({
   extractId,
   scrapeId,
   beforeSubmission,
+  costTracking,
 }: {
   url: string,
   prompt: string,
@@ -59,6 +60,7 @@ export async function smartScrape({
   extractId?: string,
   scrapeId?: string,
   beforeSubmission?: () => unknown,
+  costTracking: CostTracking,
 }): Promise<SmartScrapeResult> {
   let logger = _logger.child({
     method: "smartScrape",
@@ -139,6 +141,16 @@ export async function smartScrape({
   });
 
   logger.info("Smart scrape cost $" + response.tokenUsage);
+  costTracking.addCall({
+    type: "smartScrape",
+    cost: response.tokenUsage,
+    metadata: {
+      module: "smartScrape",
+      method: "smartScrape",
+      url,
+      sessionId,
+    },
+  });
 
   return response; // The response type now matches SmartScrapeResult
 } catch (error) {
@@ -5,6 +5,7 @@ process.env.ENV = "test";
 import { scrapeURL } from ".";
 import { scrapeOptions } from "../../controllers/v1/types";
 import { Engine } from "./engines";
+import { CostTracking } from "../../lib/extract/extraction-service";
 
 const testEngines: (Engine | undefined)[] = [
   undefined,
@@ -32,6 +33,7 @@ describe("Standalone scrapeURL tests", () => {
   "https://www.roastmywebsite.ai/",
   scrapeOptions.parse({}),
   { forceEngine, teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -79,6 +81,7 @@ describe("Standalone scrapeURL tests", () => {
     formats: ["markdown", "html"],
   }),
   { forceEngine, teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -103,6 +106,7 @@ describe("Standalone scrapeURL tests", () => {
     onlyMainContent: false,
   }),
   { forceEngine, teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -126,6 +130,7 @@ describe("Standalone scrapeURL tests", () => {
     excludeTags: [".nav", "#footer", "strong"],
   }),
   { forceEngine, teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -146,6 +151,7 @@ describe("Standalone scrapeURL tests", () => {
   "https://httpstat.us/400",
   scrapeOptions.parse({}),
   { forceEngine, teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -164,6 +170,7 @@ describe("Standalone scrapeURL tests", () => {
   "https://httpstat.us/401",
   scrapeOptions.parse({}),
   { forceEngine, teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -182,6 +189,7 @@ describe("Standalone scrapeURL tests", () => {
   "https://httpstat.us/403",
   scrapeOptions.parse({}),
   { forceEngine, teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -200,6 +208,7 @@ describe("Standalone scrapeURL tests", () => {
   "https://httpstat.us/404",
   scrapeOptions.parse({}),
   { forceEngine, teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -218,6 +227,7 @@ describe("Standalone scrapeURL tests", () => {
   "https://httpstat.us/405",
   scrapeOptions.parse({}),
   { forceEngine, teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -236,6 +246,7 @@ describe("Standalone scrapeURL tests", () => {
   "https://httpstat.us/500",
   scrapeOptions.parse({}),
   { forceEngine, teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -254,6 +265,7 @@ describe("Standalone scrapeURL tests", () => {
   "https://scrapethissite.com/",
   scrapeOptions.parse({}),
   { forceEngine, teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -286,6 +298,7 @@ describe("Standalone scrapeURL tests", () => {
     formats: ["screenshot"],
   }),
   { forceEngine, teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -314,6 +327,7 @@ describe("Standalone scrapeURL tests", () => {
     formats: ["screenshot@fullPage"],
   }),
   { forceEngine, teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -342,6 +356,7 @@ describe("Standalone scrapeURL tests", () => {
   "https://arxiv.org/pdf/astro-ph/9301001.pdf",
   scrapeOptions.parse({}),
   { teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -361,6 +376,7 @@ describe("Standalone scrapeURL tests", () => {
   "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx",
   scrapeOptions.parse({}),
   { teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -398,6 +414,7 @@ describe("Standalone scrapeURL tests", () => {
     },
   }),
   { teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -434,6 +451,7 @@ describe("Standalone scrapeURL tests", () => {
     },
   }),
   { teamId: "test" },
+  new CostTracking(),
 );
 
 // expect(out.logs.length).toBeGreaterThan(0);
@@ -455,7 +473,7 @@ describe("Standalone scrapeURL tests", () => {
 async (i) => {
   const url = "https://www.scrapethissite.com/?i=" + i;
   const id = "test:concurrent:" + url;
-  const out = await scrapeURL(id, url, scrapeOptions.parse({}), { teamId: "test" });
+  const out = await scrapeURL(id, url, scrapeOptions.parse({}), { teamId: "test" }, new CostTracking());
 
   const replacer = (key: string, value: any) => {
     if (value instanceof Error) {
@@ -30,6 +30,7 @@ export async function performAgent(
   prompt,
   sessionId,
   scrapeId: meta.id,
+  costTracking: meta.costTracking,
 })
} catch (error) {
  if (error instanceof Error && error.message === "Cost limit exceeded") {
@@ -50,20 +51,6 @@ export async function performAgent(
 if (meta.options.formats.includes("html")) {
   document.html = html
 }
-
-if (document.metadata.costTracking) {
-  document.metadata.costTracking.smartScrapeCallCount++;
-  document.metadata.costTracking.smartScrapeCost = document.metadata.costTracking.smartScrapeCost + smartscrapeResults.tokenUsage;
-  document.metadata.costTracking.totalCost = document.metadata.costTracking.totalCost + smartscrapeResults.tokenUsage;
-} else {
-  document.metadata.costTracking = {
-    smartScrapeCallCount: 1,
-    smartScrapeCost: smartscrapeResults.tokenUsage,
-    otherCallCount: 0,
-    otherCost: 0,
-    totalCost: smartscrapeResults.tokenUsage,
-  }
-}
}
 
 return document;
@@ -6,9 +6,9 @@ import gitDiff from 'git-diff';
 import parseDiff from 'parse-diff';
 import { generateCompletions } from "./llmExtract";
 
-async function extractDataWithSchema(content: string, meta: Meta): Promise<{ extract: any, cost: number } | null> {
+async function extractDataWithSchema(content: string, meta: Meta): Promise<{ extract: any } | null> {
   try {
-    const { extract, cost } = await generateCompletions({
+    const { extract } = await generateCompletions({
       logger: meta.logger.child({
         method: "extractDataWithSchema/generateCompletions",
       }),
@@ -18,9 +18,16 @@ async function extractDataWithSchema(content: string, meta: Meta): Promise<{ ext
       systemPrompt: "Extract the requested information from the content based on the provided schema.",
       temperature: 0
     },
-    markdown: content
+    markdown: content,
+    costTrackingOptions: {
+      costTracking: meta.costTracking,
+      metadata: {
+        module: "extract",
+        method: "extractDataWithSchema",
+      },
+    },
   });
-  return { extract, cost };
+  return { extract };
 } catch (error) {
   meta.logger.error("Error extracting data with schema", { error });
   return null;
@@ -145,19 +152,6 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Docume
 
 if (previousData && currentData) {
   document.changeTracking.json = compareExtractedData(previousData.extract, currentData.extract);
-
-  if (document.metadata.costTracking) {
-    document.metadata.costTracking.otherCallCount += 2;
-    document.metadata.costTracking.otherCost = document.metadata.costTracking.otherCost + previousData.cost + currentData.cost;
-  } else {
-    document.metadata.costTracking = {
-      smartScrapeCallCount: 0,
-      smartScrapeCost: 0,
-      otherCallCount: 2,
-      otherCost: previousData.cost + currentData.cost,
-      totalCost: previousData.cost + currentData.cost
-    }
-  }
 } else {
   const { extract } = await generateCompletions({
     logger: meta.logger.child({
@@ -171,7 +165,14 @@ export async function deriveDiff(meta: Meta, document: Document): Promise<Docume
     temperature: 0
   },
   markdown: `Previous Content:\n${previousMarkdown}\n\nCurrent Content:\n${currentMarkdown}`,
-  previousWarning: document.warning
+  previousWarning: document.warning,
+  costTrackingOptions: {
+    costTracking: meta.costTracking,
+    metadata: {
+      module: "diff",
+      method: "deriveDiff",
+    },
+  },
 });
 
 document.changeTracking.json = extract;
@@ -11,6 +11,7 @@ import { EngineResultsTracker, Meta } from "..";
 import { logger } from "../../../lib/logger";
 import { modelPrices } from "../../../lib/extract/usage/model-prices";
 import {
   AISDKError,
   generateObject,
   generateText,
   LanguageModel,
@@ -22,7 +23,7 @@ import { z } from "zod";
 import fs from "fs/promises";
 import Ajv from "ajv";
 import { extractData } from "../lib/extractSmartScrape";
-
+import { CostTracking } from "../../../lib/extract/extraction-service";
 // TODO: fix this, it's horrible
 type LanguageModelV1ProviderMetadata = {
   anthropic?: {
@@ -231,6 +232,10 @@ export type GenerateCompletionsOptions = {
   mode?: "object" | "no-object";
   providerOptions?: LanguageModelV1ProviderMetadata;
   retryModel?: LanguageModel;
+  costTrackingOptions: {
+    costTracking: CostTracking;
+    metadata: Record<string, any>;
+  };
 };
 export async function generateCompletions({
   logger,
@@ -242,13 +247,13 @@ export async function generateCompletions({
   mode = "object",
   providerOptions,
   retryModel = getModel("claude-3-5-sonnet-20240620", "anthropic"),
+  costTrackingOptions,
 }: GenerateCompletionsOptions): Promise<{
   extract: any;
   numTokens: number;
   warning: string | undefined;
   totalUsage: TokenUsage;
   model: string;
-  cost: number;
 }> {
   let extract: any;
   let warning: string | undefined;
@@ -278,6 +283,19 @@ export async function generateCompletions({
     },
   });
 
+  costTrackingOptions.costTracking.addCall({
+    type: "other",
+    metadata: {
+      ...costTrackingOptions.metadata,
+      gcDetails: "no-object",
+    },
+    cost: calculateCost(
+      currentModel.modelId,
+      result.usage?.promptTokens ?? 0,
+      result.usage?.completionTokens ?? 0,
+    ),
+  });
+
   extract = result.text;
 
   return {
@@ -290,11 +308,6 @@ export async function generateCompletions({
     totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0),
   },
   model: currentModel.modelId,
-  cost: calculateCost(
-    currentModel.modelId,
-    result.usage?.promptTokens ?? 0,
-    result.usage?.completionTokens ?? 0,
-  ),
 };
} catch (error) {
  lastError = error as Error;
@@ -321,6 +334,19 @@ export async function generateCompletions({
 
 extract = result.text;
 
+costTrackingOptions.costTracking.addCall({
+  type: "other",
+  metadata: {
+    ...costTrackingOptions.metadata,
+    gcDetails: "no-object fallback",
+  },
+  cost: calculateCost(
+    currentModel.modelId,
+    result.usage?.promptTokens ?? 0,
+    result.usage?.completionTokens ?? 0,
+  ),
+});
+
 return {
   extract,
   warning,
@@ -331,11 +357,6 @@ export async function generateCompletions({
   totalTokens: result.usage?.promptTokens ?? 0 + (result.usage?.completionTokens ?? 0),
 },
 model: currentModel.modelId,
-cost: calculateCost(
-  currentModel.modelId,
-  result.usage?.promptTokens ?? 0,
-  result.usage?.completionTokens ?? 0,
-),
};
} catch (retryError) {
  lastError = retryError as Error;
@@ -410,7 +431,7 @@ export async function generateCompletions({
 }
 
 try {
-  const { text: fixedText } = await generateText({
+  const { text: fixedText, usage: repairUsage } = await generateText({
     model: currentModel,
     prompt: `Fix this JSON that had the following error: ${error}\n\nOriginal text:\n${text}\n\nReturn only the fixed JSON, no explanation.`,
     system:
@@ -421,6 +442,23 @@ export async function generateCompletions({
     },
   },
 });
+
+costTrackingOptions.costTracking.addCall({
+  type: "other",
+  metadata: {
+    ...costTrackingOptions.metadata,
+    gcDetails: "repairConfig",
+  },
+  cost: calculateCost(
+    currentModel.modelId,
+    repairUsage?.promptTokens ?? 0,
+    repairUsage?.completionTokens ?? 0,
+  ),
+  tokens: {
+    input: repairUsage?.promptTokens ?? 0,
+    output: repairUsage?.completionTokens ?? 0,
+  },
+});
 logger.debug("Repaired text with LLM");
 return fixedText;
} catch (repairError) {
@@ -464,6 +502,23 @@ export async function generateCompletions({
 let result: { object: any; usage: TokenUsage } | undefined;
 try {
   result = await generateObject(generateObjectConfig);
+  costTrackingOptions.costTracking.addCall({
+    type: "other",
+    metadata: {
+      ...costTrackingOptions.metadata,
+      gcDetails: "generateObject",
+      gcModel: generateObjectConfig.model.modelId,
+    },
+    tokens: {
+      input: result.usage?.promptTokens ?? 0,
+      output: result.usage?.completionTokens ?? 0,
+    },
+    cost: calculateCost(
+      currentModel.modelId,
+      result.usage?.promptTokens ?? 0,
+      result.usage?.completionTokens ?? 0,
+    ),
+  });
 } catch (error) {
   lastError = error as Error;
   if (
@@ -481,6 +536,23 @@ export async function generateCompletions({
     model: currentModel,
   };
   result = await generateObject(retryConfig);
+  costTrackingOptions.costTracking.addCall({
+    type: "other",
+    metadata: {
+      ...costTrackingOptions.metadata,
+      gcDetails: "generateObject fallback",
+      gcModel: retryConfig.model.modelId,
+    },
+    tokens: {
+      input: result.usage?.promptTokens ?? 0,
+      output: result.usage?.completionTokens ?? 0,
+    },
+    cost: calculateCost(
+      currentModel.modelId,
+      result.usage?.promptTokens ?? 0,
+      result.usage?.completionTokens ?? 0,
+    ),
+  });
 } catch (retryError) {
   lastError = retryError as Error;
   logger.error("Failed with fallback model", {
@@ -549,7 +621,6 @@ export async function generateCompletions({
   totalTokens: promptTokens + completionTokens,
 },
 model: currentModel.modelId,
-cost: calculateCost(currentModel.modelId, promptTokens, completionTokens),
};
} catch (error) {
  lastError = error as Error;
@@ -589,9 +660,16 @@ export async function performLLMExtract(
 // model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
 model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
 retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"),
+costTrackingOptions: {
+  costTracking: meta.costTracking,
+  metadata: {
+    module: "scrapeURL",
+    method: "performLLMExtract",
+  },
+},
};
 
-const { extractedDataArray, warning, smartScrapeCost, otherCost, costLimitExceededTokenUsage } =
+const { extractedDataArray, warning, costLimitExceededTokenUsage } =
  await extractData({
    extractOptions: generationOptions,
    urls: [meta.url],
@@ -603,25 +681,6 @@ export async function performLLMExtract(
  document.warning = warning + (document.warning ? " " + document.warning : "");
}
 
-if (document.metadata.costTracking) {
-  document.metadata.costTracking.smartScrapeCallCount++;
-  document.metadata.costTracking.smartScrapeCost += smartScrapeCost;
-  document.metadata.costTracking.otherCallCount++;
-  document.metadata.costTracking.otherCost += otherCost;
-  document.metadata.costTracking.totalCost += smartScrapeCost + otherCost;
-  if (costLimitExceededTokenUsage) {
-    document.metadata.costTracking.costLimitExceededTokenUsage = costLimitExceededTokenUsage;
-  }
-} else {
-  document.metadata.costTracking = {
-    smartScrapeCallCount: 1,
-    smartScrapeCost: smartScrapeCost,
-    otherCallCount: 1,
-    otherCost: otherCost,
-    totalCost: smartScrapeCost + otherCost,
-  };
-}
-
 // IMPORTANT: here it only get's the last page!!!
 const extractedData =
   extractedDataArray[extractedDataArray.length - 1] ?? undefined;
@@ -758,7 +817,8 @@ export function removeDefaultProperty(schema: any): any {
 export async function generateSchemaFromPrompt(
   prompt: string,
   logger: Logger,
-): Promise<{ extract: any; cost: number }> {
+  costTracking: CostTracking,
+): Promise<{ extract: any }> {
   const model = getModel("gpt-4o", "openai");
   const retryModel = getModel("gpt-4o-mini", "openai");
   const temperatures = [0, 0.1, 0.3]; // Different temperatures to try
@@ -766,7 +826,7 @@ export async function generateSchemaFromPrompt(
 
   for (const temp of temperatures) {
     try {
-      const { extract, cost } = await generateCompletions({
+      const { extract } = await generateCompletions({
        logger: logger.child({
          method: "generateSchemaFromPrompt/generateCompletions",
        }),
@@ -802,10 +862,16 @@ Return a valid JSON schema object with properties that would capture the informa
        prompt: `Generate a JSON schema for extracting the following information: ${prompt}`,
        // temperature: temp,
      },
      markdown: prompt,
+      costTrackingOptions: {
+        costTracking,
+        metadata: {
+          module: "scrapeURL",
+          method: "generateSchemaFromPrompt",
+        },
+      },
    });
 
-    return { extract, cost };
+    return { extract };
  } catch (error) {
    lastError = error as Error;
    logger.warn(`Failed attempt with temperature ${temp}: ${error.message}`);
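costTrackingOptions is now a required field of GenerateCompletionsOptions, so every completion is attributed to a module/method pair at the call site and recorded on the shared tracker instead of being returned as a cost value. A hedged sketch of the new calling convention, where everything except the generateCompletions/CostTracking shapes shown in this diff (the prompt, schema, and method name) is a placeholder assumption:

    import { generateCompletions } from "../../../scraper/scrapeURL/transformers/llmExtract";
    import { CostTracking } from "../../../lib/extract/extraction-service";
    import { logger } from "../../../lib/logger";

    async function exampleCaller(markdown: string) {
      // Shared tracker for the whole job; the same instance is passed to every call.
      const costTracking = new CostTracking();

      const { extract, totalUsage } = await generateCompletions({
        logger: logger.child({ method: "exampleCaller" }), // hypothetical method name
        options: {
          mode: "llm",
          systemPrompt: "Extract the requested fields.", // placeholder prompt
          prompt: "",
          schema: { type: "object", properties: {} },    // placeholder schema
        },
        markdown,
        costTrackingOptions: {
          costTracking,
          metadata: { module: "example", method: "exampleCaller" },
        },
      });

      // Totals come from the tracker, not from a returned cost; job logging
      // passes the same instance along as cost_tracking: costTracking.
      return { extract, totalUsage, costs: costTracking.toJSON() };
    }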
@@ -75,6 +75,7 @@ import { performDeepResearch } from "../lib/deep-research/deep-research-service"
 import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-service";
 import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis";
 import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f0";
+import { CostTracking } from "../lib/extract/extraction-service";
 
 configDotenv();
 
@@ -1010,6 +1011,7 @@ async function processJob(job: Job & { id: string }, token: string) {
   // };
   // return data;
   // }
+  const costTracking = new CostTracking();
 
   try {
     job.updateProgress({
@@ -1030,6 +1032,7 @@ async function processJob(job: Job & { id: string }, token: string) {
     startWebScraperPipeline({
       job,
       token,
+      costTracking,
     }),
     ...(job.data.scrapeOptions.timeout !== undefined
       ? [
@@ -1171,6 +1174,7 @@ async function processJob(job: Job & { id: string }, token: string) {
     scrapeOptions: job.data.scrapeOptions,
     origin: job.data.origin,
     crawl_id: job.data.crawl_id,
+    cost_tracking: costTracking,
   },
   true,
 );
@@ -1276,10 +1280,6 @@ async function processJob(job: Job & { id: string }, token: string) {
 
   await finishCrawlIfNeeded(job, sc);
 } else {
-  const cost_tracking = doc?.metadata?.costTracking;
-
-  delete doc.metadata.costTracking;
-
   await logJob({
     job_id: job.id,
     success: true,
@@ -1293,7 +1293,7 @@ async function processJob(job: Job & { id: string }, token: string) {
     scrapeOptions: job.data.scrapeOptions,
     origin: job.data.origin,
     num_tokens: 0, // TODO: fix
-    cost_tracking,
+    cost_tracking: costTracking,
   });
 
   indexJob(job, doc);
@@ -1442,6 +1442,7 @@ async function processJob(job: Job & { id: string }, token: string) {
   scrapeOptions: job.data.scrapeOptions,
   origin: job.data.origin,
   crawl_id: job.data.crawl_id,
+  cost_tracking: costTracking,
 },
 true,
);
@@ -62,6 +62,7 @@ export interface RunWebScraperParams {
   is_scrape?: boolean;
   is_crawl?: boolean;
   urlInvisibleInCurrentCrawl?: boolean;
+  costTracking: CostTracking;
 }
 
 export type RunWebScraperResult =