feat(extract): add logging

commit d3518e85a8
parent 434a435a4b
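The change threads a winston child logger through the extract pipeline: performExtraction builds a child of the incoming logger tagged with module/method/extractId and hands narrower children to its helpers. A minimal standalone sketch of that pattern (the transport setup and the extractId value are assumptions, not part of this diff):

import winston from "winston";

// Root logger; format and transport here are illustrative only.
const _logger = winston.createLogger({
  level: "debug",
  format: winston.format.json(),
  transports: [new winston.transports.Console()],
});

// performExtraction-style child: every line it emits carries module/method/extractId.
const logger = _logger.child({
  module: "extract",
  method: "performExtraction",
  extractId: "hypothetical-extract-id",
});

logger.debug("Processing URLs...", { urlCount: 3 });

// Helpers get further-narrowed children, as the hunks below do for processUrl and scrapeDocument.
const childLogger = logger.child({ module: "extract", method: "processUrl", url: "https://example.com" });
childLogger.warn("URL is blocked");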
@@ -5,6 +5,7 @@ import { getScrapeQueue } from "../../services/queue-service";
 import { waitForJob } from "../../services/queue-jobs";
 import { addScrapeJob } from "../../services/queue-jobs";
 import { getJobPriority } from "../job-priority";
+import type { Logger } from "winston";
 
 interface ScrapeDocumentOptions {
   url: string;
@@ -18,6 +19,7 @@ interface ScrapeDocumentOptions {
 export async function scrapeDocument(
   options: ScrapeDocumentOptions,
   urlTraces: URLTrace[],
+  logger: Logger,
 ): Promise<Document | null> {
   const trace = urlTraces.find((t) => t.url === options.url);
   if (trace) {
@@ -68,16 +70,25 @@ export async function scrapeDocument(
 
   try {
     try {
-      return await attemptScrape(options.timeout);
+      logger.debug("Attempting scrape...");
+      const x = await attemptScrape(options.timeout);
+      logger.debug("Scrape finished!");
+      return x;
     } catch (timeoutError) {
+      logger.warn("Scrape failed.", { error: timeoutError });
+
      if (options.isSingleUrl) {
         // For single URLs, try again with double timeout
-        return await attemptScrape(options.timeout * 2);
+        logger.debug("Attempting scrape...");
+        const x = await attemptScrape(options.timeout * 2);
+        logger.debug("Scrape finished!");
+        return x;
      }
+
      throw timeoutError;
    }
   } catch (error) {
-    logger.error(`Error in scrapeDocument: ${error}`);
+    logger.error(`error in scrapeDocument`, { error });
     if (trace) {
       trace.status = "error";
       trace.error = error.message;
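Taken together, the scrapeDocument changes wrap both scrape attempts in debug/warn logging. A standalone sketch of the resulting control flow, with attemptScrape and the option names as stand-ins for the real helpers:

import type { Logger } from "winston";

// Sketch only: mirrors the logging/retry shape added above, not the real scrapeDocument.
async function scrapeWithLogging<T>(
  attemptScrape: (timeout: number) => Promise<T>,
  timeout: number,
  isSingleUrl: boolean,
  logger: Logger,
): Promise<T> {
  try {
    logger.debug("Attempting scrape...");
    const x = await attemptScrape(timeout);
    logger.debug("Scrape finished!");
    return x;
  } catch (timeoutError) {
    logger.warn("Scrape failed.", { error: timeoutError });
    if (isSingleUrl) {
      // For single URLs, retry once with double the timeout.
      logger.debug("Attempting scrape...");
      const retried = await attemptScrape(timeout * 2);
      logger.debug("Scrape finished!");
      return retried;
    }
    throw timeoutError;
  }
}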
@@ -193,7 +193,7 @@ export async function performExtraction(
   let totalUrlsScraped = 0;
 
   const logger = _logger.child({
-    module: "extraction-service",
+    module: "extract",
     method: "performExtraction",
     extractId,
   });
@@ -215,6 +215,9 @@ export async function performExtraction(
 
   let startMap = Date.now();
   let aggMapLinks: string[] = [];
+  logger.debug("Processing URLs...", {
+    urlCount: request.urls.length,
+  });
   // Process URLs
   const urlPromises = request.urls.map((url) =>
     processUrl(
@@ -243,11 +246,15 @@ export async function performExtraction(
         ],
       });
     },
+    logger.child({ module: "extract", method: "processUrl", url }),
   ),
 );
 
  const processedUrls = await Promise.all(urlPromises);
  const links = processedUrls.flat().filter((url) => url);
+  logger.debug("Processed URLs.", {
+    linkCount: links.length,
+  });
 
  if (links.length === 0) {
    return {
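Each URL gets its own child logger inside the map, so everything processUrl logs is already tagged with the url it belongs to. A self-contained sketch of that fan-out (handleUrl is a stand-in for the real processUrl):

import winston from "winston";

const logger = winston.createLogger({
  level: "debug",
  transports: [new winston.transports.Console()],
});

// Stand-in for processUrl: only the logging shape matters here.
async function handleUrl(url: string, log: winston.Logger): Promise<string[]> {
  log.debug("Running map...");
  return [url];
}

async function main() {
  const urls = ["https://example.com/a", "https://example.com/b"];
  // One child per URL: every line from handleUrl carries { module, method, url }.
  const processed = await Promise.all(
    urls.map((url) =>
      handleUrl(url, logger.child({ module: "extract", method: "processUrl", url })),
    ),
  );
  console.log(processed.flat().filter((u) => u));
}

main();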
@@ -281,6 +288,8 @@ export async function performExtraction(
     reqSchema = await dereferenceSchema(reqSchema);
   }
 
+  logger.debug("Transformed schema.", { schema: reqSchema });
+
   // agent evaluates if the schema or the prompt has an array with big amount of items
   // also it checks if the schema any other properties that are not arrays
   // if so, it splits the results into 2 types of completions:
@@ -295,6 +304,8 @@ export async function performExtraction(
     tokenUsage: schemaAnalysisTokenUsage,
   } = await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
 
+  logger.debug("Analyzed schema.", { isMultiEntity, multiEntityKeys, reasoning, keyIndicators });
+
   // Track schema analysis tokens
   tokenUsage.push(schemaAnalysisTokenUsage);
 
@@ -305,11 +316,14 @@ export async function performExtraction(
 
   let rSchema = reqSchema;
   if (isMultiEntity && reqSchema) {
+    logger.debug("=== MULTI-ENTITY ===");
+
     const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
       reqSchema,
       multiEntityKeys,
     );
     rSchema = singleAnswerSchema;
+    logger.debug("Spread schemas.", { singleAnswerSchema, multiEntitySchema });
 
     await updateExtract(extractId, {
       status: "processing",
@@ -337,6 +351,7 @@ export async function performExtraction(
       ],
     });
 
+    logger.debug("Starting multi-entity scrape...");
     let startScrape = Date.now();
     const scrapePromises = links.map((url) => {
       if (!docsMap.has(url)) {
@@ -349,6 +364,7 @@ export async function performExtraction(
             timeout,
           },
           urlTraces,
+          logger.child({ module: "extract", method: "scrapeDocument", url, isMultiEntity: true }),
         );
       }
       return docsMap.get(url);
@@ -358,6 +374,10 @@ export async function performExtraction(
       (doc): doc is Document => doc !== null,
     );
 
+    logger.debug("Multi-entity scrape finished.", {
+      docCount: multyEntityDocs.length,
+    });
+
     totalUrlsScraped += multyEntityDocs.length;
 
     let endScrape = Date.now();
@@ -380,6 +400,8 @@ export async function performExtraction(
       }
     }
 
+    logger.debug("Updated docsMap.", { docsMapSize: docsMap.size }); // useful for error probing
+
     // Process docs in chunks with queue style processing
     const chunkSize = 50;
     const timeoutCompletion = 45000; // 45 second timeout
@@ -527,7 +549,7 @@ export async function performExtraction(
 
         return multiEntityCompletion.extract;
       } catch (error) {
-        logger.error(`Failed to process document: ${error}`);
+        logger.error(`Failed to process document.`, { error, url: doc.metadata.url ?? doc.metadata.sourceURL! });
         return null;
       }
     });
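The error logs also switch from string interpolation to structured metadata, so the Error object (and a URL, where available) travels alongside a stable message instead of being flattened into it. A small before/after sketch (logger setup assumed):

import winston from "winston";

const logger = winston.createLogger({
  level: "debug",
  format: winston.format.json(),
  transports: [new winston.transports.Console()],
});

try {
  throw new Error("boom");
} catch (error) {
  // Before: the error is stringified into the message itself.
  logger.error(`Failed to process document: ${error}`);
  // After (as in this diff): stable message, error and context passed as metadata.
  logger.error("Failed to process document.", { error, url: "https://example.com" });
}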
@@ -537,6 +559,7 @@ export async function performExtraction(
       multiEntityCompletions.push(
         ...chunkResults.filter((result) => result !== null),
       );
+      logger.debug("All multi-entity completion chunks finished.", { completionCount: multiEntityCompletions.length });
     }
 
     try {
@@ -548,7 +571,7 @@ export async function performExtraction(
       multiEntityResult = mergeNullValObjs(multiEntityResult);
       // @nick: maybe we can add here a llm that checks if the array probably has a primary key?
     } catch (error) {
-      logger.error(`Failed to transform array to object: ${error}`);
+      logger.error(`Failed to transform array to object`, { error });
       return {
         success: false,
         error:
@@ -565,6 +588,11 @@ export async function performExtraction(
     rSchema.properties &&
     Object.keys(rSchema.properties).length > 0
   ) {
+    logger.debug("=== SINGLE PAGES ===", {
+      linkCount: links.length,
+      schema: rSchema,
+    });
+
     // Scrape documents
     const timeout = 60000;
     let singleAnswerDocs: Document[] = [];
@@ -593,6 +621,7 @@ export async function performExtraction(
             timeout,
           },
           urlTraces,
+          logger.child({ module: "extract", method: "scrapeDocument", url, isMultiEntity: false })
         );
       }
       return docsMap.get(url);
@@ -606,12 +635,15 @@ export async function performExtraction(
           docsMap.set(doc.metadata.url, doc);
         }
       }
+      logger.debug("Updated docsMap.", { docsMapSize: docsMap.size }); // useful for error probing
 
       const validResults = results.filter(
         (doc): doc is Document => doc !== null,
       );
       singleAnswerDocs.push(...validResults);
       totalUrlsScraped += validResults.length;
+
+      logger.debug("Scrapes finished.", { docCount: validResults.length });
     } catch (error) {
       return {
         success: false,
@@ -624,6 +656,7 @@ export async function performExtraction(
 
   if (docsMap.size == 0) {
     // All urls are invalid
+    logger.error("All provided URLs are invalid!");
     return {
       success: false,
       error:
@@ -647,8 +680,9 @@ export async function performExtraction(
     });
 
     // Generate completions
+    logger.debug("Generating singleAnswer completions...");
     singleAnswerCompletions = await generateOpenAICompletions(
-      logger.child({ method: "extractService/generateOpenAICompletions" }),
+      logger.child({ module: "extract", method: "generateOpenAICompletions" }),
       {
         mode: "llm",
         systemPrompt:
@@ -662,6 +696,7 @@ export async function performExtraction(
       undefined,
       true,
     );
+    logger.debug("Done generating singleAnswer completions.");
 
     // Track single answer extraction tokens
     if (singleAnswerCompletions) {
@@ -691,7 +726,7 @@ export async function performExtraction(
   }
 
   let finalResult = reqSchema
-    ? await mixSchemaObjects(reqSchema, singleAnswerResult, multiEntityResult)
+    ? await mixSchemaObjects(reqSchema, singleAnswerResult, multiEntityResult, logger.child({ method: "mixSchemaObjects" }))
     : singleAnswerResult || multiEntityResult;
 
   // Tokenize final result to get token count
@@ -785,6 +820,8 @@ export async function performExtraction(
     });
   });
 
+  logger.debug("Done!");
+
   return {
     success: true,
     data: finalResult ?? {},
@@ -1,9 +1,13 @@
+import type { Logger } from "winston";
+
 export async function mixSchemaObjects(
   finalSchema: any,
   singleAnswerResult: any,
   multiEntityResult: any,
+  logger: Logger
 ) {
   const finalResult: any = {};
+  logger.debug("Mixing schema objects.");
 
   // Recursive helper function to merge results based on schema
   function mergeResults(schema: any, singleResult: any, multiResult: any) {
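mixSchemaObjects (and processUrl below) only needs the Logger type at compile time: import type is erased from the emitted JavaScript, and the concrete logger instance is injected by performExtraction via logger.child(...). A minimal sketch of a helper written in that style (the function name is hypothetical):

import type { Logger } from "winston";

// Type-only dependency on winston: the caller supplies the actual logger instance.
export async function mergeWithLogging(a: object, b: object, logger: Logger) {
  logger.debug("Mixing schema objects.");
  return { ...a, ...b };
}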
@@ -9,6 +9,7 @@ import { rerankLinksWithLLM } from "./reranker";
 import { extractConfig } from "./config";
 import { updateExtract } from "./extract-redis";
 import { ExtractStep } from "./extract-redis";
+import type { Logger } from "winston";
 
 interface ProcessUrlOptions {
   url: string;
@@ -26,6 +27,7 @@ export async function processUrl(
   options: ProcessUrlOptions,
   urlTraces: URLTrace[],
   updateExtractCallback: (links: string[]) => void,
+  logger: Logger,
 ): Promise<string[]> {
   const trace: URLTrace = {
     url: options.url,
@@ -41,6 +43,7 @@ export async function processUrl(
       trace.usedInCompletion = true;
       return [options.url];
     }
+    logger.warn("URL is blocked");
     trace.status = "error";
     trace.error = "URL is blocked";
     trace.usedInCompletion = false;
@@ -63,6 +66,9 @@ export async function processUrl(
   }
 
   try {
+    logger.debug("Running map...", {
+      search: searchQuery,
+    })
     const mapResults = await getMapResults({
       url: baseUrl,
       search: searchQuery,
@@ -79,6 +85,10 @@ export async function processUrl(
     let mappedLinks = mapResults.mapResults as MapDocument[];
     let allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
     let uniqueUrls = removeDuplicateUrls(allUrls);
+    logger.debug("Map finished.", {
+      linkCount: allUrls.length,
+      uniqueLinkCount: uniqueUrls.length,
+    });
 
     // Track all discovered URLs
     uniqueUrls.forEach((discoveredUrl) => {
@@ -96,6 +106,7 @@ export async function processUrl(
 
     // retry if only one url is returned
     if (uniqueUrls.length <= 1) {
+      logger.debug("Running map... (pass 2)");
       const retryMapResults = await getMapResults({
         url: baseUrl,
         teamId: options.teamId,
@@ -111,6 +122,10 @@ export async function processUrl(
       mappedLinks = retryMapResults.mapResults as MapDocument[];
       allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
       uniqueUrls = removeDuplicateUrls(allUrls);
+      logger.debug("Map finished. (pass 2)", {
+        linkCount: allUrls.length,
+        uniqueLinkCount: uniqueUrls.length,
+      });
 
       // Track all discovered URLs
       uniqueUrls.forEach((discoveredUrl) => {
@@ -184,6 +199,11 @@ export async function processUrl(
     // (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
     // );
 
+    logger.info("Generated rephrased prompt.", {
+      rephrasedPrompt
+    });
+
+    logger.info("Reranking (pass 1)...");
     const rerankerResult = await rerankLinksWithLLM(
       mappedLinks,
       rephrasedPrompt,
@@ -191,9 +211,13 @@ export async function processUrl(
     );
     mappedLinks = rerankerResult.mapDocument;
     let tokensUsed = rerankerResult.tokensUsed;
+    logger.info("Reranked! (pass 1)", {
+      linkCount: mappedLinks.length,
+    });
 
     // 2nd Pass, useful for when the first pass returns too many links
     if (mappedLinks.length > 100) {
+      logger.info("Reranking (pass 2)...");
       const rerankerResult = await rerankLinksWithLLM(
         mappedLinks,
         rephrasedPrompt,
@@ -201,6 +225,9 @@ export async function processUrl(
       );
       mappedLinks = rerankerResult.mapDocument;
       tokensUsed += rerankerResult.tokensUsed;
+      logger.info("Reranked! (pass 2)", {
+        linkCount: mappedLinks.length,
+      });
     }
 
     // dumpToFile(