feat(search): add further logging

Author: Gergő Móricz
Date: 2025-04-11 18:57:14 +02:00
parent b1fdc0e850
commit 6e9396dc57
2 changed files with 55 additions and 6 deletions


@@ -1,5 +1,4 @@
 import { Response } from "express";
-import { logger } from "../../lib/logger";
 import {
   Document,
   RequestWithAuth,
@@ -19,6 +18,8 @@ import { search } from "../../search";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import * as Sentry from "@sentry/node";
 import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
+import { logger as _logger } from "../../lib/logger";
+import type { Logger } from "winston";
 
 // Used for deep research
 export async function searchAndScrapeSearchResult(
@@ -28,7 +29,8 @@ export async function searchAndScrapeSearchResult(
     origin: string;
     timeout: number;
     scrapeOptions: ScrapeOptions;
-  }
+  },
+  logger: Logger,
 ): Promise<Document[]> {
   try {
     const searchResults = await search({
@@ -44,7 +46,8 @@ export async function searchAndScrapeSearchResult(
           title: result.title,
           description: result.description
         },
-        options
+        options,
+        logger
       )
     )
   );
@@ -63,6 +66,7 @@ async function scrapeSearchResult(
     timeout: number;
     scrapeOptions: ScrapeOptions;
   },
+  logger: Logger,
 ): Promise<Document> {
   const jobId = uuidv4();
   const jobPriority = await getJobPriority({
@@ -74,6 +78,12 @@ async function scrapeSearchResult(
   if (isUrlBlocked(searchResult.url)) {
     throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE);
   }
+  logger.info("Adding scrape job", {
+    scrapeId: jobId,
+    url: searchResult.url,
+    teamId: options.teamId,
+    origin: options.origin,
+  });
   await addScrapeJob(
     {
       url: searchResult.url,
@@ -90,6 +100,12 @@ async function scrapeSearchResult(
   );
 
   const doc = await waitForJob<Document>(jobId, options.timeout);
+  logger.info("Scrape job completed", {
+    scrapeId: jobId,
+    url: searchResult.url,
+    teamId: options.teamId,
+    origin: options.origin,
+  });
   await getScrapeQueue().remove(jobId);
 
   // Move SERP results to top level
@@ -101,6 +117,7 @@ async function scrapeSearchResult(
     };
   } catch (error) {
     logger.error(`Error in scrapeSearchResult: ${error}`, {
+      scrapeId: jobId,
       url: searchResult.url,
       teamId: options.teamId,
     });
@@ -126,10 +143,22 @@ export async function searchController(
   req: RequestWithAuth<{}, SearchResponse, SearchRequest>,
   res: Response<SearchResponse>,
 ) {
+  const jobId = uuidv4();
+  let logger = _logger.child({
+    jobId,
+    teamId: req.auth.team_id,
+    module: "search",
+    method: "searchController",
+  });
+
   try {
     req.body = searchRequestSchema.parse(req.body);
-    const jobId = uuidv4();
+    logger = logger.child({
+      query: req.body.query,
+      origin: req.body.origin,
+    });
+
     const startTime = new Date().getTime();
     let limit = req.body.limit;
@@ -137,6 +166,8 @@ export async function searchController(
     // Buffer results by 50% to account for filtered URLs
     const num_results_buffer = Math.floor(limit * 2);
 
+    logger.info("Searching for results");
+
     let searchResults = await search({
       query: req.body.query,
       advanced: false,
@@ -148,12 +179,17 @@ export async function searchController(
       location: req.body.location,
     });
 
+    logger.info("Searching completed", {
+      num_results: searchResults.length,
+    });
+
     // Filter blocked URLs early to avoid unnecessary billing
     if (searchResults.length > limit) {
       searchResults = searchResults.slice(0, limit);
     }
 
     if (searchResults.length === 0) {
+      logger.info("No search results found");
       return res.status(200).json({
         success: true,
         data: [],
@@ -183,16 +219,20 @@ export async function searchController(
     }
 
     // Scrape each non-blocked result, handling timeouts individually
+    logger.info("Scraping search results");
     const scrapePromises = searchResults.map((result) =>
       scrapeSearchResult(result, {
         teamId: req.auth.team_id,
         origin: req.body.origin,
         timeout: req.body.timeout,
         scrapeOptions: req.body.scrapeOptions,
-      }),
+      }, logger),
     );
 
     const docs = await Promise.all(scrapePromises);
+    logger.info("Scraping completed", {
+      num_docs: docs.length,
+    });
 
     // Bill for successful scrapes only
     billTeam(req.auth.team_id, req.acuc?.sub_id, docs.length).catch((error) => {
@@ -207,6 +247,10 @@ export async function searchController(
       doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0),
     );
 
+    logger.info("Filtering completed", {
+      num_docs: filteredDocs.length,
+    });
+
     if (filteredDocs.length === 0) {
       return res.status(200).json({
         success: true,
@@ -218,6 +262,11 @@ export async function searchController(
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
 
+    logger.info("Logging job", {
+      num_docs: filteredDocs.length,
+      time_taken: timeTakenInSeconds,
+    });
+
     logJob({
       job_id: jobId,
       success: true,


@@ -109,7 +109,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
           fastMode: false,
           blockAds: false,
         },
-      });
+      }, logger);
 
       return response.length > 0 ? response : [];
     });
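
The pattern across both files is the same: rather than each module importing the shared logger directly, the controller creates a request-scoped winston child logger once (binding jobId, teamId, module, method), re-childs it with query and origin after parsing the request body, and passes it down through searchAndScrapeSearchResult and scrapeSearchResult so every log line automatically carries the job's context. A minimal sketch of that pattern, assuming a stock winston setup — the handler and helper names below are illustrative stand-ins, not the actual Firecrawl API:

import { randomUUID } from "node:crypto";
import { createLogger, format, transports, type Logger } from "winston";

// Stand-in for the shared instance exported from lib/logger.
const _logger: Logger = createLogger({
  format: format.json(),
  transports: [new transports.Console()],
});

// Hypothetical entry point: bind request-scoped metadata once.
function handleSearch(teamId: string): void {
  const jobId = randomUUID();
  const logger = _logger.child({ jobId, teamId, module: "search" });

  logger.info("Adding scrape job"); // emits { jobId, teamId, module } automatically
  scrapeOne(logger); // pass the child down instead of importing a global logger
}

// Hypothetical helper: accepts the child so its logs share the same context.
function scrapeOne(logger: Logger): void {
  logger.info("Scrape job completed"); // still carries the bound metadata
}

Because child() merges metadata instead of replacing it, per-call fields (like the scrapeId and url objects in the hunks above) stack on top of the bound context, and a second child() call can narrow the context further once more request data is known.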