mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 03:46:04 +08:00
feat(search): add further logging
This commit is contained in:
parent
b1fdc0e850
commit
6e9396dc57
@ -1,5 +1,4 @@
|
|||||||
import { Response } from "express";
|
import { Response } from "express";
|
||||||
import { logger } from "../../lib/logger";
|
|
||||||
import {
|
import {
|
||||||
Document,
|
Document,
|
||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
@ -19,6 +18,8 @@ import { search } from "../../search";
|
|||||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
|
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
|
||||||
|
import { logger as _logger } from "../../lib/logger";
|
||||||
|
import type { Logger } from "winston";
|
||||||
|
|
||||||
// Used for deep research
|
// Used for deep research
|
||||||
export async function searchAndScrapeSearchResult(
|
export async function searchAndScrapeSearchResult(
|
||||||
@ -28,7 +29,8 @@ export async function searchAndScrapeSearchResult(
|
|||||||
origin: string;
|
origin: string;
|
||||||
timeout: number;
|
timeout: number;
|
||||||
scrapeOptions: ScrapeOptions;
|
scrapeOptions: ScrapeOptions;
|
||||||
}
|
},
|
||||||
|
logger: Logger,
|
||||||
): Promise<Document[]> {
|
): Promise<Document[]> {
|
||||||
try {
|
try {
|
||||||
const searchResults = await search({
|
const searchResults = await search({
|
||||||
@ -44,7 +46,8 @@ export async function searchAndScrapeSearchResult(
|
|||||||
title: result.title,
|
title: result.title,
|
||||||
description: result.description
|
description: result.description
|
||||||
},
|
},
|
||||||
options
|
options,
|
||||||
|
logger
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
@ -63,6 +66,7 @@ async function scrapeSearchResult(
|
|||||||
timeout: number;
|
timeout: number;
|
||||||
scrapeOptions: ScrapeOptions;
|
scrapeOptions: ScrapeOptions;
|
||||||
},
|
},
|
||||||
|
logger: Logger,
|
||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
const jobId = uuidv4();
|
const jobId = uuidv4();
|
||||||
const jobPriority = await getJobPriority({
|
const jobPriority = await getJobPriority({
|
||||||
@ -74,6 +78,12 @@ async function scrapeSearchResult(
|
|||||||
if (isUrlBlocked(searchResult.url)) {
|
if (isUrlBlocked(searchResult.url)) {
|
||||||
throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE);
|
throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE);
|
||||||
}
|
}
|
||||||
|
logger.info("Adding scrape job", {
|
||||||
|
scrapeId: jobId,
|
||||||
|
url: searchResult.url,
|
||||||
|
teamId: options.teamId,
|
||||||
|
origin: options.origin,
|
||||||
|
});
|
||||||
await addScrapeJob(
|
await addScrapeJob(
|
||||||
{
|
{
|
||||||
url: searchResult.url,
|
url: searchResult.url,
|
||||||
@ -90,6 +100,12 @@ async function scrapeSearchResult(
|
|||||||
);
|
);
|
||||||
|
|
||||||
const doc = await waitForJob<Document>(jobId, options.timeout);
|
const doc = await waitForJob<Document>(jobId, options.timeout);
|
||||||
|
logger.info("Scrape job completed", {
|
||||||
|
scrapeId: jobId,
|
||||||
|
url: searchResult.url,
|
||||||
|
teamId: options.teamId,
|
||||||
|
origin: options.origin,
|
||||||
|
});
|
||||||
await getScrapeQueue().remove(jobId);
|
await getScrapeQueue().remove(jobId);
|
||||||
|
|
||||||
// Move SERP results to top level
|
// Move SERP results to top level
|
||||||
@ -101,6 +117,7 @@ async function scrapeSearchResult(
|
|||||||
};
|
};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Error in scrapeSearchResult: ${error}`, {
|
logger.error(`Error in scrapeSearchResult: ${error}`, {
|
||||||
|
scrapeId: jobId,
|
||||||
url: searchResult.url,
|
url: searchResult.url,
|
||||||
teamId: options.teamId,
|
teamId: options.teamId,
|
||||||
});
|
});
|
||||||
@ -126,10 +143,22 @@ export async function searchController(
|
|||||||
req: RequestWithAuth<{}, SearchResponse, SearchRequest>,
|
req: RequestWithAuth<{}, SearchResponse, SearchRequest>,
|
||||||
res: Response<SearchResponse>,
|
res: Response<SearchResponse>,
|
||||||
) {
|
) {
|
||||||
|
const jobId = uuidv4();
|
||||||
|
let logger = _logger.child({
|
||||||
|
jobId,
|
||||||
|
teamId: req.auth.team_id,
|
||||||
|
module: "search",
|
||||||
|
method: "searchController",
|
||||||
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
req.body = searchRequestSchema.parse(req.body);
|
req.body = searchRequestSchema.parse(req.body);
|
||||||
|
|
||||||
const jobId = uuidv4();
|
logger = logger.child({
|
||||||
|
query: req.body.query,
|
||||||
|
origin: req.body.origin,
|
||||||
|
});
|
||||||
|
|
||||||
const startTime = new Date().getTime();
|
const startTime = new Date().getTime();
|
||||||
|
|
||||||
let limit = req.body.limit;
|
let limit = req.body.limit;
|
||||||
@ -137,6 +166,8 @@ export async function searchController(
|
|||||||
// Request twice the limit (a 100% buffer) to account for filtered URLs
|
// Request twice the limit (a 100% buffer) to account for filtered URLs
|
||||||
const num_results_buffer = Math.floor(limit * 2);
|
const num_results_buffer = Math.floor(limit * 2);
|
||||||
|
|
||||||
|
logger.info("Searching for results");
|
||||||
|
|
||||||
let searchResults = await search({
|
let searchResults = await search({
|
||||||
query: req.body.query,
|
query: req.body.query,
|
||||||
advanced: false,
|
advanced: false,
|
||||||
@ -148,12 +179,17 @@ export async function searchController(
|
|||||||
location: req.body.location,
|
location: req.body.location,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
logger.info("Searching completed", {
|
||||||
|
num_results: searchResults.length,
|
||||||
|
});
|
||||||
|
|
||||||
// Trim the buffered results back down to the requested limit early to avoid unnecessary billing
|
// Trim the buffered results back down to the requested limit early to avoid unnecessary billing
|
||||||
if (searchResults.length > limit) {
|
if (searchResults.length > limit) {
|
||||||
searchResults = searchResults.slice(0, limit);
|
searchResults = searchResults.slice(0, limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (searchResults.length === 0) {
|
if (searchResults.length === 0) {
|
||||||
|
logger.info("No search results found");
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
data: [],
|
data: [],
|
||||||
@ -183,16 +219,20 @@ export async function searchController(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Scrape each non-blocked result, handling timeouts individually
|
// Scrape each non-blocked result, handling timeouts individually
|
||||||
|
logger.info("Scraping search results");
|
||||||
const scrapePromises = searchResults.map((result) =>
|
const scrapePromises = searchResults.map((result) =>
|
||||||
scrapeSearchResult(result, {
|
scrapeSearchResult(result, {
|
||||||
teamId: req.auth.team_id,
|
teamId: req.auth.team_id,
|
||||||
origin: req.body.origin,
|
origin: req.body.origin,
|
||||||
timeout: req.body.timeout,
|
timeout: req.body.timeout,
|
||||||
scrapeOptions: req.body.scrapeOptions,
|
scrapeOptions: req.body.scrapeOptions,
|
||||||
}),
|
}, logger),
|
||||||
);
|
);
|
||||||
|
|
||||||
const docs = await Promise.all(scrapePromises);
|
const docs = await Promise.all(scrapePromises);
|
||||||
|
logger.info("Scraping completed", {
|
||||||
|
num_docs: docs.length,
|
||||||
|
});
|
||||||
|
|
||||||
// Bill for successful scrapes only
|
// Bill for successful scrapes only
|
||||||
billTeam(req.auth.team_id, req.acuc?.sub_id, docs.length).catch((error) => {
|
billTeam(req.auth.team_id, req.acuc?.sub_id, docs.length).catch((error) => {
|
||||||
@ -207,6 +247,10 @@ export async function searchController(
|
|||||||
doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0),
|
doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
logger.info("Filtering completed", {
|
||||||
|
num_docs: filteredDocs.length,
|
||||||
|
});
|
||||||
|
|
||||||
if (filteredDocs.length === 0) {
|
if (filteredDocs.length === 0) {
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
@ -218,6 +262,11 @@ export async function searchController(
|
|||||||
const endTime = new Date().getTime();
|
const endTime = new Date().getTime();
|
||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
|
|
||||||
|
logger.info("Logging job", {
|
||||||
|
num_docs: filteredDocs.length,
|
||||||
|
time_taken: timeTakenInSeconds,
|
||||||
|
});
|
||||||
|
|
||||||
logJob({
|
logJob({
|
||||||
job_id: jobId,
|
job_id: jobId,
|
||||||
success: true,
|
success: true,
|
||||||
|
@ -109,7 +109,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
|||||||
fastMode: false,
|
fastMode: false,
|
||||||
blockAds: false,
|
blockAds: false,
|
||||||
},
|
},
|
||||||
});
|
}, logger);
|
||||||
return response.length > 0 ? response : [];
|
return response.length > 0 ? response : [];
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user