Update search.ts

This commit is contained in:
Nicolas 2025-01-02 21:13:24 -03:00
parent e37ab8431a
commit cbe0716439

View File

@@ -18,6 +18,7 @@ import { getScrapeQueue } from "../../services/queue-service";
import { search } from "../../search"; import { search } from "../../search";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
async function scrapeSearchResult( async function scrapeSearchResult(
searchResult: { url: string; title: string; description: string }, searchResult: { url: string; title: string; description: string },
@@ -37,6 +38,9 @@ async function scrapeSearchResult(
}); });
try { try {
if (isUrlBlocked(searchResult.url)) {
throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE);
}
await addScrapeJob( await addScrapeJob(
{ {
url: searchResult.url, url: searchResult.url,
@@ -75,9 +79,6 @@ async function scrapeSearchResult(
description: searchResult.description, description: searchResult.description,
url: searchResult.url, url: searchResult.url,
metadata: { metadata: {
title: searchResult.title,
description: searchResult.description,
sourceURL: searchResult.url,
statusCode: 0, statusCode: 0,
error: error.message, error: error.message,
}, },
@@ -145,9 +146,6 @@ export async function searchController(
}); });
} }
// Filter out blocked URLs before scraping
searchResults = searchResults.filter((r) => !isUrlBlocked(r.url));
// Scrape each non-blocked result, handling timeouts individually // Scrape each non-blocked result, handling timeouts individually
const scrapePromises = searchResults.map((result) => const scrapePromises = searchResults.map((result) =>
scrapeSearchResult(result, { scrapeSearchResult(result, {