Nick: metadata fixes and lock duration for bull decreased to 2 hrs

This commit is contained in:
Nicolas 2024-06-25 15:21:14 -03:00
parent e5314ee8e7
commit e7be17db92
2 changed files with 7 additions and 4 deletions

View File

@@ -399,12 +399,14 @@ export async function scrapSingleUrl(
     return {
       text: await parseMarkdown(cleanedHtml),
       html: cleanedHtml,
+      rawHtml: scraperResponse.text,
       screenshot: scraperResponse.screenshot,
       pageStatusCode: scraperResponse.metadata.pageStatusCode,
       pageError: scraperResponse.metadata.pageError || undefined
     };
   };
-  let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
+  let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = { text: "", html: "", rawHtml: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
   try {
     let urlKey = urlToScrap;
     try {
@@ -432,6 +434,7 @@ export async function scrapSingleUrl(
       const attempt = await attemptScraping(urlToScrap, scraper);
       text = attempt.text ?? '';
       html = attempt.html ?? '';
+      rawHtml = attempt.rawHtml ?? '';
       screenshot = attempt.screenshot ?? '';
       if (attempt.pageStatusCode) {
         pageStatusCode = attempt.pageStatusCode;
@@ -453,7 +456,7 @@ export async function scrapSingleUrl(
     throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
   }
-  const soup = cheerio.load(html);
+  const soup = cheerio.load(rawHtml);
   const metadata = extractMetadata(soup, urlToScrap);
   let document: Document;

View File

@@ -7,7 +7,7 @@ export function getWebScraperQueue() {
   if (!webScraperQueue) {
     webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, {
       settings: {
-        lockDuration: 4 * 60 * 60 * 1000, // 4 hours in milliseconds,
+        lockDuration: 2 * 60 * 60 * 1000, // 2 hours in milliseconds,
         lockRenewTime: 30 * 60 * 1000, // 30 minutes in milliseconds
       },
     });