Nick: metadata fixes; Bull queue lock duration decreased to 2 hours

This commit is contained in:
Nicolas 2024-06-25 15:21:14 -03:00
parent e5314ee8e7
commit e7be17db92
2 changed files with 7 additions and 4 deletions

View File

@ -399,12 +399,14 @@ export async function scrapSingleUrl(
return {
text: await parseMarkdown(cleanedHtml),
html: cleanedHtml,
rawHtml: scraperResponse.text,
screenshot: scraperResponse.screenshot,
pageStatusCode: scraperResponse.metadata.pageStatusCode,
pageError: scraperResponse.metadata.pageError || undefined
};
};
let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = { text: "", html: "", rawHtml: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
try {
let urlKey = urlToScrap;
try {
@ -432,6 +434,7 @@ export async function scrapSingleUrl(
const attempt = await attemptScraping(urlToScrap, scraper);
text = attempt.text ?? '';
html = attempt.html ?? '';
rawHtml = attempt.rawHtml ?? '';
screenshot = attempt.screenshot ?? '';
if (attempt.pageStatusCode) {
pageStatusCode = attempt.pageStatusCode;
@ -453,7 +456,7 @@ export async function scrapSingleUrl(
throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
}
const soup = cheerio.load(html);
const soup = cheerio.load(rawHtml);
const metadata = extractMetadata(soup, urlToScrap);
let document: Document;

View File

@ -7,7 +7,7 @@ export function getWebScraperQueue() {
if (!webScraperQueue) {
webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, {
settings: {
lockDuration: 4 * 60 * 60 * 1000, // 4 hours in milliseconds,
lockDuration: 2 * 60 * 60 * 1000, // 2 hours in milliseconds,
lockRenewTime: 30 * 60 * 1000, // 30 minutes in milliseconds
},
});