Nick: metadata fixes and lock duration for bull decreased to 2 hrs
parent e5314ee8e7
commit e7be17db92
@@ -399,12 +399,14 @@ export async function scrapSingleUrl(
     return {
       text: await parseMarkdown(cleanedHtml),
       html: cleanedHtml,
+      rawHtml: scraperResponse.text,
       screenshot: scraperResponse.screenshot,
       pageStatusCode: scraperResponse.metadata.pageStatusCode,
       pageError: scraperResponse.metadata.pageError || undefined
     };
   };
-  let { text, html, screenshot, pageStatusCode, pageError } = { text: "", html: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
+
+  let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = { text: "", html: "", rawHtml: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
   try {
     let urlKey = urlToScrap;
     try {
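
The let statement changed above is just a compact idiom: destructuring an object literal declares all of the per-page result variables in one line and gives each an empty default, which the scraper attempts later overwrite. A tiny illustrative sketch of that idiom (variable names taken from the diff; the sample values and the type widening are mine):

  // One declaration creates every result variable with an empty default;
  // whichever scraper succeeds later simply reassigns the fields it produced.
  let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = {
    text: "",
    html: "",
    rawHtml: "",
    screenshot: "",
    pageStatusCode: 200,
    pageError: undefined as string | undefined, // widen the type so an error message can be assigned later
  };

  rawHtml = "<html><head><title>Example</title></head><body></body></html>";
  pageError = "page took too long to load";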
@@ -432,6 +434,7 @@ export async function scrapSingleUrl(
       const attempt = await attemptScraping(urlToScrap, scraper);
       text = attempt.text ?? '';
       html = attempt.html ?? '';
+      rawHtml = attempt.rawHtml ?? '';
       screenshot = attempt.screenshot ?? '';
       if (attempt.pageStatusCode) {
         pageStatusCode = attempt.pageStatusCode;
@@ -453,7 +456,7 @@ export async function scrapSingleUrl(
     throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
   }

-  const soup = cheerio.load(html);
+  const soup = cheerio.load(rawHtml);
   const metadata = extractMetadata(soup, urlToScrap);

   let document: Document;
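
Read together, the three hunks above thread a new rawHtml value (the scraper's unmodified response body) through scrapSingleUrl: attemptScraping now returns it, the per-scraper retry loop stores it, and cheerio parses it for metadata instead of the cleaned HTML. That appears to be the "metadata fixes" in the commit title, since cleaning can strip the very tags extractMetadata needs. A minimal sketch of the shape this implies; the ScrapeAttempt interface and pickTitle helper are illustrative stand-ins, not the repository's actual types:

  import * as cheerio from "cheerio";

  // Illustrative shape of a single scraping attempt after this commit.
  interface ScrapeAttempt {
    text: string;        // markdown rendered from the cleaned HTML
    html: string;        // cleaned HTML
    rawHtml: string;     // unmodified HTML from the scraper (the new field)
    screenshot: string;
    pageStatusCode?: number;
    pageError?: string;
  }

  // Hypothetical helper showing the metadata path after the change:
  // parse the raw HTML so <head> metadata survives aggressive cleaning.
  function pickTitle(attempt: ScrapeAttempt): string | undefined {
    const $ = cheerio.load(attempt.rawHtml); // was the cleaned html before this commit
    return $("title").first().text() || undefined;
  }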
@@ -7,7 +7,7 @@ export function getWebScraperQueue() {
   if (!webScraperQueue) {
     webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, {
       settings: {
-        lockDuration: 4 * 60 * 60 * 1000, // 4 hours in milliseconds,
+        lockDuration: 2 * 60 * 60 * 1000, // 2 hours in milliseconds,
         lockRenewTime: 30 * 60 * 1000, // 30 minutes in milliseconds
       },
     });
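
For context on the last hunk: lockDuration and lockRenewTime are Bull's stall-detection settings. A worker must keep renewing a job's lock within lockDuration or Bull treats the job as stalled and may hand it to another worker, and lockRenewTime is how often the running worker renews that lock; this commit halves the lock window from 4 hours to 2. A small self-contained sketch with the values from the diff (only the localhost Redis fallback is an assumption here):

  import Queue from "bull";

  // Same queue name and lock settings as the diff; the localhost fallback is illustrative.
  const webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL ?? "redis://localhost:6379", {
    settings: {
      lockDuration: 2 * 60 * 60 * 1000, // lock valid for 2 hours (was 4) before the job counts as stalled
      lockRenewTime: 30 * 60 * 1000,    // worker renews the lock every 30 minutes while the job runs
    },
  });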