From c327d688a64b943bbac6a0c3ab685a84752c093c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 12 Nov 2024 18:10:11 +0100 Subject: [PATCH 1/3] fix(queue-worker): don't log timeouts --- apps/api/src/services/queue-worker.ts | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index e44f65b3..428e7e01 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -504,19 +504,19 @@ async function processJob(job: Job & { id: string }, token: string) { job: job.id, }, }); + + if (error instanceof CustomError) { + // Here we handle the error, then save the failed job + logger.error(error.message); // or any other error handling + } + logger.error(error); + if (error.stack) { + logger.error(error.stack); + } } else { logger.error(`🐂 Job timed out ${job.id}`); } - if (error instanceof CustomError) { - // Here we handle the error, then save the failed job - logger.error(error.message); // or any other error handling - } - logger.error(error); - if (error.stack) { - logger.error(error.stack); - } - const data = { success: false, document: null, From 740a429790c25e1f67e4b959d558f823a1877e10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 12 Nov 2024 18:10:24 +0100 Subject: [PATCH 2/3] feat(api): graceful shutdown for less 502 errors --- apps/api/src/index.ts | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 7f7ec036..049b37a9 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -80,6 +80,17 @@ function startServer(port = DEFAULT_PORT) { `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` ); }); + + const exitHandler = () => { + logger.info('SIGTERM signal received: closing HTTP server') + server.close(() => { + logger.info("Server closed."); + process.exit(0); + }); + }; + + process.on('SIGTERM', exitHandler); + process.on('SIGINT', exitHandler); return server; } From fbabc779f59c3889424f42e9d75a844807da475a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 12 Nov 2024 18:20:53 +0100 Subject: [PATCH 3/3] fix(crawler): relative URL handling on non-start pages (#893) * fix(crawler): relative URL handling on non-start pages * fix(crawl): further fixing --- apps/api/src/lib/crawl-redis.ts | 4 ++-- apps/api/src/scraper/WebScraper/crawler.ts | 2 +- apps/api/src/services/queue-worker.ts | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index b5936ad6..bd79a86d 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -166,10 +166,10 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro return res; } -export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler { +export function crawlToCrawler(id: string, sc: StoredCrawl, initialUrl?: string): WebCrawler { const crawler = new WebCrawler({ jobId: id, - initialUrl: sc.originUrl!, + initialUrl: initialUrl ?? sc.originUrl!, includes: sc.crawlerOptions?.includes ?? [], excludes: sc.crawlerOptions?.excludes ?? [], maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000, diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 9e3f7cd2..e5a25f37 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -171,7 +171,7 @@ export class WebCrawler { let fullUrl = href; if (!href.startsWith("http")) { try { - fullUrl = new URL(href, this.baseUrl).toString(); + fullUrl = new URL(href, url).toString(); } catch (_) { return null; } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 428e7e01..5a0b28db 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -352,10 +352,10 @@ async function processJob(job: Job & { id: string }, token: string) { if (!job.data.sitemapped && job.data.crawlerOptions !== null) { if (!sc.cancelled) { - const crawler = crawlToCrawler(job.data.crawl_id, sc); + const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata?.url ?? doc.metadata?.sourceURL ?? undefined); const links = crawler.filterLinks( - crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl as string), + crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string), Infinity, sc.crawlerOptions?.maxDepth ?? 10 );