From 579f259cb94174f5fd7ab42160acb96dace6eee9 Mon Sep 17 00:00:00 2001 From: "yanlong.wang" Date: Thu, 20 Jun 2024 18:20:13 +0800 Subject: [PATCH] fix: detect when readability does not work --- .../functions/src/cloud-functions/crawler.ts | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 1b9fc97..77cc3a5 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -327,8 +327,19 @@ export class CrawlerHost extends RPCHost { break; } - const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content; - let turnDownService = mode === 'markdown' ? this.getTurndown({ url: snapshot.href }) : this.getTurndown({ noRules: true, url: snapshot.href }); + let toBeTurnedToMd = snapshot.html; + let turnDownService = this.getTurndown({ url: nominalUrl }); + if (mode !== 'markdown' && snapshot.parsed?.content) { + const par1 = turnDownService.turndown(toBeTurnedToMd); + const par2 = turnDownService.turndown(snapshot.parsed.content) + + // If Readability did its job + if (par2.length >= 0.3 * par1.length) { + turnDownService = this.getTurndown({ noRules: true, url: snapshot.href }); + toBeTurnedToMd = snapshot.parsed.content; + } + } + for (const plugin of this.turnDownPlugins) { turnDownService = turnDownService.use(plugin); } @@ -585,7 +596,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; let urlToCrawl; const normalizeUrl = (await pNormalizeUrl).default; try { - urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false, sortQueryParameters:false })); + urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false, sortQueryParameters: false })); } catch (err) { throw new ParamValidationError({ message: `${err}`,