From 9b190127aa40f430d9b5a261f4949f92dde13a4f Mon Sep 17 00:00:00 2001
From: Han Xiao
Date: Sat, 13 Apr 2024 21:40:51 -0700
Subject: [PATCH] fix: clean broken markdown

---
 backend/functions/src/cloud-functions/crawler.ts | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts
index 2a7ed73..feedab2 100644
--- a/backend/functions/src/cloud-functions/crawler.ts
+++ b/backend/functions/src/cloud-functions/crawler.ts
@@ -8,6 +8,7 @@ import { Request, Response } from 'express';
 import normalizeUrl from "@esm2cjs/normalize-url";
 
 function tidyMarkdown(markdown: string): string {
+
     // Step 1: Handle complex broken links with text and optional images spread across multiple lines
     let normalizedMarkdown = markdown.replace(/\[\s*([^]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, url) => {
         // Remove internal new lines and excessive spaces within the text
@@ -39,7 +40,10 @@ function tidyMarkdown(markdown: string): string {
     // Step 3: Replace more than two consecutive empty lines with exactly two empty lines
     normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');
 
-    return normalizedMarkdown;
+    // Step 4: Remove leading spaces from each line
+    normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, '');
+
+    return normalizedMarkdown.trim();
 }
 
 @singleton()