/**
 * Normalizes markdown produced by HTML-to-markdown conversion.
 *
 * The clean-up runs in this order:
 *  1. Three link-repair passes that rewrite `[text](url)` constructs whose
 *     text or URL was padded with whitespace or broken across lines. Runs of
 *     whitespace inside link text collapse to a single space; all whitespace
 *     inside a URL is removed.
 *  2. Leading spaces and tabs are removed from every line.
 *  3. Runs of three or more newlines collapse to two newlines (i.e. at most
 *     one blank line between blocks).
 *  4. The whole result is trimmed.
 *
 * Caveats visible from the implementation: the passes are purely regex based
 * and are not aware of code fences, so link-like text and indentation inside
 * code blocks or nested lists are rewritten as well. Because every whitespace
 * character in a URL is dropped, a link title such as `(url "title")` is
 * fused into the URL.
 *
 * @param markdown - Raw markdown text.
 * @returns The normalized markdown.
 */
export function tidyMarkdown(markdown: string): string {
    // Collapse every whitespace run (including newlines) to one space.
    const squeeze = (value: string): string => value.replace(/\s+/g, ' ').trim();
    // Drop every whitespace character; URLs must not contain any.
    const compact = (value: string): string => value.replace(/\s+/g, '').trim();

    // Pass 1: links whose text sits on a single line but whose URL may be
    // padded or wrapped across lines.
    let normalizedMarkdown = markdown.replace(
        /\[\s*([^\]\n]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g,
        (_match: string, text: string, url: string): string => `[${squeeze(text)}](${compact(url)})`
    );

    // Pass 2: links that wrap optional text followed by an optional image,
    // e.g. `[text ![alt](img)](link)`.
    normalizedMarkdown = normalizedMarkdown.replace(
        /\[\s*([^\]\n!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g,
        (
            _match: string,
            text: string,
            alt: string | undefined,
            imgUrl: string | undefined,
            linkUrl: string
        ): string => {
            const cleanText = squeeze(text);
            const cleanAlt = alt ? squeeze(alt) : '';
            const cleanImgUrl = imgUrl ? compact(imgUrl) : '';
            const cleanLinkUrl = compact(linkUrl);
            // An image with an empty URL is dropped and only the text link is kept.
            return cleanImgUrl
                ? `[${cleanText} ![${cleanAlt}](${cleanImgUrl})](${cleanLinkUrl})`
                : `[${cleanText}](${cleanLinkUrl})`;
        }
    );

    // Pass 3: remaining plain links, including those whose text spans lines.
    normalizedMarkdown = normalizedMarkdown.replace(
        /\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g,
        (_match: string, text: string, url: string): string => `[${squeeze(text)}](${compact(url)})`
    );

    // Strip leading indentation BEFORE collapsing blank lines: a line made
    // only of spaces/tabs becomes empty here, and must be counted as a blank
    // line by the collapse below. Doing it the other way round leaves runs of
    // blank lines behind.
    normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, '');

    // Collapse three or more consecutive newlines into two, leaving at most
    // one blank line between blocks.
    normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');

    return normalizedMarkdown.trim();
}