mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-18 02:25:55 +08:00
fix: remove tidyMarkdown entirely
This commit is contained in:
parent
59f807cb7c
commit
36bf5d96b5
@ -16,7 +16,6 @@ import TurndownService from 'turndown';
|
|||||||
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||||
import type { CookieParam } from 'puppeteer';
|
import type { CookieParam } from 'puppeteer';
|
||||||
import { Crawled } from '../db/crawled';
|
import { Crawled } from '../db/crawled';
|
||||||
import { tidyMarkdown } from '../utils/markdown';
|
|
||||||
import { cleanAttribute } from '../utils/misc';
|
import { cleanAttribute } from '../utils/misc';
|
||||||
import { randomUUID } from 'crypto';
|
import { randomUUID } from 'crypto';
|
||||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||||
@ -198,7 +197,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
contentText = snapshot.text;
|
contentText = snapshot.text;
|
||||||
}
|
}
|
||||||
|
|
||||||
const cleanText = tidyMarkdown(contentText || '').trim();
|
const cleanText = (contentText || '').trim();
|
||||||
|
|
||||||
const formatted = {
|
const formatted = {
|
||||||
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
||||||
|
@ -1,34 +1,39 @@
|
|||||||
|
|
||||||
export function tidyMarkdown(markdown: string): string {
|
export function tidyMarkdown(markdown: string): string {
|
||||||
const lines = markdown.split('\n');
|
|
||||||
const processedLines = lines.map((line) => {
|
|
||||||
// Handle complex broken links with text and optional images
|
|
||||||
line = line.replace(/\[\s*([^\]\n!]*?)\s*(?:!\[([^\]]*)\]\((.*?)\))?\s*\]\s*\(\s*([^)\n]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => {
|
|
||||||
text = text.replace(/\s+/g, ' ').trim();
|
|
||||||
alt = alt ? alt.replace(/\s+/g, ' ').trim() : '';
|
|
||||||
imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : '';
|
|
||||||
linkUrl = linkUrl.replace(/\s+/g, '').trim();
|
|
||||||
if (imgUrl) {
|
|
||||||
return `[${text} ![${alt}](${imgUrl})](${linkUrl})`;
|
|
||||||
} else {
|
|
||||||
return `[${text}](${linkUrl})`;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// Normalize regular links that may be broken across lines
|
// Step 1: Handle complex broken links with text and optional images spread across multiple lines
|
||||||
line = line.replace(/\[\s*([^\]\n]+)\]\s*\(\s*([^)\n]+)\s*\)/g, (match, text, url) => {
|
let normalizedMarkdown = markdown.replace(/\[\s*([^\]\n]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, url) => {
|
||||||
text = text.replace(/\s+/g, ' ').trim();
|
// Remove internal new lines and excessive spaces within the text
|
||||||
url = url.replace(/\s+/g, '').trim();
|
text = text.replace(/\s+/g, ' ').trim();
|
||||||
return `[${text}](${url})`;
|
url = url.replace(/\s+/g, '').trim();
|
||||||
});
|
return `[${text}](${url})`;
|
||||||
|
|
||||||
return line;
|
|
||||||
});
|
});
|
||||||
|
|
||||||
// Join the processed lines back together
|
normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]\n!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => {
|
||||||
let normalizedMarkdown = processedLines.join('\n');
|
// Normalize by removing excessive spaces and new lines
|
||||||
|
text = text.replace(/\s+/g, ' ').trim();
|
||||||
|
alt = alt ? alt.replace(/\s+/g, ' ').trim() : '';
|
||||||
|
imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : '';
|
||||||
|
linkUrl = linkUrl.replace(/\s+/g, '').trim();
|
||||||
|
if (imgUrl) {
|
||||||
|
return `[${text} ![${alt}](${imgUrl})](${linkUrl})`;
|
||||||
|
} else {
|
||||||
|
return `[${text}](${linkUrl})`;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
// Replace more than two consecutive empty lines with exactly two empty lines
|
// Step 2: Normalize regular links that may be broken across lines
|
||||||
|
normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g, (match, text, url) => {
|
||||||
|
text = text.replace(/\s+/g, ' ').trim();
|
||||||
|
url = url.replace(/\s+/g, '').trim();
|
||||||
|
return `[${text}](${url})`;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Step 3: Replace more than two consecutive empty lines with exactly two empty lines
|
||||||
normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');
|
normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');
|
||||||
|
|
||||||
|
// Step 4: Remove leading spaces from each line
|
||||||
|
normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, '');
|
||||||
|
|
||||||
return normalizedMarkdown.trim();
|
return normalizedMarkdown.trim();
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user