mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 05:15:57 +08:00
feat: opt out gfm/table
This commit is contained in:
parent
2b29679801
commit
16cabcaf22
@ -688,6 +688,7 @@ export class CrawlerHost extends RPCHost {
|
||||
this.threadLocal.set('timeout', opts.timeout * 1000);
|
||||
}
|
||||
this.threadLocal.set('retainImages', opts.retainImages);
|
||||
this.threadLocal.set('noGfm', opts.noGfm);
|
||||
|
||||
const crawlOpts: ExtraScrappingOptions = {
|
||||
proxyUrl: opts.proxyUrl,
|
||||
|
@ -203,6 +203,11 @@ export class CrawlerOptions extends AutoCastable {
|
||||
})
|
||||
noCache!: boolean;
|
||||
|
||||
@Prop({
|
||||
default: false,
|
||||
})
|
||||
noGfm!: string | boolean;
|
||||
|
||||
@Prop()
|
||||
cacheTolerance?: number;
|
||||
|
||||
@ -318,6 +323,11 @@ export class CrawlerOptions extends AutoCastable {
|
||||
instance.cacheTolerance = cacheTolerance;
|
||||
}
|
||||
|
||||
const noGfm = ctx?.req.get('x-no-gfm');
|
||||
if (noGfm) {
|
||||
instance.noGfm = noGfm === 'table' ? noGfm : Boolean(noGfm);
|
||||
}
|
||||
|
||||
let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
|
||||
if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
|
||||
instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
|
||||
|
@ -1,7 +1,7 @@
|
||||
import { randomUUID } from 'crypto';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
|
||||
import TurndownService, { Rule } from 'turndown';
|
||||
import TurndownService, { Filter, Rule } from 'turndown';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import { PageSnapshot } from './puppeteer';
|
||||
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
||||
@ -44,12 +44,15 @@ export interface FormattedPage {
|
||||
|
||||
export const md5Hasher = new HashManager('md5', 'hex');
|
||||
|
||||
const gfmPlugin = require('turndown-plugin-gfm');
|
||||
|
||||
@singleton()
|
||||
export class SnapshotFormatter extends AsyncService {
|
||||
|
||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||
|
||||
turnDownPlugins = [require('turndown-plugin-gfm').tables, require('turndown-plugin-gfm').strikethrough];
|
||||
gfmPlugin = gfmPlugin.gfm;
|
||||
gfmNoTable = [gfmPlugin.highlightedCodeBlock, gfmPlugin.strikethrough, gfmPlugin.taskListItems];
|
||||
|
||||
constructor(
|
||||
protected globalLogger: Logger,
|
||||
@ -182,9 +185,10 @@ export class SnapshotFormatter extends AsyncService {
|
||||
}
|
||||
|
||||
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
||||
const noGFMOpts = this.threadLocal.get('noGfm');
|
||||
const imageRetention = this.threadLocal.get('retainImages') as CrawlerOptions['retainImages'];
|
||||
let imgIdx = 0;
|
||||
const customRules = {
|
||||
const customRules: { [k: string]: Rule; } = {
|
||||
'img-retention': {
|
||||
filter: 'img',
|
||||
replacement: (_content: string, node: HTMLElement) => {
|
||||
@ -255,10 +259,16 @@ export class SnapshotFormatter extends AsyncService {
|
||||
}
|
||||
} as Rule
|
||||
};
|
||||
const optsMixin = {
|
||||
url: snapshot.rebase || nominalUrl,
|
||||
customRules,
|
||||
customKeep: noGFMOpts === 'table' ? 'table' : undefined,
|
||||
imgDataUrlToObjectUrl,
|
||||
} as const;
|
||||
|
||||
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
||||
let toBeTurnedToMd = jsDomElementOfHTML;
|
||||
let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl, customRules });
|
||||
let turnDownService = this.getTurndown({ ...optsMixin });
|
||||
if (!mode.includes('markdown') && snapshot.parsed?.content) {
|
||||
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
||||
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
||||
@ -266,15 +276,15 @@ export class SnapshotFormatter extends AsyncService {
|
||||
|
||||
// If Readability did its job
|
||||
if (par2.length >= 0.3 * par1.length) {
|
||||
turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl, customRules });
|
||||
turnDownService = this.getTurndown({ noRules: true, ...optsMixin });
|
||||
if (snapshot.parsed.content) {
|
||||
toBeTurnedToMd = jsDomElementOfParsed;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const plugin of this.turnDownPlugins) {
|
||||
turnDownService = turnDownService.use(plugin);
|
||||
if (!noGFMOpts) {
|
||||
turnDownService = turnDownService.use(noGFMOpts === 'table' ? this.gfmNoTable : this.gfmPlugin);
|
||||
}
|
||||
|
||||
// _p is the special suffix for withGeneratedAlt
|
||||
@ -297,7 +307,7 @@ export class SnapshotFormatter extends AsyncService {
|
||||
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
||||
} catch (err) {
|
||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl, customRules });
|
||||
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
||||
try {
|
||||
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
||||
} catch (err2) {
|
||||
@ -315,7 +325,7 @@ export class SnapshotFormatter extends AsyncService {
|
||||
contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
|
||||
} catch (err) {
|
||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl, customRules });
|
||||
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
||||
try {
|
||||
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
|
||||
} catch (err2) {
|
||||
@ -465,11 +475,15 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
imgDataUrlToObjectUrl?: boolean;
|
||||
removeImages?: boolean | 'src';
|
||||
customRules?: { [k: string]: Rule; };
|
||||
customKeep?: Filter
|
||||
}) {
|
||||
const turnDownService = new TurndownService({
|
||||
codeBlockStyle: 'fenced',
|
||||
preformattedCode: true,
|
||||
} as any);
|
||||
if (options?.customKeep) {
|
||||
turnDownService.keep(options.customKeep);
|
||||
}
|
||||
if (!options?.noRules) {
|
||||
turnDownService.addRule('remove-irrelevant', {
|
||||
filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
|
||||
@ -586,7 +600,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (content.includes('<table') && content.includes('</table>')) {
|
||||
if (!this.threadLocal.get('noGfm') && content.includes('<table') && content.includes('</table>')) {
|
||||
if (node?.textContent && content.length > node.textContent.length * 0.8) {
|
||||
return true;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user