mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-20 12:39:15 +08:00
feat: md options pass through to turndown
This commit is contained in:
parent
2720b69e60
commit
6b9e14de62
8
package-lock.json
generated
8
package-lock.json
generated
@ -17,7 +17,7 @@
|
|||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"busboy": "^1.6.0",
|
"busboy": "^1.6.0",
|
||||||
"civkit": "^0.8.4-9a05d29",
|
"civkit": "^0.8.4-5f839a7",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
@ -4095,9 +4095,9 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/civkit": {
|
"node_modules/civkit": {
|
||||||
"version": "0.8.4-9a05d29",
|
"version": "0.8.4-5f839a7",
|
||||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-9a05d29.tgz",
|
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-5f839a7.tgz",
|
||||||
"integrity": "sha512-NqK2lDSrtVGVLrASGuD6khlS1mDV2Ey/HNufB+Q0loxAf9NGFtkLgoB6WdGuSmA3EQnZFQ5nX3EQLbX5IiLTjQ==",
|
"integrity": "sha512-wF9Sm0dKBNGTXtueYtmwqreciilEw2+H3uAZgJNK/B+MoeQecvQ1alrqPqIP/Xf64H1ik6mD0Z47cez8jkayGA==",
|
||||||
"license": "AGPL",
|
"license": "AGPL",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"lodash": "^4.17.21",
|
"lodash": "^4.17.21",
|
||||||
|
@ -25,7 +25,7 @@
|
|||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"busboy": "^1.6.0",
|
"busboy": "^1.6.0",
|
||||||
"civkit": "^0.8.4-9a05d29",
|
"civkit": "^0.8.4-5f839a7",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
|
@ -940,6 +940,9 @@ export class CrawlerHost extends RPCHost {
|
|||||||
this.threadLocal.set('retainImages', opts.retainImages);
|
this.threadLocal.set('retainImages', opts.retainImages);
|
||||||
this.threadLocal.set('noGfm', opts.noGfm);
|
this.threadLocal.set('noGfm', opts.noGfm);
|
||||||
this.threadLocal.set('DNT', Boolean(opts.doNotTrack));
|
this.threadLocal.set('DNT', Boolean(opts.doNotTrack));
|
||||||
|
if (opts.markdown) {
|
||||||
|
this.threadLocal.set('turndownOpts', opts.markdown);
|
||||||
|
}
|
||||||
|
|
||||||
const crawlOpts: ExtraScrappingOptions = {
|
const crawlOpts: ExtraScrappingOptions = {
|
||||||
proxyUrl: opts.proxyUrl,
|
proxyUrl: opts.proxyUrl,
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
||||||
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||||
import { Context } from '../services/registry';
|
import { Context } from '../services/registry';
|
||||||
|
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
|
||||||
|
|
||||||
export enum CONTENT_FORMAT {
|
export enum CONTENT_FORMAT {
|
||||||
CONTENT = 'content',
|
CONTENT = 'content',
|
||||||
@ -209,6 +210,41 @@ class Viewport extends AutoCastable {
|
|||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
|
'X-Md-Heading-Style': {
|
||||||
|
description: 'Heading style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: setext, atx',
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Md-Hr': {
|
||||||
|
description: 'Hr text of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).',
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Md-Bullet-List-Marker': {
|
||||||
|
description: 'Bullet list marker of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: -, +, *',
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Md-Em-Delimiter': {
|
||||||
|
description: 'Em delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: _, *',
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Md-Strong-Delimiter': {
|
||||||
|
description: 'Strong delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: **, __',
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Md-Link-Style': {
|
||||||
|
description: 'Link style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: inlined, referenced',
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Md-Link-Reference-Style': {
|
||||||
|
description: 'Link reference style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: full, collapsed, shortcut',
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -353,6 +389,9 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
@Prop()
|
@Prop()
|
||||||
doNotTrack?: number | null;
|
doNotTrack?: number | null;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
markdown?: TurnDownTweakableOptions;
|
||||||
|
|
||||||
static override from(input: any) {
|
static override from(input: any) {
|
||||||
const instance = super.from(input) as CrawlerOptions;
|
const instance = super.from(input) as CrawlerOptions;
|
||||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
||||||
@ -510,6 +549,10 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ctx) {
|
||||||
|
instance.markdown ??= TurnDownTweakableOptions.fromCtx(ctx);
|
||||||
|
}
|
||||||
|
|
||||||
return instance;
|
return instance;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
61
src/dto/turndown-tweakable-options.ts
Normal file
61
src/dto/turndown-tweakable-options.ts
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
import { AutoCastable, Prop } from 'civkit/civ-rpc';
|
||||||
|
import {Context} from '../services/registry';
|
||||||
|
import _ from 'lodash';
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * The subset of Turndown formatting options that a caller is allowed to tweak
 * per request.
 *
 * Each property carries a `@Prop` descriptor; the `type` sets below list the
 * values documented as supported for the matching `X-Md-*` request header.
 * NOTE(review): presumably civkit treats a `Set` passed as `type` as an
 * enumeration of allowed values — confirm against civkit's `Prop` docs.
 */
export class TurnDownTweakableOptions extends AutoCastable {
    /** Heading syntax of the generated markdown. */
    @Prop({
        desc: 'Turndown options > headingStyle',
        type: new Set(['setext', 'atx']),
    })
    headingStyle?: 'setext' | 'atx';

    /** Text used for horizontal rules; the validator accepts 1 to 128 characters. */
    @Prop({
        desc: 'Turndown options > hr',
        validate: (v: string) => v.length > 0 && v.length <= 128
    })
    hr?: string;

    /** Marker placed in front of bullet list items. */
    @Prop({
        desc: 'Turndown options > bulletListMarker',
        type: new Set(['-', '+', '*']),
    })
    bulletListMarker?: '-' | '+' | '*';

    /** Delimiter wrapped around emphasized text. */
    @Prop({
        desc: 'Turndown options > emDelimiter',
        type: new Set(['_', '*']),
    })
    emDelimiter?: '_' | '*';

    /** Delimiter wrapped around strong text. */
    @Prop({
        desc: 'Turndown options > strongDelimiter',
        type: new Set(['__', '**']),
    })
    strongDelimiter?: '__' | '**';

    /** Whether links are written inline or as references. */
    @Prop({
        desc: 'Turndown options > linkStyle',
        type: new Set(['inlined', 'referenced']),
    })
    linkStyle?: 'inlined' | 'referenced';

    /** Reference flavour used for links. */
    @Prop({
        desc: 'Turndown options > linkReferenceStyle',
        type: new Set(['full', 'collapsed', 'shortcut']),
    })
    linkReferenceStyle?: 'full' | 'collapsed' | 'shortcut';

    /**
     * Builds an options instance from the request headers of `ctx`.
     *
     * Every header whose name starts with `prefix` is collected: the prefix is
     * stripped and the remainder is camel-cased to form the property name
     * (e.g. `x-md-heading-style` becomes `headingStyle`). The collected draft
     * is then handed to `from()`.
     *
     * The prefix is compared case-sensitively.
     * NOTE(review): this assumes header names in `ctx.headers` are already
     * lower-cased, as Node's HTTP parser does — confirm for `Context`.
     *
     * @param ctx - Request context whose `headers` map is scanned.
     * @param prefix - Header name prefix that marks a markdown option.
     * @returns Whatever `from()` produces for the collected draft.
     */
    static fromCtx(ctx: Context, prefix = 'x-md-') {
        const draft: Record<string, string> = {};

        for (const [name, raw] of Object.entries(ctx.headers)) {
            if (!name.startsWith(prefix)) {
                continue;
            }

            // Header maps are conventionally typed string | string[] | undefined.
            // Narrow at runtime instead of asserting, so a repeated or absent
            // header never reaches the caster disguised as a string.
            const value = Array.isArray(raw) ? raw[0] : raw;
            if (typeof value !== 'string') {
                continue;
            }

            draft[_.camelCase(name.slice(prefix.length))] = value;
        }

        return this.from(draft);
    }
}
|
@ -580,7 +580,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
customRules?: { [k: string]: Rule; };
|
customRules?: { [k: string]: Rule; };
|
||||||
customKeep?: Filter;
|
customKeep?: Filter;
|
||||||
}) {
|
}) {
|
||||||
|
const turndownOpts = this.threadLocal.get('turndownOpts');
|
||||||
const turnDownService = new TurndownService({
|
const turnDownService = new TurndownService({
|
||||||
|
...turndownOpts,
|
||||||
codeBlockStyle: 'fenced',
|
codeBlockStyle: 'fenced',
|
||||||
preformattedCode: true,
|
preformattedCode: true,
|
||||||
} as any);
|
} as any);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user