mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 05:45:54 +08:00
feat: md options pass through to turndown
This commit is contained in:
parent
2720b69e60
commit
6b9e14de62
8
package-lock.json
generated
8
package-lock.json
generated
@ -17,7 +17,7 @@
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.8.4-9a05d29",
|
||||
"civkit": "^0.8.4-5f839a7",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
@ -4095,9 +4095,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/civkit": {
|
||||
"version": "0.8.4-9a05d29",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-9a05d29.tgz",
|
||||
"integrity": "sha512-NqK2lDSrtVGVLrASGuD6khlS1mDV2Ey/HNufB+Q0loxAf9NGFtkLgoB6WdGuSmA3EQnZFQ5nX3EQLbX5IiLTjQ==",
|
||||
"version": "0.8.4-5f839a7",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-5f839a7.tgz",
|
||||
"integrity": "sha512-wF9Sm0dKBNGTXtueYtmwqreciilEw2+H3uAZgJNK/B+MoeQecvQ1alrqPqIP/Xf64H1ik6mD0Z47cez8jkayGA==",
|
||||
"license": "AGPL",
|
||||
"dependencies": {
|
||||
"lodash": "^4.17.21",
|
||||
|
@ -25,7 +25,7 @@
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.8.4-9a05d29",
|
||||
"civkit": "^0.8.4-5f839a7",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
|
@ -940,6 +940,9 @@ export class CrawlerHost extends RPCHost {
|
||||
this.threadLocal.set('retainImages', opts.retainImages);
|
||||
this.threadLocal.set('noGfm', opts.noGfm);
|
||||
this.threadLocal.set('DNT', Boolean(opts.doNotTrack));
|
||||
if (opts.markdown) {
|
||||
this.threadLocal.set('turndownOpts', opts.markdown);
|
||||
}
|
||||
|
||||
const crawlOpts: ExtraScrappingOptions = {
|
||||
proxyUrl: opts.proxyUrl,
|
||||
|
@ -1,6 +1,7 @@
|
||||
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
||||
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||
import { Context } from '../services/registry';
|
||||
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
|
||||
|
||||
export enum CONTENT_FORMAT {
|
||||
CONTENT = 'content',
|
||||
@ -209,6 +210,41 @@ class Viewport extends AutoCastable {
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Md-Heading-Style': {
|
||||
description: 'Heading style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: setext, atx',
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Md-Hr': {
|
||||
description: 'Hr text of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).',
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Md-Bullet-List-Marker': {
|
||||
description: 'Bullet list marker of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: -, +, *',
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Md-Em-Delimiter': {
|
||||
description: 'Em delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: _, *',
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Md-Strong-Delimiter': {
|
||||
description: 'Strong delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: **, __',
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Md-Link-Style': {
|
||||
description: 'Link style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: inlined, referenced',
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Md-Link-Reference-Style': {
|
||||
description: 'Link reference style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: full, collapsed, shortcut',
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -353,6 +389,9 @@ export class CrawlerOptions extends AutoCastable {
|
||||
@Prop()
|
||||
doNotTrack?: number | null;
|
||||
|
||||
@Prop()
|
||||
markdown?: TurnDownTweakableOptions;
|
||||
|
||||
static override from(input: any) {
|
||||
const instance = super.from(input) as CrawlerOptions;
|
||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
||||
@ -510,6 +549,10 @@ export class CrawlerOptions extends AutoCastable {
|
||||
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
||||
}
|
||||
|
||||
if (ctx) {
|
||||
instance.markdown ??= TurnDownTweakableOptions.fromCtx(ctx);
|
||||
}
|
||||
|
||||
return instance;
|
||||
}
|
||||
|
||||
|
61
src/dto/turndown-tweakable-options.ts
Normal file
61
src/dto/turndown-tweakable-options.ts
Normal file
@ -0,0 +1,61 @@
|
||||
import { AutoCastable, Prop } from 'civkit/civ-rpc';
|
||||
import {Context} from '../services/registry';
|
||||
import _ from 'lodash';
|
||||
|
||||
|
||||
/**
 * The subset of Turndown (HTML → Markdown) options that a caller is allowed to
 * override per request.
 *
 * Each property is declared through `@Prop` so that `AutoCastable.from()` can
 * validate it; enumerated options are restricted to the listed literals.
 */
export class TurnDownTweakableOptions extends AutoCastable {
    @Prop({
        desc: 'Turndown options > headingStyle',
        type: new Set(['setext', 'atx']),
    })
    headingStyle?: 'setext' | 'atx';

    @Prop({
        desc: 'Turndown options > hr',
        // Reject empty strings and cap the length of the rule text at 128 characters.
        validate: (v: string) => v.length > 0 && v.length <= 128
    })
    hr?: string;

    @Prop({
        desc: 'Turndown options > bulletListMarker',
        type: new Set(['-', '+', '*']),
    })
    bulletListMarker?: '-' | '+' | '*';

    @Prop({
        desc: 'Turndown options > emDelimiter',
        type: new Set(['_', '*']),
    })
    emDelimiter?: '_' | '*';

    @Prop({
        desc: 'Turndown options > strongDelimiter',
        type: new Set(['__', '**']),
    })
    strongDelimiter?: '__' | '**';

    @Prop({
        desc: 'Turndown options > linkStyle',
        type: new Set(['inlined', 'referenced']),
    })
    linkStyle?: 'inlined' | 'referenced';

    @Prop({
        desc: 'Turndown options > linkReferenceStyle',
        type: new Set(['full', 'collapsed', 'shortcut']),
    })
    linkReferenceStyle?: 'full' | 'collapsed' | 'shortcut';

    /**
     * Builds an options instance from the request headers found on `ctx`.
     *
     * Every header whose name starts with `prefix` contributes one property:
     * the prefix is stripped and the remainder is camel-cased, so
     * `x-md-heading-style` becomes `headingStyle`. The collected draft is then
     * handed to `from()` for casting and validation.
     *
     * @param ctx - Request context; only `ctx.headers` is read.
     * @param prefix - Header-name prefix to match. Matching is case-sensitive.
     *   NOTE(review): this assumes header names arrive lower-cased, as Node's
     *   HTTP parser delivers them — confirm against the `Context` type.
     * @returns The result of `from()` applied to the collected header values.
     */
    static fromCtx(ctx: Context, prefix = 'x-md-') {
        const draft: Record<string, string> = {};

        for (const [k, v] of Object.entries(ctx.headers)) {
            if (!k.startsWith(prefix)) {
                continue;
            }

            // Header maps are commonly typed `string | string[] | undefined`.
            // Narrow at runtime instead of asserting, so the draft really
            // contains only strings.
            let value: string;
            if (typeof v === 'string') {
                value = v;
            } else if (Array.isArray(v)) {
                // Fold repeated headers the same way Node does for ordinary
                // (non set-cookie) headers.
                value = v.join(', ');
            } else {
                continue;
            }

            const prop = k.slice(prefix.length);
            draft[_.camelCase(prop)] = value;
        }

        return this.from(draft);
    }
}
|
@ -580,7 +580,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
customRules?: { [k: string]: Rule; };
|
||||
customKeep?: Filter;
|
||||
}) {
|
||||
const turndownOpts = this.threadLocal.get('turndownOpts');
|
||||
const turnDownService = new TurndownService({
|
||||
...turndownOpts,
|
||||
codeBlockStyle: 'fenced',
|
||||
preformattedCode: true,
|
||||
} as any);
|
||||
|
Loading…
x
Reference in New Issue
Block a user