feat: md options pass through to turndown

This commit is contained in:
Yanlong Wang 2025-03-09 10:31:39 +08:00
parent 2720b69e60
commit 6b9e14de62
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
6 changed files with 114 additions and 5 deletions

8
package-lock.json generated
View File

@ -17,7 +17,7 @@
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.4-9a05d29",
"civkit": "^0.8.4-5f839a7",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
@ -4095,9 +4095,9 @@
}
},
"node_modules/civkit": {
"version": "0.8.4-9a05d29",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-9a05d29.tgz",
"integrity": "sha512-NqK2lDSrtVGVLrASGuD6khlS1mDV2Ey/HNufB+Q0loxAf9NGFtkLgoB6WdGuSmA3EQnZFQ5nX3EQLbX5IiLTjQ==",
"version": "0.8.4-5f839a7",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-5f839a7.tgz",
"integrity": "sha512-wF9Sm0dKBNGTXtueYtmwqreciilEw2+H3uAZgJNK/B+MoeQecvQ1alrqPqIP/Xf64H1ik6mD0Z47cez8jkayGA==",
"license": "AGPL",
"dependencies": {
"lodash": "^4.17.21",

View File

@ -25,7 +25,7 @@
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.4-9a05d29",
"civkit": "^0.8.4-5f839a7",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",

View File

@ -940,6 +940,9 @@ export class CrawlerHost extends RPCHost {
this.threadLocal.set('retainImages', opts.retainImages);
this.threadLocal.set('noGfm', opts.noGfm);
this.threadLocal.set('DNT', Boolean(opts.doNotTrack));
if (opts.markdown) {
this.threadLocal.set('turndownOpts', opts.markdown);
}
const crawlOpts: ExtraScrappingOptions = {
proxyUrl: opts.proxyUrl,

View File

@ -1,6 +1,7 @@
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
import { Context } from '../services/registry';
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
export enum CONTENT_FORMAT {
CONTENT = 'content',
@ -209,6 +210,41 @@ class Viewport extends AutoCastable {
in: 'header',
schema: { type: 'string' }
},
'X-Md-Heading-Style': {
description: 'Heading style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: setext, atx',
in: 'header',
schema: { type: 'string' }
},
'X-Md-Hr': {
description: 'Hr text of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).',
in: 'header',
schema: { type: 'string' }
},
'X-Md-Bullet-List-Marker': {
description: 'Bullet list marker of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: -, +, *',
in: 'header',
schema: { type: 'string' }
},
'X-Md-Em-Delimiter': {
description: 'Em delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: _, *',
in: 'header',
schema: { type: 'string' }
},
'X-Md-Strong-Delimiter': {
description: 'Strong delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: **, __',
in: 'header',
schema: { type: 'string' }
},
'X-Md-Link-Style': {
description: 'Link style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: inlined, referenced',
in: 'header',
schema: { type: 'string' }
},
'X-Md-Link-Reference-Style': {
description: 'Link reference style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: full, collapsed, shortcut',
in: 'header',
schema: { type: 'string' }
},
}
}
}
@ -353,6 +389,9 @@ export class CrawlerOptions extends AutoCastable {
@Prop()
doNotTrack?: number | null;
@Prop()
markdown?: TurnDownTweakableOptions;
static override from(input: any) {
const instance = super.from(input) as CrawlerOptions;
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
@ -510,6 +549,10 @@ export class CrawlerOptions extends AutoCastable {
instance.cacheTolerance = instance.cacheTolerance * 1000;
}
if (ctx) {
instance.markdown ??= TurnDownTweakableOptions.fromCtx(ctx);
}
return instance;
}

View File

@ -0,0 +1,61 @@
import { AutoCastable, Prop } from 'civkit/civ-rpc';
import {Context} from '../services/registry';
import _ from 'lodash';
/**
 * The subset of Turndown options that callers are allowed to tweak.
 *
 * Each field mirrors a Turndown option of the same name. The `type` sets and
 * the `validate` callback are handed to the `@Prop` decorator; presumably the
 * AutoCastable base class uses them to reject unsupported values when casting
 * (NOTE(review): confirm against civkit).
 */
export class TurnDownTweakableOptions extends AutoCastable {
    /** Heading style: `setext` or `atx`. */
    @Prop({
        desc: 'Turndown options > headingStyle',
        type: new Set(['setext', 'atx']),
    })
    headingStyle?: 'setext' | 'atx';

    /** Horizontal rule text; the validator requires 1 to 128 characters. */
    @Prop({
        desc: 'Turndown options > hr',
        validate: (v: string) => v.length > 0 && v.length <= 128
    })
    hr?: string;

    /** Bullet list marker: `-`, `+` or `*`. */
    @Prop({
        desc: 'Turndown options > bulletListMarker',
        type: new Set(['-', '+', '*']),
    })
    bulletListMarker?: '-' | '+' | '*';

    /** Emphasis delimiter: `_` or `*`. */
    @Prop({
        desc: 'Turndown options > emDelimiter',
        type: new Set(['_', '*']),
    })
    emDelimiter?: '_' | '*';

    /** Strong delimiter: `__` or `**`. */
    @Prop({
        desc: 'Turndown options > strongDelimiter',
        type: new Set(['__', '**']),
    })
    strongDelimiter?: '__' | '**';

    /** Link style: `inlined` or `referenced`. */
    @Prop({
        desc: 'Turndown options > linkStyle',
        type: new Set(['inlined', 'referenced']),
    })
    linkStyle?: 'inlined' | 'referenced';

    /** Link reference style: `full`, `collapsed` or `shortcut`. */
    @Prop({
        desc: 'Turndown options > linkReferenceStyle',
        type: new Set(['full', 'collapsed', 'shortcut']),
    })
    linkReferenceStyle?: 'full' | 'collapsed' | 'shortcut';

    /**
     * Builds an options instance from the request headers on `ctx`.
     *
     * Every header whose name starts with `prefix` (case-sensitive match) has
     * the prefix removed and the remainder camel-cased, e.g.
     * `x-md-heading-style` becomes `headingStyle`. The collected record is
     * then passed to `this.from`.
     *
     * @param ctx - Request context whose `headers` are scanned.
     * @param prefix - Header name prefix to look for; defaults to `x-md-`.
     * @returns Whatever `this.from` produces for the collected record.
     */
    static fromCtx(ctx: Context, prefix= 'x-md-') {
        const prefixedPairs = Object.entries(ctx.headers)
            .filter(([headerName]) => headerName.startsWith(prefix))
            .map(([headerName, headerValue]): [string, string] => {
                const optionName = _.camelCase(headerName.slice(prefix.length));

                return [optionName, headerValue as string];
            });
        // When two headers map to the same option name, the later entry wins.
        const draft: Record<string, string> = Object.fromEntries(prefixedPairs);

        return this.from(draft);
    }
}

View File

@ -580,7 +580,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
customRules?: { [k: string]: Rule; };
customKeep?: Filter;
}) {
const turndownOpts = this.threadLocal.get('turndownOpts');
const turnDownService = new TurndownService({
...turndownOpts,
codeBlockStyle: 'fenced',
preformattedCode: true,
} as any);