diff --git a/package-lock.json b/package-lock.json index 94b66bb..feb3d0f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17,7 +17,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.4-9a05d29", + "civkit": "^0.8.4-5f839a7", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", @@ -4095,9 +4095,9 @@ } }, "node_modules/civkit": { - "version": "0.8.4-9a05d29", - "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-9a05d29.tgz", - "integrity": "sha512-NqK2lDSrtVGVLrASGuD6khlS1mDV2Ey/HNufB+Q0loxAf9NGFtkLgoB6WdGuSmA3EQnZFQ5nX3EQLbX5IiLTjQ==", + "version": "0.8.4-5f839a7", + "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-5f839a7.tgz", + "integrity": "sha512-wF9Sm0dKBNGTXtueYtmwqreciilEw2+H3uAZgJNK/B+MoeQecvQ1alrqPqIP/Xf64H1ik6mD0Z47cez8jkayGA==", "license": "AGPL", "dependencies": { "lodash": "^4.17.21", diff --git a/package.json b/package.json index 0aabb0f..c5a584a 100644 --- a/package.json +++ b/package.json @@ -25,7 +25,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.4-9a05d29", + "civkit": "^0.8.4-5f839a7", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", diff --git a/src/api/crawler.ts b/src/api/crawler.ts index d241d87..05334cc 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -940,6 +940,9 @@ export class CrawlerHost extends RPCHost { this.threadLocal.set('retainImages', opts.retainImages); this.threadLocal.set('noGfm', opts.noGfm); this.threadLocal.set('DNT', Boolean(opts.doNotTrack)); + if (opts.markdown) { + this.threadLocal.set('turndownOpts', opts.markdown); + } const crawlOpts: ExtraScrappingOptions = { proxyUrl: opts.proxyUrl, diff --git a/src/dto/crawler-options.ts b/src/dto/crawler-options.ts index 1101719..44cee6f 100644 --- a/src/dto/crawler-options.ts +++ b/src/dto/crawler-options.ts @@ -1,6 +1,7 @@ import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where 
your decorators are defined import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser'; import { Context } from '../services/registry'; +import { TurnDownTweakableOptions } from './turndown-tweakable-options'; export enum CONTENT_FORMAT { CONTENT = 'content', @@ -209,6 +210,41 @@ class Viewport extends AutoCastable { in: 'header', schema: { type: 'string' } }, + 'X-Md-Heading-Style': { + description: 'Heading style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: setext, atx', + in: 'header', + schema: { type: 'string' } + }, + 'X-Md-Hr': { + description: 'Hr text of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).', + in: 'header', + schema: { type: 'string' } + }, + 'X-Md-Bullet-List-Marker': { + description: 'Bullet list marker of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: -, +, *', + in: 'header', + schema: { type: 'string' } + }, + 'X-Md-Em-Delimiter': { + description: 'Em delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: _, *', + in: 'header', + schema: { type: 'string' } + }, + 'X-Md-Strong-Delimiter': { + description: 'Strong delimiter of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: **, __', + in: 'header', + schema: { type: 'string' } + }, + 'X-Md-Link-Style': { + description: 'Link style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: inlined, referenced', + in: 'header', + schema: { type: 
'string' } + }, + 'X-Md-Link-Reference-Style': { + description: 'Link reference style of the generated markdown.\n\nThis is an option passed through to [Turndown](https://github.com/mixmark-io/turndown?tab=readme-ov-file#options).\n\nSupported: full, collapsed, shortcut', + in: 'header', + schema: { type: 'string' } + }, } } } @@ -353,6 +389,9 @@ export class CrawlerOptions extends AutoCastable { @Prop() doNotTrack?: number | null; + @Prop() + markdown?: TurnDownTweakableOptions; + static override from(input: any) { const instance = super.from(input) as CrawlerOptions; const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined; @@ -510,6 +549,10 @@ export class CrawlerOptions extends AutoCastable { instance.cacheTolerance = instance.cacheTolerance * 1000; } + if (ctx) { + instance.markdown ??= TurnDownTweakableOptions.fromCtx(ctx); + } + return instance; } diff --git a/src/dto/turndown-tweakable-options.ts b/src/dto/turndown-tweakable-options.ts new file mode 100644 index 0000000..d167351 --- /dev/null +++ b/src/dto/turndown-tweakable-options.ts @@ -0,0 +1,61 @@ +import { AutoCastable, Prop } from 'civkit/civ-rpc'; +import {Context} from '../services/registry'; +import _ from 'lodash'; + + +export class TurnDownTweakableOptions extends AutoCastable { + @Prop({ + desc: 'Turndown options > headingStyle', + type: new Set(['setext', 'atx']), + }) + headingStyle?: 'setext' | 'atx'; + + @Prop({ + desc: 'Turndown options > hr', + validate: (v: string) => v.length > 0 && v.length <= 128 + }) + hr?: string; + + @Prop({ + desc: 'Turndown options > bulletListMarker', + type: new Set(['-', '+', '*']), + }) + bulletListMarker?: '-' | '+' | '*'; + + @Prop({ + desc: 'Turndown options > emDelimiter', + type: new Set(['_', '*']), + }) + emDelimiter?: '_' | '*'; + + @Prop({ + desc: 'Turndown options > strongDelimiter', + type: new Set(['__', '**']), + }) + strongDelimiter?: '__' | '**'; + + @Prop({ + desc: 'Turndown options > linkStyle', + type: new Set(['inlined', 
'referenced']), + }) + linkStyle?: 'inlined' | 'referenced'; + + @Prop({ + desc: 'Turndown options > linkReferenceStyle', + type: new Set(['full', 'collapsed', 'shortcut']), + }) + linkReferenceStyle?: 'full' | 'collapsed' | 'shortcut'; + + static fromCtx(ctx: Context, prefix= 'x-md-') { + const draft: Record<string, string> = {}; + for (const [k, v] of Object.entries(ctx.headers)) { + if (k.startsWith(prefix)) { + const prop = k.slice(prefix.length); + const sk = _.camelCase(prop); + draft[sk] = v as string; + } + } + + return this.from(draft); + } +} diff --git a/src/services/snapshot-formatter.ts b/src/services/snapshot-formatter.ts index b3562ba..11382d6 100644 --- a/src/services/snapshot-formatter.ts +++ b/src/services/snapshot-formatter.ts @@ -580,7 +580,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; customRules?: { [k: string]: Rule; }; customKeep?: Filter; }) { + const turndownOpts = this.threadLocal.get('turndownOpts'); const turnDownService = new TurndownService({ + ...turndownOpts, codeBlockStyle: 'fenced', preformattedCode: true, } as any);