feat: keepImgDataUrl

This commit is contained in:
yanlong.wang 2024-06-24 15:18:53 +08:00
parent 1084b16c84
commit 62fb6cff94
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 61 additions and 6 deletions

View File

@ -29,6 +29,7 @@ const md5Hasher = new HashManager('md5', 'hex');
export interface ExtraScrappingOptions extends ScrappingOptions {
targetSelector?: string | string[];
removeSelector?: string | string[];
keepImgDataUrl?: boolean;
}
export interface FormattedPage {
@ -135,6 +136,7 @@ export class CrawlerHost extends RPCHost {
getTurndown(options?: {
noRules?: boolean | string,
url?: string | URL;
imgDataUrlToObjectUrl?: boolean;
}) {
const turnDownService = new TurndownService({
codeBlockStyle: 'fenced',
@ -154,6 +156,26 @@ export class CrawlerHost extends RPCHost {
replacement: (innerText) => `${innerText}\n===============\n`
});
}
if (options?.imgDataUrlToObjectUrl) {
turnDownService.addRule('data-url-to-pseudo-object-url', {
filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
replacement: (_content, node: any) => {
const src = (node.getAttribute('src') || '').trim();
const alt = cleanAttribute(node.getAttribute('alt')) || '';
if (options.url) {
const refUrl = new URL(options.url);
const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);
return `![${alt}](${mappedUrl})`;
}
return `![${alt}](blob:${md5Hasher.hash(src)})`;
}
});
}
turnDownService.addRule('improved-paragraph', {
filter: 'p',
replacement: (innerText) => {
@ -317,6 +339,7 @@ export class CrawlerHost extends RPCHost {
}
} as FormattedPage;
}
const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl'));
let contentText = '';
const imageSummary = {} as { [k: string]: string; };
@ -328,14 +351,14 @@ export class CrawlerHost extends RPCHost {
}
let toBeTurnedToMd = snapshot.html;
let turnDownService = this.getTurndown({ url: nominalUrl });
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
if (mode !== 'markdown' && snapshot.parsed?.content) {
const par1 = turnDownService.turndown(toBeTurnedToMd);
const par1 = turnDownService.turndown(snapshot.html);
const par2 = turnDownService.turndown(snapshot.parsed.content);
// If Readability did its job
if (par2.length >= 0.3 * par1.length) {
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href });
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
toBeTurnedToMd = snapshot.parsed.content;
}
}
@ -388,11 +411,25 @@ export class CrawlerHost extends RPCHost {
if (mapped) {
imageSummary[src] = mapped || alt;
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
mappedUrl.protocol = 'blob:';
return `![Image ${imgIdx}: ${mapped || alt}](${mappedUrl})`;
}
return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
}
imageSummary[src] = alt || '';
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
mappedUrl.protocol = 'blob:';
return alt ? `![Image ${imgIdx}: ${alt}](${mappedUrl})` : `![Image ${imgIdx}](${mappedUrl})`;
}
return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
}
});
@ -402,7 +439,7 @@ export class CrawlerHost extends RPCHost {
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
} catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
try {
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
} catch (err2) {
@ -419,7 +456,7 @@ export class CrawlerHost extends RPCHost {
contentText = turnDownService.turndown(snapshot.html);
} catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
try {
contentText = vanillaTurnDownService.turndown(snapshot.html);
} catch (err2) {
@ -922,6 +959,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
this.threadLocal.set('userAgent', opts.userAgent);
if (opts.timeout) {

View File

@ -60,6 +60,13 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
in: 'header',
schema: { type: 'string' }
},
'X-Keep-Img-Data-Url': {
description: `Keep data-url as it instead of transforming them to object-url. (Only applicable when targeting markdown format)\n\n` +
'Example `X-Keep-Img-Data-Url: true`'
,
in: 'header',
schema: { type: 'string' }
},
'X-Proxy-Url': {
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
`Supported protocols: \n` +
@ -146,6 +153,11 @@ export class CrawlerOptions extends AutoCastable {
@Prop({ arrayOf: String })
removeSelector?: string | string[];
@Prop({
default: false,
})
keepImgDataUrl!: boolean;
@Prop({
arrayOf: String,
})
@ -212,6 +224,11 @@ export class CrawlerOptions extends AutoCastable {
const overrideUserAgent = ctx?.req.get('x-user-agent');
instance.userAgent ??= overrideUserAgent;
const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
if (keepImgDataUrl !== undefined) {
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
}
const cookies: CookieParam[] = [];
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
if (Array.isArray(setCookieHeaders)) {

@ -1 +1 @@
Subproject commit 38177e1e3814970613ce6d8fe3e3cf0030d92066
Subproject commit e7216f6ed7aaee80068ffabce78a37ce66b9c50e