mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 06:45:53 +08:00
feat: keepImgDataUrl
This commit is contained in:
parent
1084b16c84
commit
62fb6cff94
@ -29,6 +29,7 @@ const md5Hasher = new HashManager('md5', 'hex');
|
||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||
targetSelector?: string | string[];
|
||||
removeSelector?: string | string[];
|
||||
keepImgDataUrl?: boolean;
|
||||
}
|
||||
|
||||
export interface FormattedPage {
|
||||
@ -135,6 +136,7 @@ export class CrawlerHost extends RPCHost {
|
||||
getTurndown(options?: {
|
||||
noRules?: boolean | string,
|
||||
url?: string | URL;
|
||||
imgDataUrlToObjectUrl?: boolean;
|
||||
}) {
|
||||
const turnDownService = new TurndownService({
|
||||
codeBlockStyle: 'fenced',
|
||||
@ -154,6 +156,26 @@ export class CrawlerHost extends RPCHost {
|
||||
replacement: (innerText) => `${innerText}\n===============\n`
|
||||
});
|
||||
}
|
||||
|
||||
if (options?.imgDataUrlToObjectUrl) {
|
||||
turnDownService.addRule('data-url-to-pseudo-object-url', {
|
||||
filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
|
||||
replacement: (_content, node: any) => {
|
||||
const src = (node.getAttribute('src') || '').trim();
|
||||
const alt = cleanAttribute(node.getAttribute('alt')) || '';
|
||||
|
||||
if (options.url) {
|
||||
const refUrl = new URL(options.url);
|
||||
const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);
|
||||
|
||||
return `![${alt}](${mappedUrl})`;
|
||||
}
|
||||
|
||||
return `![${alt}](blob:${md5Hasher.hash(src)})`;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
turnDownService.addRule('improved-paragraph', {
|
||||
filter: 'p',
|
||||
replacement: (innerText) => {
|
||||
@ -317,6 +339,7 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
} as FormattedPage;
|
||||
}
|
||||
const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl'));
|
||||
|
||||
let contentText = '';
|
||||
const imageSummary = {} as { [k: string]: string; };
|
||||
@ -328,14 +351,14 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
let toBeTurnedToMd = snapshot.html;
|
||||
let turnDownService = this.getTurndown({ url: nominalUrl });
|
||||
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
||||
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
||||
const par1 = turnDownService.turndown(toBeTurnedToMd);
|
||||
const par1 = turnDownService.turndown(snapshot.html);
|
||||
const par2 = turnDownService.turndown(snapshot.parsed.content);
|
||||
|
||||
// If Readability did its job
|
||||
if (par2.length >= 0.3 * par1.length) {
|
||||
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href });
|
||||
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
|
||||
toBeTurnedToMd = snapshot.parsed.content;
|
||||
}
|
||||
}
|
||||
@ -388,11 +411,25 @@ export class CrawlerHost extends RPCHost {
|
||||
if (mapped) {
|
||||
imageSummary[src] = mapped || alt;
|
||||
|
||||
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
||||
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
||||
mappedUrl.protocol = 'blob:';
|
||||
|
||||
return ``;
|
||||
}
|
||||
|
||||
return ``;
|
||||
}
|
||||
|
||||
imageSummary[src] = alt || '';
|
||||
|
||||
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
||||
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
||||
mappedUrl.protocol = 'blob:';
|
||||
|
||||
return alt ? `` : ``;
|
||||
}
|
||||
|
||||
return alt ? `` : ``;
|
||||
}
|
||||
});
|
||||
@ -402,7 +439,7 @@ export class CrawlerHost extends RPCHost {
|
||||
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
||||
} catch (err) {
|
||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
|
||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
||||
try {
|
||||
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
||||
} catch (err2) {
|
||||
@ -419,7 +456,7 @@ export class CrawlerHost extends RPCHost {
|
||||
contentText = turnDownService.turndown(snapshot.html);
|
||||
} catch (err) {
|
||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
|
||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
||||
try {
|
||||
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
||||
} catch (err2) {
|
||||
@ -922,6 +959,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
|
||||
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
|
||||
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
||||
this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
|
||||
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
||||
this.threadLocal.set('userAgent', opts.userAgent);
|
||||
if (opts.timeout) {
|
||||
|
@ -60,6 +60,13 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Keep-Img-Data-Url': {
|
||||
description: `Keep data-url as it instead of transforming them to object-url. (Only applicable when targeting markdown format)\n\n` +
|
||||
'Example `X-Keep-Img-Data-Url: true`'
|
||||
,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Proxy-Url': {
|
||||
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
|
||||
`Supported protocols: \n` +
|
||||
@ -146,6 +153,11 @@ export class CrawlerOptions extends AutoCastable {
|
||||
@Prop({ arrayOf: String })
|
||||
removeSelector?: string | string[];
|
||||
|
||||
@Prop({
|
||||
default: false,
|
||||
})
|
||||
keepImgDataUrl!: boolean;
|
||||
|
||||
@Prop({
|
||||
arrayOf: String,
|
||||
})
|
||||
@ -212,6 +224,11 @@ export class CrawlerOptions extends AutoCastable {
|
||||
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
||||
instance.userAgent ??= overrideUserAgent;
|
||||
|
||||
const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
|
||||
if (keepImgDataUrl !== undefined) {
|
||||
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
|
||||
}
|
||||
|
||||
const cookies: CookieParam[] = [];
|
||||
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
||||
if (Array.isArray(setCookieHeaders)) {
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit 38177e1e3814970613ce6d8fe3e3cf0030d92066
|
||||
Subproject commit e7216f6ed7aaee80068ffabce78a37ce66b9c50e
|
Loading…
x
Reference in New Issue
Block a user