mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-20 01:49:10 +08:00
feat: keepImgDataUrl
This commit is contained in:
parent
1084b16c84
commit
62fb6cff94
@ -29,6 +29,7 @@ const md5Hasher = new HashManager('md5', 'hex');
|
|||||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||||
targetSelector?: string | string[];
|
targetSelector?: string | string[];
|
||||||
removeSelector?: string | string[];
|
removeSelector?: string | string[];
|
||||||
|
keepImgDataUrl?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface FormattedPage {
|
export interface FormattedPage {
|
||||||
@ -135,6 +136,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
getTurndown(options?: {
|
getTurndown(options?: {
|
||||||
noRules?: boolean | string,
|
noRules?: boolean | string,
|
||||||
url?: string | URL;
|
url?: string | URL;
|
||||||
|
imgDataUrlToObjectUrl?: boolean;
|
||||||
}) {
|
}) {
|
||||||
const turnDownService = new TurndownService({
|
const turnDownService = new TurndownService({
|
||||||
codeBlockStyle: 'fenced',
|
codeBlockStyle: 'fenced',
|
||||||
@ -154,6 +156,26 @@ export class CrawlerHost extends RPCHost {
|
|||||||
replacement: (innerText) => `${innerText}\n===============\n`
|
replacement: (innerText) => `${innerText}\n===============\n`
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (options?.imgDataUrlToObjectUrl) {
|
||||||
|
turnDownService.addRule('data-url-to-pseudo-object-url', {
|
||||||
|
filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
|
||||||
|
replacement: (_content, node: any) => {
|
||||||
|
const src = (node.getAttribute('src') || '').trim();
|
||||||
|
const alt = cleanAttribute(node.getAttribute('alt')) || '';
|
||||||
|
|
||||||
|
if (options.url) {
|
||||||
|
const refUrl = new URL(options.url);
|
||||||
|
const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);
|
||||||
|
|
||||||
|
return ``;
|
||||||
|
}
|
||||||
|
|
||||||
|
return `})`;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
turnDownService.addRule('improved-paragraph', {
|
turnDownService.addRule('improved-paragraph', {
|
||||||
filter: 'p',
|
filter: 'p',
|
||||||
replacement: (innerText) => {
|
replacement: (innerText) => {
|
||||||
@ -317,6 +339,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
} as FormattedPage;
|
} as FormattedPage;
|
||||||
}
|
}
|
||||||
|
const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl'));
|
||||||
|
|
||||||
let contentText = '';
|
let contentText = '';
|
||||||
const imageSummary = {} as { [k: string]: string; };
|
const imageSummary = {} as { [k: string]: string; };
|
||||||
@ -328,14 +351,14 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let toBeTurnedToMd = snapshot.html;
|
let toBeTurnedToMd = snapshot.html;
|
||||||
let turnDownService = this.getTurndown({ url: nominalUrl });
|
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
||||||
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
||||||
const par1 = turnDownService.turndown(toBeTurnedToMd);
|
const par1 = turnDownService.turndown(snapshot.html);
|
||||||
const par2 = turnDownService.turndown(snapshot.parsed.content);
|
const par2 = turnDownService.turndown(snapshot.parsed.content);
|
||||||
|
|
||||||
// If Readability did its job
|
// If Readability did its job
|
||||||
if (par2.length >= 0.3 * par1.length) {
|
if (par2.length >= 0.3 * par1.length) {
|
||||||
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href });
|
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
|
||||||
toBeTurnedToMd = snapshot.parsed.content;
|
toBeTurnedToMd = snapshot.parsed.content;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -388,11 +411,25 @@ export class CrawlerHost extends RPCHost {
|
|||||||
if (mapped) {
|
if (mapped) {
|
||||||
imageSummary[src] = mapped || alt;
|
imageSummary[src] = mapped || alt;
|
||||||
|
|
||||||
|
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
||||||
|
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
||||||
|
mappedUrl.protocol = 'blob:';
|
||||||
|
|
||||||
|
return ``;
|
||||||
|
}
|
||||||
|
|
||||||
return ``;
|
return ``;
|
||||||
}
|
}
|
||||||
|
|
||||||
imageSummary[src] = alt || '';
|
imageSummary[src] = alt || '';
|
||||||
|
|
||||||
|
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
||||||
|
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
||||||
|
mappedUrl.protocol = 'blob:';
|
||||||
|
|
||||||
|
return alt ? `` : ``;
|
||||||
|
}
|
||||||
|
|
||||||
return alt ? `` : ``;
|
return alt ? `` : ``;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -402,7 +439,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
|
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
||||||
try {
|
try {
|
||||||
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
||||||
} catch (err2) {
|
} catch (err2) {
|
||||||
@ -419,7 +456,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
contentText = turnDownService.turndown(snapshot.html);
|
contentText = turnDownService.turndown(snapshot.html);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href });
|
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
||||||
try {
|
try {
|
||||||
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
||||||
} catch (err2) {
|
} catch (err2) {
|
||||||
@ -922,6 +959,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
|
this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
|
||||||
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
|
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
|
||||||
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
||||||
|
this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
|
||||||
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
||||||
this.threadLocal.set('userAgent', opts.userAgent);
|
this.threadLocal.set('userAgent', opts.userAgent);
|
||||||
if (opts.timeout) {
|
if (opts.timeout) {
|
||||||
|
@ -60,6 +60,13 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
|
'X-Keep-Img-Data-Url': {
|
||||||
|
description: `Keep data-url as it instead of transforming them to object-url. (Only applicable when targeting markdown format)\n\n` +
|
||||||
|
'Example `X-Keep-Img-Data-Url: true`'
|
||||||
|
,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
'X-Proxy-Url': {
|
'X-Proxy-Url': {
|
||||||
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
|
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
|
||||||
`Supported protocols: \n` +
|
`Supported protocols: \n` +
|
||||||
@ -146,6 +153,11 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
@Prop({ arrayOf: String })
|
@Prop({ arrayOf: String })
|
||||||
removeSelector?: string | string[];
|
removeSelector?: string | string[];
|
||||||
|
|
||||||
|
@Prop({
|
||||||
|
default: false,
|
||||||
|
})
|
||||||
|
keepImgDataUrl!: boolean;
|
||||||
|
|
||||||
@Prop({
|
@Prop({
|
||||||
arrayOf: String,
|
arrayOf: String,
|
||||||
})
|
})
|
||||||
@ -212,6 +224,11 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
||||||
instance.userAgent ??= overrideUserAgent;
|
instance.userAgent ??= overrideUserAgent;
|
||||||
|
|
||||||
|
const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
|
||||||
|
if (keepImgDataUrl !== undefined) {
|
||||||
|
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
|
||||||
|
}
|
||||||
|
|
||||||
const cookies: CookieParam[] = [];
|
const cookies: CookieParam[] = [];
|
||||||
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
||||||
if (Array.isArray(setCookieHeaders)) {
|
if (Array.isArray(setCookieHeaders)) {
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit 38177e1e3814970613ce6d8fe3e3cf0030d92066
|
Subproject commit e7216f6ed7aaee80068ffabce78a37ce66b9c50e
|
Loading…
x
Reference in New Issue
Block a user