mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-06 06:36:31 +08:00
feat: image retention config
This commit is contained in:
parent
ccb4b8a49d
commit
59dcc2db94
@ -687,6 +687,7 @@ export class CrawlerHost extends RPCHost {
|
||||
if (opts.timeout) {
|
||||
this.threadLocal.set('timeout', opts.timeout * 1000);
|
||||
}
|
||||
this.threadLocal.set('retainImages', opts.retainImages);
|
||||
|
||||
const crawlOpts: ExtraScrappingOptions = {
|
||||
proxyUrl: opts.proxyUrl,
|
||||
|
@ -13,6 +13,9 @@ export enum CONTENT_FORMAT {
|
||||
|
||||
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
||||
|
||||
// Image retention modes accepted for the `retainImages` option / `X-Retain-Images` header.
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
|
||||
// Set form of IMAGE_RETENTION_MODES, used for membership checks on the raw `x-retain-images` header value.
const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
|
||||
|
||||
@Also({
|
||||
openapi: {
|
||||
operation: {
|
||||
@ -113,6 +116,17 @@ const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Retain-Images': {
|
||||
description: `Image retention modes.\n\n` +
|
||||
`Supported modes: \n` +
|
||||
`- all: all images\n` +
|
||||
`- none: no images\n` +
|
||||
`- alt: only alt text\n` +
|
||||
`- all_p: all images and with generated alt text\n` +
|
||||
`- alt_p: only alt text and with generated alt\n\n`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-With-Iframe': {
|
||||
description: `Enable filling iframe contents into main. (violates standards)`,
|
||||
in: 'header',
|
||||
@ -171,6 +185,9 @@ export class CrawlerOptions extends AutoCastable {
|
||||
})
|
||||
withGeneratedAlt!: boolean;
|
||||
|
||||
@Prop({ default: 'all', type: IMAGE_RETENTION_MODE_VALUES })
|
||||
retainImages?: typeof IMAGE_RETENTION_MODES[number];
|
||||
|
||||
@Prop({
|
||||
default: false,
|
||||
})
|
||||
@ -282,6 +299,13 @@ export class CrawlerOptions extends AutoCastable {
|
||||
if (withImagesSummary !== undefined) {
|
||||
instance.withImagesSummary = Boolean(withImagesSummary);
|
||||
}
|
||||
const retainImages = ctx?.req.get('x-retain-images');
|
||||
if (retainImages && IMAGE_RETENTION_MODE_VALUES.has(retainImages)) {
|
||||
instance.retainImages = retainImages as any;
|
||||
}
|
||||
if (instance.withGeneratedAlt) {
|
||||
instance.retainImages = 'all_p';
|
||||
}
|
||||
const noCache = ctx?.req.get('x-no-cache');
|
||||
if (noCache !== undefined) {
|
||||
instance.noCache = Boolean(noCache);
|
||||
|
@ -13,6 +13,7 @@ import { PDFExtractor } from './pdf-extract';
|
||||
import { cleanAttribute } from '../utils/misc';
|
||||
import _ from 'lodash';
|
||||
import { STATUS_CODES } from 'http';
|
||||
import type { CrawlerOptions } from '../dto/scrapping-options';
|
||||
|
||||
|
||||
export interface FormattedPage {
|
||||
@ -201,7 +202,9 @@ export class SnapshotFormatter extends AsyncService {
|
||||
turnDownService = turnDownService.use(plugin);
|
||||
}
|
||||
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
||||
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
|
||||
const imageRetention = this.threadLocal.get('retainImages') as CrawlerOptions['retainImages'];
|
||||
// Modes ending in `_p` additionally request generated alt text (the withGeneratedAlt behaviour)
|
||||
if (snapshot.imgs?.length && imageRetention?.endsWith('_p')) {
|
||||
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
|
||||
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
||||
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
||||
@ -215,9 +218,17 @@ export class SnapshotFormatter extends AsyncService {
|
||||
await Promise.all(tasks);
|
||||
}
|
||||
let imgIdx = 0;
|
||||
turnDownService.addRule('img-generated-alt', {
|
||||
turnDownService.addRule('img-retention', {
|
||||
filter: 'img',
|
||||
replacement: (_content, node: any) => {
|
||||
if (imageRetention === 'none') {
|
||||
return '';
|
||||
}
|
||||
const alt = cleanAttribute(node.getAttribute('alt'));
|
||||
|
||||
if (imageRetention === 'alt') {
|
||||
return alt ? `(Image ${++imgIdx}: ${alt})` : '';
|
||||
}
|
||||
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
|
||||
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
|
||||
const dataSrc = (node.getAttribute('data-src') || '').trim();
|
||||
@ -232,7 +243,6 @@ export class SnapshotFormatter extends AsyncService {
|
||||
} catch (_err) {
|
||||
void 0;
|
||||
}
|
||||
const alt = cleanAttribute(node.getAttribute('alt'));
|
||||
if (!src) {
|
||||
return '';
|
||||
}
|
||||
@ -245,6 +255,10 @@ export class SnapshotFormatter extends AsyncService {
|
||||
if (mapped) {
|
||||
imageSummary[src] = mapped || alt;
|
||||
|
||||
if (imageRetention === 'alt_p') {
|
||||
return `(Image ${imgIdx}: ${mapped || alt})`;
|
||||
}
|
||||
|
||||
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
||||
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
||||
mappedUrl.protocol = 'blob:';
|
||||
@ -253,6 +267,8 @@ export class SnapshotFormatter extends AsyncService {
|
||||
}
|
||||
|
||||
return ``;
|
||||
} else if (imageRetention === 'alt_p') {
|
||||
return alt ? `(Image ${imgIdx}: ${alt})` : '';
|
||||
}
|
||||
|
||||
imageSummary[src] = alt || '';
|
||||
@ -439,6 +455,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
noRules?: boolean | string,
|
||||
url?: string | URL;
|
||||
imgDataUrlToObjectUrl?: boolean;
|
||||
removeImages?: boolean | 'src';
|
||||
}) {
|
||||
const turnDownService = new TurndownService({
|
||||
codeBlockStyle: 'fenced',
|
||||
|
Loading…
x
Reference in New Issue
Block a user