feat: image retention config

This commit is contained in:
Yanlong Wang 2024-11-14 22:36:53 +08:00
parent ccb4b8a49d
commit 59dcc2db94
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 45 additions and 3 deletions

View File

@ -687,6 +687,7 @@ export class CrawlerHost extends RPCHost {
if (opts.timeout) {
// The stored value is opts.timeout multiplied by 1000
// (presumably seconds -> milliseconds; confirm against the readers of 'timeout').
this.threadLocal.set('timeout', opts.timeout * 1000);
}
// Store the requested image retention mode under the 'retainImages' thread-local key.
// SnapshotFormatter reads a key of the same name when it converts <img> elements
// (assumes both classes share the same thread-local store; verify).
this.threadLocal.set('retainImages', opts.retainImages);
const crawlOpts: ExtraScrappingOptions = {
proxyUrl: opts.proxyUrl,

View File

@ -13,6 +13,9 @@ export enum CONTENT_FORMAT {
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
/**
 * Accepted image retention modes, in declaration order:
 * - `none`:  no images
 * - `all`:   all images
 * - `alt`:   only alt text
 * - `all_p`: all images, with generated alt text
 * - `alt_p`: only alt text, with generated alt text
 *
 * The `_p` suffix marks the modes that use generated alt text.
 */
export const IMAGE_RETENTION_MODES = [
    'none',
    'all',
    'alt',
    'all_p',
    'alt_p',
] as const;

/** Lookup set over {@link IMAGE_RETENTION_MODES}, used to validate arbitrary string input. */
const IMAGE_RETENTION_MODE_VALUES: Set<string> = new Set<string>(IMAGE_RETENTION_MODES);
@Also({
openapi: {
operation: {
@ -113,6 +116,17 @@ const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
in: 'header',
schema: { type: 'string' }
},
// OpenAPI description of the `X-Retain-Images` request header.
// The mode names listed in the description are the ones enumerated by IMAGE_RETENTION_MODES.
// NOTE(review): the schema is a plain string and does not enumerate the allowed values,
// and the description does not state the default — consider documenting both.
'X-Retain-Images': {
description: `Image retention modes.\n\n` +
`Supported modes: \n` +
`- all: all images\n` +
`- none: no images\n` +
`- alt: only alt text\n` +
`- all_p: all images and with generated alt text\n` +
`- alt_p: only alt text and with generated alt\n\n`,
in: 'header',
schema: { type: 'string' }
},
'X-With-Iframe': {
description: `Enable filling iframe contents into main. (violates standards)`,
in: 'header',
@ -171,6 +185,9 @@ export class CrawlerOptions extends AutoCastable {
})
withGeneratedAlt!: boolean;
/**
 * Image retention mode; one of the values in IMAGE_RETENTION_MODES.
 * The declared default is 'all'. The property is validated against
 * IMAGE_RETENTION_MODE_VALUES, the Set built from that list.
 */
@Prop({ default: 'all', type: IMAGE_RETENTION_MODE_VALUES })
retainImages?: typeof IMAGE_RETENTION_MODES[number];
@Prop({
default: false,
})
@ -282,6 +299,13 @@ export class CrawlerOptions extends AutoCastable {
if (withImagesSummary !== undefined) {
instance.withImagesSummary = Boolean(withImagesSummary);
}
// Image retention mode requested through the `X-Retain-Images` header.
// Only values present in IMAGE_RETENTION_MODE_VALUES are accepted; a missing, empty or
// unrecognised header leaves instance.retainImages untouched.
const retainImages = ctx?.req.get('x-retain-images');
if (retainImages && IMAGE_RETENTION_MODE_VALUES.has(retainImages)) {
// Membership was verified on the line above, so assert the precise union type
// instead of escaping the type checker with `any`.
instance.retainImages = retainImages as typeof IMAGE_RETENTION_MODES[number];
}
// withGeneratedAlt takes precedence over the header: when it is set the mode is
// forced to 'all_p' regardless of what was requested above.
// NOTE(review): this also overrides an explicit 'none' / 'alt' request — confirm intended.
if (instance.withGeneratedAlt) {
instance.retainImages = 'all_p';
}
const noCache = ctx?.req.get('x-no-cache');
if (noCache !== undefined) {
instance.noCache = Boolean(noCache);

View File

@ -13,6 +13,7 @@ import { PDFExtractor } from './pdf-extract';
import { cleanAttribute } from '../utils/misc';
import _ from 'lodash';
import { STATUS_CODES } from 'http';
import type { CrawlerOptions } from '../dto/scrapping-options';
export interface FormattedPage {
@ -201,7 +202,9 @@ export class SnapshotFormatter extends AsyncService {
turnDownService = turnDownService.use(plugin);
}
const urlToAltMap: { [k: string]: string | undefined; } = {};
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
const imageRetention = this.threadLocal.get('retainImages') as CrawlerOptions['retainImages'];
// _p is the special suffix for withGeneratedAlt
if (snapshot.imgs?.length && imageRetention?.endsWith('_p')) {
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
const r = await this.altTextService.getAltText(x).catch((err: any) => {
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
@ -215,9 +218,17 @@ export class SnapshotFormatter extends AsyncService {
await Promise.all(tasks);
}
let imgIdx = 0;
turnDownService.addRule('img-generated-alt', {
turnDownService.addRule('img-retention', {
filter: 'img',
replacement: (_content, node: any) => {
if (imageRetention === 'none') {
return '';
}
const alt = cleanAttribute(node.getAttribute('alt'));
if (imageRetention === 'alt') {
return alt ? `(Image ${++imgIdx}: ${alt})` : '';
}
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
const dataSrc = (node.getAttribute('data-src') || '').trim();
@ -232,7 +243,6 @@ export class SnapshotFormatter extends AsyncService {
} catch (_err) {
void 0;
}
const alt = cleanAttribute(node.getAttribute('alt'));
if (!src) {
return '';
}
@ -245,6 +255,10 @@ export class SnapshotFormatter extends AsyncService {
if (mapped) {
imageSummary[src] = mapped || alt;
if (imageRetention === 'alt_p') {
return `(Image ${imgIdx}: ${mapped || alt})`;
}
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
mappedUrl.protocol = 'blob:';
@ -253,6 +267,8 @@ export class SnapshotFormatter extends AsyncService {
}
return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
} else if (imageRetention === 'alt_p') {
return alt ? `(Image ${imgIdx}: ${alt})` : '';
}
imageSummary[src] = alt || '';
@ -439,6 +455,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
noRules?: boolean | string,
url?: string | URL;
imgDataUrlToObjectUrl?: boolean;
removeImages?: boolean | 'src';
}) {
const turnDownService = new TurndownService({
codeBlockStyle: 'fenced',