mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 15:25:58 +08:00
fix: give expireAt for image cache
This commit is contained in:
parent
4f284f51b6
commit
8a2b095bd7
@ -53,8 +53,6 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
turnDownPlugins = [require('turndown-plugin-gfm').gfm];
|
turnDownPlugins = [require('turndown-plugin-gfm').gfm];
|
||||||
|
|
||||||
imageShortUrlPrefix?: string;
|
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
protected puppeteerControl: PuppeteerControl,
|
protected puppeteerControl: PuppeteerControl,
|
||||||
@ -78,13 +76,13 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
let contentText = '';
|
let contentText = '';
|
||||||
if (toBeTurnedToMd) {
|
if (toBeTurnedToMd) {
|
||||||
const urlToAltMap: { [k: string]: { shortDigest: string, alt?: string; }; } = {};
|
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
||||||
const tasks = (snapshot.imgs || []).map(async (x) => {
|
const tasks = (snapshot.imgs || []).map(async (x) => {
|
||||||
const r = await this.altTextService.getAltTextAndShortDigest(x).catch((err)=> {
|
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
||||||
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
||||||
return undefined;
|
return undefined;
|
||||||
});
|
});
|
||||||
if (r) {
|
if (r && x.src) {
|
||||||
urlToAltMap[x.src.trim()] = r;
|
urlToAltMap[x.src.trim()] = r;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -103,7 +101,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
const mapped = urlToAltMap[src];
|
const mapped = urlToAltMap[src];
|
||||||
imgIdx++;
|
imgIdx++;
|
||||||
if (mapped) {
|
if (mapped) {
|
||||||
return ``;
|
return ``;
|
||||||
}
|
}
|
||||||
return ``;
|
return ``;
|
||||||
}
|
}
|
||||||
@ -115,7 +113,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
|
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
|
||||||
contentText = turnDownService.turndown(snapshot.html);
|
contentText = turnDownService.turndown(snapshot.html);
|
||||||
}
|
}
|
||||||
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
|
if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
|
||||||
contentText = snapshot.text;
|
contentText = snapshot.text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -44,32 +44,33 @@ export class AltTextService extends AsyncService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async getAltTextAndShortDigest(imgBrief: ImgBrief) {
|
async getAltText(imgBrief: ImgBrief) {
|
||||||
if (!imgBrief.src) {
|
if (!imgBrief.src) {
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
if (imgBrief.alt) {
|
||||||
|
return imgBrief.alt;
|
||||||
|
}
|
||||||
const digest = md5Hasher.hash(imgBrief.src);
|
const digest = md5Hasher.hash(imgBrief.src);
|
||||||
const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
|
const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
|
||||||
|
|
||||||
const existing = await ImgAlt.fromFirestore(shortDigest);
|
const existing = await ImgAlt.fromFirestore(shortDigest);
|
||||||
|
|
||||||
if (existing?.generatedAlt) {
|
if (existing) {
|
||||||
return {
|
return existing.generatedAlt || existing.originalAlt || '';
|
||||||
shortDigest,
|
|
||||||
alt: existing.generatedAlt,
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let generatedCaption;
|
let generatedCaption = '';
|
||||||
|
|
||||||
if (!imgBrief.alt) {
|
try {
|
||||||
try {
|
generatedCaption = await this.caption(imgBrief.src);
|
||||||
generatedCaption = await this.caption(imgBrief.src);
|
} catch (err) {
|
||||||
} catch (err) {
|
this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err });
|
||||||
this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err });
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Don't try again until the next day
|
||||||
|
const expireMixin = generatedCaption ? {} : { expireAt: new Date(Date.now() + 1000 * 3600 * 24) };
|
||||||
|
|
||||||
await ImgAlt.COLLECTION.doc(shortDigest).set(
|
await ImgAlt.COLLECTION.doc(shortDigest).set(
|
||||||
{
|
{
|
||||||
_id: shortDigest,
|
_id: shortDigest,
|
||||||
@ -79,13 +80,11 @@ export class AltTextService extends AsyncService {
|
|||||||
urlDigest: digest,
|
urlDigest: digest,
|
||||||
originalAlt: imgBrief.alt || '',
|
originalAlt: imgBrief.alt || '',
|
||||||
generatedAlt: generatedCaption || '',
|
generatedAlt: generatedCaption || '',
|
||||||
createdAt: new Date()
|
createdAt: new Date(),
|
||||||
|
...expireMixin
|
||||||
}, { merge: true }
|
}, { merge: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
return {
|
return generatedCaption;
|
||||||
shortDigest,
|
|
||||||
alt: generatedCaption,
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user