diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index b968d7a..75561fd 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -53,8 +53,6 @@ export class CrawlerHost extends RPCHost { turnDownPlugins = [require('turndown-plugin-gfm').gfm]; - imageShortUrlPrefix?: string; - constructor( protected globalLogger: Logger, protected puppeteerControl: PuppeteerControl, @@ -78,13 +76,13 @@ export class CrawlerHost extends RPCHost { let contentText = ''; if (toBeTurnedToMd) { - const urlToAltMap: { [k: string]: { shortDigest: string, alt?: string; }; } = {}; + const urlToAltMap: { [k: string]: string | undefined; } = {}; const tasks = (snapshot.imgs || []).map(async (x) => { - const r = await this.altTextService.getAltTextAndShortDigest(x).catch((err)=> { + const r = await this.altTextService.getAltText(x).catch((err: any) => { this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) }); return undefined; }); - if (r) { + if (r && x.src) { urlToAltMap[x.src.trim()] = r; } }); @@ -103,7 +101,7 @@ export class CrawlerHost extends RPCHost { const mapped = urlToAltMap[src]; imgIdx++; if (mapped) { - return `![Image ${imgIdx}: ${mapped.alt || alt}](${this.imageShortUrlPrefix ? `${this.imageShortUrlPrefix}/${mapped.shortDigest}` : src})`; + return `![Image ${imgIdx}: ${mapped || alt}](${src})`; } return `![Image ${imgIdx}: ${alt}](${src})`; } @@ -115,7 +113,7 @@ export class CrawlerHost extends RPCHost { if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) { contentText = turnDownService.turndown(snapshot.html); } - if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) { + if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) { contentText = snapshot.text; } diff --git a/backend/functions/src/services/alt-text.ts b/backend/functions/src/services/alt-text.ts index c3dd5d9..6ebaf34 100644 --- a/backend/functions/src/services/alt-text.ts +++ b/backend/functions/src/services/alt-text.ts @@ -44,32 +44,33 @@ export class AltTextService extends AsyncService { } } - async getAltTextAndShortDigest(imgBrief: ImgBrief) { + async getAltText(imgBrief: ImgBrief) { if (!imgBrief.src) { return undefined; } + if (imgBrief.alt) { + return imgBrief.alt; + } const digest = md5Hasher.hash(imgBrief.src); const shortDigest = Buffer.from(digest, 'hex').toString('base64url'); const existing = await ImgAlt.fromFirestore(shortDigest); - if (existing?.generatedAlt) { - return { - shortDigest, - alt: existing.generatedAlt, - }; + if (existing) { + return existing.generatedAlt || existing.originalAlt || ''; } - let generatedCaption; + let generatedCaption = ''; - if (!imgBrief.alt) { - try { - generatedCaption = await this.caption(imgBrief.src); - } catch (err) { - this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err }); - } + try { + generatedCaption = await this.caption(imgBrief.src); + } catch (err) { + this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err }); } + // Don't try again until the next day + const expireMixin = generatedCaption ? {} : { expireAt: new Date(Date.now() + 1000 * 3600 * 24) }; + await ImgAlt.COLLECTION.doc(shortDigest).set( { _id: shortDigest, @@ -79,13 +80,11 @@ export class AltTextService extends AsyncService { urlDigest: digest, originalAlt: imgBrief.alt || '', generatedAlt: generatedCaption || '', - createdAt: new Date() + createdAt: new Date(), + ...expireMixin }, { merge: true } ); - return { - shortDigest, - alt: generatedCaption, - }; + return generatedCaption; } }