fix: firebase limit on document size causing cache failures

This commit is contained in:
yanlong.wang 2024-04-25 12:24:05 +08:00
parent 9dd5af0cb5
commit f1016649ac
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 61 additions and 27 deletions

View File

@ -13,7 +13,7 @@ import normalizeUrl from "@esm2cjs/normalize-url";
import { AltTextService } from '../services/alt-text'; import { AltTextService } from '../services/alt-text';
import TurndownService from 'turndown'; import TurndownService from 'turndown';
import { parseString as parseSetCookieString } from 'set-cookie-parser'; import { parseString as parseSetCookieString } from 'set-cookie-parser';
import { CookieParam } from 'puppeteer'; import type { CookieParam } from 'puppeteer';
import { Crawled } from '../db/crawled'; import { Crawled } from '../db/crawled';
import { tidyMarkdown } from '../utils/markdown'; import { tidyMarkdown } from '../utils/markdown';
import { cleanAttribute } from '../utils/misc'; import { cleanAttribute } from '../utils/misc';
@ -408,30 +408,46 @@ ${this.content}
const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0]; const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
if (cache) { if (!cache) {
return undefined;
}
const age = Date.now() - cache.createdAt.valueOf(); const age = Date.now() - cache.createdAt.valueOf();
const stale = cache.createdAt.valueOf() > (Date.now() - this.cacheValidMs); const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
this.logger.info(`${stale ? 'Only stale ' : ''}Cache exists for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old`, { this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old`, {
url: urlToCrawl, digest, age, stale url: urlToCrawl, digest, age, stale
}); });
const r = cache.snapshot; let snapshot: PageSnapshot | undefined;
let screenshotUrl: string | undefined;
const preparations = [
this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
snapshot = JSON.parse(r.toString('utf-8'));
}),
cache.screenshotAvailable ?
this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
screenshotUrl = r;
}) :
Promise.resolve(undefined)
];
try {
await Promise.all(preparations);
} catch (_err) {
// Swallow cache errors.
return undefined;
}
return { return {
isFresh: !stale, isFresh: !stale,
...cache, ...cache,
snapshot: { snapshot: {
...r, ...snapshot,
screenshot: undefined, screenshot: undefined,
screenshotUrl: cache.screenshotAvailable ? screenshotUrl,
await this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs) : undefined,
} as PageSnapshot & { screenshotUrl?: string; } } as PageSnapshot & { screenshotUrl?: string; }
}; };
} }
return undefined;
}
async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) { async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) {
const digest = this.getUrlDigest(urlToCrawl); const digest = this.getUrlDigest(urlToCrawl);
@ -444,10 +460,24 @@ ${this.content}
createdAt: nowDate, createdAt: nowDate,
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs), expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
urlPathDigest: digest, urlPathDigest: digest,
snapshot: { });
const savingOfSnapshot = this.firebaseObjectStorage.saveFile(`snapshots/${cache._id}`,
Buffer.from(
JSON.stringify({
...snapshot, ...snapshot,
screenshot: null screenshot: undefined
}, }),
'utf-8'
),
{
metadata: {
contentType: 'application/json',
}
}
).then((r) => {
cache.snapshotAvailable = true;
return r;
}); });
if (snapshot.screenshot) { if (snapshot.screenshot) {
@ -458,6 +488,7 @@ ${this.content}
}); });
cache.screenshotAvailable = true; cache.screenshotAvailable = true;
} }
await savingOfSnapshot;
const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => { const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) }); this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });

View File

@ -22,11 +22,14 @@ export class Crawled extends FirestoreRecord {
urlPathDigest!: string; urlPathDigest!: string;
@Prop() @Prop()
snapshot!: PageSnapshot & { screenshot: never; }; snapshot?: PageSnapshot & { screenshot: never; };
@Prop() @Prop()
screenshotAvailable?: boolean; screenshotAvailable?: boolean;
@Prop()
snapshotAvailable?: boolean;
@Prop() @Prop()
createdAt!: Date; createdAt!: Date;

@ -1 +1 @@
Subproject commit 577131db50d5c86ffb3d085a593eaed8950eabcd Subproject commit 64157bc57ef9ce2cec69f37b5f55fccb71742b6f