fix(adaptive-crawler): fix cache problem

This commit is contained in:
Zhaofeng Miao 2024-10-10 16:37:12 +08:00
parent db432645c3
commit f82504540b

View File

@ -1,6 +1,8 @@
import {
AssertionFailureError,
assignTransferProtocolMeta,
HashManager,
ParamValidationError,
RPCHost, RPCReflection,
} from 'civkit';
import { singleton } from 'tsyringe';
@ -33,6 +35,8 @@ const removeURLHash = (url: string) => {
@singleton()
export class AdaptiveCrawlerHost extends RPCHost {
logger = this.globalLogger.child({ service: this.constructor.name });
// Actual cache storage (gcp buckets) exists for 7 days, so here we need to select a time < 7 days.
cacheExpiry = 3 * 1000 * 60 * 60 * 24;
static readonly __singleCrawlQueueName = 'singleCrawlQueue';
@ -105,8 +109,13 @@ export class AdaptiveCrawlerHost extends RPCHost {
const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
const existing = await AdaptiveCrawlTask.fromFirestore(shortDigest);
if (existing) {
if (existing?.createdAt) {
if (existing.createdAt.getTime() > Date.now() - this.cacheExpiry) {
this.logger.info(`Cache hit for ${shortDigest}, created at ${existing.createdAt.toDateString()}`);
return { taskId: shortDigest };
} else {
this.logger.info(`Cache expired for ${shortDigest}, created at ${existing.createdAt.toDateString()}`);
}
}
await AdaptiveCrawlTask.COLLECTION.doc(shortDigest).set({
@ -182,11 +191,19 @@ export class AdaptiveCrawlerHost extends RPCHost {
@Param('urls') urls: string[] = [],
) {
if (!taskId) {
throw new Error('taskId is required');
throw new ParamValidationError('taskId is required');
}
const state = await AdaptiveCrawlTask.fromFirestore(taskId);
if (!state) {
throw new AssertionFailureError('The task does not exist');
}
if (state?.createdAt && state.createdAt.getTime() < Date.now() - this.cacheExpiry) {
throw new AssertionFailureError('The task has expired');
}
if (urls.length) {
const promises = Object.entries(state?.processed ?? {}).map(async ([url, cachePath]) => {
if (urls.includes(url)) {