fix: cache use and edge cases

This commit is contained in:
yanlong.wang 2025-03-17 15:10:07 +08:00
parent b4b99f0096
commit 3a40db2590
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37

View File

@ -107,6 +107,9 @@ export class CrawlerHost extends RPCHost {
// Potentially mangled content, don't cache if scripts are injected // Potentially mangled content, don't cache if scripts are injected
return; return;
} }
if (snapshot.isIntermediate) {
return;
}
if (options.locale) { if (options.locale) {
Reflect.set(snapshot, 'locale', options.locale); Reflect.set(snapshot, 'locale', options.locale);
} }
@ -360,27 +363,36 @@ export class CrawlerHost extends RPCHost {
let lastScrapped; let lastScrapped;
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) { if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) { try {
lastScrapped = scrapped; for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
if (rpcReflect.signal.aborted) { lastScrapped = scrapped;
break; if (rpcReflect.signal.aborted) {
} break;
if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) { }
continue; if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
} continue;
}
if (!scrapped.title) {
continue;
}
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts); const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
chargeAmount = this.assignChargeAmount(formatted, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
}
if (scrapped?.pdfs?.length && !chargeAmount) {
continue;
}
return formatted;
} }
} catch (err) {
if (scrapped?.pdfs?.length && !chargeAmount) { if (!lastScrapped) {
continue; throw err;
} }
return formatted;
} }
if (!lastScrapped) { if (!lastScrapped) {
@ -406,33 +418,42 @@ export class CrawlerHost extends RPCHost {
}); });
} }
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) { try {
lastScrapped = scrapped; for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
if (rpcReflect.signal.aborted) { lastScrapped = scrapped;
break; if (rpcReflect.signal.aborted) {
} break;
if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) { }
continue; if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
} continue;
}
if (!scrapped.title) {
continue;
}
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts); const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
chargeAmount = this.assignChargeAmount(formatted, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
} }
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } } { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
); );
} }
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) { if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } } { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
); );
} }
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null }); return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
}
} catch (err) {
if (!lastScrapped) {
throw err;
}
} }
if (!lastScrapped) { if (!lastScrapped) {
@ -733,7 +754,7 @@ export class CrawlerHost extends RPCHost {
let cache = (await cacheIt.next()).value; let cache = (await cacheIt.next()).value;
if (cache?.htmlSignificantlyModifiedByJs === false) { if (cache?.htmlSignificantlyModifiedByJs === false) {
if (crawlerOpts) { if (crawlerOpts && crawlerOpts.timeout === undefined) {
crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML; crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
} }
} }