chore: don't abuse our service

This commit is contained in:
Yanlong Wang 2024-06-10 17:23:50 +08:00
parent 1e3bae6aad
commit 5789ae1407
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37

View File

@ -571,10 +571,15 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
if (blockade) { if (blockade) {
throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`); throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
} }
} if (urlToCrawl.protocol === 'http:' && (!urlToCrawl.pathname || urlToCrawl.pathname === '/') &&
crawlerOptions.respondWith === 'html') {
throw new SecurityCompromiseError(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
}
}
const crawlOpts = this.configure(crawlerOptions); const crawlOpts = this.configure(crawlerOptions);
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
const sseStream = new OutputServerEventStream(); const sseStream = new OutputServerEventStream();
rpcReflect.return(sseStream); rpcReflect.return(sseStream);
@ -767,7 +772,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return r; return r;
} }
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) { async * cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
let cache; let cache;
if (cacheTolerance && !crawlOpts?.cookies?.length) { if (cacheTolerance && !crawlOpts?.cookies?.length) {
cache = await this.queryCache(urlToCrawl, cacheTolerance); cache = await this.queryCache(urlToCrawl, cacheTolerance);
@ -821,7 +826,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
} }
async *scrapMany(urls: URL[], options?: ScrappingOptions, cacheTolerance?: number) { async * scrapMany(urls: URL[], options?: ScrappingOptions, cacheTolerance?: number) {
const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance)); const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined); const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);