mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-14 17:05:57 +08:00
fix: potential circular crawling
This commit is contained in:
parent
be91371b93
commit
f0668a96b4
@ -581,6 +581,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
{ contentType: 'text/plain', envelope: null }
|
||||
);
|
||||
}
|
||||
|
||||
// Prevent circular crawling
|
||||
this.puppeteerControl.circuitBreakerHosts.add(
|
||||
ctx.req.hostname.toLowerCase()
|
||||
);
|
||||
|
||||
if (uid) {
|
||||
const user = await auth.assertUser();
|
||||
if (!(user.wallet.total_balance > 0)) {
|
||||
@ -638,15 +644,17 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
path: 'url'
|
||||
});
|
||||
}
|
||||
const blockade = (await DomainBlockade.fromFirestoreQuery(
|
||||
DomainBlockade.COLLECTION
|
||||
.where('domain', '==', urlToCrawl.hostname.toLowerCase())
|
||||
.where('expireAt', '>=', new Date())
|
||||
.limit(1)
|
||||
))[0];
|
||||
|
||||
if (blockade && !uid) {
|
||||
throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
||||
if (!uid) {
|
||||
const blockade = (await DomainBlockade.fromFirestoreQuery(
|
||||
DomainBlockade.COLLECTION
|
||||
.where('domain', '==', urlToCrawl.hostname.toLowerCase())
|
||||
.where('expireAt', '>=', new Date())
|
||||
.limit(1)
|
||||
))[0];
|
||||
if (blockade) {
|
||||
throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
||||
}
|
||||
}
|
||||
|
||||
const crawlOpts = this.configure(crawlerOptions);
|
||||
|
@ -100,6 +100,8 @@ export class PuppeteerControl extends AsyncService {
|
||||
livePages = new Set<Page>();
|
||||
lastPageCratedAt: number = 0;
|
||||
|
||||
circuitBreakerHosts: Set<string> = new Set();
|
||||
|
||||
constructor(
|
||||
protected globalLogger: Logger,
|
||||
) {
|
||||
@ -285,6 +287,12 @@ function giveSnapshot(stopActiveSnapshot) {
|
||||
|
||||
const parsedUrl = new URL(requestUrl);
|
||||
|
||||
if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
|
||||
page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` });
|
||||
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
}
|
||||
|
||||
if (
|
||||
parsedUrl.hostname === 'localhost' ||
|
||||
parsedUrl.hostname.startsWith('127.')
|
||||
|
Loading…
x
Reference in New Issue
Block a user