mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-17 12:35:55 +08:00
fix: potential circular crawling
This commit is contained in:
parent
be91371b93
commit
f0668a96b4
@ -581,6 +581,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
{ contentType: 'text/plain', envelope: null }
|
{ contentType: 'text/plain', envelope: null }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prevent circular crawling
|
||||||
|
this.puppeteerControl.circuitBreakerHosts.add(
|
||||||
|
ctx.req.hostname.toLowerCase()
|
||||||
|
);
|
||||||
|
|
||||||
if (uid) {
|
if (uid) {
|
||||||
const user = await auth.assertUser();
|
const user = await auth.assertUser();
|
||||||
if (!(user.wallet.total_balance > 0)) {
|
if (!(user.wallet.total_balance > 0)) {
|
||||||
@ -638,15 +644,17 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
path: 'url'
|
path: 'url'
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
const blockade = (await DomainBlockade.fromFirestoreQuery(
|
|
||||||
DomainBlockade.COLLECTION
|
|
||||||
.where('domain', '==', urlToCrawl.hostname.toLowerCase())
|
|
||||||
.where('expireAt', '>=', new Date())
|
|
||||||
.limit(1)
|
|
||||||
))[0];
|
|
||||||
|
|
||||||
if (blockade && !uid) {
|
if (!uid) {
|
||||||
throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
const blockade = (await DomainBlockade.fromFirestoreQuery(
|
||||||
|
DomainBlockade.COLLECTION
|
||||||
|
.where('domain', '==', urlToCrawl.hostname.toLowerCase())
|
||||||
|
.where('expireAt', '>=', new Date())
|
||||||
|
.limit(1)
|
||||||
|
))[0];
|
||||||
|
if (blockade) {
|
||||||
|
throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const crawlOpts = this.configure(crawlerOptions);
|
const crawlOpts = this.configure(crawlerOptions);
|
||||||
|
@ -100,6 +100,8 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
livePages = new Set<Page>();
|
livePages = new Set<Page>();
|
||||||
lastPageCratedAt: number = 0;
|
lastPageCratedAt: number = 0;
|
||||||
|
|
||||||
|
circuitBreakerHosts: Set<string> = new Set();
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
) {
|
) {
|
||||||
@ -285,6 +287,12 @@ function giveSnapshot(stopActiveSnapshot) {
|
|||||||
|
|
||||||
const parsedUrl = new URL(requestUrl);
|
const parsedUrl = new URL(requestUrl);
|
||||||
|
|
||||||
|
if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
|
||||||
|
page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` });
|
||||||
|
|
||||||
|
return req.abort('blockedbyclient', 1000);
|
||||||
|
}
|
||||||
|
|
||||||
if (
|
if (
|
||||||
parsedUrl.hostname === 'localhost' ||
|
parsedUrl.hostname === 'localhost' ||
|
||||||
parsedUrl.hostname.startsWith('127.')
|
parsedUrl.hostname.startsWith('127.')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user