This commit is contained in:
Yanlong Wang 2025-04-23 15:19:11 +08:00
parent f1fe45fbfe
commit 161d861925
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
6 changed files with 22 additions and 33 deletions

View File

@ -18,8 +18,8 @@ import { FancyFile } from 'civkit/fancy-file';
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE, RESPOND_TIMING } from '../dto/crawler-options';
import { Crawled } from '../db/crawled';
import { DomainBlockade } from '../db/domain-blockade';
import { Crawled, PageCacheCollection } from '../db/crawled';
import { DomainBlockade, DomainBlockadeCollection } from '../db/domain-blockade';
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@ -102,6 +102,8 @@ export class CrawlerHost extends RPCHost {
protected miscService: MiscService,
protected pdfContentCollection: PDFContentCollection,
protected pdfExtractor: PDFExtractor,
protected domainBlockadeCollection: DomainBlockadeCollection,
protected pageCacheCollection: PageCacheCollection,
) {
super(...arguments);
@ -145,7 +147,7 @@ export class CrawlerHost extends RPCHost {
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn });
await DomainBlockade.save(DomainBlockade.from({
await this.domainBlockadeCollection.save(DomainBlockade.from({
domain: abuseEvent.url.hostname.toLowerCase(),
triggerReason: `${abuseEvent.reason}`,
triggerUrl: abuseEvent.url.toString(),
@ -310,12 +312,14 @@ export class CrawlerHost extends RPCHost {
if (!uid) {
// Enforce no proxy is allocated for anonymous users due to abuse.
crawlerOptions.proxy = 'none';
const blockade = (await DomainBlockade.fromFirestoreQuery(
DomainBlockade.COLLECTION
.where('domain', '==', targetUrl.hostname.toLowerCase())
.where('expireAt', '>=', new Date())
.limit(1)
))[0];
const blockade = await this.domainBlockadeCollection.findOne({
domain: targetUrl.hostname.toLowerCase(),
expireAt: { $gte: new Date() }
}).catch((err) => {
this.logger.warn(`Failed to query domain blockade for ${targetUrl.hostname}`, { err: marshalErrorLike(err) });
return undefined;
});
if (blockade) {
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
}
@ -522,19 +526,10 @@ export class CrawlerHost extends RPCHost {
async *queryCache(urlToCrawl: URL, cacheTolerance: number) {
const digest = this.getUrlDigest(urlToCrawl);
const cache = (
await
(Crawled.fromFirestoreQuery(
Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)
).catch((err) => {
this.logger.warn(`Failed to query cache, unknown issue`, { err });
// https://github.com/grpc/grpc-node/issues/2647
// https://github.com/googleapis/nodejs-firestore/issues/1023
// https://github.com/googleapis/nodejs-firestore/issues/1023
return undefined;
}))
)?.[0];
const cache = await this.pageCacheCollection.findOne({ urlPathDigest: digest }, { sort: { createdAt: -1 } }).catch((err) => {
this.logger.warn(`Failed to query cache, unknown issue`, { err });
return undefined;
});
yield cache;
@ -593,7 +588,6 @@ export class CrawlerHost extends RPCHost {
const nowDate = new Date();
const cache = Crawled.from({
_id: randomUUID(),
url: urlToCrawl.toString(),
createdAt: nowDate,
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
@ -637,7 +631,7 @@ export class CrawlerHost extends RPCHost {
cache.pageshotAvailable = true;
}
await savingOfSnapshot;
const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
const r = await this.pageCacheCollection.save(cache).catch((err) => {
this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
return undefined;

View File

@ -25,7 +25,7 @@ import { toAsyncGenerator } from '../utils/misc';
import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { LRUCache } from 'lru-cache';
import { API_CALL_STATUS } from '../shared/db/api-roll';
import { SERPResult } from '../db/searched';
import { SearchResult } from '../db/searched';
import { SerperSearchQueryParams, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
import { InternalJinaSerpService } from '../services/serp/internal';
import { WebSearchEntry } from '../services/serp/compat';

View File

@ -10,7 +10,7 @@ export class DomainBlockade extends AutoCastable {
@Prop({
defaultFactory: () => new ObjectId()
})
_id!: string;
_id!: ObjectId;
@Prop({
required: true

View File

@ -1,12 +1,10 @@
import { singleton } from 'tsyringe';
import _ from 'lodash';
import { TextItem } from 'pdfjs-dist/types/src/display/api';
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
import { AsyncService } from 'civkit/async-service';
import { GlobalLogger } from './logger';
import { PDFContent } from '../db/pdf';
import dayjs from 'dayjs';
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
import { randomUUID } from 'crypto';
import type { PDFDocumentLoadingTask } from 'pdfjs-dist';
import path from 'path';
import { AsyncLocalContext } from './async-context';
@ -18,8 +16,6 @@ dayjs.extend(timezone);
const pPdfjs = import('pdfjs-dist/legacy/build/pdf.mjs');
const nodeCmapUrl = path.resolve(require.resolve('pdfjs-dist'), '../../cmaps') + '/';
const md5Hasher = new HashManager('md5', 'hex');
function stdDev(numbers: number[]) {
const mean = _.mean(numbers);
const squareDiffs = numbers.map((num) => Math.pow(num - mean, 2));

View File

@ -9,7 +9,6 @@ import { AsyncContext } from '../shared/services/async-context';
import { Threaded } from '../services/threaded';
import { JSDomControl } from './jsdom';
import { AltTextService } from './alt-text';
import { PDFExtractor } from './pdf-extract';
import { cleanAttribute } from '../utils/misc';
import _ from 'lodash';
import { STATUS_CODES } from 'http';

@ -1 +1 @@
Subproject commit 1a7dca40c52569d455237497c7285bd25eb2e3d2
Subproject commit a677cbd23ffba78ac34d92d732be1945e016b6c7