This commit is contained in:
Yanlong Wang 2025-04-23 15:19:11 +08:00
parent f1fe45fbfe
commit 161d861925
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
6 changed files with 22 additions and 33 deletions

View File

@ -18,8 +18,8 @@ import { FancyFile } from 'civkit/fancy-file';
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE, RESPOND_TIMING } from '../dto/crawler-options'; import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE, RESPOND_TIMING } from '../dto/crawler-options';
import { Crawled } from '../db/crawled'; import { Crawled, PageCacheCollection } from '../db/crawled';
import { DomainBlockade } from '../db/domain-blockade'; import { DomainBlockade, DomainBlockadeCollection } from '../db/domain-blockade';
import { OutputServerEventStream } from '../lib/transform-server-event-stream'; import { OutputServerEventStream } from '../lib/transform-server-event-stream';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer'; import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@ -102,6 +102,8 @@ export class CrawlerHost extends RPCHost {
protected miscService: MiscService, protected miscService: MiscService,
protected pdfContentCollection: PDFContentCollection, protected pdfContentCollection: PDFContentCollection,
protected pdfExtractor: PDFExtractor, protected pdfExtractor: PDFExtractor,
protected domainBlockadeCollection: DomainBlockadeCollection,
protected pageCacheCollection: PageCacheCollection,
) { ) {
super(...arguments); super(...arguments);
@ -145,7 +147,7 @@ export class CrawlerHost extends RPCHost {
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => { puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn }); this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn });
await DomainBlockade.save(DomainBlockade.from({ await this.domainBlockadeCollection.save(DomainBlockade.from({
domain: abuseEvent.url.hostname.toLowerCase(), domain: abuseEvent.url.hostname.toLowerCase(),
triggerReason: `${abuseEvent.reason}`, triggerReason: `${abuseEvent.reason}`,
triggerUrl: abuseEvent.url.toString(), triggerUrl: abuseEvent.url.toString(),
@ -310,12 +312,14 @@ export class CrawlerHost extends RPCHost {
if (!uid) { if (!uid) {
// Enforce no proxy is allocated for anonymous users due to abuse. // Enforce no proxy is allocated for anonymous users due to abuse.
crawlerOptions.proxy = 'none'; crawlerOptions.proxy = 'none';
const blockade = (await DomainBlockade.fromFirestoreQuery( const blockade = await this.domainBlockadeCollection.findOne({
DomainBlockade.COLLECTION domain: targetUrl.hostname.toLowerCase(),
.where('domain', '==', targetUrl.hostname.toLowerCase()) expireAt: { $gte: new Date() }
.where('expireAt', '>=', new Date()) }).catch((err) => {
.limit(1) this.logger.warn(`Failed to query domain blockade for ${targetUrl.hostname}`, { err: marshalErrorLike(err) });
))[0]; return undefined;
});
if (blockade) { if (blockade) {
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`); throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
} }
@ -522,19 +526,10 @@ export class CrawlerHost extends RPCHost {
async *queryCache(urlToCrawl: URL, cacheTolerance: number) { async *queryCache(urlToCrawl: URL, cacheTolerance: number) {
const digest = this.getUrlDigest(urlToCrawl); const digest = this.getUrlDigest(urlToCrawl);
const cache = ( const cache = await this.pageCacheCollection.findOne({ urlPathDigest: digest }, { sort: { createdAt: -1 } }).catch((err) => {
await
(Crawled.fromFirestoreQuery(
Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)
).catch((err) => {
this.logger.warn(`Failed to query cache, unknown issue`, { err }); this.logger.warn(`Failed to query cache, unknown issue`, { err });
// https://github.com/grpc/grpc-node/issues/2647
// https://github.com/googleapis/nodejs-firestore/issues/1023
// https://github.com/googleapis/nodejs-firestore/issues/1023
return undefined; return undefined;
})) });
)?.[0];
yield cache; yield cache;
@ -593,7 +588,6 @@ export class CrawlerHost extends RPCHost {
const nowDate = new Date(); const nowDate = new Date();
const cache = Crawled.from({ const cache = Crawled.from({
_id: randomUUID(),
url: urlToCrawl.toString(), url: urlToCrawl.toString(),
createdAt: nowDate, createdAt: nowDate,
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs), expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
@ -637,7 +631,7 @@ export class CrawlerHost extends RPCHost {
cache.pageshotAvailable = true; cache.pageshotAvailable = true;
} }
await savingOfSnapshot; await savingOfSnapshot;
const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => { const r = await this.pageCacheCollection.save(cache).catch((err) => {
this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) }); this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
return undefined; return undefined;

View File

@ -25,7 +25,7 @@ import { toAsyncGenerator } from '../utils/misc';
import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { LRUCache } from 'lru-cache'; import { LRUCache } from 'lru-cache';
import { API_CALL_STATUS } from '../shared/db/api-roll'; import { API_CALL_STATUS } from '../shared/db/api-roll';
import { SERPResult } from '../db/searched'; import { SearchResult } from '../db/searched';
import { SerperSearchQueryParams, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search'; import { SerperSearchQueryParams, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
import { InternalJinaSerpService } from '../services/serp/internal'; import { InternalJinaSerpService } from '../services/serp/internal';
import { WebSearchEntry } from '../services/serp/compat'; import { WebSearchEntry } from '../services/serp/compat';

View File

@ -10,7 +10,7 @@ export class DomainBlockade extends AutoCastable {
@Prop({ @Prop({
defaultFactory: () => new ObjectId() defaultFactory: () => new ObjectId()
}) })
_id!: string; _id!: ObjectId;
@Prop({ @Prop({
required: true required: true

View File

@ -1,12 +1,10 @@
import { singleton } from 'tsyringe'; import { singleton } from 'tsyringe';
import _ from 'lodash'; import _ from 'lodash';
import { TextItem } from 'pdfjs-dist/types/src/display/api'; import { TextItem } from 'pdfjs-dist/types/src/display/api';
import { AssertionFailureError, AsyncService, HashManager } from 'civkit'; import { AsyncService } from 'civkit/async-service';
import { GlobalLogger } from './logger'; import { GlobalLogger } from './logger';
import { PDFContent } from '../db/pdf';
import dayjs from 'dayjs'; import dayjs from 'dayjs';
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
import { randomUUID } from 'crypto';
import type { PDFDocumentLoadingTask } from 'pdfjs-dist'; import type { PDFDocumentLoadingTask } from 'pdfjs-dist';
import path from 'path'; import path from 'path';
import { AsyncLocalContext } from './async-context'; import { AsyncLocalContext } from './async-context';
@ -18,8 +16,6 @@ dayjs.extend(timezone);
const pPdfjs = import('pdfjs-dist/legacy/build/pdf.mjs'); const pPdfjs = import('pdfjs-dist/legacy/build/pdf.mjs');
const nodeCmapUrl = path.resolve(require.resolve('pdfjs-dist'), '../../cmaps') + '/'; const nodeCmapUrl = path.resolve(require.resolve('pdfjs-dist'), '../../cmaps') + '/';
const md5Hasher = new HashManager('md5', 'hex');
function stdDev(numbers: number[]) { function stdDev(numbers: number[]) {
const mean = _.mean(numbers); const mean = _.mean(numbers);
const squareDiffs = numbers.map((num) => Math.pow(num - mean, 2)); const squareDiffs = numbers.map((num) => Math.pow(num - mean, 2));

View File

@ -9,7 +9,6 @@ import { AsyncContext } from '../shared/services/async-context';
import { Threaded } from '../services/threaded'; import { Threaded } from '../services/threaded';
import { JSDomControl } from './jsdom'; import { JSDomControl } from './jsdom';
import { AltTextService } from './alt-text'; import { AltTextService } from './alt-text';
import { PDFExtractor } from './pdf-extract';
import { cleanAttribute } from '../utils/misc'; import { cleanAttribute } from '../utils/misc';
import _ from 'lodash'; import _ from 'lodash';
import { STATUS_CODES } from 'http'; import { STATUS_CODES } from 'http';

@ -1 +1 @@
Subproject commit 1a7dca40c52569d455237497c7285bd25eb2e3d2 Subproject commit a677cbd23ffba78ac34d92d732be1945e016b6c7