From 161d861925dd492a40129d2fc928cce9dfbc59ea Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Wed, 23 Apr 2025 15:19:11 +0800 Subject: [PATCH] wip --- src/api/crawler.ts | 42 +++++++++++++----------------- src/api/searcher.ts | 2 +- src/db/domain-blockade.ts | 2 +- src/services/pdf-extract.ts | 6 +---- src/services/snapshot-formatter.ts | 1 - thinapps-shared | 2 +- 6 files changed, 22 insertions(+), 33 deletions(-) diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 608ef2a..aa13127 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -18,8 +18,8 @@ import { FancyFile } from 'civkit/fancy-file'; import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE, RESPOND_TIMING } from '../dto/crawler-options'; -import { Crawled } from '../db/crawled'; -import { DomainBlockade } from '../db/domain-blockade'; +import { Crawled, PageCacheCollection } from '../db/crawled'; +import { DomainBlockade, DomainBlockadeCollection } from '../db/domain-blockade'; import { OutputServerEventStream } from '../lib/transform-server-event-stream'; import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer'; @@ -102,6 +102,8 @@ export class CrawlerHost extends RPCHost { protected miscService: MiscService, protected pdfContentCollection: PDFContentCollection, protected pdfExtractor: PDFExtractor, + protected domainBlockadeCollection: DomainBlockadeCollection, + protected pageCacheCollection: PageCacheCollection, ) { super(...arguments); @@ -145,7 +147,7 @@ export class CrawlerHost extends RPCHost { puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => { this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn }); - await DomainBlockade.save(DomainBlockade.from({ + await this.domainBlockadeCollection.save(DomainBlockade.from({ domain: abuseEvent.url.hostname.toLowerCase(), triggerReason: `${abuseEvent.reason}`, triggerUrl: abuseEvent.url.toString(), @@ -310,12 +312,14 @@ export class CrawlerHost extends RPCHost { if (!uid) { // Enforce no proxy is allocated for anonymous users due to abuse. crawlerOptions.proxy = 'none'; - const blockade = (await DomainBlockade.fromFirestoreQuery( - DomainBlockade.COLLECTION - .where('domain', '==', targetUrl.hostname.toLowerCase()) - .where('expireAt', '>=', new Date()) - .limit(1) - ))[0]; + const blockade = await this.domainBlockadeCollection.findOne({ + domain: targetUrl.hostname.toLowerCase(), + expireAt: { $gte: new Date() } + }).catch((err) => { + this.logger.warn(`Failed to query domain blockade for ${targetUrl.hostname}`, { err: marshalErrorLike(err) }); + return undefined; + }); + if (blockade) { throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`); } @@ -522,19 +526,10 @@ export class CrawlerHost extends RPCHost { async *queryCache(urlToCrawl: URL, cacheTolerance: number) { const digest = this.getUrlDigest(urlToCrawl); - const cache = ( - await - (Crawled.fromFirestoreQuery( - Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1) - ).catch((err) => { - this.logger.warn(`Failed to query cache, unknown issue`, { err }); - // https://github.com/grpc/grpc-node/issues/2647 - // https://github.com/googleapis/nodejs-firestore/issues/1023 - // https://github.com/googleapis/nodejs-firestore/issues/1023 - - return undefined; - })) - )?.[0]; + const cache = await this.pageCacheCollection.findOne({ urlPathDigest: digest }, { sort: { createdAt: -1 } }).catch((err) => { + this.logger.warn(`Failed to query cache, unknown issue`, { err }); + return undefined; + }); yield cache; @@ -593,7 +588,6 @@ export class CrawlerHost extends RPCHost { const nowDate = new Date(); const cache = Crawled.from({ - _id: randomUUID(), url: urlToCrawl.toString(), createdAt: nowDate, expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs), @@ -637,7 +631,7 @@ export class CrawlerHost extends RPCHost { cache.pageshotAvailable = true; } await savingOfSnapshot; - const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => { + const r = await this.pageCacheCollection.save(cache).catch((err) => { this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) }); return undefined; diff --git a/src/api/searcher.ts b/src/api/searcher.ts index 7486331..2b9c9a0 100644 --- a/src/api/searcher.ts +++ b/src/api/searcher.ts @@ -25,7 +25,7 @@ import { toAsyncGenerator } from '../utils/misc'; import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; import { LRUCache } from 'lru-cache'; import { API_CALL_STATUS } from '../shared/db/api-roll'; -import { SERPResult } from '../db/searched'; +import { SearchResult } from '../db/searched'; import { SerperSearchQueryParams, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search'; import { InternalJinaSerpService } from '../services/serp/internal'; import { WebSearchEntry } from '../services/serp/compat'; diff --git a/src/db/domain-blockade.ts b/src/db/domain-blockade.ts index 49d7189..388dfd5 100644 --- a/src/db/domain-blockade.ts +++ b/src/db/domain-blockade.ts @@ -10,7 +10,7 @@ export class DomainBlockade extends AutoCastable { @Prop({ defaultFactory: () => new ObjectId() }) - _id!: string; + _id!: ObjectId; @Prop({ required: true diff --git a/src/services/pdf-extract.ts b/src/services/pdf-extract.ts index a818971..2b1c881 100644 --- a/src/services/pdf-extract.ts +++ b/src/services/pdf-extract.ts @@ -1,12 +1,10 @@ import { singleton } from 'tsyringe'; import _ from 'lodash'; import { TextItem } from 'pdfjs-dist/types/src/display/api'; -import { AssertionFailureError, AsyncService, HashManager } from 'civkit'; +import { AsyncService } from 'civkit/async-service'; import { GlobalLogger } from './logger'; -import { PDFContent } from '../db/pdf'; import dayjs from 'dayjs'; import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; -import { randomUUID } from 'crypto'; import type { PDFDocumentLoadingTask } from 'pdfjs-dist'; import path from 'path'; import { AsyncLocalContext } from './async-context'; @@ -18,8 +16,6 @@ dayjs.extend(timezone); const pPdfjs = import('pdfjs-dist/legacy/build/pdf.mjs'); const nodeCmapUrl = path.resolve(require.resolve('pdfjs-dist'), '../../cmaps') + '/'; -const md5Hasher = new HashManager('md5', 'hex'); - function stdDev(numbers: number[]) { const mean = _.mean(numbers); const squareDiffs = numbers.map((num) => Math.pow(num - mean, 2)); diff --git a/src/services/snapshot-formatter.ts b/src/services/snapshot-formatter.ts index 98b5cac..91e63a5 100644 --- a/src/services/snapshot-formatter.ts +++ b/src/services/snapshot-formatter.ts @@ -9,7 +9,6 @@ import { AsyncContext } from '../shared/services/async-context'; import { Threaded } from '../services/threaded'; import { JSDomControl } from './jsdom'; import { AltTextService } from './alt-text'; -import { PDFExtractor } from './pdf-extract'; import { cleanAttribute } from '../utils/misc'; import _ from 'lodash'; import { STATUS_CODES } from 'http'; diff --git a/thinapps-shared b/thinapps-shared index 1a7dca4..a677cbd 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 1a7dca40c52569d455237497c7285bd25eb2e3d2 +Subproject commit a677cbd23ffba78ac34d92d732be1945e016b6c7