mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-18 10:35:53 +08:00
wip
This commit is contained in:
parent
f1fe45fbfe
commit
161d861925
@ -18,8 +18,8 @@ import { FancyFile } from 'civkit/fancy-file';
|
||||
|
||||
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE, RESPOND_TIMING } from '../dto/crawler-options';
|
||||
|
||||
import { Crawled } from '../db/crawled';
|
||||
import { DomainBlockade } from '../db/domain-blockade';
|
||||
import { Crawled, PageCacheCollection } from '../db/crawled';
|
||||
import { DomainBlockade, DomainBlockadeCollection } from '../db/domain-blockade';
|
||||
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
||||
|
||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||
@ -102,6 +102,8 @@ export class CrawlerHost extends RPCHost {
|
||||
protected miscService: MiscService,
|
||||
protected pdfContentCollection: PDFContentCollection,
|
||||
protected pdfExtractor: PDFExtractor,
|
||||
protected domainBlockadeCollection: DomainBlockadeCollection,
|
||||
protected pageCacheCollection: PageCacheCollection,
|
||||
) {
|
||||
super(...arguments);
|
||||
|
||||
@ -145,7 +147,7 @@ export class CrawlerHost extends RPCHost {
|
||||
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
|
||||
this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn });
|
||||
|
||||
await DomainBlockade.save(DomainBlockade.from({
|
||||
await this.domainBlockadeCollection.save(DomainBlockade.from({
|
||||
domain: abuseEvent.url.hostname.toLowerCase(),
|
||||
triggerReason: `${abuseEvent.reason}`,
|
||||
triggerUrl: abuseEvent.url.toString(),
|
||||
@ -310,12 +312,14 @@ export class CrawlerHost extends RPCHost {
|
||||
if (!uid) {
|
||||
// Enforce no proxy is allocated for anonymous users due to abuse.
|
||||
crawlerOptions.proxy = 'none';
|
||||
const blockade = (await DomainBlockade.fromFirestoreQuery(
|
||||
DomainBlockade.COLLECTION
|
||||
.where('domain', '==', targetUrl.hostname.toLowerCase())
|
||||
.where('expireAt', '>=', new Date())
|
||||
.limit(1)
|
||||
))[0];
|
||||
const blockade = await this.domainBlockadeCollection.findOne({
|
||||
domain: targetUrl.hostname.toLowerCase(),
|
||||
expireAt: { $gte: new Date() }
|
||||
}).catch((err) => {
|
||||
this.logger.warn(`Failed to query domain blockade for ${targetUrl.hostname}`, { err: marshalErrorLike(err) });
|
||||
return undefined;
|
||||
});
|
||||
|
||||
if (blockade) {
|
||||
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
||||
}
|
||||
@ -522,19 +526,10 @@ export class CrawlerHost extends RPCHost {
|
||||
async *queryCache(urlToCrawl: URL, cacheTolerance: number) {
|
||||
const digest = this.getUrlDigest(urlToCrawl);
|
||||
|
||||
const cache = (
|
||||
await
|
||||
(Crawled.fromFirestoreQuery(
|
||||
Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)
|
||||
).catch((err) => {
|
||||
this.logger.warn(`Failed to query cache, unknown issue`, { err });
|
||||
// https://github.com/grpc/grpc-node/issues/2647
|
||||
// https://github.com/googleapis/nodejs-firestore/issues/1023
|
||||
// https://github.com/googleapis/nodejs-firestore/issues/1023
|
||||
|
||||
return undefined;
|
||||
}))
|
||||
)?.[0];
|
||||
const cache = await this.pageCacheCollection.findOne({ urlPathDigest: digest }, { sort: { createdAt: -1 } }).catch((err) => {
|
||||
this.logger.warn(`Failed to query cache, unknown issue`, { err });
|
||||
return undefined;
|
||||
});
|
||||
|
||||
yield cache;
|
||||
|
||||
@ -593,7 +588,6 @@ export class CrawlerHost extends RPCHost {
|
||||
const nowDate = new Date();
|
||||
|
||||
const cache = Crawled.from({
|
||||
_id: randomUUID(),
|
||||
url: urlToCrawl.toString(),
|
||||
createdAt: nowDate,
|
||||
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
|
||||
@ -637,7 +631,7 @@ export class CrawlerHost extends RPCHost {
|
||||
cache.pageshotAvailable = true;
|
||||
}
|
||||
await savingOfSnapshot;
|
||||
const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
|
||||
const r = await this.pageCacheCollection.save(cache).catch((err) => {
|
||||
this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
||||
|
||||
return undefined;
|
||||
|
@ -25,7 +25,7 @@ import { toAsyncGenerator } from '../utils/misc';
|
||||
import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
||||
import { LRUCache } from 'lru-cache';
|
||||
import { API_CALL_STATUS } from '../shared/db/api-roll';
|
||||
import { SERPResult } from '../db/searched';
|
||||
import { SearchResult } from '../db/searched';
|
||||
import { SerperSearchQueryParams, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
|
||||
import { InternalJinaSerpService } from '../services/serp/internal';
|
||||
import { WebSearchEntry } from '../services/serp/compat';
|
||||
|
@ -10,7 +10,7 @@ export class DomainBlockade extends AutoCastable {
|
||||
@Prop({
|
||||
defaultFactory: () => new ObjectId()
|
||||
})
|
||||
_id!: string;
|
||||
_id!: ObjectId;
|
||||
|
||||
@Prop({
|
||||
required: true
|
||||
|
@ -1,12 +1,10 @@
|
||||
import { singleton } from 'tsyringe';
|
||||
import _ from 'lodash';
|
||||
import { TextItem } from 'pdfjs-dist/types/src/display/api';
|
||||
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
|
||||
import { AsyncService } from 'civkit/async-service';
|
||||
import { GlobalLogger } from './logger';
|
||||
import { PDFContent } from '../db/pdf';
|
||||
import dayjs from 'dayjs';
|
||||
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
||||
import { randomUUID } from 'crypto';
|
||||
import type { PDFDocumentLoadingTask } from 'pdfjs-dist';
|
||||
import path from 'path';
|
||||
import { AsyncLocalContext } from './async-context';
|
||||
@ -18,8 +16,6 @@ dayjs.extend(timezone);
|
||||
const pPdfjs = import('pdfjs-dist/legacy/build/pdf.mjs');
|
||||
const nodeCmapUrl = path.resolve(require.resolve('pdfjs-dist'), '../../cmaps') + '/';
|
||||
|
||||
const md5Hasher = new HashManager('md5', 'hex');
|
||||
|
||||
function stdDev(numbers: number[]) {
|
||||
const mean = _.mean(numbers);
|
||||
const squareDiffs = numbers.map((num) => Math.pow(num - mean, 2));
|
||||
|
@ -9,7 +9,6 @@ import { AsyncContext } from '../shared/services/async-context';
|
||||
import { Threaded } from '../services/threaded';
|
||||
import { JSDomControl } from './jsdom';
|
||||
import { AltTextService } from './alt-text';
|
||||
import { PDFExtractor } from './pdf-extract';
|
||||
import { cleanAttribute } from '../utils/misc';
|
||||
import _ from 'lodash';
|
||||
import { STATUS_CODES } from 'http';
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit 1a7dca40c52569d455237497c7285bd25eb2e3d2
|
||||
Subproject commit a677cbd23ffba78ac34d92d732be1945e016b6c7
|
Loading…
x
Reference in New Issue
Block a user