mirror of https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git

commit 161d861925
parent f1fe45fbfe

    wip
@@ -18,8 +18,8 @@ import { FancyFile } from 'civkit/fancy-file';
 
 import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE, RESPOND_TIMING } from '../dto/crawler-options';
 
-import { Crawled } from '../db/crawled';
-import { DomainBlockade } from '../db/domain-blockade';
+import { Crawled, PageCacheCollection } from '../db/crawled';
+import { DomainBlockade, DomainBlockadeCollection } from '../db/domain-blockade';
 import { OutputServerEventStream } from '../lib/transform-server-event-stream';
 
 import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@@ -102,6 +102,8 @@ export class CrawlerHost extends RPCHost {
        protected miscService: MiscService,
        protected pdfContentCollection: PDFContentCollection,
        protected pdfExtractor: PDFExtractor,
+       protected domainBlockadeCollection: DomainBlockadeCollection,
+       protected pageCacheCollection: PageCacheCollection,
    ) {
        super(...arguments);
 
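For orientation, the two parameters added above are constructor-injected. Below is a minimal sketch of how that wiring plausibly resolves, assuming tsyringe (which this codebase imports elsewhere in this same commit); the class bodies are placeholder stand-ins, not the repo's real implementations.

import 'reflect-metadata';
import { container, singleton } from 'tsyringe';

// Placeholder stand-ins for the repo's collection wrappers.
@singleton()
class DomainBlockadeCollection { }

@singleton()
class PageCacheCollection { }

@singleton()
class HostSketch {
    // With emitDecoratorMetadata enabled, tsyringe reads these parameter
    // types and injects the singleton instances automatically.
    constructor(
        public domainBlockadeCollection: DomainBlockadeCollection,
        public pageCacheCollection: PageCacheCollection,
    ) { }
}

const host = container.resolve(HostSketch); // both collections injected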
@@ -145,7 +147,7 @@ export class CrawlerHost extends RPCHost {
        puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
            this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn });
 
-           await DomainBlockade.save(DomainBlockade.from({
+           await this.domainBlockadeCollection.save(DomainBlockade.from({
                domain: abuseEvent.url.hostname.toLowerCase(),
                triggerReason: `${abuseEvent.reason}`,
                triggerUrl: abuseEvent.url.toString(),
@@ -310,12 +312,14 @@ export class CrawlerHost extends RPCHost {
        if (!uid) {
            // Enforce no proxy is allocated for anonymous users due to abuse.
            crawlerOptions.proxy = 'none';
-           const blockade = (await DomainBlockade.fromFirestoreQuery(
-               DomainBlockade.COLLECTION
-                   .where('domain', '==', targetUrl.hostname.toLowerCase())
-                   .where('expireAt', '>=', new Date())
-                   .limit(1)
-           ))[0];
+           const blockade = await this.domainBlockadeCollection.findOne({
+               domain: targetUrl.hostname.toLowerCase(),
+               expireAt: { $gte: new Date() }
+           }).catch((err) => {
+               this.logger.warn(`Failed to query domain blockade for ${targetUrl.hostname}`, { err: marshalErrorLike(err) });
+               return undefined;
+           });
+
            if (blockade) {
                throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
            }
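The hunk above trades Firestore's fluent query builder for a MongoDB-style filter document. A sketch of the equivalence using the official mongodb driver; the repo's DomainBlockadeCollection wrapper presumably delegates to something similar, and the collection name below is invented for illustration.

import { MongoClient } from 'mongodb';

interface DomainBlockadeDoc {
    domain: string;
    triggerReason?: string;
    triggerUrl?: string;
    expireAt?: Date;
}

// Firestore before: .where('domain', '==', host).where('expireAt', '>=', new Date()).limit(1)
// Mongo after: a single filter document; findOne itself limits the result to one match.
async function findActiveBlockade(client: MongoClient, hostname: string) {
    const col = client.db().collection<DomainBlockadeDoc>('domainBlockades'); // name assumed
    return col.findOne({
        domain: hostname.toLowerCase(),
        expireAt: { $gte: new Date() }, // still-unexpired blockades only
    });
}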
@@ -522,19 +526,10 @@ export class CrawlerHost extends RPCHost {
    async *queryCache(urlToCrawl: URL, cacheTolerance: number) {
        const digest = this.getUrlDigest(urlToCrawl);
 
-       const cache = (
-           await
-           (Crawled.fromFirestoreQuery(
-               Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)
-           ).catch((err) => {
+       const cache = await this.pageCacheCollection.findOne({ urlPathDigest: digest }, { sort: { createdAt: -1 } }).catch((err) => {
            this.logger.warn(`Failed to query cache, unknown issue`, { err });
-           // https://github.com/grpc/grpc-node/issues/2647
-           // https://github.com/googleapis/nodejs-firestore/issues/1023
-           // https://github.com/googleapis/nodejs-firestore/issues/1023
-
            return undefined;
-       }))
-       )?.[0];
+       });
 
        yield cache;
 
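The cache lookup above folds Firestore's orderBy('createdAt', 'desc').limit(1) into findOne's sort option. A sketch with the official mongodb driver, under the same caveats as before (wrapper behavior and collection name are assumptions).

import { MongoClient } from 'mongodb';

interface PageCacheDoc {
    urlPathDigest: string;
    url: string;
    createdAt: Date;
    expireAt: Date;
}

// findOne with { sort: { createdAt: -1 } } returns the newest matching
// document, replacing orderBy('createdAt', 'desc').limit(1) plus the [0] index.
async function latestCacheEntry(client: MongoClient, digest: string) {
    const col = client.db().collection<PageCacheDoc>('pageCache'); // name assumed
    return col.findOne({ urlPathDigest: digest }, { sort: { createdAt: -1 } });
}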
@@ -593,7 +588,6 @@ export class CrawlerHost extends RPCHost {
        const nowDate = new Date();
 
        const cache = Crawled.from({
-           _id: randomUUID(),
            url: urlToCrawl.toString(),
            createdAt: nowDate,
            expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
@@ -637,7 +631,7 @@ export class CrawlerHost extends RPCHost {
            cache.pageshotAvailable = true;
        }
        await savingOfSnapshot;
-       const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
+       const r = await this.pageCacheCollection.save(cache).catch((err) => {
            this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
 
            return undefined;
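Dropping degradeForFireStore() above suggests the Mongo-backed store persists Date (and ObjectId) values natively, so no flattening pass is needed before saving. A hedged sketch of what a generic collection save() might do; the upsert-by-_id semantics are an assumption, not confirmed by the diff.

import { Collection, ObjectId } from 'mongodb';

interface CacheDoc {
    _id: ObjectId;
    url: string;
    createdAt: Date;   // persisted as a native BSON date, no degrading needed
    expireAt: Date;
}

// Assumed shape of a generic save(): upsert the document keyed by _id.
async function save(col: Collection<CacheDoc>, doc: CacheDoc) {
    await col.replaceOne({ _id: doc._id }, doc, { upsert: true });
}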
@@ -25,7 +25,7 @@ import { toAsyncGenerator } from '../utils/misc';
 import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
 import { LRUCache } from 'lru-cache';
 import { API_CALL_STATUS } from '../shared/db/api-roll';
-import { SERPResult } from '../db/searched';
+import { SearchResult } from '../db/searched';
 import { SerperSearchQueryParams, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
 import { InternalJinaSerpService } from '../services/serp/internal';
 import { WebSearchEntry } from '../services/serp/compat';
@@ -10,7 +10,7 @@ export class DomainBlockade extends AutoCastable {
    @Prop({
        defaultFactory: () => new ObjectId()
    })
-   _id!: string;
+   _id!: ObjectId;
 
    @Prop({
        required: true
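The type change above makes the declared field agree with what defaultFactory already produced: a BSON ObjectId, not a string. For reference, a few ObjectId behaviors that matter once code stops treating ids as strings.

import { ObjectId } from 'mongodb';

const id = new ObjectId();
const hex = id.toHexString();       // 24-char hex string form
const created = id.getTimestamp();  // creation time is embedded in the id
const same = id.equals(new ObjectId(hex)); // compare with .equals(), not ===
console.log(hex, created, same);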
@@ -1,12 +1,10 @@
 import { singleton } from 'tsyringe';
 import _ from 'lodash';
 import { TextItem } from 'pdfjs-dist/types/src/display/api';
-import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
+import { AsyncService } from 'civkit/async-service';
 import { GlobalLogger } from './logger';
-import { PDFContent } from '../db/pdf';
 import dayjs from 'dayjs';
 import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
-import { randomUUID } from 'crypto';
 import type { PDFDocumentLoadingTask } from 'pdfjs-dist';
 import path from 'path';
 import { AsyncLocalContext } from './async-context';
@@ -18,8 +16,6 @@ dayjs.extend(timezone);
 const pPdfjs = import('pdfjs-dist/legacy/build/pdf.mjs');
 const nodeCmapUrl = path.resolve(require.resolve('pdfjs-dist'), '../../cmaps') + '/';
 
-const md5Hasher = new HashManager('md5', 'hex');
-
 function stdDev(numbers: number[]) {
    const mean = _.mean(numbers);
    const squareDiffs = numbers.map((num) => Math.pow(num - mean, 2));
@@ -9,7 +9,6 @@ import { AsyncContext } from '../shared/services/async-context';
 import { Threaded } from '../services/threaded';
 import { JSDomControl } from './jsdom';
 import { AltTextService } from './alt-text';
-import { PDFExtractor } from './pdf-extract';
 import { cleanAttribute } from '../utils/misc';
 import _ from 'lodash';
 import { STATUS_CODES } from 'http';
@@ -1 +1 @@
-Subproject commit 1a7dca40c52569d455237497c7285bd25eb2e3d2
+Subproject commit a677cbd23ffba78ac34d92d732be1945e016b6c7