feat: domain profile (#1127)

* feat: domain profile

* fix

* fix

* fix

* fix

* fix

* refactor: curl as direct engine

* fix

---------

Co-authored-by: yanlong.wang <yanlong.wang@naiver.org>
This commit is contained in:
Sha Zhou 2025-01-13 17:44:09 +08:00 committed by GitHub
parent 6c23342cbf
commit 54abc175bb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 211 additions and 58 deletions

View File

@ -9,19 +9,20 @@ import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express';
import { Curl } from 'node-libcurl';
const pNormalizeUrl = import("@esm2cjs/normalize-url");
import { Crawled } from '../db/crawled';
import { randomUUID } from 'crypto';
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
import { countGPTToken as estimateToken } from '../shared/utils/openai';
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
import { CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options';
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { DomainBlockade } from '../db/domain-blockade';
import { DomainProfile } from '../db/domain-profile';
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
import { JSDomControl } from '../services/jsdom';
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
import { CurlControl } from '../services/curl';
export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean | 'quoted';
@ -50,10 +51,12 @@ export class CrawlerHost extends RPCHost {
cacheValidMs = 1000 * 3600;
urlValidMs = 1000 * 3600 * 4;
abuseBlockMs = 1000 * 3600;
domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
constructor(
protected globalLogger: Logger,
protected puppeteerControl: PuppeteerControl,
protected curlControl: CurlControl,
protected jsdomControl: JSDomControl,
protected snapshotFormatter: SnapshotFormatter,
protected firebaseObjectStorage: FirebaseStorageBucketControl,
@ -63,7 +66,7 @@ export class CrawlerHost extends RPCHost {
) {
super(...arguments);
puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => {
puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ExtraScrappingOptions & { url: URL; }) => {
if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
return;
}
@ -78,8 +81,15 @@ export class CrawlerHost extends RPCHost {
if (options.locale) {
Reflect.set(snapshot, 'locale', options.locale);
}
await this.setToCache(options.url, snapshot);
if (!options.engine) {
try {
await this.exploreDirectEngine(options.url, options, snapshot);
} catch (err) {
this.logger.warn(`Failed to explore direct engine option for ${options.url.href}`, { err });
}
}
});
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
@ -245,8 +255,21 @@ export class CrawlerHost extends RPCHost {
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
}
}
const crawlOpts = await this.configure(crawlerOptions);
if (!crawlOpts.engine) {
const domainProfile = (await DomainProfile.fromFirestoreQuery(
DomainProfile.COLLECTION
.where('origin', '==', targetUrl.origin.toLowerCase())
.limit(1)
))[0];
if (domainProfile?.engine) {
crawlOpts.engine = domainProfile.engine;
}
}
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
const sseStream = new OutputServerEventStream();
@ -388,6 +411,7 @@ export class CrawlerHost extends RPCHost {
}
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
}
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
@ -574,7 +598,6 @@ export class CrawlerHost extends RPCHost {
}
if (crawlerOpts?.pdf) {
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
const fakeSnapshot = {
@ -590,55 +613,9 @@ export class CrawlerHost extends RPCHost {
return;
}
if (crawlerOpts?.engine?.toLowerCase() === 'curl') {
const html = await new Promise<string>((resolve, reject) => {
const curl = new Curl();
curl.setOpt('URL', urlToCrawl.toString());
curl.setOpt(Curl.option.FOLLOWLOCATION, true);
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
if (crawlOpts?.timeoutMs) {
curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs);
}
if (crawlOpts?.overrideUserAgent) {
curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
}
if (crawlOpts?.extraHeaders) {
curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
}
if (crawlOpts?.proxyUrl) {
curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
}
if (crawlOpts?.cookies) {
curl.setOpt(Curl.option.COOKIE, crawlOpts.cookies.join('; '));
}
if (crawlOpts?.referer) {
curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
}
curl.on('end', (statusCode, data, headers) => {
this.logger.info(`Successfully requested ${urlToCrawl} by curl`, { statusCode, headers });
resolve(data.toString());
curl.close();
});
curl.on('error', (err) => {
this.logger.error(`Failed to request ${urlToCrawl} by curl`, { err: marshalErrorLike(err) });
reject(err);
curl.close();
});
curl.perform();
});
const fakeSnapshot = {
href: urlToCrawl.toString(),
html: html,
title: '',
text: '',
} as PageSnapshot;
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
return;
}
@ -760,7 +737,6 @@ export class CrawlerHost extends RPCHost {
this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
this.threadLocal.set('userAgent', opts.userAgent);
this.threadLocal.set('engine', opts.engine);
if (opts.timeout) {
this.threadLocal.set('timeout', opts.timeout * 1000);
}
@ -775,13 +751,13 @@ export class CrawlerHost extends RPCHost {
targetSelector: opts.targetSelector,
waitForSelector: opts.waitForSelector,
overrideUserAgent: opts.userAgent,
engine: opts.engine,
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
withIframe: opts.withIframe,
withShadowDom: opts.withShadowDom,
locale: opts.locale,
referer: opts.referer,
viewport: opts.viewport,
engine: opts.engine,
};
if (opts.locale) {
@ -849,4 +825,37 @@ export class CrawlerHost extends RPCHost {
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
}
/**
 * Probe whether a page can be served by the direct (curl) engine and persist the
 * verdict as a DomainProfile for the page's origin.
 *
 * The URL is fetched again through `curlControl`, and both that snapshot and the
 * already-known snapshot are formatted as markdown. Only when both formatted
 * contents are non-empty and identical after trimming is the origin marked as
 * `ENGINE_TYPE.DIRECT`; otherwise it is marked as `ENGINE_TYPE.BROWSER`.
 *
 * @param targetUrl URL to re-fetch with the direct engine.
 * @param crawlerOptions Scrapping options forwarded to the direct fetch.
 * @param knownSnapshot Snapshot previously produced for the same page; its `href`
 *   decides which origin the profile is recorded for.
 * @returns `true` once the profile has been saved.
 * @throws Whatever the direct fetch, the formatter, URL parsing or the save throws;
 *   nothing is caught here.
 */
async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) {
    const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions);

    const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
    const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot);

    let engine = ENGINE_TYPE.DIRECT;
    if (!(thisFormatted.content && knownFormatted.content &&
        thisFormatted.content.trim() === knownFormatted.content.trim())) {
        engine = ENGINE_TYPE.BROWSER;
    }

    const realUrl = new URL(knownSnapshot.href);
    const profileOrigin = realUrl.origin.toLowerCase();

    // Look the profile up by the very field and value that are written below.
    // Profiles are stored and read by `origin`; querying any other field never
    // matches and would create a fresh duplicate document on every exploration.
    const profile = (await DomainProfile.fromFirestoreQuery(
        DomainProfile.COLLECTION
            .where('origin', '==', profileOrigin)
            .limit(1)
    ))[0] || new DomainProfile();

    profile.origin = profileOrigin;
    // Keep the original reason and creation time when refreshing an existing profile.
    profile.triggerReason ??= 'Auto Explore';
    profile.triggerUrl = realUrl.href;
    profile.engine = engine;
    profile.createdAt ??= new Date();
    profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs);
    await DomainProfile.save(profile);

    return true;
}
}

View File

@ -0,0 +1,34 @@
import { Also, Prop } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore';
import { ENGINE_TYPE } from '../dto/scrapping-options';
/**
 * Firestore record kept in the `domainProfiles` collection that remembers
 * which crawling engine was chosen for a given origin.
 */
@Also({ dictOf: Object })
export class DomainProfile extends FirestoreRecord {
    static override collectionName = 'domainProfiles';

    override _id!: string;

    /** Origin this profile applies to. */
    @Prop({ required: true })
    origin!: string;

    /** Why this profile was created. */
    @Prop({ required: true })
    triggerReason!: string;

    /** URL that led to this profile being written, when known. */
    @Prop()
    triggerUrl?: string;

    /** Engine to use for the origin; one of the ENGINE_TYPE values. */
    @Prop({ required: true, type: ENGINE_TYPE })
    engine!: string;

    /** When the profile was first created. */
    @Prop()
    createdAt!: Date;

    /** When the profile stops being valid, if it expires at all. */
    @Prop()
    expireAt?: Date;

    // Extra fields are tolerated on the record.
    [k: string]: any;
}

View File

@ -11,6 +11,12 @@ export enum CONTENT_FORMAT {
SCREENSHOT = 'screenshot',
}
/**
 * Engines that can be requested for crawling (also the values accepted by the
 * `X-Engine` header and stored on domain profiles).
 */
export enum ENGINE_TYPE {
// Browser-based crawling.
BROWSER = 'browser',
// Plain HTTP fetch without a browser.
DIRECT = 'direct',
// NOTE(review): presumably a vision-language-model based engine; no usage is visible here, confirm.
VLM = 'vlm',
}
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
@ -182,7 +188,7 @@ class Viewport extends AutoCastable {
schema: { type: 'string' }
},
'X-Engine': {
description: 'Specify the engine to use for crawling.\n\nDefault: puppeteer, supported: puppeteer, curl',
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm',
in: 'header',
schema: { type: 'string' }
},
@ -277,7 +283,9 @@ export class CrawlerOptions extends AutoCastable {
@Prop()
userAgent?: string;
@Prop({ default: 'puppeteer' })
@Prop({
type: ENGINE_TYPE,
})
engine?: string;
@Prop({
@ -477,6 +485,26 @@ export class CrawlerOptions extends AutoCastable {
isRequestingCompoundContentFormat() {
return !CONTENT_FORMAT_VALUES.has(this.respondWith);
}
/**
 * Tell whether this is a plain content/markdown request with no per-request
 * customisation.
 *
 * @returns `true` only when `respondWith` is the content or markdown format and
 *   none of injected scripts, a viewport, a pdf or html were supplied.
 */
isGeneralMarkdownRequest() {
    const wantsMarkdown =
        this.respondWith === CONTENT_FORMAT.CONTENT ||
        this.respondWith === CONTENT_FORMAT.MARKDOWN;
    if (!wantsMarkdown) {
        return false;
    }

    const hasInjectedScripts = Boolean(this.injectFrameScript?.length || this.injectPageScript?.length);
    const hasCustomInput = Boolean(this.viewport) || Boolean(this.pdf) || Boolean(this.html);

    return !(hasInjectedScripts || hasCustomInput);
}
}
export class CrawlerOptionsHeaderOnly extends CrawlerOptions {

View File

@ -0,0 +1,82 @@
import { marshalErrorLike } from 'civkit/lang';
import { AsyncService } from 'civkit/async-service';
import { singleton } from 'tsyringe';
import { Curl } from 'node-libcurl';
import { PageSnapshot, ScrappingOptions } from './puppeteer';
import { Logger } from '../shared/services/logger';
import { JSDomControl } from './jsdom';
/**
 * Singleton service that fetches a URL with libcurl and turns the response body
 * into a page snapshot through JSDomControl.
 */
@singleton()
export class CurlControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected jsdomControl: JSDomControl,
    ) {
        super(...arguments);
    }

    /** Wait for dependencies, then announce readiness. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Fetch `urlToCrawl` with curl (following redirects) and build a snapshot from the body.
     *
     * @param urlToCrawl URL to request.
     * @param crawlOpts Optional scrapping options; timeout, user agent, extra headers,
     *   proxy, cookies and referer are applied to the request when present, and the
     *   whole object is forwarded to `jsdomControl.narrowSnapshot`.
     * @returns The snapshot produced by `jsdomControl.narrowSnapshot`.
     * @throws The curl error when the transfer fails.
     */
    async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions) {
        const body = await new Promise<string>((resolve, reject) => {
            const handle = new Curl();
            handle.setOpt('URL', urlToCrawl.toString());
            handle.setOpt(Curl.option.FOLLOWLOCATION, true);

            // Optional request tuning, applied only for the options actually supplied.
            if (crawlOpts?.timeoutMs) {
                handle.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs);
            }
            if (crawlOpts?.overrideUserAgent) {
                handle.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
            }
            if (crawlOpts?.extraHeaders) {
                const headerLines = Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`);
                handle.setOpt(Curl.option.HTTPHEADER, headerLines);
            }
            if (crawlOpts?.proxyUrl) {
                handle.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
            }
            if (crawlOpts?.cookies?.length) {
                const cookieHeader = crawlOpts.cookies
                    .map((cookie) => `${cookie.name}=${cookie.value}`)
                    .join('; ');
                handle.setOpt(Curl.option.COOKIE, cookieHeader);
            }
            if (crawlOpts?.referer) {
                handle.setOpt(Curl.option.REFERER, crawlOpts.referer);
            }

            handle.on('end', (statusCode, data, headers) => {
                this.logger.debug(`CURL: ${urlToCrawl}`, { statusCode, headers });
                resolve(data.toString());
                handle.close();
            });

            handle.on('error', (err) => {
                this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) });
                handle.close();
                reject(err);
            });

            handle.perform();
        });

        // Minimal snapshot; title and text start out empty.
        const rawSnapshot = {
            href: urlToCrawl.toString(),
            html: body,
            title: '',
            text: '',
        } as PageSnapshot;

        const narrowed = await this.jsdomControl.narrowSnapshot(rawSnapshot, crawlOpts);

        return narrowed!;
    }
}

@ -1 +1 @@
Subproject commit 98e9bf19bc6859c79eff516275cf1120e59e47bf
Subproject commit 439f633d464f3fd5fe288313766a43163190b60f