Mirror of https://git.mirrors.martin98.com/https://github.com/jina-ai/reader (synced 2025-08-06 08:36:02 +08:00)
feat: domain profile (#1127)

* feat: domain profile
* fix
* fix
* fix
* fix
* fix
* refactor: curl as direct engine
* fix

---------

Co-authored-by: yanlong.wang <yanlong.wang@naiver.org>
This commit is contained in:
parent
6c23342cbf
commit
54abc175bb
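
In short: this commit introduces a per-domain engine profile. The crawler host gains a CurlControl service (a node-libcurl wrapper) as a cheap "direct" engine; after any browser crawl that did not pin an engine, it probes whether a plain HTTP fetch renders to the same markdown as the browser snapshot, and stores the verdict in a new DomainProfile Firestore record. Later requests for the same origin that leave X-Engine unset reuse the recorded engine, and the engine vocabulary moves from puppeteer/curl strings to the ENGINE_TYPE enum (browser, direct, vlm).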
backend/functions/src/cloud-functions/crawler.ts
@@ -9,19 +9,20 @@ import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
 import _ from 'lodash';
 import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
 import { Request, Response } from 'express';
-import { Curl } from 'node-libcurl';
 const pNormalizeUrl = import("@esm2cjs/normalize-url");
 import { Crawled } from '../db/crawled';
 import { randomUUID } from 'crypto';
 import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
 
 import { countGPTToken as estimateToken } from '../shared/utils/openai';
-import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
+import { CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options';
 import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
 import { DomainBlockade } from '../db/domain-blockade';
+import { DomainProfile } from '../db/domain-profile';
 import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
 import { JSDomControl } from '../services/jsdom';
 import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
+import { CurlControl } from '../services/curl';
 
 export interface ExtraScrappingOptions extends ScrappingOptions {
     withIframe?: boolean | 'quoted';
@@ -50,10 +51,12 @@ export class CrawlerHost extends RPCHost {
     cacheValidMs = 1000 * 3600;
     urlValidMs = 1000 * 3600 * 4;
     abuseBlockMs = 1000 * 3600;
+    domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
 
     constructor(
         protected globalLogger: Logger,
         protected puppeteerControl: PuppeteerControl,
+        protected curlControl: CurlControl,
         protected jsdomControl: JSDomControl,
         protected snapshotFormatter: SnapshotFormatter,
         protected firebaseObjectStorage: FirebaseStorageBucketControl,
@@ -63,7 +66,7 @@ export class CrawlerHost extends RPCHost {
     ) {
         super(...arguments);
 
-        puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => {
+        puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ExtraScrappingOptions & { url: URL; }) => {
             if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
                 return;
             }
@@ -78,8 +81,15 @@ export class CrawlerHost extends RPCHost {
             if (options.locale) {
                 Reflect.set(snapshot, 'locale', options.locale);
             }
 
             await this.setToCache(options.url, snapshot);
+            if (!options.engine) {
+                try {
+                    await this.exploreDirectEngine(options.url, options, snapshot);
+                } catch (err) {
+                    this.logger.warn(`Failed to explore direct engine option for ${options.url.href}`, { err });
+                }
+            }
         });
 
         puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
@@ -245,8 +255,21 @@ export class CrawlerHost extends RPCHost {
                 throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
             }
         }
 
         const crawlOpts = await this.configure(crawlerOptions);
 
+        if (!crawlOpts.engine) {
+            const domainProfile = (await DomainProfile.fromFirestoreQuery(
+                DomainProfile.COLLECTION
+                    .where('origin', '==', targetUrl.origin.toLowerCase())
+                    .limit(1)
+            ))[0];
+
+            if (domainProfile?.engine) {
+                crawlOpts.engine = domainProfile.engine;
+            }
+        }
+
         if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
             const sseStream = new OutputServerEventStream();
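
The hunk above is the read side of the new domain-profile machinery: when a request does not pin an engine, the host adopts whatever engine was previously recorded for the target origin. A condensed sketch of that resolution order, with a plain Map standing in for the Firestore-backed DomainProfile collection (names are illustrative, not from the diff):

// Condensed sketch of the per-domain engine fallback.
type Engine = 'browser' | 'direct' | 'vlm';

const profiles = new Map<string, { engine: Engine }>();

function resolveEngine(explicit: Engine | undefined, targetUrl: URL): Engine | undefined {
    if (explicit) {
        return explicit; // an explicit ?engine= / X-Engine always wins
    }
    // otherwise reuse what the auto-explorer recorded for this origin
    return profiles.get(targetUrl.origin.toLowerCase())?.engine;
}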
@@ -388,6 +411,7 @@ export class CrawlerHost extends RPCHost {
         }
 
         return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
+
     }
 
     async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
@@ -574,7 +598,6 @@ export class CrawlerHost extends RPCHost {
         }
 
         if (crawlerOpts?.pdf) {
-
             const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
             const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
             const fakeSnapshot = {
@@ -590,55 +613,9 @@ export class CrawlerHost extends RPCHost {
             return;
         }
 
-        if (crawlerOpts?.engine?.toLowerCase() === 'curl') {
-            const html = await new Promise<string>((resolve, reject) => {
-                const curl = new Curl();
-                curl.setOpt('URL', urlToCrawl.toString());
-                curl.setOpt(Curl.option.FOLLOWLOCATION, true);
-
-                if (crawlOpts?.timeoutMs) {
-                    curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs);
-                }
-                if (crawlOpts?.overrideUserAgent) {
-                    curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
-                }
-                if (crawlOpts?.extraHeaders) {
-                    curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
-                }
-                if (crawlOpts?.proxyUrl) {
-                    curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
-                }
-                if (crawlOpts?.cookies) {
-                    curl.setOpt(Curl.option.COOKIE, crawlOpts.cookies.join('; '));
-                }
-                if (crawlOpts?.referer) {
-                    curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
-                }
-
-                curl.on('end', (statusCode, data, headers) => {
-                    this.logger.info(`Successfully requested ${urlToCrawl} by curl`, { statusCode, headers });
-                    resolve(data.toString());
-                    curl.close();
-                });
-
-                curl.on('error', (err) => {
-                    this.logger.error(`Failed to request ${urlToCrawl} by curl`, { err: marshalErrorLike(err) });
-                    reject(err);
-                    curl.close();
-                });
-
-                curl.perform();
-            });
-
-            const fakeSnapshot = {
-                href: urlToCrawl.toString(),
-                html: html,
-                title: '',
-                text: '',
-            } as PageSnapshot;
-
-            yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
-
+        if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
+            yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
             return;
         }
 
@@ -760,7 +737,6 @@ export class CrawlerHost extends RPCHost {
         this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
         this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
         this.threadLocal.set('userAgent', opts.userAgent);
-        this.threadLocal.set('engine', opts.engine);
         if (opts.timeout) {
             this.threadLocal.set('timeout', opts.timeout * 1000);
         }
@@ -775,13 +751,13 @@ export class CrawlerHost extends RPCHost {
             targetSelector: opts.targetSelector,
             waitForSelector: opts.waitForSelector,
             overrideUserAgent: opts.userAgent,
-            engine: opts.engine,
             timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
             withIframe: opts.withIframe,
             withShadowDom: opts.withShadowDom,
             locale: opts.locale,
             referer: opts.referer,
             viewport: opts.viewport,
+            engine: opts.engine,
         };
 
         if (opts.locale) {
@@ -849,4 +825,37 @@ export class CrawlerHost extends RPCHost {
 
         return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
     }
+
+    async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) {
+        const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions);
+
+        const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
+        const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot);
+
+        let engine = ENGINE_TYPE.DIRECT;
+        if (!(thisFormatted.content && knownFormatted.content &&
+            thisFormatted.content.trim() === knownFormatted.content.trim())) {
+            engine = ENGINE_TYPE.BROWSER;
+        }
+
+        const realUrl = new URL(knownSnapshot.href);
+
+        const profile = (await DomainProfile.fromFirestoreQuery(
+            DomainProfile.COLLECTION
+                .where('domain', '==', targetUrl.origin.toLowerCase())
+                .limit(1)
+        ))[0] || new DomainProfile();
+
+        profile.origin = realUrl.origin.toLowerCase();
+        profile.triggerReason ??= 'Auto Explore';
+        profile.triggerUrl = realUrl.href;
+        profile.engine = engine;
+        profile.createdAt ??= new Date();
+        profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs);
+
+        await DomainProfile.save(profile);
+
+        return true;
+    }
 }
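
The write side is exploreDirectEngine above: after a successful browser crawl with no engine pinned, it re-fetches the page with plain curl, renders both snapshots to markdown, and records the direct engine for the origin only when the two renderings agree. A minimal restatement of that decision rule (the function name is illustrative):

// Restates the probe in exploreDirectEngine: the cheap engine is only
// trusted when it reproduces the browser's markdown exactly (after trim).
function chooseEngine(directMarkdown?: string, browserMarkdown?: string): 'direct' | 'browser' {
    if (directMarkdown && browserMarkdown
        && directMarkdown.trim() === browserMarkdown.trim()) {
        return 'direct';
    }
    return 'browser';
}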
34  backend/functions/src/db/domain-profile.ts  Normal file
@@ -0,0 +1,34 @@
+import { Also, Prop } from 'civkit';
+import { FirestoreRecord } from '../shared/lib/firestore';
+import { ENGINE_TYPE } from '../dto/scrapping-options';
+
+@Also({
+    dictOf: Object
+})
+export class DomainProfile extends FirestoreRecord {
+    static override collectionName = 'domainProfiles';
+
+    override _id!: string;
+
+    @Prop({
+        required: true
+    })
+    origin!: string;
+
+    @Prop({ required: true })
+    triggerReason!: string;
+
+    @Prop()
+    triggerUrl?: string;
+
+    @Prop({ required: true, type: ENGINE_TYPE })
+    engine!: string;
+
+    @Prop()
+    createdAt!: Date;
+
+    @Prop()
+    expireAt?: Date;
+
+    [k: string]: any;
+}
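
DomainProfile is a plain Firestore-backed record keyed by origin. A hypothetical lookup-or-create round trip, using the record API only as it appears elsewhere in this commit (COLLECTION, fromFirestoreQuery, save; the origin value is illustrative):

// Fetch the profile for an origin, or start a fresh one.
const profile = (await DomainProfile.fromFirestoreQuery(
    DomainProfile.COLLECTION
        .where('origin', '==', 'https://example.com')
        .limit(1)
))[0] || new DomainProfile();

profile.origin = 'https://example.com';
profile.triggerReason ??= 'Auto Explore';
profile.engine = ENGINE_TYPE.DIRECT;
profile.expireAt = new Date(Date.now() + 1000 * 3600 * 24 * 30); // 30-day retention, as in CrawlerHost

await DomainProfile.save(profile);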
backend/functions/src/dto/scrapping-options.ts
@@ -11,6 +11,12 @@ export enum CONTENT_FORMAT {
     SCREENSHOT = 'screenshot',
 }
 
+export enum ENGINE_TYPE {
+    BROWSER = 'browser',
+    DIRECT = 'direct',
+    VLM = 'vlm',
+}
+
 const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
 
 export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
@@ -182,7 +188,7 @@ class Viewport extends AutoCastable {
         schema: { type: 'string' }
     },
     'X-Engine': {
-        description: 'Specify the engine to use for crawling.\n\nDefault: puppeteer, supported: puppeteer, curl',
+        description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm',
         in: 'header',
         schema: { type: 'string' }
     },
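
With this change the public X-Engine header switches from implementation names (puppeteer, curl) to the abstract ENGINE_TYPE values. A hedged example of pinning the engine for a single request; the endpoint URL is illustrative and not part of this diff:

// Pin the direct (plain-HTTP) engine for one request.
const res = await fetch('https://r.jina.ai/https://example.com', {
    headers: {
        'X-Engine': 'direct',   // supported: browser, direct, vlm
        'Accept': 'text/plain',
    },
});
const markdown = await res.text();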
@@ -277,7 +283,9 @@ export class CrawlerOptions extends AutoCastable {
     @Prop()
     userAgent?: string;
 
-    @Prop({ default: 'puppeteer' })
+    @Prop({
+        type: ENGINE_TYPE,
+    })
     engine?: string;
 
     @Prop({
@@ -477,6 +485,26 @@ export class CrawlerOptions extends AutoCastable {
     isRequestingCompoundContentFormat() {
         return !CONTENT_FORMAT_VALUES.has(this.respondWith);
     }
+
+    isGeneralMarkdownRequest() {
+        if (this.respondWith !== CONTENT_FORMAT.CONTENT && this.respondWith !== CONTENT_FORMAT.MARKDOWN) {
+            return false;
+        }
+        if (this.injectFrameScript?.length || this.injectPageScript?.length) {
+            return false;
+        }
+        if (this.viewport) {
+            return false;
+        }
+        if (this.pdf) {
+            return false;
+        }
+        if (this.html) {
+            return false;
+        }
+
+        return true;
+    }
 }
 
 export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
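
isGeneralMarkdownRequest is not called anywhere else in this diff; it identifies requests that are plain content/markdown with no custom scripts, viewport, PDF, or HTML payload. Assuming civkit's AutoCastable.from() constructor, its behavior would look like:

// Illustrative behavior; AutoCastable.from() is assumed from civkit.
const plain = CrawlerOptions.from({ respondWith: 'markdown' });
console.log(plain.isGeneralMarkdownRequest());    // true

const scripted = CrawlerOptions.from({
    respondWith: 'markdown',
    injectPageScript: ['document.title = "x";'],
});
console.log(scripted.isGeneralMarkdownRequest()); // false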
82  backend/functions/src/services/curl.ts  Normal file
@@ -0,0 +1,82 @@
+import { marshalErrorLike } from 'civkit/lang';
+import { AsyncService } from 'civkit/async-service';
+import { singleton } from 'tsyringe';
+
+import { Curl } from 'node-libcurl';
+import { PageSnapshot, ScrappingOptions } from './puppeteer';
+import { Logger } from '../shared/services/logger';
+import { JSDomControl } from './jsdom';
+
+@singleton()
+export class CurlControl extends AsyncService {
+
+    logger = this.globalLogger.child({ service: this.constructor.name });
+
+    constructor(
+        protected globalLogger: Logger,
+        protected jsdomControl: JSDomControl,
+    ) {
+        super(...arguments);
+    }
+
+    override async init() {
+        await this.dependencyReady();
+
+        this.emit('ready');
+    }
+
+    async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions) {
+        const html = await new Promise<string>((resolve, reject) => {
+            const curl = new Curl();
+            curl.setOpt('URL', urlToCrawl.toString());
+            curl.setOpt(Curl.option.FOLLOWLOCATION, true);
+
+            if (crawlOpts?.timeoutMs) {
+                curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs);
+            }
+            if (crawlOpts?.overrideUserAgent) {
+                curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
+            }
+            if (crawlOpts?.extraHeaders) {
+                curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
+            }
+            if (crawlOpts?.proxyUrl) {
+                curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
+            }
+            if (crawlOpts?.cookies?.length) {
+                const cookieChunks = crawlOpts.cookies.map((cookie) => `${cookie.name}=${cookie.value}`);
+                curl.setOpt(Curl.option.COOKIE, cookieChunks.join('; '));
+            }
+            if (crawlOpts?.referer) {
+                curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
+            }
+
+            curl.on('end', (statusCode, data, headers) => {
+                this.logger.debug(`CURL: ${urlToCrawl}`, { statusCode, headers });
+                resolve(data.toString());
+                curl.close();
+            });
+
+            curl.on('error', (err) => {
+                this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) });
+                curl.close();
+                reject(err);
+            });
+
+            curl.perform();
+        });
+
+        const snapshot = {
+            href: urlToCrawl.toString(),
+            html: html,
+            title: '',
+            text: '',
+        } as PageSnapshot;
+
+        const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
+
+        return curlSnapshot!;
+    }
+
+}
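
CurlControl is registered as a tsyringe singleton, so callers resolve it from the container rather than constructing it by hand. A sketch of direct use, assuming civkit's AsyncService exposes a serviceReady() lifecycle helper (the URL and options are illustrative):

import { container } from 'tsyringe';

// Resolve the singleton and wait for init() to emit 'ready';
// serviceReady() is assumed from civkit's AsyncService lifecycle.
const curlControl = container.resolve(CurlControl);
await curlControl.serviceReady();

const snapshot = await curlControl.urlToSnapshot(new URL('https://example.com'), {
    timeoutMs: 10_000,
    overrideUserAgent: 'Mozilla/5.0 (compatible; example-bot/1.0)',
});
console.log(snapshot?.html?.length);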
@@ -1 +1 @@
-Subproject commit 98e9bf19bc6859c79eff516275cf1120e59e47bf
+Subproject commit 439f633d464f3fd5fe288313766a43163190b60f