diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 3d65dd8..aa73681 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -9,19 +9,20 @@ import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; import _ from 'lodash'; import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer'; import { Request, Response } from 'express'; -import { Curl } from 'node-libcurl'; const pNormalizeUrl = import("@esm2cjs/normalize-url"); import { Crawled } from '../db/crawled'; import { randomUUID } from 'crypto'; import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth'; import { countGPTToken as estimateToken } from '../shared/utils/openai'; -import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options'; +import { CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options'; import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; import { DomainBlockade } from '../db/domain-blockade'; +import { DomainProfile } from '../db/domain-profile'; import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker'; import { JSDomControl } from '../services/jsdom'; import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter'; +import { CurlControl } from '../services/curl'; export interface ExtraScrappingOptions extends ScrappingOptions { withIframe?: boolean | 'quoted'; @@ -50,10 +51,12 @@ export class CrawlerHost extends RPCHost { cacheValidMs = 1000 * 3600; urlValidMs = 1000 * 3600 * 4; abuseBlockMs = 1000 * 3600; + domainProfileRetentionMs = 1000 * 3600 * 24 * 30; constructor( protected globalLogger: Logger, protected puppeteerControl: PuppeteerControl, + protected curlControl: CurlControl, protected jsdomControl: JSDomControl, protected snapshotFormatter: 
SnapshotFormatter, protected firebaseObjectStorage: FirebaseStorageBucketControl, @@ -63,7 +66,7 @@ export class CrawlerHost extends RPCHost { ) { super(...arguments); - puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => { + puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ExtraScrappingOptions & { url: URL; }) => { if (!snapshot.title?.trim() && !snapshot.pdfs?.length) { return; } @@ -78,8 +81,16 @@ export class CrawlerHost extends RPCHost { if (options.locale) { Reflect.set(snapshot, 'locale', options.locale); } await this.setToCache(options.url, snapshot); + + if (!options.engine) { + try { + await this.exploreDirectEngine(options.url, options, snapshot); + } catch (err) { + this.logger.warn(`Failed to explore direct engine option for ${options.url.href}`, { err }); + } + } }); puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => { @@ -245,8 +255,21 @@ export class CrawlerHost extends RPCHost { throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`); } } + + const crawlOpts = await this.configure(crawlerOptions); + if (!crawlOpts.engine) { + const domainProfile = (await DomainProfile.fromFirestoreQuery( + DomainProfile.COLLECTION + .where('origin', '==', targetUrl.origin.toLowerCase()) + .limit(1) + ))[0]; + + if (domainProfile?.engine) { + crawlOpts.engine = domainProfile.engine; + } + } if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { const sseStream = new OutputServerEventStream(); @@ -388,6 +411,7 @@ export class CrawlerHost extends RPCHost { } return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null }); + } async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) { @@ -574,7 +598,7 @@ export
class CrawlerHost extends RPCHost { } if (crawlerOpts?.pdf) { const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64'); const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`; const fakeSnapshot = { @@ -590,55 +613,9 @@ export class CrawlerHost extends RPCHost { return; } - if (crawlerOpts?.engine?.toLowerCase() === 'curl') { - const html = await new Promise((resolve, reject) => { - const curl = new Curl(); - curl.setOpt('URL', urlToCrawl.toString()); - curl.setOpt(Curl.option.FOLLOWLOCATION, true); + if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) { + yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts); - if (crawlOpts?.timeoutMs) { - curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs); - } - if (crawlOpts?.overrideUserAgent) { - curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent); - } - if (crawlOpts?.extraHeaders) { - curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`)); - } - if (crawlOpts?.proxyUrl) { - curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl); - } - if (crawlOpts?.cookies) { - curl.setOpt(Curl.option.COOKIE, crawlOpts.cookies.join('; ')); - } - if (crawlOpts?.referer) { - curl.setOpt(Curl.option.REFERER, crawlOpts.referer); - } - - - curl.on('end', (statusCode, data, headers) => { - this.logger.info(`Successfully requested ${urlToCrawl} by curl`, { statusCode, headers }); - resolve(data.toString()); - curl.close(); - }); - - curl.on('error', (err) => { - this.logger.error(`Failed to request ${urlToCrawl} by curl`, { err: marshalErrorLike(err) }); - reject(err); - curl.close(); - }); - - curl.perform(); - }); - - const fakeSnapshot = { - href: urlToCrawl.toString(), - html: html, - title: '', - text: '', - } as PageSnapshot; - - yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts); return; } @@ -760,7 +737,6 @@ export class CrawlerHost extends
RPCHost { this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl); this.threadLocal.set('cacheTolerance', opts.cacheTolerance); this.threadLocal.set('userAgent', opts.userAgent); - this.threadLocal.set('engine', opts.engine); if (opts.timeout) { this.threadLocal.set('timeout', opts.timeout * 1000); } @@ -775,13 +751,13 @@ export class CrawlerHost extends RPCHost { targetSelector: opts.targetSelector, waitForSelector: opts.waitForSelector, overrideUserAgent: opts.userAgent, - engine: opts.engine, timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined, withIframe: opts.withIframe, withShadowDom: opts.withShadowDom, locale: opts.locale, referer: opts.referer, viewport: opts.viewport, + engine: opts.engine, }; if (opts.locale) { @@ -849,4 +825,37 @@ export class CrawlerHost extends RPCHost { return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs); } + + async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) { + const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions); + + const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot); + const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot); + + let engine = ENGINE_TYPE.DIRECT; + if (!(thisFormatted.content && knownFormatted.content && + thisFormatted.content.trim() === knownFormatted.content.trim())) { + engine = ENGINE_TYPE.BROWSER; + } + + const realUrl = new URL(knownSnapshot.href); + + const profile = (await DomainProfile.fromFirestoreQuery( + DomainProfile.COLLECTION + .where('origin', '==', targetUrl.origin.toLowerCase()) + .limit(1) + ))[0] || new DomainProfile(); + + + profile.origin = realUrl.origin.toLowerCase(); + profile.triggerReason ??= 'Auto Explore'; + profile.triggerUrl = realUrl.href; + profile.engine = engine; + profile.createdAt ??= new Date(); + profile.expireAt = new Date(Date.now() + 
this.domainProfileRetentionMs); + + await DomainProfile.save(profile); + + return true; + } } diff --git a/backend/functions/src/db/domain-profile.ts b/backend/functions/src/db/domain-profile.ts new file mode 100644 index 0000000..02c693b --- /dev/null +++ b/backend/functions/src/db/domain-profile.ts @@ -0,0 +1,34 @@ +import { Also, Prop } from 'civkit'; +import { FirestoreRecord } from '../shared/lib/firestore'; +import { ENGINE_TYPE } from '../dto/scrapping-options'; + +@Also({ + dictOf: Object +}) +export class DomainProfile extends FirestoreRecord { + static override collectionName = 'domainProfiles'; + + override _id!: string; + + @Prop({ + required: true + }) + origin!: string; + + @Prop({ required: true }) + triggerReason!: string; + + @Prop() + triggerUrl?: string; + + @Prop({ required: true, type: ENGINE_TYPE }) + engine!: string; + + @Prop() + createdAt!: Date; + + @Prop() + expireAt?: Date; + + [k: string]: any; +} diff --git a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts index 147fbaf..44aded0 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/backend/functions/src/dto/scrapping-options.ts @@ -11,6 +11,12 @@ export enum CONTENT_FORMAT { SCREENSHOT = 'screenshot', } +export enum ENGINE_TYPE { + BROWSER = 'browser', + DIRECT = 'direct', + VLM = 'vlm', +} + const CONTENT_FORMAT_VALUES = new Set(Object.values(CONTENT_FORMAT)); export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const; @@ -182,7 +188,7 @@ class Viewport extends AutoCastable { schema: { type: 'string' } }, 'X-Engine': { - description: 'Specify the engine to use for crawling.\n\nDefault: puppeteer, supported: puppeteer, curl', + description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm', in: 'header', schema: { type: 'string' } }, @@ -277,7 +283,9 @@ export class CrawlerOptions extends AutoCastable { @Prop() userAgent?: string; - @Prop({ default: 'puppeteer' }) + 
@Prop({ + type: ENGINE_TYPE, + }) engine?: string; @Prop({ @@ -477,6 +485,26 @@ export class CrawlerOptions extends AutoCastable { isRequestingCompoundContentFormat() { return !CONTENT_FORMAT_VALUES.has(this.respondWith); } + + isGeneralMarkdownRequest() { + if (this.respondWith !== CONTENT_FORMAT.CONTENT && this.respondWith !== CONTENT_FORMAT.MARKDOWN) { + return false; + } + if (this.injectFrameScript?.length || this.injectPageScript?.length) { + return false; + } + if (this.viewport) { + return false; + } + if (this.pdf) { + return false; + } + if (this.html) { + return false; + } + + return true; + } } export class CrawlerOptionsHeaderOnly extends CrawlerOptions { diff --git a/backend/functions/src/services/curl.ts b/backend/functions/src/services/curl.ts new file mode 100644 index 0000000..38a524d --- /dev/null +++ b/backend/functions/src/services/curl.ts @@ -0,0 +1,82 @@ +import { marshalErrorLike } from 'civkit/lang'; +import { AsyncService } from 'civkit/async-service'; +import { singleton } from 'tsyringe'; + +import { Curl } from 'node-libcurl'; +import { PageSnapshot, ScrappingOptions } from './puppeteer'; +import { Logger } from '../shared/services/logger'; +import { JSDomControl } from './jsdom'; + +@singleton() +export class CurlControl extends AsyncService { + + logger = this.globalLogger.child({ service: this.constructor.name }); + + constructor( + protected globalLogger: Logger, + protected jsdomControl: JSDomControl, + ) { + super(...arguments); + } + + override async init() { + await this.dependencyReady(); + + this.emit('ready'); + } + + async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions) { + const html = await new Promise((resolve, reject) => { + const curl = new Curl(); + curl.setOpt('URL', urlToCrawl.toString()); + curl.setOpt(Curl.option.FOLLOWLOCATION, true); + + if (crawlOpts?.timeoutMs) { + curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs); + } + if (crawlOpts?.overrideUserAgent) { + 
curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent); + } + if (crawlOpts?.extraHeaders) { + curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`)); + } + if (crawlOpts?.proxyUrl) { + curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl); + } + if (crawlOpts?.cookies?.length) { + const cookieChunks = crawlOpts.cookies.map((cookie) => `${cookie.name}=${cookie.value}`); + curl.setOpt(Curl.option.COOKIE, cookieChunks.join('; ')); + } + if (crawlOpts?.referer) { + curl.setOpt(Curl.option.REFERER, crawlOpts.referer); + } + + curl.on('end', (statusCode, data, headers) => { + this.logger.debug(`CURL: ${urlToCrawl}`, { statusCode, headers }); + resolve(data.toString()); + curl.close(); + }); + + curl.on('error', (err) => { + this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) }); + curl.close(); + reject(err); + }); + + curl.perform(); + }); + + const snapshot = { + href: urlToCrawl.toString(), + html: html, + title: '', + text: '', + } as PageSnapshot; + + const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts); + + return curlSnapshot!; + } + + +} diff --git a/thinapps-shared b/thinapps-shared index 98e9bf1..439f633 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 98e9bf19bc6859c79eff516275cf1120e59e47bf +Subproject commit 439f633d464f3fd5fe288313766a43163190b60f