diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 0000000..1e19f89 --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,76 @@ +run-name: Build push and deploy (CD) +on: + push: + branches: + - main + - ci-debug + tags: + - '*' + +jobs: + build-and-push-to-gcr: + runs-on: ubuntu-latest + concurrency: + group: ${{ github.ref_type == 'branch' && github.ref }} + cancel-in-progress: true + defaults: + run: + working-directory: backend/functions + permissions: + contents: read + steps: + - uses: actions/checkout@v4 + with: + lfs: true + submodules: true + token: ${{ secrets.THINAPPS_SHARED_READ_TOKEN }} + - uses: 'google-github-actions/auth@v2' + with: + credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}' + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v2' + - name: "Docker auth" + run: |- + gcloud auth configure-docker us-docker.pkg.dev --quiet + - name: Set controller release version + run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: 22.12.0 + cache: npm + cache-dependency-path: backend/functions/package-lock.json + + - name: npm install + run: npm ci + - name: get maxmind mmdb + run: mkdir -p licensed && curl -o licensed/GeoLite2-City.mmdb https://github.com/P3TERX/GeoLite.mmdb/raw/download/GeoLite2-City.mmdb + - name: build application + run: npm run build + - name: Set package version + run: npm version --no-git-tag-version ${{ env.RELEASE_VERSION }} + if: github.ref_type == 'tag' + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: | + us-docker.pkg.dev/reader-6b7dc/jina-reader/reader + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Build and push + id: container + uses: docker/build-push-action@v6 + with: + context: backend/functions + push: true + tags: ${{ 
steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + - name: Deploy CRAWL with Tag + run: | + gcloud run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0 + - name: Deploy SEARCH with Tag + run: | + gcloud run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 \ No newline at end of file diff --git a/backend/functions/.dockerignore b/backend/functions/.dockerignore new file mode 100644 index 0000000..c2658d7 --- /dev/null +++ b/backend/functions/.dockerignore @@ -0,0 +1 @@ +node_modules/ diff --git a/backend/functions/Dockerfile b/backend/functions/Dockerfile new file mode 100644 index 0000000..63bb9cf --- /dev/null +++ b/backend/functions/Dockerfile @@ -0,0 +1,37 @@ +# syntax=docker/dockerfile:1 +FROM lwthiker/curl-impersonate:0.6-chrome-slim-bullseye + +FROM node:20 + +RUN apt-get update \ + && apt-get install -y wget gnupg \ + && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ + && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ + && apt-get update \ + && apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=0 /usr/local/lib/libcurl-impersonate.so /usr/local/lib/libcurl-impersonate.so + +RUN groupadd -r jina +RUN useradd -g jina -G audio,video -m jina +USER jina + +WORKDIR /app + +COPY package.json package-lock.json ./ +RUN npm ci + +COPY build ./build +COPY public ./public +COPY licensed ./licensed + +RUN rm -rf 
~/.config/chromium && mkdir -p ~/.config/chromium + +ENV LD_PRELOAD=/usr/local/lib/libcurl-impersonate.so CURL_IMPERSONATE=chrome116 CURL_IMPERSONATE_HEADERS=no +ENV PORT=8080 + +EXPOSE 3000 3001 8080 8081 +ENTRYPOINT ["node"] +CMD [ "build/stand-alone/crawl.js" ] diff --git a/backend/functions/package-lock.json b/backend/functions/package-lock.json index ca2e5a6..ae431d8 100644 --- a/backend/functions/package-lock.json +++ b/backend/functions/package-lock.json @@ -16,7 +16,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.2-4c0357a", + "civkit": "^0.8.2-03243fe", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", @@ -3979,9 +3979,9 @@ } }, "node_modules/civkit": { - "version": "0.8.2-4c0357a", - "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-4c0357a.tgz", - "integrity": "sha512-8/RcapAm8YYImf+YVBRhybEFuSuV5Pg1p/s6Niql3VAY2cV1/OC1fTCDZY689yeq8zFcwxwBvaqyIEGo69F+IA==", + "version": "0.8.2-03243fe", + "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-03243fe.tgz", + "integrity": "sha512-hoTxGeGdD27iOCDi51cVY0PHlRN3OSC640QRJ1YSmD42o+LP7mZtbdy8dN7j/FSkPP/5yLuB2ch9BMSOp54POQ==", "license": "AGPL", "dependencies": { "lodash": "^4.17.21", diff --git a/backend/functions/package.json b/backend/functions/package.json index 989692f..57a0cd0 100644 --- a/backend/functions/package.json +++ b/backend/functions/package.json @@ -36,7 +36,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.2-4c0357a", + "civkit": "^0.8.2-03243fe", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", diff --git a/backend/functions/public/favicon.ico b/backend/functions/public/favicon.ico new file mode 100644 index 0000000..1f9ee24 Binary files /dev/null and b/backend/functions/public/favicon.ico differ diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 412e37a..66c2a9e 100644 --- a/backend/functions/src/cloud-functions/crawler.ts 
+++ b/backend/functions/src/cloud-functions/crawler.ts @@ -84,6 +84,8 @@ export class CrawlerHost extends RPCHost { Reflect.set(snapshot, 'locale', options.locale); } await this.setToCache(options.url, snapshot); + + await this.exploreDirectEngine(snapshot).catch(() => undefined); }); puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => { @@ -581,9 +583,14 @@ export class CrawlerHost extends RPCHost { if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.READER_LM)) { const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, { - ...crawlOpts, engine: ENGINE_TYPE.AUTO + ...crawlOpts, + engine: crawlOpts?.engine || ENGINE_TYPE.AUTO, }, crawlerOpts); + if (!finalAutoSnapshot?.html) { + throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`); + } + if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) { const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined; yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot); @@ -628,18 +635,9 @@ export class CrawlerHost extends RPCHost { return; } - if (crawlOpts?.engine?.startsWith(ENGINE_TYPE.DIRECT)) { - const engine = crawlOpts?.engine; - try { - const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts); - yield snapshot; - - return; - } catch (err) { - if (!engine.endsWith('?')) { - throw err; - } - } + if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) { + yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts); + return; } let cache; @@ -658,6 +656,24 @@ export class CrawlerHost extends RPCHost { return; } + if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) { + const { digest } = this.getDomainProfileUrlDigest(urlToCrawl); + const domainProfile = await DomainProfile.fromFirestore(digest); + if (domainProfile?.engine === ENGINE_TYPE.DIRECT) { + try { + const snapshot = await 
this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts); + + // Expect downstream code to "break" here if it's satisfied with the direct engine + yield snapshot; + if (crawlOpts?.engine === ENGINE_TYPE.AUTO) { + return; + } + } catch (err: any) { + this.logger.warn(`Failed to scrap ${urlToCrawl} with direct engine`, { err: marshalErrorLike(err) }); + } + } + } + try { if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) { for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) { @@ -855,7 +871,7 @@ export class CrawlerHost extends RPCHost { } async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise { - const it = this.cachedScrap(url, { ...opts, engine: ENGINE_TYPE.BROWSER }, crawlerOptions); + const it = this.cachedScrap(url, opts, crawlerOptions); let lastSnapshot; let lastError; @@ -912,36 +928,54 @@ export class CrawlerHost extends RPCHost { return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs); } - async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) { - const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions, true); + async exploreDirectEngine(knownSnapshot: PageSnapshot) { + const realUrl = new URL(knownSnapshot.href); + const { digest, path } = this.getDomainProfileUrlDigest(realUrl); + const profile = await DomainProfile.fromFirestore(digest); - const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot); - const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot); + if (!profile) { + const record = DomainProfile.from({ + _id: digest, + origin: realUrl.origin.toLowerCase(), + path, + triggerUrl: realUrl.href, + engine: knownSnapshot.htmlModifiedByJs ? 
ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT, + createdAt: new Date(), + expireAt: new Date(Date.now() + this.domainProfileRetentionMs), + }); + await DomainProfile.save(record); - let engine = ENGINE_TYPE.DIRECT; - if (!(thisFormatted.content && knownFormatted.content && - thisFormatted.content.trim() === knownFormatted.content.trim())) { - engine = ENGINE_TYPE.BROWSER; + return; } - const realUrl = new URL(knownSnapshot.href); - - const profile = (await DomainProfile.fromFirestoreQuery( - DomainProfile.COLLECTION - .where('domain', '==', targetUrl.origin.toLowerCase()) - .limit(1) - ))[0] || new DomainProfile(); - + if (profile.engine === ENGINE_TYPE.BROWSER) { + // Mixed engine, always use browser + return; + } profile.origin = realUrl.origin.toLowerCase(); - profile.triggerReason ??= 'Auto Explore'; profile.triggerUrl = realUrl.href; - profile.engine = engine; - profile.createdAt ??= new Date(); + profile.path = path; + profile.engine = knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT; profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs); await DomainProfile.save(profile); - return true; + return; + } + + getDomainProfileUrlDigest(url: URL) { + const pathname = url.pathname; + const pathVec = pathname.split('/'); + const parentPath = pathVec.slice(0, -1).join('/'); + + const finalPath = parentPath || pathname; + + const key = url.origin.toLocaleLowerCase() + finalPath; + + return { + digest: md5Hasher.hash(key), + path: finalPath, + }; } } diff --git a/backend/functions/src/db/domain-profile.ts b/backend/functions/src/db/domain-profile.ts index 02c693b..6e552c1 100644 --- a/backend/functions/src/db/domain-profile.ts +++ b/backend/functions/src/db/domain-profile.ts @@ -13,10 +13,7 @@ export class DomainProfile extends FirestoreRecord { @Prop({ required: true }) - origin!: string; - - @Prop({ required: true }) - triggerReason!: string; + path!: string; @Prop() triggerUrl?: string; diff --git 
a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts index 2f8f5ef..e2cfd41 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/backend/functions/src/dto/scrapping-options.ts @@ -439,7 +439,7 @@ export class CrawlerOptions extends AutoCastable { instance.engine = ENGINE_TYPE.BROWSER; instance.respondWith = CONTENT_FORMAT.VLM; } else if (instance.engine === ENGINE_TYPE.READER_LM) { - instance.engine = undefined; + instance.engine = ENGINE_TYPE.AUTO; instance.respondWith = CONTENT_FORMAT.READER_LM; } @@ -496,10 +496,6 @@ export class CrawlerOptions extends AutoCastable { instance.cacheTolerance = instance.cacheTolerance * 1000; } - if (instance.noCache || !instance.isTypicalRequest()) { - instance.engine ??= ENGINE_TYPE.BROWSER + '?'; - } - return instance; } @@ -544,13 +540,19 @@ export class CrawlerOptions extends AutoCastable { return !CONTENT_FORMAT_VALUES.has(this.respondWith); } - isTypicalRequest() { + browserIsNotRequired() { if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) { return false; } if (this.injectFrameScript?.length || this.injectPageScript?.length) { return false; } + if (this.waitForSelector?.length) { + return false; + } + if (this.withIframe || this.withShadowDom) { + return false; + } if (this.viewport) { return false; } diff --git a/backend/functions/src/services/curl.ts b/backend/functions/src/services/curl.ts index 898b836..ff5a5e8 100644 --- a/backend/functions/src/services/curl.ts +++ b/backend/functions/src/services/curl.ts @@ -2,11 +2,14 @@ import { marshalErrorLike } from 'civkit/lang'; import { AsyncService } from 'civkit/async-service'; import { singleton } from 'tsyringe'; -import { Curl, HeaderInfo } from 'node-libcurl'; +import { Curl, CurlFeature, HeaderInfo } from 'node-libcurl'; import { PageSnapshot, ScrappingOptions } from './puppeteer'; import { Logger } from '../shared/services/logger'; import { 
JSDomControl } from './jsdom'; -import { AssertionFailureError } from 'civkit'; +import { AssertionFailureError, FancyFile } from 'civkit'; +import { TempFileManager } from '../shared'; +import { readFile } from 'fs/promises'; +import { pathToFileURL } from 'url'; @singleton() export class CurlControl extends AsyncService { @@ -16,6 +19,7 @@ export class CurlControl extends AsyncService { constructor( protected globalLogger: Logger, protected jsdomControl: JSDomControl, + protected tempFileManager: TempFileManager, ) { super(...arguments); } @@ -26,25 +30,55 @@ export class CurlControl extends AsyncService { this.emit('ready'); } + curlImpersonateHeader(curl: Curl, headers?: object, chromeVersion: number = 132) { + const mixinHeaders = { + 'sch-ch-ua': `Not A(Brand";v="8", "Chromium";v="${chromeVersion}", "Google Chrome";v="${chromeVersion}"`, + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': 'Windows', + 'Upgrade-Insecure-Requests': '1', + 'User-Agent': `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${chromeVersion}.0.0.0 Safari/537.36`, + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'Sec-Fetch-Site': 'none', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-User': '?1', + 'Sec-Fetch-Dest': 'document', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'en-US,en;q=0.9', + }; + + curl.setOpt(Curl.option.HTTPHEADER, Object.entries({ ...mixinHeaders, ...headers }).map(([k, v]) => `${k}: ${v}`)); + + return curl; + } + async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise { + const snapshot = { + href: urlToCrawl.toString(), + html: '', + title: '', + text: '', + } as PageSnapshot; + const result = await new Promise<{ statusCode: number, - data: string, + data?: FancyFile, headers: Buffer | HeaderInfo[], }>((resolve, reject) => { const curl = new Curl(); + 
curl.enable(CurlFeature.StreamResponse); curl.setOpt('URL', urlToCrawl.toString()); curl.setOpt(Curl.option.FOLLOWLOCATION, true); - if (crawlOpts?.timeoutMs) { - curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs); - } + curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(10_000, crawlOpts?.timeoutMs || 10_000)); + if (crawlOpts?.overrideUserAgent) { curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent); } - if (crawlOpts?.extraHeaders) { - curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`)); - } + + this.curlImpersonateHeader(curl, crawlOpts?.extraHeaders); + // if (crawlOpts?.extraHeaders) { + // curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`)); + // } if (crawlOpts?.proxyUrl) { curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl); } @@ -56,35 +90,82 @@ export class CurlControl extends AsyncService { curl.setOpt(Curl.option.REFERER, crawlOpts.referer); } - curl.on('end', (statusCode, data, headers) => { + curl.on('end', (statusCode, _data, headers) => { this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers }); - resolve({ - statusCode, - data: data.toString(), - headers, - }); curl.close(); }); curl.on('error', (err) => { - this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) }); curl.close(); + this.logger.warn(`Curl ${urlToCrawl}: ${err} (Not necessarily an error)`, { err: marshalErrorLike(err) }); reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`)); }); + curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB + let status = -1; + let contentType = ''; + curl.on('stream', (stream, statusCode, headers) => { + status = statusCode; + outerLoop: + for (const headerVec of headers) { + for (const [k, v] of Object.entries(headerVec)) { + if (k.toLowerCase() === 'content-type') { + contentType = v.toLowerCase(); + break outerLoop; + } + } + } + + if 
(!contentType) { + reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: no content-type`)); + stream.destroy(); + return; + } + if (contentType.startsWith('image/')) { + snapshot.html = `${urlToCrawl.origin}${urlToCrawl.pathname}`; + stream.destroy(); + resolve({ + statusCode: status, + headers, + }); + return; + } + + const fpath = this.tempFileManager.alloc(); + const fancyFile = FancyFile.auto(stream, fpath); + this.tempFileManager.bindPathTo(fancyFile, fpath); + resolve({ + statusCode: status, + data: fancyFile, + headers, + }); + }); curl.perform(); }); if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) { - throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`); + throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`); } - const snapshot = { - href: urlToCrawl.toString(), - html: result.data, - title: '', - text: '', - } as PageSnapshot; + if (result.data) { + const mimeType: string = await result.data.mimeType; + if (mimeType.startsWith('text/html')) { + if ((await result.data.size) > 1024 * 1024 * 32) { + throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`); + } + snapshot.html = await readFile(await result.data.filePath, { encoding: 'utf-8' }); + } else if (mimeType.startsWith('text/') || mimeType.startsWith('application/json')) { + if ((await result.data.size) > 1024 * 1024 * 32) { + throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`); + } + snapshot.text = await readFile(await result.data.filePath, { encoding: 'utf-8' }); + snapshot.html = `
<pre>${snapshot.text}</pre>
`; + } else if (mimeType.startsWith('application/pdf')) { + snapshot.pdfs = [pathToFileURL(await result.data.filePath).href]; + } else { + throw new AssertionFailureError(`Failed to access ${urlToCrawl}: unexpected type ${mimeType}`); + } + } const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts); diff --git a/backend/functions/src/services/pdf-extract.ts b/backend/functions/src/services/pdf-extract.ts index 396ef97..d6d2abe 100644 --- a/backend/functions/src/services/pdf-extract.ts +++ b/backend/functions/src/services/pdf-extract.ts @@ -266,12 +266,12 @@ export class PDFExtractor extends AsyncService { return { meta: meta.info as Record, content: mdChunks.join(''), text: rawChunks.join('') }; } - async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24) { + async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) { if (!url) { return undefined; } - - const digest = md5Hasher.hash(url.toString()); + const nameUrl = alternativeUrl || url.toString(); + const digest = md5Hasher.hash(nameUrl); const data = url; if (typeof url === 'string' && this.isDataUrl(url)) { @@ -283,8 +283,8 @@ export class PDFExtractor extends AsyncService { if (cache) { const age = Date.now() - cache?.createdAt.valueOf(); const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance); - this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${url}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, { - url, digest, age, stale, cacheTolerance + this.logger.info(`${stale ? 
'Stale cache exists' : 'Cache hit'} for PDF ${nameUrl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, { + data: url, url: nameUrl, digest, age, stale, cacheTolerance }); if (!stale) { @@ -306,7 +306,7 @@ export class PDFExtractor extends AsyncService { text: cached.text }; } catch (err) { - this.logger.warn(`Unable to load cached content for ${url}`, { err }); + this.logger.warn(`Unable to load cached content for ${nameUrl}`, { err }); return undefined; } @@ -324,17 +324,17 @@ export class PDFExtractor extends AsyncService { PDFContent.save( PDFContent.from({ _id: theID, - src: url.toString(), + src: nameUrl, meta: extracted?.meta || {}, urlDigest: digest, createdAt: new Date(), expireAt: new Date(Date.now() + this.cacheRetentionMs) }).degradeForFireStore() ).catch((r) => { - this.logger.warn(`Unable to cache PDF content for ${url}`, { err: r }); + this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r }); }); } catch (err) { - this.logger.warn(`Unable to extract from pdf ${url}`, { err }); + this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err }); } return extracted; diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 5361aed..2e40689 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -48,6 +48,7 @@ export interface PageSnapshot { href: string; rebase?: string; html: string; + htmlModifiedByJs?: boolean; shadowExpanded?: string; text: string; status?: number; @@ -369,7 +370,9 @@ function shadowDomPresent(rootElement = document.documentElement) { return false; } +let initialHTML; function giveSnapshot(stopActiveSnapshot) { + initialHTML ??= document.documentElement?.outerHTML; if (stopActiveSnapshot) { window.haltSnapshot = true; } @@ -385,6 +388,7 @@ function giveSnapshot(stopActiveSnapshot) { description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? 
'', href: document.location.href, html: document.documentElement?.outerHTML, + htmlModifiedByJs: false, text: document.body?.innerText, shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined, parsed: parsed, @@ -392,6 +396,9 @@ function giveSnapshot(stopActiveSnapshot) { maxElemDepth: domAnalysis.maxDepth, elemCount: domAnalysis.elementCount, }; + if (initialHTML) { + r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded; + } if (document.baseURI !== r.href) { r.rebase = document.baseURI; } @@ -448,6 +455,7 @@ export class PuppeteerControl extends AsyncService { finalizerMap = new WeakMap>(); snMap = new WeakMap(); livePages = new Set(); + pagePhase = new WeakMap(); lastPageCratedAt: number = 0; rpsCap: number = 500; @@ -491,7 +499,8 @@ export class PuppeteerControl extends AsyncService { } } this.browser = await puppeteer.launch({ - timeout: 10_000 + timeout: 10_000, + args: ['--disable-dev-shm-usage'] }).catch((err: any) => { this.logger.error(`Unknown firebase issue, just die fast.`, { err }); process.nextTick(() => { @@ -611,7 +620,14 @@ export class PuppeteerControl extends AsyncService { const dt = Math.ceil((Date.now() - t0) / 1000); const rps = reqCounter / dt; // console.log(`rps: ${rps}`); + const pagePhase = this.pagePhase.get(page); + if (pagePhase === 'background') { + if (rps > 10 || reqCounter > 1000) { + halt = true; + return req.abort('blockedbyclient', 1000); + } + } if (reqCounter > 1000) { if (rps > 60 || reqCounter > 2000) { page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests` }); @@ -676,6 +692,7 @@ export class PuppeteerControl extends AsyncService { this.logger.info(`Page ${sn} created.`); this.lastPageCratedAt = Date.now(); this.livePages.add(page); + this.pagePhase.set(page, 'idle'); return page; } @@ -717,7 +734,6 @@ export class PuppeteerControl extends AsyncService { } const sn = this.snMap.get(page); this.logger.info(`Closing page ${sn}`); - 
this.livePages.delete(page); await Promise.race([ (async () => { const ctx = page.browserContext(); @@ -731,6 +747,8 @@ export class PuppeteerControl extends AsyncService { ]).catch((err) => { this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) }); }); + this.livePages.delete(page); + this.pagePhase.delete(page); } async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator { @@ -743,6 +761,7 @@ export class PuppeteerControl extends AsyncService { const pdfUrls: string[] = []; let navigationResponse: HTTPResponse | undefined; const page = await this.getNextPage(); + this.pagePhase.set(page, 'active'); page.on('response', (resp) => { if (resp.request().isNavigationRequest()) { navigationResponse = resp; @@ -802,8 +821,6 @@ export class PuppeteerControl extends AsyncService { } const sn = this.snMap.get(page); this.logger.info(`Page ${sn}: Scraping ${url}`, { url }); - - this.logger.info(`Locale setting: ${options?.locale}`); if (options?.locale) { // Add headers via request interception to walk around this bug // https://github.com/puppeteer/puppeteer/issues/10235 @@ -896,6 +913,10 @@ export class PuppeteerControl extends AsyncService { page.on('snapshot', hdl); page.once('abuse', (event: any) => { this.emit('abuse', { ...event, url: parsedUrl }); + if (snapshot?.href && parsedUrl.href !== snapshot.href) { + this.emit('abuse', { ...event, url: snapshot.href }); + } + nextSnapshotDeferred.reject( new SecurityCompromiseError(`Abuse detected: ${event.reason}`) ); @@ -1071,6 +1092,7 @@ export class PuppeteerControl extends AsyncService { } } } finally { + this.pagePhase.set(page, 'background'); (waitForPromise ? 
Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => { page.off('snapshot', hdl); this.ditchPage(page); diff --git a/backend/functions/src/services/snapshot-formatter.ts b/backend/functions/src/services/snapshot-formatter.ts index 5743738..0601ac9 100644 --- a/backend/functions/src/services/snapshot-formatter.ts +++ b/backend/functions/src/services/snapshot-formatter.ts @@ -152,7 +152,8 @@ export class SnapshotFormatter extends AsyncService { // in case of Google Web Cache content if (snapshot.pdfs?.length && (!snapshot.title || snapshot.title.startsWith('cache:'))) { const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0], - this.threadLocal.get('cacheTolerance') + this.threadLocal.get('cacheTolerance'), + snapshot.pdfs[0].startsWith('http') ? undefined : snapshot.href, ); if (pdf) { pdfMode = true; diff --git a/backend/functions/src/stand-alone/crawl.ts b/backend/functions/src/stand-alone/crawl.ts new file mode 100644 index 0000000..589ded2 --- /dev/null +++ b/backend/functions/src/stand-alone/crawl.ts @@ -0,0 +1,151 @@ +import 'reflect-metadata'; +import { container, singleton } from 'tsyringe'; +import { initializeApp, applicationDefault } from 'firebase-admin/app'; + +process.env['FIREBASE_CONFIG'] ??= JSON.stringify({ + projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc', + storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`, + credential: applicationDefault(), +}); + +initializeApp(); + + +import { Logger, CloudFunctionRegistry } from '../shared'; +import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc'; +import { ExpressServer } from 'civkit/civ-rpc/express'; +import http2 from 'http2'; +import { CrawlerHost } from '../cloud-functions/crawler'; +import { FsWalk, WalkOutEntity } from 'civkit/fswalk'; +import path from 'path'; +import fs from 'fs'; +import { mimeOfExt } from 'civkit/mime'; +import { NextFunction, Request, Response } from 'express'; + 
+process.on('unhandledRejection', (err) => { + console.error('Unhandled rejection', err); +}); + +process.on('uncaughtException', (err) => { + console.log('Uncaught exception', err); + + // Looks like Firebase runtime does not handle error properly. + // Make sure to quit the process. + console.error('Uncaught exception, process quit.'); + process.nextTick(() => process.exit(1)); +}); + +@singleton() +export class CrawlStandAloneServer extends ExpressServer { + logger = this.globalLogger.child({ service: this.constructor.name }); + + httpAlternativeServer?: typeof this['httpServer']; + assets = new Map(); + + constructor( + protected globalLogger: Logger, + protected registry: CloudFunctionRegistry, + protected crawlerHost: CrawlerHost, + ) { + super(...arguments); + + registry.allHandsOnDeck().catch(() => void 0); + registry.title = 'reader'; + registry.version = '0.1.0'; + } + + h2c() { + this.httpAlternativeServer = this.httpServer; + this.httpServer = http2.createServer(this.expressApp); + // useResourceBasedDefaultTracker(); + + return this; + } + + override async init() { + await this.walkForAssets(); + await super.init(); + } + + async walkForAssets() { + const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public')); + + for (const file of files) { + if (file.type !== 'file') { + continue; + } + this.assets.set(file.relativePath.toString(), file); + } + } + + makeAssetsServingController() { + return (req: Request, res: Response, next: NextFunction) => { + const requestPath = req.url; + const file = requestPath.slice(1); + if (!file) { + return next(); + } + + const asset = this.assets.get(file); + if (asset?.type !== 'file') { + return next(); + } + res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream'); + res.set('Content-Length', asset.stats.size.toString()); + fs.createReadStream(asset.path).pipe(res); + + return; + }; + } + + override listen(port: number) { + const r = super.listen(port); + if 
(this.httpAlternativeServer) { + const altPort = port + 1; + this.httpAlternativeServer.listen(altPort, () => { + this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`); + }); + } + + return r; + } + + override registerRoutes(): void { + + const openAPIManager = new OpenAPIManager(); + openAPIManager.document('/{url}', ['get', 'post'], this.registry.conf.get('crawl')!); + const openapiJsonPath = '/openapi.json'; + this.expressRootRouter.get(openapiJsonPath, (req, res) => { + const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`); + baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, ''); + baseURL.search = ''; + const content = openAPIManager.createOpenAPIObject(baseURL.toString(), { + info: { + title: this.registry.title, + description: `${this.registry.title} openAPI documentations`, + 'x-logo': { + url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png` + } + } + }, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any); + res.statusCode = 200; + res.end(JSON.stringify(content)); + }); + + this.expressRootRouter.use('/', ...this.registry.expressMiddlewares, this.makeAssetsServingController(), this.registry.makeShimController('crawl')); + } + + protected override featureSelect(): void { + this.insertAsyncHookMiddleware(); + this.insertHealthCheckMiddleware(this.healthCheckEndpoint); + this.insertLogRequestsMiddleware(); + this.registerOpenAPIDocsRoutes('/docs'); + + this.registerRoutes(); + } +} +const instance = container.resolve(CrawlStandAloneServer); + +export default instance; + +instance.serviceReady().then((s) => s.listen(parseInt(process.env.PORT || '') || 3000)); diff --git a/backend/functions/src/stand-alone/search.ts b/backend/functions/src/stand-alone/search.ts new file mode 100644 index 0000000..ab04d5f --- /dev/null +++ 
b/backend/functions/src/stand-alone/search.ts
@@ -0,0 +1,151 @@
import 'reflect-metadata';
import { container, singleton } from 'tsyringe';
import { initializeApp, applicationDefault } from 'firebase-admin/app';

// Provide a default Firebase config when none is injected by the runtime.
// NOTE(review): `credential` is not a standard FIREBASE_CONFIG key and
// applicationDefault() returns a Credential object that likely does not
// round-trip through JSON.stringify — confirm it is actually needed here.
process.env['FIREBASE_CONFIG'] ??= JSON.stringify({
    projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc',
    storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`,
    credential: applicationDefault(),
});

initializeApp();


import { Logger, CloudFunctionRegistry } from '../shared';
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
import { ExpressServer } from 'civkit/civ-rpc/express';
import http2 from 'http2';
import { SearcherHost } from '../cloud-functions/searcher';
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
import path from 'path';
import fs from 'fs';
import { mimeOfExt } from 'civkit/mime';
import { NextFunction, Request, Response } from 'express';

process.on('unhandledRejection', (err) => {
    console.error('Unhandled rejection', err);
});

process.on('uncaughtException', (err) => {
    // console.error (not console.log) so the report goes to stderr,
    // consistent with the unhandledRejection handler above.
    console.error('Uncaught exception', err);

    // Looks like Firebase runtime does not handle error properly.
    // Make sure to quit the process.
    console.error('Uncaught exception, process quit.');
    process.nextTick(() => process.exit(1));
});

/**
 * Stand-alone Express server exposing the searcher behind the root route,
 * serving static files from ../../public and publishing an OpenAPI document
 * at /openapi.json. Optionally runs HTTP/2 cleartext via h2c().
 */
@singleton()
export class SearchStandAloneServer extends ExpressServer {
    logger = this.globalLogger.child({ service: this.constructor.name });

    // Set by h2c(): keeps the original HTTP/1 server so listen() can also bind it on port + 1.
    httpAlternativeServer?: typeof this['httpServer'];
    // Regular files found under ../../public, keyed by path relative to that directory.
    assets = new Map();

    constructor(
        protected globalLogger: Logger,
        protected registry: CloudFunctionRegistry,
        protected searcherHost: SearcherHost,
    ) {
        super(...arguments);

        // Fire-and-forget warm-up; failures are deliberately swallowed.
        registry.allHandsOnDeck().catch(() => void 0);
        registry.title = 'reader';
        registry.version = '0.1.0';
    }

    /**
     * Switch the primary server to HTTP/2 cleartext (h2c); the previous HTTP/1
     * server is retained as httpAlternativeServer.
     */
    h2c() {
        this.httpAlternativeServer = this.httpServer;
        this.httpServer = http2.createServer(this.expressApp);
        // useResourceBasedDefaultTracker();

        return this;
    }

    override async init() {
        await this.walkForAssets();
        await super.init();
    }

    /** Index every regular file under ../../public into this.assets. */
    async walkForAssets() {
        const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));

        for (const file of files) {
            if (file.type !== 'file') {
                continue;
            }
            this.assets.set(file.relativePath.toString(), file);
        }
    }

    /**
     * Middleware serving the pre-indexed public assets.
     * Falls through to next() for '/' and for paths not present in the index.
     */
    makeAssetsServingController() {
        return (req: Request, res: Response, next: NextFunction) => {
            // req.path (not req.url) so a query string, e.g. /favicon.ico?v=2,
            // does not defeat the Map lookup.
            const requestPath = req.path;
            const file = requestPath.slice(1);
            if (!file) {
                return next();
            }

            const asset = this.assets.get(file);
            if (asset?.type !== 'file') {
                return next();
            }
            res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
            res.set('Content-Length', asset.stats.size.toString());
            fs.createReadStream(asset.path).pipe(res);

            return;
        };
    }

    override listen(port: number) {
        const r = super.listen(port);
        if (this.httpAlternativeServer) {
            const altPort = port + 1;
            this.httpAlternativeServer.listen(altPort, () => {
                this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
            });
        }

        return r;
    }

    override registerRoutes(): void {
        const openAPIManager = new OpenAPIManager();
        openAPIManager.document('/{q}', ['get', 'post'], this.registry.conf.get('search')!);
        const openapiJsonPath = '/openapi.json';
        // Escape regex metacharacters so the '.' in '/openapi.json' cannot
        // accidentally match arbitrary characters.
        const openapiJsonPattern = new RegExp(`${openapiJsonPath.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}$`, 'i');
        this.expressRootRouter.get(openapiJsonPath, (req, res) => {
            // Derive the API base URL from the request, with the docs path and trailing slashes stripped.
            const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
            baseURL.pathname = baseURL.pathname.replace(openapiJsonPattern, '').replace(/\/+$/g, '');
            baseURL.search = '';
            const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
                info: {
                    title: this.registry.title,
                    description: `${this.registry.title} openAPI documentations`,
                    'x-logo': {
                        url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
                    }
                }
            }, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
            res.statusCode = 200;
            res.end(JSON.stringify(content));
        });

        this.expressRootRouter.use('/', ...this.registry.expressMiddlewares, this.makeAssetsServingController(), this.registry.makeShimController('search'));
    }

    protected override featureSelect(): void {
        this.insertAsyncHookMiddleware();
        this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
        this.insertLogRequestsMiddleware();
        this.registerOpenAPIDocsRoutes('/docs');

        this.registerRoutes();
    }
}
const instance = container.resolve(SearchStandAloneServer);

export default instance;

instance.serviceReady().then((s) => s.listen(parseInt(process.env.PORT || '') || 3000));