deployment: dedicated server script for cloud-run (#1139)

* refactor: domain profile and attempt direct engine

* fix: direct engine

* fix: abuse in background phase

* fix

* wip

* use curl-impersonate in custom image

* local pdf for curl

* listen port from env

* fix

* fix

* fix

* fix: ditch http2

* cd: using gh action

* ci: token for thinapps-shared

* ci: setup node lock file path

* ci: tweak

* ci: mmdb

* ci: docker build

* fix: ci

* fix: ci
This commit is contained in:
Yanlong Wang 2025-02-05 14:50:18 +08:00 committed by GitHub
parent a453ab5f16
commit 6a58de590c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 639 additions and 86 deletions

76
.github/workflows/cd.yml vendored Normal file
View File

@ -0,0 +1,76 @@
# CD pipeline: build the reader container image, push it to Artifact
# Registry, and deploy the crawl/search services to Cloud Run.
run-name: Build push and deploy (CD)
# Trigger on pushes to main / ci-debug and on any tag.
on:
push:
branches:
- main
- ci-debug
tags:
- '*'
jobs:
build-and-push-to-gcr:
runs-on: ubuntu-latest
# One in-flight run per branch; a newer push cancels the older build.
concurrency:
group: ${{ github.ref_type == 'branch' && github.ref }}
cancel-in-progress: true
defaults:
run:
working-directory: backend/functions
permissions:
contents: read
steps:
# Full checkout incl. LFS objects and submodules; the token grants
# read access to the private thinapps-shared submodule.
- uses: actions/checkout@v4
with:
lfs: true
submodules: true
token: ${{ secrets.THINAPPS_SHARED_READ_TOKEN }}
# Authenticate to Google Cloud with a service-account key JSON.
- uses: 'google-github-actions/auth@v2'
with:
credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}'
- name: 'Set up Cloud SDK'
uses: 'google-github-actions/setup-gcloud@v2'
# Allow docker to push to the us-docker.pkg.dev registry.
- name: "Docker auth"
run: |-
gcloud auth configure-docker us-docker.pkg.dev --quiet
# RELEASE_VERSION is the ref short name, e.g. refs/tags/v1.2.3 -> v1.2.3.
- name: Set controller release version
run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
- name: Set up Node.js
uses: actions/setup-node@v4
with:
node-version: 22.12.0
cache: npm
cache-dependency-path: backend/functions/package-lock.json
- name: npm install
run: npm ci
# The GeoIP database is not checked in; fetch it at build time.
# -fSL: fail on HTTP errors and follow redirects — github.com /raw/ URLs
# answer with a 302 to raw.githubusercontent.com, so a bare `curl -o`
# would save the redirect response instead of the mmdb payload.
- name: get maxmind mmdb
run: mkdir -p licensed && curl -fSL -o licensed/GeoLite2-City.mmdb https://github.com/P3TERX/GeoLite.mmdb/raw/download/GeoLite2-City.mmdb
- name: build application
run: npm run build
# Stamp package.json with the release version for tagged builds only.
- name: Set package version
run: npm version --no-git-tag-version ${{ env.RELEASE_VERSION }}
if: github.ref_type == 'tag'
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: |
us-docker.pkg.dev/reader-6b7dc/jina-reader/reader
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build and push
id: container
uses: docker/build-push-action@v6
with:
context: backend/functions
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
# Deploy the same image as two Cloud Run services; only the entry script
# passed via --args differs. --async returns without waiting for rollout.
- name: Deploy CRAWL with Tag
run: |
gcloud run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0
- name: Deploy SEARCH with Tag
run: |
gcloud run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0

View File

@ -0,0 +1 @@
node_modules/

View File

@ -0,0 +1,37 @@
# syntax=docker/dockerfile:1
# Stage 0: prebuilt curl-impersonate image; only the shared library is
# copied out of it into the runtime image below.
FROM lwthiker/curl-impersonate:0.6-chrome-slim-bullseye
# Runtime stage: Node 20 plus Google Chrome (for puppeteer) and CJK/Thai
# fonts so rendered pages and screenshots have proper glyph coverage.
FROM node:20
RUN apt-get update \
&& apt-get install -y wget gnupg \
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
&& sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
&& apt-get update \
&& apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 \
--no-install-recommends \
&& rm -rf /var/lib/apt/lists/*
# Bring libcurl-impersonate in from stage 0 (referenced by index).
COPY --from=0 /usr/local/lib/libcurl-impersonate.so /usr/local/lib/libcurl-impersonate.so
# Run as an unprivileged user; audio/video groups are for Chrome.
RUN groupadd -r jina
RUN useradd -g jina -G audio,video -m jina
USER jina
WORKDIR /app
# Install deps from the lockfile before copying build output, so the
# dependency layer caches independently of app code changes.
COPY package.json package-lock.json ./
RUN npm ci
COPY build ./build
COPY public ./public
COPY licensed ./licensed
# Start with a fresh (empty) Chromium profile directory.
RUN rm -rf ~/.config/chromium && mkdir -p ~/.config/chromium
# LD_PRELOAD routes libcurl calls through curl-impersonate with a Chrome 116
# TLS fingerprint; CURL_IMPERSONATE_HEADERS=no lets the app set its own
# HTTP headers instead of the impersonated defaults.
ENV LD_PRELOAD=/usr/local/lib/libcurl-impersonate.so CURL_IMPERSONATE=chrome116 CURL_IMPERSONATE_HEADERS=no
# Cloud Run supplies/overrides $PORT; extra EXPOSE ports are for local use.
ENV PORT=8080
EXPOSE 3000 3001 8080 8081
ENTRYPOINT ["node"]
CMD [ "build/stand-alone/crawl.js" ]

View File

@ -16,7 +16,7 @@
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.2-4c0357a",
"civkit": "^0.8.2-03243fe",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
@ -3979,9 +3979,9 @@
}
},
"node_modules/civkit": {
"version": "0.8.2-4c0357a",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-4c0357a.tgz",
"integrity": "sha512-8/RcapAm8YYImf+YVBRhybEFuSuV5Pg1p/s6Niql3VAY2cV1/OC1fTCDZY689yeq8zFcwxwBvaqyIEGo69F+IA==",
"version": "0.8.2-03243fe",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-03243fe.tgz",
"integrity": "sha512-hoTxGeGdD27iOCDi51cVY0PHlRN3OSC640QRJ1YSmD42o+LP7mZtbdy8dN7j/FSkPP/5yLuB2ch9BMSOp54POQ==",
"license": "AGPL",
"dependencies": {
"lodash": "^4.17.21",

View File

@ -36,7 +36,7 @@
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.2-4c0357a",
"civkit": "^0.8.2-03243fe",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

View File

@ -84,6 +84,8 @@ export class CrawlerHost extends RPCHost {
Reflect.set(snapshot, 'locale', options.locale);
}
await this.setToCache(options.url, snapshot);
await this.exploreDirectEngine(snapshot).catch(() => undefined);
});
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
@ -581,9 +583,14 @@ export class CrawlerHost extends RPCHost {
if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.READER_LM)) {
const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
...crawlOpts, engine: ENGINE_TYPE.AUTO
...crawlOpts,
engine: crawlOpts?.engine || ENGINE_TYPE.AUTO,
}, crawlerOpts);
if (!finalAutoSnapshot?.html) {
throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`);
}
if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
@ -628,18 +635,9 @@ export class CrawlerHost extends RPCHost {
return;
}
if (crawlOpts?.engine?.startsWith(ENGINE_TYPE.DIRECT)) {
const engine = crawlOpts?.engine;
try {
const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
yield snapshot;
return;
} catch (err) {
if (!engine.endsWith('?')) {
throw err;
}
}
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
return;
}
let cache;
@ -658,6 +656,24 @@ export class CrawlerHost extends RPCHost {
return;
}
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
const { digest } = this.getDomainProfileUrlDigest(urlToCrawl);
const domainProfile = await DomainProfile.fromFirestore(digest);
if (domainProfile?.engine === ENGINE_TYPE.DIRECT) {
try {
const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
// Expect downstream code to "break" here if it's satisfied with the direct engine
yield snapshot;
if (crawlOpts?.engine === ENGINE_TYPE.AUTO) {
return;
}
} catch (err: any) {
this.logger.warn(`Failed to scrap ${urlToCrawl} with direct engine`, { err: marshalErrorLike(err) });
}
}
}
try {
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) {
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
@ -855,7 +871,7 @@ export class CrawlerHost extends RPCHost {
}
async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
const it = this.cachedScrap(url, { ...opts, engine: ENGINE_TYPE.BROWSER }, crawlerOptions);
const it = this.cachedScrap(url, opts, crawlerOptions);
let lastSnapshot;
let lastError;
@ -912,36 +928,54 @@ export class CrawlerHost extends RPCHost {
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
}
async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) {
const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions, true);
async exploreDirectEngine(knownSnapshot: PageSnapshot) {
const realUrl = new URL(knownSnapshot.href);
const { digest, path } = this.getDomainProfileUrlDigest(realUrl);
const profile = await DomainProfile.fromFirestore(digest);
const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot);
if (!profile) {
const record = DomainProfile.from({
_id: digest,
origin: realUrl.origin.toLowerCase(),
path,
triggerUrl: realUrl.href,
engine: knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT,
createdAt: new Date(),
expireAt: new Date(Date.now() + this.domainProfileRetentionMs),
});
await DomainProfile.save(record);
let engine = ENGINE_TYPE.DIRECT;
if (!(thisFormatted.content && knownFormatted.content &&
thisFormatted.content.trim() === knownFormatted.content.trim())) {
engine = ENGINE_TYPE.BROWSER;
return;
}
const realUrl = new URL(knownSnapshot.href);
const profile = (await DomainProfile.fromFirestoreQuery(
DomainProfile.COLLECTION
.where('domain', '==', targetUrl.origin.toLowerCase())
.limit(1)
))[0] || new DomainProfile();
if (profile.engine === ENGINE_TYPE.BROWSER) {
// Mixed engine, always use browser
return;
}
profile.origin = realUrl.origin.toLowerCase();
profile.triggerReason ??= 'Auto Explore';
profile.triggerUrl = realUrl.href;
profile.engine = engine;
profile.createdAt ??= new Date();
profile.path = path;
profile.engine = knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT;
profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs);
await DomainProfile.save(profile);
return true;
return;
}
/**
 * Derive the DomainProfile cache key for a URL.
 *
 * The key is the lower-cased origin plus the parent directory of the path,
 * so sibling pages under the same directory share one profile record.
 * Returns both the md5 digest (document id) and the normalized path.
 */
getDomainProfileUrlDigest(url: URL) {
    const pathname = url.pathname;
    const pathVec = pathname.split('/');
    const parentPath = pathVec.slice(0, -1).join('/');
    // Root-level pages ('/foo' -> parent '') fall back to the full pathname.
    const finalPath = parentPath || pathname;
    // Use locale-independent lowercasing: toLocaleLowerCase() can differ by
    // process locale (e.g. Turkish dotless i), while the profile records
    // elsewhere normalize origin with plain toLowerCase() — the digest and
    // the stored origin must agree.
    const key = url.origin.toLowerCase() + finalPath;
    return {
        digest: md5Hasher.hash(key),
        path: finalPath,
    };
}
}

View File

@ -13,10 +13,7 @@ export class DomainProfile extends FirestoreRecord {
@Prop({
required: true
})
origin!: string;
@Prop({ required: true })
triggerReason!: string;
path!: string;
@Prop()
triggerUrl?: string;

View File

@ -439,7 +439,7 @@ export class CrawlerOptions extends AutoCastable {
instance.engine = ENGINE_TYPE.BROWSER;
instance.respondWith = CONTENT_FORMAT.VLM;
} else if (instance.engine === ENGINE_TYPE.READER_LM) {
instance.engine = undefined;
instance.engine = ENGINE_TYPE.AUTO;
instance.respondWith = CONTENT_FORMAT.READER_LM;
}
@ -496,10 +496,6 @@ export class CrawlerOptions extends AutoCastable {
instance.cacheTolerance = instance.cacheTolerance * 1000;
}
if (instance.noCache || !instance.isTypicalRequest()) {
instance.engine ??= ENGINE_TYPE.BROWSER + '?';
}
return instance;
}
@ -544,13 +540,19 @@ export class CrawlerOptions extends AutoCastable {
return !CONTENT_FORMAT_VALUES.has(this.respondWith);
}
isTypicalRequest() {
browserIsNotRequired() {
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
return false;
}
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
return false;
}
if (this.waitForSelector?.length) {
return false;
}
if (this.withIframe || this.withShadowDom) {
return false;
}
if (this.viewport) {
return false;
}

View File

@ -2,11 +2,14 @@ import { marshalErrorLike } from 'civkit/lang';
import { AsyncService } from 'civkit/async-service';
import { singleton } from 'tsyringe';
import { Curl, HeaderInfo } from 'node-libcurl';
import { Curl, CurlFeature, HeaderInfo } from 'node-libcurl';
import { PageSnapshot, ScrappingOptions } from './puppeteer';
import { Logger } from '../shared/services/logger';
import { JSDomControl } from './jsdom';
import { AssertionFailureError } from 'civkit';
import { AssertionFailureError, FancyFile } from 'civkit';
import { TempFileManager } from '../shared';
import { readFile } from 'fs/promises';
import { pathToFileURL } from 'url';
@singleton()
export class CurlControl extends AsyncService {
@ -16,6 +19,7 @@ export class CurlControl extends AsyncService {
constructor(
protected globalLogger: Logger,
protected jsdomControl: JSDomControl,
protected tempFileManager: TempFileManager,
) {
super(...arguments);
}
@ -26,25 +30,55 @@ export class CurlControl extends AsyncService {
this.emit('ready');
}
curlImpersonateHeader(curl: Curl, headers?: object, chromeVersion: number = 132) {
const mixinHeaders = {
'sch-ch-ua': `Not A(Brand";v="8", "Chromium";v="${chromeVersion}", "Google Chrome";v="${chromeVersion}"`,
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': 'Windows',
'Upgrade-Insecure-Requests': '1',
'User-Agent': `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${chromeVersion}.0.0.0 Safari/537.36`,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9',
};
curl.setOpt(Curl.option.HTTPHEADER, Object.entries({ ...mixinHeaders, ...headers }).map(([k, v]) => `${k}: ${v}`));
return curl;
}
async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise<PageSnapshot> {
const snapshot = {
href: urlToCrawl.toString(),
html: '',
title: '',
text: '',
} as PageSnapshot;
const result = await new Promise<{
statusCode: number,
data: string,
data?: FancyFile,
headers: Buffer | HeaderInfo[],
}>((resolve, reject) => {
const curl = new Curl();
curl.enable(CurlFeature.StreamResponse);
curl.setOpt('URL', urlToCrawl.toString());
curl.setOpt(Curl.option.FOLLOWLOCATION, true);
if (crawlOpts?.timeoutMs) {
curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs);
}
curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(10_000, crawlOpts?.timeoutMs || 10_000));
if (crawlOpts?.overrideUserAgent) {
curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
}
if (crawlOpts?.extraHeaders) {
curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
}
this.curlImpersonateHeader(curl, crawlOpts?.extraHeaders);
// if (crawlOpts?.extraHeaders) {
// curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
// }
if (crawlOpts?.proxyUrl) {
curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
}
@ -56,35 +90,82 @@ export class CurlControl extends AsyncService {
curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
}
curl.on('end', (statusCode, data, headers) => {
curl.on('end', (statusCode, _data, headers) => {
this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers });
resolve({
statusCode,
data: data.toString(),
headers,
});
curl.close();
});
curl.on('error', (err) => {
this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) });
curl.close();
this.logger.warn(`Curl ${urlToCrawl}: ${err} (Not necessarily an error)`, { err: marshalErrorLike(err) });
reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`));
});
curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB
let status = -1;
let contentType = '';
curl.on('stream', (stream, statusCode, headers) => {
status = statusCode;
outerLoop:
for (const headerVec of headers) {
for (const [k, v] of Object.entries(headerVec)) {
if (k.toLowerCase() === 'content-type') {
contentType = v.toLowerCase();
break outerLoop;
}
}
}
if (!contentType) {
reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: no content-type`));
stream.destroy();
return;
}
if (contentType.startsWith('image/')) {
snapshot.html = `<html style="height: 100%;"><head><meta name="viewport" content="width=device-width, minimum-scale=0.1"><title>${urlToCrawl.origin}${urlToCrawl.pathname}</title></head><body style="margin: 0px; height: 100%; background-color: rgb(14, 14, 14);"><img style="display: block;-webkit-user-select: none;margin: auto;background-color: hsl(0, 0%, 90%);transition: background-color 300ms;" src="${urlToCrawl.href}"></body></html>`;
stream.destroy();
resolve({
statusCode: status,
headers,
});
return;
}
const fpath = this.tempFileManager.alloc();
const fancyFile = FancyFile.auto(stream, fpath);
this.tempFileManager.bindPathTo(fancyFile, fpath);
resolve({
statusCode: status,
data: fancyFile,
headers,
});
});
curl.perform();
});
if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`);
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`);
}
const snapshot = {
href: urlToCrawl.toString(),
html: result.data,
title: '',
text: '',
} as PageSnapshot;
if (result.data) {
const mimeType: string = await result.data.mimeType;
if (mimeType.startsWith('text/html')) {
if ((await result.data.size) > 1024 * 1024 * 32) {
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
}
snapshot.html = await readFile(await result.data.filePath, { encoding: 'utf-8' });
} else if (mimeType.startsWith('text/') || mimeType.startsWith('application/json')) {
if ((await result.data.size) > 1024 * 1024 * 32) {
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
}
snapshot.text = await readFile(await result.data.filePath, { encoding: 'utf-8' });
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
} else if (mimeType.startsWith('application/pdf')) {
snapshot.pdfs = [pathToFileURL(await result.data.filePath).href];
} else {
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: unexpected type ${mimeType}`);
}
}
const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);

View File

@ -266,12 +266,12 @@ export class PDFExtractor extends AsyncService {
return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
}
async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24) {
async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) {
if (!url) {
return undefined;
}
const digest = md5Hasher.hash(url.toString());
const nameUrl = alternativeUrl || url.toString();
const digest = md5Hasher.hash(nameUrl);
const data = url;
if (typeof url === 'string' && this.isDataUrl(url)) {
@ -283,8 +283,8 @@ export class PDFExtractor extends AsyncService {
if (cache) {
const age = Date.now() - cache?.createdAt.valueOf();
const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${url}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
url, digest, age, stale, cacheTolerance
this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${nameUrl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
data: url, url: nameUrl, digest, age, stale, cacheTolerance
});
if (!stale) {
@ -306,7 +306,7 @@ export class PDFExtractor extends AsyncService {
text: cached.text
};
} catch (err) {
this.logger.warn(`Unable to load cached content for ${url}`, { err });
this.logger.warn(`Unable to load cached content for ${nameUrl}`, { err });
return undefined;
}
@ -324,17 +324,17 @@ export class PDFExtractor extends AsyncService {
PDFContent.save(
PDFContent.from({
_id: theID,
src: url.toString(),
src: nameUrl,
meta: extracted?.meta || {},
urlDigest: digest,
createdAt: new Date(),
expireAt: new Date(Date.now() + this.cacheRetentionMs)
}).degradeForFireStore()
).catch((r) => {
this.logger.warn(`Unable to cache PDF content for ${url}`, { err: r });
this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
});
} catch (err) {
this.logger.warn(`Unable to extract from pdf ${url}`, { err });
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
}
return extracted;

View File

@ -48,6 +48,7 @@ export interface PageSnapshot {
href: string;
rebase?: string;
html: string;
htmlModifiedByJs?: boolean;
shadowExpanded?: string;
text: string;
status?: number;
@ -369,7 +370,9 @@ function shadowDomPresent(rootElement = document.documentElement) {
return false;
}
let initialHTML;
function giveSnapshot(stopActiveSnapshot) {
initialHTML ??= document.documentElement?.outerHTML;
if (stopActiveSnapshot) {
window.haltSnapshot = true;
}
@ -385,6 +388,7 @@ function giveSnapshot(stopActiveSnapshot) {
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
href: document.location.href,
html: document.documentElement?.outerHTML,
htmlModifiedByJs: false,
text: document.body?.innerText,
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
parsed: parsed,
@ -392,6 +396,9 @@ function giveSnapshot(stopActiveSnapshot) {
maxElemDepth: domAnalysis.maxDepth,
elemCount: domAnalysis.elementCount,
};
if (initialHTML) {
r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
}
if (document.baseURI !== r.href) {
r.rebase = document.baseURI;
}
@ -448,6 +455,7 @@ export class PuppeteerControl extends AsyncService {
finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
snMap = new WeakMap<Page, number>();
livePages = new Set<Page>();
pagePhase = new WeakMap<Page, 'idle' | 'active' | 'background'>();
lastPageCratedAt: number = 0;
rpsCap: number = 500;
@ -491,7 +499,8 @@ export class PuppeteerControl extends AsyncService {
}
}
this.browser = await puppeteer.launch({
timeout: 10_000
timeout: 10_000,
args: ['--disable-dev-shm-usage']
}).catch((err: any) => {
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
process.nextTick(() => {
@ -611,7 +620,14 @@ export class PuppeteerControl extends AsyncService {
const dt = Math.ceil((Date.now() - t0) / 1000);
const rps = reqCounter / dt;
// console.log(`rps: ${rps}`);
const pagePhase = this.pagePhase.get(page);
if (pagePhase === 'background') {
if (rps > 10 || reqCounter > 1000) {
halt = true;
return req.abort('blockedbyclient', 1000);
}
}
if (reqCounter > 1000) {
if (rps > 60 || reqCounter > 2000) {
page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests` });
@ -676,6 +692,7 @@ export class PuppeteerControl extends AsyncService {
this.logger.info(`Page ${sn} created.`);
this.lastPageCratedAt = Date.now();
this.livePages.add(page);
this.pagePhase.set(page, 'idle');
return page;
}
@ -717,7 +734,6 @@ export class PuppeteerControl extends AsyncService {
}
const sn = this.snMap.get(page);
this.logger.info(`Closing page ${sn}`);
this.livePages.delete(page);
await Promise.race([
(async () => {
const ctx = page.browserContext();
@ -731,6 +747,8 @@ export class PuppeteerControl extends AsyncService {
]).catch((err) => {
this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) });
});
this.livePages.delete(page);
this.pagePhase.delete(page);
}
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
@ -743,6 +761,7 @@ export class PuppeteerControl extends AsyncService {
const pdfUrls: string[] = [];
let navigationResponse: HTTPResponse | undefined;
const page = await this.getNextPage();
this.pagePhase.set(page, 'active');
page.on('response', (resp) => {
if (resp.request().isNavigationRequest()) {
navigationResponse = resp;
@ -802,8 +821,6 @@ export class PuppeteerControl extends AsyncService {
}
const sn = this.snMap.get(page);
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
this.logger.info(`Locale setting: ${options?.locale}`);
if (options?.locale) {
// Add headers via request interception to walk around this bug
// https://github.com/puppeteer/puppeteer/issues/10235
@ -896,6 +913,10 @@ export class PuppeteerControl extends AsyncService {
page.on('snapshot', hdl);
page.once('abuse', (event: any) => {
this.emit('abuse', { ...event, url: parsedUrl });
if (snapshot?.href && parsedUrl.href !== snapshot.href) {
this.emit('abuse', { ...event, url: snapshot.href });
}
nextSnapshotDeferred.reject(
new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
);
@ -1071,6 +1092,7 @@ export class PuppeteerControl extends AsyncService {
}
}
} finally {
this.pagePhase.set(page, 'background');
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
page.off('snapshot', hdl);
this.ditchPage(page);

View File

@ -152,7 +152,8 @@ export class SnapshotFormatter extends AsyncService {
// in case of Google Web Cache content
if (snapshot.pdfs?.length && (!snapshot.title || snapshot.title.startsWith('cache:'))) {
const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
this.threadLocal.get('cacheTolerance')
this.threadLocal.get('cacheTolerance'),
snapshot.pdfs[0].startsWith('http') ? undefined : snapshot.href,
);
if (pdf) {
pdfMode = true;

View File

@ -0,0 +1,151 @@
// Stand-alone entry point serving the crawler as a plain Express app
// (for Cloud Run), instead of as a Firebase Cloud Function.
import 'reflect-metadata';
import { container, singleton } from 'tsyringe';
import { initializeApp, applicationDefault } from 'firebase-admin/app';
// Provide a default Firebase config when not running inside the Firebase
// runtime, so initializeApp() below succeeds on Cloud Run.
process.env['FIREBASE_CONFIG'] ??= JSON.stringify({
projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc',
storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`,
credential: applicationDefault(),
});
initializeApp();
import { Logger, CloudFunctionRegistry } from '../shared';
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
import { ExpressServer } from 'civkit/civ-rpc/express';
import http2 from 'http2';
import { CrawlerHost } from '../cloud-functions/crawler';
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
import path from 'path';
import fs from 'fs';
import { mimeOfExt } from 'civkit/mime';
import { NextFunction, Request, Response } from 'express';
process.on('unhandledRejection', (err) => {
console.error('Unhandled rejection', err);
});
process.on('uncaughtException', (err) => {
console.log('Uncaught exception', err);
// Looks like Firebase runtime does not handle error properly.
// Make sure to quit the process.
console.error('Uncaught exception, process quit.');
process.nextTick(() => process.exit(1));
});
@singleton()
export class CrawlStandAloneServer extends ExpressServer {
logger = this.globalLogger.child({ service: this.constructor.name });
// Set by h2c() when the primary server is switched to HTTP/2 cleartext.
httpAlternativeServer?: typeof this['httpServer'];
// Static files discovered under ../../public, keyed by relative path.
assets = new Map<string, WalkOutEntity>();
constructor(
protected globalLogger: Logger,
protected registry: CloudFunctionRegistry,
protected crawlerHost: CrawlerHost,
) {
super(...arguments);
// Fire-and-forget warm-up of registered handlers; failures are ignored.
registry.allHandsOnDeck().catch(() => void 0);
registry.title = 'reader';
registry.version = '0.1.0';
}
// Switch the primary server to HTTP/2 cleartext, keeping the original
// HTTP/1 server so listen() can expose it on port + 1.
h2c() {
this.httpAlternativeServer = this.httpServer;
this.httpServer = http2.createServer(this.expressApp);
// useResourceBasedDefaultTracker();
return this;
}
override async init() {
await this.walkForAssets();
await super.init();
}
// Index every regular file under public/ for static serving.
async walkForAssets() {
const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));
for (const file of files) {
if (file.type !== 'file') {
continue;
}
this.assets.set(file.relativePath.toString(), file);
}
}
// Middleware serving the pre-indexed assets; falls through to the next
// handler when the request path is not a known file.
makeAssetsServingController() {
return (req: Request, res: Response, next: NextFunction) => {
const requestPath = req.url;
const file = requestPath.slice(1);
if (!file) {
return next();
}
const asset = this.assets.get(file);
if (asset?.type !== 'file') {
return next();
}
res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
res.set('Content-Length', asset.stats.size.toString());
fs.createReadStream(asset.path).pipe(res);
return;
};
}
// Listen on `port`; when h2c() was used, the retained HTTP/1 server is
// additionally bound on port + 1.
override listen(port: number) {
const r = super.listen(port);
if (this.httpAlternativeServer) {
const altPort = port + 1;
this.httpAlternativeServer.listen(altPort, () => {
this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
});
}
return r;
}
override registerRoutes(): void {
const openAPIManager = new OpenAPIManager();
openAPIManager.document('/{url}', ['get', 'post'], this.registry.conf.get('crawl')!);
const openapiJsonPath = '/openapi.json';
// Serve an OpenAPI document whose server URL is derived from the request.
this.expressRootRouter.get(openapiJsonPath, (req, res) => {
const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, '');
baseURL.search = '';
const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
info: {
title: this.registry.title,
description: `${this.registry.title} openAPI documentations`,
'x-logo': {
url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
}
}
}, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
res.statusCode = 200;
res.end(JSON.stringify(content));
});
// Assets take precedence; everything else goes to the 'crawl' handler.
this.expressRootRouter.use('/', ...this.registry.expressMiddlewares, this.makeAssetsServingController(), this.registry.makeShimController('crawl'));
}
protected override featureSelect(): void {
this.insertAsyncHookMiddleware();
this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
this.insertLogRequestsMiddleware();
this.registerOpenAPIDocsRoutes('/docs');
this.registerRoutes();
}
}
const instance = container.resolve(CrawlStandAloneServer);
export default instance;
// Cloud Run supplies PORT; default to 3000 for local runs.
instance.serviceReady().then((s) => s.listen(parseInt(process.env.PORT || '') || 3000));

View File

@ -0,0 +1,151 @@
// Stand-alone entry point serving the searcher as a plain Express app
// (for Cloud Run), instead of as a Firebase Cloud Function.
import 'reflect-metadata';
import { container, singleton } from 'tsyringe';
import { initializeApp, applicationDefault } from 'firebase-admin/app';
// Provide a default Firebase config when not running inside the Firebase
// runtime, so initializeApp() below succeeds on Cloud Run.
process.env['FIREBASE_CONFIG'] ??= JSON.stringify({
projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc',
storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`,
credential: applicationDefault(),
});
initializeApp();
import { Logger, CloudFunctionRegistry } from '../shared';
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
import { ExpressServer } from 'civkit/civ-rpc/express';
import http2 from 'http2';
import { SearcherHost } from '../cloud-functions/searcher';
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
import path from 'path';
import fs from 'fs';
import { mimeOfExt } from 'civkit/mime';
import { NextFunction, Request, Response } from 'express';
process.on('unhandledRejection', (err) => {
console.error('Unhandled rejection', err);
});
process.on('uncaughtException', (err) => {
console.log('Uncaught exception', err);
// Looks like Firebase runtime does not handle error properly.
// Make sure to quit the process.
console.error('Uncaught exception, process quit.');
process.nextTick(() => process.exit(1));
});
@singleton()
export class SearchStandAloneServer extends ExpressServer {
logger = this.globalLogger.child({ service: this.constructor.name });
// Set by h2c() when the primary server is switched to HTTP/2 cleartext.
httpAlternativeServer?: typeof this['httpServer'];
// Static files discovered under ../../public, keyed by relative path.
assets = new Map<string, WalkOutEntity>();
constructor(
protected globalLogger: Logger,
protected registry: CloudFunctionRegistry,
protected searcherHost: SearcherHost,
) {
super(...arguments);
// Fire-and-forget warm-up of registered handlers; failures are ignored.
registry.allHandsOnDeck().catch(() => void 0);
registry.title = 'reader';
registry.version = '0.1.0';
}
// Switch the primary server to HTTP/2 cleartext, keeping the original
// HTTP/1 server so listen() can expose it on port + 1.
h2c() {
this.httpAlternativeServer = this.httpServer;
this.httpServer = http2.createServer(this.expressApp);
// useResourceBasedDefaultTracker();
return this;
}
override async init() {
await this.walkForAssets();
await super.init();
}
// Index every regular file under public/ for static serving.
async walkForAssets() {
const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));
for (const file of files) {
if (file.type !== 'file') {
continue;
}
this.assets.set(file.relativePath.toString(), file);
}
}
// Middleware serving the pre-indexed assets; falls through to the next
// handler when the request path is not a known file.
makeAssetsServingController() {
return (req: Request, res: Response, next: NextFunction) => {
const requestPath = req.url;
const file = requestPath.slice(1);
if (!file) {
return next();
}
const asset = this.assets.get(file);
if (asset?.type !== 'file') {
return next();
}
res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
res.set('Content-Length', asset.stats.size.toString());
fs.createReadStream(asset.path).pipe(res);
return;
};
}
// Listen on `port`; when h2c() was used, the retained HTTP/1 server is
// additionally bound on port + 1.
override listen(port: number) {
const r = super.listen(port);
if (this.httpAlternativeServer) {
const altPort = port + 1;
this.httpAlternativeServer.listen(altPort, () => {
this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
});
}
return r;
}
override registerRoutes(): void {
const openAPIManager = new OpenAPIManager();
openAPIManager.document('/{q}', ['get', 'post'], this.registry.conf.get('search')!);
const openapiJsonPath = '/openapi.json';
// Serve an OpenAPI document whose server URL is derived from the request.
this.expressRootRouter.get(openapiJsonPath, (req, res) => {
const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, '');
baseURL.search = '';
const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
info: {
title: this.registry.title,
description: `${this.registry.title} openAPI documentations`,
'x-logo': {
url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
}
}
}, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
res.statusCode = 200;
res.end(JSON.stringify(content));
});
// Assets take precedence; everything else goes to the 'search' handler.
this.expressRootRouter.use('/', ...this.registry.expressMiddlewares, this.makeAssetsServingController(), this.registry.makeShimController('search'));
}
protected override featureSelect(): void {
this.insertAsyncHookMiddleware();
this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
this.insertLogRequestsMiddleware();
this.registerOpenAPIDocsRoutes('/docs');
this.registerRoutes();
}
}
const instance = container.resolve(SearchStandAloneServer);
export default instance;
// Cloud Run supplies PORT; default to 3000 for local runs.
instance.serviceReady().then((s) => s.listen(parseInt(process.env.PORT || '') || 3000));