Mirror of https://git.mirrors.martin98.com/https://github.com/jina-ai/reader (synced 2025-08-14 03:15:51 +08:00)
deployment: dedicated server script for cloud-run (#1139)
* refactor: domain profile and attempt direct engine
* fix: direct engine
* fix: abuse in background phase
* fix
* wip
* use curl-impersonate in custom image
* local pdf for curl
* listen port from env
* fix
* fix
* fix
* fix: ditch http2
* cd: using gh action
* ci: token for thinapps-shared
* ci: setup node lock file path
* ci: tweak
* ci: mmdb
* ci: docker build
* fix: ci
* fix: ci
This commit is contained in: parent a453ab5f16, commit 6a58de590c.
.github/workflows/cd.yml (vendored, new file, 76 lines)
@@ -0,0 +1,76 @@
run-name: Build push and deploy (CD)
on:
  push:
    branches:
      - main
      - ci-debug
    tags:
      - '*'

jobs:
  build-and-push-to-gcr:
    runs-on: ubuntu-latest
    concurrency:
      group: ${{ github.ref_type == 'branch' && github.ref }}
      cancel-in-progress: true
    defaults:
      run:
        working-directory: backend/functions
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v4
        with:
          lfs: true
          submodules: true
          token: ${{ secrets.THINAPPS_SHARED_READ_TOKEN }}
      - uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}'
      - name: 'Set up Cloud SDK'
        uses: 'google-github-actions/setup-gcloud@v2'
      - name: "Docker auth"
        run: |-
          gcloud auth configure-docker us-docker.pkg.dev --quiet
      - name: Set controller release version
        run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 22.12.0
          cache: npm
          cache-dependency-path: backend/functions/package-lock.json

      - name: npm install
        run: npm ci
      - name: get maxmind mmdb
        run: mkdir -p licensed && curl -o licensed/GeoLite2-City.mmdb https://github.com/P3TERX/GeoLite.mmdb/raw/download/GeoLite2-City.mmdb
      - name: build application
        run: npm run build
      - name: Set package version
        run: npm version --no-git-tag-version ${{ env.RELEASE_VERSION }}
        if: github.ref_type == 'tag'
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            us-docker.pkg.dev/reader-6b7dc/jina-reader/reader
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build and push
        id: container
        uses: docker/build-push-action@v6
        with:
          context: backend/functions
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
      - name: Deploy CRAWL with Tag
        run: |
          gcloud run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0
      - name: Deploy SEARCH with Tag
        run: |
          gcloud run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0
backend/functions/.dockerignore (new file, 1 line)
@@ -0,0 +1 @@
node_modules/
backend/functions/Dockerfile (new file, 37 lines)
@@ -0,0 +1,37 @@
# syntax=docker/dockerfile:1
FROM lwthiker/curl-impersonate:0.6-chrome-slim-bullseye

FROM node:20

RUN apt-get update \
    && apt-get install -y wget gnupg \
    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
    && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
    && apt-get update \
    && apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 \
    --no-install-recommends \
    && rm -rf /var/lib/apt/lists/*

COPY --from=0 /usr/local/lib/libcurl-impersonate.so /usr/local/lib/libcurl-impersonate.so

RUN groupadd -r jina
RUN useradd -g jina -G audio,video -m jina
USER jina

WORKDIR /app

COPY package.json package-lock.json ./
RUN npm ci

COPY build ./build
COPY public ./public
COPY licensed ./licensed

RUN rm -rf ~/.config/chromium && mkdir -p ~/.config/chromium

ENV LD_PRELOAD=/usr/local/lib/libcurl-impersonate.so CURL_IMPERSONATE=chrome116 CURL_IMPERSONATE_HEADERS=no
ENV PORT=8080

EXPOSE 3000 3001 8080 8081
ENTRYPOINT ["node"]
CMD [ "build/stand-alone/crawl.js" ]
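Cloud Run injects the serving port through the PORT environment variable (the Dockerfile above defaults it to 8080); the stand-alone entry points added later in this diff resolve it with a local fallback. The resolution expression in isolation (a sketch; 3000 is the local default used in crawl.ts/search.ts below):

// Port resolution as used by the stand-alone servers in this commit.
// parseInt('') yields NaN, which is falsy, so an unset or empty PORT falls back to 3000.
const port = parseInt(process.env.PORT || '') || 3000;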
backend/functions/package-lock.json (generated, 8 lines changed)
@@ -16,7 +16,7 @@
         "axios": "^1.3.3",
         "bcrypt": "^5.1.0",
         "busboy": "^1.6.0",
-        "civkit": "^0.8.2-4c0357a",
+        "civkit": "^0.8.2-03243fe",
         "core-js": "^3.37.1",
         "cors": "^2.8.5",
         "dayjs": "^1.11.9",
@@ -3979,9 +3979,9 @@
             }
         },
         "node_modules/civkit": {
-            "version": "0.8.2-4c0357a",
-            "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-4c0357a.tgz",
-            "integrity": "sha512-8/RcapAm8YYImf+YVBRhybEFuSuV5Pg1p/s6Niql3VAY2cV1/OC1fTCDZY689yeq8zFcwxwBvaqyIEGo69F+IA==",
+            "version": "0.8.2-03243fe",
+            "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-03243fe.tgz",
+            "integrity": "sha512-hoTxGeGdD27iOCDi51cVY0PHlRN3OSC640QRJ1YSmD42o+LP7mZtbdy8dN7j/FSkPP/5yLuB2ch9BMSOp54POQ==",
             "license": "AGPL",
             "dependencies": {
                 "lodash": "^4.17.21",

backend/functions/package.json
@@ -36,7 +36,7 @@
         "axios": "^1.3.3",
         "bcrypt": "^5.1.0",
         "busboy": "^1.6.0",
-        "civkit": "^0.8.2-4c0357a",
+        "civkit": "^0.8.2-03243fe",
         "core-js": "^3.37.1",
         "cors": "^2.8.5",
         "dayjs": "^1.11.9",
backend/functions/public/favicon.ico (new binary file, 14 KiB; not shown)
backend/functions/src/cloud-functions/crawler.ts
@@ -84,6 +84,8 @@ export class CrawlerHost extends RPCHost {
                 Reflect.set(snapshot, 'locale', options.locale);
             }
             await this.setToCache(options.url, snapshot);
+
+            await this.exploreDirectEngine(snapshot).catch(() => undefined);
         });
 
         puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
@@ -581,9 +583,14 @@ export class CrawlerHost extends RPCHost {
 
         if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.READER_LM)) {
             const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
-                ...crawlOpts, engine: ENGINE_TYPE.AUTO
+                ...crawlOpts,
+                engine: crawlOpts?.engine || ENGINE_TYPE.AUTO,
             }, crawlerOpts);
 
+            if (!finalAutoSnapshot?.html) {
+                throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`);
+            }
+
             if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
                 const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
                 yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
@@ -628,18 +635,9 @@ export class CrawlerHost extends RPCHost {
             return;
         }
 
-        if (crawlOpts?.engine?.startsWith(ENGINE_TYPE.DIRECT)) {
-            const engine = crawlOpts?.engine;
-            try {
-                const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
-                yield snapshot;
-
-                return;
-            } catch (err) {
-                if (!engine.endsWith('?')) {
-                    throw err;
-                }
-            }
-        }
+        if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
+            yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
+            return;
+        }
 
         let cache;
@@ -658,6 +656,24 @@ export class CrawlerHost extends RPCHost {
             return;
         }
 
+        if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
+            const { digest } = this.getDomainProfileUrlDigest(urlToCrawl);
+            const domainProfile = await DomainProfile.fromFirestore(digest);
+            if (domainProfile?.engine === ENGINE_TYPE.DIRECT) {
+                try {
+                    const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
+
+                    // Expect downstream code to "break" here if it's satisfied with the direct engine
+                    yield snapshot;
+                    if (crawlOpts?.engine === ENGINE_TYPE.AUTO) {
+                        return;
+                    }
+                } catch (err: any) {
+                    this.logger.warn(`Failed to scrap ${urlToCrawl} with direct engine`, { err: marshalErrorLike(err) });
+                }
+            }
+        }
+
         try {
             if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) {
                 for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
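Taken together, the two hunks above replace the old `'?'`-suffixed engine fallback with an explicit dispatch: a requested `direct` engine goes straight to curl with no browser fallback, and otherwise a stored domain profile may promote curl as a first attempt. A condensed restatement of that order (the `pickFirstEngine` wrapper and the local ENGINE_TYPE constants are illustrative, not code from the diff):

// Illustrative sketch of the engine dispatch order introduced above.
const ENGINE_TYPE = { AUTO: 'auto', BROWSER: 'browser', DIRECT: 'direct' } as const;
type Engine = typeof ENGINE_TYPE[keyof typeof ENGINE_TYPE];

function pickFirstEngine(requested: Engine | undefined, browserNotRequired: boolean, profileEngine?: Engine): Engine {
    // 1. Explicit direct: curl only, no browser fallback.
    if (requested === ENGINE_TYPE.DIRECT) {
        return ENGINE_TYPE.DIRECT;
    }
    // 2. A domain remembered as direct-capable is tried with curl first, unless
    //    the caller demanded the browser or the request needs one (screenshots,
    //    injected scripts, selectors to wait for, ...).
    if (requested !== ENGINE_TYPE.BROWSER && browserNotRequired && profileEngine === ENGINE_TYPE.DIRECT) {
        return ENGINE_TYPE.DIRECT; // on 'auto' the scrape stops here; otherwise the browser still follows
    }
    // 3. Everything else starts with puppeteer.
    return ENGINE_TYPE.BROWSER;
}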
@@ -855,7 +871,7 @@ export class CrawlerHost extends RPCHost {
     }
 
     async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
-        const it = this.cachedScrap(url, { ...opts, engine: ENGINE_TYPE.BROWSER }, crawlerOptions);
+        const it = this.cachedScrap(url, opts, crawlerOptions);
 
         let lastSnapshot;
         let lastError;
@@ -912,36 +928,54 @@ export class CrawlerHost extends RPCHost {
         return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
     }
 
-    async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) {
-        const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions, true);
+    async exploreDirectEngine(knownSnapshot: PageSnapshot) {
+        const realUrl = new URL(knownSnapshot.href);
+        const { digest, path } = this.getDomainProfileUrlDigest(realUrl);
+        const profile = await DomainProfile.fromFirestore(digest);
 
-        const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
-        const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot);
+        if (!profile) {
+            const record = DomainProfile.from({
+                _id: digest,
+                origin: realUrl.origin.toLowerCase(),
+                path,
+                triggerUrl: realUrl.href,
+                engine: knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT,
+                createdAt: new Date(),
+                expireAt: new Date(Date.now() + this.domainProfileRetentionMs),
+            });
+            await DomainProfile.save(record);
 
-        let engine = ENGINE_TYPE.DIRECT;
-        if (!(thisFormatted.content && knownFormatted.content &&
-            thisFormatted.content.trim() === knownFormatted.content.trim())) {
-            engine = ENGINE_TYPE.BROWSER;
+            return;
         }
 
-        const realUrl = new URL(knownSnapshot.href);
-
-        const profile = (await DomainProfile.fromFirestoreQuery(
-            DomainProfile.COLLECTION
-                .where('domain', '==', targetUrl.origin.toLowerCase())
-                .limit(1)
-        ))[0] || new DomainProfile();
+        if (profile.engine === ENGINE_TYPE.BROWSER) {
+            // Mixed engine, always use browser
+            return;
+        }
 
         profile.origin = realUrl.origin.toLowerCase();
-        profile.triggerReason ??= 'Auto Explore';
         profile.triggerUrl = realUrl.href;
-        profile.engine = engine;
-        profile.createdAt ??= new Date();
+        profile.path = path;
+        profile.engine = knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT;
         profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs);
 
         await DomainProfile.save(profile);
 
-        return true;
+        return;
     }
+
+    getDomainProfileUrlDigest(url: URL) {
+        const pathname = url.pathname;
+        const pathVec = pathname.split('/');
+        const parentPath = pathVec.slice(0, -1).join('/');
+
+        const finalPath = parentPath || pathname;
+
+        const key = url.origin.toLocaleLowerCase() + finalPath;
+
+        return {
+            digest: md5Hasher.hash(key),
+            path: finalPath,
+        };
+    }
 }
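`getDomainProfileUrlDigest` keys a profile by lowercased origin plus the parent path, so sibling pages under one directory share a single profile entry. A worked example of the same keying scheme (using node:crypto's md5 in place of the project's `md5Hasher`):

import { createHash } from 'node:crypto';

// Same keying as getDomainProfileUrlDigest above: origin (lowercased) + parent
// path, falling back to the full pathname at the site root.
function domainProfileKey(url: URL) {
    const parentPath = url.pathname.split('/').slice(0, -1).join('/');
    const finalPath = parentPath || url.pathname;
    const key = url.origin.toLowerCase() + finalPath;
    return { digest: createHash('md5').update(key).digest('hex'), path: finalPath };
}

// Both articles map to the same profile entry:
domainProfileKey(new URL('https://example.com/blog/post-1')); // path '/blog'
domainProfileKey(new URL('https://example.com/blog/post-2')); // path '/blog'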
@@ -13,10 +13,7 @@ export class DomainProfile extends FirestoreRecord {
     @Prop({
         required: true
     })
     origin!: string;
 
     @Prop({ required: true })
-    triggerReason!: string;
+    path!: string;
 
     @Prop()
     triggerUrl?: string;
 
@@ -439,7 +439,7 @@ export class CrawlerOptions extends AutoCastable {
             instance.engine = ENGINE_TYPE.BROWSER;
             instance.respondWith = CONTENT_FORMAT.VLM;
         } else if (instance.engine === ENGINE_TYPE.READER_LM) {
-            instance.engine = undefined;
+            instance.engine = ENGINE_TYPE.AUTO;
             instance.respondWith = CONTENT_FORMAT.READER_LM;
         }
 
@@ -496,10 +496,6 @@
             instance.cacheTolerance = instance.cacheTolerance * 1000;
         }
 
-        if (instance.noCache || !instance.isTypicalRequest()) {
-            instance.engine ??= ENGINE_TYPE.BROWSER + '?';
-        }
-
         return instance;
     }
 
@@ -544,13 +540,19 @@
         return !CONTENT_FORMAT_VALUES.has(this.respondWith);
     }
 
-    isTypicalRequest() {
+    browserIsNotRequired() {
         if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
             return false;
         }
+        if (this.injectFrameScript?.length || this.injectPageScript?.length) {
+            return false;
+        }
+        if (this.waitForSelector?.length) {
+            return false;
+        }
         if (this.withIframe || this.withShadowDom) {
             return false;
         }
         if (this.viewport) {
             return false;
         }
@@ -2,11 +2,14 @@ import { marshalErrorLike } from 'civkit/lang';
 import { AsyncService } from 'civkit/async-service';
 import { singleton } from 'tsyringe';
 
-import { Curl, HeaderInfo } from 'node-libcurl';
+import { Curl, CurlFeature, HeaderInfo } from 'node-libcurl';
 import { PageSnapshot, ScrappingOptions } from './puppeteer';
 import { Logger } from '../shared/services/logger';
 import { JSDomControl } from './jsdom';
-import { AssertionFailureError } from 'civkit';
+import { AssertionFailureError, FancyFile } from 'civkit';
+import { TempFileManager } from '../shared';
+import { readFile } from 'fs/promises';
+import { pathToFileURL } from 'url';
 
 @singleton()
 export class CurlControl extends AsyncService {
@@ -16,6 +19,7 @@ export class CurlControl extends AsyncService {
     constructor(
         protected globalLogger: Logger,
         protected jsdomControl: JSDomControl,
+        protected tempFileManager: TempFileManager,
     ) {
         super(...arguments);
     }
@@ -26,25 +30,55 @@ export class CurlControl extends AsyncService {
         this.emit('ready');
     }
 
+    curlImpersonateHeader(curl: Curl, headers?: object, chromeVersion: number = 132) {
+        const mixinHeaders = {
+            'sec-ch-ua': `Not A(Brand";v="8", "Chromium";v="${chromeVersion}", "Google Chrome";v="${chromeVersion}"`,
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': 'Windows',
+            'Upgrade-Insecure-Requests': '1',
+            'User-Agent': `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${chromeVersion}.0.0.0 Safari/537.36`,
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+            'Sec-Fetch-Site': 'none',
+            'Sec-Fetch-Mode': 'navigate',
+            'Sec-Fetch-User': '?1',
+            'Sec-Fetch-Dest': 'document',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Accept-Language': 'en-US,en;q=0.9',
+        };
+
+        curl.setOpt(Curl.option.HTTPHEADER, Object.entries({ ...mixinHeaders, ...headers }).map(([k, v]) => `${k}: ${v}`));
+
+        return curl;
+    }
+
     async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise<PageSnapshot> {
+        const snapshot = {
+            href: urlToCrawl.toString(),
+            html: '',
+            title: '',
+            text: '',
+        } as PageSnapshot;
+
         const result = await new Promise<{
             statusCode: number,
-            data: string,
+            data?: FancyFile,
             headers: Buffer | HeaderInfo[],
         }>((resolve, reject) => {
             const curl = new Curl();
+            curl.enable(CurlFeature.StreamResponse);
             curl.setOpt('URL', urlToCrawl.toString());
             curl.setOpt(Curl.option.FOLLOWLOCATION, true);
 
-            if (crawlOpts?.timeoutMs) {
-                curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs);
-            }
+            curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(10_000, crawlOpts?.timeoutMs || 10_000));
 
             if (crawlOpts?.overrideUserAgent) {
                 curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
             }
-            if (crawlOpts?.extraHeaders) {
-                curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
-            }
+
+            this.curlImpersonateHeader(curl, crawlOpts?.extraHeaders);
+            // if (crawlOpts?.extraHeaders) {
+            //     curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`));
+            // }
             if (crawlOpts?.proxyUrl) {
                 curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
             }
@@ -56,35 +90,82 @@ export class CurlControl extends AsyncService {
                 curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
             }
 
-            curl.on('end', (statusCode, data, headers) => {
+            curl.on('end', (statusCode, _data, headers) => {
+                this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers });
                 resolve({
                     statusCode,
-                    data: data.toString(),
                     headers,
                 });
                 curl.close();
             });
 
             curl.on('error', (err) => {
-                this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) });
                 curl.close();
+                this.logger.warn(`Curl ${urlToCrawl}: ${err} (Not necessarily an error)`, { err: marshalErrorLike(err) });
                 reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`));
             });
+            curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB
+            let status = -1;
+            let contentType = '';
+            curl.on('stream', (stream, statusCode, headers) => {
+                status = statusCode;
+                outerLoop:
+                for (const headerVec of headers) {
+                    for (const [k, v] of Object.entries(headerVec)) {
+                        if (k.toLowerCase() === 'content-type') {
+                            contentType = v.toLowerCase();
+                            break outerLoop;
+                        }
+                    }
+                }
+
+                if (!contentType) {
+                    reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: no content-type`));
+                    stream.destroy();
+                    return;
+                }
+                if (contentType.startsWith('image/')) {
+                    snapshot.html = `<html style="height: 100%;"><head><meta name="viewport" content="width=device-width, minimum-scale=0.1"><title>${urlToCrawl.origin}${urlToCrawl.pathname}</title></head><body style="margin: 0px; height: 100%; background-color: rgb(14, 14, 14);"><img style="display: block;-webkit-user-select: none;margin: auto;background-color: hsl(0, 0%, 90%);transition: background-color 300ms;" src="${urlToCrawl.href}"></body></html>`;
+                    stream.destroy();
+                    resolve({
+                        statusCode: status,
+                        headers,
+                    });
+                    return;
+                }
+
+                const fpath = this.tempFileManager.alloc();
+                const fancyFile = FancyFile.auto(stream, fpath);
+                this.tempFileManager.bindPathTo(fancyFile, fpath);
+                resolve({
+                    statusCode: status,
+                    data: fancyFile,
+                    headers,
+                });
+            });
 
             curl.perform();
         });
 
         if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
-            throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`);
+            throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`);
         }
 
-        const snapshot = {
-            href: urlToCrawl.toString(),
-            html: result.data,
-            title: '',
-            text: '',
-        } as PageSnapshot;
+        if (result.data) {
+            const mimeType: string = await result.data.mimeType;
+            if (mimeType.startsWith('text/html')) {
+                if ((await result.data.size) > 1024 * 1024 * 32) {
+                    throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
+                }
+                snapshot.html = await readFile(await result.data.filePath, { encoding: 'utf-8' });
+            } else if (mimeType.startsWith('text/') || mimeType.startsWith('application/json')) {
+                if ((await result.data.size) > 1024 * 1024 * 32) {
+                    throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
+                }
+                snapshot.text = await readFile(await result.data.filePath, { encoding: 'utf-8' });
+                snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
+            } else if (mimeType.startsWith('application/pdf')) {
+                snapshot.pdfs = [pathToFileURL(await result.data.filePath).href];
+            } else {
+                throw new AssertionFailureError(`Failed to access ${urlToCrawl}: unexpected type ${mimeType}`);
+            }
+        }
 
         const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
 
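Enabling `CurlFeature.StreamResponse` makes node-libcurl deliver the body through a `stream` event instead of buffering it for the `end` callback, which is what lets the snapshot code above spool large bodies and PDFs to temp files. A minimal sketch of the same pattern (the URL and output path are placeholders):

import { createWriteStream } from 'node:fs';
import { Curl, CurlFeature } from 'node-libcurl';

// Minimal StreamResponse usage, mirroring the pattern in urlToSnapshot above.
const curl = new Curl();
curl.enable(CurlFeature.StreamResponse);
curl.setOpt('URL', 'https://example.com/');        // placeholder URL
curl.setOpt(Curl.option.FOLLOWLOCATION, true);

curl.on('stream', (stream, statusCode, _headers) => {
    // The body arrives as a Readable stream instead of a buffered string.
    stream.pipe(createWriteStream('/tmp/body.bin')); // placeholder path
    console.log(`status: ${statusCode}`);
});
curl.on('error', (err) => { curl.close(); console.error(err); });
curl.on('end', () => curl.close());
curl.perform();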
@@ -266,12 +266,12 @@ export class PDFExtractor extends AsyncService {
         return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
     }
 
-    async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24) {
+    async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) {
         if (!url) {
             return undefined;
         }
 
-        const digest = md5Hasher.hash(url.toString());
+        const nameUrl = alternativeUrl || url.toString();
+        const digest = md5Hasher.hash(nameUrl);
 
         const data = url;
         if (typeof url === 'string' && this.isDataUrl(url)) {
@@ -283,8 +283,8 @@
         if (cache) {
             const age = Date.now() - cache?.createdAt.valueOf();
             const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
-            this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${url}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
-                url, digest, age, stale, cacheTolerance
+            this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${nameUrl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
+                data: url, url: nameUrl, digest, age, stale, cacheTolerance
             });
 
             if (!stale) {
@@ -306,7 +306,7 @@
                     text: cached.text
                 };
             } catch (err) {
-                this.logger.warn(`Unable to load cached content for ${url}`, { err });
+                this.logger.warn(`Unable to load cached content for ${nameUrl}`, { err });
 
                 return undefined;
             }
@@ -324,17 +324,17 @@
             PDFContent.save(
                 PDFContent.from({
                     _id: theID,
-                    src: url.toString(),
+                    src: nameUrl,
                     meta: extracted?.meta || {},
                     urlDigest: digest,
                     createdAt: new Date(),
                     expireAt: new Date(Date.now() + this.cacheRetentionMs)
                 }).degradeForFireStore()
             ).catch((r) => {
-                this.logger.warn(`Unable to cache PDF content for ${url}`, { err: r });
+                this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
             });
         } catch (err) {
-            this.logger.warn(`Unable to extract from pdf ${url}`, { err });
+            this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
         }
 
         return extracted;
@@ -48,6 +48,7 @@ export interface PageSnapshot {
     href: string;
     rebase?: string;
     html: string;
+    htmlModifiedByJs?: boolean;
     shadowExpanded?: string;
     text: string;
     status?: number;
@@ -369,7 +370,9 @@ function shadowDomPresent(rootElement = document.documentElement) {
     return false;
 }
 
+let initialHTML;
 function giveSnapshot(stopActiveSnapshot) {
+    initialHTML ??= document.documentElement?.outerHTML;
     if (stopActiveSnapshot) {
         window.haltSnapshot = true;
     }
@@ -385,6 +388,7 @@ function giveSnapshot(stopActiveSnapshot) {
         description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
         href: document.location.href,
         html: document.documentElement?.outerHTML,
+        htmlModifiedByJs: false,
         text: document.body?.innerText,
         shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
         parsed: parsed,
@@ -392,6 +396,9 @@ function giveSnapshot(stopActiveSnapshot) {
         maxElemDepth: domAnalysis.maxDepth,
         elemCount: domAnalysis.elementCount,
     };
+    if (initialHTML) {
+        r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
+    }
     if (document.baseURI !== r.href) {
         r.rebase = document.baseURI;
    }
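The snapshot script now memoizes the first `outerHTML` it captures and flags later captures that differ; this is the signal `exploreDirectEngine` uses to decide between the direct and browser engines. The check, isolated as a pure function (illustrative; the real code sets `r.htmlModifiedByJs` inline):

// Essence of the htmlModifiedByJs flag computed above. Shadow-root expansion
// rewrites the markup by itself, so an expanded snapshot is deliberately
// not counted as "modified by JS".
function htmlModifiedByJs(initialHTML: string | undefined, currentHTML: string, shadowExpanded?: string): boolean {
    return Boolean(initialHTML) && initialHTML !== currentHTML && !shadowExpanded;
}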
@@ -448,6 +455,7 @@ export class PuppeteerControl extends AsyncService {
     finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
     snMap = new WeakMap<Page, number>();
     livePages = new Set<Page>();
+    pagePhase = new WeakMap<Page, 'idle' | 'active' | 'background'>();
     lastPageCratedAt: number = 0;
 
     rpsCap: number = 500;
@@ -491,7 +499,8 @@ export class PuppeteerControl extends AsyncService {
             }
         }
         this.browser = await puppeteer.launch({
-            timeout: 10_000
+            timeout: 10_000,
+            args: ['--disable-dev-shm-usage']
         }).catch((err: any) => {
             this.logger.error(`Unknown firebase issue, just die fast.`, { err });
             process.nextTick(() => {
@@ -611,7 +620,14 @@ export class PuppeteerControl extends AsyncService {
             const dt = Math.ceil((Date.now() - t0) / 1000);
             const rps = reqCounter / dt;
-            // console.log(`rps: ${rps}`);
 
+            const pagePhase = this.pagePhase.get(page);
+            if (pagePhase === 'background') {
+                if (rps > 10 || reqCounter > 1000) {
+                    halt = true;
+
+                    return req.abort('blockedbyclient', 1000);
+                }
+            }
             if (reqCounter > 1000) {
                 if (rps > 60 || reqCounter > 2000) {
                     page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests` });
@@ -676,6 +692,7 @@ export class PuppeteerControl extends AsyncService {
         this.logger.info(`Page ${sn} created.`);
         this.lastPageCratedAt = Date.now();
         this.livePages.add(page);
+        this.pagePhase.set(page, 'idle');
 
         return page;
     }
@@ -717,7 +734,6 @@ export class PuppeteerControl extends AsyncService {
         }
         const sn = this.snMap.get(page);
         this.logger.info(`Closing page ${sn}`);
-        this.livePages.delete(page);
         await Promise.race([
             (async () => {
                 const ctx = page.browserContext();
@@ -731,6 +747,8 @@ export class PuppeteerControl extends AsyncService {
         ]).catch((err) => {
             this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) });
         });
+        this.livePages.delete(page);
+        this.pagePhase.delete(page);
     }
 
     async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
@@ -743,6 +761,7 @@ export class PuppeteerControl extends AsyncService {
         const pdfUrls: string[] = [];
         let navigationResponse: HTTPResponse | undefined;
         const page = await this.getNextPage();
+        this.pagePhase.set(page, 'active');
         page.on('response', (resp) => {
             if (resp.request().isNavigationRequest()) {
                 navigationResponse = resp;
@@ -802,8 +821,6 @@ export class PuppeteerControl extends AsyncService {
         }
         const sn = this.snMap.get(page);
         this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
-
-        this.logger.info(`Locale setting: ${options?.locale}`);
         if (options?.locale) {
             // Add headers via request interception to walk around this bug
             // https://github.com/puppeteer/puppeteer/issues/10235
@@ -896,6 +913,10 @@ export class PuppeteerControl extends AsyncService {
         page.on('snapshot', hdl);
         page.once('abuse', (event: any) => {
             this.emit('abuse', { ...event, url: parsedUrl });
+            if (snapshot?.href && parsedUrl.href !== snapshot.href) {
+                this.emit('abuse', { ...event, url: snapshot.href });
+            }
 
             nextSnapshotDeferred.reject(
                 new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
             );
@@ -1071,6 +1092,7 @@ export class PuppeteerControl extends AsyncService {
                 }
             }
         } finally {
+            this.pagePhase.set(page, 'background');
             (waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
                 page.off('snapshot', hdl);
                 this.ditchPage(page);
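The new `pagePhase` bookkeeping ('idle' on creation, 'active' during `scrap`, 'background' once the scrape's `finally` block runs) lets the request interceptor throttle pages that keep firing requests after their scrape has returned. The thresholds from the hunks above, as a standalone sketch (the `classifyRequestLoad` wrapper is illustrative):

type PagePhase = 'idle' | 'active' | 'background';

// Background pages are aborted outright on a tight budget; active pages that
// exceed the larger budget are flagged as abuse instead. Thresholds are from the diff.
function classifyRequestLoad(phase: PagePhase, rps: number, reqCounter: number): 'ok' | 'abort' | 'abuse' {
    if (phase === 'background' && (rps > 10 || reqCounter > 1000)) {
        return 'abort';
    }
    if (reqCounter > 1000 && (rps > 60 || reqCounter > 2000)) {
        return 'abuse';
    }
    return 'ok';
}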
@@ -152,7 +152,8 @@ export class SnapshotFormatter extends AsyncService {
         // in case of Google Web Cache content
         if (snapshot.pdfs?.length && (!snapshot.title || snapshot.title.startsWith('cache:'))) {
             const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
-                this.threadLocal.get('cacheTolerance')
+                this.threadLocal.get('cacheTolerance'),
+                snapshot.pdfs[0].startsWith('http') ? undefined : snapshot.href,
             );
             if (pdf) {
                 pdfMode = true;
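Because the direct engine now hands the PDF extractor local `file:` URLs (one temp path per download), the formatter passes the page's own `href` as the cache identity whenever `snapshot.pdfs[0]` is not an http(s) URL; a temp path would make a useless cache key. The key choice in isolation (a sketch of the `alternativeUrl` logic above; the example values are hypothetical):

// A local file: URL is unique per download, so the page href is the stable key.
function pdfCacheKey(pdfUrl: string, pageHref: string): string {
    return pdfUrl.startsWith('http') ? pdfUrl : pageHref;
}

pdfCacheKey('https://example.com/a.pdf', 'https://example.com/a');   // the pdf URL itself
pdfCacheKey('file:///tmp/tmp-123.pdf', 'https://example.com/a.pdf'); // falls back to the page href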
backend/functions/src/stand-alone/crawl.ts (new file, 151 lines)
@@ -0,0 +1,151 @@
import 'reflect-metadata';
import { container, singleton } from 'tsyringe';
import { initializeApp, applicationDefault } from 'firebase-admin/app';

process.env['FIREBASE_CONFIG'] ??= JSON.stringify({
    projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc',
    storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`,
    credential: applicationDefault(),
});

initializeApp();


import { Logger, CloudFunctionRegistry } from '../shared';
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
import { ExpressServer } from 'civkit/civ-rpc/express';
import http2 from 'http2';
import { CrawlerHost } from '../cloud-functions/crawler';
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
import path from 'path';
import fs from 'fs';
import { mimeOfExt } from 'civkit/mime';
import { NextFunction, Request, Response } from 'express';

process.on('unhandledRejection', (err) => {
    console.error('Unhandled rejection', err);
});

process.on('uncaughtException', (err) => {
    console.log('Uncaught exception', err);

    // Looks like Firebase runtime does not handle error properly.
    // Make sure to quit the process.
    console.error('Uncaught exception, process quit.');
    process.nextTick(() => process.exit(1));
});

@singleton()
export class CrawlStandAloneServer extends ExpressServer {
    logger = this.globalLogger.child({ service: this.constructor.name });

    httpAlternativeServer?: typeof this['httpServer'];
    assets = new Map<string, WalkOutEntity>();

    constructor(
        protected globalLogger: Logger,
        protected registry: CloudFunctionRegistry,
        protected crawlerHost: CrawlerHost,
    ) {
        super(...arguments);

        registry.allHandsOnDeck().catch(() => void 0);
        registry.title = 'reader';
        registry.version = '0.1.0';
    }

    h2c() {
        this.httpAlternativeServer = this.httpServer;
        this.httpServer = http2.createServer(this.expressApp);
        // useResourceBasedDefaultTracker();

        return this;
    }

    override async init() {
        await this.walkForAssets();
        await super.init();
    }

    async walkForAssets() {
        const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));

        for (const file of files) {
            if (file.type !== 'file') {
                continue;
            }
            this.assets.set(file.relativePath.toString(), file);
        }
    }

    makeAssetsServingController() {
        return (req: Request, res: Response, next: NextFunction) => {
            const requestPath = req.url;
            const file = requestPath.slice(1);
            if (!file) {
                return next();
            }

            const asset = this.assets.get(file);
            if (asset?.type !== 'file') {
                return next();
            }
            res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
            res.set('Content-Length', asset.stats.size.toString());
            fs.createReadStream(asset.path).pipe(res);

            return;
        };
    }

    override listen(port: number) {
        const r = super.listen(port);
        if (this.httpAlternativeServer) {
            const altPort = port + 1;
            this.httpAlternativeServer.listen(altPort, () => {
                this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
            });
        }

        return r;
    }

    override registerRoutes(): void {
        const openAPIManager = new OpenAPIManager();
        openAPIManager.document('/{url}', ['get', 'post'], this.registry.conf.get('crawl')!);
        const openapiJsonPath = '/openapi.json';
        this.expressRootRouter.get(openapiJsonPath, (req, res) => {
            const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
            baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, '');
            baseURL.search = '';
            const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
                info: {
                    title: this.registry.title,
                    description: `${this.registry.title} openAPI documentations`,
                    'x-logo': {
                        url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
                    }
                }
            }, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
            res.statusCode = 200;
            res.end(JSON.stringify(content));
        });

        this.expressRootRouter.use('/', ...this.registry.expressMiddlewares, this.makeAssetsServingController(), this.registry.makeShimController('crawl'));
    }

    protected override featureSelect(): void {
        this.insertAsyncHookMiddleware();
        this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
        this.insertLogRequestsMiddleware();
        this.registerOpenAPIDocsRoutes('/docs');

        this.registerRoutes();
    }
}
const instance = container.resolve(CrawlStandAloneServer);

export default instance;

instance.serviceReady().then((s) => s.listen(parseInt(process.env.PORT || '') || 3000));
backend/functions/src/stand-alone/search.ts (new file, 151 lines)
@@ -0,0 +1,151 @@
import 'reflect-metadata';
import { container, singleton } from 'tsyringe';
import { initializeApp, applicationDefault } from 'firebase-admin/app';

process.env['FIREBASE_CONFIG'] ??= JSON.stringify({
    projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc',
    storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`,
    credential: applicationDefault(),
});

initializeApp();


import { Logger, CloudFunctionRegistry } from '../shared';
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
import { ExpressServer } from 'civkit/civ-rpc/express';
import http2 from 'http2';
import { SearcherHost } from '../cloud-functions/searcher';
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
import path from 'path';
import fs from 'fs';
import { mimeOfExt } from 'civkit/mime';
import { NextFunction, Request, Response } from 'express';

process.on('unhandledRejection', (err) => {
    console.error('Unhandled rejection', err);
});

process.on('uncaughtException', (err) => {
    console.log('Uncaught exception', err);

    // Looks like Firebase runtime does not handle error properly.
    // Make sure to quit the process.
    console.error('Uncaught exception, process quit.');
    process.nextTick(() => process.exit(1));
});

@singleton()
export class SearchStandAloneServer extends ExpressServer {
    logger = this.globalLogger.child({ service: this.constructor.name });

    httpAlternativeServer?: typeof this['httpServer'];
    assets = new Map<string, WalkOutEntity>();

    constructor(
        protected globalLogger: Logger,
        protected registry: CloudFunctionRegistry,
        protected searcherHost: SearcherHost,
    ) {
        super(...arguments);

        registry.allHandsOnDeck().catch(() => void 0);
        registry.title = 'reader';
        registry.version = '0.1.0';
    }

    h2c() {
        this.httpAlternativeServer = this.httpServer;
        this.httpServer = http2.createServer(this.expressApp);
        // useResourceBasedDefaultTracker();

        return this;
    }

    override async init() {
        await this.walkForAssets();
        await super.init();
    }

    async walkForAssets() {
        const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));

        for (const file of files) {
            if (file.type !== 'file') {
                continue;
            }
            this.assets.set(file.relativePath.toString(), file);
        }
    }

    makeAssetsServingController() {
        return (req: Request, res: Response, next: NextFunction) => {
            const requestPath = req.url;
            const file = requestPath.slice(1);
            if (!file) {
                return next();
            }

            const asset = this.assets.get(file);
            if (asset?.type !== 'file') {
                return next();
            }
            res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
            res.set('Content-Length', asset.stats.size.toString());
            fs.createReadStream(asset.path).pipe(res);

            return;
        };
    }

    override listen(port: number) {
        const r = super.listen(port);
        if (this.httpAlternativeServer) {
            const altPort = port + 1;
            this.httpAlternativeServer.listen(altPort, () => {
                this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
            });
        }

        return r;
    }

    override registerRoutes(): void {
        const openAPIManager = new OpenAPIManager();
        openAPIManager.document('/{q}', ['get', 'post'], this.registry.conf.get('search')!);
        const openapiJsonPath = '/openapi.json';
        this.expressRootRouter.get(openapiJsonPath, (req, res) => {
            const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
            baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, '');
            baseURL.search = '';
            const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
                info: {
                    title: this.registry.title,
                    description: `${this.registry.title} openAPI documentations`,
                    'x-logo': {
                        url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
                    }
                }
            }, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
            res.statusCode = 200;
            res.end(JSON.stringify(content));
        });

        this.expressRootRouter.use('/', ...this.registry.expressMiddlewares, this.makeAssetsServingController(), this.registry.makeShimController('search'));
    }

    protected override featureSelect(): void {
        this.insertAsyncHookMiddleware();
        this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
        this.insertLogRequestsMiddleware();
        this.registerOpenAPIDocsRoutes('/docs');

        this.registerRoutes();
    }
}
const instance = container.resolve(SearchStandAloneServer);

export default instance;

instance.serviceReady().then((s) => s.listen(parseInt(process.env.PORT || '') || 3000));