From 6a58de590cc6da2a8acdb8d36452f22a3a0d435e Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Wed, 5 Feb 2025 14:50:18 +0800 Subject: [PATCH] deployment: dedicated server script for cloud-run (#1139) * refactor: domain profile and attempt direct engine * fix: direct engine * fix: abuse in background phase * fix * wip * use curl-impersonate in custom image * local pdf for curl * listen port from env * fix * fix * fix * fix: ditch http2 * cd: using gh action * ci: token for thinapps-shared * ci: setup node lock file path * ci: tweak * ci: mmdb * ci: docker build * fix: ci * fix: ci --- .github/workflows/cd.yml | 76 +++++++++ backend/functions/.dockerignore | 1 + backend/functions/Dockerfile | 37 +++++ backend/functions/package-lock.json | 8 +- backend/functions/package.json | 2 +- backend/functions/public/favicon.ico | Bin 0 -> 14671 bytes .../functions/src/cloud-functions/crawler.ts | 102 ++++++++---- backend/functions/src/db/domain-profile.ts | 5 +- .../functions/src/dto/scrapping-options.ts | 14 +- backend/functions/src/services/curl.ts | 127 ++++++++++++--- backend/functions/src/services/pdf-extract.ts | 18 +-- backend/functions/src/services/puppeteer.ts | 30 +++- .../src/services/snapshot-formatter.ts | 3 +- backend/functions/src/stand-alone/crawl.ts | 151 ++++++++++++++++++ backend/functions/src/stand-alone/search.ts | 151 ++++++++++++++++++ 15 files changed, 639 insertions(+), 86 deletions(-) create mode 100644 .github/workflows/cd.yml create mode 100644 backend/functions/.dockerignore create mode 100644 backend/functions/Dockerfile create mode 100644 backend/functions/public/favicon.ico create mode 100644 backend/functions/src/stand-alone/crawl.ts create mode 100644 backend/functions/src/stand-alone/search.ts diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 0000000..1e19f89 --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,76 @@ +run-name: Build push and deploy (CD) +on: + push: + branches: + - main + - 
ci-debug + tags: + - '*' + +jobs: + build-and-push-to-gcr: + runs-on: ubuntu-latest + concurrency: + group: ${{ github.ref_type == 'branch' && github.ref }} + cancel-in-progress: true + defaults: + run: + working-directory: backend/functions + permissions: + contents: read + steps: + - uses: actions/checkout@v4 + with: + lfs: true + submodules: true + token: ${{ secrets.THINAPPS_SHARED_READ_TOKEN }} + - uses: 'google-github-actions/auth@v2' + with: + credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}' + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v2' + - name: "Docker auth" + run: |- + gcloud auth configure-docker us-docker.pkg.dev --quiet + - name: Set controller release version + run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: 22.12.0 + cache: npm + cache-dependency-path: backend/functions/package-lock.json + + - name: npm install + run: npm ci + - name: get maxmind mmdb + run: mkdir -p licensed && curl -o licensed/GeoLite2-City.mmdb https://github.com/P3TERX/GeoLite.mmdb/raw/download/GeoLite2-City.mmdb + - name: build application + run: npm run build + - name: Set package version + run: npm version --no-git-tag-version ${{ env.RELEASE_VERSION }} + if: github.ref_type == 'tag' + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: | + us-docker.pkg.dev/reader-6b7dc/jina-reader/reader + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Build and push + id: container + uses: docker/build-push-action@v6 + with: + context: backend/functions + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + - name: Deploy CRAWL with Tag + run: | + gcloud run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ 
env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0 + - name: Deploy SEARCH with Tag + run: | + gcloud run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 \ No newline at end of file diff --git a/backend/functions/.dockerignore b/backend/functions/.dockerignore new file mode 100644 index 0000000..c2658d7 --- /dev/null +++ b/backend/functions/.dockerignore @@ -0,0 +1 @@ +node_modules/ diff --git a/backend/functions/Dockerfile b/backend/functions/Dockerfile new file mode 100644 index 0000000..63bb9cf --- /dev/null +++ b/backend/functions/Dockerfile @@ -0,0 +1,37 @@ +# syntax=docker/dockerfile:1 +FROM lwthiker/curl-impersonate:0.6-chrome-slim-bullseye + +FROM node:20 + +RUN apt-get update \ + && apt-get install -y wget gnupg \ + && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ + && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ + && apt-get update \ + && apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=0 /usr/local/lib/libcurl-impersonate.so /usr/local/lib/libcurl-impersonate.so + +RUN groupadd -r jina +RUN useradd -g jina -G audio,video -m jina +USER jina + +WORKDIR /app + +COPY package.json package-lock.json ./ +RUN npm ci + +COPY build ./build +COPY public ./public +COPY licensed ./licensed + +RUN rm -rf ~/.config/chromium && mkdir -p ~/.config/chromium + +ENV LD_PRELOAD=/usr/local/lib/libcurl-impersonate.so CURL_IMPERSONATE=chrome116 CURL_IMPERSONATE_HEADERS=no +ENV PORT=8080 + +EXPOSE 3000 3001 8080 8081 +ENTRYPOINT ["node"] +CMD [ 
"build/stand-alone/crawl.js" ] diff --git a/backend/functions/package-lock.json b/backend/functions/package-lock.json index ca2e5a6..ae431d8 100644 --- a/backend/functions/package-lock.json +++ b/backend/functions/package-lock.json @@ -16,7 +16,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.2-4c0357a", + "civkit": "^0.8.2-03243fe", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", @@ -3979,9 +3979,9 @@ } }, "node_modules/civkit": { - "version": "0.8.2-4c0357a", - "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-4c0357a.tgz", - "integrity": "sha512-8/RcapAm8YYImf+YVBRhybEFuSuV5Pg1p/s6Niql3VAY2cV1/OC1fTCDZY689yeq8zFcwxwBvaqyIEGo69F+IA==", + "version": "0.8.2-03243fe", + "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.2-03243fe.tgz", + "integrity": "sha512-hoTxGeGdD27iOCDi51cVY0PHlRN3OSC640QRJ1YSmD42o+LP7mZtbdy8dN7j/FSkPP/5yLuB2ch9BMSOp54POQ==", "license": "AGPL", "dependencies": { "lodash": "^4.17.21", diff --git a/backend/functions/package.json b/backend/functions/package.json index 989692f..57a0cd0 100644 --- a/backend/functions/package.json +++ b/backend/functions/package.json @@ -36,7 +36,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.2-4c0357a", + "civkit": "^0.8.2-03243fe", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", diff --git a/backend/functions/public/favicon.ico b/backend/functions/public/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..1f9ee243a401ce06caae97d3559dea34e8ab8de6 GIT binary patch literal 14671 zcmb`O1zZ$;*Y{_a?xnj?LRzJilrBM9I+RdRPz1@PTS7$?P{II21O-H?1xZo5L|TzX z5RiuVpk8-8_jBLR^LgKQt_yc(XLja4^FQbN`<*j@ASi?cIap8#7t+CmASv)3i~YTx zMvNfmnGwX!?)UmC0le}ch^FT6^+tRIS>s0#C8gi%y0Bg$j3DCTzt>gp5QInqLAbbn zuWO+ZMC%xWFu{2a&UX)mAjwh)g2DV==Y#blG6)iFpszte%0h}D2!)oW>Y0PB2S3Dw z@M7hedmavPS4&md*e87SEBny~^H;}3_$+A*&qk@Z z3JZp7b!t9lj-G#i#;_~qc9byv%kR<6Z8hsN`)}k6zpq$-UpYA?JGFcN#<##>zkr(c 
z2UGqdQ!AV4?XpvBk9%f<3gd8d`#Mw34u|uqZ7cVec`Vn-&`QJ}bfgG3GSn1BBRv=|3Dk0<9GF?tih=7eBc0>U!{b6` zg$1)cc{zlXY`IVLGxB}c7oYb%w`9yCHb69=XcART#2mTSq^1$UlrH0;QbUQ#BzC%4 zzK!YjTYRH29JswO8%)GuK+CTmr;87RJjFNi!n`B6v!!s^s=4$GbjU<(+}`finvqGqaVM{0yrNfg6xC_F`Y7tzCu#0oja0$J z7tY>6D|M;GDT;{bc};gHD{6c>EssI@nDpBQRk?L$hqEXK`e!e^!yfEq)p_E3rBNg@ z{U~lTRGB9C`PtGgH0Hb^{dWN^#^_TLmk3fPsVH4POP+f_t?>=fe5CnMMWECP7^gKV9^-1t}??C2?Fd!9h!4H3rQr;LIx;cRprZTPEw==6N4T86F1A@ zT1v3TuYjE(lSlq~pUl0A%tL_rXDxrGBEUKktKr-Qka|Xznq8QT2m}0sjcq(1uXR*}Clf{F7z% zjG{zkaI)p_#>Ul z2b=zRH@d&whkv{$f4#RAK>j`!-(T;m4UqZB-{}}726qnL`@=nlJNF9*w)J>20bAEf zIK%J>4`$`pnSYHUJv?sb_x>;O8*uu+hF$e*sQ)^Ke{)_UbmAv%s_%1F^bQKZru_l} zIS~ikHYf6OKu9gN@TJ#2H)4@=>jb3+KoX4g4oJyqEi{LR^At|%>hetr7VK#}sL!4j=Y!*5O4<$aAaWIQH>m9dz5 zYx4JZjy&ZQ#2;5eX>yCJj2++NQH#mFQ20jOSc_Fbm8)WLvDmSxr|i<_ z9=ZHE!W3bP>=aS!{Js~q4>4ioF~KDSgM@HN!QI$5s^o1KR>cp(N;}`mj;j+x)S&pA!w7Q~PSJW_Y>mA-Q(@Sm7 z4+*1p#B3EzO5IEX*5*GMJvT2_iD#A8*Gm;wzVzwkq3bGKmrm88@H7YNZIJp*yPl}e zY0{Sq-+IkGDXjWA7}nW3ElPsB+San$pJIXcBFRUdK_G)NnDU@|9_N9UYV*;_N zN1G>CpY5`S_|RUUV3jfsC89FDbHq&HT~(7`_H}zT@~>CNSBCt)SIrhyES|Ysz4?S8 zB;$lP;W)*WQ;AI6AN`ibYK?ns2i|Z;l@Ey@*8LREB%uQLPT1bAg||phb0D}~P&P)? 
z`t9_U#~KM7vl-qavb@1sL-^ev>}k_5xD$7&yjhlyN$!kLk6u$H=Cg?Y`ZZO;v1izO z^f`y`G`F=Sw*SzzW^QtnLqv&KrSc+4Dy5`k-Pow_oQPKwyEk8)06j@nQE=)FXAyC| z=(A`af>en_3_l|e4#TCcx$;C4nI}crjfvCLZ=_mkBvb82pc*wN|F>#P^e4I~yLmm3m|*PW4UR0b4UqO*Vf_ zntU$1`}7I<#L~jm`>u%sn%N#Gz})s6e+Cyp}kg1#F;DSHJiJ;y>5grd@3$r zZMJvIFuBg&J`TZkbdIMTbIn#y;K&g8;?=o9K+JP8nB}-zZZs|bY`7i`rAKt3b>=a) zu;I~o3}PrRJXSj^J2NKATEjK2ROP#NZIYKXB9*QL&*B(ik>|0PDn=egWkE_)Wo(j{ZtNU(7ZxeFTOWbvb%7F+&{Z>DDNgbgh5#S}_~ASv4)!RXDf&Fb z?kmeNm&CUl6S1;e&eX2F!n_Y8&&9xmKV%}U$E*gB%rS`B6mVvIqBXE$|8U8XpVQxi zC8!V6PGXrt#U^8LIxTsF`zT_Lu6FAcdR<0vci4lFoGEkvBa@V06xQK6BBgpy!jaIK zz#*c3pNvt|IB0*@+i6|w8Y)q2@Z~ujuXhyESKf==wOiv7B5>VVcG}z7+>*$@f7gIO zX~?X6+kE=n%9Q6#VnJ29XGHvZ_f^HX4&%OrQ`j-u32&ecC$Ff`BQ56dE~)i$YV7wc zHS;F8pfZ=6@b3M1r*%fg;yp8*=hK_3Q{wvtrOoO*xg@w4sr=nop6Hm{u9`x0If zi;}9)BKPv^-uS;BzMd9%Gbd;<$>~g*)Fql_J_$yT{bd|LwdM&(%~*!ryjMoz`6 zBZ|kPq8#Lkz?iu;-DEGIJjUd)@8QlTgk)fl??a^@EMLuv9oPKq5bM!nQn|fYaj*yV zw>+b#vrPiS`yKd7a(wzz4TaScWyw7u3tij@+wMozp`e2CRE2=IfuggdZ5<~OgPOpA zT}_mxNLRHcq0O~UHD6xC2-G+3#eGIw|E#+|NjE+OprqDZ#qz+Cm2$J%lGcuGGImwwC3RVHLul8o4$$hbXLX?J7Jz>O^G zMMYvOy}W*6J>-$2q$ma9a(+?lQaGbc z?s}>-2acR?GxB-O6`B&Q>zHyRDlPQYE@Itnr_3?-vQ;z(VG1kXm-*~D{MM^6ZjZ|V zmH3rTs#I4;CkMfgdR=~?uW#Fi!i0}*wx96ZUF%=V(aL6fFN|1q!I)?<4m#N| zUq_wWdMtzP9ey9|hZt1Q9MYa%s1`&(U&V=-cOxlSpK$8U?FW+~SOj4}!ts8Z;~#!= zUvz?fw|H^1elX4ee{5%^y+45tYnSx1@hxJP{A`Bhp*tAS96>-+XdxPtor*zU>x&nj#acv@2n{>QaU=wqg)6 z)-+2`T1x8NFYUMKe!6#GZY$IPA7{MwioSj?Za+u4t$d)!{?tVMkz3yh2yiL}m}~O` z#kq<7Q7CE72FC ztT1rgb`@^Kh&*`egM9B-X?lG6x zPj)rRJfNt%;M=Eouj41aRk1_3+dvWbiE~O(0}fkM4g7T2v7~~1*2|SyhjmjiC%zB` zlVPu^q=%uW_lQEnOqWWeE)A-ph?%0dgLalRdY@ZL@20m9C8vV+3%xM?{QR zxRPYuIhX6f8b1+IzYDYX)U7xFPGN=krJRHUn!yLys07m&HqR3Xo+6JpT-#q%Z^b+? 
z#b$ngpLrdsuV>)D8qU8LlUK2%j3eCo(JEGoT(QOW@LXu`Y?SdLtoJw;gEz6YKHIDF zF;#-MuaQc{nrx|s|6y*Sbw$^PQC#^W;#Yhq$V>UA1kQfeq3YeApJy8wZFQsE(@E5j zM@Q@AhbwQq60$54uENssppWpk)dv_$QkU>UBqS)D0TIxi_hI)|SwS*Wh$z`QCk+UU5ZaH^4r15JRv@i$<&A* z&s{n)g9obvD!jU%W%(jqKDp^3$iSG2T6El==8oP&g^a}@QtB^FZ3=61{rw4L%!V2M>ZGg$; z`%^IXuorl1>Z%wgmdr}csRG_ zMDFO79joPB9#D(Av(^4@W=Csk6jZbr&L*x>_j6A6$ou{5X_xSSui$cY<^ousP95K3 zeKjE=YqxIA=Xxr^wV_1M4h+cIQa1^@W@IMS+@SR8l>Xa2StYB$%dTQiHm$4`sW-12 zdHl@WE|^p~{ZhL{@T;VmJTsdcYMdvPzqE5m&c?PT2)fLVCg{0cH1EA@1TKSwpvazB z)o!rFHCm4`m1={r#UN*dvsmcLA$2{15kpQ<#lGxJPnzAfH8$<9XoQdBo!*VvyrfThc97M2Uzq0-{;4KX1#;okZ=q&0FIykRNx)gyd?up#SGjo>21~nn zuxZn0Wgtr0KIa>%a(n!vBFSR!cppfL9cV43b>oZ3QPIxb- zM@?RQ$evh4)a}o-S+l?A#}Y&sHJ($GnPE^D8}h*ZT|^#ueb1hD{4e;3rURig)#H0n z0mbgfNbqwnbeER>&XskEA1^Fl-(1e8YT;D~RQCd%4zEa*r<@oZ^bnNadll zTZi?!BE`qZ%b*;)K%JPzZdN%;l~Hu76K!bhir zR+jW_ z`Cgc({}AgUbm4vTRA8jG@hLIhT)e!M3qjPlZucW=}r!?CrSG1$2Y}m&px}#2!=jB@aOy|Us~p|!Q^4C=sO7V;&TFJ za5?km*tNxB%To8@v-AQ6T+191Wx4gZw4&tp5b-CrHGyxH)Ev$led)^_)a&z&%52pX z8*hhl+^dbKP@#@j*&&Gbvru8ZK7t=j(dilpj?(%|{18%!>Y>8%BGcj{EFoMCxEGFJ zoSkwbDcSSARn?ikbRD}>67B8VZ`mDO&arUx^2o`9f-RsitI}*F^19^5Q!j%DND)mA zdkK>$`tVe#`Z{JGd^2k=lR7d?JH(2hG;an|rIk%mZMb%|?A32BjMqVqqR+K?2+p^- z5sw8t?C%IRBd+@1#Nwasbkcj#D!^?OdWr2DijsqA$(YhR=44ndc2#oBiEnn1+&=hF z!5Hb?KV}GJNOyOnfY?vbh%`xCLhvS4L6tuBs=d?N^HmwKcX<_MpRy6GHF3TG;Ifz zULABxK_|BIZK~dC>*#cct9=p?OSu_83I!ac^_xh+lS~R$7Pm7=9WdgVNQWv+Z28)mMgBMjvK9=lR)_lUn!AtlKrv0zqwHYnbxV3X4$GF zBBXb1eDkV{$eB;j&_g$cUfA5gk8DGJ;09y&wdtXX>kC~RgrdSS%Fa!5HkCm~-PNh; z4Y=Jm9xORfaw-&@VsZMmyaJ_)mnF`t$#2P~5|SgJIzo8+gk~zHsjQgd2!WF3qSFL( z^U8Edc{M1U)h_$@@1@*2A`dwmq{?5OV4v1ox0oi_WuuE@IS3` zVx{vZ6nt!_7BSy$&2BerbBew6s^LN`Wa-~+bG)J)=AB9U^Ra&o6B_!WzglD1&*J!8 zc4jS|JI66-N$oTr+hrNuHcQpqNS#^h`=Xj&z@yvWp#CRg<-6$tgNlA!5A)Gy2Q@T{^iRTGcz+L zCZ?dEAh^J%I?YFkJR8&+L7#Q;M@(KzHHa0ed zg@t>1db+#2#l*zc*VpOk>1AbQPnV$zvJ#*%alarIPvvW~V(Y0&Wyu7?dM@N7D{ORK2^8NdF zH8r)ncke!X_6&=~>gwvke8CL<`0)eIl9iQp?%cWj{QUFh&qIvCjfJSVb?cUnjt)$X 
zzP`S)va*nn5Sq3L9?u~|*Uso0?eD?l|F!(}{@@jU_V!_>elG{_4%UBtw!3!_qrX;u z@A%jE;iJD6*!+9>wd-G>{^!?&L;m{z;LCsQdhq7Y&+n~&ZTPpZzrK3#&$rEU4-f=@ z@XvF_&znKCYIMlb^|JeYhrMiB;t7`t3P(7ATt#%PBB*e51Z(UKy7d_3i0rtk zMqc^t4CBRPrxVfTLi(mCv!Od2-S~vq3{N4SVgh|uLfp*8R+Y`Dz$^T`!!n_01C2D} z8kC)yGtDgLJ?yIj!t_gc@Q))jDMEt<7wWbOan< z=*2qJ4VJjY(>Mb@<;&q5$#O~df z4{mJ4YS_-ou-GHoh5RUYoPvmR!B*oIiS+Og9l6x7=Ujc~>&SDTQzd;C!Snm)jdj>n7liupJE`RHOT3RHf{Z^CT_lnWU_e><5KH>{9 z%J+VDyqic$+?;zl=bT}PyysYUCRe35N+ZEDyr}LO9DkJNrG{k>n+3sIv4LcGGIryG zF0OM*am7PFr;hd&A-+0iF4tOcAHP~g;z!X*!mzRQJ&Y$JV1kcGc|YLdz@J)gNW?#R z$lUQ6Za0A*V$=kCbOkRipnSn1UB!B*X$EgVZBCqE=Tu&L5+^{^NCln zZt7y=RTA)vzjq=6mN!2yePX;Q^7HK(i_nbQbs?M`YmCLR9&B8Dyg#dvKCdrx{W-f~ z+Nc6=9@Ola#bd{4i#+k3>`|bGGN?-mbGaACoD?)&6lbI-gq zf5LzmSy-3SUSXBp2o*yX+V_gk2+mfAZ{6?+tDn!B#vVshD*>|8U99c9Unu!5slxL3 z?um^=A!)?!Q_%3{MPBDDz3Kw|s7s{ejUP#n=PUbVC%uc0_ElK4a$(CJOCNvULfYY- zaM_yulDIvs{8jg7anqK4MoGO7-d{5L#L;^gwS{MWeB@2Q4~JVnmySN+yIvyW>wo|A z^9Db*6W)hBPZm%v^p%)jaB3e3n0%G58xX40IciCKZB0FAy8^AdpSaiG6VeboU3)50 zeyes%`CG3-)U>T&GlNg_Ho+F^$phI!HgB;v0|=`f-Vc&H{j-~zJ~O>-O3HXQ5e5F- zjQgjoTn;?Of3PNYu-WdHHR)B4gx|qH&Hs}%0gC{F0nPvf415WQXn1%ym=93j1J3X7 zgEE2If#(3-0zDob9sMun1C;w8{m0+T2N)4BAK*N|;{5g;a5RuB5G`0h3qi9%gSol6 zK>|Ve!7I4CyMy?FYJ#SMYXOG=f}N0%0QLeT5DXCL{lkY3!3Kek0g(gQE-ftu`vUR? 
ztMT#iPoF+zW@ZM(1>*va;pow$_wU~q6%_?X0^)k(#tqP2&~H6GJrGFHZ!lZnLO|3( z!oib)2LiVNh5&^2#ful2nVA6h!GC~$gUiv- z(1?kN(bm=mui)n927U$HMs{{KEiG+tZ!ah}2>jKnSHZ-9wF27#M)26NW8h7U|=A)5U^ZEMn+)!o=`jcK@dFp7s>zkD*8)}|Lx{~koteW&;R|F zF`;We5d@F+k96N(H#3r;Ytq4n_;60B%;uWrXS=X#DwCdhOo=n3Bcm@Bpd;dej?wt?VnADM^C#auXm+chKf?3J4aGR7SL1j@PYt)rcS?q z6ce#!*5y*66a4+DO~!lK=;vQ-$uo-xw8v>K?WvvIE=qBWwPPkVFBA!(PpvVeC~m z%#@5Tbsym&);t^5_$lqRdbx4z)v1|1M*ksuFLe?DM6?T!>+Z(xhdQjN zFEVoWk?m_P+~_H)F-gy8v>6X~CiYnzvrw(OYvDOEROVgE={b#^J3pJ%7zmpy^r`G{ z;i%h6J}yjSgaOelT(d98qmhX1X~YO28lp?(DOX#UPJQcvyM)3;MWeX9uP9L}sO>3# zE1kDiBrT7`TYNQ75z)!=VYEL2g}dmCM^05lw2(z_89ILcrs9pTDj^QX7o_X7i&Y!l z&i>dVzT!twj_sT4i%#jBDyK@=$;KXz4MrXt`?k3~`(_GN88t8APAn$7w^uJAnl}-|g6O^e#GaZ>C7r^e3cD~g9V!5?3H&Ptr6TBag;PdIQrr%KJ zu|1SFK60%jKc6&-iu2Y2e-p3sIU(fs>4!C8v-g{advfB4pr)pKv{lkkr)Sb$b9&PK1A~u^Bs^1^ft{gx0Z>C#1E7;>U~>J%^zyag2iGD8Img)}G*K;FQ>0GJP;21NiALxBo4 zAt(#z6tEgV3b+P6f)W%M>+kOm*a!6ltpNc7>4Yj2$~I66kV_DS+1XjB)1W#9u>j=) zRRZY*g#t|kKJ)SMjf{*ufBqb#K~7E%%1bD2p&A5T6MgE~2lE&Izr^)#mHt=S{;Guj z_2d7CPgcx#d>JJMt+7a<)as_uk3#eA1sT2#mk8*KKy5D?Ym|HGomEQa*O9;tZp~z}gb4=TKbv zB+Xm)yl1?#H&0B9qPTV+oS0VlI!!gCOjVqz@gS7LQMuE+gH~9e@v4C^!zvw_Zv;Vu zy>feTCgX$933;Nb`Vbr0;B+e{*r{m zTDNoc1{raHiRksg*WcxI+Uhrw^femXf`xy=G@pfM%5A+u5)IqmUgbqg)4CP~vxII(u^Q6i z&wLbDJsUF2uKZm5w%r5jp`JlpUTLU-PvoaGRzv=hfk$K^n(MCK?yB+~QNBmGbKB*1 zWO9qhu>%|iO4Pfimjh#wbyG(Kdw)P$Z5O72l1r;6OcZw1jaxI0z84~B*33Hc zG@`NboO6gmwrZ5_&oJ=`l6z8z^< zN&;She*$tr>kIr)WMm{nIP`45SAl(mFa_)>7&YJm@D9=# z7RX`v?(eU_5g;J}Nq`lwkH8w}41w{5oq;KU49I`5TyS3Si;&wuL*N2n1%L=`GdMdy z0rFX0T^)=aFbm8Xz!Y`>8p06(CE#em$^G`S;P3#8;6A~8gX09E!Qf$Ra1cNw7<6du z02Tq$U{%3K0$Bh_fMYl**hF{_yav|@O)mf|00o*}fXLt13rvB=8}Jma7c4JuAD{-T zgy8`|fs(<&!O&iU)*YCB02Uw;d^Xr+;5*<9`jo(B;3M2{Fu{N-fLCpKo(W7($R9}k zfz81S{J<@Q(&5j4zgJ+xU&}#*=g;!zqklrx@2~wn&YypOZU3j8{yNs5*Ks;2eTa{h z-(>3Tcebn(#~@=g&&f*lt;#*VS6#`D%4TQ?9y6sIik(F7zaeMV6B8MnD)!;r;(p_o z#`ZXrhFooasa2B_qiGQKt&S*olWw3+5y6P##V27)uF2sgd?=Xz8q9Wx?#&G?yiG0~ 
z9UleVdrG<%O1f5xOqx%@XQHv%ftk8I2BOTliLAM|$&Iz}dw8%a!uc26vT~m#=*FI8 zJ#{2E>1{Y6lM7!@?s1mmL@qqYM*}aN3zmKw8h*}Z-fYct!NE3xwA))m*&-}#EYLw* z#pv`vu746wp1Tsgz*oJK=jz_L`axQp41H|RC!>2@aD_}oiJx$qaZ@(p5Xz1j zFU}wt?CFsq-(I2H=gfK0*Jv4C&Swhusp5APvloP7Jwx$RrQfW%kQF$WY2hf#5GNFB zQWRA*SQLvbLQis_2AXCCXOYEPZgRGn8#p#%@g}Y-p6{PMec#T73#3HJ&9lU)9>KR# zGRtm5^%}A8*NogCLCnTBL&wFxtbMyMHtKU#C--Rwd+T%O#m#lRVZ54skMYj?=9dFV zZ+d*!9CXgyjUM-G3e;^}AdTlk;+h<21DZ5!9$L4)JX_KJjFd_h`Ps0t^>dHgl$|?K z5wnytomZFO-TjoXK|PQu@5e{Jxu@Spk*=~O0{n;9s+VzoJO2Yq;7=G3@n0;l8zL!% zSdsiYOMnNEKwu=G&kNNqBp*~^5TOuUP!2=#LDGY=|4tD|GN|2t*Q$_h(9?%P&(qTr zj0e~%AP}TJq!gqYwDlo}z}rC5g8=|Dqp7J0i3lhHI|S|svJDDh;1;+SAPtxSFj3%_ z0s;ac+rWo_;ekX4ECC51`M|9K>HtU)5fRYo2K)eNARQrP0UKaiz`}v?fZPSNK<{(oss{!5jx+NFV&2ipB5T{@CYiRHuX?9LwafDe_j1&*pKc z70ryYF0Kz@GM?Ys-wz>)ep%stU-rCY(Wv|PfO6i+!!b>hD{MVwEG%g|MfdSUGsE-B zLqs$0eC?HD><}fBb5edPJpF_1T5AZ0MaP`%^a|T*YY2*Z<2(Hw!q7!a7E*$`cQrZG zr0O2|I?M%RyKlsN_(ON!4HmZ zR0>L}@f%c%-98}|RevhmXY>5YlPTtc`>XSDIl4hN0{r%mxZgSNL@~7>G52NVnPl$D z50W;PXZ*8fE63oPWsvHlm_}IX`g;a}B00ZKai$>SpI3{A3p!DZyhMr zrQwKUzazM`JCZU4)Ttx8yrGJo-s5{X9#Px={vqbaM31yiMKRLI*%Z!L$M_RE2kzfA zNNu8yKBLU1&ZAEdjpJ%T1P^Nx5uj=G9kv#eqOLE_AfAe(%GCB$*OB9fP>p1) z!l(kIK>87}C2|7&NZwdWc3_*Rf$(0=tI8iOgNhj*>^-f$!`RQ(MkWh^-H{WaLDy|= zAFfjDQ(QA~kzb|#&@_iq?0>*eaD@AiEL`hs`4dP08JH26 zL5L7AO@LFllTaiARKYGmy#`(hI)2b(1Kxt40{nvQ0+IZUO5iYsFAeMlT5Ipc)6CxABAK%tbm?Wws$NN-_62L&l7G&LnPYxy44PDsAQJYZ8%;SkP3CIy9R`EK9A z(Floq<%^RXkD{M7D2zC4tfIrp@VkYXutO-&WLFxLMxF$d2DF~258K!nN&R=oa93_J~n z+XNBH2AoQAZ4X#iIYHxBMB)VCmYpjhB(LGYkvh6lWsmX>gT zfes)i00O9cK}2DmKw<$9g@uJM$G`=+tuRSnB|h!J0Nj4x+4y!MQmi;RL5JJ0UyuCR zsoew@mRc2M34@J*#IVHubE&t_w3?2UzpSKM@*=)QOt0#Ze4bB3V9ZvMxS#;zh9xu# z#W`uj+G$SvlDs4KNJpmq+o!T=(~Vo-${+FeY1;JsO2CT{e0= undefined); }); puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => { @@ -581,9 +583,14 @@ export class CrawlerHost extends RPCHost { if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.READER_LM)) { const finalAutoSnapshot = 
await this.getFinalSnapshot(urlToCrawl, { - ...crawlOpts, engine: ENGINE_TYPE.AUTO + ...crawlOpts, + engine: crawlOpts?.engine || ENGINE_TYPE.AUTO, }, crawlerOpts); + if (!finalAutoSnapshot?.html) { + throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`); + } + if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) { const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined; yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot); @@ -628,18 +635,9 @@ export class CrawlerHost extends RPCHost { return; } - if (crawlOpts?.engine?.startsWith(ENGINE_TYPE.DIRECT)) { - const engine = crawlOpts?.engine; - try { - const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts); - yield snapshot; - - return; - } catch (err) { - if (!engine.endsWith('?')) { - throw err; - } - } + if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) { + yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts); + return; } let cache; @@ -658,6 +656,24 @@ export class CrawlerHost extends RPCHost { return; } + if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) { + const { digest } = this.getDomainProfileUrlDigest(urlToCrawl); + const domainProfile = await DomainProfile.fromFirestore(digest); + if (domainProfile?.engine === ENGINE_TYPE.DIRECT) { + try { + const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts); + + // Expect downstream code to "break" here if it's satisfied with the direct engine + yield snapshot; + if (crawlOpts?.engine === ENGINE_TYPE.AUTO) { + return; + } + } catch (err: any) { + this.logger.warn(`Failed to scrap ${urlToCrawl} with direct engine`, { err: marshalErrorLike(err) }); + } + } + } + try { if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) { for await (const x of this.puppeteerControl.scrap(urlToCrawl, 
crawlOpts)) { @@ -855,7 +871,7 @@ export class CrawlerHost extends RPCHost { } async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise { - const it = this.cachedScrap(url, { ...opts, engine: ENGINE_TYPE.BROWSER }, crawlerOptions); + const it = this.cachedScrap(url, opts, crawlerOptions); let lastSnapshot; let lastError; @@ -912,36 +928,54 @@ export class CrawlerHost extends RPCHost { return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs); } - async exploreDirectEngine(targetUrl: URL, crawlerOptions: ScrappingOptions, knownSnapshot: PageSnapshot) { - const snapshot = await this.curlControl.urlToSnapshot(targetUrl, crawlerOptions, true); + async exploreDirectEngine(knownSnapshot: PageSnapshot) { + const realUrl = new URL(knownSnapshot.href); + const { digest, path } = this.getDomainProfileUrlDigest(realUrl); + const profile = await DomainProfile.fromFirestore(digest); - const thisFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', snapshot); - const knownFormatted: FormattedPage = await this.snapshotFormatter.formatSnapshot('markdown', knownSnapshot); + if (!profile) { + const record = DomainProfile.from({ + _id: digest, + origin: realUrl.origin.toLowerCase(), + path, + triggerUrl: realUrl.href, + engine: knownSnapshot.htmlModifiedByJs ? 
ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT, + createdAt: new Date(), + expireAt: new Date(Date.now() + this.domainProfileRetentionMs), + }); + await DomainProfile.save(record); - let engine = ENGINE_TYPE.DIRECT; - if (!(thisFormatted.content && knownFormatted.content && - thisFormatted.content.trim() === knownFormatted.content.trim())) { - engine = ENGINE_TYPE.BROWSER; + return; } - const realUrl = new URL(knownSnapshot.href); - - const profile = (await DomainProfile.fromFirestoreQuery( - DomainProfile.COLLECTION - .where('domain', '==', targetUrl.origin.toLowerCase()) - .limit(1) - ))[0] || new DomainProfile(); - + if (profile.engine === ENGINE_TYPE.BROWSER) { + // Mixed engine, always use browser + return; + } profile.origin = realUrl.origin.toLowerCase(); - profile.triggerReason ??= 'Auto Explore'; profile.triggerUrl = realUrl.href; - profile.engine = engine; - profile.createdAt ??= new Date(); + profile.path = path; + profile.engine = knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT; profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs); await DomainProfile.save(profile); - return true; + return; + } + + getDomainProfileUrlDigest(url: URL) { + const pathname = url.pathname; + const pathVec = pathname.split('/'); + const parentPath = pathVec.slice(0, -1).join('/'); + + const finalPath = parentPath || pathname; + + const key = url.origin.toLocaleLowerCase() + finalPath; + + return { + digest: md5Hasher.hash(key), + path: finalPath, + }; } } diff --git a/backend/functions/src/db/domain-profile.ts b/backend/functions/src/db/domain-profile.ts index 02c693b..6e552c1 100644 --- a/backend/functions/src/db/domain-profile.ts +++ b/backend/functions/src/db/domain-profile.ts @@ -13,10 +13,7 @@ export class DomainProfile extends FirestoreRecord { @Prop({ required: true }) - origin!: string; - - @Prop({ required: true }) - triggerReason!: string; + path!: string; @Prop() triggerUrl?: string; diff --git 
a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts index 2f8f5ef..e2cfd41 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/backend/functions/src/dto/scrapping-options.ts @@ -439,7 +439,7 @@ export class CrawlerOptions extends AutoCastable { instance.engine = ENGINE_TYPE.BROWSER; instance.respondWith = CONTENT_FORMAT.VLM; } else if (instance.engine === ENGINE_TYPE.READER_LM) { - instance.engine = undefined; + instance.engine = ENGINE_TYPE.AUTO; instance.respondWith = CONTENT_FORMAT.READER_LM; } @@ -496,10 +496,6 @@ export class CrawlerOptions extends AutoCastable { instance.cacheTolerance = instance.cacheTolerance * 1000; } - if (instance.noCache || !instance.isTypicalRequest()) { - instance.engine ??= ENGINE_TYPE.BROWSER + '?'; - } - return instance; } @@ -544,13 +540,19 @@ export class CrawlerOptions extends AutoCastable { return !CONTENT_FORMAT_VALUES.has(this.respondWith); } - isTypicalRequest() { + browserIsNotRequired() { if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) { return false; } if (this.injectFrameScript?.length || this.injectPageScript?.length) { return false; } + if (this.waitForSelector?.length) { + return false; + } + if (this.withIframe || this.withShadowDom) { + return false; + } if (this.viewport) { return false; } diff --git a/backend/functions/src/services/curl.ts b/backend/functions/src/services/curl.ts index 898b836..ff5a5e8 100644 --- a/backend/functions/src/services/curl.ts +++ b/backend/functions/src/services/curl.ts @@ -2,11 +2,14 @@ import { marshalErrorLike } from 'civkit/lang'; import { AsyncService } from 'civkit/async-service'; import { singleton } from 'tsyringe'; -import { Curl, HeaderInfo } from 'node-libcurl'; +import { Curl, CurlFeature, HeaderInfo } from 'node-libcurl'; import { PageSnapshot, ScrappingOptions } from './puppeteer'; import { Logger } from '../shared/services/logger'; import { 
JSDomControl } from './jsdom'; -import { AssertionFailureError } from 'civkit'; +import { AssertionFailureError, FancyFile } from 'civkit'; +import { TempFileManager } from '../shared'; +import { readFile } from 'fs/promises'; +import { pathToFileURL } from 'url'; @singleton() export class CurlControl extends AsyncService { @@ -16,6 +19,7 @@ export class CurlControl extends AsyncService { constructor( protected globalLogger: Logger, protected jsdomControl: JSDomControl, + protected tempFileManager: TempFileManager, ) { super(...arguments); } @@ -26,25 +30,55 @@ export class CurlControl extends AsyncService { this.emit('ready'); } + curlImpersonateHeader(curl: Curl, headers?: object, chromeVersion: number = 132) { + const mixinHeaders = { + 'sch-ch-ua': `Not A(Brand";v="8", "Chromium";v="${chromeVersion}", "Google Chrome";v="${chromeVersion}"`, + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': 'Windows', + 'Upgrade-Insecure-Requests': '1', + 'User-Agent': `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${chromeVersion}.0.0.0 Safari/537.36`, + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'Sec-Fetch-Site': 'none', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-User': '?1', + 'Sec-Fetch-Dest': 'document', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'en-US,en;q=0.9', + }; + + curl.setOpt(Curl.option.HTTPHEADER, Object.entries({ ...mixinHeaders, ...headers }).map(([k, v]) => `${k}: ${v}`)); + + return curl; + } + async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise { + const snapshot = { + href: urlToCrawl.toString(), + html: '', + title: '', + text: '', + } as PageSnapshot; + const result = await new Promise<{ statusCode: number, - data: string, + data?: FancyFile, headers: Buffer | HeaderInfo[], }>((resolve, reject) => { const curl = new Curl(); + 
curl.enable(CurlFeature.StreamResponse); curl.setOpt('URL', urlToCrawl.toString()); curl.setOpt(Curl.option.FOLLOWLOCATION, true); - if (crawlOpts?.timeoutMs) { - curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts.timeoutMs); - } + curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(10_000, crawlOpts?.timeoutMs || 10_000)); + if (crawlOpts?.overrideUserAgent) { curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent); } - if (crawlOpts?.extraHeaders) { - curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`)); - } + + this.curlImpersonateHeader(curl, crawlOpts?.extraHeaders); + // if (crawlOpts?.extraHeaders) { + // curl.setOpt(Curl.option.HTTPHEADER, Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`)); + // } if (crawlOpts?.proxyUrl) { curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl); } @@ -56,35 +90,82 @@ export class CurlControl extends AsyncService { curl.setOpt(Curl.option.REFERER, crawlOpts.referer); } - curl.on('end', (statusCode, data, headers) => { + curl.on('end', (statusCode, _data, headers) => { this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers }); - resolve({ - statusCode, - data: data.toString(), - headers, - }); curl.close(); }); curl.on('error', (err) => { - this.logger.warn(`Failed to curl ${urlToCrawl}`, { err: marshalErrorLike(err) }); curl.close(); + this.logger.warn(`Curl ${urlToCrawl}: ${err} (Not necessarily an error)`, { err: marshalErrorLike(err) }); reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`)); }); + curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB + let status = -1; + let contentType = ''; + curl.on('stream', (stream, statusCode, headers) => { + status = statusCode; + outerLoop: + for (const headerVec of headers) { + for (const [k, v] of Object.entries(headerVec)) { + if (k.toLowerCase() === 'content-type') { + contentType = v.toLowerCase(); + break outerLoop; + } + } + } + + if 
(!contentType) { + reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: no content-type`)); + stream.destroy(); + return; + } + if (contentType.startsWith('image/')) { + snapshot.html = `${urlToCrawl.origin}${urlToCrawl.pathname}`; + stream.destroy(); + resolve({ + statusCode: status, + headers, + }); + return; + } + + const fpath = this.tempFileManager.alloc(); + const fancyFile = FancyFile.auto(stream, fpath); + this.tempFileManager.bindPathTo(fancyFile, fpath); + resolve({ + statusCode: status, + data: fancyFile, + headers, + }); + }); curl.perform(); }); if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) { - throw new AssertionFailureError(`Failed to directly access ${urlToCrawl}: HTTP ${result.statusCode}`); + throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`); } - const snapshot = { - href: urlToCrawl.toString(), - html: result.data, - title: '', - text: '', - } as PageSnapshot; + if (result.data) { + const mimeType: string = await result.data.mimeType; + if (mimeType.startsWith('text/html')) { + if ((await result.data.size) > 1024 * 1024 * 32) { + throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`); + } + snapshot.html = await readFile(await result.data.filePath, { encoding: 'utf-8' }); + } else if (mimeType.startsWith('text/') || mimeType.startsWith('application/json')) { + if ((await result.data.size) > 1024 * 1024 * 32) { + throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`); + } + snapshot.text = await readFile(await result.data.filePath, { encoding: 'utf-8' }); + snapshot.html = `
${snapshot.text}
`; + } else if (mimeType.startsWith('application/pdf')) { + snapshot.pdfs = [pathToFileURL(await result.data.filePath).href]; + } else { + throw new AssertionFailureError(`Failed to access ${urlToCrawl}: unexpected type ${mimeType}`); + } + } const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts); diff --git a/backend/functions/src/services/pdf-extract.ts b/backend/functions/src/services/pdf-extract.ts index 396ef97..d6d2abe 100644 --- a/backend/functions/src/services/pdf-extract.ts +++ b/backend/functions/src/services/pdf-extract.ts @@ -266,12 +266,12 @@ export class PDFExtractor extends AsyncService { return { meta: meta.info as Record, content: mdChunks.join(''), text: rawChunks.join('') }; } - async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24) { + async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) { if (!url) { return undefined; } - - const digest = md5Hasher.hash(url.toString()); + const nameUrl = alternativeUrl || url.toString(); + const digest = md5Hasher.hash(nameUrl); const data = url; if (typeof url === 'string' && this.isDataUrl(url)) { @@ -283,8 +283,8 @@ export class PDFExtractor extends AsyncService { if (cache) { const age = Date.now() - cache?.createdAt.valueOf(); const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance); - this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${url}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, { - url, digest, age, stale, cacheTolerance + this.logger.info(`${stale ? 
'Stale cache exists' : 'Cache hit'} for PDF ${nameUrl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, { + data: url, url: nameUrl, digest, age, stale, cacheTolerance }); if (!stale) { @@ -306,7 +306,7 @@ export class PDFExtractor extends AsyncService { text: cached.text }; } catch (err) { - this.logger.warn(`Unable to load cached content for ${url}`, { err }); + this.logger.warn(`Unable to load cached content for ${nameUrl}`, { err }); return undefined; } @@ -324,17 +324,17 @@ export class PDFExtractor extends AsyncService { PDFContent.save( PDFContent.from({ _id: theID, - src: url.toString(), + src: nameUrl, meta: extracted?.meta || {}, urlDigest: digest, createdAt: new Date(), expireAt: new Date(Date.now() + this.cacheRetentionMs) }).degradeForFireStore() ).catch((r) => { - this.logger.warn(`Unable to cache PDF content for ${url}`, { err: r }); + this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r }); }); } catch (err) { - this.logger.warn(`Unable to extract from pdf ${url}`, { err }); + this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err }); } return extracted; diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 5361aed..2e40689 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -48,6 +48,7 @@ export interface PageSnapshot { href: string; rebase?: string; html: string; + htmlModifiedByJs?: boolean; shadowExpanded?: string; text: string; status?: number; @@ -369,7 +370,9 @@ function shadowDomPresent(rootElement = document.documentElement) { return false; } +let initialHTML; function giveSnapshot(stopActiveSnapshot) { + initialHTML ??= document.documentElement?.outerHTML; if (stopActiveSnapshot) { window.haltSnapshot = true; } @@ -385,6 +388,7 @@ function giveSnapshot(stopActiveSnapshot) { description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? 
'', href: document.location.href, html: document.documentElement?.outerHTML, + htmlModifiedByJs: false, text: document.body?.innerText, shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined, parsed: parsed, @@ -392,6 +396,9 @@ function giveSnapshot(stopActiveSnapshot) { maxElemDepth: domAnalysis.maxDepth, elemCount: domAnalysis.elementCount, }; + if (initialHTML) { + r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded; + } if (document.baseURI !== r.href) { r.rebase = document.baseURI; } @@ -448,6 +455,7 @@ export class PuppeteerControl extends AsyncService { finalizerMap = new WeakMap>(); snMap = new WeakMap(); livePages = new Set(); + pagePhase = new WeakMap(); lastPageCratedAt: number = 0; rpsCap: number = 500; @@ -491,7 +499,8 @@ export class PuppeteerControl extends AsyncService { } } this.browser = await puppeteer.launch({ - timeout: 10_000 + timeout: 10_000, + args: ['--disable-dev-shm-usage'] }).catch((err: any) => { this.logger.error(`Unknown firebase issue, just die fast.`, { err }); process.nextTick(() => { @@ -611,7 +620,14 @@ export class PuppeteerControl extends AsyncService { const dt = Math.ceil((Date.now() - t0) / 1000); const rps = reqCounter / dt; // console.log(`rps: ${rps}`); + const pagePhase = this.pagePhase.get(page); + if (pagePhase === 'background') { + if (rps > 10 || reqCounter > 1000) { + halt = true; + return req.abort('blockedbyclient', 1000); + } + } if (reqCounter > 1000) { if (rps > 60 || reqCounter > 2000) { page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests` }); @@ -676,6 +692,7 @@ export class PuppeteerControl extends AsyncService { this.logger.info(`Page ${sn} created.`); this.lastPageCratedAt = Date.now(); this.livePages.add(page); + this.pagePhase.set(page, 'idle'); return page; } @@ -717,7 +734,6 @@ export class PuppeteerControl extends AsyncService { } const sn = this.snMap.get(page); this.logger.info(`Closing page ${sn}`); - 
this.livePages.delete(page); await Promise.race([ (async () => { const ctx = page.browserContext(); @@ -731,6 +747,8 @@ export class PuppeteerControl extends AsyncService { ]).catch((err) => { this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) }); }); + this.livePages.delete(page); + this.pagePhase.delete(page); } async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator { @@ -743,6 +761,7 @@ export class PuppeteerControl extends AsyncService { const pdfUrls: string[] = []; let navigationResponse: HTTPResponse | undefined; const page = await this.getNextPage(); + this.pagePhase.set(page, 'active'); page.on('response', (resp) => { if (resp.request().isNavigationRequest()) { navigationResponse = resp; @@ -802,8 +821,6 @@ export class PuppeteerControl extends AsyncService { } const sn = this.snMap.get(page); this.logger.info(`Page ${sn}: Scraping ${url}`, { url }); - - this.logger.info(`Locale setting: ${options?.locale}`); if (options?.locale) { // Add headers via request interception to walk around this bug // https://github.com/puppeteer/puppeteer/issues/10235 @@ -896,6 +913,10 @@ export class PuppeteerControl extends AsyncService { page.on('snapshot', hdl); page.once('abuse', (event: any) => { this.emit('abuse', { ...event, url: parsedUrl }); + if (snapshot?.href && parsedUrl.href !== snapshot.href) { + this.emit('abuse', { ...event, url: snapshot.href }); + } + nextSnapshotDeferred.reject( new SecurityCompromiseError(`Abuse detected: ${event.reason}`) ); @@ -1071,6 +1092,7 @@ export class PuppeteerControl extends AsyncService { } } } finally { + this.pagePhase.set(page, 'background'); (waitForPromise ? 
Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => { page.off('snapshot', hdl); this.ditchPage(page); diff --git a/backend/functions/src/services/snapshot-formatter.ts b/backend/functions/src/services/snapshot-formatter.ts index 5743738..0601ac9 100644 --- a/backend/functions/src/services/snapshot-formatter.ts +++ b/backend/functions/src/services/snapshot-formatter.ts @@ -152,7 +152,8 @@ export class SnapshotFormatter extends AsyncService { // in case of Google Web Cache content if (snapshot.pdfs?.length && (!snapshot.title || snapshot.title.startsWith('cache:'))) { const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0], - this.threadLocal.get('cacheTolerance') + this.threadLocal.get('cacheTolerance'), + snapshot.pdfs[0].startsWith('http') ? undefined : snapshot.href, ); if (pdf) { pdfMode = true; diff --git a/backend/functions/src/stand-alone/crawl.ts b/backend/functions/src/stand-alone/crawl.ts new file mode 100644 index 0000000..589ded2 --- /dev/null +++ b/backend/functions/src/stand-alone/crawl.ts @@ -0,0 +1,151 @@ +import 'reflect-metadata'; +import { container, singleton } from 'tsyringe'; +import { initializeApp, applicationDefault } from 'firebase-admin/app'; + +process.env['FIREBASE_CONFIG'] ??= JSON.stringify({ + projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc', + storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`, + credential: applicationDefault(), +}); + +initializeApp(); + + +import { Logger, CloudFunctionRegistry } from '../shared'; +import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc'; +import { ExpressServer } from 'civkit/civ-rpc/express'; +import http2 from 'http2'; +import { CrawlerHost } from '../cloud-functions/crawler'; +import { FsWalk, WalkOutEntity } from 'civkit/fswalk'; +import path from 'path'; +import fs from 'fs'; +import { mimeOfExt } from 'civkit/mime'; +import { NextFunction, Request, Response } from 'express'; + 
+process.on('unhandledRejection', (err) => {
+    console.error('Unhandled rejection', err);
+});
+
+process.on('uncaughtException', (err) => {
+    console.log('Uncaught exception', err);
+
+    // Looks like Firebase runtime does not handle error properly.
+    // Make sure to quit the process.
+    console.error('Uncaught exception, process quit.');
+    process.nextTick(() => process.exit(1));
+});
+
+/**
+ * Stand-alone Express server exposing the `crawl` RPC handler.
+ *
+ * Besides the RPC shim it serves the files found under the `public`
+ * directory and an OpenAPI description at `/openapi.json`.
+ */
+@singleton()
+export class CrawlStandAloneServer extends ExpressServer {
+    logger = this.globalLogger.child({ service: this.constructor.name });
+
+    /** The previous `httpServer`, kept once `h2c()` has replaced it. */
+    httpAlternativeServer?: typeof this['httpServer'];
+    /** Files under `public`, keyed by their path relative to that directory. */
+    assets = new Map<string, WalkOutEntity>();
+
+    constructor(
+        protected globalLogger: Logger,
+        protected registry: CloudFunctionRegistry,
+        protected crawlerHost: CrawlerHost,
+    ) {
+        super(...arguments);
+
+        // Not awaited; a rejection is swallowed instead of failing construction.
+        registry.allHandsOnDeck().catch(() => void 0);
+        registry.title = 'reader';
+        registry.version = '0.1.0';
+    }
+
+    /**
+     * Replaces the primary server with one created by `http2.createServer`
+     * (HTTP/2 without TLS) and keeps the previous one as
+     * `httpAlternativeServer`, which `listen()` binds to `port + 1`.
+     *
+     * @returns This instance, for chaining.
+     */
+    h2c() {
+        this.httpAlternativeServer = this.httpServer;
+        this.httpServer = http2.createServer(this.expressApp);
+        // useResourceBasedDefaultTracker();
+
+        return this;
+    }
+
+    /** Indexes the static assets before running the base initialisation. */
+    override async init() {
+        await this.walkForAssets();
+        await super.init();
+    }
+
+    /** Fills `assets` with every regular file found under `../../public`. */
+    async walkForAssets() {
+        const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));
+
+        for (const file of files) {
+            if (file.type !== 'file') {
+                continue;
+            }
+            this.assets.set(file.relativePath.toString(), file);
+        }
+    }
+
+    /**
+     * Builds a middleware that streams an indexed asset when the request URL
+     * (minus its leading slash) matches one, and defers to `next` otherwise.
+     */
+    makeAssetsServingController() {
+        return (req: Request, res: Response, next: NextFunction) => {
+            const requestPath = req.url;
+            const file = requestPath.slice(1);
+            if (!file) {
+                return next();
+            }
+
+            const asset = this.assets.get(file);
+            if (asset?.type !== 'file') {
+                return next();
+            }
+            res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
+            res.set('Content-Length', asset.stats.size.toString());
+
+            const stream = fs.createReadStream(asset.path);
+            // An 'error' event without a listener becomes an uncaught exception,
+            // and the handler registered above exits the process on those.
+            stream.once('error', (err) => {
+                if (res.headersSent) {
+                    res.destroy(err);
+                    return;
+                }
+                res.removeHeader('Content-Type');
+                res.removeHeader('Content-Length');
+                next(err);
+            });
+            stream.pipe(res);
+
+            return;
+        };
+    }
+
+    /**
+     * Starts the primary server on `port` and, when `h2c()` has been called,
+     * the alternative server on `port + 1`.
+     */
+    override listen(port: number) {
+        const r = super.listen(port);
+        if (this.httpAlternativeServer) {
+            const altPort = port + 1;
+            this.httpAlternativeServer.listen(altPort, () => {
+                this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
+            });
+        }
+
+        return r;
+    }
+
+    /** Mounts `/openapi.json`, the asset controller and the `crawl` RPC shim. */
+    override registerRoutes(): void {
+
+        const openAPIManager = new OpenAPIManager();
+        openAPIManager.document('/{url}', ['get', 'post'], this.registry.conf.get('crawl')!);
+        const openapiJsonPath = '/openapi.json';
+        this.expressRootRouter.get(openapiJsonPath, (req, res) => {
+            const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
+            baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, '');
+            baseURL.search = '';
+            const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
+                info: {
+                    title: this.registry.title,
+                    description: `${this.registry.title} openAPI documentations`,
+                    'x-logo': {
+                        url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
+                    }
+                }
+            }, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
+            res.statusCode = 200;
+            // res.end() does not pick a Content-Type on its own.
+            res.type('application/json');
+            res.end(JSON.stringify(content));
+        });
+
+        this.expressRootRouter.use('/', ...this.registry.expressMiddlewares, this.makeAssetsServingController(), this.registry.makeShimController('crawl'));
+    }
+
+    /** Installs the async-hook, health-check, request-log and docs features, then the routes. */
+    protected override featureSelect(): void {
+        this.insertAsyncHookMiddleware();
+        this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
+        this.insertLogRequestsMiddleware();
+        this.registerOpenAPIDocsRoutes('/docs');
+
+        this.registerRoutes();
+    }
+}
+const instance = container.resolve(CrawlStandAloneServer);
+
+export default instance;
+
+instance.serviceReady()
+    .then((s) => s.listen(parseInt(process.env.PORT || '', 10) || 3000))
+    .catch((err) => {
+        // The unhandledRejection handler above only logs, so a failed start-up
+        // would otherwise leave a process that never listens.
+        console.error('Failed to start server', err);
+        process.exit(1);
+    });
diff --git a/backend/functions/src/stand-alone/search.ts b/backend/functions/src/stand-alone/search.ts
new file mode 100644
index 0000000..ab04d5f
--- /dev/null
+++
b/backend/functions/src/stand-alone/search.ts
@@ -0,0 +1,201 @@
+import 'reflect-metadata';
+import { container, singleton } from 'tsyringe';
+import { initializeApp, applicationDefault } from 'firebase-admin/app';
+
+process.env['FIREBASE_CONFIG'] ??= JSON.stringify({
+    projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc',
+    storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`,
+    credential: applicationDefault(),
+});
+
+initializeApp();
+
+
+import { Logger, CloudFunctionRegistry } from '../shared';
+import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
+import { ExpressServer } from 'civkit/civ-rpc/express';
+import http2 from 'http2';
+import { SearcherHost } from '../cloud-functions/searcher';
+import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
+import path from 'path';
+import fs from 'fs';
+import { mimeOfExt } from 'civkit/mime';
+import { NextFunction, Request, Response } from 'express';
+
+process.on('unhandledRejection', (err) => {
+    console.error('Unhandled rejection', err);
+});
+
+process.on('uncaughtException', (err) => {
+    console.log('Uncaught exception', err);
+
+    // Looks like Firebase runtime does not handle error properly.
+    // Make sure to quit the process.
+    console.error('Uncaught exception, process quit.');
+    process.nextTick(() => process.exit(1));
+});
+
+/**
+ * Stand-alone Express server exposing the `search` RPC handler.
+ *
+ * Besides the RPC shim it serves the files found under the `public`
+ * directory and an OpenAPI description at `/openapi.json`.
+ */
+@singleton()
+export class SearchStandAloneServer extends ExpressServer {
+    logger = this.globalLogger.child({ service: this.constructor.name });
+
+    /** The previous `httpServer`, kept once `h2c()` has replaced it. */
+    httpAlternativeServer?: typeof this['httpServer'];
+    /** Files under `public`, keyed by their path relative to that directory. */
+    assets = new Map<string, WalkOutEntity>();
+
+    constructor(
+        protected globalLogger: Logger,
+        protected registry: CloudFunctionRegistry,
+        protected searcherHost: SearcherHost,
+    ) {
+        super(...arguments);
+
+        // Not awaited; a rejection is swallowed instead of failing construction.
+        registry.allHandsOnDeck().catch(() => void 0);
+        registry.title = 'reader';
+        registry.version = '0.1.0';
+    }
+
+    /**
+     * Replaces the primary server with one created by `http2.createServer`
+     * (HTTP/2 without TLS) and keeps the previous one as
+     * `httpAlternativeServer`, which `listen()` binds to `port + 1`.
+     *
+     * @returns This instance, for chaining.
+     */
+    h2c() {
+        this.httpAlternativeServer = this.httpServer;
+        this.httpServer = http2.createServer(this.expressApp);
+        // useResourceBasedDefaultTracker();
+
+        return this;
+    }
+
+    /** Indexes the static assets before running the base initialisation. */
+    override async init() {
+        await this.walkForAssets();
+        await super.init();
+    }
+
+    /** Fills `assets` with every regular file found under `../../public`. */
+    async walkForAssets() {
+        const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));
+
+        for (const file of files) {
+            if (file.type !== 'file') {
+                continue;
+            }
+            this.assets.set(file.relativePath.toString(), file);
+        }
+    }
+
+    /**
+     * Builds a middleware that streams an indexed asset when the request URL
+     * (minus its leading slash) matches one, and defers to `next` otherwise.
+     */
+    makeAssetsServingController() {
+        return (req: Request, res: Response, next: NextFunction) => {
+            const requestPath = req.url;
+            const file = requestPath.slice(1);
+            if (!file) {
+                return next();
+            }
+
+            const asset = this.assets.get(file);
+            if (asset?.type !== 'file') {
+                return next();
+            }
+            res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
+            res.set('Content-Length', asset.stats.size.toString());
+
+            const stream = fs.createReadStream(asset.path);
+            // An 'error' event without a listener becomes an uncaught exception,
+            // and the handler registered above exits the process on those.
+            stream.once('error', (err) => {
+                if (res.headersSent) {
+                    res.destroy(err);
+                    return;
+                }
+                res.removeHeader('Content-Type');
+                res.removeHeader('Content-Length');
+                next(err);
+            });
+            stream.pipe(res);
+
+            return;
+        };
+    }
+
+    /**
+     * Starts the primary server on `port` and, when `h2c()` has been called,
+     * the alternative server on `port + 1`.
+     */
+    override listen(port: number) {
+        const r = super.listen(port);
+        if (this.httpAlternativeServer) {
+            const altPort = port + 1;
+            this.httpAlternativeServer.listen(altPort, () => {
+                this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
+            });
+        }
+
+        return r;
+    }
+
+    /** Mounts `/openapi.json`, the asset controller and the `search` RPC shim. */
+    override registerRoutes(): void {
+
+        const openAPIManager = new OpenAPIManager();
+        openAPIManager.document('/{q}', ['get', 'post'], this.registry.conf.get('search')!);
+        const openapiJsonPath = '/openapi.json';
+        this.expressRootRouter.get(openapiJsonPath, (req, res) => {
+            const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
+            baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, '');
+            baseURL.search = '';
+            const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
+                info: {
+                    title: this.registry.title,
+                    description: `${this.registry.title} openAPI documentations`,
+                    'x-logo': {
+                        url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
+                    }
+                }
+            }, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
+            res.statusCode = 200;
+            // res.end() does not pick a Content-Type on its own.
+            res.type('application/json');
+            res.end(JSON.stringify(content));
+        });
+
+        this.expressRootRouter.use('/', ...this.registry.expressMiddlewares, this.makeAssetsServingController(), this.registry.makeShimController('search'));
+    }
+
+    /** Installs the async-hook, health-check, request-log and docs features, then the routes. */
+    protected override featureSelect(): void {
+        this.insertAsyncHookMiddleware();
+        this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
+        this.insertLogRequestsMiddleware();
+        this.registerOpenAPIDocsRoutes('/docs');
+
+        this.registerRoutes();
+    }
+}
+const instance = container.resolve(SearchStandAloneServer);
+
+export default instance;
+
+instance.serviceReady()
+    .then((s) => s.listen(parseInt(process.env.PORT || '', 10) || 3000))
+    .catch((err) => {
+        // The unhandledRejection handler above only logs, so a failed start-up
+        // would otherwise leave a process that never listens.
+        console.error('Failed to start server', err);
+        process.exit(1);
+    });