From ec9f0826ac00a210103008efd3cb25f3d4a05b86 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Thu, 17 Apr 2025 10:10:58 +0800 Subject: [PATCH 1/9] fix --- src/api/serp.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/api/serp.ts b/src/api/serp.ts index 292c05f..e11fc6c 100644 --- a/src/api/serp.ts +++ b/src/api/serp.ts @@ -133,7 +133,7 @@ export class SerpHost extends RPCHost { @Param('num', { validate: (v: number) => v >= 0 && v <= 20 }) num?: number, @Param('gl', { validate: (v: string) => WORLD_COUNTRY_CODES.includes(v?.toLowerCase()) }) gl?: string, - @Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) hl?: string, + @Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) _hl?: string, @Param('location') location?: string, @Param('page') page?: number, @Param('fallback') fallback?: boolean, @@ -294,7 +294,7 @@ export class SerpHost extends RPCHost { q, num, gl, - hl, + // hl, location, page, }, crawlerOptions); @@ -324,7 +324,7 @@ export class SerpHost extends RPCHost { q: realQuery, num, gl, - hl, + // hl, location, }, crawlerOptions); if (results?.length) { @@ -341,7 +341,7 @@ export class SerpHost extends RPCHost { q: realQuery, num, gl, - hl, + // hl, location, }, crawlerOptions); } From c795cdb7b33c2882695b7b09bd3c66fd4365f8d2 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Thu, 17 Apr 2025 22:04:06 +0800 Subject: [PATCH 2/9] fix: the other way of setting charset in html --- src/services/snapshot-formatter.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/services/snapshot-formatter.ts b/src/services/snapshot-formatter.ts index fbc06b5..3064a88 100644 --- a/src/services/snapshot-formatter.ts +++ b/src/services/snapshot-formatter.ts @@ -827,7 +827,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; throw new AssertionFailureError(`Failed to access ${url}: file too large`); } snapshot.html = await readFile(await file.filePath, encoding); - const innerCharset = snapshot.html.slice(0, 1024).match(/]+text\/html;\s*?charset=([^>"]+)\"/i)?.[1]?.toLowerCase(); + let innerCharset; + const peek = snapshot.html.slice(0, 1024); + innerCharset ??= peek.match(/]+text\/html;\s*?charset=([^>"]+)/i)?.[1]?.toLowerCase(); + innerCharset ??= peek.match(/]+charset="([^>"]+)\"/i)?.[1]?.toLowerCase(); if (innerCharset && innerCharset !== encoding) { snapshot.html = await readFile(await file.filePath, innerCharset); } From d2afa9ddc256861dd60ee8a3bcef9be99f0c4194 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Thu, 17 Apr 2025 22:08:17 +0800 Subject: [PATCH 3/9] fix: timeout respect --- src/api/crawler.ts | 12 +- src/api/searcher-serper.ts | 5 - src/dto/crawler-options.ts | 5 +- src/services/puppeteer.ts | 469 ++++--------------------------------- 4 files changed, 63 insertions(+), 428 deletions(-) diff --git a/src/api/crawler.ts b/src/api/crawler.ts index d0826f4..2788755 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -116,6 +116,10 @@ export class CrawlerHost extends RPCHost { if (snapshot.isIntermediate) { return; } + if (!snapshot.lastMutationIdle) { + // Never reached mutationIdle, presumably too short timeout + return; + } if (options.locale) { Reflect.set(snapshot, 'locale', options.locale); } @@ -313,7 +317,6 @@ export class CrawlerHost extends RPCHost { throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`); } } - const crawlOpts = await this.configure(crawlerOptions); if (crawlerOptions.robotsTxt) { await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt); @@ -461,7 +464,6 @@ export class CrawlerHost extends RPCHost { } throw new AssertionFailureError(`No content available for URL ${targetUrl}`); } - const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, crawlerOptions); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { @@ -798,6 +800,8 @@ export class CrawlerHost extends RPCHost { } if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && !this.knownUrlThatSideLoadingWouldCrashTheBrowser(urlToCrawl)) { + const sideLoadSnapshotPermitted = crawlerOpts?.browserIsNotRequired() && + [RESPOND_TIMING.HTML, RESPOND_TIMING.VISIBLE_CONTENT].includes(crawlerOpts.presumedRespondTiming); try { const altOpts = { ...crawlOpts }; let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ? @@ -832,7 +836,7 @@ export class CrawlerHost extends RPCHost { let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html); draftSnapshot.title ??= analyzed.title; draftSnapshot.isIntermediate = true; - if (crawlerOpts?.browserIsNotRequired()) { + if (sideLoadSnapshotPermitted) { yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts); } let fallbackProxyIsUsed = false; @@ -858,7 +862,7 @@ export class CrawlerHost extends RPCHost { analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html); if (proxyLoaded.status === 200 || analyzed.tokens >= 200) { proxySnapshot.isIntermediate = true; - if (crawlerOpts?.browserIsNotRequired()) { + if (sideLoadSnapshotPermitted) { yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts); } sideLoaded = proxyLoaded; diff --git a/src/api/searcher-serper.ts b/src/api/searcher-serper.ts index 8314c8f..8cb8c67 100644 --- a/src/api/searcher-serper.ts +++ b/src/api/searcher-serper.ts @@ -318,11 +318,6 @@ export class SearcherHost extends RPCHost { throw new AssertionFailureError(`No search results available for query ${searchQuery}`); } - if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) { - delete crawlOpts.timeoutMs; - } - - let lastScrapped: any[] | undefined; const targetResultCount = crawlWithoutContent ? count : count + 2; const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x)); diff --git a/src/dto/crawler-options.ts b/src/dto/crawler-options.ts index 2004706..01dea59 100644 --- a/src/dto/crawler-options.ts +++ b/src/dto/crawler-options.ts @@ -655,8 +655,11 @@ export class CrawlerOptions extends AutoCastable { if (this.respondWith.includes('lm')) { return false; } + if (this.withIframe) { + return false; + } - return false; + return !snapshot.isIntermediate; } isCacheQueryApplicable() { diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index 67840af..5621a1a 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -846,7 +846,6 @@ export class PuppeteerControl extends AsyncService { async *scrap(parsedUrl: URL, options: ScrappingOptions = {}): AsyncGenerator { // parsedUrl.search = ''; const url = parsedUrl.toString(); - let snapshot: PageSnapshot | undefined; let screenshot: Buffer | undefined; let pageshot: Buffer | undefined; @@ -1097,7 +1096,7 @@ export class PuppeteerControl extends AsyncService { nextSnapshotDeferred.promise.finally(() => { this.off('crippled', crippleListener); }); - let finalized = false; + let successfullyDone = false; const hdl = (s: any) => { if (snapshot === s) { return; @@ -1143,6 +1142,39 @@ export class PuppeteerControl extends AsyncService { goToOptions.referer = options.referer; } + let waitForPromise: Promise | undefined; + let finalizationPromise: Promise | undefined; + const doFinalization = async () => { + if (!waitForPromise) { + successfullyDone = true; + } + try { + const pSubFrameSnapshots = this.snapshotChildFrames(page); + snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; + screenshot = (await this.takeScreenShot(page)) || screenshot; + pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot; + if (snapshot) { + snapshot.childFrames = await pSubFrameSnapshots; + } + } catch (err: any) { + this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err }); + } + if (!snapshot?.html) { + return; + } + + this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href }); + this.emit( + 'crawled', + { + ...snapshot, + status: navigationResponse?.status(), + statusText: navigationResponse?.statusText(), + pdfs: _.uniq(pdfUrls), screenshot, pageshot, + }, + { ...options, url: parsedUrl } + ); + }; const delayPromise = delay(timeout); const gotoPromise = page.goto(url, goToOptions) .catch((err) => { @@ -1170,50 +1202,14 @@ export class PuppeteerControl extends AsyncService { // Calling evaluate directly may stall the process. if (!snapshot) { if (stuff instanceof Error) { - finalized = true; throw stuff; } } await Promise.race([Promise.allSettled([...pageScriptEvaluations, ...frameScriptEvaluations]), delayPromise]) .catch(() => void 0); - try { - const pSubFrameSnapshots = this.snapshotChildFrames(page); - snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = (await this.takeScreenShot(page)) || screenshot; - pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot; - if (snapshot) { - snapshot.childFrames = await pSubFrameSnapshots; - } - } catch (err: any) { - this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err }); - if (stuff instanceof Error) { - finalized = true; - throw stuff; - } - } - if (!snapshot?.html) { - if (stuff instanceof Error) { - finalized = true; - throw stuff; - } - } - - finalized = true; - if (snapshot?.html) { - this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href }); - this.emit( - 'crawled', - { - ...snapshot, - status: navigationResponse?.status(), - statusText: navigationResponse?.statusText(), - pdfs: _.uniq(pdfUrls), screenshot, pageshot, - }, - { ...options, url: parsedUrl } - ); - } + finalizationPromise = doFinalization(); + return stuff; }); - let waitForPromise: Promise | undefined; if (options.waitForSelector) { const t0 = Date.now(); waitForPromise = nextSnapshotDeferred.promise.then(() => { @@ -1224,19 +1220,12 @@ export class PuppeteerControl extends AsyncService { const p = (Array.isArray(options.waitForSelector) ? Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) : page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout })) - .then(async () => { - const pSubFrameSnapshots = this.snapshotChildFrames(page); - snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = (await this.takeScreenShot(page)) || screenshot; - pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot; - if (snapshot) { - snapshot.childFrames = await pSubFrameSnapshots; - } - finalized = true; + .then(() => { + successfullyDone = true; + finalizationPromise = doFinalization(); }) .catch((err) => { this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err }); - waitForPromise = undefined; }); return p as any; }); @@ -1254,11 +1243,8 @@ export class PuppeteerControl extends AsyncService { } let error; await Promise.race(ckpt).catch((err) => error = err); - if (finalized && !error) { + if (successfullyDone && !error) { if (!snapshot && !screenshot) { - if (error) { - throw error; - } throw new AssertionFailureError(`Could not extract any meaningful content from the page`); } yield { @@ -1286,10 +1272,20 @@ export class PuppeteerControl extends AsyncService { if (error) { throw error; } + if (successfullyDone) { + break; + } } + await finalizationPromise; + yield { + ...snapshot, + status: navigationResponse?.status(), + statusText: navigationResponse?.statusText(), + pdfs: _.uniq(pdfUrls), screenshot, pageshot + } as PageSnapshot; } finally { this.pagePhase.set(page, 'background'); - (waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => { + Promise.allSettled([gotoPromise, waitForPromise, finalizationPromise]).finally(() => { page.off('snapshot', hdl); this.ditchPage(page); }); @@ -1329,369 +1325,6 @@ export class PuppeteerControl extends AsyncService { return r.filter(Boolean); } - async simpleScrap(parsedUrl: URL, options: ScrappingOptions = {}): Promise { - // parsedUrl.search = ''; - const url = parsedUrl.toString(); - let snapshot: PageSnapshot | undefined; - let navigationResponse: HTTPResponse | undefined; - const page = await this.getNextPage(); - this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx); - this.pagePhase.set(page, 'active'); - page.on('response', (resp) => { - this.blackHoleDetector.itWorked(); - const req = resp.request(); - if (req.frame() === page.mainFrame() && req.isNavigationRequest()) { - navigationResponse = resp; - } - if (!resp.ok()) { - return; - } - }); - page.on('request', async (req) => { - if (req.isInterceptResolutionHandled()) { - return; - }; - const reqUrlParsed = new URL(req.url()); - if (!reqUrlParsed.protocol.startsWith('http')) { - const overrides = req.continueRequestOverrides(); - - return req.continue(overrides, 0); - } - const typ = req.resourceType(); - if (typ === 'media') { - // Non-cooperative answer to block all media requests. - return req.abort('blockedbyclient'); - } - if (!options.proxyResources) { - const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ); - if (!isDocRequest) { - if (options.extraHeaders) { - const overrides = req.continueRequestOverrides(); - const continueArgs = [{ - ...overrides, - headers: { - ...req.headers(), - ...overrides?.headers, - ...options.extraHeaders, - } - }, 1] as const; - - return req.continue(continueArgs[0], continueArgs[1]); - } - const overrides = req.continueRequestOverrides(); - - return req.continue(overrides, 0); - } - } - const sideload = options.sideLoad; - - const impersonate = sideload?.impersonate[reqUrlParsed.href]; - if (impersonate) { - let body; - if (impersonate.body) { - body = await readFile(await impersonate.body.filePath); - if (req.isInterceptResolutionHandled()) { - return; - } - } - return req.respond({ - status: impersonate.status, - headers: impersonate.headers, - contentType: impersonate.contentType, - body: body ? Uint8Array.from(body) : undefined, - }, 999); - } - - const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin]; - const ctx = this.lifeCycleTrack.get(page); - if (proxy && ctx) { - return await this.asyncLocalContext.bridge(ctx, async () => { - try { - const curled = await this.curlControl.sideLoad(reqUrlParsed, { - ...options, - method: req.method(), - body: req.postData(), - extraHeaders: { - ...req.headers(), - ...options.extraHeaders, - }, - proxyUrl: proxy - }); - if (req.isInterceptResolutionHandled()) { - return; - }; - - if (curled.chain.length === 1) { - if (!curled.file) { - return req.respond({ - status: curled.status, - headers: _.omit(curled.headers, 'result'), - contentType: curled.contentType, - }, 3); - } - const body = await readFile(await curled.file.filePath); - if (req.isInterceptResolutionHandled()) { - return; - }; - return req.respond({ - status: curled.status, - headers: _.omit(curled.headers, 'result'), - contentType: curled.contentType, - body: Uint8Array.from(body), - }, 3); - } - options.sideLoad ??= curled.sideLoadOpts; - _.merge(options.sideLoad, curled.sideLoadOpts); - const firstReq = curled.chain[0]; - - return req.respond({ - status: firstReq.result!.code, - headers: _.omit(firstReq, 'result'), - }, 3); - } catch (err: any) { - this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy }); - } - if (req.isInterceptResolutionHandled()) { - return; - }; - const overrides = req.continueRequestOverrides(); - const continueArgs = [{ - ...overrides, - headers: { - ...req.headers(), - ...overrides?.headers, - ...options.extraHeaders, - } - }, 1] as const; - - return req.continue(continueArgs[0], continueArgs[1]); - }); - } - - if (req.isInterceptResolutionHandled()) { - return; - }; - const overrides = req.continueRequestOverrides(); - const continueArgs = [{ - ...overrides, - headers: { - ...req.headers(), - ...overrides?.headers, - ...options.extraHeaders, - } - }, 1] as const; - - return req.continue(continueArgs[0], continueArgs[1]); - }); - - const sn = this.snMap.get(page); - this.logger.info(`Page ${sn}: Scraping ${url}`, { url }); - if (options.locale) { - // Add headers via request interception to walk around this bug - // https://github.com/puppeteer/puppeteer/issues/10235 - // await page.setExtraHTTPHeaders({ - // 'Accept-Language': options.locale - // }); - - await page.evaluateOnNewDocument(() => { - Object.defineProperty(navigator, "language", { - get: function () { - return options.locale; - } - }); - Object.defineProperty(navigator, "languages", { - get: function () { - return [options.locale]; - } - }); - }); - } - - if (options.cookies) { - const mapped = options.cookies.map((x) => { - const draft: CookieParam = { - name: x.name, - value: encodeURIComponent(x.value), - secure: x.secure, - domain: x.domain, - path: x.path, - expires: x.expires ? Math.floor(x.expires.valueOf() / 1000) : undefined, - sameSite: x.sameSite as any, - }; - if (!draft.expires && x.maxAge) { - draft.expires = Math.floor(Date.now() / 1000) + x.maxAge; - } - if (!draft.domain) { - draft.url = parsedUrl.toString(); - } - - return draft; - }); - try { - await page.setCookie(...mapped); - } catch (err: any) { - this.logger.warn(`Page ${sn}: Failed to set cookies`, { err }); - throw new ParamValidationError({ - path: 'cookies', - message: `Failed to set cookies: ${err?.message}` - }); - } - } - if (options.overrideUserAgent) { - await page.setUserAgent(options.overrideUserAgent); - } - if (options.viewport) { - await page.setViewport(options.viewport); - } - - let nextSnapshotDeferred = Defer(); - const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` })); - this.once('crippled', crippleListener); - nextSnapshotDeferred.promise.finally(() => { - this.off('crippled', crippleListener); - }); - let finalized = false; - const hdl = (s: any) => { - if (snapshot === s) { - return; - } - snapshot = s; - if (snapshot) { - const kit = this.pageReqCtrl.get(page); - snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt; - snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt; - } - if (s?.maxElemDepth && s.maxElemDepth > 256) { - return; - } - if (s?.elemCount && s.elemCount > 10_000) { - return; - } - nextSnapshotDeferred.resolve(s); - nextSnapshotDeferred = Defer(); - this.once('crippled', crippleListener); - nextSnapshotDeferred.promise.finally(() => { - this.off('crippled', crippleListener); - }); - }; - page.on('snapshot', hdl); - page.once('abuse', (event: any) => { - this.emit('abuse', { ...event, url: parsedUrl }); - if (snapshot?.href && parsedUrl.href !== snapshot.href) { - this.emit('abuse', { ...event, url: snapshot.href }); - } - - nextSnapshotDeferred.reject( - new SecurityCompromiseError(`Abuse detected: ${event.reason}`) - ); - }); - - const timeout = options.timeoutMs || 30_000; - const goToOptions: GoToOptions = { - waitUntil: ['load', 'domcontentloaded', 'networkidle0'], - timeout, - }; - - if (options.referer) { - goToOptions.referer = options.referer; - } - - const gotoPromise = page.goto(url, goToOptions) - .catch((err) => { - if (err instanceof TimeoutError) { - this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err }); - return new AssertionFailureError({ - message: `Failed to goto ${url}: ${err}`, - cause: err, - }); - } - - this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err }); - return new AssertionFailureError({ - message: `Failed to goto ${url}: ${err}`, - cause: err, - }); - }).then(async (stuff) => { - // This check is necessary because without snapshot, the condition of the page is unclear - // Calling evaluate directly may stall the process. - if (!snapshot) { - if (stuff instanceof Error) { - finalized = true; - throw stuff; - } - } - try { - snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - } catch (err: any) { - this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err }); - if (stuff instanceof Error) { - finalized = true; - throw stuff; - } - } - if (!snapshot?.html) { - if (stuff instanceof Error) { - finalized = true; - throw stuff; - } - } - - finalized = true; - if (snapshot?.html) { - this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href }); - this.emit( - 'crawled', - { - ...snapshot, - status: navigationResponse?.status(), - statusText: navigationResponse?.statusText(), - }, - { ...options, url: parsedUrl } - ); - } - }); - - try { - while (true) { - const ckpt = [nextSnapshotDeferred.promise, gotoPromise]; - if (options.minIntervalMs) { - ckpt.push(delay(options.minIntervalMs)); - } - let error; - await Promise.race(ckpt).catch((err) => error = err); - if (finalized && !error) { - if (!snapshot) { - if (error) { - throw error; - } - throw new AssertionFailureError(`Could not extract any meaningful content from the page`); - } - return { - ...snapshot, - status: navigationResponse?.status(), - statusText: navigationResponse?.statusText(), - } as PageSnapshot; - } - - if (snapshot?.lastMutationIdle) { - return { - ...snapshot, - status: navigationResponse?.status(), - statusText: navigationResponse?.statusText(), - } as PageSnapshot; - } - if (error) { - throw error; - } - } - } finally { - this.pagePhase.set(page, 'background'); - page.off('snapshot', hdl); - this.ditchPage(page); - nextSnapshotDeferred.resolve(); - } - } - } const puppeteerControl = container.resolve(PuppeteerControl); From 33ca16405e06ce439192a65472c7f67b2f4a6f4f Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Thu, 17 Apr 2025 22:38:07 +0800 Subject: [PATCH 4/9] fix --- src/services/puppeteer.ts | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index 5621a1a..0e8d8a0 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -1096,7 +1096,7 @@ export class PuppeteerControl extends AsyncService { nextSnapshotDeferred.promise.finally(() => { this.off('crippled', crippleListener); }); - let successfullyDone = false; + let successfullyDone; const hdl = (s: any) => { if (snapshot === s) { return; @@ -1145,9 +1145,12 @@ export class PuppeteerControl extends AsyncService { let waitForPromise: Promise | undefined; let finalizationPromise: Promise | undefined; const doFinalization = async () => { - if (!waitForPromise) { - successfullyDone = true; + if (waitForPromise) { + // SuccessfullyDone is meant for the finish of the page. + // It doesn't matter if you are expecting something and it didn't show up. + await waitForPromise.catch(() => void 0); } + successfullyDone ??= true; try { const pSubFrameSnapshots = this.snapshotChildFrames(page); snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; @@ -1222,13 +1225,14 @@ export class PuppeteerControl extends AsyncService { page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout })) .then(() => { successfullyDone = true; - finalizationPromise = doFinalization(); }) .catch((err) => { + waitForPromise = undefined; this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err }); }); return p as any; }); + } try { From 5a81177c56bb195ab9816d092d28a6690c0d7e50 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Fri, 18 Apr 2025 15:32:43 +0800 Subject: [PATCH 5/9] saas: new tier policy --- src/api/crawler.ts | 100 +++++++++++++++++++++----------- src/dto/jina-embeddings-auth.ts | 25 ++++++++ src/services/errors.ts | 2 +- 3 files changed, 93 insertions(+), 34 deletions(-) diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 2788755..d3b811d 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -240,6 +240,7 @@ export class CrawlerHost extends RPCHost { const uid = await auth.solveUID(); let chargeAmount = 0; const crawlerOptions = ctx.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed; + const tierPolicy = await this.saasAssertTierPolicy(crawlerOptions, auth); // Use koa ctx.URL, a standard URL object to avoid node.js framework prop naming confusion const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(`${ctx.URL.pathname}${ctx.URL.search}`), crawlerOptions); @@ -298,15 +299,13 @@ export class CrawlerHost extends RPCHost { if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { return; } - if (chargeAmount) { - apiRoll._ref?.set({ - chargeAmount, - }, { merge: true }).catch((err) => this.logger.warn(`Failed to log charge amount in apiRoll`, { err })); - } + apiRoll.chargeAmount = chargeAmount; }); } if (!uid) { + // Enforce no proxy is allocated for anonymous users due to abuse. + crawlerOptions.proxy = 'none'; const blockade = (await DomainBlockade.fromFirestoreQuery( DomainBlockade.COLLECTION .where('domain', '==', targetUrl.hostname.toLowerCase()) @@ -338,10 +337,7 @@ export class CrawlerHost extends RPCHost { } const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts); - chargeAmount = this.assignChargeAmount(formatted, crawlerOptions); - if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { - throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); - } + chargeAmount = this.assignChargeAmount(formatted, tierPolicy); sseStream.write({ event: 'data', data: formatted, @@ -379,11 +375,7 @@ export class CrawlerHost extends RPCHost { } const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts); - chargeAmount = this.assignChargeAmount(formatted, crawlerOptions); - - if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { - throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); - } + chargeAmount = this.assignChargeAmount(formatted, tierPolicy); if (scrapped?.pdfs?.length && !chargeAmount) { continue; @@ -405,10 +397,7 @@ export class CrawlerHost extends RPCHost { } const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts); - chargeAmount = this.assignChargeAmount(formatted, crawlerOptions); - if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { - throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); - } + chargeAmount = this.assignChargeAmount(formatted, tierPolicy); return formatted; } @@ -434,10 +423,7 @@ export class CrawlerHost extends RPCHost { } const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts); - chargeAmount = this.assignChargeAmount(formatted, crawlerOptions); - if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { - throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); - } + chargeAmount = this.assignChargeAmount(formatted, tierPolicy); if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { return assignTransferProtocolMeta(`${formatted.textRepresentation}`, @@ -465,10 +451,7 @@ export class CrawlerHost extends RPCHost { throw new AssertionFailureError(`No content available for URL ${targetUrl}`); } const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts); - chargeAmount = this.assignChargeAmount(formatted, crawlerOptions); - if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { - throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); - } + chargeAmount = this.assignChargeAmount(formatted, tierPolicy); if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { @@ -840,7 +823,8 @@ export class CrawlerHost extends RPCHost { yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts); } let fallbackProxyIsUsed = false; - if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) && + if ( + ((!crawlOpts?.allocProxy || crawlOpts.allocProxy !== 'none') && !crawlOpts?.proxyUrl) && (analyzed.tokens < 42 || sideLoaded.status !== 200) ) { const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts); @@ -911,18 +895,14 @@ export class CrawlerHost extends RPCHost { } } - assignChargeAmount(formatted: FormattedPage, crawlerOptions?: CrawlerOptions) { + assignChargeAmount(formatted: FormattedPage, saasTierPolicy?: Parameters[0]) { if (!formatted) { return 0; } let amount = 0; if (formatted.content) { - const x1 = estimateToken(formatted.content); - if (crawlerOptions?.respondWith?.toLowerCase().includes('lm')) { - amount += x1 * 2; - } - amount += x1; + amount = estimateToken(formatted.content); } else if (formatted.description) { amount += estimateToken(formatted.description); } @@ -939,6 +919,10 @@ export class CrawlerHost extends RPCHost { amount += 765; } + if (saasTierPolicy) { + amount = this.saasApplyTierPolicy(saasTierPolicy, amount); + } + Object.assign(formatted, { usage: { tokens: amount } }); assignMeta(formatted, { usage: { tokens: amount } }); @@ -1312,4 +1296,54 @@ export class CrawlerHost extends RPCHost { return false; } + + async saasAssertTierPolicy(opts: CrawlerOptions, auth: JinaEmbeddingsAuthDTO) { + let chargeScalar = 1; + let minimalCharge = 0; + + if (opts.injectPageScript || opts.injectFrameScript) { + await auth.assertTier(0, 'Script injection'); + minimalCharge = 4_000; + } + + if (opts.withGeneratedAlt) { + await auth.assertTier(0, 'Alt text generation'); + minimalCharge = 4_000; + } + + if (opts.withIframe) { + await auth.assertTier(0, 'Iframe'); + } + + if (opts.engine === ENGINE_TYPE.CF_BROWSER_RENDERING) { + await auth.assertTier(0, 'Cloudflare browser rendering'); + minimalCharge = 4_000; + } + + if (opts.respondWith.includes('lm') || opts.engine?.includes('lm')) { + await auth.assertTier(0, 'Language model'); + minimalCharge = 4_000; + chargeScalar = 3; + } + + if (opts.proxy && opts.proxy !== 'none') { + await auth.assertTier(['auto', 'any'].includes(opts.proxy) ? 0 : 2, 'Proxy allocation'); + chargeScalar = 5; + } + + return { + budget: opts.tokenBudget || 0, + chargeScalar, + minimalCharge, + }; + } + + saasApplyTierPolicy(policy: Awaited>, chargeAmount: number) { + const effectiveChargeAmount = policy.chargeScalar * Math.max(chargeAmount, policy.minimalCharge); + if (policy.budget && policy.budget < effectiveChargeAmount) { + throw new BudgetExceededError(`Token budget (${policy.budget}) exceeded, intended charge amount ${effectiveChargeAmount}`); + } + + return effectiveChargeAmount; + } } diff --git a/src/dto/jina-embeddings-auth.ts b/src/dto/jina-embeddings-auth.ts index ffd6625..095a5a9 100644 --- a/src/dto/jina-embeddings-auth.ts +++ b/src/dto/jina-embeddings-auth.ts @@ -17,6 +17,7 @@ import { AsyncLocalContext } from '../services/async-context'; import envConfig from '../shared/services/secrets'; import { JinaEmbeddingsDashboardHTTP } from '../shared/3rd-party/jina-embeddings'; import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; +import { TierFeatureConstraintError } from '../services/errors'; const authDtoLogger = logger.child({ service: 'JinaAuthDTO' }); @@ -236,6 +237,30 @@ export class JinaEmbeddingsAuthDTO extends AutoCastable { return this.user!; } + async assertTier(n: number, feature?: string) { + let user; + try { + user = await this.assertUser(); + } catch (err) { + if (err instanceof AuthenticationRequiredError) { + throw new AuthenticationRequiredError({ + message: `Authentication is required to use this feature${feature ? ` (${feature})` : ''}. Please provide a valid API key.` + }); + } + + throw err; + } + + const tier = parseInt(user.metadata?.speed_level); + if (isNaN(tier) || tier < n) { + throw new TierFeatureConstraintError({ + message: `Your current plan does not support this feature${feature ? ` (${feature})` : ''}. Please upgrade your plan.` + }); + } + + return true; + } + getRateLimits(...tags: string[]) { const descs = tags.map((x) => this.user?.customRateLimits?.[x] || []).flat().filter((x) => x.isEffective()); diff --git a/src/services/errors.ts b/src/services/errors.ts index 51344a2..8941ff7 100644 --- a/src/services/errors.ts +++ b/src/services/errors.ts @@ -27,7 +27,7 @@ export class EmailUnverifiedError extends ApplicationError { } export class InsufficientCreditsError extends ApplicationError { } @StatusCode(40202) -export class FreeFeatureLimitError extends ApplicationError { } +export class TierFeatureConstraintError extends ApplicationError { } @StatusCode(40203) export class InsufficientBalanceError extends ApplicationError { } From 5a7f2a4cb818ce548f6a99013df77fa5332f11af Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Fri, 18 Apr 2025 16:01:06 +0800 Subject: [PATCH 6/9] saas: minor tier policy tweak --- src/api/crawler.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/api/crawler.ts b/src/api/crawler.ts index d3b811d..6b6b8cf 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -1301,13 +1301,13 @@ export class CrawlerHost extends RPCHost { let chargeScalar = 1; let minimalCharge = 0; - if (opts.injectPageScript || opts.injectFrameScript) { - await auth.assertTier(0, 'Script injection'); - minimalCharge = 4_000; - } - if (opts.withGeneratedAlt) { await auth.assertTier(0, 'Alt text generation'); + minimalCharge = 765; + } + + if (opts.injectPageScript || opts.injectFrameScript) { + await auth.assertTier(0, 'Script injection'); minimalCharge = 4_000; } From 65db6ec9d94b8972d750ef1f506f65b47992097e Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Fri, 18 Apr 2025 17:03:36 +0800 Subject: [PATCH 7/9] saas: audition --- src/stand-alone/serp.ts | 2 ++ thinapps-shared | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/stand-alone/serp.ts b/src/stand-alone/serp.ts index 0c039d7..76c91fc 100644 --- a/src/stand-alone/serp.ts +++ b/src/stand-alone/serp.ts @@ -19,6 +19,7 @@ import { AsyncLocalContext } from '../services/async-context'; import finalizer, { Finalizer } from '../services/finalizer'; import { SerpHost } from '../api/serp'; import koaCompress from 'koa-compress'; +import { getAuditionMiddleware } from '../shared/utils/audition'; @singleton() export class SERPStandAloneServer extends KoaServer { @@ -107,6 +108,7 @@ export class SERPStandAloneServer extends KoaServer { } registerRoutes(): void { + this.koaApp.use(getAuditionMiddleware()); this.koaApp.use(koaCompress({ filter(type) { if (type.startsWith('text/')) { diff --git a/thinapps-shared b/thinapps-shared index f89255c..6df12b6 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit f89255cd6546641f72eefba140a4aef96a0e03fc +Subproject commit 6df12b642f65df2a6d8b4b745e53f74eda93c10a From b60cde4d0d125b5e844372411c194af5e320436f Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Fri, 18 Apr 2025 17:26:36 +0800 Subject: [PATCH 8/9] bump: deps --- thinapps-shared | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinapps-shared b/thinapps-shared index 6df12b6..6549b59 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 6df12b642f65df2a6d8b4b745e53f74eda93c10a +Subproject commit 6549b593273844ab5fbf3a2b18ce1f5c6f0f1c5e From 0cf8371d1cf7ecd84199fb7ba3321a3aa32f3620 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Fri, 18 Apr 2025 18:22:40 +0800 Subject: [PATCH 9/9] bump: deps --- thinapps-shared | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinapps-shared b/thinapps-shared index 6549b59..08ded7b 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 6549b593273844ab5fbf3a2b18ce1f5c6f0f1c5e +Subproject commit 08ded7b8eceee7e931d52e77c87103f28c3ba9e8