From 6e05ea22431188505bb2c93c23cffb1a769b33d6 Mon Sep 17 00:00:00 2001
From: Yanlong Wang
Date: Thu, 12 Sep 2024 19:05:06 +0800
Subject: [PATCH] feat: warn on non 200 response

---
Review notes (text between the "---" line and the first "diff --git" line is
ignored by `git am`):

- The warning condition is `n < 0 || n >= 100` with `n = status - 200`, so a
  warning is produced only for statuses outside 200-299. The earlier form
  `n < 100 || n >= 100` holds for every number and flagged 200 OK as well.
- Only navigation responses that belong to the page's main frame are recorded
  as `navigationResponse`; `isNavigationRequest()` alone is also true for
  sub-frame navigations, whose status would otherwise overwrite the status of
  the target URL.
- The status check is duplicated in two places of snapshot-formatter.ts; keep
  both copies in sync when changing it.
- Leading whitespace and blank context lines in the hunks below were
  reconstructed; apply with `git apply --ignore-whitespace` if context
  matching fails.

 backend/functions/src/services/puppeteer.ts   | 33 +++++++++++++++----
 .../src/services/snapshot-formatter.ts        | 23 +++++++++++++
 thinapps-shared                               |  2 +-
 3 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts
index 4bf018e..960ddf7 100644
--- a/backend/functions/src/services/puppeteer.ts
+++ b/backend/functions/src/services/puppeteer.ts
@@ -4,7 +4,7 @@ import { container, singleton } from 'tsyringe';
 import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
 import { Logger } from '../shared/services/logger';
 
-import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
+import type { Browser, CookieParam, GoToOptions, HTTPResponse, Page } from 'puppeteer';
 import puppeteer from 'puppeteer-extra';
 
 import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
@@ -42,11 +42,13 @@ export interface ReadabilityParsed {
 
 export interface PageSnapshot {
     title: string;
-    description: string;
+    description?: string;
     href: string;
     rebase?: string;
     html: string;
     text: string;
+    status?: number;
+    statusText?: string;
     parsed?: Partial | null;
     screenshot?: Buffer;
     pageshot?: Buffer;
@@ -287,7 +289,7 @@ export class PuppeteerControl extends AsyncService {
         await this.serviceReady();
         const dedicatedContext = await this.browser.createBrowserContext();
         const sn = this._sn++;
-        let page
+        let page;
         try {
             page = await dedicatedContext.newPage();
         } catch (err: any) {
@@ -471,8 +473,12 @@ document.addEventListener('load', handlePageLoad);
         let screenshot: Buffer | undefined;
         let pageshot: Buffer | undefined;
         const pdfUrls: string[] = [];
+        let navigationResponse: HTTPResponse | undefined;
         const page = await this.getNextPage();
         page.on('response', (resp) => {
+            if (resp.request().isNavigationRequest() && resp.frame() === page.mainFrame()) {
+                navigationResponse = resp;
+            }
             if (!resp.ok()) {
                 return;
             }
@@ -638,7 +644,12 @@ document.addEventListener('load', handlePageLoad);
                 this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
                 this.emit(
                     'crawled',
-                    { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot, },
+                    {
+                        ...snapshot,
+                        status: navigationResponse?.status(),
+                        statusText: navigationResponse?.statusText(),
+                        pdfs: _.uniq(pdfUrls), screenshot, pageshot,
+                    },
                     { ...options, url: parsedUrl }
                 );
             }
@@ -691,7 +702,12 @@ document.addEventListener('load', handlePageLoad);
                         }
                         throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
                     }
-                    yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
+                    yield {
+                        ...snapshot,
+                        status: navigationResponse?.status(),
+                        statusText: navigationResponse?.statusText(),
+                        pdfs: _.uniq(pdfUrls), screenshot, pageshot
+                    } as PageSnapshot;
                     break;
                 }
                 if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
@@ -700,7 +716,12 @@ document.addEventListener('load', handlePageLoad);
                     lastHTML = snapshot.html;
                 }
                 if (snapshot || screenshot) {
-                    yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
+                    yield {
+                        ...snapshot,
+                        status: navigationResponse?.status(),
+                        statusText: navigationResponse?.statusText(),
+                        pdfs: _.uniq(pdfUrls), screenshot, pageshot
+                    } as PageSnapshot;
                 }
                 if (error) {
                     throw error;
diff --git a/backend/functions/src/services/snapshot-formatter.ts b/backend/functions/src/services/snapshot-formatter.ts
index b659451..949bfc0 100644
--- a/backend/functions/src/services/snapshot-formatter.ts
+++ b/backend/functions/src/services/snapshot-formatter.ts
@@ -12,6 +12,7 @@ import { AltTextService } from './alt-text';
 import { PDFExtractor } from './pdf-extract';
 import { cleanAttribute } from '../utils/misc';
 import _ from 'lodash';
+import { STATUS_CODES } from 'http';
 
 
 export interface FormattedPage {
@@ -28,6 +29,7 @@ export interface FormattedPage {
     pageshot?: Buffer;
     links?: { [k: string]: string; };
     images?: { [k: string]: string; };
+    warning?: string;
     usage?: {
         total_tokens?: number;
         totalTokens?: number;
@@ -323,6 +325,15 @@ export class SnapshotFormatter extends AsyncService {
             [Symbol.dispose]: () => { },
         };
 
+        if (snapshot.status) {
+            const code = snapshot.status;
+            const n = code - 200;
+            if (n < 0 || n >= 100) {
+                const text = snapshot.statusText || STATUS_CODES[code];
+                formatted.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
+            }
+        }
+
         if (this.threadLocal.get('withImagesSummary')) {
             formatted.images =
                 _(imageSummary)
@@ -369,6 +380,10 @@ export class SnapshotFormatter extends AsyncService {
                     suffixMixins.push(linkSummaryChunks.join('\n'));
                 }
 
+                if (this.warning) {
+                    mixins.push(`Warning: ${this.warning}`);
+                }
+
                 return `Title: ${this.title}
 
 URL Source: ${this.url}
@@ -418,6 +433,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
             inferred ??= this.jsdomControl.inferSnapshot(snapshot);
             mixin.links = _.invert(inferred.links || {});
         }
+        if (snapshot.status) {
+            const code = snapshot.status;
+            const n = code - 200;
+            if (n < 0 || n >= 100) {
+                const text = snapshot.statusText || STATUS_CODES[code];
+                mixin.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
+            }
+        }
 
         return mixin;
     }
diff --git a/thinapps-shared b/thinapps-shared
index d287049..9258853 160000
--- a/thinapps-shared
+++ b/thinapps-shared
@@ -1 +1 @@
-Subproject commit d287049d46781bff2032b02a2bd4322239145c95
+Subproject commit 9258853d626758cb14dce55ae4aeaaca9fc4cfd2