feat: warn on non 200 response

This commit is contained in:
Yanlong Wang 2024-09-12 19:05:06 +08:00
parent 6147a28609
commit 6e05ea2243
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 51 additions and 7 deletions

View File

@ -4,7 +4,7 @@ import { container, singleton } from 'tsyringe';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
import { Logger } from '../shared/services/logger';
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
import type { Browser, CookieParam, GoToOptions, HTTPResponse, Page } from 'puppeteer';
import puppeteer from 'puppeteer-extra';
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
@ -42,11 +42,13 @@ export interface ReadabilityParsed {
export interface PageSnapshot {
title: string;
description: string;
description?: string;
href: string;
rebase?: string;
html: string;
text: string;
status?: number;
statusText?: string;
parsed?: Partial<ReadabilityParsed> | null;
screenshot?: Buffer;
pageshot?: Buffer;
@ -287,7 +289,7 @@ export class PuppeteerControl extends AsyncService {
await this.serviceReady();
const dedicatedContext = await this.browser.createBrowserContext();
const sn = this._sn++;
let page
let page;
try {
page = await dedicatedContext.newPage();
} catch (err: any) {
@ -471,8 +473,12 @@ document.addEventListener('load', handlePageLoad);
let screenshot: Buffer | undefined;
let pageshot: Buffer | undefined;
const pdfUrls: string[] = [];
let navigationResponse: HTTPResponse | undefined;
const page = await this.getNextPage();
page.on('response', (resp) => {
if (resp.request().isNavigationRequest()) {
navigationResponse = resp;
}
if (!resp.ok()) {
return;
}
@ -638,7 +644,12 @@ document.addEventListener('load', handlePageLoad);
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
this.emit(
'crawled',
{ ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot, },
{
...snapshot,
status: navigationResponse?.status(),
statusText: navigationResponse?.statusText(),
pdfs: _.uniq(pdfUrls), screenshot, pageshot,
},
{ ...options, url: parsedUrl }
);
}
@ -691,7 +702,12 @@ document.addEventListener('load', handlePageLoad);
}
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
}
yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
yield {
...snapshot,
status: navigationResponse?.status(),
statusText: navigationResponse?.statusText(),
pdfs: _.uniq(pdfUrls), screenshot, pageshot
} as PageSnapshot;
break;
}
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
@ -700,7 +716,12 @@ document.addEventListener('load', handlePageLoad);
lastHTML = snapshot.html;
}
if (snapshot || screenshot) {
yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
yield {
...snapshot,
status: navigationResponse?.status(),
statusText: navigationResponse?.statusText(),
pdfs: _.uniq(pdfUrls), screenshot, pageshot
} as PageSnapshot;
}
if (error) {
throw error;

View File

@ -12,6 +12,7 @@ import { AltTextService } from './alt-text';
import { PDFExtractor } from './pdf-extract';
import { cleanAttribute } from '../utils/misc';
import _ from 'lodash';
import { STATUS_CODES } from 'http';
export interface FormattedPage {
@ -28,6 +29,7 @@ export interface FormattedPage {
pageshot?: Buffer;
links?: { [k: string]: string; };
images?: { [k: string]: string; };
warning?: string;
usage?: {
total_tokens?: number;
totalTokens?: number;
@ -323,6 +325,15 @@ export class SnapshotFormatter extends AsyncService {
[Symbol.dispose]: () => { },
};
// Attach a warning when the recorded response status is outside the 2xx range.
// A missing (or zero) status is skipped by the truthiness check.
if (snapshot.status) {
    const code = snapshot.status;
    // Offset from 200: values 0..99 correspond to the 2xx (successful) status codes.
    const n = code - 200;
    // The previous guard (`n < 100 || n >= 100`) matched every number, so even a
    // plain 200 response was reported as an error.
    if (n < 0 || n >= 100) {
        // Prefer the status text carried by the snapshot; otherwise fall back to
        // Node's standard reason phrase for the code (may be undefined for unknown codes).
        const text = snapshot.statusText || STATUS_CODES[code];
        formatted.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
    }
}
if (this.threadLocal.get('withImagesSummary')) {
formatted.images =
_(imageSummary)
@ -369,6 +380,10 @@ export class SnapshotFormatter extends AsyncService {
suffixMixins.push(linkSummaryChunks.join('\n'));
}
// When a warning is set on this object, append it to `mixins` as a "Warning: ..." entry.
if (this.warning) {
mixins.push(`Warning: ${this.warning}`);
}
return `Title: ${this.title}
URL Source: ${this.url}
@ -418,6 +433,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
mixin.links = _.invert(inferred.links || {});
}
// Attach a warning to the mixin when the recorded response status is outside the 2xx range.
// A missing (or zero) status is skipped by the truthiness check.
if (snapshot.status) {
    const code = snapshot.status;
    // Offset from 200: values 0..99 correspond to the 2xx (successful) status codes.
    const n = code - 200;
    // The previous guard (`n < 100 || n >= 100`) matched every number, so even a
    // plain 200 response was reported as an error.
    if (n < 0 || n >= 100) {
        // Prefer the status text carried by the snapshot; otherwise fall back to
        // Node's standard reason phrase for the code (may be undefined for unknown codes).
        const text = snapshot.statusText || STATUS_CODES[code];
        mixin.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
    }
}
return mixin;
}

@ -1 +1 @@
Subproject commit d287049d46781bff2032b02a2bd4322239145c95
Subproject commit 9258853d626758cb14dce55ae4aeaaca9fc4cfd2