mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-14 20:25:59 +08:00
feat: warn on non 200 response
This commit is contained in:
parent
6147a28609
commit
6e05ea2243
@ -4,7 +4,7 @@ import { container, singleton } from 'tsyringe';
|
||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
|
||||
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
|
||||
import type { Browser, CookieParam, GoToOptions, HTTPResponse, Page } from 'puppeteer';
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
|
||||
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
||||
@ -42,11 +42,13 @@ export interface ReadabilityParsed {
|
||||
|
||||
export interface PageSnapshot {
|
||||
title: string;
|
||||
description: string;
|
||||
description?: string;
|
||||
href: string;
|
||||
rebase?: string;
|
||||
html: string;
|
||||
text: string;
|
||||
status?: number;
|
||||
statusText?: string;
|
||||
parsed?: Partial<ReadabilityParsed> | null;
|
||||
screenshot?: Buffer;
|
||||
pageshot?: Buffer;
|
||||
@ -287,7 +289,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
await this.serviceReady();
|
||||
const dedicatedContext = await this.browser.createBrowserContext();
|
||||
const sn = this._sn++;
|
||||
let page
|
||||
let page;
|
||||
try {
|
||||
page = await dedicatedContext.newPage();
|
||||
} catch (err: any) {
|
||||
@ -471,8 +473,12 @@ document.addEventListener('load', handlePageLoad);
|
||||
let screenshot: Buffer | undefined;
|
||||
let pageshot: Buffer | undefined;
|
||||
const pdfUrls: string[] = [];
|
||||
let navigationResponse: HTTPResponse | undefined;
|
||||
const page = await this.getNextPage();
|
||||
page.on('response', (resp) => {
|
||||
if (resp.request().isNavigationRequest()) {
|
||||
navigationResponse = resp;
|
||||
}
|
||||
if (!resp.ok()) {
|
||||
return;
|
||||
}
|
||||
@ -638,7 +644,12 @@ document.addEventListener('load', handlePageLoad);
|
||||
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
||||
this.emit(
|
||||
'crawled',
|
||||
{ ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot, },
|
||||
{
|
||||
...snapshot,
|
||||
status: navigationResponse?.status(),
|
||||
statusText: navigationResponse?.statusText(),
|
||||
pdfs: _.uniq(pdfUrls), screenshot, pageshot,
|
||||
},
|
||||
{ ...options, url: parsedUrl }
|
||||
);
|
||||
}
|
||||
@ -691,7 +702,12 @@ document.addEventListener('load', handlePageLoad);
|
||||
}
|
||||
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
||||
}
|
||||
yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
|
||||
yield {
|
||||
...snapshot,
|
||||
status: navigationResponse?.status(),
|
||||
statusText: navigationResponse?.statusText(),
|
||||
pdfs: _.uniq(pdfUrls), screenshot, pageshot
|
||||
} as PageSnapshot;
|
||||
break;
|
||||
}
|
||||
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||
@ -700,7 +716,12 @@ document.addEventListener('load', handlePageLoad);
|
||||
lastHTML = snapshot.html;
|
||||
}
|
||||
if (snapshot || screenshot) {
|
||||
yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
|
||||
yield {
|
||||
...snapshot,
|
||||
status: navigationResponse?.status(),
|
||||
statusText: navigationResponse?.statusText(),
|
||||
pdfs: _.uniq(pdfUrls), screenshot, pageshot
|
||||
} as PageSnapshot;
|
||||
}
|
||||
if (error) {
|
||||
throw error;
|
||||
|
@ -12,6 +12,7 @@ import { AltTextService } from './alt-text';
|
||||
import { PDFExtractor } from './pdf-extract';
|
||||
import { cleanAttribute } from '../utils/misc';
|
||||
import _ from 'lodash';
|
||||
import { STATUS_CODES } from 'http';
|
||||
|
||||
|
||||
export interface FormattedPage {
|
||||
@ -28,6 +29,7 @@ export interface FormattedPage {
|
||||
pageshot?: Buffer;
|
||||
links?: { [k: string]: string; };
|
||||
images?: { [k: string]: string; };
|
||||
warning?: string;
|
||||
usage?: {
|
||||
total_tokens?: number;
|
||||
totalTokens?: number;
|
||||
@ -323,6 +325,15 @@ export class SnapshotFormatter extends AsyncService {
|
||||
[Symbol.dispose]: () => { },
|
||||
};
|
||||
|
||||
// Attach a warning when the target's navigation response was not a 2xx success.
// A missing (undefined or 0) status means no response was recorded, so nothing is reported.
if (snapshot.status) {
    const code = snapshot.status;
    // Offset from 200: values 0..99 correspond to the 2xx success range.
    const n = code - 200;
    // The lower bound has to be 0; comparing against 100 on both sides is true for
    // every number and would flag a plain 200 OK as an error as well.
    if (n < 0 || n >= 100) {
        // Prefer the reason phrase sent by the server; fall back to Node's built-in table.
        const text = snapshot.statusText || STATUS_CODES[code];
        formatted.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
    }
}
|
||||
|
||||
if (this.threadLocal.get('withImagesSummary')) {
|
||||
formatted.images =
|
||||
_(imageSummary)
|
||||
@ -369,6 +380,10 @@ export class SnapshotFormatter extends AsyncService {
|
||||
suffixMixins.push(linkSummaryChunks.join('\n'));
|
||||
}
|
||||
|
||||
if (this.warning) {
|
||||
mixins.push(`Warning: ${this.warning}`);
|
||||
}
|
||||
|
||||
return `Title: ${this.title}
|
||||
|
||||
URL Source: ${this.url}
|
||||
@ -418,6 +433,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
||||
mixin.links = _.invert(inferred.links || {});
|
||||
}
|
||||
// Attach a warning to the mixin when the target's navigation response was not a 2xx success.
// A missing (undefined or 0) status means no response was recorded, so nothing is reported.
if (snapshot.status) {
    const code = snapshot.status;
    // Offset from 200: values 0..99 correspond to the 2xx success range.
    const n = code - 200;
    // The lower bound has to be 0; comparing against 100 on both sides is true for
    // every number and would flag a plain 200 OK as an error as well.
    if (n < 0 || n >= 100) {
        // Prefer the reason phrase sent by the server; fall back to Node's built-in table.
        const text = snapshot.statusText || STATUS_CODES[code];
        mixin.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
    }
}
|
||||
|
||||
return mixin;
|
||||
}
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit d287049d46781bff2032b02a2bd4322239145c95
|
||||
Subproject commit 9258853d626758cb14dce55ae4aeaaca9fc4cfd2
|
Loading…
x
Reference in New Issue
Block a user