mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 03:46:05 +08:00
feat: warn on non 200 response
This commit is contained in:
parent
6147a28609
commit
6e05ea2243
@ -4,7 +4,7 @@ import { container, singleton } from 'tsyringe';
|
|||||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
|
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
|
||||||
import { Logger } from '../shared/services/logger';
|
import { Logger } from '../shared/services/logger';
|
||||||
|
|
||||||
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
|
import type { Browser, CookieParam, GoToOptions, HTTPResponse, Page } from 'puppeteer';
|
||||||
import puppeteer from 'puppeteer-extra';
|
import puppeteer from 'puppeteer-extra';
|
||||||
|
|
||||||
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
||||||
@ -42,11 +42,13 @@ export interface ReadabilityParsed {
|
|||||||
|
|
||||||
export interface PageSnapshot {
|
export interface PageSnapshot {
|
||||||
title: string;
|
title: string;
|
||||||
description: string;
|
description?: string;
|
||||||
href: string;
|
href: string;
|
||||||
rebase?: string;
|
rebase?: string;
|
||||||
html: string;
|
html: string;
|
||||||
text: string;
|
text: string;
|
||||||
|
status?: number;
|
||||||
|
statusText?: string;
|
||||||
parsed?: Partial<ReadabilityParsed> | null;
|
parsed?: Partial<ReadabilityParsed> | null;
|
||||||
screenshot?: Buffer;
|
screenshot?: Buffer;
|
||||||
pageshot?: Buffer;
|
pageshot?: Buffer;
|
||||||
@ -287,7 +289,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
await this.serviceReady();
|
await this.serviceReady();
|
||||||
const dedicatedContext = await this.browser.createBrowserContext();
|
const dedicatedContext = await this.browser.createBrowserContext();
|
||||||
const sn = this._sn++;
|
const sn = this._sn++;
|
||||||
let page
|
let page;
|
||||||
try {
|
try {
|
||||||
page = await dedicatedContext.newPage();
|
page = await dedicatedContext.newPage();
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
@ -471,8 +473,12 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
let screenshot: Buffer | undefined;
|
let screenshot: Buffer | undefined;
|
||||||
let pageshot: Buffer | undefined;
|
let pageshot: Buffer | undefined;
|
||||||
const pdfUrls: string[] = [];
|
const pdfUrls: string[] = [];
|
||||||
|
let navigationResponse: HTTPResponse | undefined;
|
||||||
const page = await this.getNextPage();
|
const page = await this.getNextPage();
|
||||||
page.on('response', (resp) => {
|
page.on('response', (resp) => {
|
||||||
|
if (resp.request().isNavigationRequest()) {
|
||||||
|
navigationResponse = resp;
|
||||||
|
}
|
||||||
if (!resp.ok()) {
|
if (!resp.ok()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -638,7 +644,12 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
||||||
this.emit(
|
this.emit(
|
||||||
'crawled',
|
'crawled',
|
||||||
{ ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot, },
|
{
|
||||||
|
...snapshot,
|
||||||
|
status: navigationResponse?.status(),
|
||||||
|
statusText: navigationResponse?.statusText(),
|
||||||
|
pdfs: _.uniq(pdfUrls), screenshot, pageshot,
|
||||||
|
},
|
||||||
{ ...options, url: parsedUrl }
|
{ ...options, url: parsedUrl }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -691,7 +702,12 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
}
|
}
|
||||||
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
||||||
}
|
}
|
||||||
yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
|
yield {
|
||||||
|
...snapshot,
|
||||||
|
status: navigationResponse?.status(),
|
||||||
|
statusText: navigationResponse?.statusText(),
|
||||||
|
pdfs: _.uniq(pdfUrls), screenshot, pageshot
|
||||||
|
} as PageSnapshot;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||||
@ -700,7 +716,12 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
lastHTML = snapshot.html;
|
lastHTML = snapshot.html;
|
||||||
}
|
}
|
||||||
if (snapshot || screenshot) {
|
if (snapshot || screenshot) {
|
||||||
yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
|
yield {
|
||||||
|
...snapshot,
|
||||||
|
status: navigationResponse?.status(),
|
||||||
|
statusText: navigationResponse?.statusText(),
|
||||||
|
pdfs: _.uniq(pdfUrls), screenshot, pageshot
|
||||||
|
} as PageSnapshot;
|
||||||
}
|
}
|
||||||
if (error) {
|
if (error) {
|
||||||
throw error;
|
throw error;
|
||||||
|
@ -12,6 +12,7 @@ import { AltTextService } from './alt-text';
|
|||||||
import { PDFExtractor } from './pdf-extract';
|
import { PDFExtractor } from './pdf-extract';
|
||||||
import { cleanAttribute } from '../utils/misc';
|
import { cleanAttribute } from '../utils/misc';
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
|
import { STATUS_CODES } from 'http';
|
||||||
|
|
||||||
|
|
||||||
export interface FormattedPage {
|
export interface FormattedPage {
|
||||||
@ -28,6 +29,7 @@ export interface FormattedPage {
|
|||||||
pageshot?: Buffer;
|
pageshot?: Buffer;
|
||||||
links?: { [k: string]: string; };
|
links?: { [k: string]: string; };
|
||||||
images?: { [k: string]: string; };
|
images?: { [k: string]: string; };
|
||||||
|
warning?: string;
|
||||||
usage?: {
|
usage?: {
|
||||||
total_tokens?: number;
|
total_tokens?: number;
|
||||||
totalTokens?: number;
|
totalTokens?: number;
|
||||||
@ -323,6 +325,15 @@ export class SnapshotFormatter extends AsyncService {
|
|||||||
[Symbol.dispose]: () => { },
|
[Symbol.dispose]: () => { },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Attach a warning to the formatted result when the target page's recorded
// HTTP status is not a success. A missing (or 0) status is skipped.
if (snapshot.status) {
    const code = snapshot.status;
    // Only codes outside 200-399 are reported. The earlier check
    // (`n < 100 || n >= 100`, with n = code - 200) was true for every number,
    // so even a plain 200 OK was flagged as an error.
    // NOTE(review): 3xx is treated as non-error here (e.g. 304 Not Modified);
    // tighten to `code >= 300` if redirects should also be reported.
    if (code < 200 || code >= 400) {
        // Prefer the server-supplied reason phrase; fall back to Node's
        // standard phrase for the code when it is empty.
        const text = snapshot.statusText || STATUS_CODES[code];
        formatted.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
    }
}
|
||||||
|
|
||||||
if (this.threadLocal.get('withImagesSummary')) {
|
if (this.threadLocal.get('withImagesSummary')) {
|
||||||
formatted.images =
|
formatted.images =
|
||||||
_(imageSummary)
|
_(imageSummary)
|
||||||
@ -369,6 +380,10 @@ export class SnapshotFormatter extends AsyncService {
|
|||||||
suffixMixins.push(linkSummaryChunks.join('\n'));
|
suffixMixins.push(linkSummaryChunks.join('\n'));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (this.warning) {
|
||||||
|
mixins.push(`Warning: ${this.warning}`);
|
||||||
|
}
|
||||||
|
|
||||||
return `Title: ${this.title}
|
return `Title: ${this.title}
|
||||||
|
|
||||||
URL Source: ${this.url}
|
URL Source: ${this.url}
|
||||||
@ -418,6 +433,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
||||||
mixin.links = _.invert(inferred.links || {});
|
mixin.links = _.invert(inferred.links || {});
|
||||||
}
|
}
|
||||||
|
// Attach a warning to the mixin when the target page's recorded HTTP status
// is not a success. A missing (or 0) status is skipped.
if (snapshot.status) {
    const code = snapshot.status;
    // Only codes outside 200-399 are reported. The earlier check
    // (`n < 100 || n >= 100`, with n = code - 200) was true for every number,
    // so even a plain 200 OK was flagged as an error.
    // NOTE(review): 3xx is treated as non-error here (e.g. 304 Not Modified);
    // tighten to `code >= 300` if redirects should also be reported.
    if (code < 200 || code >= 400) {
        // Prefer the server-supplied reason phrase; fall back to Node's
        // standard phrase for the code when it is empty.
        const text = snapshot.statusText || STATUS_CODES[code];
        mixin.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
    }
}
|
||||||
|
|
||||||
return mixin;
|
return mixin;
|
||||||
}
|
}
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit d287049d46781bff2032b02a2bd4322239145c95
|
Subproject commit 9258853d626758cb14dce55ae4aeaaca9fc4cfd2
|
Loading…
x
Reference in New Issue
Block a user