mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 22:35:58 +08:00
fix: several crash cases
This commit is contained in:
parent
6b9e14de62
commit
d0e20cc086
@ -8,6 +8,7 @@ import {
|
||||
AssertionFailureError, ParamValidationError,
|
||||
RawString,
|
||||
ApplicationError,
|
||||
DataStreamBrokenError,
|
||||
} from 'civkit/civ-rpc';
|
||||
import { marshalErrorLike } from 'civkit/lang';
|
||||
import { Defer } from 'civkit/defer';
|
||||
@ -817,7 +818,10 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
|
||||
if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
|
||||
if (err instanceof ApplicationError &&
|
||||
!(err instanceof ServiceBadAttemptError) &&
|
||||
!(err instanceof DataStreamBrokenError)
|
||||
) {
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
@ -968,7 +972,7 @@ export class CrawlerHost extends RPCHost {
|
||||
crawlOpts.targetSelector = [crawlOpts.targetSelector];
|
||||
}
|
||||
for (const s of crawlOpts.targetSelector) {
|
||||
for (const e of s.split(',').map((x)=> x.trim())) {
|
||||
for (const e of s.split(',').map((x) => x.trim())) {
|
||||
if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) {
|
||||
throw new ParamValidationError({
|
||||
message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`,
|
||||
|
@ -2,7 +2,7 @@ import 'core-js/actual/promise/with-resolvers';
|
||||
import { singleton } from 'tsyringe';
|
||||
import _ from 'lodash';
|
||||
import { TextItem } from 'pdfjs-dist/types/src/display/api';
|
||||
import { AsyncService, HashManager } from 'civkit';
|
||||
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
|
||||
import { GlobalLogger } from './logger';
|
||||
import { PDFContent } from '../db/pdf';
|
||||
import dayjs from 'dayjs';
|
||||
@ -325,27 +325,27 @@ export class PDFExtractor extends AsyncService {
|
||||
|
||||
try {
|
||||
extracted = await this.extract(data);
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
|
||||
throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
|
||||
}
|
||||
|
||||
if (!this.asyncLocalContext.ctx.DNT) {
|
||||
const theID = randomUUID();
|
||||
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
|
||||
Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
|
||||
PDFContent.save(
|
||||
PDFContent.from({
|
||||
_id: theID,
|
||||
src: nameUrl,
|
||||
meta: extracted?.meta || {},
|
||||
urlDigest: digest,
|
||||
createdAt: new Date(),
|
||||
expireAt: new Date(Date.now() + this.cacheRetentionMs)
|
||||
}).degradeForFireStore()
|
||||
).catch((r) => {
|
||||
this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
|
||||
});
|
||||
}
|
||||
} catch (err) {
|
||||
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
|
||||
throw err;
|
||||
if (!this.asyncLocalContext.ctx.DNT) {
|
||||
const theID = randomUUID();
|
||||
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
|
||||
Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
|
||||
PDFContent.save(
|
||||
PDFContent.from({
|
||||
_id: theID,
|
||||
src: nameUrl,
|
||||
meta: extracted?.meta || {},
|
||||
urlDigest: digest,
|
||||
createdAt: new Date(),
|
||||
expireAt: new Date(Date.now() + this.cacheRetentionMs)
|
||||
}).degradeForFireStore()
|
||||
).catch((r) => {
|
||||
this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
|
||||
});
|
||||
}
|
||||
|
||||
return extracted;
|
||||
|
@ -846,7 +846,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
|
||||
const ctx = this.lifeCycleTrack.get(page);
|
||||
if (proxy && ctx) {
|
||||
return this.asyncLocalContext.bridge(ctx, async () => {
|
||||
return await this.asyncLocalContext.bridge(ctx, async () => {
|
||||
try {
|
||||
const curled = await this.curlControl.sideLoad(reqUrlParsed, {
|
||||
...options,
|
||||
@ -890,7 +890,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
headers: _.omit(firstReq, 'result'),
|
||||
}, 999);
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Failed to sideload ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err: marshalErrorLike(err) });
|
||||
this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
|
||||
}
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
|
@ -1,6 +1,6 @@
|
||||
import { randomUUID } from 'crypto';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { AssertionFailureError, AsyncService, FancyFile, HashManager, marshalErrorLike } from 'civkit';
|
||||
import { AssertionFailureError, AsyncService, DataStreamBrokenError, FancyFile, HashManager, marshalErrorLike } from 'civkit';
|
||||
import TurndownService, { Filter, Rule } from 'turndown';
|
||||
import { GlobalLogger } from './logger';
|
||||
import { PageSnapshot } from './puppeteer';
|
||||
@ -406,7 +406,7 @@ export class SnapshotFormatter extends AsyncService {
|
||||
const text = snapshot.statusText || STATUS_CODES[code];
|
||||
formatted.warning ??= '';
|
||||
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
|
||||
}
|
||||
}
|
||||
|
||||
@ -441,23 +441,23 @@ export class SnapshotFormatter extends AsyncService {
|
||||
formatted.warning ??= '';
|
||||
if (snapshot.isIntermediate) {
|
||||
const msg = 'This page maybe not yet fully loaded, consider explicitly specify a timeout.';
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
|
||||
}
|
||||
if (snapshot.childFrames?.length && !this.threadLocal.get('withIframe')) {
|
||||
const msg = 'This page contains iframe that are currently hidden, consider enabling iframe processing.';
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
|
||||
}
|
||||
if (snapshot.shadowExpanded && !this.threadLocal.get('withShadowDom')) {
|
||||
const msg = 'This page contains shadow DOM that are currently hidden, consider enabling shadow DOM processing.';
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
|
||||
}
|
||||
if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile-response')) {
|
||||
const msg = 'This page maybe requiring CAPTCHA, please make sure you are authorized to access this page.';
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
|
||||
}
|
||||
if (snapshot.isFromCache) {
|
||||
const msg = 'This is a cached snapshot of the original page, consider retry with caching opt-out.';
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
|
||||
}
|
||||
}
|
||||
|
||||
@ -565,7 +565,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
const text = snapshot.statusText || STATUS_CODES[code];
|
||||
mixin.warning ??= '';
|
||||
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
|
||||
mixin.warning = `${mixin.warning}${mixin.warning ? '\n': ''}${msg}`;
|
||||
mixin.warning = `${mixin.warning}${mixin.warning ? '\n' : ''}${msg}`;
|
||||
}
|
||||
}
|
||||
|
||||
@ -629,6 +629,21 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
}
|
||||
}
|
||||
|
||||
turnDownService.addRule('improved-heading', {
|
||||
filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
|
||||
replacement: (content, node, options) => {
|
||||
const hLevel = Number(node.nodeName.charAt(1));
|
||||
if (options.headingStyle === 'setext' && hLevel < 3) {
|
||||
const underline = _.repeat((hLevel === 1 ? '=' : '-'), Math.min(128, content.length));
|
||||
return (
|
||||
'\n\n' + content + '\n' + underline + '\n\n'
|
||||
);
|
||||
} else {
|
||||
return '\n\n' + _.repeat('#', hLevel) + ' ' + content + '\n\n';
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
turnDownService.addRule('improved-paragraph', {
|
||||
filter: 'p',
|
||||
replacement: (innerText) => {
|
||||
@ -751,27 +766,32 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
|
||||
return snapshot;
|
||||
}
|
||||
if (contentType.startsWith('text/html')) {
|
||||
if ((await file.size) > 1024 * 1024 * 32) {
|
||||
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
||||
try {
|
||||
if (contentType.startsWith('text/html')) {
|
||||
if ((await file.size) > 1024 * 1024 * 32) {
|
||||
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
||||
}
|
||||
snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });
|
||||
|
||||
return snapshot;
|
||||
}
|
||||
snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });
|
||||
if (contentType.startsWith('text/') || contentType.startsWith('application/json')) {
|
||||
if ((await file.size) > 1024 * 1024 * 32) {
|
||||
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
||||
}
|
||||
snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
|
||||
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
|
||||
|
||||
return snapshot;
|
||||
}
|
||||
if (contentType.startsWith('text/') || contentType.startsWith('application/json')) {
|
||||
if ((await file.size) > 1024 * 1024 * 32) {
|
||||
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
||||
return snapshot;
|
||||
}
|
||||
snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
|
||||
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
|
||||
if (contentType.startsWith('application/pdf')) {
|
||||
snapshot.pdfs = [pathToFileURL(await file.filePath).href];
|
||||
|
||||
return snapshot;
|
||||
}
|
||||
if (contentType.startsWith('application/pdf')) {
|
||||
snapshot.pdfs = [pathToFileURL(await file.filePath).href];
|
||||
|
||||
return snapshot;
|
||||
return snapshot;
|
||||
}
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Failed to read from file: ${url}`, { err, url });
|
||||
throw new DataStreamBrokenError(`Failed to access ${url}: ${err?.message}`);
|
||||
}
|
||||
|
||||
throw new AssertionFailureError(`Failed to access ${url}: unexpected type ${contentType}`);
|
||||
|
Loading…
x
Reference in New Issue
Block a user