fix: several crash cases

This commit is contained in:
Yanlong Wang 2025-03-09 12:01:52 +08:00
parent 6b9e14de62
commit d0e20cc086
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 74 additions and 50 deletions

View File

@ -8,6 +8,7 @@ import {
AssertionFailureError, ParamValidationError,
RawString,
ApplicationError,
DataStreamBrokenError,
} from 'civkit/civ-rpc';
import { marshalErrorLike } from 'civkit/lang';
import { Defer } from 'civkit/defer';
@ -817,7 +818,10 @@ export class CrawlerHost extends RPCHost {
}
} catch (err: any) {
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
if (err instanceof ApplicationError &&
!(err instanceof ServiceBadAttemptError) &&
!(err instanceof DataStreamBrokenError)
) {
throw err;
}
}
@ -968,7 +972,7 @@ export class CrawlerHost extends RPCHost {
crawlOpts.targetSelector = [crawlOpts.targetSelector];
}
for (const s of crawlOpts.targetSelector) {
for (const e of s.split(',').map((x)=> x.trim())) {
for (const e of s.split(',').map((x) => x.trim())) {
if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) {
throw new ParamValidationError({
message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`,

View File

@ -2,7 +2,7 @@ import 'core-js/actual/promise/with-resolvers';
import { singleton } from 'tsyringe';
import _ from 'lodash';
import { TextItem } from 'pdfjs-dist/types/src/display/api';
import { AsyncService, HashManager } from 'civkit';
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
import { GlobalLogger } from './logger';
import { PDFContent } from '../db/pdf';
import dayjs from 'dayjs';
@ -325,27 +325,27 @@ export class PDFExtractor extends AsyncService {
try {
extracted = await this.extract(data);
} catch (err: any) {
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
}
if (!this.asyncLocalContext.ctx.DNT) {
const theID = randomUUID();
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
PDFContent.save(
PDFContent.from({
_id: theID,
src: nameUrl,
meta: extracted?.meta || {},
urlDigest: digest,
createdAt: new Date(),
expireAt: new Date(Date.now() + this.cacheRetentionMs)
}).degradeForFireStore()
).catch((r) => {
this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
});
}
} catch (err) {
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
throw err;
if (!this.asyncLocalContext.ctx.DNT) {
const theID = randomUUID();
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
PDFContent.save(
PDFContent.from({
_id: theID,
src: nameUrl,
meta: extracted?.meta || {},
urlDigest: digest,
createdAt: new Date(),
expireAt: new Date(Date.now() + this.cacheRetentionMs)
}).degradeForFireStore()
).catch((r) => {
this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
});
}
return extracted;

View File

@ -846,7 +846,7 @@ export class PuppeteerControl extends AsyncService {
const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
const ctx = this.lifeCycleTrack.get(page);
if (proxy && ctx) {
return this.asyncLocalContext.bridge(ctx, async () => {
return await this.asyncLocalContext.bridge(ctx, async () => {
try {
const curled = await this.curlControl.sideLoad(reqUrlParsed, {
...options,
@ -890,7 +890,7 @@ export class PuppeteerControl extends AsyncService {
headers: _.omit(firstReq, 'result'),
}, 999);
} catch (err: any) {
this.logger.warn(`Failed to sideload ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err: marshalErrorLike(err) });
this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
}
if (req.isInterceptResolutionHandled()) {
return;

View File

@ -1,6 +1,6 @@
import { randomUUID } from 'crypto';
import { container, singleton } from 'tsyringe';
import { AssertionFailureError, AsyncService, FancyFile, HashManager, marshalErrorLike } from 'civkit';
import { AssertionFailureError, AsyncService, DataStreamBrokenError, FancyFile, HashManager, marshalErrorLike } from 'civkit';
import TurndownService, { Filter, Rule } from 'turndown';
import { GlobalLogger } from './logger';
import { PageSnapshot } from './puppeteer';
@ -406,7 +406,7 @@ export class SnapshotFormatter extends AsyncService {
const text = snapshot.statusText || STATUS_CODES[code];
formatted.warning ??= '';
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
}
}
@ -441,23 +441,23 @@ export class SnapshotFormatter extends AsyncService {
formatted.warning ??= '';
if (snapshot.isIntermediate) {
const msg = 'This page maybe not yet fully loaded, consider explicitly specify a timeout.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
}
if (snapshot.childFrames?.length && !this.threadLocal.get('withIframe')) {
const msg = 'This page contains iframe that are currently hidden, consider enabling iframe processing.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
}
if (snapshot.shadowExpanded && !this.threadLocal.get('withShadowDom')) {
const msg = 'This page contains shadow DOM that are currently hidden, consider enabling shadow DOM processing.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
}
if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile-response')) {
const msg = 'This page maybe requiring CAPTCHA, please make sure you are authorized to access this page.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
}
if (snapshot.isFromCache) {
const msg = 'This is a cached snapshot of the original page, consider retry with caching opt-out.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
}
}
@ -565,7 +565,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
const text = snapshot.statusText || STATUS_CODES[code];
mixin.warning ??= '';
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
mixin.warning = `${mixin.warning}${mixin.warning ? '\n': ''}${msg}`;
mixin.warning = `${mixin.warning}${mixin.warning ? '\n' : ''}${msg}`;
}
}
@ -629,6 +629,21 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
}
}
// Registers the 'improved-heading' rule for h1–h6 elements.
// When the heading style option is 'setext' and the level is 1 or 2, the
// heading text is underlined with '=' (level 1) or '-' (level 2); the
// underline length follows the content length but is capped at 128 characters.
// Every other case produces an ATX heading with one '#' per level.
turnDownService.addRule('improved-heading', {
    filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
    replacement: (content, node, options) => {
        // The heading level is read from the second character of the node name.
        const level = Number(node.nodeName.charAt(1));
        const useSetext = options.headingStyle === 'setext' && level < 3;

        if (!useSetext) {
            return `\n\n${_.repeat('#', level)} ${content}\n\n`;
        }

        const underlineChar = level === 1 ? '=' : '-';
        // Cap the underline so a very long heading cannot produce an oversized line.
        const underlineLength = Math.min(128, content.length);

        return `\n\n${content}\n${_.repeat(underlineChar, underlineLength)}\n\n`;
    }
});
turnDownService.addRule('improved-paragraph', {
filter: 'p',
replacement: (innerText) => {
@ -751,27 +766,32 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return snapshot;
}
if (contentType.startsWith('text/html')) {
if ((await file.size) > 1024 * 1024 * 32) {
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
try {
if (contentType.startsWith('text/html')) {
if ((await file.size) > 1024 * 1024 * 32) {
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
}
snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });
return snapshot;
}
snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });
if (contentType.startsWith('text/') || contentType.startsWith('application/json')) {
if ((await file.size) > 1024 * 1024 * 32) {
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
}
snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
return snapshot;
}
if (contentType.startsWith('text/') || contentType.startsWith('application/json')) {
if ((await file.size) > 1024 * 1024 * 32) {
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
return snapshot;
}
snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
if (contentType.startsWith('application/pdf')) {
snapshot.pdfs = [pathToFileURL(await file.filePath).href];
return snapshot;
}
if (contentType.startsWith('application/pdf')) {
snapshot.pdfs = [pathToFileURL(await file.filePath).href];
return snapshot;
return snapshot;
}
} catch (err: any) {
this.logger.warn(`Failed to read from file: ${url}`, { err, url });
throw new DataStreamBrokenError(`Failed to access ${url}: ${err?.message}`);
}
throw new AssertionFailureError(`Failed to access ${url}: unexpected type ${contentType}`);