fix: several crash cases

This commit is contained in:
Yanlong Wang 2025-03-09 12:01:52 +08:00
parent 6b9e14de62
commit d0e20cc086
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 74 additions and 50 deletions

View File

@ -8,6 +8,7 @@ import {
AssertionFailureError, ParamValidationError, AssertionFailureError, ParamValidationError,
RawString, RawString,
ApplicationError, ApplicationError,
DataStreamBrokenError,
} from 'civkit/civ-rpc'; } from 'civkit/civ-rpc';
import { marshalErrorLike } from 'civkit/lang'; import { marshalErrorLike } from 'civkit/lang';
import { Defer } from 'civkit/defer'; import { Defer } from 'civkit/defer';
@ -817,7 +818,10 @@ export class CrawlerHost extends RPCHost {
} }
} catch (err: any) { } catch (err: any) {
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href }); this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) { if (err instanceof ApplicationError &&
!(err instanceof ServiceBadAttemptError) &&
!(err instanceof DataStreamBrokenError)
) {
throw err; throw err;
} }
} }
@ -968,7 +972,7 @@ export class CrawlerHost extends RPCHost {
crawlOpts.targetSelector = [crawlOpts.targetSelector]; crawlOpts.targetSelector = [crawlOpts.targetSelector];
} }
for (const s of crawlOpts.targetSelector) { for (const s of crawlOpts.targetSelector) {
for (const e of s.split(',').map((x)=> x.trim())) { for (const e of s.split(',').map((x) => x.trim())) {
if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) { if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) {
throw new ParamValidationError({ throw new ParamValidationError({
message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`, message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`,

View File

@ -2,7 +2,7 @@ import 'core-js/actual/promise/with-resolvers';
import { singleton } from 'tsyringe'; import { singleton } from 'tsyringe';
import _ from 'lodash'; import _ from 'lodash';
import { TextItem } from 'pdfjs-dist/types/src/display/api'; import { TextItem } from 'pdfjs-dist/types/src/display/api';
import { AsyncService, HashManager } from 'civkit'; import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
import { GlobalLogger } from './logger'; import { GlobalLogger } from './logger';
import { PDFContent } from '../db/pdf'; import { PDFContent } from '../db/pdf';
import dayjs from 'dayjs'; import dayjs from 'dayjs';
@ -325,27 +325,27 @@ export class PDFExtractor extends AsyncService {
try { try {
extracted = await this.extract(data); extracted = await this.extract(data);
} catch (err: any) {
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
}
if (!this.asyncLocalContext.ctx.DNT) { if (!this.asyncLocalContext.ctx.DNT) {
const theID = randomUUID(); const theID = randomUUID();
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`, await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' }); Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
PDFContent.save( PDFContent.save(
PDFContent.from({ PDFContent.from({
_id: theID, _id: theID,
src: nameUrl, src: nameUrl,
meta: extracted?.meta || {}, meta: extracted?.meta || {},
urlDigest: digest, urlDigest: digest,
createdAt: new Date(), createdAt: new Date(),
expireAt: new Date(Date.now() + this.cacheRetentionMs) expireAt: new Date(Date.now() + this.cacheRetentionMs)
}).degradeForFireStore() }).degradeForFireStore()
).catch((r) => { ).catch((r) => {
this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r }); this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
}); });
}
} catch (err) {
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
throw err;
} }
return extracted; return extracted;

View File

@ -846,7 +846,7 @@ export class PuppeteerControl extends AsyncService {
const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin]; const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
const ctx = this.lifeCycleTrack.get(page); const ctx = this.lifeCycleTrack.get(page);
if (proxy && ctx) { if (proxy && ctx) {
return this.asyncLocalContext.bridge(ctx, async () => { return await this.asyncLocalContext.bridge(ctx, async () => {
try { try {
const curled = await this.curlControl.sideLoad(reqUrlParsed, { const curled = await this.curlControl.sideLoad(reqUrlParsed, {
...options, ...options,
@ -890,7 +890,7 @@ export class PuppeteerControl extends AsyncService {
headers: _.omit(firstReq, 'result'), headers: _.omit(firstReq, 'result'),
}, 999); }, 999);
} catch (err: any) { } catch (err: any) {
this.logger.warn(`Failed to sideload ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err: marshalErrorLike(err) }); this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
} }
if (req.isInterceptResolutionHandled()) { if (req.isInterceptResolutionHandled()) {
return; return;

View File

@ -1,6 +1,6 @@
import { randomUUID } from 'crypto'; import { randomUUID } from 'crypto';
import { container, singleton } from 'tsyringe'; import { container, singleton } from 'tsyringe';
import { AssertionFailureError, AsyncService, FancyFile, HashManager, marshalErrorLike } from 'civkit'; import { AssertionFailureError, AsyncService, DataStreamBrokenError, FancyFile, HashManager, marshalErrorLike } from 'civkit';
import TurndownService, { Filter, Rule } from 'turndown'; import TurndownService, { Filter, Rule } from 'turndown';
import { GlobalLogger } from './logger'; import { GlobalLogger } from './logger';
import { PageSnapshot } from './puppeteer'; import { PageSnapshot } from './puppeteer';
@ -406,7 +406,7 @@ export class SnapshotFormatter extends AsyncService {
const text = snapshot.statusText || STATUS_CODES[code]; const text = snapshot.statusText || STATUS_CODES[code];
formatted.warning ??= ''; formatted.warning ??= '';
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`; const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
} }
} }
@ -441,23 +441,23 @@ export class SnapshotFormatter extends AsyncService {
formatted.warning ??= ''; formatted.warning ??= '';
if (snapshot.isIntermediate) { if (snapshot.isIntermediate) {
const msg = 'This page maybe not yet fully loaded, consider explicitly specify a timeout.'; const msg = 'This page maybe not yet fully loaded, consider explicitly specify a timeout.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
} }
if (snapshot.childFrames?.length && !this.threadLocal.get('withIframe')) { if (snapshot.childFrames?.length && !this.threadLocal.get('withIframe')) {
const msg = 'This page contains iframe that are currently hidden, consider enabling iframe processing.'; const msg = 'This page contains iframe that are currently hidden, consider enabling iframe processing.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
} }
if (snapshot.shadowExpanded && !this.threadLocal.get('withShadowDom')) { if (snapshot.shadowExpanded && !this.threadLocal.get('withShadowDom')) {
const msg = 'This page contains shadow DOM that are currently hidden, consider enabling shadow DOM processing.'; const msg = 'This page contains shadow DOM that are currently hidden, consider enabling shadow DOM processing.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
} }
if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile-response')) { if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile-response')) {
const msg = 'This page maybe requiring CAPTCHA, please make sure you are authorized to access this page.'; const msg = 'This page maybe requiring CAPTCHA, please make sure you are authorized to access this page.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
} }
if (snapshot.isFromCache) { if (snapshot.isFromCache) {
const msg = 'This is a cached snapshot of the original page, consider retry with caching opt-out.'; const msg = 'This is a cached snapshot of the original page, consider retry with caching opt-out.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`;
} }
} }
@ -565,7 +565,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
const text = snapshot.statusText || STATUS_CODES[code]; const text = snapshot.statusText || STATUS_CODES[code];
mixin.warning ??= ''; mixin.warning ??= '';
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`; const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
mixin.warning = `${mixin.warning}${mixin.warning ? '\n': ''}${msg}`; mixin.warning = `${mixin.warning}${mixin.warning ? '\n' : ''}${msg}`;
} }
} }
@ -629,6 +629,21 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
} }
} }
// Registers the 'improved-heading' Turndown rule for <h1>–<h6>.
// Levels 1 and 2 are rendered as setext headings (text underlined with '=' or
// '-') when options.headingStyle is 'setext'; everything else is rendered as
// an ATX heading ('#' repeated once per level).
turnDownService.addRule('improved-heading', {
    filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
    replacement: (content, node, options) => {
        // The filter only admits h1–h6, so the second character of nodeName
        // is the heading level digit.
        const depth = Number(node.nodeName.charAt(1));
        const useSetext = options.headingStyle === 'setext' && depth < 3;

        if (!useSetext) {
            const hashes = _.repeat('#', depth);
            return `\n\n${hashes} ${content}\n\n`;
        }

        // The underline matches the heading text length but never exceeds
        // 128 characters (presumably to keep very long headings from
        // producing equally long rulers — confirm intent with the author).
        const rulerChar = depth === 1 ? '=' : '-';
        const ruler = _.repeat(rulerChar, Math.min(128, content.length));
        return `\n\n${content}\n${ruler}\n\n`;
    }
});
turnDownService.addRule('improved-paragraph', { turnDownService.addRule('improved-paragraph', {
filter: 'p', filter: 'p',
replacement: (innerText) => { replacement: (innerText) => {
@ -751,27 +766,32 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return snapshot; return snapshot;
} }
if (contentType.startsWith('text/html')) { try {
if ((await file.size) > 1024 * 1024 * 32) { if (contentType.startsWith('text/html')) {
throw new AssertionFailureError(`Failed to access ${url}: file too large`); if ((await file.size) > 1024 * 1024 * 32) {
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
}
snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });
return snapshot;
} }
snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' }); if (contentType.startsWith('text/') || contentType.startsWith('application/json')) {
if ((await file.size) > 1024 * 1024 * 32) {
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
}
snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
return snapshot; return snapshot;
}
if (contentType.startsWith('text/') || contentType.startsWith('application/json')) {
if ((await file.size) > 1024 * 1024 * 32) {
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
} }
snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' }); if (contentType.startsWith('application/pdf')) {
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`; snapshot.pdfs = [pathToFileURL(await file.filePath).href];
return snapshot; return snapshot;
} }
if (contentType.startsWith('application/pdf')) { } catch (err: any) {
snapshot.pdfs = [pathToFileURL(await file.filePath).href]; this.logger.warn(`Failed to read from file: ${url}`, { err, url });
throw new DataStreamBrokenError(`Failed to access ${url}: ${err?.message}`);
return snapshot;
} }
throw new AssertionFailureError(`Failed to access ${url}: unexpected type ${contentType}`); throw new AssertionFailureError(`Failed to access ${url}: unexpected type ${contentType}`);