diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 05334cc..5d41824 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -8,6 +8,7 @@ import { AssertionFailureError, ParamValidationError, RawString, ApplicationError, + DataStreamBrokenError, } from 'civkit/civ-rpc'; import { marshalErrorLike } from 'civkit/lang'; import { Defer } from 'civkit/defer'; @@ -817,7 +818,10 @@ export class CrawlerHost extends RPCHost { } } catch (err: any) { this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href }); - if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) { + if (err instanceof ApplicationError && + !(err instanceof ServiceBadAttemptError) && + !(err instanceof DataStreamBrokenError) + ) { throw err; } } @@ -968,7 +972,7 @@ export class CrawlerHost extends RPCHost { crawlOpts.targetSelector = [crawlOpts.targetSelector]; } for (const s of crawlOpts.targetSelector) { - for (const e of s.split(',').map((x)=> x.trim())) { + for (const e of s.split(',').map((x) => x.trim())) { if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) { throw new ParamValidationError({ message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`, diff --git a/src/services/pdf-extract.ts b/src/services/pdf-extract.ts index 4a8e388..634e909 100644 --- a/src/services/pdf-extract.ts +++ b/src/services/pdf-extract.ts @@ -2,7 +2,7 @@ import 'core-js/actual/promise/with-resolvers'; import { singleton } from 'tsyringe'; import _ from 'lodash'; import { TextItem } from 'pdfjs-dist/types/src/display/api'; -import { AsyncService, HashManager } from 'civkit'; +import { AssertionFailureError, AsyncService, HashManager } from 'civkit'; import { GlobalLogger } from './logger'; import { PDFContent } from '../db/pdf'; import dayjs from 'dayjs'; @@ -325,27 +325,27 @@ export class PDFExtractor extends AsyncService { try { extracted = await this.extract(data); + } catch (err: any) { + this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl }); + throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`); + } - if (!this.asyncLocalContext.ctx.DNT) { - const theID = randomUUID(); - await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`, - Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' }); - PDFContent.save( - PDFContent.from({ - _id: theID, - src: nameUrl, - meta: extracted?.meta || {}, - urlDigest: digest, - createdAt: new Date(), - expireAt: new Date(Date.now() + this.cacheRetentionMs) - }).degradeForFireStore() - ).catch((r) => { - this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r }); - }); - } - } catch (err) { - this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err }); - throw err; + if (!this.asyncLocalContext.ctx.DNT) { + const theID = randomUUID(); + await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`, + Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' }); + PDFContent.save( + PDFContent.from({ + _id: theID, + src: nameUrl, + meta: extracted?.meta || {}, + urlDigest: digest, + createdAt: new Date(), + expireAt: new Date(Date.now() + this.cacheRetentionMs) + }).degradeForFireStore() + ).catch((r) => { + this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r }); + }); } return extracted; diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index e3cc31e..ed851a5 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -846,7 +846,7 @@ export class PuppeteerControl extends AsyncService { const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin]; const ctx = this.lifeCycleTrack.get(page); if (proxy && ctx) { - return this.asyncLocalContext.bridge(ctx, async () => { + return await this.asyncLocalContext.bridge(ctx, async () => { try { const curled = await this.curlControl.sideLoad(reqUrlParsed, { ...options, @@ -890,7 +890,7 @@ export class PuppeteerControl extends AsyncService { headers: _.omit(firstReq, 'result'), }, 999); } catch (err: any) { - this.logger.warn(`Failed to sideload ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err: marshalErrorLike(err) }); + this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy }); } if (req.isInterceptResolutionHandled()) { return; diff --git a/src/services/snapshot-formatter.ts b/src/services/snapshot-formatter.ts index 11382d6..115f6b5 100644 --- a/src/services/snapshot-formatter.ts +++ b/src/services/snapshot-formatter.ts @@ -1,6 +1,6 @@ import { randomUUID } from 'crypto'; import { container, singleton } from 'tsyringe'; -import { AssertionFailureError, AsyncService, FancyFile, HashManager, marshalErrorLike } from 'civkit'; +import { AssertionFailureError, AsyncService, DataStreamBrokenError, FancyFile, HashManager, marshalErrorLike } from 'civkit'; import TurndownService, { Filter, Rule } from 'turndown'; import { GlobalLogger } from './logger'; import { PageSnapshot } from './puppeteer'; @@ -406,7 +406,7 @@ export class SnapshotFormatter extends AsyncService { const text = snapshot.statusText || STATUS_CODES[code]; formatted.warning ??= ''; const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`; - formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; + formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`; } } @@ -441,23 +441,23 @@ export class SnapshotFormatter extends AsyncService { formatted.warning ??= ''; if (snapshot.isIntermediate) { const msg = 'This page maybe not yet fully loaded, consider explicitly specify a timeout.'; - formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; + formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`; } if (snapshot.childFrames?.length && !this.threadLocal.get('withIframe')) { const msg = 'This page contains iframe that are currently hidden, consider enabling iframe processing.'; - formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; + formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`; } if (snapshot.shadowExpanded && !this.threadLocal.get('withShadowDom')) { const msg = 'This page contains shadow DOM that are currently hidden, consider enabling shadow DOM processing.'; - formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; + formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`; } if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile-response')) { const msg = 'This page maybe requiring CAPTCHA, please make sure you are authorized to access this page.'; - formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; + formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`; } if (snapshot.isFromCache) { const msg = 'This is a cached snapshot of the original page, consider retry with caching opt-out.'; - formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; + formatted.warning = `${formatted.warning}${formatted.warning ? '\n' : ''}${msg}`; } } @@ -565,7 +565,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; const text = snapshot.statusText || STATUS_CODES[code]; mixin.warning ??= ''; const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`; - mixin.warning = `${mixin.warning}${mixin.warning ? '\n': ''}${msg}`; + mixin.warning = `${mixin.warning}${mixin.warning ? '\n' : ''}${msg}`; } } @@ -629,6 +629,21 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; } } + turnDownService.addRule('improved-heading', { + filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], + replacement: (content, node, options) => { + const hLevel = Number(node.nodeName.charAt(1)); + if (options.headingStyle === 'setext' && hLevel < 3) { + const underline = _.repeat((hLevel === 1 ? '=' : '-'), Math.min(128, content.length)); + return ( + '\n\n' + content + '\n' + underline + '\n\n' + ); + } else { + return '\n\n' + _.repeat('#', hLevel) + ' ' + content + '\n\n'; + } + } + }); + turnDownService.addRule('improved-paragraph', { filter: 'p', replacement: (innerText) => { @@ -751,27 +766,32 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; return snapshot; } - if (contentType.startsWith('text/html')) { - if ((await file.size) > 1024 * 1024 * 32) { - throw new AssertionFailureError(`Failed to access ${url}: file too large`); + try { + if (contentType.startsWith('text/html')) { + if ((await file.size) > 1024 * 1024 * 32) { + throw new AssertionFailureError(`Failed to access ${url}: file too large`); + } + snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' }); + + return snapshot; } - snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' }); + if (contentType.startsWith('text/') || contentType.startsWith('application/json')) { + if ((await file.size) > 1024 * 1024 * 32) { + throw new AssertionFailureError(`Failed to access ${url}: file too large`); + } + snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' }); + snapshot.html = `
${snapshot.text}`; - return snapshot; - } - if (contentType.startsWith('text/') || contentType.startsWith('application/json')) { - if ((await file.size) > 1024 * 1024 * 32) { - throw new AssertionFailureError(`Failed to access ${url}: file too large`); + return snapshot; } - snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' }); - snapshot.html = `
${snapshot.text}`; + if (contentType.startsWith('application/pdf')) { + snapshot.pdfs = [pathToFileURL(await file.filePath).href]; - return snapshot; - } - if (contentType.startsWith('application/pdf')) { - snapshot.pdfs = [pathToFileURL(await file.filePath).href]; - - return snapshot; + return snapshot; + } + } catch (err: any) { + this.logger.warn(`Failed to read from file: ${url}`, { err, url }); + throw new DataStreamBrokenError(`Failed to access ${url}: ${err?.message}`); } throw new AssertionFailureError(`Failed to access ${url}: unexpected type ${contentType}`);