diff --git a/package-lock.json b/package-lock.json index 7040fde..973931c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17,7 +17,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.4-9d62ed1", + "civkit": "^0.8.4-31171c2", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", @@ -4095,9 +4095,9 @@ } }, "node_modules/civkit": { - "version": "0.8.4-9d62ed1", - "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-9d62ed1.tgz", - "integrity": "sha512-uDmUCjsISAVrJvandGCUm7zTseDhAKISaPwYev73s2VGwZsvG8K/pF4ErSKWp54soNA96RSamyrkVDayqEpHmQ==", + "version": "0.8.4-31171c2", + "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-31171c2.tgz", + "integrity": "sha512-Orr2pl/LmXpwKICWmW/IrUFeOXnuECTceqpL0GdYAbnzh66Zlew5CxM+fyZdBStq1DqXjh5wJyCBqHe+aM3nNQ==", "license": "AGPL", "dependencies": { "lodash": "^4.17.21", diff --git a/package.json b/package.json index d51809a..7058a4d 100644 --- a/package.json +++ b/package.json @@ -25,7 +25,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.4-9d62ed1", + "civkit": "^0.8.4-31171c2", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 77cd822..d241d87 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -960,6 +960,21 @@ export class CrawlerHost extends RPCHost { proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false, private: Boolean(opts.doNotTrack), }; + if (crawlOpts.targetSelector?.length) { + if (typeof crawlOpts.targetSelector === 'string') { + crawlOpts.targetSelector = [crawlOpts.targetSelector]; + } + for (const s of crawlOpts.targetSelector) { + for (const e of s.split(',').map((x)=> x.trim())) { + if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) { + throw new ParamValidationError({ + message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`, + path: 'targetSelector' + }); + } + } + } + } if (opts.locale) { crawlOpts.extraHeaders ??= {}; diff --git a/src/dto/crawler-options.ts b/src/dto/crawler-options.ts index 391aed6..1101719 100644 --- a/src/dto/crawler-options.ts +++ b/src/dto/crawler-options.ts @@ -436,7 +436,6 @@ export class CrawlerOptions extends AutoCastable { instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined; const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean); instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector; - instance.targetSelector = filterSelector(instance.targetSelector); const overrideUserAgent = ctx?.get('x-user-agent') || undefined; instance.userAgent ??= overrideUserAgent; @@ -590,21 +589,4 @@ export class CrawlerOptionsHeaderOnly extends CrawlerOptions { return instance; } -} - -function filterSelector(s?: string | string[]) { - if (!s) { - return s; - } - const sr = Array.isArray(s) ? s : [s]; - const selectors = sr.filter((i) => { - const innerSelectors = i.split(',').map((s) => s.trim()); - const someViolation = innerSelectors.find((x) => x.startsWith('*') || x.startsWith(':') || x.includes('*:')); - if (someViolation) { - return false; - } - return true; - }); - - return selectors; -}; +} \ No newline at end of file diff --git a/src/services/jsdom.ts b/src/services/jsdom.ts index 1bf2ac0..6bd960d 100644 --- a/src/services/jsdom.ts +++ b/src/services/jsdom.ts @@ -1,5 +1,4 @@ import { container, singleton } from 'tsyringe'; -import { AsyncService, marshalErrorLike } from 'civkit'; import { GlobalLogger } from './logger'; import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer'; import { Readability } from '@mozilla/readability'; @@ -8,6 +7,8 @@ import { Threaded } from '../services/threaded'; import type { ExtraScrappingOptions } from '../api/crawler'; import { tailwindClasses } from '../utils/tailwind-classes'; import { countGPTToken } from '../shared/utils/openai'; +import { AsyncService } from 'civkit/async-service'; +import { ApplicationError, AssertionFailureError } from 'civkit/civ-rpc'; const pLinkedom = import('linkedom'); @@ -38,8 +39,17 @@ export class JSDomControl extends AsyncService { return snapshot; } - // SideLoad contains native objects that cannot go through thread boundaries. - return this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined }); + try { + // SideLoad contains native objects that cannot go through thread boundaries. + return await this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined }); + } catch (err: any) { + this.logger.warn(`Error narrowing snapshot`, { err }); + if (err instanceof ApplicationError) { + throw err; + } + + throw new AssertionFailureError(`Failed to process the page: ${err?.message}`); + } } @Threaded() @@ -151,7 +161,7 @@ export class JSDomControl extends AsyncService { try { parsed = new Readability(rootDoc.cloneNode(true) as any).parse(); } catch (err: any) { - this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) }); + this.logger.warn(`Failed to parse selected element`, { err }); } const imgSet = new Set();