fix: catch jsdom errors

This commit is contained in:
Yanlong Wang 2025-03-08 23:17:53 +08:00
parent da48d0e4a7
commit 3020d589b6
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
5 changed files with 35 additions and 28 deletions

8
package-lock.json generated
View File

@ -17,7 +17,7 @@
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.4-9d62ed1",
"civkit": "^0.8.4-31171c2",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
@ -4095,9 +4095,9 @@
}
},
"node_modules/civkit": {
"version": "0.8.4-9d62ed1",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-9d62ed1.tgz",
"integrity": "sha512-uDmUCjsISAVrJvandGCUm7zTseDhAKISaPwYev73s2VGwZsvG8K/pF4ErSKWp54soNA96RSamyrkVDayqEpHmQ==",
"version": "0.8.4-31171c2",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-31171c2.tgz",
"integrity": "sha512-Orr2pl/LmXpwKICWmW/IrUFeOXnuECTceqpL0GdYAbnzh66Zlew5CxM+fyZdBStq1DqXjh5wJyCBqHe+aM3nNQ==",
"license": "AGPL",
"dependencies": {
"lodash": "^4.17.21",

View File

@ -25,7 +25,7 @@
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.4-9d62ed1",
"civkit": "^0.8.4-31171c2",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",

View File

@ -960,6 +960,21 @@ export class CrawlerHost extends RPCHost {
proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
private: Boolean(opts.doNotTrack),
};
if (crawlOpts.targetSelector?.length) {
if (typeof crawlOpts.targetSelector === 'string') {
crawlOpts.targetSelector = [crawlOpts.targetSelector];
}
for (const s of crawlOpts.targetSelector) {
for (const e of s.split(',').map((x)=> x.trim())) {
if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) {
throw new ParamValidationError({
message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`,
path: 'targetSelector'
});
}
}
}
}
if (opts.locale) {
crawlOpts.extraHeaders ??= {};

View File

@ -436,7 +436,6 @@ export class CrawlerOptions extends AutoCastable {
instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined;
const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean);
instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector;
instance.targetSelector = filterSelector(instance.targetSelector);
const overrideUserAgent = ctx?.get('x-user-agent') || undefined;
instance.userAgent ??= overrideUserAgent;
@ -590,21 +589,4 @@ export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
return instance;
}
}
/**
 * Removes selector entries that contain a match-all style component.
 *
 * Each entry may itself be a comma-separated selector list. An entry is
 * dropped as a whole when any of its comma-separated parts (after trimming)
 * starts with `*`, starts with `:`, or contains `*:`.
 *
 * @param s - A single selector string, a list of selector strings, or nothing.
 * @returns The input unchanged when it is falsy (`undefined` or `''`);
 *          otherwise a new array holding only the entries that passed.
 */
function filterSelector(s?: string | string[]) {
    if (!s) {
        return s;
    }

    // A part is rejected when it begins with `*` or `:`, or has `*:` anywhere.
    const isMatchAll = (part: string) => part.startsWith('*') || part.startsWith(':') || part.includes('*:');

    const candidates = Array.isArray(s) ? s : [s];
    const accepted: string[] = [];
    for (const candidate of candidates) {
        const parts = candidate.split(',').map((part) => part.trim());
        // One offending part disqualifies the entire entry, not just that part.
        if (!parts.some(isMatchAll)) {
            accepted.push(candidate);
        }
    }

    return accepted;
}
}

View File

@ -1,5 +1,4 @@
import { container, singleton } from 'tsyringe';
import { AsyncService, marshalErrorLike } from 'civkit';
import { GlobalLogger } from './logger';
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
import { Readability } from '@mozilla/readability';
@ -8,6 +7,8 @@ import { Threaded } from '../services/threaded';
import type { ExtraScrappingOptions } from '../api/crawler';
import { tailwindClasses } from '../utils/tailwind-classes';
import { countGPTToken } from '../shared/utils/openai';
import { AsyncService } from 'civkit/async-service';
import { ApplicationError, AssertionFailureError } from 'civkit/civ-rpc';
const pLinkedom = import('linkedom');
@ -38,8 +39,17 @@ export class JSDomControl extends AsyncService {
return snapshot;
}
// SideLoad contains native objects that cannot go through thread boundaries.
return this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined });
try {
// SideLoad contains native objects that cannot go through thread boundaries.
return await this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined });
} catch (err: any) {
this.logger.warn(`Error narrowing snapshot`, { err });
if (err instanceof ApplicationError) {
throw err;
}
throw new AssertionFailureError(`Failed to process the page: ${err?.message}`);
}
}
@Threaded()
@ -151,7 +161,7 @@ export class JSDomControl extends AsyncService {
try {
parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
} catch (err: any) {
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
this.logger.warn(`Failed to parse selected element`, { err });
}
const imgSet = new Set<string>();