mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-13 22:25:56 +08:00
fix: catch jsdom errors
This commit is contained in:
parent
da48d0e4a7
commit
3020d589b6
8
package-lock.json
generated
8
package-lock.json
generated
@ -17,7 +17,7 @@
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.8.4-9d62ed1",
|
||||
"civkit": "^0.8.4-31171c2",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
@ -4095,9 +4095,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/civkit": {
|
||||
"version": "0.8.4-9d62ed1",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-9d62ed1.tgz",
|
||||
"integrity": "sha512-uDmUCjsISAVrJvandGCUm7zTseDhAKISaPwYev73s2VGwZsvG8K/pF4ErSKWp54soNA96RSamyrkVDayqEpHmQ==",
|
||||
"version": "0.8.4-31171c2",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-31171c2.tgz",
|
||||
"integrity": "sha512-Orr2pl/LmXpwKICWmW/IrUFeOXnuECTceqpL0GdYAbnzh66Zlew5CxM+fyZdBStq1DqXjh5wJyCBqHe+aM3nNQ==",
|
||||
"license": "AGPL",
|
||||
"dependencies": {
|
||||
"lodash": "^4.17.21",
|
||||
|
@ -25,7 +25,7 @@
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.8.4-9d62ed1",
|
||||
"civkit": "^0.8.4-31171c2",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
|
@ -960,6 +960,21 @@ export class CrawlerHost extends RPCHost {
|
||||
proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
|
||||
private: Boolean(opts.doNotTrack),
|
||||
};
|
||||
if (crawlOpts.targetSelector?.length) {
|
||||
if (typeof crawlOpts.targetSelector === 'string') {
|
||||
crawlOpts.targetSelector = [crawlOpts.targetSelector];
|
||||
}
|
||||
for (const s of crawlOpts.targetSelector) {
|
||||
for (const e of s.split(',').map((x)=> x.trim())) {
|
||||
if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) {
|
||||
throw new ParamValidationError({
|
||||
message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`,
|
||||
path: 'targetSelector'
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (opts.locale) {
|
||||
crawlOpts.extraHeaders ??= {};
|
||||
|
@ -436,7 +436,6 @@ export class CrawlerOptions extends AutoCastable {
|
||||
instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined;
|
||||
const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean);
|
||||
instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector;
|
||||
instance.targetSelector = filterSelector(instance.targetSelector);
|
||||
const overrideUserAgent = ctx?.get('x-user-agent') || undefined;
|
||||
instance.userAgent ??= overrideUserAgent;
|
||||
|
||||
@ -590,21 +589,4 @@ export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
|
||||
|
||||
return instance;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Removes selector entries that contain a match-all style selector.
 *
 * Each entry may itself be a comma-separated selector list. An entry is
 * dropped as a whole when any of its comma-separated parts, after trimming
 * surrounding whitespace, starts with `*`, starts with `:`, or contains `*:`.
 * Surviving entries are returned untouched (not trimmed or re-joined).
 *
 * @param s - A single selector string, a list of selector strings, or nothing.
 * @returns The input itself when it is falsy (`undefined` or `''`); otherwise
 * a new array holding the accepted entries, which may be empty.
 */
function filterSelector(s?: string | string[]) {
    if (!s) {
        // Nothing to filter: hand back `undefined` / `''` unchanged.
        return s;
    }

    // Normalise a lone selector string to the list form.
    const entries = Array.isArray(s) ? s : [s];

    return entries.filter((entry) => {
        const hasViolation = entry
            .split(',')
            .map((part) => part.trim())
            .some((part) => part.startsWith('*') || part.startsWith(':') || part.includes('*:'));

        return !hasViolation;
    });
}
|
||||
}
|
@ -1,5 +1,4 @@
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { AsyncService, marshalErrorLike } from 'civkit';
|
||||
import { GlobalLogger } from './logger';
|
||||
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
|
||||
import { Readability } from '@mozilla/readability';
|
||||
@ -8,6 +7,8 @@ import { Threaded } from '../services/threaded';
|
||||
import type { ExtraScrappingOptions } from '../api/crawler';
|
||||
import { tailwindClasses } from '../utils/tailwind-classes';
|
||||
import { countGPTToken } from '../shared/utils/openai';
|
||||
import { AsyncService } from 'civkit/async-service';
|
||||
import { ApplicationError, AssertionFailureError } from 'civkit/civ-rpc';
|
||||
|
||||
const pLinkedom = import('linkedom');
|
||||
|
||||
@ -38,8 +39,17 @@ export class JSDomControl extends AsyncService {
|
||||
return snapshot;
|
||||
}
|
||||
|
||||
// SideLoad contains native objects that cannot go through thread boundaries.
|
||||
return this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined });
|
||||
try {
|
||||
// SideLoad contains native objects that cannot go through thread boundaries.
|
||||
return await this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined });
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Error narrowing snapshot`, { err });
|
||||
if (err instanceof ApplicationError) {
|
||||
throw err;
|
||||
}
|
||||
|
||||
throw new AssertionFailureError(`Failed to process the page: ${err?.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
@Threaded()
|
||||
@ -151,7 +161,7 @@ export class JSDomControl extends AsyncService {
|
||||
try {
|
||||
parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
||||
this.logger.warn(`Failed to parse selected element`, { err });
|
||||
}
|
||||
|
||||
const imgSet = new Set<string>();
|
||||
|
Loading…
x
Reference in New Issue
Block a user