mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 01:55:53 +08:00
fix: catch jsdom errors
This commit is contained in:
parent
da48d0e4a7
commit
3020d589b6
8
package-lock.json
generated
8
package-lock.json
generated
@ -17,7 +17,7 @@
|
|||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"busboy": "^1.6.0",
|
"busboy": "^1.6.0",
|
||||||
"civkit": "^0.8.4-9d62ed1",
|
"civkit": "^0.8.4-31171c2",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
@ -4095,9 +4095,9 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/civkit": {
|
"node_modules/civkit": {
|
||||||
"version": "0.8.4-9d62ed1",
|
"version": "0.8.4-31171c2",
|
||||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-9d62ed1.tgz",
|
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-31171c2.tgz",
|
||||||
"integrity": "sha512-uDmUCjsISAVrJvandGCUm7zTseDhAKISaPwYev73s2VGwZsvG8K/pF4ErSKWp54soNA96RSamyrkVDayqEpHmQ==",
|
"integrity": "sha512-Orr2pl/LmXpwKICWmW/IrUFeOXnuECTceqpL0GdYAbnzh66Zlew5CxM+fyZdBStq1DqXjh5wJyCBqHe+aM3nNQ==",
|
||||||
"license": "AGPL",
|
"license": "AGPL",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"lodash": "^4.17.21",
|
"lodash": "^4.17.21",
|
||||||
|
@ -25,7 +25,7 @@
|
|||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"busboy": "^1.6.0",
|
"busboy": "^1.6.0",
|
||||||
"civkit": "^0.8.4-9d62ed1",
|
"civkit": "^0.8.4-31171c2",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
|
@ -960,6 +960,21 @@ export class CrawlerHost extends RPCHost {
|
|||||||
proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
|
proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
|
||||||
private: Boolean(opts.doNotTrack),
|
private: Boolean(opts.doNotTrack),
|
||||||
};
|
};
|
||||||
|
if (crawlOpts.targetSelector?.length) {
|
||||||
|
if (typeof crawlOpts.targetSelector === 'string') {
|
||||||
|
crawlOpts.targetSelector = [crawlOpts.targetSelector];
|
||||||
|
}
|
||||||
|
for (const s of crawlOpts.targetSelector) {
|
||||||
|
for (const e of s.split(',').map((x)=> x.trim())) {
|
||||||
|
if (e.startsWith('*') || e.startsWith(':') || e.includes('*:')) {
|
||||||
|
throw new ParamValidationError({
|
||||||
|
message: `Unacceptable selector: '${e}'. We cannot accept match-all selector for performance reasons. Sorry.`,
|
||||||
|
path: 'targetSelector'
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (opts.locale) {
|
if (opts.locale) {
|
||||||
crawlOpts.extraHeaders ??= {};
|
crawlOpts.extraHeaders ??= {};
|
||||||
|
@ -436,7 +436,6 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined;
|
instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined;
|
||||||
const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean);
|
const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean);
|
||||||
instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector;
|
instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector;
|
||||||
instance.targetSelector = filterSelector(instance.targetSelector);
|
|
||||||
const overrideUserAgent = ctx?.get('x-user-agent') || undefined;
|
const overrideUserAgent = ctx?.get('x-user-agent') || undefined;
|
||||||
instance.userAgent ??= overrideUserAgent;
|
instance.userAgent ??= overrideUserAgent;
|
||||||
|
|
||||||
@ -591,20 +590,3 @@ export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
|
|||||||
return instance;
|
return instance;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function filterSelector(s?: string | string[]) {
|
|
||||||
if (!s) {
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
const sr = Array.isArray(s) ? s : [s];
|
|
||||||
const selectors = sr.filter((i) => {
|
|
||||||
const innerSelectors = i.split(',').map((s) => s.trim());
|
|
||||||
const someViolation = innerSelectors.find((x) => x.startsWith('*') || x.startsWith(':') || x.includes('*:'));
|
|
||||||
if (someViolation) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
});
|
|
||||||
|
|
||||||
return selectors;
|
|
||||||
};
|
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
import { container, singleton } from 'tsyringe';
|
import { container, singleton } from 'tsyringe';
|
||||||
import { AsyncService, marshalErrorLike } from 'civkit';
|
|
||||||
import { GlobalLogger } from './logger';
|
import { GlobalLogger } from './logger';
|
||||||
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
|
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
|
||||||
import { Readability } from '@mozilla/readability';
|
import { Readability } from '@mozilla/readability';
|
||||||
@ -8,6 +7,8 @@ import { Threaded } from '../services/threaded';
|
|||||||
import type { ExtraScrappingOptions } from '../api/crawler';
|
import type { ExtraScrappingOptions } from '../api/crawler';
|
||||||
import { tailwindClasses } from '../utils/tailwind-classes';
|
import { tailwindClasses } from '../utils/tailwind-classes';
|
||||||
import { countGPTToken } from '../shared/utils/openai';
|
import { countGPTToken } from '../shared/utils/openai';
|
||||||
|
import { AsyncService } from 'civkit/async-service';
|
||||||
|
import { ApplicationError, AssertionFailureError } from 'civkit/civ-rpc';
|
||||||
|
|
||||||
const pLinkedom = import('linkedom');
|
const pLinkedom = import('linkedom');
|
||||||
|
|
||||||
@ -38,8 +39,17 @@ export class JSDomControl extends AsyncService {
|
|||||||
return snapshot;
|
return snapshot;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
// SideLoad contains native objects that cannot go through thread boundaries.
|
// SideLoad contains native objects that cannot go through thread boundaries.
|
||||||
return this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined });
|
return await this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined });
|
||||||
|
} catch (err: any) {
|
||||||
|
this.logger.warn(`Error narrowing snapshot`, { err });
|
||||||
|
if (err instanceof ApplicationError) {
|
||||||
|
throw err;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new AssertionFailureError(`Failed to process the page: ${err?.message}`);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Threaded()
|
@Threaded()
|
||||||
@ -151,7 +161,7 @@ export class JSDomControl extends AsyncService {
|
|||||||
try {
|
try {
|
||||||
parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
|
parsed = new Readability(rootDoc.cloneNode(true) as any).parse();
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Failed to parse selected element`, { err });
|
||||||
}
|
}
|
||||||
|
|
||||||
const imgSet = new Set<string>();
|
const imgSet = new Set<string>();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user