mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-14 05:56:03 +08:00
fix: robots-txt not loaded error conditions
This commit is contained in:
parent
4e5abd345e
commit
0da71cad34
@ -134,6 +134,16 @@ class Viewport extends AutoCastable {
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Robots-Txt': {
|
||||
description: `Load and conform to the respective robot.txt on the target origin.\n\nOptionally specify a bot UA to check against.\n\n`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'DNT': {
|
||||
description: `When set to 1, prevent the result of this request to be cached in the system.\n\n`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Set-Cookie': {
|
||||
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
||||
`Syntax is the same with standard Set-Cookie`,
|
||||
|
@ -1,6 +1,6 @@
|
||||
import { singleton } from 'tsyringe';
|
||||
import { URL } from 'url';
|
||||
import { DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
|
||||
import { AssertionFailureError, DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
|
||||
import { AsyncService } from 'civkit/async-service';
|
||||
import { HashManager } from 'civkit/hash';
|
||||
import { marshalErrorLike } from 'civkit/lang';
|
||||
@ -40,7 +40,7 @@ export class RobotsTxtService extends AsyncService {
|
||||
|
||||
const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
|
||||
if (!r.ok) {
|
||||
throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}`);
|
||||
throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`);
|
||||
}
|
||||
buff = Buffer.from(await r.arrayBuffer());
|
||||
|
||||
@ -60,9 +60,10 @@ export class RobotsTxtService extends AsyncService {
|
||||
robotTxt = await this.getCachedRobotTxt(url.origin);
|
||||
} catch (err) {
|
||||
if (err instanceof DownstreamServiceFailureError) {
|
||||
// Remote server is reachable but cannot provide a robots.txt; this is treated as public access.
|
||||
return true;
|
||||
}
|
||||
throw err;
|
||||
throw new AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`);
|
||||
}
|
||||
const myUa = inputMyUa.toLowerCase();
|
||||
const lines = robotTxt.split(/\r?\n/g);
|
||||
|
Loading…
x
Reference in New Issue
Block a user