mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 08:15:52 +08:00
fix: robots-txt not loaded error conditions
This commit is contained in:
parent
4e5abd345e
commit
0da71cad34
@ -134,6 +134,16 @@ class Viewport extends AutoCastable {
|
|||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
|
'X-Robots-Txt': {
|
||||||
|
description: `Load and conform to the respective robot.txt on the target origin.\n\nOptionally specify a bot UA to check against.\n\n`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'DNT': {
|
||||||
|
description: `When set to 1, prevent the result of this request to be cached in the system.\n\n`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
'X-Set-Cookie': {
|
'X-Set-Cookie': {
|
||||||
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
||||||
`Syntax is the same with standard Set-Cookie`,
|
`Syntax is the same with standard Set-Cookie`,
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
import { singleton } from 'tsyringe';
|
import { singleton } from 'tsyringe';
|
||||||
import { URL } from 'url';
|
import { URL } from 'url';
|
||||||
import { DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
|
import { AssertionFailureError, DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
|
||||||
import { AsyncService } from 'civkit/async-service';
|
import { AsyncService } from 'civkit/async-service';
|
||||||
import { HashManager } from 'civkit/hash';
|
import { HashManager } from 'civkit/hash';
|
||||||
import { marshalErrorLike } from 'civkit/lang';
|
import { marshalErrorLike } from 'civkit/lang';
|
||||||
@ -40,7 +40,7 @@ export class RobotsTxtService extends AsyncService {
|
|||||||
|
|
||||||
const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
|
const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
|
||||||
if (!r.ok) {
|
if (!r.ok) {
|
||||||
throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}`);
|
throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`);
|
||||||
}
|
}
|
||||||
buff = Buffer.from(await r.arrayBuffer());
|
buff = Buffer.from(await r.arrayBuffer());
|
||||||
|
|
||||||
@ -60,9 +60,10 @@ export class RobotsTxtService extends AsyncService {
|
|||||||
robotTxt = await this.getCachedRobotTxt(url.origin);
|
robotTxt = await this.getCachedRobotTxt(url.origin);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (err instanceof DownstreamServiceFailureError) {
|
if (err instanceof DownstreamServiceFailureError) {
|
||||||
|
// Remote server is reachable but cannot provide a robots.txt; this is treated as public access
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
throw err;
|
throw new AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`);
|
||||||
}
|
}
|
||||||
const myUa = inputMyUa.toLowerCase();
|
const myUa = inputMyUa.toLowerCase();
|
||||||
const lines = robotTxt.split(/\r?\n/g);
|
const lines = robotTxt.split(/\r?\n/g);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user