fix: robots-txt not loaded error conditions

This commit is contained in:
yanlong.wang 2025-03-10 17:19:25 +08:00
parent 4e5abd345e
commit 0da71cad34
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 14 additions and 3 deletions

View File

@ -134,6 +134,16 @@ class Viewport extends AutoCastable {
in: 'header',
schema: { type: 'string' }
},
'X-Robots-Txt': {
description: `Load and conform to the respective robot.txt on the target origin.\n\nOptionally specify a bot UA to check against.\n\n`,
in: 'header',
schema: { type: 'string' }
},
'DNT': {
description: `When set to 1, prevent the result of this request to be cached in the system.\n\n`,
in: 'header',
schema: { type: 'string' }
},
'X-Set-Cookie': {
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
`Syntax is the same with standard Set-Cookie`,

View File

@ -1,6 +1,6 @@
import { singleton } from 'tsyringe';
import { URL } from 'url';
import { DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
import { AssertionFailureError, DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
import { AsyncService } from 'civkit/async-service';
import { HashManager } from 'civkit/hash';
import { marshalErrorLike } from 'civkit/lang';
@ -40,7 +40,7 @@ export class RobotsTxtService extends AsyncService {
const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
if (!r.ok) {
throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}`);
throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`);
}
buff = Buffer.from(await r.arrayBuffer());
@ -60,9 +60,10 @@ export class RobotsTxtService extends AsyncService {
robotTxt = await this.getCachedRobotTxt(url.origin);
} catch (err) {
if (err instanceof DownstreamServiceFailureError) {
// Remote server is reachable but cannot provide a robots.txt; this is treated as public access
return true;
}
throw err;
throw new AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`);
}
const myUa = inputMyUa.toLowerCase();
const lines = robotTxt.split(/\r?\n/g);