From 0da71cad342ddc709241d1362c5f1b24dda3fb8b Mon Sep 17 00:00:00 2001 From: "yanlong.wang" Date: Mon, 10 Mar 2025 17:19:25 +0800 Subject: [PATCH] fix: robots-txt not loaded error conditions --- src/dto/crawler-options.ts | 10 ++++++++++ src/services/robots-text.ts | 7 ++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/dto/crawler-options.ts b/src/dto/crawler-options.ts index 38ddc2a..a562c0c 100644 --- a/src/dto/crawler-options.ts +++ b/src/dto/crawler-options.ts @@ -134,6 +134,16 @@ class Viewport extends AutoCastable { in: 'header', schema: { type: 'string' } }, + 'X-Robots-Txt': { + description: `Load and conform to the respective robot.txt on the target origin.\n\nOptionally specify a bot UA to check against.\n\n`, + in: 'header', + schema: { type: 'string' } + }, + 'DNT': { + description: `When set to 1, prevent the result of this request to be cached in the system.\n\n`, + in: 'header', + schema: { type: 'string' } + }, 'X-Set-Cookie': { description: `Sets cookie(s) to the headless browser for your request. 
\n\n` + `Syntax is the same with standard Set-Cookie`, diff --git a/src/services/robots-text.ts b/src/services/robots-text.ts index 80830b4..365af3c 100644 --- a/src/services/robots-text.ts +++ b/src/services/robots-text.ts @@ -1,6 +1,6 @@ import { singleton } from 'tsyringe'; import { URL } from 'url'; -import { DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc'; +import { AssertionFailureError, DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc'; import { AsyncService } from 'civkit/async-service'; import { HashManager } from 'civkit/hash'; import { marshalErrorLike } from 'civkit/lang'; @@ -40,7 +40,7 @@ export class RobotsTxtService extends AsyncService { const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) }); if (!r.ok) { - throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}`); + throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`); } buff = Buffer.from(await r.arrayBuffer()); @@ -60,9 +60,10 @@ export class RobotsTxtService extends AsyncService { robotTxt = await this.getCachedRobotTxt(url.origin); } catch (err) { if (err instanceof DownstreamServiceFailureError) { + // Remote server is reachable but cannot provide a robots.txt; this is treated as public access return true; } - throw err; + throw new AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`); } const myUa = inputMyUa.toLowerCase(); const lines = robotTxt.split(/\r?\n/g);