Merge pull request #808 from mendableai/feat/skipTlsVerification

feat: skipTlsVerification
This commit is contained in:
Nicolas 2024-10-22 20:47:13 -03:00 committed by GitHub
commit e0d3b761fc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 18 additions and 6 deletions

View File

@ -78,7 +78,7 @@ export async function crawlController(
const crawler = crawlToCrawler(id, sc);
try {
sc.robots = await crawler.getRobotsTxt();
sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
} catch (e) {
Logger.debug(
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(

View File

@ -117,6 +117,7 @@ export const scrapeOptions = z.object({
}
).transform(val => val ? val.toUpperCase() : 'US')
}).optional(),
skipTlsVerification: z.boolean().default(false),
}).strict(strictMessage)
@ -433,6 +434,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
parsePDF: x.parsePDF,
actions: x.actions as Action[], // no strict null checking grrrr - mogery
geolocation: x.geolocation,
skipTlsVerification: x.skipTlsVerification
};
}

View File

@ -54,6 +54,7 @@ export type PageOptions = {
geolocation?: {
country?: string;
};
skipTlsVerification?: boolean;
};
export type ExtractorOptions = {

View File

@ -9,7 +9,7 @@ import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout";
import { Logger } from "../../../src/lib/logger";
import https from "https";
export class WebCrawler {
private jobId: string;
private initialUrl: string;
@ -145,8 +145,14 @@ export class WebCrawler {
.slice(0, limit);
}
public async getRobotsTxt(): Promise<string> {
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
/**
 * Fetches the robots.txt document located at `this.robotsTxtUrl`.
 *
 * @param skipTlsVerification - When true, the request is sent through an
 *   `https.Agent` created with `rejectUnauthorized: false`. Defaults to false,
 *   in which case no custom agent is supplied.
 * @returns The `data` of the axios response.
 * @throws Propagates any rejection from `axios.get` (the request is bounded by
 *   `axiosTimeout`).
 */
public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
  // Build the optional TLS override as one immutable, fully-typed object
  // instead of string-indexing into an untyped `{}`; it stays empty unless the
  // caller explicitly opted out of certificate verification.
  const tlsOverrides = skipTlsVerification
    ? { httpsAgent: new https.Agent({ rejectUnauthorized: false }) }
    : {};

  const response = await axios.get(this.robotsTxtUrl, {
    timeout: axiosTimeout,
    ...tlsOverrides,
  });
  return response.data;
}

View File

@ -594,6 +594,7 @@ export class WebScraperDataProvider {
atsv: options.pageOptions?.atsv ?? false,
actions: options.pageOptions?.actions ?? undefined,
geolocation: options.pageOptions?.geolocation ?? undefined,
skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
};
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
this.replaceAllPathsWithAbsolutePaths =

View File

@ -28,7 +28,7 @@ export async function scrapWithFireEngine({
waitFor = 0,
screenshot = false,
fullPageScreenshot = false,
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" } },
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false },
fireEngineOptions = {},
headers,
options,
@ -40,7 +40,7 @@ export async function scrapWithFireEngine({
waitFor?: number;
screenshot?: boolean;
fullPageScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string } };
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean };
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>;
options?: any;
@ -119,6 +119,7 @@ export async function scrapWithFireEngine({
atsv: pageOptions?.atsv ?? false,
scrollXPaths: pageOptions?.scrollXPaths ?? [],
geolocation: pageOptions?.geolocation,
skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
actions: actions,
},
{

View File

@ -157,6 +157,7 @@ export async function scrapSingleUrl(
atsv: pageOptions.atsv ?? false,
actions: pageOptions.actions ?? undefined,
geolocation: pageOptions.geolocation ?? undefined,
skipTlsVerification: pageOptions.skipTlsVerification ?? false,
}
if (extractorOptions) {