Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-06 00:36:08 +08:00)

Merge pull request #808 from mendableai/feat/skipTlsVerification

feat: skipTlsVerification

Commit e0d3b761fc

@@ -78,7 +78,7 @@ export async function crawlController(
   const crawler = crawlToCrawler(id, sc);
 
   try {
-    sc.robots = await crawler.getRobotsTxt();
+    sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
   } catch (e) {
     Logger.debug(
       `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(

@@ -117,6 +117,7 @@ export const scrapeOptions = z.object({
       }
     ).transform(val => val ? val.toUpperCase() : 'US')
   }).optional(),
+  skipTlsVerification: z.boolean().default(false),
 }).strict(strictMessage)

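For context on the schema change above: with zod, `z.boolean().default(false)` makes the field optional on input and fills in `false` when it is omitted, while `.strict(...)` still rejects unknown keys. A minimal sketch, using a stand-in for the relevant slice of `scrapeOptions` rather than the full schema (the strict message here is a placeholder):

```ts
import { z } from "zod";

// Stand-in schema; only the new field is shown.
const options = z
  .object({
    skipTlsVerification: z.boolean().default(false),
  })
  .strict("Unrecognized key in body");

console.log(options.parse({}));                            // { skipTlsVerification: false }
console.log(options.parse({ skipTlsVerification: true })); // { skipTlsVerification: true }
```
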
@@ -433,6 +434,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
     parsePDF: x.parsePDF,
     actions: x.actions as Action[], // no strict null checking grrrr - mogery
     geolocation: x.geolocation,
+    skipTlsVerification: x.skipTlsVerification
   };
 }

@@ -54,6 +54,7 @@ export type PageOptions = {
   geolocation?: {
     country?: string;
   };
+  skipTlsVerification?: boolean;
 };
 
 export type ExtractorOptions = {

@@ -9,7 +9,7 @@ import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../../src/lib/timeout";
 import { Logger } from "../../../src/lib/logger";
-
+import https from "https";
 export class WebCrawler {
   private jobId: string;
   private initialUrl: string;

@@ -145,8 +145,14 @@ export class WebCrawler {
       .slice(0, limit);
   }
 
-  public async getRobotsTxt(): Promise<string> {
-    const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
+  public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
+    let extraArgs = {};
+    if(skipTlsVerification) {
+      extraArgs["httpsAgent"] = new https.Agent({
+        rejectUnauthorized: false
+      });
+    }
+    const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs });
     return response.data;
   }

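The core of this change is the per-request `https.Agent` with `rejectUnauthorized: false`: certificate validation is skipped only for the axios call that receives that agent, not process-wide (as `NODE_TLS_REJECT_UNAUTHORIZED=0` would). A self-contained sketch of the same pattern, with the URL and timeout as placeholder values:

```ts
import axios from "axios";
import https from "https";

// Fetch robots.txt, optionally accepting invalid or self-signed certificates.
async function fetchRobotsTxt(url: string, skipTlsVerification = false): Promise<string> {
  const extraArgs: { httpsAgent?: https.Agent } = {};
  if (skipTlsVerification) {
    // Only this request uses the permissive agent; other requests are unaffected.
    extraArgs.httpsAgent = new https.Agent({ rejectUnauthorized: false });
  }
  const response = await axios.get(url, { timeout: 10000, ...extraArgs });
  return response.data;
}

// Usage: fetchRobotsTxt("https://self-signed.example.com/robots.txt", true);
```
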
@@ -594,6 +594,7 @@ export class WebScraperDataProvider {
       atsv: options.pageOptions?.atsv ?? false,
       actions: options.pageOptions?.actions ?? undefined,
       geolocation: options.pageOptions?.geolocation ?? undefined,
+      skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
     };
     this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
     this.replaceAllPathsWithAbsolutePaths =

@@ -28,7 +28,7 @@ export async function scrapWithFireEngine({
   waitFor = 0,
   screenshot = false,
   fullPageScreenshot = false,
-  pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" } },
+  pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false },
   fireEngineOptions = {},
   headers,
   options,

@@ -40,7 +40,7 @@ export async function scrapWithFireEngine({
   waitFor?: number;
   screenshot?: boolean;
   fullPageScreenshot?: boolean;
-  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string } };
+  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean };
   fireEngineOptions?: FireEngineOptions;
   headers?: Record<string, string>;
   options?: any;

@@ -119,6 +119,7 @@ export async function scrapWithFireEngine({
         atsv: pageOptions?.atsv ?? false,
         scrollXPaths: pageOptions?.scrollXPaths ?? [],
         geolocation: pageOptions?.geolocation,
+        skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
         actions: actions,
       },
       {

@@ -157,6 +157,7 @@ export async function scrapSingleUrl(
     atsv: pageOptions.atsv ?? false,
     actions: pageOptions.actions ?? undefined,
     geolocation: pageOptions.geolocation ?? undefined,
+    skipTlsVerification: pageOptions.skipTlsVerification ?? false,
   }
 
   if (extractorOptions) {

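Taken together, the flag travels from the request body (`scrapeOptions`) through `legacyScrapeOptions`/`PageOptions` into the crawler, fire-engine scraper, and single-URL scraper. A hedged example of how a caller might set it; the endpoint, port, and auth header are assumptions about a typical self-hosted setup, not part of this diff:

```ts
// Hypothetical client call; payload shape follows the scrapeOptions schema above.
const res = await fetch("http://localhost:3002/v1/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR-API-KEY",
  },
  body: JSON.stringify({
    url: "https://self-signed.example.com",
    skipTlsVerification: true, // skip certificate validation for this scrape only
  }),
});
console.log(await res.json());
```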