mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-13 04:29:00 +08:00
fix(scrapeURL/fetch): block loopback and link-local IPs
This commit is contained in:
parent
e255301005
commit
4d1f92f4c8
@ -86,6 +86,7 @@
|
||||
"glob": "^10.4.2",
|
||||
"gpt3-tokenizer": "^1.1.5",
|
||||
"ioredis": "^5.4.1",
|
||||
"ip-address": "^10.0.1",
|
||||
"joplin-turndown-plugin-gfm": "^1.0.12",
|
||||
"json-schema-to-zod": "^2.3.0",
|
||||
"keyword-extractor": "^0.0.28",
|
||||
|
9
apps/api/pnpm-lock.yaml
generated
9
apps/api/pnpm-lock.yaml
generated
@ -113,6 +113,9 @@ importers:
|
||||
ioredis:
|
||||
specifier: ^5.4.1
|
||||
version: 5.4.1
|
||||
ip-address:
|
||||
specifier: ^10.0.1
|
||||
version: 10.0.1
|
||||
joplin-turndown-plugin-gfm:
|
||||
specifier: ^1.0.12
|
||||
version: 1.0.12
|
||||
@ -2690,6 +2693,10 @@ packages:
|
||||
resolution: {integrity: sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA==}
|
||||
engines: {node: '>=12.22.0'}
|
||||
|
||||
ip-address@10.0.1:
|
||||
resolution: {integrity: sha512-NWv9YLW4PoW2B7xtzaS3NCot75m6nK7Icdv0o3lfMceJVRfSoQwqD4wEH5rLwoKJwUiZ/rfpiVBhnaF0FK4HoA==}
|
||||
engines: {node: '>= 12'}
|
||||
|
||||
ip-address@9.0.5:
|
||||
resolution: {integrity: sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g==}
|
||||
engines: {node: '>= 12'}
|
||||
@ -7845,6 +7852,8 @@ snapshots:
|
||||
transitivePeerDependencies:
|
||||
- supports-color
|
||||
|
||||
ip-address@10.0.1: {}
|
||||
|
||||
ip-address@9.0.5:
|
||||
dependencies:
|
||||
jsbn: 1.1.0
|
||||
|
@ -1,7 +1,9 @@
|
||||
import * as undici from "undici";
|
||||
import { EngineScrapeResult } from "..";
|
||||
import { Meta } from "../..";
|
||||
import { TimeoutError } from "../../error";
|
||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||
import { InsecureConnectionError, makeSecureDispatcher } from "../utils/safeFetch";
|
||||
|
||||
export async function scrapeURLWithFetch(
|
||||
meta: Meta,
|
||||
@ -9,8 +11,11 @@ export async function scrapeURLWithFetch(
|
||||
): Promise<EngineScrapeResult> {
|
||||
const timeout = timeToRun ?? 300000;
|
||||
|
||||
const response = await Promise.race([
|
||||
fetch(meta.url, {
|
||||
let response: undici.Response;
|
||||
try {
|
||||
response = await Promise.race([
|
||||
undici.fetch(meta.url, {
|
||||
dispatcher: await makeSecureDispatcher(meta.url),
|
||||
redirect: "follow",
|
||||
headers: meta.options.headers,
|
||||
}),
|
||||
@ -22,6 +27,13 @@ export async function scrapeURLWithFetch(
|
||||
);
|
||||
})(),
|
||||
]);
|
||||
} catch (error) {
|
||||
if (error instanceof TypeError && error.cause instanceof InsecureConnectionError) {
|
||||
throw error.cause;
|
||||
} else {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
specialtyScrapeCheck(
|
||||
meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
|
||||
|
@ -5,6 +5,7 @@ import { EngineError } from "../../error";
|
||||
import { Writable } from "stream";
|
||||
import { v4 as uuid } from "uuid";
|
||||
import * as undici from "undici";
|
||||
import { makeSecureDispatcher } from "./safeFetch";
|
||||
|
||||
export async function fetchFileToBuffer(url: string): Promise<{
|
||||
response: Response;
|
||||
@ -28,16 +29,9 @@ export async function downloadFile(
|
||||
const tempFileWrite = createWriteStream(tempFilePath);
|
||||
|
||||
// TODO: maybe we could use tlsclient for this? for proxying
|
||||
// use undici to ignore SSL for now
|
||||
const response = await undici.fetch(url, {
|
||||
dispatcher: new undici.Agent({
|
||||
connect: {
|
||||
rejectUnauthorized: false,
|
||||
},
|
||||
}),
|
||||
});
|
||||
const response = await undici.fetch(url, { dispatcher: await makeSecureDispatcher(url) });
|
||||
|
||||
// This should never happen in the current state of JS (2024), but let's check anyways.
|
||||
// This should never happen in the current state of JS/Undici (2024), but let's check anyways.
|
||||
if (response.body === null) {
|
||||
throw new EngineError("Response body was null", { cause: { response } });
|
||||
}
|
||||
|
60
apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts
Normal file
60
apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts
Normal file
@ -0,0 +1,60 @@
|
||||
import type { Socket } from "net";
|
||||
import type { TLSSocket } from "tls";
|
||||
import * as undici from "undici";
|
||||
import { Address6 } from "ip-address";
|
||||
|
||||
/**
 * Error used to tear down a connection whose remote peer address is not
 * allowed by the secure dispatcher's address rules.
 */
export class InsecureConnectionError extends Error {
  constructor() {
    super("Connection violated security rules.");
    // Without an explicit name, logs and serialized errors show the generic "Error".
    this.name = "InsecureConnectionError";
  }
}
|
||||
|
||||
/**
 * Reports whether a dotted-quad IPv4 address lies in a special-purpose range
 * (loopback, private, link-local, documentation, multicast, reserved, ...)
 * that outgoing requests must not be allowed to reach.
 *
 * @param address IPv4 address in dotted-decimal notation, e.g. "192.168.1.10".
 * @returns `true` when the address is in a blocked range or cannot be parsed
 *          as four decimal octets; `false` for ordinary public addresses.
 */
function isIPv4Private(address: string): boolean {
  const parts = address.split(".");

  // Fail closed: a security filter must not wave through input it cannot parse.
  if (parts.length !== 4 || !parts.every(part => /^\d{1,3}$/.test(part))) {
    return true;
  }

  const octets = parts.map(part => parseInt(part, 10));
  if (octets.some(octet => octet > 255)) {
    return true;
  }

  // Length was verified above, so the tuple assertion is sound.
  const [a, b, c] = octets as [number, number, number, number];

  return (
    a === 0 // 0.0.0.0/8 — current (local, "this") network
    || a === 10 // 10.0.0.0/8 — private network
    || (a === 100 && b >= 64 && b < 128) // 100.64.0.0/10 — shared address space for carrier-grade NAT
    || a === 127 // 127.0.0.0/8 — loopback
    || (a === 169 && b === 254) // 169.254.0.0/16 — link-local
    || (a === 172 && b >= 16 && b < 32) // 172.16.0.0/12 — private network
    || (a === 192 && b === 0 && c === 0) // 192.0.0.0/24 — IETF Protocol Assignments (includes DS-Lite)
    || (a === 192 && b === 0 && c === 2) // 192.0.2.0/24 — TEST-NET-1, documentation and examples
    || (a === 192 && b === 88 && c === 99) // 192.88.99.0/24 — reserved, formerly 6to4 relay anycast
    || (a === 192 && b === 168) // 192.168.0.0/16 — private network
    || (a === 198 && b >= 18 && b < 20) // 198.18.0.0/15 — benchmark testing between subnets
    || (a === 198 && b === 51 && c === 100) // 198.51.100.0/24 — TEST-NET-2, documentation and examples
    || (a === 203 && b === 0 && c === 113) // 203.0.113.0/24 — TEST-NET-3, documentation and examples
    // 224.0.0.0/4 multicast (this also covers MCAST-TEST-NET 233.252.0.0/24) and
    // 240.0.0.0/4 reserved (this also covers the 255.255.255.255 limited broadcast).
    || a >= 224
  );
}
|
||||
|
||||
// fc00::/7 — unique local addresses (RFC 4193), the IPv6 counterpart of the
// RFC 1918 private IPv4 ranges.
const UNIQUE_LOCAL_IPV6 = new Address6("fc00::/7");

/**
 * Reports whether an IPv6 address must be treated as non-public.
 *
 * @param ipv6 IPv6 address in textual form.
 * @returns `true` when the address's scope is anything other than "Global",
 *          when it is a unique local address, or when it cannot be parsed.
 */
function isIPv6Private(ipv6: string): boolean {
  let parsed: Address6;
  try {
    parsed = new Address6(ipv6);
  } catch {
    // Fail closed: an address we cannot parse is never considered safe.
    return true;
  }

  // NOTE(review): getScope() does not appear to classify fc00::/7 as
  // non-global, hence the explicit subnet check — confirm against the
  // ip-address version in use.
  return parsed.getScope() !== "Global" || parsed.isInSubnet(UNIQUE_LOCAL_IPV6);
}
|
||||
|
||||
/**
 * Builds an undici Agent that destroys any connection whose remote peer
 * address is rejected by `isIPv4Private` / `isIPv6Private`.
 *
 * The check runs on the agent's "connect" event, i.e. after the socket is
 * established, so it applies to the address actually connected to rather
 * than to the hostname in the URL.
 *
 * @param url Target URL. Currently unused by this function; kept so callers
 *            do not need to change.
 * @param options Extra undici Agent options. They are spread last and thus
 *                override the defaults set here, including `connect`.
 * @returns The configured agent, for use as a `dispatcher`.
 */
export function makeSecureDispatcher(url: string, options?: undici.Agent.Options): undici.Agent {
  const agent = new undici.Agent({
    connect: {
      rejectUnauthorized: false, // bypass SSL failures -- this is fine
      // lookup: secureLookup,
    },
    maxRedirections: 5000,
    ...options,
  });

  // Matches IPv4-mapped IPv6 addresses in dotted form, e.g. "::ffff:127.0.0.1".
  // NOTE(review): the hex form ("::ffff:7f00:1") is not matched — confirm that
  // socket.remoteAddress never reports mapped addresses that way.
  const v4MappedPattern = /^::ffff:\d{1,3}(?:\.\d{1,3}){3}$/i;

  agent.on("connect", (_, targets) => {
    // The last target is treated as the client that owns the new socket.
    const client = targets[targets.length - 1] as undici.Client;

    // undici keeps the socket under a private symbol; look it up by description.
    const socketSymbol = Object.getOwnPropertySymbols(client).find(x => x.description === "socket");
    const socket: Socket | TLSSocket | undefined =
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- private undici field, not in its typings
      socketSymbol === undefined ? undefined : (client as any)[socketSymbol];

    if (!socket) {
      // Fail closed: if the socket cannot be inspected, the peer cannot be
      // vetted, so refuse the connection instead of throwing a TypeError here.
      void client.destroy(new InsecureConnectionError());
      return;
    }

    const remoteAddress = socket.remoteAddress;
    if (!remoteAddress) {
      return;
    }

    let blocked: boolean;
    if (socket.remoteFamily === "IPv4") {
      blocked = isIPv4Private(remoteAddress);
    } else if (v4MappedPattern.test(remoteAddress)) {
      // An IPv4-mapped peer is an IPv4 host reached over an IPv6 socket;
      // judge it by the IPv4 rules so "::ffff:127.0.0.1" cannot slip through.
      blocked = isIPv4Private(remoteAddress.slice(remoteAddress.lastIndexOf(":") + 1));
    } else {
      blocked = isIPv6Private(remoteAddress);
    }

    if (blocked) {
      socket.destroy(new InsecureConnectionError());
    }
  });

  return agent;
}
|
Loading…
x
Reference in New Issue
Block a user