mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 07:39:00 +08:00
Merge branch 'v1-webscraper' of https://github.com/mendableai/firecrawl into v1-webscraper
This commit is contained in:
commit
fb553a020d
64
apps/api/src/controllers/v1/__tests__/urlValidation.test.ts
Normal file
64
apps/api/src/controllers/v1/__tests__/urlValidation.test.ts
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
import { url } from "../types";
|
||||||
|
|
||||||
|
/**
 * Tests for the exported `url` zod schema from "../types".
 *
 * Covers: defaulting a missing protocol to http://, accepting http/https URLs
 * (with subdomains and paths), and rejecting unsupported protocols, hosts
 * without a top-level domain, blocklisted hosts, and malformed URLs.
 *
 * Every test title is unique so a failure in the test report points at
 * exactly one case.
 */
describe("URL Schema Validation", () => {
  beforeEach(() => {
    jest.resetAllMocks();
  });

  it("should prepend http:// to URLs without a protocol", () => {
    const result = url.parse("example.com");
    expect(result).toBe("http://example.com");
  });

  it("should allow valid URLs with http or https", () => {
    expect(() => url.parse("http://example.com")).not.toThrow();
    expect(() => url.parse("https://example.com")).not.toThrow();
  });

  it("should allow URLs without a protocol", () => {
    expect(() => url.parse("example.com")).not.toThrow();
  });

  it("should reject URLs with unsupported protocols", () => {
    expect(() => url.parse("ftp://example.com")).toThrow("Invalid URL");
  });

  it("should reject URLs without a valid top-level domain", () => {
    expect(() => url.parse("http://example")).toThrow("URL must have a valid top-level domain or be a valid path");
  });

  it("should reject blocked URLs", () => {
    expect(() => url.parse("https://facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
  });

  it("should handle URLs with subdomains correctly", () => {
    expect(() => url.parse("http://sub.example.com")).not.toThrow();
    expect(() => url.parse("https://blog.example.com")).not.toThrow();
  });

  it("should handle URLs with paths correctly", () => {
    expect(() => url.parse("http://example.com/path")).not.toThrow();
    expect(() => url.parse("https://example.com/another/path")).not.toThrow();
  });

  it("should handle URLs with subdomains that are blocked", () => {
    expect(() => url.parse("https://sub.facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
  });

  it("should handle URLs with paths that are blocked", () => {
    expect(() => url.parse("http://facebook.com/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
    expect(() => url.parse("https://facebook.com/another/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
  });

  it("should reject malformed URLs starting with 'http://http'", () => {
    expect(() => url.parse("http://http://example.com")).toThrow("Invalid URL. Invalid protocol.");
  });

  // A second "http://" inside the path is legitimate (e.g. proxy or archive
  // links), so this case must be accepted rather than rejected.
  it("should allow URLs whose path contains another 'http://'", () => {
    expect(() => url.parse("http://example.com/http://example.com")).not.toThrow();
  });

  it("should reject URLs containing whitespace in the host", () => {
    expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL");
  });
});
|
@ -2,6 +2,7 @@ import { Request, Response } from "express";
|
|||||||
import { z } from "zod";
|
import { z } from "zod";
|
||||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||||
import { PageOptions } from "../../lib/entities";
|
import { PageOptions } from "../../lib/entities";
|
||||||
|
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
||||||
|
|
||||||
export type Format =
|
export type Format =
|
||||||
| "markdown"
|
| "markdown"
|
||||||
@ -11,17 +12,12 @@ export type Format =
|
|||||||
| "screenshot"
|
| "screenshot"
|
||||||
| "screenshot@fullPage";
|
| "screenshot@fullPage";
|
||||||
|
|
||||||
const url = z.preprocess(
|
export const url = z.preprocess(
|
||||||
(x) => {
|
(x) => {
|
||||||
if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) {
|
if (!protocolIncluded(x as string)) {
|
||||||
if (x.startsWith("://")) {
|
return `http://${x}`;
|
||||||
return "http" + x;
|
|
||||||
} else {
|
|
||||||
return "http://" + x;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
return x;
|
|
||||||
}
|
}
|
||||||
|
return x;
|
||||||
},
|
},
|
||||||
z
|
z
|
||||||
.string()
|
.string()
|
||||||
@ -32,7 +28,11 @@ const url = z.preprocess(
|
|||||||
"URL must have a valid top-level domain or be a valid path"
|
"URL must have a valid top-level domain or be a valid path"
|
||||||
)
|
)
|
||||||
.refine(
|
.refine(
|
||||||
(x) => !isUrlBlocked(x),
|
(x) => checkUrl(x as string),
|
||||||
|
"Invalid URL"
|
||||||
|
)
|
||||||
|
.refine(
|
||||||
|
(x) => !isUrlBlocked(x as string),
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
const protocolIncluded = (url: string) => {
|
export const protocolIncluded = (url: string) => {
|
||||||
// if :// not in the start of the url assume http (maybe https?)
|
// if :// not in the start of the url assume http (maybe https?)
|
||||||
// regex checks if :// appears before any .
|
// regex checks if :// appears before any .
|
||||||
return /^([^.:]+:\/\/)/.test(url);
|
return /^([^.:]+:\/\/)/.test(url);
|
||||||
@ -35,6 +35,25 @@ export const checkAndUpdateURL = (url: string) => {
|
|||||||
return { urlObj: typedUrlObj, url: url };
|
return { urlObj: typedUrlObj, url: url };
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Validates that `url` is a parseable http(s) URL and returns it unchanged.
 *
 * Failures are reported by throwing, not by returning `false`, so callers
 * (such as a zod `.refine` predicate) see the thrown `Error` directly.
 *
 * @param url - Raw URL string; expected to already include a protocol.
 * @returns The same `url` string when every check passes.
 * @throws Error("Invalid URL") when `getURLobj` reports an error or the
 *   parsed protocol is neither `http:` nor `https:`.
 * @throws Error("Invalid URL. Invalid protocol.") when a second scheme
 *   separator directly follows the first (e.g. `http://http://example.com`).
 */
export const checkUrl = (url: string) => {
  // getURLobj is defined elsewhere in this file; NOTE(review): presumably it
  // wraps URL parsing and reports failure via `error` - confirm.
  const { error, urlObj } = getURLobj(url);
  if (error) {
    throw new Error("Invalid URL");
  }

  const typedUrlObj = urlObj as URL;

  if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
    throw new Error("Invalid URL");
  }

  // Detect a doubled scheme by looking at what immediately follows the first
  // "scheme:" and its slashes. Counting ":" before the first "." (the previous
  // heuristic) also rejected valid URLs carrying credentials, a port on a
  // dot-less host, or an IPv6 literal, with a misleading protocol error.
  const afterScheme = url.slice(url.indexOf(":") + 1).replace(/^[\\/]+/, "");
  if (/^[a-z][a-z0-9+-]*:[\\/]{2}/i.test(afterScheme)) {
    throw new Error("Invalid URL. Invalid protocol."); // for this one: http://http://example.com
  }

  return url;
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Same domain check
|
* Same domain check
|
||||||
* It checks if the domain of the url is the same as the base url
|
* It checks if the domain of the url is the same as the base url
|
||||||
|
Loading…
x
Reference in New Issue
Block a user