Merge branch 'v1-webscraper' of https://github.com/mendableai/firecrawl into v1-webscraper

This commit is contained in:
Nicolas 2024-08-26 19:57:28 -03:00
commit fb553a020d
3 changed files with 94 additions and 11 deletions

View File

@ -0,0 +1,64 @@
import { url } from "../types";
/**
 * Tests for the `url` schema imported from "../types".
 *
 * Each case calls `url.parse(...)` and checks either the returned value or
 * the message of the error that is thrown. Every test title is unique and
 * states what the assertion actually verifies, so a failure in the Jest
 * report points at exactly one case.
 */
describe("URL Schema Validation", () => {
  // Error message the schema is expected to raise for blocklisted hosts.
  const BLOCKED_URL_MESSAGE =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";

  beforeEach(() => {
    jest.resetAllMocks();
  });

  it("should prepend http:// to URLs without a protocol", () => {
    const result = url.parse("example.com");
    expect(result).toBe("http://example.com");
  });

  it("should allow valid URLs with http or https", () => {
    expect(() => url.parse("http://example.com")).not.toThrow();
    expect(() => url.parse("https://example.com")).not.toThrow();
  });

  it("should allow valid URLs without a protocol", () => {
    expect(() => url.parse("example.com")).not.toThrow();
  });

  it("should reject URLs with unsupported protocols", () => {
    expect(() => url.parse("ftp://example.com")).toThrow("Invalid URL");
  });

  it("should reject URLs without a valid top-level domain", () => {
    expect(() => url.parse("http://example")).toThrow(
      "URL must have a valid top-level domain or be a valid path"
    );
  });

  it("should reject blocked URLs", () => {
    expect(() => url.parse("https://facebook.com")).toThrow(BLOCKED_URL_MESSAGE);
  });

  it("should handle URLs with subdomains correctly", () => {
    expect(() => url.parse("http://sub.example.com")).not.toThrow();
    expect(() => url.parse("https://blog.example.com")).not.toThrow();
  });

  it("should handle URLs with paths correctly", () => {
    expect(() => url.parse("http://example.com/path")).not.toThrow();
    expect(() => url.parse("https://example.com/another/path")).not.toThrow();
  });

  it("should reject blocked URLs that include a subdomain", () => {
    expect(() => url.parse("https://sub.facebook.com")).toThrow(BLOCKED_URL_MESSAGE);
  });

  it("should reject blocked URLs that include a path", () => {
    expect(() => url.parse("http://facebook.com/path")).toThrow(BLOCKED_URL_MESSAGE);
    expect(() => url.parse("https://facebook.com/another/path")).toThrow(BLOCKED_URL_MESSAGE);
  });

  it("should reject malformed URLs starting with 'http://http'", () => {
    expect(() => url.parse("http://http://example.com")).toThrow(
      "Invalid URL. Invalid protocol."
    );
  });

  // A second "http://" is only a problem in the protocol position; inside the
  // path it is expected to be accepted.
  it("should allow URLs that contain 'http://' inside the path", () => {
    expect(() => url.parse("http://example.com/http://example.com")).not.toThrow();
  });

  it("should reject URLs with whitespace in the host", () => {
    expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL");
  });
});

View File

@ -2,6 +2,7 @@ import { Request, Response } from "express";
import { z } from "zod"; import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { PageOptions } from "../../lib/entities"; import { PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
export type Format = export type Format =
| "markdown" | "markdown"
@ -11,17 +12,12 @@ export type Format =
| "screenshot" | "screenshot"
| "screenshot@fullPage"; | "screenshot@fullPage";
const url = z.preprocess( export const url = z.preprocess(
(x) => { (x) => {
if (typeof x === "string" && !/^([^.:]+:\/\/)/.test(x)) { if (!protocolIncluded(x as string)) {
if (x.startsWith("://")) { return `http://${x}`;
return "http" + x;
} else {
return "http://" + x;
}
} else {
return x;
} }
return x;
}, },
z z
.string() .string()
@ -32,7 +28,11 @@ const url = z.preprocess(
"URL must have a valid top-level domain or be a valid path" "URL must have a valid top-level domain or be a valid path"
) )
.refine( .refine(
(x) => !isUrlBlocked(x), (x) => checkUrl(x as string),
"Invalid URL"
)
.refine(
(x) => !isUrlBlocked(x as string),
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
) )
); );

View File

@ -1,4 +1,4 @@
const protocolIncluded = (url: string) => { export const protocolIncluded = (url: string) => {
// if :// not in the start of the url assume http (maybe https?) // if :// not in the start of the url assume http (maybe https?)
// regex checks if :// appears before any . // regex checks if :// appears before any .
return /^([^.:]+:\/\/)/.test(url); return /^([^.:]+:\/\/)/.test(url);
@ -35,6 +35,25 @@ export const checkAndUpdateURL = (url: string) => {
return { urlObj: typedUrlObj, url: url }; return { urlObj: typedUrlObj, url: url };
}; };
/**
 * Validates a URL string and returns it unchanged when it passes.
 *
 * Checks, in order:
 *  1. `getURLobj(url)` must not report an error.
 *  2. The parsed protocol must be `http:` or `https:`.
 *  3. The text before the first "." must contain exactly one ":".
 *
 * NOTE(review): check 3 also rejects any input with a second ":" before the
 * first ".", e.g. a port on a dot-less host — confirm that is intended.
 *
 * @param url - The URL string to validate.
 * @returns The same `url` string that was passed in.
 * @throws Error "Invalid URL" when check 1 or 2 fails.
 * @throws Error "Invalid URL. Invalid protocol." when check 3 fails.
 */
export const checkUrl = (url: string) => {
  const parsed = getURLobj(url);
  if (parsed.error) {
    throw new Error("Invalid URL");
  }

  const { protocol } = parsed.urlObj as URL;
  const isHttpLike = protocol === "http:" || protocol === "https:";
  if (!isHttpLike) {
    throw new Error("Invalid URL");
  }

  // Catches a doubled scheme such as "http://http://example.com": the segment
  // before the first "." then holds two ":" instead of one.
  const beforeFirstDot = url.split(".")[0];
  const colonMatches = beforeFirstDot.match(/:/g) || [];
  if (colonMatches.length !== 1) {
    throw new Error("Invalid URL. Invalid protocol.");
  }

  return url;
};
/** /**
* Same domain check * Same domain check
* It checks if the domain of the url is the same as the base url * It checks if the domain of the url is the same as the base url