mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-16 16:05:59 +08:00
url splits and better url normalization
This commit is contained in:
parent
1d733d169a
commit
550d6bf7bd
@ -10,7 +10,8 @@ import { scrapePDF } from "./pdf";
|
||||
import { scrapeURLWithFetch } from "./fetch";
|
||||
import { scrapeURLWithPlaywright } from "./playwright";
|
||||
import { scrapeCache } from "./cache";
|
||||
import { scrapeURLWithIndex, useIndex } from "./index/index";
|
||||
import { scrapeURLWithIndex } from "./index/index";
|
||||
import { useIndex } from "../../../services";
|
||||
|
||||
export type Engine =
|
||||
| "fire-engine;chrome-cdp"
|
||||
|
@ -1,46 +1,10 @@
|
||||
import { Document } from "../../../../controllers/v1/types";
|
||||
import { EngineScrapeResult } from "..";
|
||||
import { Meta } from "../..";
|
||||
import { getIndexFromGCS, index_supabase_service, saveIndexToGCS } from "../../../../services";
|
||||
import { getIndexFromGCS, hashURL, index_supabase_service, normalizeURLForIndex, saveIndexToGCS, generateURLSplits } from "../../../../services";
|
||||
import { EngineError, IndexMissError } from "../../error";
|
||||
import crypto from "crypto";
|
||||
|
||||
/**
 * Whether the index is enabled: true only when the INDEX_SUPABASE_URL
 * environment variable is set to a non-empty string.
 */
export const useIndex = (process.env.INDEX_SUPABASE_URL ?? "").length > 0;
|
||||
|
||||
/**
 * Canonicalizes a URL so equivalent spellings map to the same index key.
 *
 * Steps, in order: drop the fragment, force the `https` scheme, drop an
 * explicit port 80/443, strip a trailing directory-index file name
 * (`index.html`, `index.php`, `index.htm`, `index.shtml`, `index.xml`) and
 * finally strip one trailing slash. The query string is left untouched.
 *
 * @param url - Absolute URL; `new URL` throws on unparsable input.
 * @returns The normalized URL string.
 */
function normalizeURLForIndex(url: string): string {
  const target = new URL(url);

  target.hash = "";
  target.protocol = "https";

  // Checked after the scheme switch, so it sees the port as https would.
  switch (target.port) {
    case "80":
    case "443":
      target.port = "";
      break;
  }

  // Remove only the file name; the "/" in front of it is kept here and
  // handled by the trailing-slash step below.
  target.pathname = target.pathname.replace(
    /\/index\.(?:html|php|htm|shtml|xml)$/,
    "/",
  );

  // For the root path this assigns "", which the URL API serializes as "/".
  if (target.pathname.endsWith("/")) {
    target.pathname = target.pathname.slice(0, -1);
  }

  return target.toString();
}
|
||||
|
||||
/**
 * Computes the SHA-256 digest of `url` and returns it as lowercase hex,
 * prefixed with a literal backslash followed by `x`.
 *
 * NOTE(review): the `\x` prefix looks like the Postgres bytea hex input
 * format — confirm against the index table schema.
 *
 * @param url - The string to hash (callers pass an already-normalized URL).
 * @returns A promise resolving to `\x` + 64 hex characters.
 */
async function hashURL(url: string): Promise<string> {
  const sha256 = crypto.createHash("sha256");
  sha256.update(url);
  return "\\x".concat(sha256.digest("hex"));
}
|
||||
|
||||
export async function sendDocumentToIndex(meta: Meta, document: Document) {
|
||||
if (meta.winnerEngine === "cache" || meta.winnerEngine === "index") {
|
||||
return document;
|
||||
@ -53,7 +17,7 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
|
||||
const normalizedURL = normalizeURLForIndex(meta.url);
|
||||
const urlHash = await hashURL(normalizedURL);
|
||||
|
||||
const urlSplits = []; // TODO
|
||||
const urlSplits = generateURLSplits(normalizedURL);
|
||||
const urlSplitsHash = await Promise.all(urlSplits.map(split => hashURL(split)));
|
||||
|
||||
const indexId = crypto.randomUUID();
|
||||
@ -82,6 +46,8 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
|
||||
url_hash: urlHash,
|
||||
url_splits: urlSplits,
|
||||
url_splits_hash: urlSplitsHash,
|
||||
original_url: document.metadata.sourceURL ?? meta.url,
|
||||
resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
|
||||
});
|
||||
|
||||
if (error) {
|
||||
|
@ -2,6 +2,7 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js";
|
||||
import { logger } from "../lib/logger";
|
||||
import { configDotenv } from "dotenv";
|
||||
import { Storage } from "@google-cloud/storage";
|
||||
import crypto from "crypto";
|
||||
configDotenv();
|
||||
|
||||
// SupabaseService class initializes the Supabase client conditionally based on environment variables.
|
||||
@ -130,3 +131,60 @@ export async function saveIndexToGCS(id: string, doc: {
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Feature flag for the index: enabled exactly when the INDEX_SUPABASE_URL
 * environment variable is defined and not the empty string.
 */
export const useIndex = Boolean(process.env.INDEX_SUPABASE_URL);
|
||||
|
||||
/**
 * Produces the canonical form of a URL used as its index key.
 *
 * Applied in this order:
 *  1. the fragment is removed;
 *  2. the scheme is forced to `https`;
 *  3. an explicit port of 80 or 443 is removed;
 *  4. a leading `www.` is removed from the hostname;
 *  5. a trailing directory-index file (`index.html`, `index.php`,
 *     `index.htm`, `index.shtml`, `index.xml`) is removed;
 *  6. one trailing slash is removed.
 * The query string is preserved as-is.
 *
 * @param url - Absolute URL; `new URL` throws if it cannot be parsed.
 * @returns The normalized URL string.
 */
export function normalizeURLForIndex(url: string): string {
  const parsed = new URL(url);
  parsed.hash = "";
  parsed.protocol = "https";

  // Evaluated after the scheme switch above.
  if (["80", "443"].includes(parsed.port)) {
    parsed.port = "";
  }

  const wwwPrefix = "www.";
  if (parsed.hostname.startsWith(wwwPrefix)) {
    parsed.hostname = parsed.hostname.slice(wwwPrefix.length);
  }

  // Only the file name is cut off; the preceding "/" stays and is removed
  // by the trailing-slash step that follows.
  const indexFiles = ["index.html", "index.php", "index.htm", "index.shtml", "index.xml"];
  const matchedFile = indexFiles.find((file) => parsed.pathname.endsWith("/" + file));
  if (matchedFile !== undefined) {
    parsed.pathname = parsed.pathname.slice(0, -matchedFile.length);
  }

  // At the root this assigns an empty path, which serializes back to "/".
  if (parsed.pathname.endsWith("/")) {
    parsed.pathname = parsed.pathname.slice(0, -1);
  }

  return parsed.toString();
}
|
||||
|
||||
/**
 * Hashes `url` with SHA-256 and returns the lowercase hex digest prefixed
 * with a literal backslash followed by `x`.
 *
 * NOTE(review): the `\x` prefix presumably matches the Postgres bytea hex
 * input format used by the index table — confirm against the schema.
 *
 * @param url - The string to hash (callers pass an already-normalized URL).
 * @returns A promise resolving to `\x` + 64 hex characters.
 */
export async function hashURL(url: string): Promise<string> {
  const hasher = crypto.createHash("sha256");
  hasher.update(url);
  const hexDigest = hasher.digest("hex");
  return `\\x${hexDigest}`;
}
|
||||
|
||||
/**
 * Lists the normalized ancestor URLs of `url`, from the site root down to
 * the full path, followed by the normalized `url` itself.
 *
 * Ancestors are built with the fragment and query string removed; the final
 * entry is derived from the unmodified input, so it keeps its query string.
 * Every candidate goes through `normalizeURLForIndex` and duplicates are
 * dropped while preserving first-seen order.
 *
 * @param url - Absolute URL; `new URL` throws if it cannot be parsed.
 * @returns Unique normalized URLs, shallowest first.
 */
export function generateURLSplits(url: string): string[] {
  const cursor = new URL(url);
  cursor.hash = "";
  cursor.search = "";

  const segments = cursor.pathname.split("/");

  // Depth 0 and depth 1 both resolve to the root (the first segment is the
  // empty string before the leading "/"); de-duplication below absorbs that.
  const candidates = Array.from({ length: segments.length + 1 }, (_, depth) => {
    cursor.pathname = segments.slice(0, depth).join("/");
    return cursor.href;
  });
  candidates.push(url);

  const unique = new Set<string>();
  for (const candidate of candidates) {
    unique.add(normalizeURLForIndex(candidate));
  }
  return Array.from(unique);
}
|
Loading…
x
Reference in New Issue
Block a user