mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-17 11:55:53 +08:00
url splits and better url normalization
This commit is contained in:
parent
1d733d169a
commit
550d6bf7bd
@ -10,7 +10,8 @@ import { scrapePDF } from "./pdf";
|
|||||||
import { scrapeURLWithFetch } from "./fetch";
|
import { scrapeURLWithFetch } from "./fetch";
|
||||||
import { scrapeURLWithPlaywright } from "./playwright";
|
import { scrapeURLWithPlaywright } from "./playwright";
|
||||||
import { scrapeCache } from "./cache";
|
import { scrapeCache } from "./cache";
|
||||||
import { scrapeURLWithIndex, useIndex } from "./index/index";
|
import { scrapeURLWithIndex } from "./index/index";
|
||||||
|
import { useIndex } from "../../../services";
|
||||||
|
|
||||||
export type Engine =
|
export type Engine =
|
||||||
| "fire-engine;chrome-cdp"
|
| "fire-engine;chrome-cdp"
|
||||||
|
@ -1,46 +1,10 @@
|
|||||||
import { Document } from "../../../../controllers/v1/types";
|
import { Document } from "../../../../controllers/v1/types";
|
||||||
import { EngineScrapeResult } from "..";
|
import { EngineScrapeResult } from "..";
|
||||||
import { Meta } from "../..";
|
import { Meta } from "../..";
|
||||||
import { getIndexFromGCS, index_supabase_service, saveIndexToGCS } from "../../../../services";
|
import { getIndexFromGCS, hashURL, index_supabase_service, normalizeURLForIndex, saveIndexToGCS, generateURLSplits } from "../../../../services";
|
||||||
import { EngineError, IndexMissError } from "../../error";
|
import { EngineError, IndexMissError } from "../../error";
|
||||||
import crypto from "crypto";
|
import crypto from "crypto";
|
||||||
|
|
||||||
/**
 * Whether the index is configured: true only when the `INDEX_SUPABASE_URL`
 * environment variable is defined and is not the empty string.
 */
export const useIndex = (process.env.INDEX_SUPABASE_URL ?? "").length > 0;
|
|
||||||
|
|
||||||
/**
 * Reduces a URL to the canonical form used as an index key.
 *
 * The fragment is dropped, the scheme is forced to `https`, an explicit port
 * of 80 or 443 is cleared, a trailing index document (`index.html`,
 * `index.php`, `index.htm`, `index.shtml`, `index.xml`) is removed, and a
 * trailing slash on the path is removed. The query string is not touched.
 *
 * @param url - Absolute URL string; `new URL` throws if it cannot be parsed.
 * @returns The serialized normalized URL.
 */
function normalizeURLForIndex(url: string): string {
  const target = new URL(url);
  target.hash = "";
  target.protocol = "https";

  const hasWebPort = target.port === "80" || target.port === "443";
  if (hasWebPort) {
    target.port = "";
  }

  // Strip at most one index document; the slash in front of it is kept here
  // and handled by the trailing-slash rule below.
  const indexFiles = ["index.html", "index.php", "index.htm", "index.shtml", "index.xml"];
  for (const file of indexFiles) {
    if (target.pathname.endsWith(`/${file}`)) {
      target.pathname = target.pathname.slice(0, -file.length);
      break;
    }
  }

  if (target.pathname.endsWith("/")) {
    target.pathname = target.pathname.slice(0, -1);
  }

  return target.toString();
}
|
|
||||||
|
|
||||||
/**
 * Computes the SHA-256 digest of a URL string as lowercase hex, prefixed with
 * a literal backslash and `x`.
 *
 * NOTE(review): the `\x` prefix looks like a hex-encoded binary literal for the
 * database column — confirm against the table schema.
 *
 * @param url - The string to hash (callers pass an already-normalized URL).
 * @returns A promise resolving to `\x` followed by 64 hex characters.
 */
async function hashURL(url: string): Promise<string> {
  const hexDigest = crypto.createHash("sha256").update(url).digest().toString("hex");
  return "\\x".concat(hexDigest);
}
|
|
||||||
|
|
||||||
export async function sendDocumentToIndex(meta: Meta, document: Document) {
|
export async function sendDocumentToIndex(meta: Meta, document: Document) {
|
||||||
if (meta.winnerEngine === "cache" || meta.winnerEngine === "index") {
|
if (meta.winnerEngine === "cache" || meta.winnerEngine === "index") {
|
||||||
return document;
|
return document;
|
||||||
@ -53,7 +17,7 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
|
|||||||
const normalizedURL = normalizeURLForIndex(meta.url);
|
const normalizedURL = normalizeURLForIndex(meta.url);
|
||||||
const urlHash = await hashURL(normalizedURL);
|
const urlHash = await hashURL(normalizedURL);
|
||||||
|
|
||||||
const urlSplits = []; // TODO
|
const urlSplits = generateURLSplits(normalizedURL);
|
||||||
const urlSplitsHash = await Promise.all(urlSplits.map(split => hashURL(split)));
|
const urlSplitsHash = await Promise.all(urlSplits.map(split => hashURL(split)));
|
||||||
|
|
||||||
const indexId = crypto.randomUUID();
|
const indexId = crypto.randomUUID();
|
||||||
@ -82,6 +46,8 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
|
|||||||
url_hash: urlHash,
|
url_hash: urlHash,
|
||||||
url_splits: urlSplits,
|
url_splits: urlSplits,
|
||||||
url_splits_hash: urlSplitsHash,
|
url_splits_hash: urlSplitsHash,
|
||||||
|
original_url: document.metadata.sourceURL ?? meta.url,
|
||||||
|
resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (error) {
|
if (error) {
|
||||||
|
@ -2,6 +2,7 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js";
|
|||||||
import { logger } from "../lib/logger";
|
import { logger } from "../lib/logger";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
import { Storage } from "@google-cloud/storage";
|
import { Storage } from "@google-cloud/storage";
|
||||||
|
import crypto from "crypto";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
// SupabaseService class initializes the Supabase client conditionally based on environment variables.
|
// SupabaseService class initializes the Supabase client conditionally based on environment variables.
|
||||||
@ -130,3 +131,60 @@ export async function saveIndexToGCS(id: string, doc: {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Feature flag for the index: true when the `INDEX_SUPABASE_URL` environment
 * variable is set to a non-empty string, false when it is unset or empty.
 */
export const useIndex = Boolean(process.env.INDEX_SUPABASE_URL);
|
||||||
|
|
||||||
|
/**
 * Canonicalizes a URL so that equivalent addresses map to the same index key.
 *
 * Applied in order:
 * 1. drop the fragment;
 * 2. force the `https` scheme;
 * 3. clear an explicit port of 80 or 443;
 * 4. strip a leading `www.` from the hostname;
 * 5. remove a trailing index document (`index.html`, `index.php`,
 *    `index.htm`, `index.shtml`, `index.xml`);
 * 6. remove a trailing slash from the path.
 *
 * The query string is preserved as given.
 *
 * @param url - Absolute URL string; `new URL` throws if it cannot be parsed.
 * @returns The serialized normalized URL.
 */
export function normalizeURLForIndex(url: string): string {
  const parsed = new URL(url);
  parsed.hash = "";
  parsed.protocol = "https";

  if (["80", "443"].includes(parsed.port)) {
    parsed.port = "";
  }

  const wwwPrefix = "www.";
  if (parsed.hostname.startsWith(wwwPrefix)) {
    parsed.hostname = parsed.hostname.slice(wwwPrefix.length);
  }

  // Only the file name is cut off; the slash before it stays so the
  // trailing-slash rule below can deal with it uniformly.
  const indexDocuments = ["index.html", "index.php", "index.htm", "index.shtml", "index.xml"];
  const matchedDocument = indexDocuments.find((name) => parsed.pathname.endsWith("/" + name));
  if (matchedDocument !== undefined) {
    parsed.pathname = parsed.pathname.slice(0, -matchedDocument.length);
  }

  if (parsed.pathname.endsWith("/")) {
    parsed.pathname = parsed.pathname.slice(0, -1);
  }

  return parsed.toString();
}
|
||||||
|
|
||||||
|
/**
 * Hashes a URL string with SHA-256 and returns the lowercase hex digest
 * prefixed with a literal backslash and `x`.
 *
 * NOTE(review): the `\x` prefix looks like a hex-encoded binary literal for the
 * database column — confirm against the table schema.
 *
 * @param url - The string to hash (callers pass normalized URLs and URL splits).
 * @returns A promise resolving to `\x` followed by 64 hex characters.
 */
export async function hashURL(url: string): Promise<string> {
  const prefix = "\\x";
  const hasher = crypto.createHash("sha256");
  hasher.update(url);
  return prefix + hasher.digest("hex");
}
|
||||||
|
|
||||||
|
/**
 * Builds the list of path-prefix URLs ("splits") for a URL.
 *
 * With fragment and query removed, one URL is produced for every prefix of
 * the path segments, from the site root up to the full path. The original
 * `url` (query included) is appended last. Every entry is passed through
 * `normalizeURLForIndex`, and duplicates are dropped while keeping
 * first-seen order.
 *
 * @param url - Absolute URL string; `new URL` throws if it cannot be parsed.
 * @returns Unique normalized URLs, shortest prefix first.
 */
export function generateURLSplits(url: string): string[] {
  const base = new URL(url);
  base.hash = "";
  base.search = "";
  const segments = base.pathname.split("/");

  // depth runs 0..segments.length inclusive; the shortest prefixes both
  // resolve to the root and are collapsed by the Set below.
  const candidates = Array.from({ length: segments.length + 1 }, (_, depth) => {
    base.pathname = segments.slice(0, depth).join("/");
    return base.href;
  });
  candidates.push(url);

  const unique = new Set<string>();
  for (const candidate of candidates) {
    unique.add(normalizeURLForIndex(candidate));
  }
  return Array.from(unique);
}
|
Loading…
x
Reference in New Issue
Block a user