mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 07:05:57 +08:00
Nick: url normalization + max metadata size
This commit is contained in:
parent
bf9d41d0b2
commit
7a31306be5
@ -17,7 +17,6 @@ import expressWs from "express-ws";
|
|||||||
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
|
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
|
||||||
import { ZodError } from "zod";
|
import { ZodError } from "zod";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { searchSimilarPages } from "./lib/extract/index/pinecone";
|
|
||||||
|
|
||||||
const { createBullBoard } = require("@bull-board/api");
|
const { createBullBoard } = require("@bull-board/api");
|
||||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||||
@ -255,3 +254,4 @@ logger.info(`Worker ${process.pid} started`);
|
|||||||
// sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
// sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
||||||
// sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
// sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
||||||
// sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
// sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
||||||
|
//
|
||||||
|
@ -13,6 +13,8 @@ const pinecone = new Pinecone({
|
|||||||
|
|
||||||
const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? "";
|
const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? "";
|
||||||
|
|
||||||
|
const MAX_METADATA_SIZE = 30 * 1024; // 30KB in bytes
|
||||||
|
|
||||||
export interface PageMetadata {
|
export interface PageMetadata {
|
||||||
url: string;
|
url: string;
|
||||||
originUrl: string;
|
originUrl: string;
|
||||||
@ -42,40 +44,54 @@ function normalizeUrl(url: string) {
|
|||||||
return urlO.href;
|
return urlO.href;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function indexPage(
|
export async function indexPage({
|
||||||
document: Document,
|
document,
|
||||||
originUrl: string,
|
originUrl,
|
||||||
crawlId?: string,
|
crawlId,
|
||||||
teamId?: string
|
teamId
|
||||||
|
}: {
|
||||||
|
document: Document;
|
||||||
|
originUrl: string;
|
||||||
|
crawlId?: string;
|
||||||
|
teamId?: string;
|
||||||
|
}
|
||||||
) {
|
) {
|
||||||
try {
|
try {
|
||||||
const index = pinecone.index(INDEX_NAME);
|
const index = pinecone.index(INDEX_NAME);
|
||||||
|
|
||||||
|
// Trim markdown if it's too long
|
||||||
|
let trimmedMarkdown = document.markdown;
|
||||||
|
if (trimmedMarkdown && Buffer.byteLength(trimmedMarkdown, 'utf-8') > MAX_METADATA_SIZE) {
|
||||||
|
trimmedMarkdown = trimmedMarkdown.slice(0, Math.floor(MAX_METADATA_SIZE / 2)); // Using half the size to be safe with UTF-8 encoding
|
||||||
|
}
|
||||||
|
|
||||||
// Create text to embed
|
// Create text to embed
|
||||||
const textToEmbed = [
|
const textToEmbed = [
|
||||||
document.metadata.title,
|
document.metadata.title,
|
||||||
document.metadata.description,
|
document.metadata.description,
|
||||||
document.markdown
|
trimmedMarkdown
|
||||||
].filter(Boolean).join('\n\n');
|
].filter(Boolean).join('\n\n');
|
||||||
|
|
||||||
// Get embedding from OpenAI
|
// Get embedding from OpenAI
|
||||||
const embedding = await getEmbedding(textToEmbed);
|
const embedding = await getEmbedding(textToEmbed);
|
||||||
|
|
||||||
|
const normalizedUrl = normalizeUrl(document.metadata.sourceURL || document.metadata.url!);
|
||||||
|
|
||||||
// Prepare metadata
|
// Prepare metadata
|
||||||
const metadata: PageMetadata = {
|
const metadata: PageMetadata = {
|
||||||
url: normalizeUrl(document.metadata.sourceURL || document.metadata.url!),
|
url: normalizedUrl,
|
||||||
originUrl: normalizeUrl(originUrl),
|
originUrl: normalizeUrl(originUrl),
|
||||||
title: document.metadata.title,
|
title: document.metadata.title,
|
||||||
description: document.metadata.description,
|
description: document.metadata.description,
|
||||||
crawlId,
|
crawlId,
|
||||||
teamId,
|
teamId,
|
||||||
markdown: document.markdown,
|
markdown: trimmedMarkdown,
|
||||||
timestamp: Date.now()
|
timestamp: Date.now()
|
||||||
};
|
};
|
||||||
|
|
||||||
// Upsert to Pinecone
|
// Upsert to Pinecone
|
||||||
await index.upsert([{
|
await index.upsert([{
|
||||||
id: document.metadata.sourceURL || document.metadata.url!,
|
id: normalizedUrl,
|
||||||
values: embedding,
|
values: embedding,
|
||||||
metadata: {
|
metadata: {
|
||||||
...metadata,
|
...metadata,
|
||||||
@ -114,10 +130,11 @@ export async function searchSimilarPages(
|
|||||||
includeMetadata: true
|
includeMetadata: true
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const normalizedOriginUrl = originUrl ? normalizeUrl(originUrl) : undefined;
|
||||||
// Add filter if originUrl is provided
|
// Add filter if originUrl is provided
|
||||||
if (originUrl) {
|
if (normalizedOriginUrl) {
|
||||||
queryParams.filter = {
|
queryParams.filter = {
|
||||||
[originUrl]: { $contains: normalizeUrl(originUrl) }
|
originUrl: { $eq: normalizedOriginUrl }
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user