Merge branch 'main' into mog/stricten-timeout

Gergő Móricz 2025-06-02 22:18:02 +02:00 committed by GitHub
commit 19d45f5fd0
9 changed files with 8 additions and 248 deletions

View File

@@ -69,7 +69,6 @@
   "@google-cloud/storage": "^7.16.0",
   "@nangohq/node": "^0.40.8",
   "@openrouter/ai-sdk-provider": "^0.4.5",
-  "@pinecone-database/pinecone": "^4.0.0",
   "@sentry/cli": "^2.33.1",
   "@sentry/node": "^8.26.0",
   "@sentry/profiling-node": "^8.26.0",

View File

@@ -59,9 +59,6 @@ importers:
      '@openrouter/ai-sdk-provider':
        specifier: ^0.4.5
        version: 0.4.5(zod@3.24.2)
-     '@pinecone-database/pinecone':
-       specifier: ^4.0.0
-       version: 4.0.0
      '@sentry/cli':
        specifier: ^2.33.1
        version: 2.33.1(encoding@0.1.13)
@@ -1232,10 +1229,6 @@ packages:
  '@pdf-lib/upng@1.0.1':
    resolution: {integrity: sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==}
- '@pinecone-database/pinecone@4.0.0':
-   resolution: {integrity: sha512-INYS+GBys9v5BRTyn0tv8srVsPTlSRvE3BPE4Wkc/lOEyAIyB9F7DEMXbeF19FOLEgRwCuHTLjzm1niENl+4FA==}
-   engines: {node: '>=18.0.0'}
  '@pkgjs/parseargs@0.11.0':
    resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==}
    engines: {node: '>=14'}
@@ -6331,10 +6324,6 @@ snapshots:
    dependencies:
      pako: 1.0.11
- '@pinecone-database/pinecone@4.0.0':
-   dependencies:
-     encoding: 0.1.13
  '@pkgjs/parseargs@0.11.0':
    optional: true
@@ -7921,6 +7910,7 @@ snapshots:
  encoding@0.1.13:
    dependencies:
      iconv-lite: 0.6.3
+   optional: true
  end-of-stream@1.4.4:
    dependencies:

View File

@@ -76,6 +76,7 @@ async function scrapeSearchResult(
   logger: Logger,
   costTracking: CostTracking,
   flags: TeamFlags,
+  directToBullMQ: boolean = false,
 ): Promise<Document> {
   const jobId = uuidv4();
   const jobPriority = await getJobPriority({
@@ -107,6 +108,7 @@
     {},
     jobId,
     jobPriority,
+    directToBullMQ,
   );

   const doc: Document = await waitForJob(jobId, options.timeout);
@@ -229,7 +231,7 @@ export async function searchController(
       origin: req.body.origin,
       timeout: req.body.timeout,
       scrapeOptions: req.body.scrapeOptions,
-    }, logger, costTracking, req.acuc?.flags ?? null),
+    }, logger, costTracking, req.acuc?.flags ?? null, (req.acuc?.price_credits ?? 0) <= 3000),
   );

 const docs = await Promise.all(scrapePromises);
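
The new trailing argument passed to scrapeSearchResult above is driven by the team's plan size: (req.acuc?.price_credits ?? 0) <= 3000. A small standalone TypeScript sketch of that predicate follows; the function and parameter names here are illustrative, not taken from the codebase, and only the condition mirrors the diff:

// Illustrative sketch only: mirrors the predicate added in the hunk above.
// `priceCredits` stands in for req.acuc?.price_credits.
function shouldGoDirectToBullMQ(priceCredits: number | undefined): boolean {
  // Teams on plans with at most 3000 price credits send their search-result
  // scrapes straight to BullMQ instead of through the concurrency queue.
  return (priceCredits ?? 0) <= 3000;
}

console.log(shouldGoDirectToBullMQ(1000));      // true  -> directToBullMQ
console.log(shouldGoDirectToBullMQ(undefined)); // true  -> missing plan info defaults to 0
console.log(shouldGoDirectToBullMQ(50000));     // false -> larger plans keep the normal path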

View File

@@ -1,164 +0,0 @@
-import { Pinecone } from "@pinecone-database/pinecone";
-import { Document } from "../../../controllers/v1/types";
-import { logger } from "../../logger";
-import { embed } from "ai";
-import { getEmbeddingModel } from "../../generic-ai";
-
-const pinecone = new Pinecone({
-  apiKey: process.env.PINECONE_API_KEY!,
-});
-
-const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? "";
-
-const MAX_METADATA_SIZE = 30 * 1024; // 30KB in bytes
-
-export interface PageMetadata {
-  url: string;
-  originUrl: string;
-  title?: string;
-  description?: string;
-  crawlId?: string;
-  teamId?: string;
-  timestamp: number;
-  markdown?: string;
-}
-
-async function getEmbedding(text: string) {
-  const { embedding } = await embed({
-    model: getEmbeddingModel("text-embedding-3-small"),
-    value: text,
-  });
-  return embedding;
-}
-
-function normalizeUrl(url: string) {
-  const urlO = new URL(url);
-  if (!urlO.hostname.startsWith("www.")) {
-    urlO.hostname = "www." + urlO.hostname;
-  }
-  return urlO.href;
-}
-
-export async function indexPage({
-  document,
-  originUrl,
-  crawlId,
-  teamId,
-}: {
-  document: Document;
-  originUrl: string;
-  crawlId?: string;
-  teamId?: string;
-}) {
-  try {
-    const index = pinecone.index(INDEX_NAME);
-
-    // Trim markdown if it's too long
-    let trimmedMarkdown = document.markdown;
-    if (
-      trimmedMarkdown &&
-      Buffer.byteLength(trimmedMarkdown, "utf-8") > MAX_METADATA_SIZE
-    ) {
-      trimmedMarkdown = trimmedMarkdown.slice(
-        0,
-        Math.floor(MAX_METADATA_SIZE / 2),
-      ); // Using half the size to be safe with UTF-8 encoding
-    }
-
-    // Create text to embed
-    const textToEmbed = [
-      document.metadata.title,
-      document.metadata.description,
-      trimmedMarkdown,
-    ]
-      .filter(Boolean)
-      .join("\n\n");
-
-    // Get embedding from OpenAI
-    const embedding = await getEmbedding(textToEmbed);
-
-    const normalizedUrl = normalizeUrl(
-      document.metadata.sourceURL || document.metadata.url!,
-    );
-
-    // Prepare metadata
-    const metadata: PageMetadata = {
-      url: normalizedUrl,
-      originUrl: normalizeUrl(originUrl),
-      title: document.metadata.title ?? document.metadata.ogTitle ?? "",
-      description:
-        document.metadata.description ?? document.metadata.ogDescription ?? "",
-      crawlId,
-      teamId,
-      markdown: trimmedMarkdown,
-      timestamp: Date.now(),
-    };
-
-    // Upsert to Pinecone
-    await index.upsert([
-      {
-        id: normalizedUrl,
-        values: embedding,
-        metadata: {
-          ...metadata,
-          [document.metadata.sourceURL || document.metadata.url!]: true,
-        },
-      },
-    ]);
-
-    logger.debug("Successfully indexed page in Pinecone", {
-      url: metadata.url,
-      crawlId,
-    });
-  } catch (error) {
-    logger.error("Failed to index page in Pinecone", {
-      error,
-      url: document.metadata.sourceURL || document.metadata.url,
-      crawlId,
-    });
-  }
-}
-
-export async function searchSimilarPages(
-  query: string,
-  originUrl?: string,
-  limit: number = 1000,
-): Promise<any[]> {
-  try {
-    const index = pinecone.index(INDEX_NAME);
-
-    // Get query embedding from OpenAI
-    const queryEmbedding = await getEmbedding(query);
-
-    const queryParams: any = {
-      vector: queryEmbedding,
-      topK: limit,
-      includeMetadata: true,
-    };
-
-    const normalizedOriginUrl = originUrl ? normalizeUrl(originUrl) : undefined;
-    // Add filter if originUrl is provided
-    if (normalizedOriginUrl) {
-      queryParams.filter = {
-        originUrl: { $eq: normalizedOriginUrl },
-      };
-    }
-
-    const results = await index.query(queryParams);
-
-    return results.matches.map((match) => ({
-      url: match.metadata?.url,
-      title: match.metadata?.title,
-      description: match.metadata?.description,
-      score: match.score,
-      markdown: match.metadata?.markdown,
-    }));
-  } catch (error) {
-    logger.error("Failed to search similar pages in Pinecone", {
-      error,
-      query,
-      originUrl,
-    });
-    return [];
-  }
-}

View File

@@ -4,7 +4,6 @@ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { logger } from "../logger";
 import { CohereClient } from "cohere-ai";
 import { extractConfig } from "./config";
-import { searchSimilarPages } from "./index/pinecone";
 import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
 import { buildRerankerUserPrompt } from "./build-prompts";
 import { buildRerankerSystemPrompt } from "./build-prompts";

View File

@@ -15,7 +15,6 @@ import {
   ScrapeUrlResponse,
 } from "../scraper/scrapeURL";
 import { Engine } from "../scraper/scrapeURL/engines";
-import { indexPage } from "../lib/extract/index/pinecone";
 import { CostTracking } from "../lib/extract/extraction-service";

 configDotenv();

View File

@@ -21,47 +21,6 @@ function cleanOfNull<T>(x: T): T {
   }
 }

-async function indexJob(job: FirecrawlJob): Promise<void> {
-  try {
-    if (job.mode !== "single_urls" && job.mode !== "scrape") {
-      return;
-    }
-    const response = await fetch(`${process.env.FIRE_INDEX_SERVER_URL}/api/jobs`, {
-      method: 'POST',
-      headers: {
-        'Content-Type': 'application/json',
-      },
-      body: JSON.stringify({
-        url: job.url,
-        mode: job.mode || "scrape",
-        docs: job.docs,
-        origin: job.origin,
-        success: job.success,
-        time_taken: job.time_taken,
-        num_tokens: job.num_tokens,
-        page_options: job.scrapeOptions,
-        date_added: new Date().toISOString(),
-      }),
-    });
-    if (!response.ok) {
-      const errorData = await response.json();
-      // logger.error(`Failed to send job to external server: ${response.status} ${response.statusText}`, {
-      //   error: errorData,
-      //   scrapeId: job.job_id,
-      // });
-    } else {
-      // logger.debug("Job sent to external server successfully!", { scrapeId: job.job_id });
-    }
-  } catch (error) {
-    // logger.error(`Error sending job to external server: ${error.message}`, {
-    //   error,
-    //   scrapeId: job.job_id,
-    // });
-  }
-}

 export async function logJob(job: FirecrawlJob, force: boolean = false) {
   try {
     const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
@@ -107,11 +66,6 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) {
       credits_billed: job.credits_billed ?? null,
     };

-    // Send job to external server
-    if (process.env.FIRE_INDEX_SERVER_URL) {
-      indexJob(job);
-    }
-
     if (process.env.GCS_BUCKET_NAME) {
       await saveJobToGCS(job);
     }

View File

@@ -97,6 +97,7 @@ async function addScrapeJobRaw(
   options: any,
   jobId: string,
   jobPriority: number,
+  directToBullMQ: boolean = false,
 ) {
   const hasCrawlDelay = webScraperOptions.crawl_id && webScraperOptions.crawlerOptions?.delay;
@@ -127,7 +128,7 @@
   const concurrencyQueueJobs = await getConcurrencyQueueJobsCount(webScraperOptions.team_id);

-  if (concurrencyLimited) {
+  if (concurrencyLimited && !directToBullMQ) {
     // Detect if they hit their concurrent limit
     // If above by 2x, send them an email
     // No need to 2x as if there are more than the max concurrency in the concurrency queue, it is already 2x
@@ -161,6 +162,7 @@ export async function addScrapeJob(
   options: any = {},
   jobId: string = uuidv4(),
   jobPriority: number = 10,
+  directToBullMQ: boolean = false,
 ) {
   if (Sentry.isInitialized()) {
     const size = JSON.stringify(webScraperOptions).length;
@@ -187,6 +189,7 @@
       options,
       jobId,
       jobPriority,
+      directToBullMQ,
     );
   },
 );
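
The behavioural change in addScrapeJobRaw is that a job flagged directToBullMQ skips the concurrency queue even when the team is at its concurrency limit. Below is a small standalone TypeScript sketch of that decision; the names and return values are illustrative, and only the condition mirrors the diff:

// Illustrative sketch of the dispatch decision after this commit.
type Dispatch = "concurrency-queue" | "bullmq";

function chooseDispatch(opts: {
  concurrencyLimited: boolean; // team already at its concurrent-job limit
  directToBullMQ: boolean;     // new flag introduced by this commit
}): Dispatch {
  // Previously the choice was: concurrencyLimited ? "concurrency-queue" : "bullmq".
  // Now a directToBullMQ job goes straight to BullMQ even while limited.
  return opts.concurrencyLimited && !opts.directToBullMQ
    ? "concurrency-queue"
    : "bullmq";
}

// Example: a search-initiated scrape from a low-credit team bypasses the queue.
console.log(chooseDispatch({ concurrencyLimited: true, directToBullMQ: true }));  // "bullmq"
console.log(chooseDispatch({ concurrencyLimited: true, directToBullMQ: false })); // "concurrency-queue"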

View File

@@ -61,7 +61,6 @@ import {
 } from "../lib/concurrency-limit";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
-import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import {
   ExtractResult,
@@ -1046,25 +1045,6 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
   }
 }

-async function indexJob(job: Job & { id: string }, document: Document) {
-  if (
-    document &&
-    document.markdown &&
-    job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID!
-  ) {
-    // indexPage({
-    //   document: document,
-    //   originUrl: job.data.crawl_id
-    //     ? (await getCrawl(job.data.crawl_id))?.originUrl!
-    //     : document.metadata.sourceURL!,
-    //   crawlId: job.data.crawl_id,
-    //   teamId: job.data.team_id,
-    // }).catch((error) => {
-    //   _logger.error("Error indexing page", { error });
-    // });
-  }
-}

 async function billScrapeJob(job: Job & { id: string }, document: Document, logger: Logger, costTracking?: CostTracking) {
   let creditsToBeBilled: number | null = null;
@@ -1447,8 +1427,6 @@ async function processJob(job: Job & { id: string }, token: string) {
       pdf_num_pages: doc.metadata.numPages,
       credits_billed,
     });
-
-    indexJob(job, doc);
   }

 logger.info(`🐂 Job done ${job.id}`);