Merge branch 'main' into mog/index

Gergő Móricz 2025-06-03 16:09:57 +02:00 committed by GitHub
commit d7fef33224
31 changed files with 458 additions and 429 deletions

View File

@@ -128,7 +128,7 @@ jobs:
      - name: Kill SearXNG
        if: always() && matrix.search == 'searxng'
        run: |
-         docker logs searxng > searxng/searxng.log 2>&1
+         docker logs searxng > searxng.log 2>&1
          docker kill searxng
        working-directory: ./
      - uses: actions/upload-artifact@v4
@@ -149,5 +149,4 @@ jobs:
        with:
          name: SearXNG (${{ matrix.ai }}, ${{ matrix.engine }}, ${{ matrix.proxy }})
          path: |
-           ./searxng/searxng.log
-           ./searxng/settings.yml
+           ./searxng.log

View File

@@ -69,7 +69,6 @@
    "@google-cloud/storage": "^7.16.0",
    "@nangohq/node": "^0.40.8",
    "@openrouter/ai-sdk-provider": "^0.4.5",
-   "@pinecone-database/pinecone": "^4.0.0",
    "@sentry/cli": "^2.33.1",
    "@sentry/node": "^8.26.0",
    "@sentry/profiling-node": "^8.26.0",

View File

@@ -59,9 +59,6 @@ importers:
      '@openrouter/ai-sdk-provider':
        specifier: ^0.4.5
        version: 0.4.5(zod@3.24.2)
-     '@pinecone-database/pinecone':
-       specifier: ^4.0.0
-       version: 4.0.0
      '@sentry/cli':
        specifier: ^2.33.1
        version: 2.33.1(encoding@0.1.13)
@@ -1232,10 +1229,6 @@ packages:
  '@pdf-lib/upng@1.0.1':
    resolution: {integrity: sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==}
- '@pinecone-database/pinecone@4.0.0':
-   resolution: {integrity: sha512-INYS+GBys9v5BRTyn0tv8srVsPTlSRvE3BPE4Wkc/lOEyAIyB9F7DEMXbeF19FOLEgRwCuHTLjzm1niENl+4FA==}
-   engines: {node: '>=18.0.0'}
  '@pkgjs/parseargs@0.11.0':
    resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==}
    engines: {node: '>=14'}
@@ -6331,10 +6324,6 @@ snapshots:
    dependencies:
      pako: 1.0.11
- '@pinecone-database/pinecone@4.0.0':
-   dependencies:
-     encoding: 0.1.13
  '@pkgjs/parseargs@0.11.0':
    optional: true
@@ -7921,6 +7910,7 @@ snapshots:
  encoding@0.1.13:
    dependencies:
      iconv-lite: 0.6.3
+   optional: true
  end-of-stream@1.4.4:
    dependencies:

View File

@@ -78,24 +78,24 @@ describe("Scrape tests", () => {
    expect(JSON.stringify(status)).toBe(JSON.stringify(response));
  }, 60000);

- // describe("Ad blocking (f-e dependant)", () => {
- //   it.concurrent("blocks ads by default", async () => {
- //     const response = await scrape({
- //       url: "https://www.allrecipes.com/recipe/18185/yum/",
- //     });
- //     expect(response.markdown).not.toContain(".g.doubleclick.net/");
- //   }, 30000);
- //   it.concurrent("doesn't block ads if explicitly disabled", async () => {
- //     const response = await scrape({
- //       url: "https://www.allrecipes.com/recipe/18185/yum/",
- //       blockAds: false,
- //     });
- //     expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
- //   }, 30000);
- // });
+ describe("Ad blocking (f-e dependant)", () => {
+   it.concurrent("blocks ads by default", async () => {
+     const response = await scrape({
+       url: "https://www.allrecipes.com/recipe/18185/yum/",
+     });
+     expect(response.markdown).not.toContain(".g.doubleclick.net/");
+   }, 30000);
+   it.concurrent("doesn't block ads if explicitly disabled", async () => {
+     const response = await scrape({
+       url: "https://www.allrecipes.com/recipe/18185/yum/",
+       blockAds: false,
+     });
+     expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
+   }, 30000);
+ });

  describe("Index", () => {
    it.concurrent("caches properly", async () => {

View File

@@ -82,6 +82,7 @@ export async function scrapeHelper(
      internalOptions,
      origin: req.body.origin ?? defaultOrigin,
      is_scrape: true,
+     startTime: Date.now(),
    },
    {},
    jobId,

View File

@@ -112,6 +112,7 @@ export async function searchHelper(
      team_id: team_id,
      scrapeOptions,
      internalOptions,
+     startTime: Date.now(),
    },
    opts: {
      jobId: uuid,

View File

@@ -13,6 +13,7 @@ import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { getJobPriority } from "../../lib/job-priority";
import { getScrapeQueue } from "../../services/queue-service";
import { supabaseGetJobById } from "../../lib/supabase-jobs";
+ import { calculateCreditsToBeBilled } from "../../lib/scrape-billing";

export async function scrapeController(
  req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
@@ -30,7 +31,6 @@ export async function scrapeController(
  });
  req.body = scrapeRequestSchema.parse(req.body);

- let earlyReturn = false;
  const origin = req.body.origin;
  const timeout = req.body.timeout;
@@ -42,6 +42,8 @@
  });

  //
+ const isDirectToBullMQ = process.env.SEARCH_PREVIEW_TOKEN === req.body.__searchPreviewToken;

  await addScrapeJob(
    {
      url: req.body.url,
@@ -52,13 +54,16 @@
        teamId: req.auth.team_id,
        saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
        unnormalizedSourceURL: preNormalizedBody.url,
+       useCache: req.body.__experimental_cache ? true : false,
+       bypassBilling: isDirectToBullMQ,
      },
      origin: req.body.origin,
-     is_scrape: true,
+     startTime,
    },
    {},
    jobId,
    jobPriority,
+   isDirectToBullMQ,
  );

  const totalWait =
  const totalWait =
@@ -124,57 +129,13 @@
    await getScrapeQueue().remove(jobId);

const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens =
doc && doc.extract
? // ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
0 // TODO: fix
: 0;
let creditsToBeBilled = 1; // Assuming 1 credit per document
if (earlyReturn) {
// Don't bill if we're early returning
return;
}
if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) {
creditsToBeBilled = 5;
}
if (req.body.agent?.model?.toLowerCase() === "fire-1" || req.body.extract?.agent?.model?.toLowerCase() === "fire-1" || req.body.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") {
if (process.env.USE_DB_AUTHENTICATION === "true") {
// @Nick this is a hack pushed at 2AM pls help - mogery
const job = await supabaseGetJobById(jobId);
if (!job?.cost_tracking) {
logger.warn("No cost tracking found for job", {
jobId,
});
}
creditsToBeBilled = Math.ceil((job?.cost_tracking?.totalCost ?? 1) * 1800);
} else {
creditsToBeBilled = 150;
}
}
if (doc?.metadata?.proxyUsed === "stealth") {
creditsToBeBilled += 4;
}
billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(
(error) => {
logger.error(
`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`,
);
// Optionally, you could notify an admin or add to a retry queue here
},
);
    if (!req.body.formats.includes("rawHtml")) {
      if (doc && doc.rawHtml) {
        delete doc.rawHtml;
      }
    }

    return res.status(200).json({
      success: true,
      data: doc,

View File

@@ -40,24 +40,24 @@ export async function searchAndScrapeSearchResult(
  try {
    const searchResults = await search({
      query,
-     num_results: 5
+     num_results: 5,
    });

    const documents = await Promise.all(
-     searchResults.map(result =>
+     searchResults.map((result) =>
        scrapeSearchResult(
          {
            url: result.url,
            title: result.title,
-           description: result.description
+           description: result.description,
          },
          options,
          logger,
          costTracking,
-         flags
-       )
-     )
+         flags,
+       ),
+     ),
    );

    return documents;
  } catch (error) {
@@ -76,6 +76,8 @@ async function scrapeSearchResult(
  logger: Logger,
  costTracking: CostTracking,
  flags: TeamFlags,
+ directToBullMQ: boolean = false,
+ isSearchPreview: boolean = false,
): Promise<Document> {
  const jobId = uuidv4();
  const jobPriority = await getJobPriority({
@@ -99,18 +101,19 @@
        mode: "single_urls" as Mode,
        team_id: options.teamId,
        scrapeOptions: options.scrapeOptions,
-       internalOptions: { teamId: options.teamId, useCache: true },
+       internalOptions: { teamId: options.teamId, useCache: true, bypassBilling: true },
        origin: options.origin,
        is_scrape: true,
+       startTime: Date.now(),
      },
      {},
      jobId,
      jobPriority,
+     directToBullMQ,
    );

    const doc: Document = await waitForJob(jobId, options.timeout);

    logger.info("Scrape job completed", {
      scrapeId: jobId,
      url: searchResult.url,
@@ -169,6 +172,8 @@ export async function searchController(
  };
  const startTime = new Date().getTime();
  const costTracking = new CostTracking();
+ const isSearchPreview =
+   process.env.SEARCH_PREVIEW_TOKEN === req.body.__searchPreviewToken;

  try {
    req.body = searchRequestSchema.parse(req.body);
@@ -197,7 +202,9 @@
    });

    if (req.body.ignoreInvalidURLs) {
-     searchResults = searchResults.filter((result) => !isUrlBlocked(result.url, req.acuc?.flags ?? null));
+     searchResults = searchResults.filter(
+       (result) => !isUrlBlocked(result.url, req.acuc?.flags ?? null),
+     );
    }

    logger.info("Searching completed", {
@@ -224,12 +231,20 @@
    } else {
      logger.info("Scraping search results");
      const scrapePromises = searchResults.map((result) =>
-       scrapeSearchResult(result, {
-         teamId: req.auth.team_id,
-         origin: req.body.origin,
-         timeout: req.body.timeout,
-         scrapeOptions: req.body.scrapeOptions,
-       }, logger, costTracking, req.acuc?.flags ?? null),
+       scrapeSearchResult(
+         result,
+         {
+           teamId: req.auth.team_id,
+           origin: req.body.origin,
+           timeout: req.body.timeout,
+           scrapeOptions: req.body.scrapeOptions,
+         },
+         logger,
+         costTracking,
+         req.acuc?.flags ?? null,
+         (req.acuc?.price_credits ?? 0) <= 3000,
+         isSearchPreview,
+       ),
      );

      const docs = await Promise.all(scrapePromises);
@@ -255,11 +270,23 @@
    }

    // Bill team once for all successful results
-   billTeam(req.auth.team_id, req.acuc?.sub_id, responseData.data.length).catch((error) => {
-     logger.error(
-       `Failed to bill team ${req.auth.team_id} for ${responseData.data.length} credits: ${error}`,
-     );
-   });
+   if (!isSearchPreview) {
+     billTeam(
+       req.auth.team_id,
+       req.acuc?.sub_id,
+       responseData.data.reduce((a, x) => {
+         if (x.metadata?.numPages !== undefined && x.metadata.numPages > 0) {
+           return a + x.metadata.numPages;
+         } else {
+           return a + 1;
+         }
+       }, 0),
+     ).catch((error) => {
+       logger.error(
+         `Failed to bill team ${req.auth.team_id} for ${responseData.data.length} credits: ${error}`,
+       );
+     });
+   }

    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -269,22 +296,25 @@
      time_taken: timeTakenInSeconds,
    });

-   logJob({
-     job_id: jobId,
-     success: true,
-     num_docs: responseData.data.length,
-     docs: responseData.data,
-     time_taken: timeTakenInSeconds,
-     team_id: req.auth.team_id,
-     mode: "search",
-     url: req.body.query,
-     scrapeOptions: req.body.scrapeOptions,
-     origin: req.body.origin,
-     cost_tracking: costTracking,
-   });
+   logJob(
+     {
+       job_id: jobId,
+       success: true,
+       num_docs: responseData.data.length,
+       docs: responseData.data,
+       time_taken: timeTakenInSeconds,
+       team_id: req.auth.team_id,
+       mode: "search",
+       url: req.body.query,
+       scrapeOptions: req.body.scrapeOptions,
+       origin: req.body.origin,
+       cost_tracking: costTracking,
+     },
+     false,
+     isSearchPreview,
+   );

    return res.status(200).json(responseData);
  } catch (error) {
    if (
      error instanceof Error &&
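
A quick sketch of the new per-page search billing (illustrative only; the document shapes below are hypothetical): each successfully scraped result bills one credit, except documents that report metadata.numPages (typically PDFs), which bill one credit per page.

// Hypothetical data, mirroring the reduce() added above.
const docs: { metadata?: { numPages?: number } }[] = [
  { metadata: { numPages: 3 } }, // 3-page PDF   -> 3 credits
  { metadata: {} },              // regular page -> 1 credit
  { metadata: { numPages: 0 } }, // 0 is ignored -> 1 credit
];

const credits = docs.reduce((a, x) => {
  if (x.metadata?.numPages !== undefined && x.metadata.numPages > 0) {
    return a + x.metadata.numPages;
  }
  return a + 1;
}, 0); // credits === 5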

View File

@@ -311,6 +311,8 @@ const baseScrapeOptions = z
    proxy: z.enum(["basic", "stealth", "auto"]).optional(),
    maxAge: z.number().int().gte(0).safe().default(0),
    storeInCache: z.boolean().default(true),
+   __experimental_cache: z.boolean().default(false).optional(),
+   __searchPreviewToken: z.string().optional(),
  })
  .strict(strictMessage);
@@ -1172,6 +1174,7 @@
    origin: z.string().optional().default("api"),
    timeout: z.number().int().positive().finite().safe().default(60000),
    ignoreInvalidURLs: z.boolean().optional().default(false),
+   __searchPreviewToken: z.string().optional(),
    scrapeOptions: baseScrapeOptions
      .extend({
        formats: z

View File

@@ -95,8 +95,13 @@ function startServer(port = DEFAULT_PORT) {
    logger.info(`Worker ${process.pid} listening on port ${port}`);
  });

- const exitHandler = () => {
+ const exitHandler = async () => {
    logger.info("SIGTERM signal received: closing HTTP server");
+   if (process.env.IS_KUBERNETES === "true") {
+     // Account for GCE load balancer drain timeout
+     logger.info("Waiting 60s for GCE load balancer drain timeout");
+     await new Promise((resolve) => setTimeout(resolve, 60000));
+   }
    server.close(() => {
      logger.info("Server closed.");
      process.exit(0);

View File

@@ -133,6 +133,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
          blockAds: false,
          maxAge: 0,
          dontStoreInCache: false,
+         __experimental_cache: true,
        },
      }, logger, costTracking, acuc?.flags ?? null);
      return response.length > 0 ? response : [];

View File

@@ -49,6 +49,7 @@ export async function scrapeDocument(
      origin: options.origin,
      is_scrape: true,
      from_extract: true,
+     startTime: Date.now(),
    },
    {},
    jobId,

View File

@@ -47,6 +47,7 @@ export async function scrapeDocument_F0(
      origin: options.origin,
      is_scrape: true,
      from_extract: true,
+     startTime: Date.now(),
    },
    {},
    jobId,

View File

@@ -1,164 +0,0 @@
import { Pinecone } from "@pinecone-database/pinecone";
import { Document } from "../../../controllers/v1/types";
import { logger } from "../../logger";
import { embed } from "ai";
import { getEmbeddingModel } from "../../generic-ai";
const pinecone = new Pinecone({
apiKey: process.env.PINECONE_API_KEY!,
});
const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? "";
const MAX_METADATA_SIZE = 30 * 1024; // 30KB in bytes
export interface PageMetadata {
url: string;
originUrl: string;
title?: string;
description?: string;
crawlId?: string;
teamId?: string;
timestamp: number;
markdown?: string;
}
async function getEmbedding(text: string) {
const { embedding } = await embed({
model: getEmbeddingModel("text-embedding-3-small"),
value: text,
});
return embedding;
}
function normalizeUrl(url: string) {
const urlO = new URL(url);
if (!urlO.hostname.startsWith("www.")) {
urlO.hostname = "www." + urlO.hostname;
}
return urlO.href;
}
export async function indexPage({
document,
originUrl,
crawlId,
teamId,
}: {
document: Document;
originUrl: string;
crawlId?: string;
teamId?: string;
}) {
try {
const index = pinecone.index(INDEX_NAME);
// Trim markdown if it's too long
let trimmedMarkdown = document.markdown;
if (
trimmedMarkdown &&
Buffer.byteLength(trimmedMarkdown, "utf-8") > MAX_METADATA_SIZE
) {
trimmedMarkdown = trimmedMarkdown.slice(
0,
Math.floor(MAX_METADATA_SIZE / 2),
); // Using half the size to be safe with UTF-8 encoding
}
// Create text to embed
const textToEmbed = [
document.metadata.title,
document.metadata.description,
trimmedMarkdown,
]
.filter(Boolean)
.join("\n\n");
// Get embedding from OpenAI
const embedding = await getEmbedding(textToEmbed);
const normalizedUrl = normalizeUrl(
document.metadata.sourceURL || document.metadata.url!,
);
// Prepare metadata
const metadata: PageMetadata = {
url: normalizedUrl,
originUrl: normalizeUrl(originUrl),
title: document.metadata.title ?? document.metadata.ogTitle ?? "",
description:
document.metadata.description ?? document.metadata.ogDescription ?? "",
crawlId,
teamId,
markdown: trimmedMarkdown,
timestamp: Date.now(),
};
// Upsert to Pinecone
await index.upsert([
{
id: normalizedUrl,
values: embedding,
metadata: {
...metadata,
[document.metadata.sourceURL || document.metadata.url!]: true,
},
},
]);
logger.debug("Successfully indexed page in Pinecone", {
url: metadata.url,
crawlId,
});
} catch (error) {
logger.error("Failed to index page in Pinecone", {
error,
url: document.metadata.sourceURL || document.metadata.url,
crawlId,
});
}
}
export async function searchSimilarPages(
query: string,
originUrl?: string,
limit: number = 1000,
): Promise<any[]> {
try {
const index = pinecone.index(INDEX_NAME);
// Get query embedding from OpenAI
const queryEmbedding = await getEmbedding(query);
const queryParams: any = {
vector: queryEmbedding,
topK: limit,
includeMetadata: true,
};
const normalizedOriginUrl = originUrl ? normalizeUrl(originUrl) : undefined;
// Add filter if originUrl is provided
if (normalizedOriginUrl) {
queryParams.filter = {
originUrl: { $eq: normalizedOriginUrl },
};
}
const results = await index.query(queryParams);
return results.matches.map((match) => ({
url: match.metadata?.url,
title: match.metadata?.title,
description: match.metadata?.description,
score: match.score,
markdown: match.metadata?.markdown,
}));
} catch (error) {
logger.error("Failed to search similar pages in Pinecone", {
error,
query,
originUrl,
});
return [];
}
}

View File

@@ -4,7 +4,6 @@ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { logger } from "../logger";
import { CohereClient } from "cohere-ai";
import { extractConfig } from "./config";
- import { searchSimilarPages } from "./index/pinecone";
import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { buildRerankerUserPrompt } from "./build-prompts";
import { buildRerankerSystemPrompt } from "./build-prompts";

View File

@@ -0,0 +1,49 @@
import { Document, ScrapeOptions } from "../controllers/v1/types";
import { supabaseGetJobById } from "./supabase-jobs";
import { logger } from "./logger";
import { CostTracking } from "./extract/extraction-service";
const creditsPerPDFPage = 1;
const stealthProxyCostBonus = 4;
export async function calculateCreditsToBeBilled(options: ScrapeOptions, document: Document, jobId: string, costTracking?: any) {
let creditsToBeBilled = 1; // Assuming 1 credit per document
if ((options.extract && options.formats?.includes("extract")) || (options.formats?.includes("changeTracking") && options.changeTrackingOptions?.modes?.includes("json"))) {
creditsToBeBilled = 5;
}
if (options.agent?.model?.toLowerCase() === "fire-1" || options.extract?.agent?.model?.toLowerCase() === "fire-1" || options.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") {
if (process.env.USE_DB_AUTHENTICATION === "true") {
// @Nick this is a hack pushed at 2AM pls help - mogery
if (!costTracking) {
const job = await supabaseGetJobById(jobId);
costTracking = job?.cost_tracking;
}
if (!costTracking) {
logger.warn("No cost tracking found for job", {
jobId,
scrapeId: jobId
});
}
if (costTracking instanceof CostTracking) {
costTracking = costTracking.toJSON();
}
creditsToBeBilled = Math.ceil((costTracking?.totalCost ?? 1) * 1800);
} else {
creditsToBeBilled = 150;
}
}
if (document.metadata.numPages !== undefined && document.metadata.numPages > 1) {
creditsToBeBilled += creditsPerPDFPage * (document.metadata.numPages - 1);
}
if (document?.metadata?.proxyUsed === "stealth") {
creditsToBeBilled += stealthProxyCostBonus;
}
return creditsToBeBilled;
}
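
For reference, a minimal usage sketch of the new helper (not part of the commit; the option and document shapes are simplified with `as any` and the values are hypothetical):

import { calculateCreditsToBeBilled } from "./scrape-billing";

async function example() {
  // A 12-page PDF fetched over a stealth proxy, no extract/FIRE-1 involved:
  const options = { formats: ["markdown"] } as any;
  const document = { metadata: { numPages: 12, proxyUsed: "stealth" } } as any;

  // 1 base credit + 11 extra PDF pages + 4 stealth bonus = 16 credits
  const credits = await calculateCreditsToBeBilled(options, document, "some-job-id");
  console.log(credits); // 16
}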

View File

@@ -15,7 +15,6 @@ import {
  ScrapeUrlResponse,
} from "../scraper/scrapeURL";
import { Engine } from "../scraper/scrapeURL/engines";
- import { indexPage } from "../lib/extract/index/pinecone";
import { CostTracking } from "../lib/extract/extraction-service";

configDotenv();

View File

@@ -224,7 +224,6 @@ export async function scrapeURLWithFireEngineChromeCDP(
    mobile: meta.options.mobile,
    timeout, // TODO: better timeout logic
    disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
-   blockAds: meta.options.blockAds,
    mobileProxy: meta.featureFlags.has("stealthProxy"),
    saveScrapeResultToGCS: meta.internalOptions.saveScrapeResultToGCS,
    // TODO: scrollXPaths

View File

@@ -12,7 +12,6 @@ export type FireEngineScrapeRequestCommon = {
  headers?: { [K: string]: string };

  blockMedia?: boolean; // default: true
- blockAds?: boolean; // default: true

  // pageOptions?: any; // unused, .scrollXPaths is considered on FE side
  // useProxy?: boolean; // unused, default: true
@@ -39,7 +38,6 @@ export type FireEngineScrapeRequestChromeCDP = {
  blockMedia?: true; // cannot be false
  mobile?: boolean;
  disableSmartWaitCache?: boolean;
- blockAds?: boolean; // default: true
  saveScrapeResultToGCS?: boolean;
};
@@ -58,7 +56,6 @@ export type FireEngineScrapeRequestTLSClient = {
  engine: "tlsclient";
  atsv?: boolean; // v0 only, default: false
  disableJsDom?: boolean; // v0 only, default: false
- // blockAds?: boolean; // default: true
};

const schema = z.object({

View File

@@ -74,6 +74,7 @@ export const featureFlags = [
  "skipTlsVerification",
  "useFastMode",
  "stealthProxy",
+ "disableAdblock",
] as const;

export type FeatureFlag = (typeof featureFlags)[number];
@@ -95,6 +96,7 @@ export const featureFlagOptions: {
  mobile: { priority: 10 },
  skipTlsVerification: { priority: 10 },
  stealthProxy: { priority: 20 },
+ disableAdblock: { priority: 10 },
} as const;

export type EngineScrapeResult = {
@@ -171,6 +173,7 @@ export const engineOptions: {
      skipTlsVerification: false,
      useFastMode: false,
      stealthProxy: false,
+     disableAdblock: false,
    },
    quality: 1000, // cache should always be tried first
  },
@@ -205,6 +208,7 @@ export const engineOptions: {
      skipTlsVerification: true,
      useFastMode: false,
      stealthProxy: false,
+     disableAdblock: false,
    },
    quality: 50,
  },
@@ -222,6 +226,7 @@ export const engineOptions: {
      skipTlsVerification: true,
      useFastMode: false,
      stealthProxy: false,
+     disableAdblock: false,
    },
    quality: 45,
  },
@@ -256,6 +261,7 @@ export const engineOptions: {
      skipTlsVerification: true,
      useFastMode: false,
      stealthProxy: true,
+     disableAdblock: false,
    },
    quality: -2,
  },
@@ -273,6 +279,7 @@ export const engineOptions: {
      skipTlsVerification: true,
      useFastMode: false,
      stealthProxy: true,
+     disableAdblock: false,
    },
    quality: -5,
  },
@@ -290,6 +297,7 @@ export const engineOptions: {
      skipTlsVerification: false,
      useFastMode: false,
      stealthProxy: false,
+     disableAdblock: true,
    },
    quality: 40,
  },
@@ -307,6 +315,7 @@ export const engineOptions: {
      skipTlsVerification: false,
      useFastMode: false,
      stealthProxy: true,
+     disableAdblock: true,
    },
    quality: -10,
  },
@@ -324,6 +333,7 @@ export const engineOptions: {
      skipTlsVerification: false,
      useFastMode: false,
      stealthProxy: false,
+     disableAdblock: false,
    },
    quality: 20,
  },
@@ -341,6 +351,7 @@ export const engineOptions: {
      skipTlsVerification: false,
      useFastMode: true,
      stealthProxy: false,
+     disableAdblock: false,
    },
    quality: 10,
  },
@@ -358,6 +369,7 @@ export const engineOptions: {
      skipTlsVerification: false,
      useFastMode: true,
      stealthProxy: true,
+     disableAdblock: false,
    },
    quality: -15,
  },
@@ -375,6 +387,7 @@ export const engineOptions: {
      skipTlsVerification: false,
      useFastMode: true,
      stealthProxy: false,
+     disableAdblock: false,
    },
    quality: 5,
  },
@@ -392,6 +405,7 @@ export const engineOptions: {
      skipTlsVerification: false,
      useFastMode: true,
      stealthProxy: true, // kinda...
+     disableAdblock: true,
    },
    quality: -20,
  },
@@ -409,6 +423,7 @@ export const engineOptions: {
      skipTlsVerification: false,
      useFastMode: true,
      stealthProxy: true, // kinda...
+     disableAdblock: true,
    },
    quality: -20,
  },

View File

@@ -120,6 +120,10 @@ function buildFeatureFlags(
    flags.add("docx");
  }

+ if (options.blockAds === false) {
+   flags.add("disableAdblock");
+ }
+
  return flags;
}
@@ -187,6 +191,7 @@ export type InternalOptions = {
  unnormalizedSourceURL?: string;
  saveScrapeResultToGCS?: boolean; // Passed along to fire-engine
+ bypassBilling?: boolean;
};

export type EngineResultsTracker = {
export type EngineResultsTracker = { export type EngineResultsTracker = {

View File

@@ -5,6 +5,58 @@ import { logger } from "../lib/logger";

dotenv.config();

export async function fire_engine_search(
q: string,
options: {
tbs?: string;
filter?: string;
lang?: string;
country?: string;
location?: string;
numResults: number;
page?: number;
},
abort?: AbortSignal,
): Promise<SearchResult[]> {
try {
let data = JSON.stringify({
query: q,
lang: options.lang,
country: options.country,
location: options.location,
tbs: options.tbs,
numResults: options.numResults,
page: options.page ?? 1,
});
if (!process.env.FIRE_ENGINE_BETA_URL) {
return [];
}
const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, {
method: "POST",
headers: {
"Content-Type": "application/json",
"X-Disable-Cache": "true",
},
body: data,
signal: abort,
});
if (response.ok) {
const responseData = await response.json();
return responseData;
} else {
return [];
}
} catch (error) {
logger.error(error);
Sentry.captureException(error);
return [];
}
}
export async function fireEngineMap(
  q: string,
  options: {
@@ -34,7 +86,7 @@ export async function fireEngineMap(
    return [];
  }

- const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, {
+ const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/map`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",

View File

@@ -4,6 +4,7 @@ import { googleSearch } from "./googlesearch";
import { searchapi_search } from "./searchapi";
import { serper_search } from "./serper";
import { searxng_search } from "./searxng";
+ import { fire_engine_search } from "./fireEngine";

export async function search({
  query,
@@ -31,8 +32,19 @@ export async function search({
  timeout?: number;
}): Promise<SearchResult[]> {
  try {
+   if (process.env.FIRE_ENGINE_BETA_URL) {
+     const results = await fire_engine_search(query, {
+       numResults: num_results,
+       tbs,
+       filter,
+       lang,
+       country,
+       location,
+     });
+     if (results.length > 0) return results;
+   }
    if (process.env.SERPER_API_KEY) {
-     return await serper_search(query, {
+     const results = await serper_search(query, {
        num_results,
        tbs,
        filter,
@@ -40,9 +52,10 @@ export async function search({
        country,
        location,
      });
+     if (results.length > 0) return results;
    }
    if (process.env.SEARCHAPI_API_KEY) {
-     return await searchapi_search(query, {
+     const results = await searchapi_search(query, {
        num_results,
        tbs,
        filter,
@@ -50,9 +63,10 @@ export async function search({
        country,
        location,
      });
+     if (results.length > 0) return results;
    }
    if (process.env.SEARXNG_ENDPOINT) {
-     return await searxng_search(query, {
+     const results = await searxng_search(query, {
        num_results,
        tbs,
        filter,
@@ -60,6 +74,7 @@ export async function search({
        country,
        location,
      });
+     if (results.length > 0) return results;
    }
    return await googleSearch(
      query,
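
The change above turns the provider list into a fallback chain: a provider that returns no results no longer ends the search, the next configured provider is tried instead. A condensed sketch of that pattern (illustrative only, with simplified types):

type SearchResult = { url: string; title: string; description: string }; // simplified
type Provider = () => Promise<SearchResult[]>;

async function firstNonEmpty(providers: Provider[]): Promise<SearchResult[]> {
  for (const provider of providers) {
    const results = await provider();
    if (results.length > 0) return results; // otherwise fall through to the next provider
  }
  return [];
}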

View File

@@ -21,12 +21,13 @@ function cleanOfNull<T>(x: T): T {
  }
}

- export async function logJob(job: FirecrawlJob, force: boolean = false) {
+ export async function logJob(job: FirecrawlJob, force: boolean = false, bypassLogging: boolean = false) {
  try {
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
    if (!useDbAuthentication) {
      return;
    }

    // Redact any pages that have an authorization header
    // actually, Don't. we use the db to retrieve results now. this breaks authed crawls - mogery
@@ -63,12 +64,17 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) {
      is_migrated: true,
      cost_tracking: job.cost_tracking,
      pdf_num_pages: job.pdf_num_pages ?? null,
+     credits_billed: job.credits_billed ?? null,
    };

    if (process.env.GCS_BUCKET_NAME) {
      await saveJobToGCS(job);
    }

+   if (bypassLogging) {
+     return;
+   }
+
    if (force) {
      let i = 0,
        done = false;

View File

@@ -97,6 +97,7 @@ async function addScrapeJobRaw(
  options: any,
  jobId: string,
  jobPriority: number,
+ directToBullMQ: boolean = false,
) {
  const hasCrawlDelay = webScraperOptions.crawl_id && webScraperOptions.crawlerOptions?.delay;
@@ -127,7 +128,7 @@ async function addScrapeJobRaw(
  const concurrencyQueueJobs = await getConcurrencyQueueJobsCount(webScraperOptions.team_id);

- if (concurrencyLimited) {
+ if (concurrencyLimited && !directToBullMQ) {
    // Detect if they hit their concurrent limit
    // If above by 2x, send them an email
    // No need to 2x as if there are more than the max concurrency in the concurrency queue, it is already 2x
@@ -161,6 +162,7 @@ export async function addScrapeJob(
  options: any = {},
  jobId: string = uuidv4(),
  jobPriority: number = 10,
+ directToBullMQ: boolean = false,
) {
  if (Sentry.isInitialized()) {
    const size = JSON.stringify(webScraperOptions).length;
@@ -187,11 +189,12 @@ export async function addScrapeJob(
          options,
          jobId,
          jobPriority,
+         directToBullMQ,
        );
      },
    );
  } else {
-   await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority);
+   await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority, directToBullMQ);
  }
}

View File

@@ -66,10 +66,10 @@ export function getIndexQueue() {
      connection: redisConnection,
      defaultJobOptions: {
        removeOnComplete: {
-         age: 90000, // 25 hours
+         age: 3600, // 1 hour
        },
        removeOnFail: {
-         age: 90000, // 25 hours
+         age: 3600, // 1 hour
        },
      },
    });
@@ -120,10 +120,10 @@ export function getBillingQueue() {
      connection: redisConnection,
      defaultJobOptions: {
        removeOnComplete: {
-         age: 90000, // 25 hours
+         age: 3600, // 1 hour
        },
        removeOnFail: {
-         age: 90000, // 25 hours
+         age: 3600, // 1 hour
        },
      },
    });
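
Note that BullMQ's removeOnComplete/removeOnFail age is expressed in seconds, so the previous 90000 was roughly 25 hours and the new 3600 is exactly one hour. A minimal sketch of the resulting retention config (queue name and connection values are hypothetical):

import { Queue } from "bullmq";

const billingQueue = new Queue("billing", {
  connection: { host: "localhost", port: 6379 }, // placeholder connection
  defaultJobOptions: {
    removeOnComplete: { age: 3600 }, // drop completed jobs after 1 hour
    removeOnFail: { age: 3600 },     // drop failed jobs after 1 hour
  },
});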

View File

@@ -61,7 +61,6 @@ import {
} from "../lib/concurrency-limit";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
- import { indexPage } from "../lib/extract/index/pinecone";
import { Document } from "../controllers/v1/types";
import {
  ExtractResult,
@@ -85,9 +84,11 @@ import https from "https";
import { cacheableLookup } from "../scraper/scrapeURL/lib/cacheableLookup";
import { robustFetch } from "../scraper/scrapeURL/lib/fetch";
import { RateLimiterMode } from "../types";
+ import { calculateCreditsToBeBilled } from "../lib/scrape-billing";
import { redisEvictConnection } from "./redis";
import { generateURLSplits, hashURL, index_supabase_service, useIndex } from "./index";
import { WebCrawler } from "../scraper/WebScraper/crawler";
+ import type { Logger } from "winston";

configDotenv();
@@ -320,7 +321,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
      scrapeOptions: sc.scrapeOptions,
      crawlerOptions: sc.crawlerOptions,
      origin: job.data.origin,
-   });
+   }, false, job.data.internalOptions?.bypassBilling ?? false);
    logger.info("Logged crawl!");

    const data = {
@@ -372,8 +373,10 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
        origin: job.data.origin,
      },
      true,
+     job.data.internalOptions?.bypassBilling ?? false,
    );

    // v1 web hooks, call when done with no data, but with event completed
    if (job.data.v1 && job.data.webhook) {
      callWebhook(
@@ -1133,6 +1136,59 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
  }
}

async function billScrapeJob(job: Job & { id: string }, document: Document, logger: Logger, costTracking?: CostTracking) {
let creditsToBeBilled: number | null = null;
if (job.data.is_scrape !== true && !job.data.internalOptions?.bypassBilling) {
creditsToBeBilled = await calculateCreditsToBeBilled(job.data.scrapeOptions, document, job.id, costTracking);
if (
job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID! &&
process.env.USE_DB_AUTHENTICATION === "true"
) {
try {
const billingJobId = uuidv4();
logger.debug(
`Adding billing job to queue for team ${job.data.team_id}`,
{
billingJobId,
credits: creditsToBeBilled,
is_extract: false,
},
);
// Add directly to the billing queue - the billing worker will handle the rest
await getBillingQueue().add(
"bill_team",
{
team_id: job.data.team_id,
subscription_id: undefined,
credits: creditsToBeBilled,
is_extract: false,
timestamp: new Date().toISOString(),
originating_job_id: job.id,
},
{
jobId: billingJobId,
priority: 10,
},
);
return creditsToBeBilled;
} catch (error) {
logger.error(
`Failed to add billing job to queue for team ${job.data.team_id} for ${creditsToBeBilled} credits`,
{ error },
);
Sentry.captureException(error);
return creditsToBeBilled;
}
}
}
return creditsToBeBilled;
}
async function processJob(job: Job & { id: string }, token: string) {
  const logger = _logger.child({
    module: "queue-worker",
@@ -1143,25 +1199,9 @@ async function processJob(job: Job & { id: string }, token: string) {
    teamId: job.data?.team_id ?? undefined,
  });
  logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url });
- const start = Date.now();
+ const start = job.data.startTime ?? Date.now();
+ const remainingTime = job.data.scrapeOptions.timeout ? (job.data.scrapeOptions.timeout - (Date.now() - start)) : undefined;

- // Check if the job URL is researchhub and block it immediately
- // TODO: remove this once solve the root issue
- // if (
- //   job.data.url &&
- //   (job.data.url.includes("researchhub.com") ||
- //     job.data.url.includes("ebay.com"))
- // ) {
- //   logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`);
- //   const data = {
- //     success: false,
- //     document: null,
- //     project_id: job.data.project_id,
- //     error:
- //       "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.",
- //   };
- //   return data;
- // }
  const costTracking = new CostTracking();

  try {
@@ -1172,6 +1212,11 @@
      current_url: "",
    });

+   if (remainingTime !== undefined && remainingTime < 0) {
+     throw new Error("timeout");
+   }
+   const signal = remainingTime ? AbortSignal.timeout(remainingTime) : undefined;
+
    if (job.data.crawl_id) {
      const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
      if (sc && sc.cancelled) {
@@ -1185,16 +1230,22 @@
        token,
        costTracking,
      }),
-     ...(job.data.scrapeOptions.timeout !== undefined
+     ...(remainingTime !== undefined
        ? [
            (async () => {
-             await sleep(job.data.scrapeOptions.timeout);
+             await sleep(remainingTime);
              throw new Error("timeout");
            })(),
          ]
        : []),
    ]);

+   try {
+     signal?.throwIfAborted();
+   } catch (e) {
+     throw new Error("timeout");
+   }
+
    if (!pipeline.success) {
      throw pipeline.error;
    }
@@ -1295,45 +1346,6 @@
      }
    }
logger.debug("Logging job to DB...");
await logJob(
{
job_id: job.id as string,
success: true,
num_docs: 1,
docs: [doc],
time_taken: timeTakenInSeconds,
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
crawlerOptions: sc.crawlerOptions,
scrapeOptions: job.data.scrapeOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
cost_tracking: costTracking,
pdf_num_pages: doc.metadata.numPages,
},
true,
);
if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
logger.debug("Calling webhook with success...", {
webhook: job.data.webhook,
});
await callWebhook(
job.data.team_id,
job.data.crawl_id,
data,
job.data.webhook,
job.data.v1,
job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
true,
);
}
logger.debug("Declaring job as done...");
await addCrawlJobDone(job.data.crawl_id, job.id, true);
    if (job.data.crawlerOptions !== null) {
      if (!sc.cancelled) {
        const crawler = crawlToCrawler(
@@ -1429,8 +1441,65 @@
        }
      }

try {
signal?.throwIfAborted();
} catch (e) {
throw new Error("timeout");
}
const credits_billed = await billScrapeJob(job, doc, logger, costTracking);
logger.debug("Logging job to DB...");
await logJob(
{
job_id: job.id as string,
success: true,
num_docs: 1,
docs: [doc],
time_taken: timeTakenInSeconds,
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
crawlerOptions: sc.crawlerOptions,
scrapeOptions: job.data.scrapeOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
cost_tracking: costTracking,
pdf_num_pages: doc.metadata.numPages,
credits_billed,
},
true,
job.data.internalOptions?.bypassBilling ?? false,
);
if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
logger.debug("Calling webhook with success...", {
webhook: job.data.webhook,
});
await callWebhook(
job.data.team_id,
job.data.crawl_id,
data,
job.data.webhook,
job.data.v1,
job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
true,
);
}
logger.debug("Declaring job as done...");
await addCrawlJobDone(job.data.crawl_id, job.id, true);
      await finishCrawlIfNeeded(job, sc);
    } else {
+     try {
+       signal?.throwIfAborted();
+     } catch (e) {
+       throw new Error("timeout");
+     }
+
+     const credits_billed = await billScrapeJob(job, doc, logger, costTracking);
+
      await logJob({
        job_id: job.id,
        success: true,
@@ -1446,66 +1515,8 @@ async function processJob(job: Job & { id: string }, token: string) {
        num_tokens: 0, // TODO: fix
        cost_tracking: costTracking,
        pdf_num_pages: doc.metadata.numPages,
-     });
-   }
+       credits_billed,
+     }, false, job.data.internalOptions?.bypassBilling ?? false);
if (job.data.is_scrape !== true) {
let creditsToBeBilled = 1; // Assuming 1 credit per document
if ((job.data.scrapeOptions.extract && job.data.scrapeOptions.formats?.includes("extract")) || (job.data.scrapeOptions.formats?.includes("changeTracking") && job.data.scrapeOptions.changeTrackingOptions?.modes?.includes("json"))) {
creditsToBeBilled = 5;
}
if (job.data.scrapeOptions.agent?.model?.toLowerCase() === "fire-1" || job.data.scrapeOptions.extract?.agent?.model?.toLowerCase() === "fire-1" || job.data.scrapeOptions.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") {
if (process.env.USE_DB_AUTHENTICATION === "true") {
creditsToBeBilled = Math.ceil((costTracking.toJSON().totalCost ?? 1) * 1800);
} else {
creditsToBeBilled = 150;
}
}
if (doc.metadata?.proxyUsed === "stealth") {
creditsToBeBilled += 4;
}
if (
job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID! &&
process.env.USE_DB_AUTHENTICATION === "true"
) {
try {
const billingJobId = uuidv4();
logger.debug(
`Adding billing job to queue for team ${job.data.team_id}`,
{
billingJobId,
credits: creditsToBeBilled,
is_extract: false,
},
);
// Add directly to the billing queue - the billing worker will handle the rest
await getBillingQueue().add(
"bill_team",
{
team_id: job.data.team_id,
subscription_id: undefined,
credits: creditsToBeBilled,
is_extract: false,
timestamp: new Date().toISOString(),
originating_job_id: job.id,
},
{
jobId: billingJobId,
priority: 10,
},
);
} catch (error) {
logger.error(
`Failed to add billing job to queue for team ${job.data.team_id} for ${creditsToBeBilled} credits`,
{ error },
);
Sentry.captureException(error);
}
}
    }

    logger.info(`🐂 Job done ${job.id}`);
@@ -1604,6 +1615,7 @@
        cost_tracking: costTracking,
      },
      true,
+     job.data.internalOptions?.bypassBilling ?? false,
    );
    return data;
  }
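
The startTime/remainingTime plumbing above makes the scrape timeout a shared budget: the controller stamps startTime when the job is enqueued, and the worker subtracts the time already spent waiting in the queue instead of restarting the clock. Illustrative arithmetic with made-up numbers:

const startTime = Date.now() - 15_000; // job was enqueued 15s ago
const timeout = 60_000;                // request-level scrape timeout (ms)

const remainingTime = timeout - (Date.now() - startTime); // ~45_000 ms left
if (remainingTime < 0) throw new Error("timeout");        // budget already spent in the queue

const signal = AbortSignal.timeout(remainingTime);        // used to abort the scrape pipeline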

View File

@@ -44,9 +44,15 @@ export interface WebScraperOptions {
  sitemapped?: boolean;
  webhook?: z.infer<typeof webhookSchema>;
  v1?: boolean;
+ /**
+  * Disables billing on the worker side.
+  */
  is_scrape?: boolean;
  isCrawlSourceScrape?: boolean;
  from_extract?: boolean;
+ startTime?: number;
}

export interface RunWebScraperParams {
@@ -95,6 +101,7 @@ export interface FirecrawlJob {
  sources?: Record<string, string[]>;
  cost_tracking?: CostTracking;
  pdf_num_pages?: number;
+ credits_billed?: number | null;
}

export interface FirecrawlScrapeResponse {

View File

@@ -680,7 +680,7 @@ checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"

[[package]]
name = "firecrawl"
- version = "1.1.0"
+ version = "1.2.0"
dependencies = [
 "assert_matches",
 "axum",

View File

@@ -1,7 +1,7 @@
[package]
name = "firecrawl"
author= "Mendable.ai"
- version = "1.1.0"
+ version = "1.2.0"
edition = "2021"
license = "MIT"
homepage = "https://www.firecrawl.dev/"

View File

@@ -99,6 +99,49 @@ impl From<CrawlScrapeOptions> for ScrapeOptions {
    }
}

/// Options for webhook notifications
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct WebhookOptions {
/// URL to send webhook notifications to
pub url: String,
/// Custom headers to include in webhook requests
pub headers: Option<HashMap<String, String>>,
/// Custom data included in all webhook payloads
pub metadata: Option<HashMap<String, String>>,
/// Event types to receive
pub events: Option<Vec<WebhookEvent>>,
}
impl From<String> for WebhookOptions {
fn from(value: String) -> Self {
Self {
url: value,
..Default::default()
}
}
}
#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)]
#[serde(rename_all = "camelCase")]
pub enum WebhookEvent {
/// Crawl finished successfully
Completed,
/// Crawl encountered an error
Failed,
/// Individual page scraped
Page,
/// Crawl job initiated
Started,
}

#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
@@ -132,7 +175,7 @@ pub struct CrawlOptions {
    pub allow_external_links: Option<bool>,

    /// URL to send Webhook crawl events to.
-   pub webhook: Option<String>,
+   pub webhook: Option<WebhookOptions>,

    /// Idempotency key to send to the crawl endpoint.
    #[serde(skip)]