diff --git a/.github/workflows/test-server-self-host.yml b/.github/workflows/test-server-self-host.yml index 845c9eab..56232fb8 100644 --- a/.github/workflows/test-server-self-host.yml +++ b/.github/workflows/test-server-self-host.yml @@ -128,7 +128,7 @@ jobs: - name: Kill SearXNG if: always() && matrix.search == 'searxng' run: | - docker logs searxng > searxng/searxng.log 2>&1 + docker logs searxng > searxng.log 2>&1 docker kill searxng working-directory: ./ - uses: actions/upload-artifact@v4 @@ -149,5 +149,4 @@ jobs: with: name: SearXNG (${{ matrix.ai }}, ${{ matrix.engine }}, ${{ matrix.proxy }}) path: | - ./searxng/searxng.log - ./searxng/settings.yml + ./searxng.log diff --git a/apps/api/package.json b/apps/api/package.json index 2461df18..9866c3bc 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -69,7 +69,6 @@ "@google-cloud/storage": "^7.16.0", "@nangohq/node": "^0.40.8", "@openrouter/ai-sdk-provider": "^0.4.5", - "@pinecone-database/pinecone": "^4.0.0", "@sentry/cli": "^2.33.1", "@sentry/node": "^8.26.0", "@sentry/profiling-node": "^8.26.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 333cd85c..d394f007 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -59,9 +59,6 @@ importers: '@openrouter/ai-sdk-provider': specifier: ^0.4.5 version: 0.4.5(zod@3.24.2) - '@pinecone-database/pinecone': - specifier: ^4.0.0 - version: 4.0.0 '@sentry/cli': specifier: ^2.33.1 version: 2.33.1(encoding@0.1.13) @@ -1232,10 +1229,6 @@ packages: '@pdf-lib/upng@1.0.1': resolution: {integrity: sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==} - '@pinecone-database/pinecone@4.0.0': - resolution: {integrity: sha512-INYS+GBys9v5BRTyn0tv8srVsPTlSRvE3BPE4Wkc/lOEyAIyB9F7DEMXbeF19FOLEgRwCuHTLjzm1niENl+4FA==} - engines: {node: '>=18.0.0'} - '@pkgjs/parseargs@0.11.0': resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==} engines: {node: '>=14'} @@ -6331,10 +6324,6 @@ snapshots: dependencies: pako: 1.0.11 - '@pinecone-database/pinecone@4.0.0': - dependencies: - encoding: 0.1.13 - '@pkgjs/parseargs@0.11.0': optional: true @@ -7921,6 +7910,7 @@ snapshots: encoding@0.1.13: dependencies: iconv-lite: 0.6.3 + optional: true end-of-stream@1.4.4: dependencies: diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 06920f18..67c9f490 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -78,24 +78,24 @@ describe("Scrape tests", () => { expect(JSON.stringify(status)).toBe(JSON.stringify(response)); }, 60000); - // describe("Ad blocking (f-e dependant)", () => { - // it.concurrent("blocks ads by default", async () => { - // const response = await scrape({ - // url: "https://www.allrecipes.com/recipe/18185/yum/", - // }); + describe("Ad blocking (f-e dependant)", () => { + it.concurrent("blocks ads by default", async () => { + const response = await scrape({ + url: "https://www.allrecipes.com/recipe/18185/yum/", + }); - // expect(response.markdown).not.toContain(".g.doubleclick.net/"); - // }, 30000); + expect(response.markdown).not.toContain(".g.doubleclick.net/"); + }, 30000); - // it.concurrent("doesn't block ads if explicitly disabled", async () => { - // const response = await scrape({ - // url: "https://www.allrecipes.com/recipe/18185/yum/", - // blockAds: false, - // }); + it.concurrent("doesn't block ads if explicitly disabled", async () => { 
+ const response = await scrape({ + url: "https://www.allrecipes.com/recipe/18185/yum/", + blockAds: false, + }); - // expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//); - // }, 30000); - // }); + expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//); + }, 30000); + }); describe("Index", () => { it.concurrent("caches properly", async () => { diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index f7a9fbdd..559d222c 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -82,6 +82,7 @@ export async function scrapeHelper( internalOptions, origin: req.body.origin ?? defaultOrigin, is_scrape: true, + startTime: Date.now(), }, {}, jobId, diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index 15172e37..e4be9bd7 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -112,6 +112,7 @@ export async function searchHelper( team_id: team_id, scrapeOptions, internalOptions, + startTime: Date.now(), }, opts: { jobId: uuid, diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 092d86e8..75998c87 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -13,6 +13,7 @@ import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import { getJobPriority } from "../../lib/job-priority"; import { getScrapeQueue } from "../../services/queue-service"; import { supabaseGetJobById } from "../../lib/supabase-jobs"; +import { calculateCreditsToBeBilled } from "../../lib/scrape-billing"; export async function scrapeController( req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, @@ -30,7 +31,6 @@ export async function scrapeController( }); req.body = scrapeRequestSchema.parse(req.body); - let earlyReturn = false; const origin = req.body.origin; const timeout = req.body.timeout; @@ -42,6 +42,8 @@ export async function scrapeController( }); // + const isDirectToBullMQ = process.env.SEARCH_PREVIEW_TOKEN === req.body.__searchPreviewToken; + await addScrapeJob( { url: req.body.url, @@ -52,13 +54,16 @@ export async function scrapeController( teamId: req.auth.team_id, saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false, unnormalizedSourceURL: preNormalizedBody.url, + useCache: req.body.__experimental_cache ? true : false, + bypassBilling: isDirectToBullMQ, }, origin: req.body.origin, - is_scrape: true, + startTime, }, {}, jobId, jobPriority, + isDirectToBullMQ, ); const totalWait = @@ -124,57 +129,13 @@ export async function scrapeController( await getScrapeQueue().remove(jobId); - const endTime = new Date().getTime(); - const timeTakenInSeconds = (endTime - startTime) / 1000; - const numTokens = - doc && doc.extract - ? // ? 
numTokensFromString(doc.markdown, "gpt-3.5-turbo") - 0 // TODO: fix - : 0; - - let creditsToBeBilled = 1; // Assuming 1 credit per document - if (earlyReturn) { - // Don't bill if we're early returning - return; - } - if ((req.body.extract && req.body.formats?.includes("extract")) || (req.body.formats?.includes("changeTracking") && req.body.changeTrackingOptions?.modes?.includes("json"))) { - creditsToBeBilled = 5; - } - - if (req.body.agent?.model?.toLowerCase() === "fire-1" || req.body.extract?.agent?.model?.toLowerCase() === "fire-1" || req.body.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") { - if (process.env.USE_DB_AUTHENTICATION === "true") { - // @Nick this is a hack pushed at 2AM pls help - mogery - const job = await supabaseGetJobById(jobId); - if (!job?.cost_tracking) { - logger.warn("No cost tracking found for job", { - jobId, - }); - } - creditsToBeBilled = Math.ceil((job?.cost_tracking?.totalCost ?? 1) * 1800); - } else { - creditsToBeBilled = 150; - } - } - - if (doc?.metadata?.proxyUsed === "stealth") { - creditsToBeBilled += 4; - } - - billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch( - (error) => { - logger.error( - `Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`, - ); - // Optionally, you could notify an admin or add to a retry queue here - }, - ); - if (!req.body.formats.includes("rawHtml")) { if (doc && doc.rawHtml) { delete doc.rawHtml; } } + return res.status(200).json({ success: true, data: doc, diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index c9c9bf3c..92f3becf 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -40,24 +40,24 @@ export async function searchAndScrapeSearchResult( try { const searchResults = await search({ query, - num_results: 5 - }); + num_results: 5, + }); - const documents = await Promise.all( - searchResults.map(result => - scrapeSearchResult( - { - url: result.url, - title: result.title, - description: result.description - }, - options, - logger, - costTracking, - flags - ) - ) - ); + const documents = await Promise.all( + searchResults.map((result) => + scrapeSearchResult( + { + url: result.url, + title: result.title, + description: result.description, + }, + options, + logger, + costTracking, + flags, + ), + ), + ); return documents; } catch (error) { @@ -76,6 +76,8 @@ async function scrapeSearchResult( logger: Logger, costTracking: CostTracking, flags: TeamFlags, + directToBullMQ: boolean = false, + isSearchPreview: boolean = false, ): Promise { const jobId = uuidv4(); const jobPriority = await getJobPriority({ @@ -99,18 +101,19 @@ async function scrapeSearchResult( mode: "single_urls" as Mode, team_id: options.teamId, scrapeOptions: options.scrapeOptions, - internalOptions: { teamId: options.teamId, useCache: true }, + internalOptions: { teamId: options.teamId, useCache: true, bypassBilling: true }, origin: options.origin, is_scrape: true, - + startTime: Date.now(), }, {}, jobId, jobPriority, + directToBullMQ, ); const doc: Document = await waitForJob(jobId, options.timeout); - + logger.info("Scrape job completed", { scrapeId: jobId, url: searchResult.url, @@ -169,6 +172,8 @@ export async function searchController( }; const startTime = new Date().getTime(); const costTracking = new CostTracking(); + const isSearchPreview = + process.env.SEARCH_PREVIEW_TOKEN === req.body.__searchPreviewToken; try { req.body = searchRequestSchema.parse(req.body); @@ -197,7 +202,9 @@ export async 
function searchController( }); if (req.body.ignoreInvalidURLs) { - searchResults = searchResults.filter((result) => !isUrlBlocked(result.url, req.acuc?.flags ?? null)); + searchResults = searchResults.filter( + (result) => !isUrlBlocked(result.url, req.acuc?.flags ?? null), + ); } logger.info("Searching completed", { @@ -224,12 +231,20 @@ export async function searchController( } else { logger.info("Scraping search results"); const scrapePromises = searchResults.map((result) => - scrapeSearchResult(result, { - teamId: req.auth.team_id, - origin: req.body.origin, - timeout: req.body.timeout, - scrapeOptions: req.body.scrapeOptions, - }, logger, costTracking, req.acuc?.flags ?? null), + scrapeSearchResult( + result, + { + teamId: req.auth.team_id, + origin: req.body.origin, + timeout: req.body.timeout, + scrapeOptions: req.body.scrapeOptions, + }, + logger, + costTracking, + req.acuc?.flags ?? null, + (req.acuc?.price_credits ?? 0) <= 3000, + isSearchPreview, + ), ); const docs = await Promise.all(scrapePromises); @@ -255,11 +270,23 @@ export async function searchController( } // Bill team once for all successful results - billTeam(req.auth.team_id, req.acuc?.sub_id, responseData.data.length).catch((error) => { - logger.error( - `Failed to bill team ${req.auth.team_id} for ${responseData.data.length} credits: ${error}`, - ); - }); + if (!isSearchPreview) { + billTeam( + req.auth.team_id, + req.acuc?.sub_id, + responseData.data.reduce((a, x) => { + if (x.metadata?.numPages !== undefined && x.metadata.numPages > 0) { + return a + x.metadata.numPages; + } else { + return a + 1; + } + }, 0), + ).catch((error) => { + logger.error( + `Failed to bill team ${req.auth.team_id} for ${responseData.data.length} credits: ${error}`, + ); + }); + } const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; @@ -269,22 +296,25 @@ export async function searchController( time_taken: timeTakenInSeconds, }); - logJob({ - job_id: jobId, - success: true, - num_docs: responseData.data.length, - docs: responseData.data, - time_taken: timeTakenInSeconds, - team_id: req.auth.team_id, - mode: "search", - url: req.body.query, - scrapeOptions: req.body.scrapeOptions, - origin: req.body.origin, - cost_tracking: costTracking, - }); + logJob( + { + job_id: jobId, + success: true, + num_docs: responseData.data.length, + docs: responseData.data, + time_taken: timeTakenInSeconds, + team_id: req.auth.team_id, + mode: "search", + url: req.body.query, + scrapeOptions: req.body.scrapeOptions, + origin: req.body.origin, + cost_tracking: costTracking, + }, + false, + isSearchPreview, + ); return res.status(200).json(responseData); - } catch (error) { if ( error instanceof Error && diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 762e7b45..88433b4c 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -311,6 +311,8 @@ const baseScrapeOptions = z proxy: z.enum(["basic", "stealth", "auto"]).optional(), maxAge: z.number().int().gte(0).safe().default(0), storeInCache: z.boolean().default(true), + __experimental_cache: z.boolean().default(false).optional(), + __searchPreviewToken: z.string().optional(), }) .strict(strictMessage); @@ -1172,6 +1174,7 @@ export const searchRequestSchema = z origin: z.string().optional().default("api"), timeout: z.number().int().positive().finite().safe().default(60000), ignoreInvalidURLs: z.boolean().optional().default(false), + __searchPreviewToken: z.string().optional(), 
scrapeOptions: baseScrapeOptions .extend({ formats: z diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 933a980f..05fe005e 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -95,8 +95,13 @@ function startServer(port = DEFAULT_PORT) { logger.info(`Worker ${process.pid} listening on port ${port}`); }); - const exitHandler = () => { + const exitHandler = async () => { logger.info("SIGTERM signal received: closing HTTP server"); + if (process.env.IS_KUBERNETES === "true") { + // Account for GCE load balancer drain timeout + logger.info("Waiting 60s for GCE load balancer drain timeout"); + await new Promise((resolve) => setTimeout(resolve, 60000)); + } server.close(() => { logger.info("Server closed."); process.exit(0); diff --git a/apps/api/src/lib/deep-research/deep-research-service.ts b/apps/api/src/lib/deep-research/deep-research-service.ts index 2d2b5f91..52668839 100644 --- a/apps/api/src/lib/deep-research/deep-research-service.ts +++ b/apps/api/src/lib/deep-research/deep-research-service.ts @@ -133,6 +133,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { blockAds: false, maxAge: 0, dontStoreInCache: false, + __experimental_cache: true, }, }, logger, costTracking, acuc?.flags ?? null); return response.length > 0 ? response : []; diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts index 190252c8..799062b6 100644 --- a/apps/api/src/lib/extract/document-scraper.ts +++ b/apps/api/src/lib/extract/document-scraper.ts @@ -49,6 +49,7 @@ export async function scrapeDocument( origin: options.origin, is_scrape: true, from_extract: true, + startTime: Date.now(), }, {}, jobId, diff --git a/apps/api/src/lib/extract/fire-0/document-scraper-f0.ts b/apps/api/src/lib/extract/fire-0/document-scraper-f0.ts index b5f8f0bb..fa2f289e 100644 --- a/apps/api/src/lib/extract/fire-0/document-scraper-f0.ts +++ b/apps/api/src/lib/extract/fire-0/document-scraper-f0.ts @@ -47,6 +47,7 @@ export async function scrapeDocument_F0( origin: options.origin, is_scrape: true, from_extract: true, + startTime: Date.now(), }, {}, jobId, diff --git a/apps/api/src/lib/extract/index/pinecone.ts b/apps/api/src/lib/extract/index/pinecone.ts deleted file mode 100644 index 44aed960..00000000 --- a/apps/api/src/lib/extract/index/pinecone.ts +++ /dev/null @@ -1,164 +0,0 @@ -import { Pinecone } from "@pinecone-database/pinecone"; -import { Document } from "../../../controllers/v1/types"; -import { logger } from "../../logger"; -import { embed } from "ai"; -import { getEmbeddingModel } from "../../generic-ai"; - -const pinecone = new Pinecone({ - apiKey: process.env.PINECONE_API_KEY!, -}); - -const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? ""; - -const MAX_METADATA_SIZE = 30 * 1024; // 30KB in bytes - -export interface PageMetadata { - url: string; - originUrl: string; - title?: string; - description?: string; - crawlId?: string; - teamId?: string; - timestamp: number; - markdown?: string; -} - -async function getEmbedding(text: string) { - const { embedding } = await embed({ - model: getEmbeddingModel("text-embedding-3-small"), - value: text, - }); - - return embedding; -} - -function normalizeUrl(url: string) { - const urlO = new URL(url); - if (!urlO.hostname.startsWith("www.")) { - urlO.hostname = "www." 
+ urlO.hostname; - } - return urlO.href; -} - -export async function indexPage({ - document, - originUrl, - crawlId, - teamId, -}: { - document: Document; - originUrl: string; - crawlId?: string; - teamId?: string; -}) { - try { - const index = pinecone.index(INDEX_NAME); - - // Trim markdown if it's too long - let trimmedMarkdown = document.markdown; - if ( - trimmedMarkdown && - Buffer.byteLength(trimmedMarkdown, "utf-8") > MAX_METADATA_SIZE - ) { - trimmedMarkdown = trimmedMarkdown.slice( - 0, - Math.floor(MAX_METADATA_SIZE / 2), - ); // Using half the size to be safe with UTF-8 encoding - } - - // Create text to embed - const textToEmbed = [ - document.metadata.title, - document.metadata.description, - trimmedMarkdown, - ] - .filter(Boolean) - .join("\n\n"); - - // Get embedding from OpenAI - const embedding = await getEmbedding(textToEmbed); - - const normalizedUrl = normalizeUrl( - document.metadata.sourceURL || document.metadata.url!, - ); - - // Prepare metadata - const metadata: PageMetadata = { - url: normalizedUrl, - originUrl: normalizeUrl(originUrl), - title: document.metadata.title ?? document.metadata.ogTitle ?? "", - description: - document.metadata.description ?? document.metadata.ogDescription ?? "", - crawlId, - teamId, - markdown: trimmedMarkdown, - timestamp: Date.now(), - }; - - // Upsert to Pinecone - await index.upsert([ - { - id: normalizedUrl, - values: embedding, - metadata: { - ...metadata, - [document.metadata.sourceURL || document.metadata.url!]: true, - }, - }, - ]); - - logger.debug("Successfully indexed page in Pinecone", { - url: metadata.url, - crawlId, - }); - } catch (error) { - logger.error("Failed to index page in Pinecone", { - error, - url: document.metadata.sourceURL || document.metadata.url, - crawlId, - }); - } -} - -export async function searchSimilarPages( - query: string, - originUrl?: string, - limit: number = 1000, -): Promise { - try { - const index = pinecone.index(INDEX_NAME); - - // Get query embedding from OpenAI - const queryEmbedding = await getEmbedding(query); - - const queryParams: any = { - vector: queryEmbedding, - topK: limit, - includeMetadata: true, - }; - - const normalizedOriginUrl = originUrl ? 
normalizeUrl(originUrl) : undefined; - // Add filter if originUrl is provided - if (normalizedOriginUrl) { - queryParams.filter = { - originUrl: { $eq: normalizedOriginUrl }, - }; - } - - const results = await index.query(queryParams); - return results.matches.map((match) => ({ - url: match.metadata?.url, - title: match.metadata?.title, - description: match.metadata?.description, - score: match.score, - markdown: match.metadata?.markdown, - })); - } catch (error) { - logger.error("Failed to search similar pages in Pinecone", { - error, - query, - originUrl, - }); - return []; - } -} diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts index 62b1d266..6ff6e77c 100644 --- a/apps/api/src/lib/extract/reranker.ts +++ b/apps/api/src/lib/extract/reranker.ts @@ -4,7 +4,6 @@ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { logger } from "../logger"; import { CohereClient } from "cohere-ai"; import { extractConfig } from "./config"; -import { searchSimilarPages } from "./index/pinecone"; import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract"; import { buildRerankerUserPrompt } from "./build-prompts"; import { buildRerankerSystemPrompt } from "./build-prompts"; diff --git a/apps/api/src/lib/scrape-billing.ts b/apps/api/src/lib/scrape-billing.ts new file mode 100644 index 00000000..aabe42b0 --- /dev/null +++ b/apps/api/src/lib/scrape-billing.ts @@ -0,0 +1,49 @@ +import { Document, ScrapeOptions } from "../controllers/v1/types"; +import { supabaseGetJobById } from "./supabase-jobs"; +import { logger } from "./logger"; +import { CostTracking } from "./extract/extraction-service"; + +const creditsPerPDFPage = 1; +const stealthProxyCostBonus = 4; + +export async function calculateCreditsToBeBilled(options: ScrapeOptions, document: Document, jobId: string, costTracking?: any) { + let creditsToBeBilled = 1; // Assuming 1 credit per document + if ((options.extract && options.formats?.includes("extract")) || (options.formats?.includes("changeTracking") && options.changeTrackingOptions?.modes?.includes("json"))) { + creditsToBeBilled = 5; + } + + if (options.agent?.model?.toLowerCase() === "fire-1" || options.extract?.agent?.model?.toLowerCase() === "fire-1" || options.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") { + if (process.env.USE_DB_AUTHENTICATION === "true") { + // @Nick this is a hack pushed at 2AM pls help - mogery + if (!costTracking) { + const job = await supabaseGetJobById(jobId); + costTracking = job?.cost_tracking; + } + + if (!costTracking) { + logger.warn("No cost tracking found for job", { + jobId, + scrapeId: jobId + }); + } + + if (costTracking instanceof CostTracking) { + costTracking = costTracking.toJSON(); + } + + creditsToBeBilled = Math.ceil((costTracking?.totalCost ?? 
1) * 1800); + } else { + creditsToBeBilled = 150; + } + } + + if (document.metadata.numPages !== undefined && document.metadata.numPages > 1) { + creditsToBeBilled += creditsPerPDFPage * (document.metadata.numPages - 1); + } + + if (document?.metadata?.proxyUsed === "stealth") { + creditsToBeBilled += stealthProxyCostBonus; + } + + return creditsToBeBilled; +} \ No newline at end of file diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 2f9b6495..cddba97b 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -15,7 +15,6 @@ import { ScrapeUrlResponse, } from "../scraper/scrapeURL"; import { Engine } from "../scraper/scrapeURL/engines"; -import { indexPage } from "../lib/extract/index/pinecone"; import { CostTracking } from "../lib/extract/extraction-service"; configDotenv(); diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 8bbacb4c..3e8490c4 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -224,7 +224,6 @@ export async function scrapeURLWithFireEngineChromeCDP( mobile: meta.options.mobile, timeout, // TODO: better timeout logic disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, - blockAds: meta.options.blockAds, mobileProxy: meta.featureFlags.has("stealthProxy"), saveScrapeResultToGCS: meta.internalOptions.saveScrapeResultToGCS, // TODO: scrollXPaths diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts index e0c44e18..5261b697 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts @@ -12,7 +12,6 @@ export type FireEngineScrapeRequestCommon = { headers?: { [K: string]: string }; blockMedia?: boolean; // default: true - blockAds?: boolean; // default: true // pageOptions?: any; // unused, .scrollXPaths is considered on FE side // useProxy?: boolean; // unused, default: true @@ -39,7 +38,6 @@ export type FireEngineScrapeRequestChromeCDP = { blockMedia?: true; // cannot be false mobile?: boolean; disableSmartWaitCache?: boolean; - blockAds?: boolean; // default: true saveScrapeResultToGCS?: boolean; }; @@ -58,7 +56,6 @@ export type FireEngineScrapeRequestTLSClient = { engine: "tlsclient"; atsv?: boolean; // v0 only, default: false disableJsDom?: boolean; // v0 only, default: false - // blockAds?: boolean; // default: true }; const schema = z.object({ diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 450bb121..79e70f1f 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -74,6 +74,7 @@ export const featureFlags = [ "skipTlsVerification", "useFastMode", "stealthProxy", + "disableAdblock", ] as const; export type FeatureFlag = (typeof featureFlags)[number]; @@ -95,6 +96,7 @@ export const featureFlagOptions: { mobile: { priority: 10 }, skipTlsVerification: { priority: 10 }, stealthProxy: { priority: 20 }, + disableAdblock: { priority: 10 }, } as const; export type EngineScrapeResult = { @@ -171,6 +173,7 @@ export const engineOptions: { skipTlsVerification: false, useFastMode: false, stealthProxy: false, + disableAdblock: false, }, quality: 1000, // cache should always be tried first }, @@ -205,6 +208,7 @@ export const 
engineOptions: { skipTlsVerification: true, useFastMode: false, stealthProxy: false, + disableAdblock: false, }, quality: 50, }, @@ -222,6 +226,7 @@ export const engineOptions: { skipTlsVerification: true, useFastMode: false, stealthProxy: false, + disableAdblock: false, }, quality: 45, }, @@ -256,6 +261,7 @@ export const engineOptions: { skipTlsVerification: true, useFastMode: false, stealthProxy: true, + disableAdblock: false, }, quality: -2, }, @@ -273,6 +279,7 @@ export const engineOptions: { skipTlsVerification: true, useFastMode: false, stealthProxy: true, + disableAdblock: false, }, quality: -5, }, @@ -290,6 +297,7 @@ export const engineOptions: { skipTlsVerification: false, useFastMode: false, stealthProxy: false, + disableAdblock: true, }, quality: 40, }, @@ -307,6 +315,7 @@ export const engineOptions: { skipTlsVerification: false, useFastMode: false, stealthProxy: true, + disableAdblock: true, }, quality: -10, }, @@ -324,6 +333,7 @@ export const engineOptions: { skipTlsVerification: false, useFastMode: false, stealthProxy: false, + disableAdblock: false, }, quality: 20, }, @@ -341,6 +351,7 @@ export const engineOptions: { skipTlsVerification: false, useFastMode: true, stealthProxy: false, + disableAdblock: false, }, quality: 10, }, @@ -358,6 +369,7 @@ export const engineOptions: { skipTlsVerification: false, useFastMode: true, stealthProxy: true, + disableAdblock: false, }, quality: -15, }, @@ -375,6 +387,7 @@ export const engineOptions: { skipTlsVerification: false, useFastMode: true, stealthProxy: false, + disableAdblock: false, }, quality: 5, }, @@ -392,6 +405,7 @@ export const engineOptions: { skipTlsVerification: false, useFastMode: true, stealthProxy: true, // kinda... + disableAdblock: true, }, quality: -20, }, @@ -409,6 +423,7 @@ export const engineOptions: { skipTlsVerification: false, useFastMode: true, stealthProxy: true, // kinda... + disableAdblock: true, }, quality: -20, }, diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 2d4c57d6..7fc043b7 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -120,6 +120,10 @@ function buildFeatureFlags( flags.add("docx"); } + if (options.blockAds === false) { + flags.add("disableAdblock"); + } + return flags; } @@ -187,6 +191,7 @@ export type InternalOptions = { unnormalizedSourceURL?: string; saveScrapeResultToGCS?: boolean; // Passed along to fire-engine + bypassBilling?: boolean; }; export type EngineResultsTracker = { diff --git a/apps/api/src/search/fireEngine.ts b/apps/api/src/search/fireEngine.ts index 832c4405..2928b3b6 100644 --- a/apps/api/src/search/fireEngine.ts +++ b/apps/api/src/search/fireEngine.ts @@ -5,6 +5,58 @@ import { logger } from "../lib/logger"; dotenv.config(); + +export async function fire_engine_search( + q: string, + options: { + tbs?: string; + filter?: string; + lang?: string; + country?: string; + location?: string; + numResults: number; + page?: number; + }, + abort?: AbortSignal, +): Promise { + try { + let data = JSON.stringify({ + query: q, + lang: options.lang, + country: options.country, + location: options.location, + tbs: options.tbs, + numResults: options.numResults, + page: options.page ?? 
1, + }); + + if (!process.env.FIRE_ENGINE_BETA_URL) { + return []; + } + + const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-Disable-Cache": "true", + }, + body: data, + signal: abort, + }); + + if (response.ok) { + const responseData = await response.json(); + return responseData; + } else { + return []; + } + } catch (error) { + logger.error(error); + Sentry.captureException(error); + return []; + } +} + export async function fireEngineMap( q: string, options: { @@ -34,7 +86,7 @@ export async function fireEngineMap( return []; } - const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/search`, { + const response = await fetch(`${process.env.FIRE_ENGINE_BETA_URL}/map`, { method: "POST", headers: { "Content-Type": "application/json", diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index 05948b35..f0a2d612 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -4,6 +4,7 @@ import { googleSearch } from "./googlesearch"; import { searchapi_search } from "./searchapi"; import { serper_search } from "./serper"; import { searxng_search } from "./searxng"; +import { fire_engine_search } from "./fireEngine"; export async function search({ query, @@ -31,8 +32,19 @@ export async function search({ timeout?: number; }): Promise { try { + if (process.env.FIRE_ENGINE_BETA_URL) { + const results = await fire_engine_search(query, { + numResults: num_results, + tbs, + filter, + lang, + country, + location, + }); + if (results.length > 0) return results; + } if (process.env.SERPER_API_KEY) { - return await serper_search(query, { + const results = await serper_search(query, { num_results, tbs, filter, @@ -40,9 +52,10 @@ export async function search({ country, location, }); + if (results.length > 0) return results; } if (process.env.SEARCHAPI_API_KEY) { - return await searchapi_search(query, { + const results = await searchapi_search(query, { num_results, tbs, filter, @@ -50,9 +63,10 @@ export async function search({ country, location, }); + if (results.length > 0) return results; } if (process.env.SEARXNG_ENDPOINT) { - return await searxng_search(query, { + const results = await searxng_search(query, { num_results, tbs, filter, @@ -60,6 +74,7 @@ export async function search({ country, location, }); + if (results.length > 0) return results; } return await googleSearch( query, diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 162ddb43..262d4015 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -21,12 +21,13 @@ function cleanOfNull(x: T): T { } } -export async function logJob(job: FirecrawlJob, force: boolean = false) { +export async function logJob(job: FirecrawlJob, force: boolean = false, bypassLogging: boolean = false) { try { const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; if (!useDbAuthentication) { return; } + // Redact any pages that have an authorization header // actually, Don't. we use the db to retrieve results now. this breaks authed crawls - mogery @@ -63,12 +64,17 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { is_migrated: true, cost_tracking: job.cost_tracking, pdf_num_pages: job.pdf_num_pages ?? null, + credits_billed: job.credits_billed ?? 
null, }; if (process.env.GCS_BUCKET_NAME) { await saveJobToGCS(job); } + if (bypassLogging) { + return; + } + if (force) { let i = 0, done = false; diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 3456c82c..c368d121 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -97,6 +97,7 @@ async function addScrapeJobRaw( options: any, jobId: string, jobPriority: number, + directToBullMQ: boolean = false, ) { const hasCrawlDelay = webScraperOptions.crawl_id && webScraperOptions.crawlerOptions?.delay; @@ -127,7 +128,7 @@ async function addScrapeJobRaw( const concurrencyQueueJobs = await getConcurrencyQueueJobsCount(webScraperOptions.team_id); - if (concurrencyLimited) { + if (concurrencyLimited && !directToBullMQ) { // Detect if they hit their concurrent limit // If above by 2x, send them an email // No need to 2x as if there are more than the max concurrency in the concurrency queue, it is already 2x @@ -161,6 +162,7 @@ export async function addScrapeJob( options: any = {}, jobId: string = uuidv4(), jobPriority: number = 10, + directToBullMQ: boolean = false, ) { if (Sentry.isInitialized()) { const size = JSON.stringify(webScraperOptions).length; @@ -187,11 +189,12 @@ export async function addScrapeJob( options, jobId, jobPriority, + directToBullMQ, ); }, ); } else { - await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority); + await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority, directToBullMQ); } } diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index b2353296..b4938dce 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -66,10 +66,10 @@ export function getIndexQueue() { connection: redisConnection, defaultJobOptions: { removeOnComplete: { - age: 90000, // 25 hours + age: 3600, // 1 hour }, removeOnFail: { - age: 90000, // 25 hours + age: 3600, // 1 hour }, }, }); @@ -120,10 +120,10 @@ export function getBillingQueue() { connection: redisConnection, defaultJobOptions: { removeOnComplete: { - age: 90000, // 25 hours + age: 3600, // 1 hour }, removeOnFail: { - age: 90000, // 25 hours + age: 3600, // 1 hour }, }, }); diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 75ed92c1..791a945f 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -61,7 +61,6 @@ import { } from "../lib/concurrency-limit"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; -import { indexPage } from "../lib/extract/index/pinecone"; import { Document } from "../controllers/v1/types"; import { ExtractResult, @@ -85,9 +84,11 @@ import https from "https"; import { cacheableLookup } from "../scraper/scrapeURL/lib/cacheableLookup"; import { robustFetch } from "../scraper/scrapeURL/lib/fetch"; import { RateLimiterMode } from "../types"; +import { calculateCreditsToBeBilled } from "../lib/scrape-billing"; import { redisEvictConnection } from "./redis"; import { generateURLSplits, hashURL, index_supabase_service, useIndex } from "./index"; import { WebCrawler } from "../scraper/WebScraper/crawler"; +import type { Logger } from "winston"; configDotenv(); @@ -320,7 +321,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { scrapeOptions: sc.scrapeOptions, crawlerOptions: sc.crawlerOptions, origin: job.data.origin, - }); + }, false, 
job.data.internalOptions?.bypassBilling ?? false); logger.info("Logged crawl!"); const data = { @@ -372,8 +373,10 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { origin: job.data.origin, }, true, + job.data.internalOptions?.bypassBilling ?? false, ); + // v1 web hooks, call when done with no data, but with event completed if (job.data.v1 && job.data.webhook) { callWebhook( @@ -1133,6 +1136,59 @@ async function processKickoffJob(job: Job & { id: string }, token: string) { } } +async function billScrapeJob(job: Job & { id: string }, document: Document, logger: Logger, costTracking?: CostTracking) { + let creditsToBeBilled: number | null = null; + + if (job.data.is_scrape !== true && !job.data.internalOptions?.bypassBilling) { + creditsToBeBilled = await calculateCreditsToBeBilled(job.data.scrapeOptions, document, job.id, costTracking); + + if ( + job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID! && + process.env.USE_DB_AUTHENTICATION === "true" + ) { + try { + const billingJobId = uuidv4(); + logger.debug( + `Adding billing job to queue for team ${job.data.team_id}`, + { + billingJobId, + credits: creditsToBeBilled, + is_extract: false, + }, + ); + + // Add directly to the billing queue - the billing worker will handle the rest + await getBillingQueue().add( + "bill_team", + { + team_id: job.data.team_id, + subscription_id: undefined, + credits: creditsToBeBilled, + is_extract: false, + timestamp: new Date().toISOString(), + originating_job_id: job.id, + }, + { + jobId: billingJobId, + priority: 10, + }, + ); + + return creditsToBeBilled; + } catch (error) { + logger.error( + `Failed to add billing job to queue for team ${job.data.team_id} for ${creditsToBeBilled} credits`, + { error }, + ); + Sentry.captureException(error); + return creditsToBeBilled; + } + } + } + + return creditsToBeBilled; +} + async function processJob(job: Job & { id: string }, token: string) { const logger = _logger.child({ module: "queue-worker", @@ -1143,25 +1199,9 @@ async function processJob(job: Job & { id: string }, token: string) { teamId: job.data?.team_id ?? undefined, }); logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url }); - const start = Date.now(); + const start = job.data.startTime ?? Date.now(); + const remainingTime = job.data.scrapeOptions.timeout ? (job.data.scrapeOptions.timeout - (Date.now() - start)) : undefined; - // Check if the job URL is researchhub and block it immediately - // TODO: remove this once solve the root issue - // if ( - // job.data.url && - // (job.data.url.includes("researchhub.com") || - // job.data.url.includes("ebay.com")) - // ) { - // logger.info(`🐂 Blocking job ${job.id} with URL ${job.data.url}`); - // const data = { - // success: false, - // document: null, - // project_id: job.data.project_id, - // error: - // "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.", - // }; - // return data; - // } const costTracking = new CostTracking(); try { @@ -1172,6 +1212,11 @@ async function processJob(job: Job & { id: string }, token: string) { current_url: "", }); + if (remainingTime !== undefined && remainingTime < 0) { + throw new Error("timeout"); + } + const signal = remainingTime ? 
AbortSignal.timeout(remainingTime) : undefined; + if (job.data.crawl_id) { const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; if (sc && sc.cancelled) { @@ -1185,16 +1230,22 @@ async function processJob(job: Job & { id: string }, token: string) { token, costTracking, }), - ...(job.data.scrapeOptions.timeout !== undefined + ...(remainingTime !== undefined ? [ (async () => { - await sleep(job.data.scrapeOptions.timeout); + await sleep(remainingTime); throw new Error("timeout"); })(), ] : []), ]); + try { + signal?.throwIfAborted(); + } catch (e) { + throw new Error("timeout"); + } + if (!pipeline.success) { throw pipeline.error; } @@ -1295,45 +1346,6 @@ async function processJob(job: Job & { id: string }, token: string) { } } - logger.debug("Logging job to DB..."); - await logJob( - { - job_id: job.id as string, - success: true, - num_docs: 1, - docs: [doc], - time_taken: timeTakenInSeconds, - team_id: job.data.team_id, - mode: job.data.mode, - url: job.data.url, - crawlerOptions: sc.crawlerOptions, - scrapeOptions: job.data.scrapeOptions, - origin: job.data.origin, - crawl_id: job.data.crawl_id, - cost_tracking: costTracking, - pdf_num_pages: doc.metadata.numPages, - }, - true, - ); - - if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) { - logger.debug("Calling webhook with success...", { - webhook: job.data.webhook, - }); - await callWebhook( - job.data.team_id, - job.data.crawl_id, - data, - job.data.webhook, - job.data.v1, - job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page", - true, - ); - } - - logger.debug("Declaring job as done..."); - await addCrawlJobDone(job.data.crawl_id, job.id, true); - if (job.data.crawlerOptions !== null) { if (!sc.cancelled) { const crawler = crawlToCrawler( @@ -1429,8 +1441,65 @@ async function processJob(job: Job & { id: string }, token: string) { } } + try { + signal?.throwIfAborted(); + } catch (e) { + throw new Error("timeout"); + } + + const credits_billed = await billScrapeJob(job, doc, logger, costTracking); + + logger.debug("Logging job to DB..."); + await logJob( + { + job_id: job.id as string, + success: true, + num_docs: 1, + docs: [doc], + time_taken: timeTakenInSeconds, + team_id: job.data.team_id, + mode: job.data.mode, + url: job.data.url, + crawlerOptions: sc.crawlerOptions, + scrapeOptions: job.data.scrapeOptions, + origin: job.data.origin, + crawl_id: job.data.crawl_id, + cost_tracking: costTracking, + pdf_num_pages: doc.metadata.numPages, + credits_billed, + }, + true, + job.data.internalOptions?.bypassBilling ?? false, + ); + + if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) { + logger.debug("Calling webhook with success...", { + webhook: job.data.webhook, + }); + await callWebhook( + job.data.team_id, + job.data.crawl_id, + data, + job.data.webhook, + job.data.v1, + job.data.crawlerOptions !== null ? 
"crawl.page" : "batch_scrape.page", + true, + ); + } + + logger.debug("Declaring job as done..."); + await addCrawlJobDone(job.data.crawl_id, job.id, true); + await finishCrawlIfNeeded(job, sc); } else { + try { + signal?.throwIfAborted(); + } catch (e) { + throw new Error("timeout"); + } + + const credits_billed = await billScrapeJob(job, doc, logger, costTracking); + await logJob({ job_id: job.id, success: true, @@ -1446,66 +1515,8 @@ async function processJob(job: Job & { id: string }, token: string) { num_tokens: 0, // TODO: fix cost_tracking: costTracking, pdf_num_pages: doc.metadata.numPages, - }); - } - - if (job.data.is_scrape !== true) { - let creditsToBeBilled = 1; // Assuming 1 credit per document - if ((job.data.scrapeOptions.extract && job.data.scrapeOptions.formats?.includes("extract")) || (job.data.scrapeOptions.formats?.includes("changeTracking") && job.data.scrapeOptions.changeTrackingOptions?.modes?.includes("json"))) { - creditsToBeBilled = 5; - } - - if (job.data.scrapeOptions.agent?.model?.toLowerCase() === "fire-1" || job.data.scrapeOptions.extract?.agent?.model?.toLowerCase() === "fire-1" || job.data.scrapeOptions.jsonOptions?.agent?.model?.toLowerCase() === "fire-1") { - if (process.env.USE_DB_AUTHENTICATION === "true") { - creditsToBeBilled = Math.ceil((costTracking.toJSON().totalCost ?? 1) * 1800); - } else { - creditsToBeBilled = 150; - } - } - - if (doc.metadata?.proxyUsed === "stealth") { - creditsToBeBilled += 4; - } - - if ( - job.data.team_id !== process.env.BACKGROUND_INDEX_TEAM_ID! && - process.env.USE_DB_AUTHENTICATION === "true" - ) { - try { - const billingJobId = uuidv4(); - logger.debug( - `Adding billing job to queue for team ${job.data.team_id}`, - { - billingJobId, - credits: creditsToBeBilled, - is_extract: false, - }, - ); - - // Add directly to the billing queue - the billing worker will handle the rest - await getBillingQueue().add( - "bill_team", - { - team_id: job.data.team_id, - subscription_id: undefined, - credits: creditsToBeBilled, - is_extract: false, - timestamp: new Date().toISOString(), - originating_job_id: job.id, - }, - { - jobId: billingJobId, - priority: 10, - }, - ); - } catch (error) { - logger.error( - `Failed to add billing job to queue for team ${job.data.team_id} for ${creditsToBeBilled} credits`, - { error }, - ); - Sentry.captureException(error); - } - } + credits_billed, + }, false, job.data.internalOptions?.bypassBilling ?? false); } logger.info(`🐂 Job done ${job.id}`); @@ -1604,6 +1615,7 @@ async function processJob(job: Job & { id: string }, token: string) { cost_tracking: costTracking, }, true, + job.data.internalOptions?.bypassBilling ?? false, ); return data; } diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 7b2cc640..2e18ba3d 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -44,9 +44,15 @@ export interface WebScraperOptions { sitemapped?: boolean; webhook?: z.infer; v1?: boolean; + + /** + * Disables billing on the worker side. 
+ */ is_scrape?: boolean; + isCrawlSourceScrape?: boolean; from_extract?: boolean; + startTime?: number; } export interface RunWebScraperParams { @@ -95,6 +101,7 @@ export interface FirecrawlJob { sources?: Record; cost_tracking?: CostTracking; pdf_num_pages?: number; + credits_billed?: number | null; } export interface FirecrawlScrapeResponse { diff --git a/apps/rust-sdk/Cargo.lock b/apps/rust-sdk/Cargo.lock index d7a7b64e..7fe40518 100644 --- a/apps/rust-sdk/Cargo.lock +++ b/apps/rust-sdk/Cargo.lock @@ -680,7 +680,7 @@ checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "firecrawl" -version = "1.1.0" +version = "1.2.0" dependencies = [ "assert_matches", "axum", diff --git a/apps/rust-sdk/Cargo.toml b/apps/rust-sdk/Cargo.toml index 3affd864..b703a22c 100644 --- a/apps/rust-sdk/Cargo.toml +++ b/apps/rust-sdk/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "firecrawl" author= "Mendable.ai" -version = "1.1.0" +version = "1.2.0" edition = "2021" license = "MIT" homepage = "https://www.firecrawl.dev/" diff --git a/apps/rust-sdk/src/crawl.rs b/apps/rust-sdk/src/crawl.rs index b6522c45..6d27ff28 100644 --- a/apps/rust-sdk/src/crawl.rs +++ b/apps/rust-sdk/src/crawl.rs @@ -99,6 +99,49 @@ impl From for ScrapeOptions { } } +/// Options for webhook notifications +#[serde_with::skip_serializing_none] +#[derive(Deserialize, Serialize, Debug, Default, Clone)] +#[serde(rename_all = "camelCase")] +pub struct WebhookOptions { + /// URL to send webhook notifications to + pub url: String, + + /// Custom headers to include in webhook requests + pub headers: Option>, + + /// Custom data included in all webhook payloads + pub metadata: Option>, + + /// Event types to receive + pub events: Option>, +} + +impl From for WebhookOptions { + fn from(value: String) -> Self { + Self { + url: value, + ..Default::default() + } + } +} + +#[derive(Deserialize, Serialize, Debug, PartialEq, Eq, Clone, Copy)] +#[serde(rename_all = "camelCase")] +pub enum WebhookEvent { + /// Crawl finished successfully + Completed, + + /// Crawl encountered an error + Failed, + + /// Individual page scraped + Page, + + /// Crawl job initiated + Started, +} + #[serde_with::skip_serializing_none] #[derive(Deserialize, Serialize, Debug, Default, Clone)] #[serde(rename_all = "camelCase")] @@ -132,7 +175,7 @@ pub struct CrawlOptions { pub allow_external_links: Option, /// URL to send Webhook crawl events to. - pub webhook: Option, + pub webhook: Option, /// Idempotency key to send to the crawl endpoint. #[serde(skip)]
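
A minimal standalone sketch of the billing rules that the new apps/api/src/lib/scrape-billing.ts consolidates: 1 credit per document, 5 when JSON extraction or JSON change tracking is requested, FIRE-1 agent runs billed from tracked cost (totalCost * 1800, rounded up) with a flat 150-credit fallback when cost tracking is unavailable, 1 extra credit per PDF page after the first, and a 4-credit bonus for stealth proxy use. The option and document shapes below are trimmed stand-ins, not the real ScrapeOptions/Document types.

    // Trimmed stand-ins for the real ScrapeOptions / Document types (assumption).
    type SketchScrapeOptions = {
      formats?: string[];
      extract?: { agent?: { model?: string } };
      jsonOptions?: { agent?: { model?: string } };
      agent?: { model?: string };
      changeTrackingOptions?: { modes?: string[] };
    };
    type SketchDocument = { metadata: { numPages?: number; proxyUsed?: string } };

    const creditsPerPdfPage = 1;
    const stealthProxyCostBonus = 4;

    function sketchCreditsToBeBilled(
      options: SketchScrapeOptions,
      document: SketchDocument,
      trackedTotalCost?: number, // cost-tracking total, when available
    ): number {
      let credits = 1; // base: one credit per document

      const jsonWork =
        (options.extract && options.formats?.includes("extract")) ||
        (options.formats?.includes("changeTracking") &&
          options.changeTrackingOptions?.modes?.includes("json"));
      if (jsonWork) credits = 5;

      const usesFire1 = [options.agent, options.extract?.agent, options.jsonOptions?.agent]
        .some((agent) => agent?.model?.toLowerCase() === "fire-1");
      if (usesFire1) {
        // Billed from tracked cost when available, flat 150 credits otherwise.
        credits = trackedTotalCost !== undefined ? Math.ceil(trackedTotalCost * 1800) : 150;
      }

      const pages = document.metadata.numPages ?? 0;
      if (pages > 1) credits += creditsPerPdfPage * (pages - 1);

      if (document.metadata.proxyUsed === "stealth") credits += stealthProxyCostBonus;

      return credits;
    }

    // A 3-page PDF scraped over a stealth proxy: 1 + 2 + 4 = 7 credits.
    console.log(sketchCreditsToBeBilled({}, { metadata: { numPages: 3, proxyUsed: "stealth" } }));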
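
The v1 search controller now bills one credit per page for results that report a PDF page count, rather than one credit per result. The reduce it uses boils down to the following (types trimmed for illustration):

    type BilledDoc = { metadata?: { numPages?: number } };

    // One credit per result, or one per page when the scrape reported a PDF page count.
    function searchCredits(docs: BilledDoc[]): number {
      return docs.reduce((total, doc) => {
        const pages = doc.metadata?.numPages;
        return total + (pages !== undefined && pages > 0 ? pages : 1);
      }, 0);
    }

    console.log(searchCredits([{ metadata: { numPages: 4 } }, {}, {}])); // 6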
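
The change to apps/api/src/search/index.ts makes every provider call fall through to the next provider when it returns no results, with the fire-engine /search endpoint tried first when FIRE_ENGINE_BETA_URL is set. A condensed sketch of that fallthrough pattern (the provider thunks named in the usage comment are illustrative, not the real module exports):

    type SearchResult = { url: string; title?: string; description?: string };
    type SearchProvider = () => Promise<SearchResult[]>;

    async function searchWithFallthrough(providers: SearchProvider[]): Promise<SearchResult[]> {
      for (const provider of providers) {
        // An empty result set no longer short-circuits the chain; try the next provider.
        const results = await provider();
        if (results.length > 0) return results;
      }
      return [];
    }

    // Usage, ordered the way the patch orders the providers:
    // await searchWithFallthrough([fireEngine, serper, searchApi, searxng, google]);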
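
Scrape enqueuers now stamp startTime: Date.now() onto the job payload, and the worker derives the remaining timeout budget from that stamp instead of restarting the clock when it picks the job up, failing fast if the budget was spent while the job sat in the queue. A rough sketch of that accounting, with the racing scrape pipeline replaced by a generic work callback:

    function sleep(ms: number): Promise<void> {
      return new Promise((resolve) => setTimeout(resolve, ms));
    }

    async function runWithTimeoutBudget<T>(
      work: (signal?: AbortSignal) => Promise<T>,
      startTime: number,   // stamped when the job was enqueued
      timeoutMs?: number,  // scrapeOptions.timeout, if any
    ): Promise<T> {
      const remaining =
        timeoutMs !== undefined ? timeoutMs - (Date.now() - startTime) : undefined;

      if (remaining !== undefined && remaining < 0) {
        throw new Error("timeout"); // budget already spent while queued
      }

      const signal = remaining !== undefined ? AbortSignal.timeout(remaining) : undefined;

      const result = await Promise.race([
        work(signal),
        ...(remaining !== undefined
          ? [
              sleep(remaining).then(() => {
                throw new Error("timeout");
              }),
            ]
          : []),
      ]);

      signal?.throwIfAborted(); // a late abort is surfaced here rather than returning a stale result
      return result;
    }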
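
The SIGTERM handler in apps/api/src/index.ts now waits out the GCE load-balancer drain window before closing the HTTP server when IS_KUBERNETES=true. A minimal sketch of that shutdown shape (the port and request handler are placeholders):

    import http from "node:http";

    const server = http.createServer((_req, res) => {
      res.end("ok");
    });
    server.listen(3002); // placeholder port

    process.on("SIGTERM", async () => {
      if (process.env.IS_KUBERNETES === "true") {
        // Keep the listener up while the load balancer drains connections.
        await new Promise((resolve) => setTimeout(resolve, 60_000));
      }
      server.close(() => process.exit(0));
    });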