diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts
index c178f0c8..ac1ab544 100644
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@@ -54,6 +54,7 @@ export async function scrapeController(
       saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
       unnormalizedSourceURL: preNormalizedBody.url,
       useCache: req.body.__experimental_cache ? true : false,
+      bypassBilling: isDirectToBullMQ,
     },
     origin: req.body.origin,
     startTime,
@@ -133,6 +134,7 @@ export async function scrapeController(
     }
   }
 
+
   return res.status(200).json({
     success: true,
     data: doc,
diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts
index 68e426da..6f91aa34 100644
--- a/apps/api/src/controllers/v1/search.ts
+++ b/apps/api/src/controllers/v1/search.ts
@@ -40,24 +40,24 @@ export async function searchAndScrapeSearchResult(
   try {
     const searchResults = await search({
       query,
-      num_results: 5
-    });
+      num_results: 5,
+    });
 
-    const documents = await Promise.all(
-      searchResults.map(result =>
-        scrapeSearchResult(
-          {
-            url: result.url,
-            title: result.title,
-            description: result.description
-          },
-          options,
-          logger,
-          costTracking,
-          flags
-        )
-      )
-    );
+    const documents = await Promise.all(
+      searchResults.map((result) =>
+        scrapeSearchResult(
+          {
+            url: result.url,
+            title: result.title,
+            description: result.description,
+          },
+          options,
+          logger,
+          costTracking,
+          flags,
+        ),
+      ),
+    );
 
     return documents;
   } catch (error) {
@@ -112,7 +112,7 @@ async function scrapeSearchResult(
   );
 
   const doc: Document = await waitForJob(jobId, options.timeout);
-  
+
   logger.info("Scrape job completed", {
     scrapeId: jobId,
     url: searchResult.url,
@@ -171,6 +171,8 @@ export async function searchController(
   };
   const startTime = new Date().getTime();
   const costTracking = new CostTracking();
+  const isSearchPreview =
+    process.env.SEARCH_PREVIEW_TOKEN === req.body.__searchPreviewToken;
 
   try {
     req.body = searchRequestSchema.parse(req.body);
@@ -199,7 +201,9 @@ export async function searchController(
     });
 
     if (req.body.ignoreInvalidURLs) {
-      searchResults = searchResults.filter((result) => !isUrlBlocked(result.url, req.acuc?.flags ?? null));
+      searchResults = searchResults.filter(
+        (result) => !isUrlBlocked(result.url, req.acuc?.flags ?? null),
+      );
     }
 
     logger.info("Searching completed", {
@@ -226,12 +230,19 @@ export async function searchController(
     } else {
       logger.info("Scraping search results");
       const scrapePromises = searchResults.map((result) =>
-        scrapeSearchResult(result, {
-          teamId: req.auth.team_id,
-          origin: req.body.origin,
-          timeout: req.body.timeout,
-          scrapeOptions: req.body.scrapeOptions,
-        }, logger, costTracking, req.acuc?.flags ?? null, (req.acuc?.price_credits ?? 0) <= 3000),
+        scrapeSearchResult(
+          result,
+          {
+            teamId: req.auth.team_id,
+            origin: req.body.origin,
+            timeout: req.body.timeout,
+            scrapeOptions: req.body.scrapeOptions,
+          },
+          logger,
+          costTracking,
+          req.acuc?.flags ?? null,
+          (req.acuc?.price_credits ?? 0) <= 3000,
+        ),
       );
 
       const docs = await Promise.all(scrapePromises);
@@ -257,17 +268,23 @@ export async function searchController(
     }
 
     // Bill team once for all successful results
-    billTeam(req.auth.team_id, req.acuc?.sub_id, responseData.data.reduce((a,x) => {
-      if (x.metadata?.numPages !== undefined && x.metadata.numPages > 0) {
-        return a + x.metadata.numPages;
-      } else {
-        return a + 1;
-      }
-    }, 0)).catch((error) => {
-      logger.error(
-        `Failed to bill team ${req.auth.team_id} for ${responseData.data.length} credits: ${error}`,
-      );
-    });
+    if (!isSearchPreview) {
+      billTeam(
+        req.auth.team_id,
+        req.acuc?.sub_id,
+        responseData.data.reduce((a, x) => {
+          if (x.metadata?.numPages !== undefined && x.metadata.numPages > 0) {
+            return a + x.metadata.numPages;
+          } else {
+            return a + 1;
+          }
+        }, 0),
+      ).catch((error) => {
+        logger.error(
+          `Failed to bill team ${req.auth.team_id} for ${responseData.data.length} credits: ${error}`,
+        );
+      });
+    }
 
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -277,22 +294,25 @@ export async function searchController(
       time_taken: timeTakenInSeconds,
     });
 
-    logJob({
-      job_id: jobId,
-      success: true,
-      num_docs: responseData.data.length,
-      docs: responseData.data,
-      time_taken: timeTakenInSeconds,
-      team_id: req.auth.team_id,
-      mode: "search",
-      url: req.body.query,
-      scrapeOptions: req.body.scrapeOptions,
-      origin: req.body.origin,
-      cost_tracking: costTracking,
-    });
+    logJob(
+      {
+        job_id: jobId,
+        success: true,
+        num_docs: responseData.data.length,
+        docs: responseData.data,
+        time_taken: timeTakenInSeconds,
+        team_id: req.auth.team_id,
+        mode: "search",
+        url: req.body.query,
+        scrapeOptions: req.body.scrapeOptions,
+        origin: req.body.origin,
+        cost_tracking: costTracking,
+      },
+      false,
+      isSearchPreview,
+    );
 
     return res.status(200).json(responseData);
-
   } catch (error) {
     if (
       error instanceof Error &&
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 24ad28e1..541ee920 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -1169,6 +1169,7 @@ export const searchRequestSchema = z
     origin: z.string().optional().default("api"),
     timeout: z.number().int().positive().finite().safe().default(60000),
     ignoreInvalidURLs: z.boolean().optional().default(false),
+    __searchPreviewToken: z.string().optional(),
     scrapeOptions: baseScrapeOptions
       .extend({
         formats: z
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index 2f926a96..bb4e8ed4 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -189,6 +189,7 @@ export type InternalOptions = {
   unnormalizedSourceURL?: string;
 
   saveScrapeResultToGCS?: boolean; // Passed along to fire-engine
+  bypassBilling?: boolean;
 };
 
 export type EngineResultsTracker = {
diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts
index efa6b158..262d4015 100644
--- a/apps/api/src/services/logging/log_job.ts
+++ b/apps/api/src/services/logging/log_job.ts
@@ -21,12 +21,13 @@ function cleanOfNull<T>(x: T): T {
   }
 }
 
-export async function logJob(job: FirecrawlJob, force: boolean = false) {
+export async function logJob(job: FirecrawlJob, force: boolean = false, bypassLogging: boolean = false) {
   try {
     const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
     if (!useDbAuthentication) {
       return;
     }
+
     // Redact any pages that have an authorization header
     // actually, Don't. we use the db to retrieve results now. this breaks authed crawls - mogery
@@ -70,6 +71,10 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) {
       await saveJobToGCS(job);
     }
 
+    if (bypassLogging) {
+      return;
+    }
+
     if (force) {
       let i = 0,
         done = false;
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index afc93b6a..6d419873 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -319,7 +319,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
         scrapeOptions: sc.scrapeOptions,
         crawlerOptions: sc.crawlerOptions,
         origin: job.data.origin,
-      });
+      }, false, job.data.internalOptions?.bypassBilling ?? false);
       logger.info("Logged crawl!");
 
       const data = {
@@ -371,8 +371,10 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
           origin: job.data.origin,
         },
         true,
+        job.data.internalOptions?.bypassBilling ?? false,
       );
 
+
       // v1 web hooks, call when done with no data, but with event completed
       if (job.data.v1 && job.data.webhook) {
         callWebhook(
@@ -1048,7 +1050,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
 
 async function billScrapeJob(job: Job & { id: string }, document: Document, logger: Logger, costTracking?: CostTracking) {
   let creditsToBeBilled: number | null = null;
-  if (job.data.is_scrape !== true) {
+  if (job.data.is_scrape !== true && !job.data.internalOptions?.bypassBilling) {
     creditsToBeBilled = await calculateCreditsToBeBilled(job.data.scrapeOptions, document, job.id, costTracking);
 
     if (
@@ -1378,6 +1380,7 @@ async function processJob(job: Job & { id: string }, token: string) {
           credits_billed,
         },
         true,
+        job.data.internalOptions?.bypassBilling ?? false,
      );
 
      if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
@@ -1424,7 +1427,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         cost_tracking: costTracking,
         pdf_num_pages: doc.metadata.numPages,
         credits_billed,
-      });
+      }, false, job.data.internalOptions?.bypassBilling ?? false);
     }
 
     logger.info(`🐂 Job done ${job.id}`);
@@ -1523,6 +1526,7 @@ async function processJob(job: Job & { id: string }, token: string) {
        cost_tracking: costTracking,
      },
      true,
+      job.data.internalOptions?.bypassBilling ?? false,
    );
    return data;
  }
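
Taken together, the patch threads a single "don't bill, don't log" signal through three layers: the search controller derives isSearchPreview by comparing the shared SEARCH_PREVIEW_TOKEN secret against the request's __searchPreviewToken, billTeam and logJob are gated on that flag, and scrape jobs carry the equivalent bypassBilling flag inside InternalOptions so the queue worker skips billing as well. The TypeScript sketch below condenses that control flow under stated assumptions: billTeam, logJob, and maybeBillScrape are stand-in stubs with simplified signatures, not Firecrawl's real implementations, and the explicit "token must be set" guard is an extra safety of this sketch that the diff itself does not add (the patch compares the two values directly).

// preview-gating.ts — condensed sketch of the bypass flow in this patch.
// Only the gating logic mirrors the diff; everything else is a stub.

type InternalOptions = { bypassBilling?: boolean };

async function billTeam(teamId: string, credits: number): Promise<void> {
  // stand-in for the real billing call
}

async function logJob(
  job: { job_id: string },
  force: boolean = false,
  bypassLogging: boolean = false, // new third parameter added by the patch
): Promise<void> {
  if (bypassLogging) return; // preview runs leave no job record
  // stand-in for the real DB logging
}

// Controller side: a request is a "preview" when it carries the shared token.
// NOTE: the undefined guard is an assumption of this sketch; without it, an
// unset SEARCH_PREVIEW_TOKEN would match requests that omit the token, since
// undefined === undefined is true.
function isSearchPreview(token?: string): boolean {
  return (
    process.env.SEARCH_PREVIEW_TOKEN !== undefined &&
    process.env.SEARCH_PREVIEW_TOKEN === token
  );
}

// Worker side: billScrapeJob now skips billing when the job was enqueued
// with bypassBilling, mirroring the new condition in queue-worker.ts.
async function maybeBillScrape(
  jobData: { is_scrape?: boolean; internalOptions?: InternalOptions },
  teamId: string,
  credits: number,
): Promise<void> {
  if (jobData.is_scrape !== true && !jobData.internalOptions?.bypassBilling) {
    await billTeam(teamId, credits);
  }
}

One placement detail worth noting from the diff itself: in log_job.ts the bypassLogging early return sits after the saveJobToGCS block, so a preview search neither bills the team nor persists a database job record, while the GCS save earlier in logJob still runs.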