import {
  ExtractorOptions,
  PageOptions,
  Document as V0Document,
} from "../../lib/entities";
import { Request, Response } from "express";
import {
  billTeam,
  checkTeamCredits,
} from "../../services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job";
import {
  fromLegacyCombo,
  fromLegacyScrapeOptions,
  toLegacyDocument,
  url as urlSchema,
} from "../v1/types";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import {
  defaultPageOptions,
  defaultExtractorOptions,
  defaultTimeout,
  defaultOrigin,
} from "../../lib/default-values";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { getScrapeQueue, redisConnection } from "../../services/queue-service";
import { v4 as uuidv4 } from "uuid";
import { logger } from "../../lib/logger";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
import { ZodError } from "zod";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
import { getJobFromGCS } from "../../lib/gcs-jobs";

// Runs a single v0 scrape: validates the URL, enqueues the job, waits for it
// to finish, and converts the result back into the legacy (v0) document shape.
export async function scrapeHelper(
  jobId: string,
  req: Request,
  team_id: string,
  crawlerOptions: any,
  pageOptions: PageOptions,
  extractorOptions: ExtractorOptions,
  timeout: number,
): Promise<{
  success: boolean;
  error?: string;
  data?: V0Document | { url: string };
  returnCode: number;
}> {
  const url = urlSchema.parse(req.body.url);
  if (typeof url !== "string") {
    return { success: false, error: "Url is required", returnCode: 400 };
  }

  if (isUrlBlocked(url)) {
    return {
      success: false,
      error: BLOCKLISTED_URL_MESSAGE,
      returnCode: 403,
    };
  }

  const jobPriority = await getJobPriority({ team_id, basePriority: 10 });

  // Translate the legacy v0 options into v1 scrape/internal options.
  const { scrapeOptions, internalOptions } = fromLegacyCombo(
    pageOptions,
    extractorOptions,
    timeout,
    crawlerOptions,
    team_id,
  );

  await addScrapeJob(
    {
      url,
      mode: "single_urls",
      team_id,
      scrapeOptions,
      internalOptions,
      origin: req.body.origin ?? defaultOrigin,
      is_scrape: true,
    },
    {},
    jobId,
    jobPriority,
  );

  let doc;

  const err = await Sentry.startSpan(
    {
      name: "Wait for job to finish",
      op: "bullmq.wait",
      attributes: { job: jobId },
    },
    async (span) => {
      try {
        doc = await waitForJob(jobId, timeout);
      } catch (e) {
        if (
          e instanceof Error &&
          (e.message.startsWith("Job wait") || e.message === "timeout")
        ) {
          span.setAttribute("timedOut", true);
          return {
            success: false,
            error: "Request timed out",
            returnCode: 408,
          };
        } else if (
          typeof e === "string" &&
          (e.includes("Error generating completions: ") ||
            e.includes("Invalid schema for function") ||
            e.includes(
              "LLM extraction did not match the extraction schema you provided.",
            ))
        ) {
          return {
            success: false,
            error: e,
            returnCode: 500,
          };
        } else {
          throw e;
        }
      }
      span.setAttribute("result", JSON.stringify(doc));
      return null;
    },
  );

  if (err !== null) {
    return err;
  }

  await getScrapeQueue().remove(jobId);

  if (!doc) {
    console.error("!!! PANIC DOC IS", doc);
    return {
      success: true,
      error: "No page found",
      returnCode: 200,
      data: doc,
    };
  }

  delete doc.index;
  delete doc.provider;

  // Drop rawHtml unless includeRawHtml was requested; it is only fetched here
  // to feed the llm-extraction-from-raw-html mode.
  if (
    !pageOptions.includeRawHtml &&
    extractorOptions.mode === "llm-extraction-from-raw-html"
  ) {
    if (doc.rawHtml) {
      delete doc.rawHtml;
    }
  }

  if (!pageOptions.includeHtml) {
    if (doc.html) {
      delete doc.html;
    }
  }

  return {
    success: true,
    data: toLegacyDocument(doc, internalOptions),
    returnCode: 200,
  };
}

// Express handler for the v0 scrape endpoint: authenticates the caller,
// checks team credits, runs the scrape, and bills any LLM-extraction credits.
export async function scrapeController(req: Request, res: Response) {
  try {
    let earlyReturn = false;

    // Authenticate the user first (Bearer token).
    const auth = await authenticateUser(req, res, RateLimiterMode.Scrape);
    if (!auth.success) {
      return res.status(auth.status).json({ error: auth.error });
    }
    const { team_id, chunk } = auth;

    redisConnection
      .sadd("teams_using_v0", team_id)
      .catch((error) =>
        logger.error("Failed to add team to teams_using_v0", {
          error,
          team_id,
        }),
      );

    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
    const extractorOptions = {
      ...defaultExtractorOptions,
      ...req.body.extractorOptions,
    };
    const origin = req.body.origin ?? defaultOrigin;
    let timeout = req.body.timeout ?? defaultTimeout;

    if (extractorOptions.mode.includes("llm-extraction")) {
      if (
        typeof extractorOptions.extractionSchema !== "object" ||
        extractorOptions.extractionSchema === null
      ) {
        return res.status(400).json({
          error:
            "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified",
        });
      }

      // LLM extraction works on the main content and needs a longer timeout.
      pageOptions.onlyMainContent = true;
      timeout = req.body.timeout ?? 90000;
    }

    // Check that the team has enough credits before enqueueing the job.
    try {
      const { success: creditsCheckSuccess, message: creditsCheckMessage } =
        await checkTeamCredits(chunk, team_id, 1);
      if (!creditsCheckSuccess) {
        earlyReturn = true;
        return res.status(402).json({
          error:
            "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing",
        });
      }
    } catch (error) {
      logger.error(error);
      earlyReturn = true;
      return res.status(500).json({
        error:
          "Error checking team credits. Please contact help@firecrawl.com for help.",
      });
    }

    const jobId = uuidv4();

    const startTime = new Date().getTime();
    const result = await scrapeHelper(
      jobId,
      req,
      team_id,
      crawlerOptions,
      pageOptions,
      extractorOptions,
      timeout,
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
    const numTokens =
      result.data && (result.data as V0Document).markdown
        ? numTokensFromString(
            (result.data as V0Document).markdown!,
            "gpt-3.5-turbo",
          )
        : 0;

    if (result.success) {
      let creditsToBeBilled = 1;
      const creditsPerLLMExtract = 4;

      if (extractorOptions.mode.includes("llm-extraction")) {
        creditsToBeBilled += creditsPerLLMExtract;
      }

      let startTimeBilling = new Date().getTime();

      if (earlyReturn) {
        // Don't bill if we returned early.
        return;
      }
      if (creditsToBeBilled > 0) {
        // Billing for the document itself happens when the queue job
        // completes; here we only bill the LLM-extraction surcharge.
        billTeam(team_id, chunk?.sub_id, creditsToBeBilled, logger).catch(
          (error) => {
            logger.error(
              `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
              { error },
            );
            // Optionally, notify an admin or add to a retry queue here.
          },
        );
      }
    }

    let doc = result.data;
    if (!pageOptions || !pageOptions.includeRawHtml) {
      if (doc && (doc as V0Document).rawHtml) {
        delete (doc as V0Document).rawHtml;
      }
    }

    if (pageOptions && pageOptions.includeExtract) {
      if (
        !pageOptions.includeMarkdown &&
        doc &&
        (doc as V0Document).markdown
      ) {
        delete (doc as V0Document).markdown;
      }
    }

    return res.status(result.returnCode).json(result);
  } catch (error) {
    Sentry.captureException(error);
    logger.error("Scrape error occurred", { error });
    return res.status(500).json({
      error:
        error instanceof ZodError
          ? "Invalid URL"
          : typeof error === "string"
            ? error
            : (error?.message ?? "Internal Server Error"),
    });
  }
}
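
// For reference, a request body this handler accepts might look like the
// sketch below. This is illustrative only: the route path is an assumption
// based on the v0 controller location, and the option values shown are
// examples layered over defaultPageOptions / defaultExtractorOptions from
// ../../lib/default-values, not an exhaustive schema.
//
//   POST /v0/scrape
//   {
//     "url": "https://example.com",
//     "origin": "api",
//     "timeout": 30000,
//     "pageOptions": { "includeHtml": false, "onlyMainContent": true },
//     "extractorOptions": {
//       "mode": "llm-extraction",
//       "extractionSchema": {
//         "type": "object",
//         "properties": { "title": { "type": "string" } }
//       }
//     }
//   }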