diff --git a/apps/api/src/controllers/v1/auth.ts b/apps/api/src/controllers/v1/auth.ts
index bd45648d..bc6951c9 100644
--- a/apps/api/src/controllers/v1/auth.ts
+++ b/apps/api/src/controllers/v1/auth.ts
@@ -26,13 +26,7 @@ export async function supaAuthenticateUser(
   req,
   res,
   mode?: RateLimiterMode
-): Promise<{
-  success: boolean;
-  team_id?: string;
-  error?: string;
-  status?: number;
-  plan?: string;
-}> {
+): Promise<AuthResponse> {
   const authHeader = req.headers.authorization;
   if (!authHeader) {
     return { success: false, error: "Unauthorized", status: 401 };
@@ -106,7 +100,7 @@ export async function supaAuthenticateUser(
       setTrace(team_id, normalizedApi);
       subscriptionData = {
         team_id: team_id,
-        plan: plan
+        plan: plan,
       }
       switch (mode) {
         case RateLimiterMode.Crawl:
@@ -195,7 +189,12 @@ export async function supaAuthenticateUser(
     subscriptionData = data[0];
   }
 
-  return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
+  return {
+    success: true,
+    team_id: subscriptionData.team_id,
+    plan: subscriptionData.plan ?? "",
+    api_key: normalizedApi
+  };
 }
 
 function getPlanByPriceId(price_id: string) {
   switch (price_id) {
diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts
index a61a7f6a..3a7d0e32 100644
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@@ -1,217 +1,120 @@
 import { Request, Response } from "express";
 import { Logger } from '../../lib/logger';
-import { checkAndUpdateURL } from '../../lib/validateUrl';
-import { RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
+import { Document, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
+import { billTeam } from "../../services/billing/credit_billing";
+import { v4 as uuidv4 } from 'uuid';
+import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
+import { addScrapeJob } from "../../services/queue-jobs";
+import { scrapeQueueEvents } from '../../services/queue-service';
+import { logJob } from "../../services/logging/log_job";
 
 export async function scrapeController(req: RequestWithAuth, res: Response) {
-  req.body = scrapeRequestSchema.parse(req.body);
-  console.log(req.body);
-
-  // TODO: check req.body
-  // mockup req.body
-  // req.body = {
-  //   url: "test",
-  //   headers: {
-  //     "x-key": "test"
-  //   },
-  //   formats: ["markdown", "html", "rawHtml", "content", "linksOnPage", "screenshot", "fullPageScreenshot"],
-  //   includeTags: ["test"],
-  //   excludeTags: ["test"],
-  //   onlyMainContent: false,
-  //   timeout: 30000,
-  //   waitFor: number
-  // }
-
+  req.body = scrapeRequestSchema.parse(req.body);
   let earlyReturn = false;
-  // make sure to authenticate user first, Bearer
-  // check credits
+  const origin = req.body.origin;
+  const timeout = req.body.timeout;
+  const pageOptions = legacyScrapeOptions(req.body);
 
-  const result: ScrapeResponse = {
-    success: true,
-    warning: "test",
-    data: {
-      markdown: "test",
-      html: "test",
-      rawHtml: "test",
-      links: ["test1", "test2"],
-      screenshot: "test",
-      metadata: {
-        title: "test",
-        description: "test",
-        language: "test",
-        sourceURL: "test",
-        statusCode: 200,
-        error: "test"
-      }
+  const jobId = uuidv4();
+
+  const startTime = new Date().getTime();
+  const job = await addScrapeJob({
+    url: req.body.url,
+    mode: "single_urls",
+    crawlerOptions: {},
+    team_id: req.auth.team_id,
+    pageOptions,
+    extractorOptions: {},
+    origin: req.body.origin,
+  }, {}, jobId);
+
+  let doc: any | undefined;
+  try {
+    doc = (await job.waitUntilFinished(scrapeQueueEvents, timeout))[0]; // 60 seconds timeout
+  } catch (e) {
+    Logger.error(`Error in scrapeController: ${e}`);
+    if (e instanceof Error && e.message.startsWith("Job wait")) {
+      return res.status(408).json({
+        success: false,
+        error: "Request timed out",
+      });
+    } else {
+      return res.status(500).json({
+        success: false,
+        error: "Internal server error",
+      });
     }
   }
-  return res.status(200).json(result);
+  await job.remove();
 
-  // const crawlerOptions = req.body.crawlerOptions ?? {};
-  // const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
-  // const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
-  // const origin = req.body.origin ?? defaultOrigin;
-  // let timeout = req.body.timeout ?? defaultTimeout;
+  if (!doc) {
+    console.error("!!! PANIC DOC IS", doc, job);
+    return res.status(200).json({
+      success: true,
+      warning: "No page found",
+      data: doc
+    });
+  }
 
-  // if (extractorOptions.mode.includes("llm-extraction")) {
-  //   pageOptions.onlyMainContent = true;
-  //   timeout = req.body.timeout ?? 90000;
-  // }
+  delete doc.index;
+  delete doc.provider;
 
-  // const checkCredits = async () => {
-  //   try {
-  //     const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
-  //     if (!creditsCheckSuccess) {
-  //       earlyReturn = true;
-  //       return res.status(402).json({ error: "Insufficient credits" });
-  //     }
-  //   } catch (error) {
-  //     Logger.error(error);
-  //     earlyReturn = true;
-  //     return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
-  //   }
-  // };
+  const endTime = new Date().getTime();
+  const timeTakenInSeconds = (endTime - startTime) / 1000;
+  const numTokens = (doc && doc.markdown) ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") : 0;
 
+  let creditsToBeBilled = 1; // Assuming 1 credit per document
+  if (earlyReturn) {
+    // Don't bill if we're early returning
+    return;
+  }
 
-  // await checkCredits();
+  const billingResult = await billTeam(
+    req.auth.team_id,
+    creditsToBeBilled
+  );
+  if (!billingResult.success) {
+    return res.status(402).json({
+      success: false,
+      error: "Failed to bill team. Insufficient credits or subscription not found.",
+    });
+  }
 
-  // const jobId = uuidv4();
+  logJob({
+    job_id: jobId,
+    success: true,
+    message: "Scrape completed",
+    num_docs: 1,
+    docs: [doc],
+    time_taken: timeTakenInSeconds,
+    team_id: req.auth.team_id,
+    mode: "scrape",
+    url: req.body.url,
+    crawlerOptions: {},
+    pageOptions: pageOptions,
+    origin: origin,
+    extractor_options: { mode: "markdown" },
+    num_tokens: numTokens,
+  });
 
-  // const startTime = new Date().getTime();
-  // const result = await scrapeHelper(
-  //   jobId,
-  //   req,
-  //   team_id,
-  //   crawlerOptions,
-  //   pageOptions,
-  //   extractorOptions,
-  //   timeout,
-  //   plan
-  // );
-  // const endTime = new Date().getTime();
-  // const timeTakenInSeconds = (endTime - startTime) / 1000;
-  // const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
-
-  // if (result.success) {
-  //   let creditsToBeBilled = 1; // Assuming 1 credit per document
-  //   const creditsPerLLMExtract = 50;
-
-  //   if (extractorOptions.mode.includes("llm-extraction")) {
-  //     // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
-  //     creditsToBeBilled += creditsPerLLMExtract;
-  //   }
-
-  //   let startTimeBilling = new Date().getTime();
-
-  //   if (earlyReturn) {
-  //     // Don't bill if we're early returning
-  //     return;
-  //   }
-  //   const billingResult = await billTeam(
-  //     team_id,
-  //     creditsToBeBilled
-  //   );
-  //   if (!billingResult.success) {
-  //     return res.status(402).json({
-  //       success: false,
-  //       error: "Failed to bill team. Insufficient credits or subscription not found.",
-  //     });
-  //   }
-  // }
-
-  // logJob({
-  //   job_id: jobId,
-  //   success: result.success,
-  //   message: result.error,
-  //   num_docs: 1,
-  //   docs: [result.data],
-  //   time_taken: timeTakenInSeconds,
-  //   team_id: team_id,
-  //   mode: "scrape",
-  //   url: req.body.url,
-  //   crawlerOptions: crawlerOptions,
-  //   pageOptions: pageOptions,
-  //   origin: origin,
-  //   extractor_options: extractorOptions,
-  //   num_tokens: numTokens,
-  // });
-
-
-  // return res.status(result.returnCode).json(result);
-}
-
-
-// export async function scrapeHelper(
-//   jobId: string,
-//   req: Request,
-//   team_id: string,
-//   crawlerOptions: any,
-//   pageOptions: PageOptions,
-//   extractorOptions: ExtractorOptions,
-//   timeout: number,
-//   plan?: string
-// ): Promise<{
-//   success: boolean;
-//   error?: string;
-//   data?: Document;
-//   returnCode: number;
-// }> {
-
-  // const url = req.body.url;
-  // if (!url) {
-  //   return { success: false, error: "Url is required", returnCode: 400 };
-  // }
-
-  // if (isUrlBlocked(url)) {
-  //   return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
-  // }
-
-  // const a = new WebScraperDataProvider();
-  // await a.setOptions({
-  //   jobId,
-  //   mode: "single_urls",
-  //   urls: [url],
-  //   crawlerOptions: {
-  //     ...crawlerOptions,
-  //   },
-  //   pageOptions: pageOptions,
-  //   extractorOptions: extractorOptions,
-  // });
-
-  // const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
-  //   setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
-  // );
-
-  // const docsPromise = a.getDocuments(false);
-
-  // let docs;
-  // try {
-  //   docs = await Promise.race([docsPromise, timeoutPromise]);
-  // } catch (error) {
-  //   return error;
-  // }
-
-  // // make sure doc.content is not empty
-  // let filteredDocs = docs.filter(
-  //   (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
-  // );
-  // if (filteredDocs.length === 0) {
-  //   return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
-  // }
-
-
-  // // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
-  // if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
-  //   filteredDocs.forEach(doc => {
-  //     delete doc.rawHtml;
-  //   });
-  // }
-
-  // return {
-  //   success: true,
-  //   data: filteredDocs[0],
-  //   returnCode: 200,
-  // };
-// }
\ No newline at end of file
+  return res.status(200).json({
+    success: true,
+    data: {
+      markdown: doc.markdown,
+      links: doc.linksOnPage,
+      rawHtml: doc.rawHtml,
+      html: doc.html,
+      screenshot: doc.screenshot,
+      fullPageScreenshot: doc.fullPageScreenshot,
+      metadata: {
+        ...doc.metadata,
+        pageError: undefined,
+        pageStatusCode: undefined,
+        error: doc.metadata.pageError,
+        statusCode: doc.metadata.pageStatusCode,
+      },
+    } as Document
+  });
+}
\ No newline at end of file
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 3caf6f7e..df586b3a 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -204,5 +204,8 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
     removeTags: x.excludeTags,
     onlyMainContent: x.onlyMainContent,
     waitFor: x.waitFor,
+    includeLinks: x.formats.includes("links"),
+    screenshot: x.formats.includes("screenshot"),
+    fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
   };
 }
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 92dd4c7c..361017e8 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -24,6 +24,7 @@ export type PageOptions = {
   parsePDF?: boolean;
   removeTags?: string | string[];
   onlyIncludeTags?: string | string[];
+  includeLinks?: boolean;
 };
 
 export type ExtractorOptions = {
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index df9d04ab..434464ae 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -130,6 +130,7 @@ export async function scrapSingleUrl(
     screenshot: false,
     fullPageScreenshot: false,
     headers: undefined,
+    includeLinks: true
   },
   extractorOptions: ExtractorOptions = {
     mode: "llm-extraction-from-markdown",
@@ -361,7 +362,9 @@ export async function scrapSingleUrl(
 
   let linksOnPage: string[] | undefined;
 
-  linksOnPage = extractLinks(rawHtml, urlToScrap);
+  if (pageOptions.includeLinks) {
+    linksOnPage = extractLinks(rawHtml, urlToScrap);
+  }
 
   let document: Document;
   if (screenshot && screenshot.length > 0) {
@@ -374,7 +377,7 @@ export async function scrapSingleUrl(
       extractorOptions.mode === "llm-extraction-from-raw-html"
         ? rawHtml
         : undefined,
-      linksOnPage,
+      linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
      metadata: {
        ...metadata,
        screenshot: screenshot,
@@ -399,7 +402,7 @@ export async function scrapSingleUrl(
        pageStatusCode: pageStatusCode,
        pageError: pageError,
      },
-      linksOnPage,
+      linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
    };
  }
 
@@ -415,7 +418,7 @@ export async function scrapSingleUrl(
      content: "",
      markdown: "",
      html: "",
-      linksOnPage: [],
+      linksOnPage: pageOptions.includeLinks ? [] : undefined,
      metadata: {
        sourceURL: urlToScrap,
        pageStatusCode: pageStatusCode,
diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts
index b092d310..ca5ff3e3 100644
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -114,6 +114,7 @@ export interface AuthResponse {
   error?: string;
   status?: number;
   plan?: string;
+  api_key?: string;
 }
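
For context, below is a minimal client-side sketch (not part of the patch) showing how a caller might exercise the reworked v1 scrape controller and read the response shape assembled at the end of `scrapeController`. The route path `/v1/scrape`, the base URL, and the exact set of request fields accepted by `scrapeRequestSchema` are assumptions inferred from the file layout and the options handled in `legacyScrapeOptions`, not something this diff confirms; the response fields (`data.markdown`, `data.links`, `data.metadata.statusCode`, and the 402/408/500 error paths) mirror the controller code above.

```typescript
// Hypothetical usage sketch. Endpoint path, base URL, and request field names are
// assumptions inferred from apps/api/src/controllers/v1/scrape.ts and legacyScrapeOptions.
interface ScrapeData {
  markdown?: string;
  links?: string[];
  rawHtml?: string;
  html?: string;
  screenshot?: string;
  metadata: { statusCode?: number; error?: string; sourceURL?: string };
}

async function scrapeOnce(url: string, apiKey: string): Promise<ScrapeData> {
  const res = await fetch("https://api.example.com/v1/scrape", { // assumed base URL and path
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`, // bearer key checked by supaAuthenticateUser
    },
    body: JSON.stringify({
      url,
      // "links" and "screenshot" map onto includeLinks / screenshot in legacyScrapeOptions;
      // "markdown" is assumed to be the default content format.
      formats: ["markdown", "links"],
      timeout: 30000, // forwarded to job.waitUntilFinished in the controller
    }),
  });

  const body = (await res.json()) as { success: boolean; error?: string; data?: ScrapeData };
  if (!res.ok || !body.success || !body.data) {
    // 402 = billing failure, 408 = job wait timeout, 500 = internal error (see scrapeController)
    throw new Error(body.error ?? `Scrape failed with HTTP ${res.status}`);
  }
  return body.data;
}
```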