diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index 889b5c2b..b54a14df 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -161,7 +161,7 @@ export async function crawlController(req: Request, res: Response) { team_id ); internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter - + internalOptions.saveScrapeResultToGCS = process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false; delete (scrapeOptions as any).timeout; const sc: StoredCrawl = { diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index b89d6c10..55bbc691 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -69,6 +69,8 @@ export async function scrapeHelper( team_id, ); + internalOptions.saveScrapeResultToGCS = process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false; + await addScrapeJob( { url, diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index d5896f6d..962cd333 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -82,7 +82,11 @@ export async function batchScrapeController( : { crawlerOptions: null, scrapeOptions: req.body, - internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter + internalOptions: { + disableSmartWaitCache: true, + teamId: req.auth.team_id, + saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false, + }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter team_id: req.auth.team_id, createdAt: Date.now(), }; @@ -121,6 +125,9 @@ export async function batchScrapeController( sitemapped: true, v1: true, webhook: req.body.webhook, + internalOptions: { + saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false, + }, }, opts: { jobId: uuidv4(), diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 17089c7b..bd81bdc8 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -80,7 +80,11 @@ export async function crawlController( originUrl: req.body.url, crawlerOptions: toLegacyCrawlerOptions(crawlerOptions), scrapeOptions, - internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter + internalOptions: { + disableSmartWaitCache: true, + teamId: req.auth.team_id, + saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false, + }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter team_id: req.auth.team_id, createdAt: Date.now(), }; diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 3567809d..d052431b 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -49,7 +49,10 @@ export async function scrapeController( mode: "single_urls", team_id: req.auth.team_id, scrapeOptions: req.body, - internalOptions: { teamId: req.auth.team_id }, + internalOptions: { + teamId: req.auth.team_id, + saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false, + }, origin: req.body.origin, is_scrape: true, }, diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts index b1d14193..190252c8 100644 --- a/apps/api/src/lib/extract/document-scraper.ts +++ b/apps/api/src/lib/extract/document-scraper.ts @@ -44,6 +44,7 @@ export async function scrapeDocument( internalOptions: { useCache: true, teamId: options.teamId, + saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false, }, origin: options.origin, is_scrape: true, diff --git a/apps/api/src/lib/gcs-jobs.ts b/apps/api/src/lib/gcs-jobs.ts index f4e68cd2..278e6e19 100644 --- a/apps/api/src/lib/gcs-jobs.ts +++ b/apps/api/src/lib/gcs-jobs.ts @@ -101,4 +101,33 @@ export async function getJobFromGCS(jobId: string): Promise { }); return null; } +} + +// TODO: fix the any type (we have multiple Document types in the codebase) +export async function getDocFromGCS(url: string): Promise { + logger.info(`Getting f-engine document from GCS`, { + url, + }); + try { + if (!process.env.GCS_FIRE_ENGINE_BUCKET_NAME) { + return null; + } + + const storage = new Storage({ credentials }); + const bucket = storage.bucket(process.env.GCS_FIRE_ENGINE_BUCKET_NAME); + const blob = bucket.file(`${url}`); + const [exists] = await blob.exists(); + if (!exists) { + return null; + } + const [blobContent] = await blob.download(); + const parsed = JSON.parse(blobContent.toString()); + return parsed; + } catch (error) { + logger.error(`Error getting f-engine document from GCS`, { + error, + url, + }); + return null; + } } \ No newline at end of file diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index 47322ef0..fadce779 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -11,6 +11,7 @@ import { } from "../../error"; import { MockState } from "../../lib/mock"; import { fireEngineURL } from "./scrape"; +import { getDocFromGCS } from "../../../../lib/gcs-jobs"; const successSchema = z.object({ jobId: z.string(), @@ -81,6 +82,8 @@ const successSchema = z.object({ }) .optional() .or(z.null()), + + docUrl: z.string().optional(), }); export type FireEngineCheckStatusSuccess = z.infer; @@ -117,7 +120,7 @@ export async function fireEngineCheckStatus( mock: MockState | null, abort?: AbortSignal, ): Promise { - const status = await Sentry.startSpan( + let status = await Sentry.startSpan( { name: "fire-engine: Check status", attributes: { @@ -142,6 +145,15 @@ export async function fireEngineCheckStatus( }, ); + // Fire-engine now saves the content to GCS + if (!status.content && status.docUrl) { + const doc = await getDocFromGCS(status.docUrl.split('/').pop() ?? ""); + if (doc) { + status = { ...status, ...doc }; + delete status.docUrl; + } + } + const successParse = successSchema.safeParse(status); const processingParse = processingSchema.safeParse(status); const failedParse = failedSchema.safeParse(status); diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 93596fce..1c2c7bd3 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -222,6 +222,7 @@ export async function scrapeURLWithFireEngineChromeCDP( disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, blockAds: meta.options.blockAds, mobileProxy: meta.options.proxy === undefined ? undefined : meta.options.proxy === "stealth" ? true : false, + saveScrapeResultToGCS: meta.internalOptions.saveScrapeResultToGCS, // TODO: scrollXPaths }; diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts index 9cdd8cdc..e0c44e18 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts @@ -40,6 +40,7 @@ export type FireEngineScrapeRequestChromeCDP = { mobile?: boolean; disableSmartWaitCache?: boolean; blockAds?: boolean; // default: true + saveScrapeResultToGCS?: boolean; }; export type FireEngineScrapeRequestPlaywright = { diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index b20fbdfd..7dfc821f 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -180,6 +180,8 @@ export type InternalOptions = { fromCache?: boolean; // Indicates if the document was retrieved from cache abort?: AbortSignal; urlInvisibleInCurrentCrawl?: boolean; + + saveScrapeResultToGCS?: boolean; // Passed along to fire-engine }; export type EngineResultsTracker = {