[feat] Implement GCS storage option for scrape results across controllers and update GCS document retrieval functionality (#1500)

* Implement GCS storage option for scrape results across controllers and update GCS document retrieval functionality

* done!

* Update gcs-jobs.ts
This commit is contained in:
Rafael Miller 2025-04-29 15:15:44 -03:00 committed by GitHub
parent f0b1507290
commit eee613d1bc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 67 additions and 5 deletions

View File

@ -161,7 +161,7 @@ export async function crawlController(req: Request, res: Response) {
team_id
);
internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
internalOptions.saveScrapeResultToGCS = process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false;
delete (scrapeOptions as any).timeout;
const sc: StoredCrawl = {

View File

@ -69,6 +69,8 @@ export async function scrapeHelper(
team_id,
);
internalOptions.saveScrapeResultToGCS = process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false;
await addScrapeJob(
{
url,

View File

@ -82,7 +82,11 @@ export async function batchScrapeController(
: {
crawlerOptions: null,
scrapeOptions: req.body,
internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
internalOptions: {
disableSmartWaitCache: true,
teamId: req.auth.team_id,
saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
}, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
team_id: req.auth.team_id,
createdAt: Date.now(),
};
@ -121,6 +125,9 @@ export async function batchScrapeController(
sitemapped: true,
v1: true,
webhook: req.body.webhook,
internalOptions: {
saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
},
},
opts: {
jobId: uuidv4(),

View File

@ -80,7 +80,11 @@ export async function crawlController(
originUrl: req.body.url,
crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
scrapeOptions,
internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
internalOptions: {
disableSmartWaitCache: true,
teamId: req.auth.team_id,
saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
}, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
team_id: req.auth.team_id,
createdAt: Date.now(),
};

View File

@ -49,7 +49,10 @@ export async function scrapeController(
mode: "single_urls",
team_id: req.auth.team_id,
scrapeOptions: req.body,
internalOptions: { teamId: req.auth.team_id },
internalOptions: {
teamId: req.auth.team_id,
saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
},
origin: req.body.origin,
is_scrape: true,
},

View File

@ -44,6 +44,7 @@ export async function scrapeDocument(
internalOptions: {
useCache: true,
teamId: options.teamId,
saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
},
origin: options.origin,
is_scrape: true,

View File

@ -101,4 +101,33 @@ export async function getJobFromGCS(jobId: string): Promise<Document[] | null> {
});
return null;
}
}
// TODO: fix the any type (we have multiple Document types in the codebase)
/**
 * Retrieves a fire-engine scrape result from the GCS fire-engine bucket.
 *
 * @param url - The object name (blob key) within the fire-engine bucket.
 * @returns The parsed JSON document, or `null` when the bucket is not
 *          configured, the blob does not exist, or any download/parse
 *          error occurs (best-effort: errors are logged, never thrown).
 */
export async function getDocFromGCS(url: string): Promise<any | null> {
  logger.info(`Getting f-engine document from GCS`, {
    url,
  });

  try {
    // GCS storage disabled in this deployment — nothing to fetch.
    if (!process.env.GCS_FIRE_ENGINE_BUCKET_NAME) {
      return null;
    }

    const storage = new Storage({ credentials });
    const bucket = storage.bucket(process.env.GCS_FIRE_ENGINE_BUCKET_NAME);
    // `url` is already a string; no template literal needed.
    const blob = bucket.file(url);

    // Check existence first so a missing document is a clean `null`
    // instead of a logged error below.
    const [exists] = await blob.exists();
    if (!exists) {
      return null;
    }

    const [blobContent] = await blob.download();
    return JSON.parse(blobContent.toString());
  } catch (error) {
    // Best-effort retrieval: log with context and return `null` so callers
    // treat "unavailable" the same as "missing".
    logger.error(`Error getting f-engine document from GCS`, {
      error,
      url,
    });
    return null;
  }
}

View File

@ -11,6 +11,7 @@ import {
} from "../../error";
import { MockState } from "../../lib/mock";
import { fireEngineURL } from "./scrape";
import { getDocFromGCS } from "../../../../lib/gcs-jobs";
const successSchema = z.object({
jobId: z.string(),
@ -81,6 +82,8 @@ const successSchema = z.object({
})
.optional()
.or(z.null()),
docUrl: z.string().optional(),
});
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
@ -117,7 +120,7 @@ export async function fireEngineCheckStatus(
mock: MockState | null,
abort?: AbortSignal,
): Promise<FireEngineCheckStatusSuccess> {
const status = await Sentry.startSpan(
let status = await Sentry.startSpan(
{
name: "fire-engine: Check status",
attributes: {
@ -142,6 +145,15 @@ export async function fireEngineCheckStatus(
},
);
// Fire-engine now saves the content to GCS
if (!status.content && status.docUrl) {
const doc = await getDocFromGCS(status.docUrl.split('/').pop() ?? "");
if (doc) {
status = { ...status, ...doc };
delete status.docUrl;
}
}
const successParse = successSchema.safeParse(status);
const processingParse = processingSchema.safeParse(status);
const failedParse = failedSchema.safeParse(status);

View File

@ -222,6 +222,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
blockAds: meta.options.blockAds,
mobileProxy: meta.options.proxy === undefined ? undefined : meta.options.proxy === "stealth" ? true : false,
saveScrapeResultToGCS: meta.internalOptions.saveScrapeResultToGCS,
// TODO: scrollXPaths
};

View File

@ -40,6 +40,7 @@ export type FireEngineScrapeRequestChromeCDP = {
mobile?: boolean;
disableSmartWaitCache?: boolean;
blockAds?: boolean; // default: true
saveScrapeResultToGCS?: boolean;
};
export type FireEngineScrapeRequestPlaywright = {

View File

@ -180,6 +180,8 @@ export type InternalOptions = {
fromCache?: boolean; // Indicates if the document was retrieved from cache
abort?: AbortSignal;
urlInvisibleInCurrentCrawl?: boolean;
saveScrapeResultToGCS?: boolean; // Passed along to fire-engine
};
export type EngineResultsTracker = {