Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
Synced 2025-08-12 22:59:04 +08:00
[feat] Implement GCS storage option for scrape results across controllers an… (#1500)

* Implement GCS storage option for scrape results across controllers and update GCS document retrieval functionality
* done!
* Update gcs-jobs.ts
This commit is contained in:
parent
f0b1507290
commit
eee613d1bc
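
Throughout this diff, saving to GCS is gated on a single environment variable. A minimal standalone sketch of that pattern, using the variable and flag names from the diff (TypeScript, Node):

    // Enabled only when the fire-engine GCS bucket is configured; otherwise
    // the flag is false and results are not persisted to GCS.
    const saveScrapeResultToGCS: boolean =
      process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false;
    // Equivalent shorter form: Boolean(process.env.GCS_FIRE_ENGINE_BUCKET_NAME)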
@@ -161,7 +161,7 @@ export async function crawlController(req: Request, res: Response) {
     team_id,
   );
   internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
-
+  internalOptions.saveScrapeResultToGCS = process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false;
   delete (scrapeOptions as any).timeout;

   const sc: StoredCrawl = {
@@ -69,6 +69,8 @@ export async function scrapeHelper(
     team_id,
   );

+  internalOptions.saveScrapeResultToGCS = process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false;
+
   await addScrapeJob(
     {
       url,
@@ -82,7 +82,11 @@ export async function batchScrapeController(
       : {
           crawlerOptions: null,
           scrapeOptions: req.body,
-          internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
+          internalOptions: {
+            disableSmartWaitCache: true,
+            teamId: req.auth.team_id,
+            saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+          }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
           team_id: req.auth.team_id,
           createdAt: Date.now(),
         };
@@ -121,6 +125,9 @@ export async function batchScrapeController(
         sitemapped: true,
         v1: true,
         webhook: req.body.webhook,
+        internalOptions: {
+          saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+        },
       },
       opts: {
         jobId: uuidv4(),
@@ -80,7 +80,11 @@ export async function crawlController(
     originUrl: req.body.url,
     crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
     scrapeOptions,
-    internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
+    internalOptions: {
+      disableSmartWaitCache: true,
+      teamId: req.auth.team_id,
+      saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+    }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
     team_id: req.auth.team_id,
     createdAt: Date.now(),
   };
@@ -49,7 +49,10 @@ export async function scrapeController(
       mode: "single_urls",
       team_id: req.auth.team_id,
       scrapeOptions: req.body,
-      internalOptions: { teamId: req.auth.team_id },
+      internalOptions: {
+        teamId: req.auth.team_id,
+        saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
+      },
       origin: req.body.origin,
       is_scrape: true,
     },
@@ -44,6 +44,7 @@ export async function scrapeDocument(
       internalOptions: {
         useCache: true,
         teamId: options.teamId,
+        saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
       },
       origin: options.origin,
       is_scrape: true,
@@ -101,4 +101,33 @@ export async function getJobFromGCS(jobId: string): Promise<Document[] | null> {
     });
     return null;
   }
 }
+
+// TODO: fix the any type (we have multiple Document types in the codebase)
+export async function getDocFromGCS(url: string): Promise<any | null> {
+  logger.info(`Getting f-engine document from GCS`, {
+    url,
+  });
+  try {
+    if (!process.env.GCS_FIRE_ENGINE_BUCKET_NAME) {
+      return null;
+    }
+
+    const storage = new Storage({ credentials });
+    const bucket = storage.bucket(process.env.GCS_FIRE_ENGINE_BUCKET_NAME);
+    const blob = bucket.file(`${url}`);
+    const [exists] = await blob.exists();
+    if (!exists) {
+      return null;
+    }
+    const [blobContent] = await blob.download();
+    const parsed = JSON.parse(blobContent.toString());
+    return parsed;
+  } catch (error) {
+    logger.error(`Error getting f-engine document from GCS`, {
+      error,
+      url,
+    });
+    return null;
+  }
+}
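
For context, a minimal usage sketch of the new getDocFromGCS helper; the import path and object name below are illustrative, not taken from the diff:

    import { getDocFromGCS } from "./lib/gcs-jobs"; // path assumed for illustration

    // Resolves to the parsed JSON document, or null when the bucket env var is
    // unset, the object does not exist, or the download/parse throws.
    const doc = await getDocFromGCS("some-object-name"); // hypothetical name
    if (doc === null) {
      // caller falls back to whatever content it already has
    }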
@@ -11,6 +11,7 @@ import {
 } from "../../error";
 import { MockState } from "../../lib/mock";
 import { fireEngineURL } from "./scrape";
+import { getDocFromGCS } from "../../../../lib/gcs-jobs";

 const successSchema = z.object({
   jobId: z.string(),
@@ -81,6 +82,8 @@ const successSchema = z.object({
     })
     .optional()
     .or(z.null()),
+
+  docUrl: z.string().optional(),
 });

 export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
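
A reduced zod sketch (hypothetical, not the full schema) showing why marking docUrl optional keeps pre-GCS responses valid:

    import { z } from "zod";

    const sketch = z.object({
      jobId: z.string(),
      docUrl: z.string().optional(),
    });

    sketch.parse({ jobId: "abc" }); // ok: docUrl absent, as in older responses
    sketch.parse({ jobId: "abc", docUrl: "https://example.com/doc" }); // ok: present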
@@ -117,7 +120,7 @@ export async function fireEngineCheckStatus(
   mock: MockState | null,
   abort?: AbortSignal,
 ): Promise<FireEngineCheckStatusSuccess> {
-  const status = await Sentry.startSpan(
+  let status = await Sentry.startSpan(
     {
       name: "fire-engine: Check status",
       attributes: {
@@ -142,6 +145,15 @@ export async function fireEngineCheckStatus(
     },
   );

+  // Fire-engine now saves the content to GCS
+  if (!status.content && status.docUrl) {
+    const doc = await getDocFromGCS(status.docUrl.split('/').pop() ?? "");
+    if (doc) {
+      status = { ...status, ...doc };
+      delete status.docUrl;
+    }
+  }
+
   const successParse = successSchema.safeParse(status);
   const processingParse = processingSchema.safeParse(status);
   const failedParse = failedSchema.safeParse(status);
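
The fallback above derives the GCS object name from the last path segment of docUrl. A standalone sketch of that extraction (the URL is made up):

    const docUrl = "https://storage.example.com/fire-engine/abc123"; // hypothetical
    // split("/") always yields at least one element, so pop() cannot return
    // undefined here; the ?? "" mirrors the diff and satisfies the string type.
    const objectName = docUrl.split("/").pop() ?? ""; // "abc123"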
@@ -222,6 +222,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
     disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
     blockAds: meta.options.blockAds,
     mobileProxy: meta.options.proxy === undefined ? undefined : meta.options.proxy === "stealth" ? true : false,
+    saveScrapeResultToGCS: meta.internalOptions.saveScrapeResultToGCS,
     // TODO: scrollXPaths
   };

@@ -40,6 +40,7 @@ export type FireEngineScrapeRequestChromeCDP = {
   mobile?: boolean;
   disableSmartWaitCache?: boolean;
   blockAds?: boolean; // default: true
+  saveScrapeResultToGCS?: boolean;
 };

 export type FireEngineScrapeRequestPlaywright = {
@@ -180,6 +180,8 @@ export type InternalOptions = {
   fromCache?: boolean; // Indicates if the document was retrieved from cache
   abort?: AbortSignal;
   urlInvisibleInCurrentCrawl?: boolean;
+
+  saveScrapeResultToGCS?: boolean; // Passed along to fire-engine
 };

 export type EngineResultsTracker = {
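
A minimal sketch of populating the new InternalOptions field when enqueueing a scrape (values are illustrative; any other required fields are omitted):

    // Assumes InternalOptions is imported from the scrapeURL types shown above.
    const internalOptions: InternalOptions = {
      teamId: "team-123", // hypothetical team id
      saveScrapeResultToGCS: process.env.GCS_FIRE_ENGINE_BUCKET_NAME ? true : false,
    };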