Nick: cache /extract scrapes

This commit is contained in:
Nicolas 2025-01-03 21:19:40 -03:00
parent 81cf05885b
commit 6b2e1cbb28
3 changed files with 17 additions and 6 deletions

View File

@ -14,10 +14,13 @@ interface ScrapeDocumentOptions {
timeout: number;
}
export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: URLTrace[]): Promise<Document | null> {
export async function scrapeDocument(
options: ScrapeDocumentOptions,
urlTraces: URLTrace[],
): Promise<Document | null> {
const trace = urlTraces.find((t) => t.url === options.url);
if (trace) {
trace.status = 'scraped';
trace.status = "scraped";
trace.timing.scrapedAt = new Date().toISOString();
}
@ -35,7 +38,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
mode: "single_urls",
team_id: options.teamId,
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
internalOptions: {
useCache: true,
},
plan: options.plan,
origin: options.origin,
is_scrape: true,
@ -61,9 +66,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces:
} catch (error) {
logger.error(`Error in scrapeDocument: ${error}`);
if (trace) {
trace.status = 'error';
trace.status = "error";
trace.error = error.message;
}
return null;
}
}
}

View File

@ -298,6 +298,12 @@ export function buildFallbackList(meta: Meta): {
engine: Engine;
unsupportedFeatures: Set<FeatureFlag>;
}[] {
// The cache engine is opt-in: unless internalOptions.useCache is exactly
// `true`, remove "cache" from the engine list before the fallback list is built.
if (meta.internalOptions.useCache !== true) {
  // indexOf returns -1 when "cache" is not present, and splice(-1, 1) would
  // then delete the LAST engine instead of nothing, so only splice on a hit.
  // NOTE(review): `engines` is declared outside this hunk; if it is shared
  // between calls, this removal persists and a later call with useCache === true
  // would no longer find the cache engine. Confirm, and consider filtering a
  // local copy instead of mutating in place.
  const cacheIndex = engines.indexOf("cache");
  if (cacheIndex !== -1) {
    engines.splice(cacheIndex, 1);
  }
} else {
  meta.logger.debug("Cache engine enabled by useCache option");
}
const prioritySum = [...meta.featureFlags].reduce(
(a, x) => a + featureFlagOptions[x].priority,
0,

View File

@ -151,7 +151,7 @@ export type InternalOptions = {
v0CrawlOnlyUrls?: boolean;
v0DisableJsDom?: boolean;
useCache?: boolean;
disableSmartWaitCache?: boolean; // Passed along to fire-engine
isBackgroundIndex?: boolean;
};