From 8d7e8c4f500e3c60b0673fc7431fc5f5949fc2fe Mon Sep 17 00:00:00 2001
From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Fri, 31 Jan 2025 13:58:52 -0300
Subject: [PATCH] added cached scrapes to extract (#1107)

* added cached scrapes to extract

* dont save if exists

* no duplicates

* experimental tag

* Update requests.http

---------

Co-authored-by: Nicolas
---
 apps/api/requests.http                        |  4 ++
 apps/api/src/controllers/v1/types.ts          |  4 +-
 .../api/src/lib/extract/extraction-service.ts | 40 ++++++++++--
 .../src/lib/extract/helpers/cached-docs.ts    | 62 +++++++++++++++++++
 4 files changed, 103 insertions(+), 7 deletions(-)
 create mode 100644 apps/api/src/lib/extract/helpers/cached-docs.ts

diff --git a/apps/api/requests.http b/apps/api/requests.http
index f7b7de9c..c9495cf8 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -96,3 +96,7 @@ content-type: application/json
 # @name extractFirecrawlStatus
 GET {{baseUrl}}/v1/extract/{{extractFirecrawlId}} HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
+
+###
+DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index c3ba3a69..76cccf7d 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -228,10 +228,12 @@ export const extractV1Options = z
     enableWebSearch: z.boolean().default(false),
     origin: z.string().optional().default("api"),
     urlTrace: z.boolean().default(false),
+    timeout: z.number().int().positive().finite().safe().default(60000),
     __experimental_streamSteps: z.boolean().default(false),
     __experimental_llmUsage: z.boolean().default(false),
     __experimental_showSources: z.boolean().default(false),
-    timeout: z.number().int().positive().finite().safe().default(60000),
+    __experimental_cacheKey: z.string().optional(),
+    __experimental_cacheMode: z.enum(["direct", "save", "load"]).default("direct").optional()
   })
   .strict(strictMessage)
   .transform((obj) => ({
diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts
index 3725ff8b..68213c2c 100644
--- a/apps/api/src/lib/extract/extraction-service.ts
+++ b/apps/api/src/lib/extract/extraction-service.ts
@@ -36,12 +36,16 @@ import { checkShouldExtract } from "./completions/checkShouldExtract";
 import { batchExtractPromise } from "./completions/batchExtract";
 import { singleAnswerCompletion } from "./completions/singleAnswer";
 import { SourceTracker } from "./helpers/source-tracker";
+import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs";
+import { normalizeUrl } from "../canonical-url";
 
 interface ExtractServiceOptions {
   request: ExtractRequest;
   teamId: string;
   plan: PlanType;
   subId?: string;
+  cacheMode?: "load" | "save" | "direct";
+  cacheKey?: string;
 }
 
 interface ExtractResult {
@@ -86,6 +90,20 @@ export async function performExtraction(
     extractId,
   });
 
+  if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey) {
+    logger.debug("Loading cached docs...");
+    try {
+      const cache = await getCachedDocs(request.urls, request.__experimental_cacheKey);
+      for (const doc of cache) {
+        if (doc.metadata.url) {
+          docsMap.set(normalizeUrl(doc.metadata.url), doc);
+        }
+      }
+    } catch (error) {
+      logger.error("Error loading cached docs", { error });
+    }
+  }
+
   // Token tracking
   let tokenUsage: TokenUsage[] = [];
 
@@ -257,8 +275,9 @@ export async function performExtraction(
       logger.debug("Starting multi-entity scrape...");
       let startScrape = Date.now();
+
       const scrapePromises = links.map((url) => {
-        if (!docsMap.has(url)) {
+        if (!docsMap.has(normalizeUrl(url))) {
           return scrapeDocument(
             {
               url,
@@ -280,7 +299,7 @@ export async function performExtraction(
             }
           );
         }
-        return docsMap.get(url);
+        return docsMap.get(normalizeUrl(url));
       });
 
     let multyEntityDocs = (await Promise.all(scrapePromises)).filter(
@@ -309,7 +328,7 @@ export async function performExtraction(
 
     for (const doc of multyEntityDocs) {
       if (doc?.metadata?.url) {
-        docsMap.set(doc.metadata.url, doc);
+        docsMap.set(normalizeUrl(doc.metadata.url), doc);
       }
     }
 
@@ -519,7 +538,7 @@ export async function performExtraction(
         ],
       });
       const scrapePromises = links.map((url) => {
-        if (!docsMap.has(url)) {
+        if (!docsMap.has(normalizeUrl(url))) {
           return scrapeDocument(
             {
               url,
@@ -537,7 +556,7 @@ export async function performExtraction(
           }),
         );
       }
-      return docsMap.get(url);
+      return docsMap.get(normalizeUrl(url));
     });
 
     try {
@@ -545,7 +564,7 @@ export async function performExtraction(
 
       for (const doc of results) {
         if (doc?.metadata?.url) {
-          docsMap.set(doc.metadata.url, doc);
+          docsMap.set(normalizeUrl(doc.metadata.url), doc);
         }
       }
       logger.debug("Updated docsMap.", { docsMapSize: docsMap.size }); // useful for error probing
@@ -744,6 +763,15 @@ export async function performExtraction(
 
   logger.debug("Done!");
 
+  if (request.__experimental_cacheMode == "save" && request.__experimental_cacheKey) {
+    logger.debug("Saving cached docs...");
+    try {
+      await saveCachedDocs([...docsMap.values()], request.__experimental_cacheKey);
+    } catch (error) {
+      logger.error("Error saving cached docs", { error });
+    }
+  }
+
   return {
     success: true,
     data: finalResult ?? {},
diff --git a/apps/api/src/lib/extract/helpers/cached-docs.ts b/apps/api/src/lib/extract/helpers/cached-docs.ts
new file mode 100644
index 00000000..a2cb0ea3
--- /dev/null
+++ b/apps/api/src/lib/extract/helpers/cached-docs.ts
@@ -0,0 +1,62 @@
+import { Document } from "../../../controllers/v1/types";
+import { supabase_service } from "../../../services/supabase";
+import { normalizeUrl } from "../../../lib/canonical-url";
+
+export async function getCachedDocs(urls: string[], cacheKey: string): Promise<Document[]> {
+  const normalizedUrls = urls.map(normalizeUrl);
+  const { data, error } = await supabase_service
+    .from('cached_scrapes')
+    .select('doc')
+    .in('url', normalizedUrls)
+    .eq('cache_key', cacheKey);
+
+  if (error) {
+    console.error('Error fetching cached docs:', error);
+    return [];
+  }
+
+  const uniqueDocs = new Map<string, Document>();
+  data.forEach((res: any) => {
+    const doc = JSON.parse(JSON.stringify(res.doc)) as Document;
+    const docKey = `${doc.metadata.url}-${cacheKey}`;
+    if (!uniqueDocs.has(docKey)) {
+      uniqueDocs.set(docKey, doc);
+    }
+  });
+
+  return Array.from(uniqueDocs.values());
+}
+
+export async function saveCachedDocs(docs: Document[], cacheKey: string): Promise<void> {
+  for (const doc of docs) {
+    if (!doc.metadata.url) {
+      throw new Error("Document has no URL");
+    }
+
+    const normalizedUrl = normalizeUrl(doc.metadata.url);
+    const { data, error } = await supabase_service
+      .from('cached_scrapes')
+      .select('url')
+      .eq('url', normalizedUrl)
+      .eq('cache_key', cacheKey);
+
+    if (error) {
+      console.error('Error checking existing cached doc:', error);
+      continue;
+    }
+
+    if (data.length === 0) {
+      const { error: upsertError } = await supabase_service
+        .from('cached_scrapes')
+        .upsert({
+          url: normalizedUrl,
+          doc: doc,
+          cache_key: cacheKey,
+        });
+
+      if (upsertError) {
+        console.error('Error saving cached doc:', upsertError);
+      }
+    }
+  }
+}
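
The helper above reads and writes a cached_scrapes table that the patch itself does not create. For reference, this is the row shape implied by the queries, as a TypeScript sketch: the column names come from the code, while the database-side types and constraints are assumptions, not part of this change.

// Implied cached_scrapes row (assumed shape, inferred from the queries in cached-docs.ts).
type CachedScrapeRow = {
  url: string;       // normalized via normalizeUrl() on both read and write
  cache_key: string; // the request's __experimental_cacheKey
  doc: Document;     // the full scraped document, stored as JSON
};

The (url, cache_key) pair serves as the logical key: saveCachedDocs() selects on both columns first and only upserts when no row exists, which keeps repeated saves under the same key free of duplicates.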
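
Usage sketch (not part of the diff): with the change applied, one /v1/extract request can populate the cache under a key, and a later request can replay those scrapes instead of re-fetching. The snippet follows the conventions of the existing requests.http file; the target URL, prompt, and cache key are illustrative values, and per the schema above both __experimental_* fields are optional, with __experimental_cacheMode defaulting to "direct" (no caching).

###
# @name extractCacheSave
POST {{baseUrl}}/v1/extract HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
  "urls": ["https://firecrawl.dev"],
  "prompt": "Extract the page title.",
  "__experimental_cacheKey": "extract-cache-demo",
  "__experimental_cacheMode": "save"
}

###
# @name extractCacheLoad
POST {{baseUrl}}/v1/extract HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
  "urls": ["https://firecrawl.dev"],
  "prompt": "Extract the page title.",
  "__experimental_cacheKey": "extract-cache-demo",
  "__experimental_cacheMode": "load"
}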