added cached scrapes to extract (#1107)

* added cached scrapes to extract
* dont save if exists
* no duplicates
* experimental tag
* Update requests.http

Co-authored-by: Nicolas <nicolascamara29@gmail.com>

parent 4341b6e12e
commit 8d7e8c4f50
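For context, the experimental cache flags this commit introduces are meant to be used roughly like this. This is a sketch only: the base URL, environment variable names, request fields other than the two __experimental_* ones, and the cache key are assumptions for illustration, not part of the commit.

// Sketch: exercise the new experimental cache flags end to end (Node 18+ fetch).
const baseUrl = process.env.FIRECRAWL_BASE_URL ?? "http://localhost:3002";

async function extractWithCache(cacheMode: "direct" | "save" | "load") {
  const res = await fetch(`${baseUrl}/v1/extract`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
    },
    body: JSON.stringify({
      urls: ["https://example.com/pricing"],
      prompt: "Extract the pricing tiers.",
      __experimental_cacheKey: "pricing-2025-01", // hypothetical key
      __experimental_cacheMode: cacheMode,
    }),
  });
  return res.json();
}

async function main() {
  await extractWithCache("save"); // first run scrapes live and stores the docs
  await extractWithCache("load"); // a later run with the same key reuses the cached scrapes
}

main().catch(console.error);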
requests.http:
@@ -96,3 +96,7 @@ content-type: application/json
 # @name extractFirecrawlStatus
 GET {{baseUrl}}/v1/extract/{{extractFirecrawlId}} HTTP/1.1
 Authorization: Bearer {{$dotenv TEST_API_KEY}}
+
+###
+DELETE {{baseUrl}}/v1/crawl/c94136f9-86c1-4a97-966c-1c8e0274778f HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
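The added block is REST-client housekeeping for the test file. The same call from TypeScript would look roughly like the sketch below; that DELETE /v1/crawl/:id cancels a crawl, and the env variable names, are assumptions here rather than part of this commit.

// Sketch of the equivalent call with fetch (Node 18+); the crawl id is the one from the test file.
const baseUrl = process.env.FIRECRAWL_BASE_URL ?? "http://localhost:3002";

async function cancelCrawl(crawlId: string): Promise<void> {
  const res = await fetch(`${baseUrl}/v1/crawl/${crawlId}`, {
    method: "DELETE",
    headers: { Authorization: `Bearer ${process.env.TEST_API_KEY}` },
  });
  if (!res.ok) {
    throw new Error(`DELETE /v1/crawl failed: ${res.status}`);
  }
}

cancelCrawl("c94136f9-86c1-4a97-966c-1c8e0274778f").catch(console.error);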
Extract options schema (extractV1Options):
@@ -228,10 +228,12 @@ export const extractV1Options = z
     enableWebSearch: z.boolean().default(false),
     origin: z.string().optional().default("api"),
     urlTrace: z.boolean().default(false),
+    timeout: z.number().int().positive().finite().safe().default(60000),
     __experimental_streamSteps: z.boolean().default(false),
     __experimental_llmUsage: z.boolean().default(false),
     __experimental_showSources: z.boolean().default(false),
-    timeout: z.number().int().positive().finite().safe().default(60000),
+    __experimental_cacheKey: z.string().optional(),
+    __experimental_cacheMode: z.enum(["direct", "save", "load"]).default("direct").optional()
   })
   .strict(strictMessage)
   .transform((obj) => ({
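A minimal sketch isolating just the fields this hunk adds, so their parsing is easy to try out. The surrounding extractV1Options object, strictMessage, and .transform() are omitted, and the cache key below is made up.

import { z } from "zod";

// Only the new/moved fields from the hunk above.
const experimentalCacheFields = z.object({
  timeout: z.number().int().positive().finite().safe().default(60000),
  __experimental_cacheKey: z.string().optional(),
  __experimental_cacheMode: z
    .enum(["direct", "save", "load"])
    .default("direct")
    .optional(),
});

const parsed = experimentalCacheFields.parse({
  __experimental_cacheKey: "pricing-2025-01",
  __experimental_cacheMode: "save",
});
console.log(parsed.timeout);                  // 60000 (default applied)
console.log(parsed.__experimental_cacheMode); // "save"

Note that because .optional() wraps .default("direct"), an omitted __experimental_cacheMode comes back as undefined rather than "direct"; the service code later in this commit only checks for "load" and "save", so a missing mode still behaves as direct.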
Extraction service (performExtraction):
@@ -36,12 +36,16 @@ import { checkShouldExtract } from "./completions/checkShouldExtract";
 import { batchExtractPromise } from "./completions/batchExtract";
 import { singleAnswerCompletion } from "./completions/singleAnswer";
 import { SourceTracker } from "./helpers/source-tracker";
+import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs";
+import { normalizeUrl } from "../canonical-url";

 interface ExtractServiceOptions {
   request: ExtractRequest;
   teamId: string;
   plan: PlanType;
   subId?: string;
+  cacheMode?: "load" | "save" | "direct";
+  cacheKey?: string;
 }

 interface ExtractResult {
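The two new optional fields simply ride along on the service options. A self-contained sketch of how a caller might fill them from the validated request follows; the stand-in types, the copying step, and the concrete values are assumptions, since the caller is not shown in this commit.

// Stand-ins for the real ExtractRequest / PlanType, for illustration only.
type PlanType = string;
interface ExtractRequestLike {
  urls: string[];
  __experimental_cacheKey?: string;
  __experimental_cacheMode?: "direct" | "save" | "load";
}

interface ExtractServiceOptions {
  request: ExtractRequestLike;
  teamId: string;
  plan: PlanType;
  subId?: string;
  cacheMode?: "load" | "save" | "direct";
  cacheKey?: string;
}

const request: ExtractRequestLike = {
  urls: ["https://example.com/pricing"],
  __experimental_cacheKey: "pricing-2025-01", // hypothetical key
  __experimental_cacheMode: "load",
};

// Presumably the controller/queue copies the experimental request fields into
// the service options like this (an assumption, not shown in the diff).
const options: ExtractServiceOptions = {
  request,
  teamId: "team_123",  // hypothetical
  plan: "standard",    // hypothetical
  cacheMode: request.__experimental_cacheMode,
  cacheKey: request.__experimental_cacheKey,
};

Note that the hunks below read request.__experimental_cacheMode and request.__experimental_cacheKey directly, so the interface fields appear to be an alternative way of passing the same information.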
@@ -86,6 +90,20 @@ export async function performExtraction(
     extractId,
   });

+  if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey) {
+    logger.debug("Loading cached docs...");
+    try {
+      const cache = await getCachedDocs(request.urls, request.__experimental_cacheKey);
+      for (const doc of cache) {
+        if (doc.metadata.url) {
+          docsMap.set(normalizeUrl(doc.metadata.url), doc);
+        }
+      }
+    } catch (error) {
+      logger.error("Error loading cached docs", { error });
+    }
+  }
+
   // Token tracking
   let tokenUsage: TokenUsage[] = [];

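Condensed, the block above pre-seeds docsMap from the cache before any scraping happens, keyed by normalized URL, and treats cache failures as non-fatal. A self-contained sketch of that flow, with a stand-in Doc type and the real helpers passed in as parameters:

interface Doc { metadata: { url?: string } }

export async function prefillFromCache(
  urls: string[],
  cacheKey: string,
  getCachedDocs: (urls: string[], key: string) => Promise<Doc[]>,
  normalizeUrl: (u: string) => string,
): Promise<Map<string, Doc>> {
  const docsMap = new Map<string, Doc>();
  try {
    for (const doc of await getCachedDocs(urls, cacheKey)) {
      if (doc.metadata.url) {
        // Key by normalized URL so later lookups hit despite cosmetic URL differences.
        docsMap.set(normalizeUrl(doc.metadata.url), doc);
      }
    }
  } catch (error) {
    // Cache problems are non-fatal: extraction falls back to live scraping.
    console.error("Error loading cached docs", { error });
  }
  return docsMap;
}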
@@ -257,8 +275,9 @@ export async function performExtraction(

     logger.debug("Starting multi-entity scrape...");
     let startScrape = Date.now();
+
     const scrapePromises = links.map((url) => {
-      if (!docsMap.has(url)) {
+      if (!docsMap.has(normalizeUrl(url))) {
         return scrapeDocument(
           {
             url,
@@ -280,7 +299,7 @@ export async function performExtraction(
           }
         );
       }
-      return docsMap.get(url);
+      return docsMap.get(normalizeUrl(url));
     });

     let multyEntityDocs = (await Promise.all(scrapePromises)).filter(
@@ -309,7 +328,7 @@ export async function performExtraction(

     for (const doc of multyEntityDocs) {
       if (doc?.metadata?.url) {
-        docsMap.set(doc.metadata.url, doc);
+        docsMap.set(normalizeUrl(doc.metadata.url), doc);
       }
     }

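These call sites are the other half of the cache contract: docsMap is always written and read through normalizeUrl(...), so a cached document is found even when a scraped document's metadata.url differs cosmetically from the requested URL. The real normalizeUrl lives in canonical-url and is not shown in this commit; the stand-in below only illustrates the idea.

// Hypothetical stand-in for illustration; not the real canonical-url implementation.
function normalizeUrlSketch(url: string): string {
  const u = new URL(url);
  // Treat "https://example.com/pricing" and ".../pricing/" as the same page.
  return u.origin + u.pathname.replace(/\/+$/, "") + u.search;
}

const docsMap = new Map<string, { title: string }>();
docsMap.set(normalizeUrlSketch("https://example.com/pricing/"), { title: "Pricing" });

// Without normalizing on both set and get, this lookup would miss and trigger
// a redundant scrape of an already-cached page.
console.log(docsMap.has(normalizeUrlSketch("https://example.com/pricing"))); // true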
@@ -519,7 +538,7 @@ export async function performExtraction(
       ],
     });
     const scrapePromises = links.map((url) => {
-      if (!docsMap.has(url)) {
+      if (!docsMap.has(normalizeUrl(url))) {
         return scrapeDocument(
           {
             url,
@@ -537,7 +556,7 @@ export async function performExtraction(
           }),
         );
       }
-      return docsMap.get(url);
+      return docsMap.get(normalizeUrl(url));
     });

     try {
@@ -545,7 +564,7 @@ export async function performExtraction(

       for (const doc of results) {
         if (doc?.metadata?.url) {
-          docsMap.set(doc.metadata.url, doc);
+          docsMap.set(normalizeUrl(doc.metadata.url), doc);
         }
       }
       logger.debug("Updated docsMap.", { docsMapSize: docsMap.size }); // useful for error probing
@@ -744,6 +763,15 @@ export async function performExtraction(

   logger.debug("Done!");

+  if (request.__experimental_cacheMode == "save" && request.__experimental_cacheKey) {
+    logger.debug("Saving cached docs...");
+    try {
+      await saveCachedDocs([...docsMap.values()], request.__experimental_cacheKey);
+    } catch (error) {
+      logger.error("Error saving cached docs", { error });
+    }
+  }
+
   return {
     success: true,
     data: finalResult ?? {},
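Mirroring the load path, the save block persists everything collected in docsMap once extraction has finished, and never lets a cache write failure fail the run. A condensed sketch with a stand-in Doc type and the helper injected, so it stays self-contained:

interface Doc { metadata: { url?: string } }

export async function persistRun(
  docsMap: Map<string, Doc>,
  cacheKey: string | undefined,
  cacheMode: "direct" | "save" | "load" | undefined,
  saveCachedDocs: (docs: Doc[], key: string) => Promise<void>,
): Promise<void> {
  if (cacheMode !== "save" || !cacheKey) return;
  try {
    // Everything scraped (or loaded) during this run goes into the cache.
    await saveCachedDocs([...docsMap.values()], cacheKey);
  } catch (error) {
    // Cache write failures are logged, never surfaced to the caller.
    console.error("Error saving cached docs", { error });
  }
}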
apps/api/src/lib/extract/helpers/cached-docs.ts (new file, 62 lines):

import { Document } from "../../../controllers/v1/types";
import { supabase_service } from "../../../services/supabase";
import { normalizeUrl } from "../../../lib/canonical-url";

export async function getCachedDocs(urls: string[], cacheKey: string): Promise<Document[]> {
  const normalizedUrls = urls.map(normalizeUrl);
  const { data, error } = await supabase_service
    .from('cached_scrapes')
    .select('doc')
    .in('url', normalizedUrls)
    .eq('cache_key', cacheKey);

  if (error) {
    console.error('Error fetching cached docs:', error);
    return [];
  }

  const uniqueDocs = new Map<string, Document>();
  data.forEach((res: any) => {
    const doc = JSON.parse(JSON.stringify(res.doc)) as Document;
    const docKey = `${doc.metadata.url}-${cacheKey}`;
    if (!uniqueDocs.has(docKey)) {
      uniqueDocs.set(docKey, doc);
    }
  });

  return Array.from(uniqueDocs.values());
}

export async function saveCachedDocs(docs: Document[], cacheKey: string): Promise<void> {
  for (const doc of docs) {
    if (!doc.metadata.url) {
      throw new Error("Document has no URL");
    }

    const normalizedUrl = normalizeUrl(doc.metadata.url);
    const { data, error } = await supabase_service
      .from('cached_scrapes')
      .select('url')
      .eq('url', normalizedUrl)
      .eq('cache_key', cacheKey);

    if (error) {
      console.error('Error checking existing cached doc:', error);
      continue;
    }

    if (data.length === 0) {
      const { error: upsertError } = await supabase_service
        .from('cached_scrapes')
        .upsert({
          url: normalizedUrl,
          doc: doc,
          cache_key: cacheKey,
        });

      if (upsertError) {
        console.error('Error saving cached doc:', upsertError);
      }
    }
  }
}
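Taken together, the helpers assume a cached_scrapes table with at least url, doc and cache_key columns; that shape is read off the queries above, and no migration is part of this commit. A usage sketch with made-up URLs and cache key (import paths assume the file sits next to cached-docs.ts):

import { Document } from "../../../controllers/v1/types";
import { getCachedDocs, saveCachedDocs } from "./cached-docs";

// Row shape implied by the queries above (assumption; no migration in this commit).
export interface CachedScrapeRow {
  url: string;       // normalized URL, used for lookups
  doc: Document;     // full scraped document stored as JSON
  cache_key: string; // groups one extract run's scrapes together
}

export async function demoCacheRoundTrip(docs: Document[]): Promise<void> {
  // Persists each doc once per (url, cache_key); docs without a URL throw.
  await saveCachedDocs(docs, "pricing-2025-01");

  // Later: fetch whatever is cached for these URLs under the same key.
  const cached = await getCachedDocs(
    ["https://example.com/pricing", "https://example.com/enterprise"],
    "pricing-2025-01",
  );
  console.log(`cache hits: ${cached.length} of 2`);
}

saveCachedDocs checks for an existing (url, cache_key) row before upserting, which is what the "dont save if exists" and "no duplicates" bullets in the commit message refer to.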