mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-18 06:55:54 +08:00

poc progress

parent 474e5a0543
commit 8fc02f8604
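
In brief: the commit adds a proof-of-concept "index" scrape engine that serves previously captured pages (a Supabase `pages` table maps a URL to the newest capture id, and the full document is then read from a GCS bucket), and it removes the old fire-index integration: the `indexJob` helper that POSTed finished jobs to `FIRE_INDEX_SERVER_URL` from the job logger, and the mostly commented-out `indexJob` in the queue worker.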
apps/api/src/scraper/scrapeURL/engines/index/index.ts (new file, +40)

@@ -0,0 +1,40 @@
+import { EngineScrapeResult } from "..";
+import { Meta } from "../..";
+import { getIndexFromGCS, index_supabase_service } from "../../../../services";
+import { EngineError, IndexMissError } from "../../error";
+
+export async function scrapeURLWithIndex(meta: Meta): Promise<EngineScrapeResult> {
+  const { data, error } = await index_supabase_service
+    .from("pages")
+    .select("*")
+    .eq("url", meta.url)
+    .order("created_at", { ascending: false })
+    .limit(1);
+
+  if (error) {
+    throw new EngineError("Failed to scrape URL with index", {
+      cause: error,
+    });
+  }
+
+  if (data.length === 0) {
+    throw new IndexMissError();
+  }
+
+  const id = data[0].id;
+
+  const doc = await getIndexFromGCS(id + ".json");
+  if (!doc) {
+    throw new EngineError("No document found in index");
+  }
+
+  return {
+    url: doc.url,
+    html: doc.html,
+    statusCode: doc.statusCode,
+    error: doc.error,
+    screenshot: doc.screenshot,
+    markdown: doc.markdown,
+    numPages: doc.numPages,
+  };
+}
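
Read as a cache lookup: the newest `pages` row for the exact URL supplies an id, and that id names a JSON document in GCS. A sketch of the row shape this implies, inferred from the query rather than taken from any actual schema:

// Hypothetical row shape, inferred from the query above; the real table
// may well carry more columns.
interface PagesRow {
  id: string;          // becomes the GCS object name: `${id}.json`
  url: string;         // matched exactly against meta.url
  created_at: string;  // newest row wins (descending order, limit 1)
}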
@@ -92,3 +92,9 @@ export class PDFInsufficientTimeError extends Error {
     super(`Insufficient time to process PDF of ${pageCount} pages. Please increase the timeout parameter in your scrape request to at least ${minTimeout}ms.`);
   }
 }
+
+export class IndexMissError extends Error {
+  constructor() {
+    super("Index doesn't have the page we're looking for");
+  }
+}
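
`IndexMissError` gives callers a typed way to tell "not cached" apart from a real failure. A minimal sketch of how a consumer might fall back to a live engine on a miss; the `liveScrape` parameter is hypothetical wiring, not part of this commit (import paths assume the sketch sits in the scrapeURL directory, next to error.ts):

import { scrapeURLWithIndex } from "./engines/index";
import { IndexMissError } from "./error";
import type { Meta } from ".";
import type { EngineScrapeResult } from "./engines";

// Try the cached index first; only scrape live when the page isn't indexed.
async function scrapeWithIndexFallback(
  meta: Meta,
  liveScrape: (meta: Meta) => Promise<EngineScrapeResult>,
): Promise<EngineScrapeResult> {
  try {
    return await scrapeURLWithIndex(meta);
  } catch (error) {
    if (error instanceof IndexMissError) {
      return await liveScrape(meta); // cache miss: fall through to a live engine
    }
    throw error; // real failures (EngineError, etc.) propagate unchanged
  }
}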
apps/api/src/services/index.ts (new file, +90)

@@ -0,0 +1,90 @@
+import { createClient, SupabaseClient } from "@supabase/supabase-js";
+import { logger } from "../lib/logger";
+import { configDotenv } from "dotenv";
+import { Storage } from "@google-cloud/storage";
+configDotenv();
+
+// SupabaseService class initializes the Supabase client conditionally based on environment variables.
+class IndexSupabaseService {
+  private client: SupabaseClient | null = null;
+
+  constructor() {
+    const supabaseUrl = process.env.INDEX_SUPABASE_URL;
+    const supabaseServiceToken = process.env.INDEX_SUPABASE_SERVICE_TOKEN;
+    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
+    // Only initialize the Supabase client if both URL and Service Token are provided.
+    if (!useDbAuthentication) {
+      // Warn the user that Authentication is disabled by setting the client to null
+      logger.warn(
+        "Authentication is disabled. Index supabase client will not be initialized.",
+      );
+      this.client = null;
+    } else if (!supabaseUrl || !supabaseServiceToken) {
+      logger.error(
+        "Index supabase environment variables aren't configured correctly. Index supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable",
+      );
+    } else {
+      this.client = createClient(supabaseUrl, supabaseServiceToken);
+    }
+  }
+
+  // Provides access to the initialized Supabase client, if available.
+  getClient(): SupabaseClient | null {
+    return this.client;
+  }
+}
+
+const serv = new IndexSupabaseService();
+
+// Using a Proxy to handle dynamic access to the Supabase client or service methods.
+// This approach ensures that if Supabase is not configured, any attempt to use it will result in a clear error.
+export const index_supabase_service: SupabaseClient = new Proxy(
+  serv,
+  {
+    get: function (target, prop, receiver) {
+      const client = target.getClient();
+      // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
+      if (client === null) {
+        return () => {
+          throw new Error("Index supabase client is not configured.");
+        };
+      }
+      // Direct access to SupabaseService properties takes precedence.
+      if (prop in target) {
+        return Reflect.get(target, prop, receiver);
+      }
+      // Otherwise, delegate access to the Supabase client.
+      return Reflect.get(client, prop, receiver);
+    },
+  },
+) as unknown as SupabaseClient;
+
+const credentials = process.env.GCS_CREDENTIALS ? JSON.parse(atob(process.env.GCS_CREDENTIALS)) : undefined;
+
+export async function getIndexFromGCS(url: string): Promise<any | null> {
+  // logger.info(`Getting f-engine document from GCS`, {
+  //   url,
+  // });
+  try {
+    if (!process.env.GCS_INDEX_BUCKET_NAME) {
+      return null;
+    }
+
+    const storage = new Storage({ credentials });
+    const bucket = storage.bucket(process.env.GCS_INDEX_BUCKET_NAME);
+    const blob = bucket.file(`${url}`);
+    const [exists] = await blob.exists();
+    if (!exists) {
+      return null;
+    }
+    const [blobContent] = await blob.download();
+    const parsed = JSON.parse(blobContent.toString());
+    return parsed;
+  } catch (error) {
+    logger.error(`Error getting f-engine document from GCS`, {
+      error,
+      url,
+    });
+    return null;
+  }
+}
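
The Proxy means callers can import `index_supabase_service` unconditionally and type it as a plain `SupabaseClient`; misconfiguration only surfaces, with a clear message, when something actually touches the client. A self-contained sketch of the same guard pattern, independent of Supabase and not taken from the commit:

// Generic form of the guard above: wrap a possibly-null client so that
// misconfiguration throws a clear error at call time, not at import time.
function guarded<T extends object>(getClient: () => T | null, name: string): T {
  return new Proxy({} as T, {
    get(_target, prop) {
      const client = getClient();
      if (client === null) {
        // Every property access on an unconfigured client yields a throwing stub.
        return () => {
          throw new Error(`${name} is not configured.`);
        };
      }
      return Reflect.get(client, prop);
    },
  });
}

// Usage mirrors index_supabase_service:
//   const db = guarded(() => serv.getClient(), "Index supabase client");

One configuration note: `GCS_CREDENTIALS` is run through `atob` before `JSON.parse`, so the service-account key presumably has to be stored base64-encoded (for example via `base64 -w0 service-account.json` on Linux) rather than as raw JSON.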
@@ -21,47 +21,6 @@ function cleanOfNull<T>(x: T): T {
   }
 }
 
-async function indexJob(job: FirecrawlJob): Promise<void> {
-  try {
-    if (job.mode !== "single_urls" && job.mode !== "scrape") {
-      return;
-    }
-
-    const response = await fetch(`${process.env.FIRE_INDEX_SERVER_URL}/api/jobs`, {
-      method: 'POST',
-      headers: {
-        'Content-Type': 'application/json',
-      },
-      body: JSON.stringify({
-        url: job.url,
-        mode: job.mode || "scrape",
-        docs: job.docs,
-        origin: job.origin,
-        success: job.success,
-        time_taken: job.time_taken,
-        num_tokens: job.num_tokens,
-        page_options: job.scrapeOptions,
-        date_added: new Date().toISOString(),
-      }),
-    });
-
-    if (!response.ok) {
-      const errorData = await response.json();
-      // logger.error(`Failed to send job to external server: ${response.status} ${response.statusText}`, {
-      //   error: errorData,
-      //   scrapeId: job.job_id,
-      // });
-    } else {
-      // logger.debug("Job sent to external server successfully!", { scrapeId: job.job_id });
-    }
-  } catch (error) {
-    // logger.error(`Error sending job to external server: ${error.message}`, {
-    //   error,
-    //   scrapeId: job.job_id,
-    // });
-  }
-}
-
 export async function logJob(job: FirecrawlJob, force: boolean = false) {
   try {
     const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
@@ -106,11 +65,6 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) {
     pdf_num_pages: job.pdf_num_pages ?? null,
   };
 
-  // Send job to external server
-  if (process.env.FIRE_INDEX_SERVER_URL) {
-    indexJob(job);
-  }
-
   if (process.env.GCS_BUCKET_NAME) {
     await saveJobToGCS(job);
   }
@@ -1046,25 +1046,6 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
   }
 }
 
-async function indexJob(job: Job & { id: string }, document: Document) {
-  if (
-    document &&
-    document.markdown &&
-    job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID!
-  ) {
-    // indexPage({
-    //   document: document,
-    //   originUrl: job.data.crawl_id
-    //     ? (await getCrawl(job.data.crawl_id))?.originUrl!
-    //     : document.metadata.sourceURL!,
-    //   crawlId: job.data.crawl_id,
-    //   teamId: job.data.team_id,
-    // }).catch((error) => {
-    //   _logger.error("Error indexing page", { error });
-    // });
-  }
-}
-
 async function processJob(job: Job & { id: string }, token: string) {
   const logger = _logger.child({
     module: "queue-worker",
@@ -1263,8 +1244,6 @@ async function processJob(job: Job & { id: string }, token: string) {
       true,
     );
 
-    indexJob(job, doc);
-
     logger.debug("Declaring job as done...");
     await addCrawlJobDone(job.data.crawl_id, job.id, true);
 
@@ -1381,8 +1360,6 @@ async function processJob(job: Job & { id: string }, token: string) {
       cost_tracking: costTracking,
       pdf_num_pages: doc.metadata.numPages,
     });
 
-    indexJob(job, doc);
-
   }
 
   if (job.data.is_scrape !== true) {