Nick: fixed

This commit is contained in:
parent a4f7c38834
commit c655c6859f
7  apps/api/src/lib/canonical-url.ts  (new file)
@@ -0,0 +1,7 @@
+export function normalizeUrl(url: string) {
+  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+  if (url.endsWith("/")) {
+    url = url.slice(0, -1);
+  }
+  return url;
+}
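For reference (not part of the commit), a quick sketch of what normalizeUrl produces for a few representative inputs; the outputs follow directly from the two replace() calls and the trailing-slash check above. The relative import path is assumed for the example.

import { normalizeUrl } from "./canonical-url";

// The protocol, a leading "www.", and a single trailing slash are stripped;
// paths and query strings are left as-is.
console.log(normalizeUrl("https://www.example.com/")); // "example.com"
console.log(normalizeUrl("http://example.com/docs/")); // "example.com/docs"
console.log(normalizeUrl("example.com/page?x=1")); // "example.com/page?x=1"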
@@ -51,6 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import { supabase_service } from "../services/supabase";
+import { normalizeUrl } from "../lib/canonical-url";

 configDotenv();

@@ -78,44 +79,57 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;

 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
     (async () => {
+      const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
       // Get all visited URLs from Redis
-      const visitedUrls = await redisConnection.smembers("crawl:" + job.data.crawl_id + ":visited");
-
+      const visitedUrls = await redisConnection.smembers(
+        "crawl:" + job.data.crawl_id + ":visited",
+      );
       // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
-      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) {
+      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
         // Fire and forget the upload to Supabase
         try {
+          // Standardize URLs to canonical form (https, no www)
+          const standardizedUrls = [
+            ...new Set(
+              visitedUrls.map((url) => {
+                return normalizeUrl(url);
+              }),
+            ),
+          ];
          // First check if entry exists for this origin URL
           const { data: existingMap } = await supabase_service
-            .from('crawl_maps')
-            .select('urls')
-            .eq('origin_url', sc.originUrl)
+            .from("crawl_maps")
+            .select("urls")
+            .eq("origin_url", originUrl)
             .single();

           if (existingMap) {
             // Merge URLs, removing duplicates
-            const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])];
+            const mergedUrls = [
+              ...new Set([...existingMap.urls, ...standardizedUrls]),
+            ];

             const { error } = await supabase_service
-              .from('crawl_maps')
+              .from("crawl_maps")
               .update({
                 urls: mergedUrls,
                 num_urls: mergedUrls.length,
-                updated_at: new Date().toISOString()
+                updated_at: new Date().toISOString(),
               })
-              .eq('origin_url', sc.originUrl);
+              .eq("origin_url", originUrl);

             if (error) {
               _logger.error("Failed to update crawl map", { error });
             }
           } else {
             // Insert new entry if none exists
-            const { error } = await supabase_service
-              .from('crawl_maps')
-              .insert({
-                origin_url: sc.originUrl,
-                urls: visitedUrls,
-                num_urls: visitedUrls.length,
-                created_at: new Date().toISOString()
+            const { error } = await supabase_service.from("crawl_maps").insert({
+              origin_url: originUrl,
+              urls: standardizedUrls,
+              num_urls: standardizedUrls.length,
+              created_at: new Date().toISOString(),
+              updated_at: new Date().toISOString(),
+            });

             if (error) {
@@ -126,6 +140,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
           _logger.error("Error saving crawl map", { error });
         }
       }
     })();
+
     if (!job.data.v1) {
       const jobIDs = await getCrawlJobs(job.data.crawl_id);
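A small illustrative sketch (not part of the diff) of the dedup-and-merge behavior in the hunks above: visited URLs are normalized first, so variants that collapse to the same canonical form are stored once, and merging with an already-stored crawl_maps row then only drops exact duplicates. The sample values and the import path are hypothetical.

import { normalizeUrl } from "./lib/canonical-url"; // import path assumed for the sketch

// Hypothetical raw URLs collected during a crawl.
const visitedUrls = [
  "https://www.example.com/",
  "http://example.com",
  "https://example.com/blog/",
];

// Same Set-based dedup as in the worker code above.
const standardizedUrls = [...new Set(visitedUrls.map((url) => normalizeUrl(url)))];
// -> ["example.com", "example.com/blog"]

// Hypothetical urls column of an existing crawl_maps row (already normalized).
const existingUrls = ["example.com", "example.com/pricing"];
const mergedUrls = [...new Set([...existingUrls, ...standardizedUrls])];
// -> ["example.com", "example.com/pricing", "example.com/blog"]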