Nick: fixed
This commit is contained in:
parent
a4f7c38834
commit
c655c6859f
apps/api/src/lib/canonical-url.ts (new file, 7 additions)
@@ -0,0 +1,7 @@
+export function normalizeUrl(url: string) {
+  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+  if (url.endsWith("/")) {
+    url = url.slice(0, -1);
+  }
+  return url;
+}
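For reference, a minimal sketch of how the new helper behaves on a few representative inputs; the expected values follow directly from the regex and trailing-slash logic above, and the import path assumes a caller sitting next to canonical-url.ts:

import { normalizeUrl } from "./canonical-url";

// The scheme and a leading "www." are stripped, and a single trailing slash is removed.
normalizeUrl("https://www.example.com/");   // "example.com"
normalizeUrl("http://example.com/docs/");   // "example.com/docs"
// Only a leading "www." is touched; other subdomains are preserved.
normalizeUrl("https://blog.example.com");   // "blog.example.com"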
@@ -51,6 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import { supabase_service } from "../services/supabase";
+import { normalizeUrl } from "../lib/canonical-url";
 
 configDotenv();
 
@@ -78,54 +79,68 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
 
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
-    // Get all visited URLs from Redis
-    const visitedUrls = await redisConnection.smembers("crawl:" + job.data.crawl_id + ":visited");
-    // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
-    if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) {
-      try {
-        // First check if entry exists for this origin URL
-        const { data: existingMap } = await supabase_service
-          .from('crawl_maps')
-          .select('urls')
-          .eq('origin_url', sc.originUrl)
-          .single();
-
-        if (existingMap) {
-          // Merge URLs, removing duplicates
-          const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])];
-
-          const { error } = await supabase_service
-            .from('crawl_maps')
-            .update({
-              urls: mergedUrls,
-              num_urls: mergedUrls.length,
-              updated_at: new Date().toISOString()
-            })
-            .eq('origin_url', sc.originUrl);
-
-          if (error) {
-            _logger.error("Failed to update crawl map", { error });
-          }
-        } else {
-          // Insert new entry if none exists
-          const { error } = await supabase_service
-            .from('crawl_maps')
-            .insert({
-              origin_url: sc.originUrl,
-              urls: visitedUrls,
-              num_urls: visitedUrls.length,
-              created_at: new Date().toISOString()
-            });
-
-          if (error) {
-            _logger.error("Failed to save crawl map", { error });
-          }
-        }
-      } catch (error) {
-        _logger.error("Error saving crawl map", { error });
-      }
-    }
+    (async () => {
+      const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
+      // Get all visited URLs from Redis
+      const visitedUrls = await redisConnection.smembers(
+        "crawl:" + job.data.crawl_id + ":visited",
+      );
+      // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
+      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+        // Fire and forget the upload to Supabase
+        try {
+          // Standardize URLs to canonical form (https, no www)
+          const standardizedUrls = [
+            ...new Set(
+              visitedUrls.map((url) => {
+                return normalizeUrl(url);
+              }),
+            ),
+          ];
+          // First check if entry exists for this origin URL
+          const { data: existingMap } = await supabase_service
+            .from("crawl_maps")
+            .select("urls")
+            .eq("origin_url", originUrl)
+            .single();
+
+          if (existingMap) {
+            // Merge URLs, removing duplicates
+            const mergedUrls = [
+              ...new Set([...existingMap.urls, ...standardizedUrls]),
+            ];
+
+            const { error } = await supabase_service
+              .from("crawl_maps")
+              .update({
+                urls: mergedUrls,
+                num_urls: mergedUrls.length,
+                updated_at: new Date().toISOString(),
+              })
+              .eq("origin_url", originUrl);
+
+            if (error) {
+              _logger.error("Failed to update crawl map", { error });
+            }
+          } else {
+            // Insert new entry if none exists
+            const { error } = await supabase_service.from("crawl_maps").insert({
+              origin_url: originUrl,
+              urls: standardizedUrls,
+              num_urls: standardizedUrls.length,
+              created_at: new Date().toISOString(),
+              updated_at: new Date().toISOString(),
+            });
+
+            if (error) {
+              _logger.error("Failed to save crawl map", { error });
+            }
+          }
+        } catch (error) {
+          _logger.error("Error saving crawl map", { error });
+        }
+      }
+    })();
 
     if (!job.data.v1) {
       const jobIDs = await getCrawlJobs(job.data.crawl_id);
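This hunk has two effects worth spelling out: every URL is passed through normalizeUrl before it is compared or stored, so variants such as "https://www.example.com/" and "example.com" map to the same crawl_maps row, and the Supabase write now happens inside an immediately-invoked async function, so finishCrawlIfNeeded no longer waits for it. A condensed, self-contained sketch of the merge-or-insert flow follows; the table and column names come from the diff, while upsertCrawlMap and its supabase parameter are illustrative only (the real code uses the shared supabase_service client):

import { SupabaseClient } from "@supabase/supabase-js";
import { normalizeUrl } from "../lib/canonical-url";

// Hypothetical helper mirroring the logic inside finishCrawlIfNeeded:
// canonicalize and dedupe the URLs, then update an existing crawl_maps row
// for this origin or insert a fresh one.
async function upsertCrawlMap(
  supabase: SupabaseClient,
  originUrl: string,
  visitedUrls: string[],
) {
  const origin = normalizeUrl(originUrl);
  const standardizedUrls = [...new Set(visitedUrls.map((url) => normalizeUrl(url)))];

  // Check whether a map already exists for this origin URL.
  const { data } = await supabase
    .from("crawl_maps")
    .select("urls")
    .eq("origin_url", origin)
    .single();
  const existingMap = data as { urls: string[] } | null;

  if (existingMap) {
    // Merge with the stored URLs, removing duplicates, and refresh the counters.
    const mergedUrls = [...new Set([...existingMap.urls, ...standardizedUrls])];
    return supabase
      .from("crawl_maps")
      .update({
        urls: mergedUrls,
        num_urls: mergedUrls.length,
        updated_at: new Date().toISOString(),
      })
      .eq("origin_url", origin);
  }

  // No row yet for this origin: insert a new one.
  return supabase.from("crawl_maps").insert({
    origin_url: origin,
    urls: standardizedUrls,
    num_urls: standardizedUrls.length,
    created_at: new Date().toISOString(),
    updated_at: new Date().toISOString(),
  });
}

In the worker itself this logic runs inside (async () => { ... })() and is not awaited, so a slow or failing Supabase write cannot delay crawl completion; failures are only logged via _logger.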
|