mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 07:19:03 +08:00

Nick: map improvements

This commit is contained in:
parent c19af6ef42
commit 513f61a2d1
@@ -226,6 +226,8 @@ export async function getMapResults({
     ? links
     : links.slice(0, limit);
 
+  //
+
   return {
     success: true,
     links: linksToReturn,
@@ -12,19 +12,33 @@ import { withAuth } from "../../lib/withAuth";
 async function querySitemapIndexFunction(url: string) {
   const originUrl = normalizeUrlOnlyHostname(url);
 
-  const { data, error } = await supabase_service
-    .from("crawl_maps")
-    .select("urls")
-    .eq("origin_url", originUrl);
-
-  if (error) {
-    logger.error("(sitemap-index) Error querying the index", { error });
-    return [];
-  }
-
-  const allUrls = data.map((entry) => entry.urls).flat();
-
-  return allUrls;
+  for (let attempt = 1; attempt <= 3; attempt++) {
+    try {
+      const { data, error } = await supabase_service
+        .from("crawl_maps")
+        .select("urls")
+        .eq("origin_url", originUrl);
+
+      if (error) {
+        throw error;
+      }
+
+      const allUrls = data.map((entry) => entry.urls).flat();
+      return allUrls;
+
+    } catch (error) {
+      logger.error("(sitemap-index) Error querying the index", {
+        error,
+        attempt
+      });
+
+      if (attempt === 3) {
+        return [];
+      }
+    }
+  }
+
+  return [];
 }
 
 export const querySitemapIndex = withAuth(querySitemapIndexFunction, []);
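The retry change above means querySitemapIndexFunction no longer gives up on the first Supabase error: a failed crawl_maps lookup is thrown into the catch block, logged with the attempt number, and retried up to three times before falling back to an empty list. A minimal sketch of a caller, assuming only what the diff shows (querySitemapIndex resolves to a string[] and to [] after three failures); the caller itself is hypothetical and not part of this commit:

// Hypothetical caller; the diff does not show where querySitemapIndex is consumed.
async function getIndexedLinks(url: string): Promise<string[]> {
  // Resolves to [] if all three attempts against crawl_maps fail, so no try/catch is needed here.
  const indexed = await querySitemapIndex(url);
  return [...new Set(indexed)]; // de-duplicate before merging into map results
}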
apps/api/src/services/indexing/crawl-maps-index.ts (new file, 62 lines)
@@ -0,0 +1,62 @@
import { logger } from "../../lib/logger";
import {
  normalizeUrl,
  normalizeUrlOnlyHostname,
} from "../../lib/canonical-url";
import { supabase_service } from "../supabase";

export async function saveCrawlMap(originUrl: string, visitedUrls: string[]) {
  originUrl = normalizeUrlOnlyHostname(originUrl);
  // Fire and forget the upload to Supabase
  try {
    // Standardize URLs to canonical form (https, no www)
    const standardizedUrls = [
      ...new Set(
        visitedUrls.map((url) => {
          return normalizeUrl(url);
        }),
      ),
    ];
    // First check if entry exists for this origin URL
    const { data: existingMap } = await supabase_service
      .from("crawl_maps")
      .select("urls")
      .eq("origin_url", originUrl)
      .single();

    if (existingMap) {
      // Merge URLs, removing duplicates
      const mergedUrls = [
        ...new Set([...existingMap.urls, ...standardizedUrls]),
      ];

      const { error } = await supabase_service
        .from("crawl_maps")
        .update({
          urls: mergedUrls,
          num_urls: mergedUrls.length,
          updated_at: new Date().toISOString(),
        })
        .eq("origin_url", originUrl);

      if (error) {
        logger.error("Failed to update crawl map", { error });
      }
    } else {
      // Insert new entry if none exists
      const { error } = await supabase_service.from("crawl_maps").insert({
        origin_url: originUrl,
        urls: standardizedUrls,
        num_urls: standardizedUrls.length,
        created_at: new Date().toISOString(),
        updated_at: new Date().toISOString(),
      });

      if (error) {
        logger.error("Failed to save crawl map", { error });
      }
    }
  } catch (error) {
    logger.error("Error saving crawl map", { error });
  }
}
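saveCrawlMap upserts into a crawl_maps table keyed by the normalized origin hostname: existing rows have their URL lists merged and de-duplicated, new origins get a fresh row. For reference, the row shape implied by the insert and update calls above, written out as a type; this is inferred from the diff, not a schema taken from the repo:

// Inferred from the columns saveCrawlMap reads and writes; hypothetical type, not part of the commit.
interface CrawlMapRow {
  origin_url: string; // normalized hostname form of the crawl origin
  urls: string[]; // canonicalized URLs visited during the crawl
  num_urls: number; // urls.length, stored denormalized
  created_at: string; // ISO timestamp set on insert
  updated_at: string; // ISO timestamp refreshed on every update
}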
@@ -61,6 +61,7 @@ import { supabase_service } from "../services/supabase";
 import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";
 import { saveExtract, updateExtract } from "../lib/extract/extract-redis";
 import { billTeam } from "./billing/credit_billing";
+import { saveCrawlMap } from "./indexing/crawl-maps-index";
 
 configDotenv();
@@ -102,58 +103,9 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
       job.data.crawlerOptions !== null &&
       originUrl
     ) {
-      // Fire and forget the upload to Supabase
-      try {
-        // Standardize URLs to canonical form (https, no www)
-        const standardizedUrls = [
-          ...new Set(
-            visitedUrls.map((url) => {
-              return normalizeUrl(url);
-            }),
-          ),
-        ];
-        // First check if entry exists for this origin URL
-        const { data: existingMap } = await supabase_service
-          .from("crawl_maps")
-          .select("urls")
-          .eq("origin_url", originUrl)
-          .single();
-
-        if (existingMap) {
-          // Merge URLs, removing duplicates
-          const mergedUrls = [
-            ...new Set([...existingMap.urls, ...standardizedUrls]),
-          ];
-
-          const { error } = await supabase_service
-            .from("crawl_maps")
-            .update({
-              urls: mergedUrls,
-              num_urls: mergedUrls.length,
-              updated_at: new Date().toISOString(),
-            })
-            .eq("origin_url", originUrl);
-
-          if (error) {
-            _logger.error("Failed to update crawl map", { error });
-          }
-        } else {
-          // Insert new entry if none exists
-          const { error } = await supabase_service.from("crawl_maps").insert({
-            origin_url: originUrl,
-            urls: standardizedUrls,
-            num_urls: standardizedUrls.length,
-            created_at: new Date().toISOString(),
-            updated_at: new Date().toISOString(),
-          });
-
-          if (error) {
-            _logger.error("Failed to save crawl map", { error });
-          }
-        }
-      } catch (error) {
-        _logger.error("Error saving crawl map", { error });
-      }
+      saveCrawlMap(originUrl, visitedUrls).catch((e) => {
+        _logger.error("Error saving crawl map", { error: e });
+      });
     }
   })();
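The inline Supabase upsert block in finishCrawlIfNeeded is replaced by a single call to the new saveCrawlMap helper. The call is fire-and-forget: it is not awaited, and the trailing .catch is what keeps a Supabase failure from surfacing as an unhandled promise rejection. A stripped-down sketch of the same pattern with hypothetical names:

// Hypothetical standalone sketch of the fire-and-forget pattern used above; not part of the commit.
function fireAndForget(task: Promise<unknown>, onError: (e: unknown) => void): void {
  // The caller does not await the promise; the catch stops failures from escaping as unhandled rejections.
  void task.catch(onError);
}

// Usage mirroring the diff:
// fireAndForget(saveCrawlMap(originUrl, visitedUrls), (e) =>
//   _logger.error("Error saving crawl map", { error: e }),
// );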