async saving to index

This commit is contained in:
Gergő Móricz 2025-06-03 21:16:13 +02:00
parent d1b5e2ef47
commit 60525220a2

View File

@@ -20,56 +20,64 @@ export async function sendDocumentToIndex(meta: Meta, document: Document) {
return document; return document;
} }
const normalizedURL = normalizeURLForIndex(meta.url); (async () => {
const urlHash = await hashURL(normalizedURL); try {
const normalizedURL = normalizeURLForIndex(meta.url);
const urlHash = await hashURL(normalizedURL);
const urlSplits = generateURLSplits(normalizedURL); const urlSplits = generateURLSplits(normalizedURL);
const urlSplitsHash = await Promise.all(urlSplits.map(split => hashURL(split))); const urlSplitsHash = await Promise.all(urlSplits.map(split => hashURL(split)));
const indexId = crypto.randomUUID(); const indexId = crypto.randomUUID();
try { try {
await saveIndexToGCS(indexId, { await saveIndexToGCS(indexId, {
url: normalizedURL, url: normalizedURL,
html: document.rawHtml!, html: document.rawHtml!,
statusCode: document.metadata.statusCode, statusCode: document.metadata.statusCode,
error: document.metadata.error, error: document.metadata.error,
screenshot: document.screenshot, screenshot: document.screenshot,
numPages: document.metadata.numPages, numPages: document.metadata.numPages,
}); });
} catch (error) { } catch (error) {
meta.logger.error("Failed to save document to index", { meta.logger.error("Failed to save document to index", {
error, error,
}); });
return document; return document;
} }
try { try {
await addIndexInsertJob({ await addIndexInsertJob({
id: indexId, id: indexId,
url: normalizedURL, url: normalizedURL,
url_hash: urlHash, url_hash: urlHash,
url_splits: urlSplits, url_splits: urlSplits,
url_splits_hash: urlSplitsHash, url_splits_hash: urlSplitsHash,
original_url: document.metadata.sourceURL ?? meta.url, original_url: document.metadata.sourceURL ?? meta.url,
resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.url, resolved_url: document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
has_screenshot: document.screenshot !== undefined && meta.featureFlags.has("screenshot"), has_screenshot: document.screenshot !== undefined && meta.featureFlags.has("screenshot"),
has_screenshot_fullscreen: document.screenshot !== undefined && meta.featureFlags.has("screenshot@fullScreen"), has_screenshot_fullscreen: document.screenshot !== undefined && meta.featureFlags.has("screenshot@fullScreen"),
is_mobile: meta.options.mobile, is_mobile: meta.options.mobile,
block_ads: meta.options.blockAds, block_ads: meta.options.blockAds,
location_country: meta.options.location?.country ?? null, location_country: meta.options.location?.country ?? null,
location_languages: meta.options.location?.languages ?? null, location_languages: meta.options.location?.languages ?? null,
status: document.metadata.statusCode, status: document.metadata.statusCode,
...(urlSplitsHash.slice(0, 10).reduce((a,x,i) => ({ ...(urlSplitsHash.slice(0, 10).reduce((a,x,i) => ({
...a, ...a,
[`url_split_${i}_hash`]: x, [`url_split_${i}_hash`]: x,
}), {})), }), {})),
}); });
} catch (error) { } catch (error) {
meta.logger.error("Failed to add document to index insert queue", { meta.logger.error("Failed to add document to index insert queue", {
error, error,
}); });
} }
} catch (error) {
meta.logger.error("Failed to save document to index (outer)", {
error,
});
}
})();
return document; return document;
} }