feat(scrape): add error tallying instead of empty response

This commit is contained in:
Gergő Móricz 2024-09-19 19:21:13 +02:00
parent 712ca31615
commit 01f42b980d
2 changed files with 21 additions and 8 deletions

View File

@@ -64,22 +64,21 @@ export async function scrapeController(
success: false,
error: "Request timed out",
});
} else {
} else if (typeof e === "string" && e.startsWith("{\"type\":\"all\",")) {
return res.status(500).json({
success: false,
error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
extractorOptions && extractorOptions.mode !== "markdown"
? " - Could be due to LLM parsing issues"
: ""
}`,
error: "All scraping methods failed for URL: " + req.body.url,
details: JSON.parse(e).errors as string[],
});
} else {
throw e;
}
}
await job.remove();
if (!doc) {
console.error("!!! PANIC DOC IS", doc, job);
// console.error("!!! PANIC DOC IS", doc, job);
return res.status(200).json({
success: true,
warning: "No page found",

View File

@@ -351,6 +351,9 @@ export async function scrapSingleUrl(
pageStatusCode: 200,
pageError: undefined,
};
const errors: Record<string, string> = {};
try {
let urlKey = urlToScrap;
try {
@@ -392,6 +395,12 @@ export async function scrapSingleUrl(
pageError = undefined;
}
if (attempt.pageError) {
errors[scraper] = attempt.pageError;
} else {
errors[scraper] = null;
}
if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
break;
@@ -443,12 +452,17 @@ export async function scrapSingleUrl(
return document;
} catch (error) {
Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
Logger.error(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
ScrapeEvents.insert(jobId, {
type: "error",
message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
stack: error.stack,
});
if (error instanceof Error && error.message.startsWith("All scraping methods failed")) {
throw new Error(JSON.stringify({"type": "all", "errors": Object.values(errors)}));
}
return {
content: "",
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,