fix(scrapeURL): better timeToRun distribution

This commit is contained in:
Gergő Móricz 2024-12-16 23:01:34 +01:00
parent 0013bdfcb4
commit 284a6ccedd

View File

@@ -203,15 +203,20 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
const results: EngineResultsTracker = {}; const results: EngineResultsTracker = {};
let result: EngineScrapeResultWithContext | null = null; let result: EngineScrapeResultWithContext | null = null;
const timeToRun = meta.options.timeout !== undefined let ttrInstanceCount = Math.min(fallbackList.length, 3);
? Math.round(meta.options.timeout / Math.min(fallbackList.length, 3)) let ttrRatios = new Array(ttrInstanceCount).fill(0).map((_, i) => ttrInstanceCount - i);
: undefined let ttrRatioSum = ttrRatios.reduce((a, x) => a + x, 0);
for (const { engine, unsupportedFeatures } of fallbackList) { const timeToRun = meta.options.timeout !== undefined
? ttrRatios.map(ratio => Math.round(meta.options.timeout! * ratio / ttrRatioSum)).map(ratio => isNaN(ratio) ? undefined : ratio)
: [undefined]
for (const i in fallbackList) {
const { engine, unsupportedFeatures } = fallbackList[i];
const startedAt = Date.now(); const startedAt = Date.now();
try { try {
meta.logger.info("Scraping via " + engine + "..."); meta.logger.info("Scraping via " + engine + "...");
const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun); const _engineResult = await scrapeURLWithEngine(meta, engine, timeToRun[i] ?? timeToRun.slice(-1)[0]);
if (_engineResult.markdown === undefined) { if (_engineResult.markdown === undefined) {
// Some engines emit Markdown directly. // Some engines emit Markdown directly.
_engineResult.markdown = await parseMarkdown(_engineResult.html); _engineResult.markdown = await parseMarkdown(_engineResult.html);