fix: search descriptions

This commit is contained in:
yanlong.wang 2024-06-05 19:47:04 +08:00
parent 165cce6c91
commit a9936d322e
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 41 additions and 6 deletions

View File

@ -870,6 +870,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
this.threadLocal.set('userAgent', opts.userAgent);
if (opts.timeout) {
this.threadLocal.set('timeout', opts.timeout * 1000);
}
const crawlOpts: ExtraScrappingOptions = {
proxyUrl: opts.proxyUrl,
@ -878,6 +881,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
waitForSelector: opts.waitForSelector,
targetSelector: opts.targetSelector,
overrideUserAgent: opts.userAgent,
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
};
return crawlOpts;

View File

@ -30,7 +30,7 @@ export class SearcherHost extends RPCHost {
cacheValidMs = 1000 * 3600;
pageCacheToleranceMs = 1000 * 3600 * 24;
reasonableDelayMs = 10_000;
reasonableDelayMs = 15_000;
targetResultCount = 5;
@ -163,6 +163,10 @@ export class SearcherHost extends RPCHost {
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
}
if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
delete crawlOpts.timeoutMs;
}
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
);
@ -213,7 +217,7 @@ export class SearcherHost extends RPCHost {
chargeAmount = this.getChargeAmount(lastScrapped);
rpcReflect.return(lastScrapped);
earlyReturn = true;
}, this.reasonableDelayMs);
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
};
for await (const scrapped of it) {
@ -259,7 +263,7 @@ export class SearcherHost extends RPCHost {
chargeAmount = this.getChargeAmount(lastScrapped);
rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
earlyReturn = true;
}, this.reasonableDelayMs);
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
};
for await (const scrapped of it) {
@ -317,7 +321,12 @@ export class SearcherHost extends RPCHost {
description: upstreamSearchResult.description,
};
}
return this.crawler.formatSnapshot(mode, x, urls[i]);
return this.crawler.formatSnapshot(mode, x, urls[i]).then((r) => {
r.title ??= upstreamSearchResult.title;
r.description = upstreamSearchResult.description;
return r;
});
});
const resultArray = await Promise.all(mapped) as FormattedPage[];
@ -343,7 +352,7 @@ export class SearcherHost extends RPCHost {
return {
...x,
toString(this: any) {
if (this.description) {
if (!this.content && this.description) {
if (this.title) {
return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url}
@ -355,6 +364,9 @@ export class SearcherHost extends RPCHost {
}
const mixins = [];
if (this.description) {
mixins.push(`[${i + 1}] Description: ${this.description}`);
}
if (this.publishedTime) {
mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
}

View File

@ -91,6 +91,11 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
in: 'header',
schema: { type: 'string' }
},
'X-Timeout': {
description: `Specify timeout in seconds. Max 180.`,
in: 'header',
schema: { type: 'string' }
},
}
}
}
@ -142,6 +147,11 @@ export class CrawlerOptions extends AutoCastable {
@Prop()
userAgent?: string;
/**
 * Caller-requested timeout, expressed in seconds.
 *
 * Accepted only when strictly positive and no greater than 180.
 */
@Prop({
    validate: (seconds: number) => 0 < seconds && seconds <= 180,
})
timeout?: number;
static override from(input: any) {
const instance = super.from(input) as CrawlerOptions;
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@ -178,6 +188,11 @@ export class CrawlerOptions extends AutoCastable {
instance.cacheTolerance = cacheTolerance;
}
// Optional `X-Timeout` request header: timeout in whole seconds.
// This assignment happens after `super.from(input)` has already built the
// instance, so the bounds are checked here explicitly, using the same range as
// the `timeout` property's validator (0 < v <= 180).
// Missing, non-numeric (NaN fails both comparisons) and out-of-range values are
// ignored, leaving any previously set `instance.timeout` untouched.
const timeoutSeconds = Number.parseInt(ctx?.req.get('x-timeout') || '', 10);
if (timeoutSeconds > 0 && timeoutSeconds <= 180) {
    instance.timeout = timeoutSeconds;
}
const targetSelector = ctx?.req.get('x-target-selector');
instance.targetSelector ??= targetSelector;
const waitForSelector = ctx?.req.get('x-wait-for-selector');

View File

@ -66,6 +66,7 @@ export interface ScrappingOptions {
waitForSelector?: string;
minIntervalMs?: number;
overrideUserAgent?: string;
timeoutMs?: number;
}
@ -449,7 +450,10 @@ document.addEventListener('load', handlePageLoad);
);
});
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
const gotoPromise = page.goto(url, {
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
timeout: options?.timeoutMs || 30_000
})
.catch((err) => {
this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
return Promise.reject(new AssertionFailureError({