Mirror of https://git.mirrors.martin98.com/https://github.com/jina-ai/reader (synced 2025-08-18 06:55:55 +08:00)
fix: search descriptions
This commit is contained in:
  parent 165cce6c91
  commit a9936d322e
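
In short: this commit threads a per-request timeout through the whole stack — a new X-Timeout request header, a validated timeout property on CrawlerOptions, a timeoutMs field on ScrappingOptions, and a page.goto timeout override — raises reasonableDelayMs from 10 s to 15 s, and, per the title, fixes search descriptions by carrying each upstream search result's title and description into the formatted page.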
@@ -870,6 +870,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
         this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
         this.threadLocal.set('userAgent', opts.userAgent);
+        if (opts.timeout) {
+            this.threadLocal.set('timeout', opts.timeout * 1000);
+        }
 
         const crawlOpts: ExtraScrappingOptions = {
             proxyUrl: opts.proxyUrl,
@@ -878,6 +881,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
             waitForSelector: opts.waitForSelector,
             targetSelector: opts.targetSelector,
             overrideUserAgent: opts.userAgent,
+            timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
         };
 
         return crawlOpts;
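
Note for the two hunks above: the user-facing timeout is in seconds and is converted to milliseconds once before being handed to the scraper as timeoutMs. A minimal sketch of that conversion, with ExtraScrappingOptions trimmed to the fields visible in this diff (the real interface is larger, and toCrawlOpts is a stand-in name for the method these hunks live in):

    // Sketch only: ExtraScrappingOptions reduced to the fields this diff shows.
    interface ExtraScrappingOptions {
        proxyUrl?: string;
        waitForSelector?: string;
        targetSelector?: string;
        overrideUserAgent?: string;
        timeoutMs?: number;
    }

    // timeout arrives in seconds (capped at 180 by CrawlerOptions below).
    function toCrawlOpts(opts: { proxyUrl?: string; userAgent?: string; timeout?: number }): ExtraScrappingOptions {
        return {
            proxyUrl: opts.proxyUrl,
            overrideUserAgent: opts.userAgent,
            // undefined (not 0) when absent, so truthiness checks downstream stay false
            timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
        };
    }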
@@ -30,7 +30,7 @@ export class SearcherHost extends RPCHost {
     cacheValidMs = 1000 * 3600;
     pageCacheToleranceMs = 1000 * 3600 * 24;
 
-    reasonableDelayMs = 10_000;
+    reasonableDelayMs = 15_000;
 
     targetResultCount = 5;
 
@@ -163,6 +163,10 @@ export class SearcherHost extends RPCHost {
             throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
         }
 
+        if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
+            delete crawlOpts.timeoutMs;
+        }
+
         const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
             crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
         );
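
The search path discards any client timeout under 30 s rather than honoring it — presumably because a single search response fans out into several page fetches, and the 30_000 ms floor matches the page.goto default touched later in this commit. The guard in isolation:

    // Sketch: drop budgets too short for a multi-page search fetch.
    function sanitizeSearchTimeout(crawlOpts: { timeoutMs?: number }): void {
        if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
            delete crawlOpts.timeoutMs;  // fall back to the defaults
        }
    }

    const opts = { timeoutMs: 20_000 };
    sanitizeSearchTimeout(opts);  // opts.timeoutMs is now undefined; 45_000 would survive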
@@ -213,7 +217,7 @@ export class SearcherHost extends RPCHost {
                 chargeAmount = this.getChargeAmount(lastScrapped);
                 rpcReflect.return(lastScrapped);
                 earlyReturn = true;
-            }, this.reasonableDelayMs);
+            }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
         };
 
         for await (const scrapped of it) {
@@ -259,7 +263,7 @@ export class SearcherHost extends RPCHost {
                 chargeAmount = this.getChargeAmount(lastScrapped);
                 rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
                 earlyReturn = true;
-            }, this.reasonableDelayMs);
+            }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
         };
 
         for await (const scrapped of it) {
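
Both setTimeout hunks above switch the early-return delay to the same expression. Its evaluation is worth spelling out: (timeout || 0) * 1000 yields 0 when no timeout was supplied, and 0 is falsy, so the outer || falls back to the freshly raised 15 s default:

    // Sketch of the early-return delay used in both hunks above.
    const reasonableDelayMs = 15_000;

    function earlyReturnDelayMs(timeoutSeconds?: number): number {
        return ((timeoutSeconds || 0) * 1000) || reasonableDelayMs;
    }

    earlyReturnDelayMs(60);  // 60_000 — the client's budget wins
    earlyReturnDelayMs();    // 15_000 — (0 * 1000) is falsy, so the default applies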
@@ -317,7 +321,12 @@ export class SearcherHost extends RPCHost {
                     description: upstreamSearchResult.description,
                 };
             }
-            return this.crawler.formatSnapshot(mode, x, urls[i]);
+            return this.crawler.formatSnapshot(mode, x, urls[i]).then((r) => {
+                r.title ??= upstreamSearchResult.title;
+                r.description = upstreamSearchResult.description;
+
+                return r;
+            });
         });
 
         const resultArray = await Promise.all(mapped) as FormattedPage[];
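
This is the hunk the commit title refers to. Note the deliberate asymmetry in the backfill: ??= keeps a title the crawler extracted from the page and only falls back to the search result's title, while = overwrites the description unconditionally with the search engine's snippet. Reduced to the two fields involved:

    // Sketch: FormattedPage trimmed to the fields this hunk touches.
    interface FormattedPage { title?: string; description?: string; }

    function backfill(r: FormattedPage, upstream: { title?: string; description?: string }): FormattedPage {
        r.title ??= upstream.title;           // keep the scraped title when present
        r.description = upstream.description; // always take the search engine's description
        return r;
    }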
@@ -343,7 +352,7 @@ export class SearcherHost extends RPCHost {
             return {
                 ...x,
                 toString(this: any) {
-                    if (this.description) {
+                    if (!this.content && this.description) {
                         if (this.title) {
                             return `[${i + 1}] Title: ${this.title}
 [${i + 1}] URL Source: ${this.url}
@@ -355,6 +364,9 @@ export class SearcherHost extends RPCHost {
                     }
 
                     const mixins = [];
+                    if (this.description) {
+                        mixins.push(`[${i + 1}] Description: ${this.description}`);
+                    }
                     if (this.publishedTime) {
                         mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
                     }
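
Together with the toString change in the previous hunk, a result with page content now also renders its description as a numbered mixin line. Illustrative output for the first result (values are made up; the full template also includes the page content itself, elided here):

    [1] Title: Example Domain
    [1] URL Source: https://example.com/
    [1] Description: A snippet returned by the search engine.
    [1] Published Time: 2024-01-01T00:00:00Z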
@@ -91,6 +91,11 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
                     in: 'header',
                     schema: { type: 'string' }
                 },
+                'X-Timeout': {
+                    description: `Specify timeout in seconds. Max 180.`,
+                    in: 'header',
+                    schema: { type: 'string' }
+                },
             }
         }
     }
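
This registers the new X-Timeout header in the OpenAPI description: a string carrying whole seconds, capped at 180. A hypothetical client call (READER_BASE is a placeholder, not part of this diff):

    // Hypothetical usage; READER_BASE stands in for an actual deployment URL.
    const READER_BASE = 'https://reader.example.com';
    const res = await fetch(`${READER_BASE}/https://example.com/`, {
        headers: { 'X-Timeout': '60' },  // seconds; values outside (0, 180] fail validation
    });
    console.log(await res.text());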
@@ -142,6 +147,11 @@ export class CrawlerOptions extends AutoCastable {
     @Prop()
     userAgent?: string;
 
+    @Prop({
+        validate: (v: number) => v > 0 && v <= 180,
+    })
+    timeout?: number;
+
     static override from(input: any) {
         const instance = super.from(input) as CrawlerOptions;
         const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@@ -178,6 +188,11 @@ export class CrawlerOptions extends AutoCastable {
             instance.cacheTolerance = cacheTolerance;
         }
 
+        let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
+        if (!isNaN(timeoutSeconds)) {
+            instance.timeout = timeoutSeconds;
+        }
+
         const targetSelector = ctx?.req.get('x-target-selector');
         instance.targetSelector ??= targetSelector;
         const waitForSelector = ctx?.req.get('x-wait-for-selector');
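
Two details in the pair of hunks above: the @Prop validator accepts only 0 < v <= 180, and parseInt('') is NaN, so a request without an x-timeout header falls through the isNaN guard and leaves instance.timeout unset. The header-parsing step in isolation:

    // Sketch: parse the x-timeout header the way the hunk above does.
    function timeoutFromHeader(raw: string | undefined): number | undefined {
        const timeoutSeconds = parseInt(raw || '');
        return isNaN(timeoutSeconds) ? undefined : timeoutSeconds;
    }

    timeoutFromHeader('60');      // 60
    timeoutFromHeader(undefined); // undefined — header absent, keep defaults
    timeoutFromHeader('abc');     // undefined — parseInt('abc') is NaN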
@@ -66,6 +66,7 @@ export interface ScrappingOptions {
     waitForSelector?: string;
     minIntervalMs?: number;
     overrideUserAgent?: string;
+    timeoutMs?: number;
 }
 
@@ -449,7 +450,10 @@ document.addEventListener('load', handlePageLoad);
             );
         });
 
-        const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
+        const gotoPromise = page.goto(url, {
+            waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
+            timeout: options?.timeoutMs || 30_000
+        })
             .catch((err) => {
                 this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
                 return Promise.reject(new AssertionFailureError({