mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-19 01:25:51 +08:00
fix: search descriptions
This commit is contained in:
parent
165cce6c91
commit
a9936d322e
@ -870,6 +870,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
||||||
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
||||||
this.threadLocal.set('userAgent', opts.userAgent);
|
this.threadLocal.set('userAgent', opts.userAgent);
|
||||||
|
if (opts.timeout) {
|
||||||
|
this.threadLocal.set('timeout', opts.timeout * 1000);
|
||||||
|
}
|
||||||
|
|
||||||
const crawlOpts: ExtraScrappingOptions = {
|
const crawlOpts: ExtraScrappingOptions = {
|
||||||
proxyUrl: opts.proxyUrl,
|
proxyUrl: opts.proxyUrl,
|
||||||
@ -878,6 +881,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
waitForSelector: opts.waitForSelector,
|
waitForSelector: opts.waitForSelector,
|
||||||
targetSelector: opts.targetSelector,
|
targetSelector: opts.targetSelector,
|
||||||
overrideUserAgent: opts.userAgent,
|
overrideUserAgent: opts.userAgent,
|
||||||
|
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
||||||
};
|
};
|
||||||
|
|
||||||
return crawlOpts;
|
return crawlOpts;
|
||||||
|
@ -30,7 +30,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
cacheValidMs = 1000 * 3600;
|
cacheValidMs = 1000 * 3600;
|
||||||
pageCacheToleranceMs = 1000 * 3600 * 24;
|
pageCacheToleranceMs = 1000 * 3600 * 24;
|
||||||
|
|
||||||
reasonableDelayMs = 10_000;
|
reasonableDelayMs = 15_000;
|
||||||
|
|
||||||
targetResultCount = 5;
|
targetResultCount = 5;
|
||||||
|
|
||||||
@ -163,6 +163,10 @@ export class SearcherHost extends RPCHost {
|
|||||||
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
|
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
|
||||||
|
delete crawlOpts.timeoutMs;
|
||||||
|
}
|
||||||
|
|
||||||
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
||||||
crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
|
crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
|
||||||
);
|
);
|
||||||
@ -213,7 +217,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
chargeAmount = this.getChargeAmount(lastScrapped);
|
chargeAmount = this.getChargeAmount(lastScrapped);
|
||||||
rpcReflect.return(lastScrapped);
|
rpcReflect.return(lastScrapped);
|
||||||
earlyReturn = true;
|
earlyReturn = true;
|
||||||
}, this.reasonableDelayMs);
|
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
||||||
};
|
};
|
||||||
|
|
||||||
for await (const scrapped of it) {
|
for await (const scrapped of it) {
|
||||||
@ -259,7 +263,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
chargeAmount = this.getChargeAmount(lastScrapped);
|
chargeAmount = this.getChargeAmount(lastScrapped);
|
||||||
rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
|
rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
|
||||||
earlyReturn = true;
|
earlyReturn = true;
|
||||||
}, this.reasonableDelayMs);
|
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
||||||
};
|
};
|
||||||
|
|
||||||
for await (const scrapped of it) {
|
for await (const scrapped of it) {
|
||||||
@ -317,7 +321,12 @@ export class SearcherHost extends RPCHost {
|
|||||||
description: upstreamSearchResult.description,
|
description: upstreamSearchResult.description,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
return this.crawler.formatSnapshot(mode, x, urls[i]);
|
return this.crawler.formatSnapshot(mode, x, urls[i]).then((r) => {
|
||||||
|
r.title ??= upstreamSearchResult.title;
|
||||||
|
r.description = upstreamSearchResult.description;
|
||||||
|
|
||||||
|
return r;
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
const resultArray = await Promise.all(mapped) as FormattedPage[];
|
const resultArray = await Promise.all(mapped) as FormattedPage[];
|
||||||
@ -343,7 +352,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
return {
|
return {
|
||||||
...x,
|
...x,
|
||||||
toString(this: any) {
|
toString(this: any) {
|
||||||
if (this.description) {
|
if (!this.content && this.description) {
|
||||||
if (this.title) {
|
if (this.title) {
|
||||||
return `[${i + 1}] Title: ${this.title}
|
return `[${i + 1}] Title: ${this.title}
|
||||||
[${i + 1}] URL Source: ${this.url}
|
[${i + 1}] URL Source: ${this.url}
|
||||||
@ -355,6 +364,9 @@ export class SearcherHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const mixins = [];
|
const mixins = [];
|
||||||
|
if (this.description) {
|
||||||
|
mixins.push(`[${i + 1}] Description: ${this.description}`);
|
||||||
|
}
|
||||||
if (this.publishedTime) {
|
if (this.publishedTime) {
|
||||||
mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
|
mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
|
||||||
}
|
}
|
||||||
|
@ -91,6 +91,11 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
|
'X-Timeout': {
|
||||||
|
description: `Specify timeout in seconds. Max 180.`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -142,6 +147,11 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
@Prop()
|
@Prop()
|
||||||
userAgent?: string;
|
userAgent?: string;
|
||||||
|
|
||||||
|
@Prop({
|
||||||
|
validate: (v: number) => v > 0 && v <= 180,
|
||||||
|
})
|
||||||
|
timeout?: number;
|
||||||
|
|
||||||
static override from(input: any) {
|
static override from(input: any) {
|
||||||
const instance = super.from(input) as CrawlerOptions;
|
const instance = super.from(input) as CrawlerOptions;
|
||||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
||||||
@ -178,6 +188,11 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
instance.cacheTolerance = cacheTolerance;
|
instance.cacheTolerance = cacheTolerance;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
|
||||||
|
if (!isNaN(timeoutSeconds)) {
|
||||||
|
instance.timeout = timeoutSeconds;
|
||||||
|
}
|
||||||
|
|
||||||
const targetSelector = ctx?.req.get('x-target-selector');
|
const targetSelector = ctx?.req.get('x-target-selector');
|
||||||
instance.targetSelector ??= targetSelector;
|
instance.targetSelector ??= targetSelector;
|
||||||
const waitForSelector = ctx?.req.get('x-wait-for-selector');
|
const waitForSelector = ctx?.req.get('x-wait-for-selector');
|
||||||
|
@ -66,6 +66,7 @@ export interface ScrappingOptions {
|
|||||||
waitForSelector?: string;
|
waitForSelector?: string;
|
||||||
minIntervalMs?: number;
|
minIntervalMs?: number;
|
||||||
overrideUserAgent?: string;
|
overrideUserAgent?: string;
|
||||||
|
timeoutMs?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -449,7 +450,10 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
|
const gotoPromise = page.goto(url, {
|
||||||
|
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
|
||||||
|
timeout: options?.timeoutMs || 30_000
|
||||||
|
})
|
||||||
.catch((err) => {
|
.catch((err) => {
|
||||||
this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
|
||||||
return Promise.reject(new AssertionFailureError({
|
return Promise.reject(new AssertionFailureError({
|
||||||
|
Loading…
x
Reference in New Issue
Block a user