mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-16 12:56:00 +08:00
fix
This commit is contained in:
parent
09dbbd3b0f
commit
9bcde30f11
@ -776,7 +776,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
|
|
||||||
getUrlDigest(urlToCrawl: URL) {
|
getUrlDigest(urlToCrawl: URL) {
|
||||||
const normalizedURL = new URL(urlToCrawl);
|
const normalizedURL = new URL(urlToCrawl);
|
||||||
normalizedURL.hash = '';
|
if (!normalizedURL.hash.startsWith('/')) {
|
||||||
|
normalizedURL.hash = '';
|
||||||
|
}
|
||||||
const normalizedUrl = normalizedURL.toString().toLowerCase();
|
const normalizedUrl = normalizedURL.toString().toLowerCase();
|
||||||
const digest = md5Hasher.hash(normalizedUrl.toString());
|
const digest = md5Hasher.hash(normalizedUrl.toString());
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@ import {
|
|||||||
objHashMd5B64Of,
|
objHashMd5B64Of,
|
||||||
} from 'civkit';
|
} from 'civkit';
|
||||||
import { singleton } from 'tsyringe';
|
import { singleton } from 'tsyringe';
|
||||||
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared';
|
||||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
import { Request, Response } from 'express';
|
import { Request, Response } from 'express';
|
||||||
@ -83,6 +83,8 @@ export class SearcherHost extends RPCHost {
|
|||||||
res: Response,
|
res: Response,
|
||||||
},
|
},
|
||||||
auth: JinaEmbeddingsAuthDTO,
|
auth: JinaEmbeddingsAuthDTO,
|
||||||
|
@Param('count', { default: 5, validate: (v) => v >= 3 && v <= 10 })
|
||||||
|
count: number,
|
||||||
crawlerOptions: CrawlerOptions,
|
crawlerOptions: CrawlerOptions,
|
||||||
braveSearchExplicitOperators: BraveSearchExplicitOperatorsDto,
|
braveSearchExplicitOperators: BraveSearchExplicitOperatorsDto,
|
||||||
) {
|
) {
|
||||||
@ -157,7 +159,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
const searchQuery = braveSearchExplicitOperators.addTo(ctx.req.path.slice(1));
|
const searchQuery = braveSearchExplicitOperators.addTo(ctx.req.path.slice(1));
|
||||||
const r = await this.cachedWebSearch({
|
const r = await this.cachedWebSearch({
|
||||||
q: searchQuery,
|
q: searchQuery,
|
||||||
count: 10
|
count: Math.floor(count * 2)
|
||||||
}, crawlerOptions.noCache);
|
}, crawlerOptions.noCache);
|
||||||
|
|
||||||
if (!r.web?.results.length) {
|
if (!r.web?.results.length) {
|
||||||
@ -226,7 +228,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
if (_.some(scrapped, (x) => this.pageQualified(x))) {
|
if (_.some(scrapped, (x) => this.pageQualified(x))) {
|
||||||
setEarlyReturnTimer();
|
setEarlyReturnTimer();
|
||||||
}
|
}
|
||||||
if (!this.searchResultsQualified(scrapped)) {
|
if (!this.searchResultsQualified(scrapped, count)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (earlyReturnTimer) {
|
if (earlyReturnTimer) {
|
||||||
@ -274,7 +276,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
setEarlyReturnTimer();
|
setEarlyReturnTimer();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!this.searchResultsQualified(scrapped)) {
|
if (!this.searchResultsQualified(scrapped, count)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -425,8 +427,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
|||||||
formattedPage.html;
|
formattedPage.html;
|
||||||
}
|
}
|
||||||
|
|
||||||
searchResultsQualified(results: FormattedPage[]) {
|
searchResultsQualified(results: FormattedPage[], targetResultCount = this.targetResultCount) {
|
||||||
return _.every(results, (x) => this.pageQualified(x)) && results.length >= this.targetResultCount;
|
return _.every(results, (x) => this.pageQualified(x)) && results.length >= targetResultCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
|
async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, marshalErrorLike, retry } from 'civkit';
|
import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit';
|
||||||
import { singleton } from 'tsyringe';
|
import { singleton } from 'tsyringe';
|
||||||
import { Logger } from '../shared/services/logger';
|
import { Logger } from '../shared/services/logger';
|
||||||
import { SecretExposer } from '../shared/services/secrets';
|
import { SecretExposer } from '../shared/services/secrets';
|
||||||
@ -31,7 +31,6 @@ export class BraveSearchService extends AsyncService {
|
|||||||
this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);
|
this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);
|
||||||
}
|
}
|
||||||
|
|
||||||
@retry(3, Math.ceil(500 + 500 * Math.random()))
|
|
||||||
async webSearch(query: WebSearchQueryParams) {
|
async webSearch(query: WebSearchQueryParams) {
|
||||||
const ip = this.threadLocal.get('ip');
|
const ip = this.threadLocal.get('ip');
|
||||||
const extraHeaders: WebSearchOptionalHeaderOptions = {};
|
const extraHeaders: WebSearchOptionalHeaderOptions = {};
|
||||||
@ -65,16 +64,25 @@ export class BraveSearchService extends AsyncService {
|
|||||||
encoded.q = (Buffer.from(encoded.q).toString('ascii') === encoded.q) ? encoded.q : encodeURIComponent(encoded.q);
|
encoded.q = (Buffer.from(encoded.q).toString('ascii') === encoded.q) ? encoded.q : encodeURIComponent(encoded.q);
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
let maxTries = 11;
|
||||||
const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
|
|
||||||
|
|
||||||
return r.parsed;
|
while (maxTries--) {
|
||||||
} catch (err: any) {
|
try {
|
||||||
this.logger.error(`Web search failed: ${err?.message}`, { err: marshalErrorLike(err) });
|
const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
|
||||||
|
|
||||||
throw new DownstreamServiceFailureError({ message: `Search failed` });
|
return r.parsed;
|
||||||
|
} catch (err: any) {
|
||||||
|
this.logger.error(`Web search failed: ${err?.message}`, { err: marshalErrorLike(err) });
|
||||||
|
if (err?.status === 429) {
|
||||||
|
await delay(500 + 1000 * Math.random());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new DownstreamServiceFailureError({ message: `Search failed` });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
throw new DownstreamServiceFailureError({ message: `Search failed` });
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user