diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 46412e8..907265d 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -776,7 +776,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; getUrlDigest(urlToCrawl: URL) { const normalizedURL = new URL(urlToCrawl); - normalizedURL.hash = ''; + if (!normalizedURL.hash.startsWith('/')) { + normalizedURL.hash = ''; + } const normalizedUrl = normalizedURL.toString().toLowerCase(); const digest = md5Hasher.hash(normalizedUrl.toString()); diff --git a/backend/functions/src/cloud-functions/searcher.ts b/backend/functions/src/cloud-functions/searcher.ts index 6f1daf8..031a4a8 100644 --- a/backend/functions/src/cloud-functions/searcher.ts +++ b/backend/functions/src/cloud-functions/searcher.ts @@ -5,7 +5,7 @@ import { objHashMd5B64Of, } from 'civkit'; import { singleton } from 'tsyringe'; -import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared'; +import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared'; import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; import _ from 'lodash'; import { Request, Response } from 'express'; @@ -83,6 +83,8 @@ export class SearcherHost extends RPCHost { res: Response, }, auth: JinaEmbeddingsAuthDTO, + @Param('count', { default: 5, validate: (v) => v >= 3 && v <= 10 }) + count: number, crawlerOptions: CrawlerOptions, braveSearchExplicitOperators: BraveSearchExplicitOperatorsDto, ) { @@ -157,7 +159,7 @@ export class SearcherHost extends RPCHost { const searchQuery = braveSearchExplicitOperators.addTo(ctx.req.path.slice(1)); const r = await this.cachedWebSearch({ q: searchQuery, - count: 10 + count: Math.floor(count * 2) }, crawlerOptions.noCache); if (!r.web?.results.length) { @@ -226,7 +228,7 @@ export class SearcherHost extends RPCHost { if (_.some(scrapped, (x) => this.pageQualified(x))) { setEarlyReturnTimer(); } - if (!this.searchResultsQualified(scrapped)) { + if (!this.searchResultsQualified(scrapped, count)) { continue; } if (earlyReturnTimer) { @@ -274,7 +276,7 @@ export class SearcherHost extends RPCHost { setEarlyReturnTimer(); } - if (!this.searchResultsQualified(scrapped)) { + if (!this.searchResultsQualified(scrapped, count)) { continue; } @@ -425,8 +427,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`; formattedPage.html; } - searchResultsQualified(results: FormattedPage[]) { - return _.every(results, (x) => this.pageQualified(x)) && results.length >= this.targetResultCount; + searchResultsQualified(results: FormattedPage[], targetResultCount = this.targetResultCount) { + return _.every(results, (x) => this.pageQualified(x)) && results.length >= targetResultCount; } async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) { diff --git a/backend/functions/src/services/brave-search.ts b/backend/functions/src/services/brave-search.ts index c07dde1..2fe7ffc 100644 --- a/backend/functions/src/services/brave-search.ts +++ b/backend/functions/src/services/brave-search.ts @@ -1,4 +1,4 @@ -import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, marshalErrorLike, retry } from 'civkit'; +import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit'; import { singleton } from 'tsyringe'; import { Logger } from '../shared/services/logger'; import { SecretExposer } from '../shared/services/secrets'; @@ -31,7 +31,6 @@ export class BraveSearchService extends AsyncService { this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY); } - @retry(3, Math.ceil(500 + 500 * Math.random())) async webSearch(query: WebSearchQueryParams) { const ip = this.threadLocal.get('ip'); const extraHeaders: WebSearchOptionalHeaderOptions = {}; @@ -65,16 +64,25 @@ export class BraveSearchService extends AsyncService { encoded.q = (Buffer.from(encoded.q).toString('ascii') === encoded.q) ? encoded.q : encodeURIComponent(encoded.q); } - try { - const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record }); + let maxTries = 11; - return r.parsed; - } catch (err: any) { - this.logger.error(`Web search failed: ${err?.message}`, { err: marshalErrorLike(err) }); + while (maxTries--) { + try { + const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record }); - throw new DownstreamServiceFailureError({ message: `Search failed` }); + return r.parsed; + } catch (err: any) { + this.logger.error(`Web search failed: ${err?.message}`, { err: marshalErrorLike(err) }); + if (err?.status === 429) { + await delay(500 + 1000 * Math.random()); + continue; + } + + throw new DownstreamServiceFailureError({ message: `Search failed` }); + } } + throw new DownstreamServiceFailureError({ message: `Search failed` }); } }