Mirror of https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git (synced 2025-08-18 08:45:58 +08:00)
chore: improve type definitions and type safety (#1153)
* chore: improve type definitions and type safety
* chore: format search result
* chore: use 'x-details'
* chore: remove 'domain' from search result
* chore: use 'x-respond-with'
* chore: optimize logic to return content
* chore: cleanup
* chore: optimize logic
parent 4873578c62
commit 51109e01d2
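From a client's perspective, the 'x-respond-with' change boils down to one request header. A hypothetical sketch (the search endpoint URL and header value are assumptions for illustration, not part of this commit):

    // Any x-respond-with value that does NOT contain the substring 'content'
    // (for example 'favicon') makes the searcher skip page crawling and
    // return Serper metadata only; see crawlWithoutContent in the diff below.
    const res = await fetch('https://s.jina.ai/jina%20reader', {
        headers: {
            'accept': 'application/json',
            'x-respond-with': 'favicon', // assumed value for a metadata-only response
        },
    });
    const results = await res.json();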
@@ -3,6 +3,7 @@ import {
     RPCHost, RPCReflection,
     AssertionFailureError,
     objHashMd5B64Of,
+    assignMeta,
 } from 'civkit';
 import { singleton } from 'tsyringe';
 import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared';
@@ -88,8 +89,9 @@ export class SearcherHost extends RPCHost {
         @Param('q') q?: string,
     ) {
         const uid = await auth.solveUID();
-        const version = ctx?.req.get('x-version');
-        const isVersion2 = version?.replace('v', '') === '2';
+        // Return content by default
+        const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
+        const crawlWithoutContent = !respondWith.includes('content');

         let chargeAmount = 0;
         const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
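Worth noting: the new gate is a plain substring check, which makes some header values counterintuitive. A standalone sketch (the example values are illustrative, not from this commit):

    const gate = (v?: string) => !(v ?? 'content').includes('content');
    gate(undefined);     // false — default 'content': pages get crawled
    gate('content');     // false — full content requested
    gate('no-content');  // false — still contains 'content'!
    gate('favicon');     // true  — metadata only, crawling skipped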
@@ -157,31 +159,26 @@ export class SearcherHost extends RPCHost {
             delete crawlOpts.timeoutMs;
         }

-        if (isVersion2) {
-            chargeAmount = 10000;
-            const result = [];
-            for (const x of r.organic.slice(0, count)) {
-                const url = new URL(x.link);
-                const favicon = await this.getFavicon(url.origin);
-
-                result.push({
-                    url: x.link,
-                    title: x.title,
-                    snippet: x.snippet,
-                    domain: url.origin,
-                    favicon: favicon,
-                });
+        let lastScrapped: any[] | undefined;
+        const targetResultCount = crawlWithoutContent ? count : count + 2;
+        const organicSearchResults = r.organic.slice(0, targetResultCount);
+        if (crawlWithoutContent || count === 0) {
+            const fakeResults = await this.fakeResult(crawlerOptions.respondWith, organicSearchResults, !crawlWithoutContent);
+            lastScrapped = fakeResults;
+            if (!crawlWithoutContent) {
+                chargeAmount = this.assignChargeAmount(lastScrapped);
+            } else {
+                chargeAmount = 10000;
             }

-            return {
-                result,
-                usage: {
-                    tokens: chargeAmount,
-                }
-            };
+            this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent);
+            if ((!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) || count === 0) {
+                return lastScrapped;
+            }
+            return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
         }

-        const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, count + 2), crawlOpts,
+        const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
             CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
             count,
         );
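The text/plain branch relies on the scrapped array carrying a custom toString, so `${lastScrapped}` stringifies into the numbered listing. A minimal standalone demo of that contract (illustrative data, not from this commit):

    // Template interpolation triggers the array's custom toString().
    const pages = [
        { toString: () => '[1] Title: A\n[1] URL Source: https://a.test\n' },
        { toString: () => '[2] Title: B\n[2] URL Source: https://b.test\n' },
    ];
    pages.toString = () => pages.map((x) => `${x}`).join('\n\n').trimEnd() + '\n';
    console.log(`${pages}`); // "[1] Title: A" block, blank line, "[2] Title: B" block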
@@ -217,7 +214,6 @@ export class SearcherHost extends RPCHost {
             return sseStream;
         }

-        let lastScrapped: any[] | undefined;
         let earlyReturn = false;
         if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
             let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
@@ -248,6 +244,7 @@ export class SearcherHost extends RPCHost {
             }
             chargeAmount = this.assignChargeAmount(scrapped);

+            this.assignTokenUsage(scrapped, chargeAmount, crawlWithoutContent);
             return scrapped;
         }

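What the new assignTokenUsage call does to a result set when content crawling was skipped, as a standalone sketch (types simplified; the actual method body appears later in this diff):

    type Page = { title?: string; usage?: { tokens?: number } };
    const pages: Page[] = [{ title: 'A', usage: { tokens: 1234 } }];
    const crawlWithoutContent = true;
    if (crawlWithoutContent) {
        pages.forEach((x) => { delete x.usage; }); // per-page usage is dropped
    }
    // ...and the response envelope then carries a flat { usage: { tokens: 10000 } }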
@@ -263,6 +260,7 @@ export class SearcherHost extends RPCHost {
                 chargeAmount = this.assignChargeAmount(lastScrapped);
             }

+            this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent);
             return lastScrapped;
         }

@@ -316,9 +314,70 @@ export class SearcherHost extends RPCHost {
         return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
     }

-    async *fetchSearchResults(
+    assignTokenUsage(result: FormattedPage[], chargeAmount: number, crawlWithoutContent: boolean) {
+        if (crawlWithoutContent) {
+            chargeAmount = 10000;
+            if (result) {
+                result.forEach((x) => {
+                    delete x.usage;
+                });
+            }
+        }
+
+        assignMeta(result, { usage: { tokens: chargeAmount } });
+    }
+
+    async fakeResult(
         mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
         searchResults?: SerperSearchResponse['organic'],
+        withContent: boolean = false
+    ) {
+        if (!searchResults) {
+            return [];
+        }
+
+        const resultArray = await Promise.all(searchResults.map(async (upstreamSearchResult, index) => {
+            const result = {
+                url: upstreamSearchResult.link,
+                title: upstreamSearchResult.title,
+                description: upstreamSearchResult.snippet,
+            } as FormattedPage;
+
+            const dataItems = [
+                { key: 'title', label: 'Title' },
+                { key: 'url', label: 'URL Source' },
+                { key: 'description', label: 'Description'},
+            ]
+
+            if (withContent) {
+                result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
+            }
+            if (mode.includes('favicon')) {
+                const url = new URL(upstreamSearchResult.link);
+                result.favicon = await this.getFavicon(url.origin);
+                dataItems.push({
+                    key: 'favicon',
+                    label: 'Favicon',
+                });
+            }
+
+            result.toString = function () {
+                const self = this as any;
+                return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n';
+            }
+            return result;
+        }));
+
+        resultArray.toString = function () {
+            return this.map((x, i) => x ? x.toString() : '').join('\n\n').trimEnd() + '\n';
+        };
+
+        return resultArray;
+    }
+
+    async *fetchSearchResults(
+        mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'favicon' | 'content',
+        searchResults?: SerperSearchResponse['organic'],
         options?: ExtraScrappingOptions,
         crawlerOptions?: CrawlerOptions,
         count?: number,
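For orientation, the per-item toString above renders each search hit as a numbered block; with a mode that includes 'favicon', the plain-text output would look roughly like this (illustrative values):

    [1] Title: Example Domain
    [1] URL Source: https://example.com/
    [1] Description: An illustrative snippet
    [1] Favicon: https://example.com/favicon.ico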
@@ -326,26 +385,7 @@ export class SearcherHost extends RPCHost {
         if (!searchResults) {
             return;
         }
-        if (count === 0) {
-            const resultArray = searchResults.map((upstreamSearchResult, i) => ({
-                url: upstreamSearchResult.link,
-                title: upstreamSearchResult.title,
-                description: upstreamSearchResult.snippet,
-                content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : '',
-                toString() {
-                    return `[${i + 1}] Title: ${this.title}
-[${i + 1}] URL Source: ${this.url}
-[${i + 1}] Description: ${this.description}
-`;
-                }

-            })) as FormattedPage[];
-            resultArray.toString = function () {
-                return this.map((x, i) => x ? x.toString() : '').join('\n\n').trimEnd() + '\n';
-            };
-            yield resultArray;
-            return;
-        }
         const urls = searchResults.map((x) => new URL(x.link));
         const snapshotMap = new WeakMap();
         for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
@@ -31,6 +31,7 @@ export interface FormattedPage {
     links?: { [k: string]: string; } | [string, string][];
     images?: { [k: string]: string; } | [string, string][];
     warning?: string;
+    favicon?: string;
     usage?: {
         total_tokens?: number;
         totalTokens?: number;
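A FormattedPage carrying the new field might look like this (hypothetical values; only favicon is new in this commit):

    const page: FormattedPage = {
        url: 'https://example.com/',
        title: 'Example Domain',
        description: 'An illustrative snippet',
        favicon: 'https://example.com/favicon.ico', // new in this commit
        usage: { total_tokens: 10000 },
    };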