From 29774ac637c59e64e11306209c9841e416127978 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Mon, 24 Feb 2025 23:44:29 +0800 Subject: [PATCH] fix: scrapMany and searcher --- .../functions/src/cloud-functions/crawler.ts | 3 ++- .../{sercher-serper.ts => searcher-serper.ts} | 11 +++++----- .../functions/src/cloud-functions/searcher.ts | 12 +++++------ backend/functions/src/services/jsdom.ts | 2 +- .../src/services/snapshot-formatter.ts | 20 ++++++++++++++----- backend/functions/src/stand-alone/search.ts | 2 +- thinapps-shared | 2 +- 7 files changed, 32 insertions(+), 20 deletions(-) rename backend/functions/src/cloud-functions/{sercher-serper.ts => searcher-serper.ts} (98%) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 8c23c60..86abf9d 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -752,7 +752,7 @@ export class CrawlerHost extends RPCHost { } }; - Promise.all( + Promise.allSettled( iterators.map((it, idx) => handler(it, idx)) ).finally(() => { concluded = true; @@ -767,6 +767,7 @@ export class CrawlerHost extends RPCHost { yield results; } + yield results; } finally { for (const x of iterators) { x.return(); diff --git a/backend/functions/src/cloud-functions/sercher-serper.ts b/backend/functions/src/cloud-functions/searcher-serper.ts similarity index 98% rename from backend/functions/src/cloud-functions/sercher-serper.ts rename to backend/functions/src/cloud-functions/searcher-serper.ts index 2cc71b1..f82e36b 100644 --- a/backend/functions/src/cloud-functions/sercher-serper.ts +++ b/backend/functions/src/cloud-functions/searcher-serper.ts @@ -154,7 +154,7 @@ export class SearcherHost extends RPCHost { delete crawlOpts.timeoutMs; } - const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic, crawlOpts, + const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, count + 2), crawlOpts, CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }), count, ); @@ -324,7 +324,7 @@ export class SearcherHost extends RPCHost { for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) { const mapped = scrapped.map((x, i) => { const upstreamSearchResult = searchResults[i]; - if (!x || (!x.parsed && mode !== 'markdown')) { + if (!x) { return { url: upstreamSearchResult.link, title: upstreamSearchResult.title, @@ -370,7 +370,6 @@ export class SearcherHost extends RPCHost { } const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount); - filtered.toString = searchResults.toString; const resultArray = filtered.map((x, i) => { @@ -378,10 +377,11 @@ export class SearcherHost extends RPCHost { ...x, toString(this: any) { if (!this.content && this.description) { - if (this.title) { + if (this.title || x.textRepresentation) { + const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : ''; return `[${i + 1}] Title: ${this.title} [${i + 1}] URL Source: ${this.url} -[${i + 1}] Description: ${this.description} +[${i + 1}] Description: ${this.description}${textRep} `; } @@ -444,6 +444,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`; return formattedPage.title && formattedPage.content || formattedPage.screenshotUrl || + formattedPage.pageshotUrl || formattedPage.text || formattedPage.html; } diff --git a/backend/functions/src/cloud-functions/searcher.ts b/backend/functions/src/cloud-functions/searcher.ts index 95d34f4..06b7710 100644 --- a/backend/functions/src/cloud-functions/searcher.ts +++ b/backend/functions/src/cloud-functions/searcher.ts @@ -155,7 +155,7 @@ export class SearcherHost extends RPCHost { delete crawlOpts.timeoutMs; } - const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts, + const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results.slice(0, count + 2), crawlOpts, CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }), count, ); @@ -325,7 +325,7 @@ export class SearcherHost extends RPCHost { for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) { const mapped = scrapped.map((x, i) => { const upstreamSearchResult = searchResults[i]; - if (!x || (!x.parsed && mode !== 'markdown')) { + if (!x) { return { url: upstreamSearchResult.url, title: upstreamSearchResult.title, @@ -371,18 +371,17 @@ export class SearcherHost extends RPCHost { } const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount); - filtered.toString = searchResults.toString; const resultArray = filtered.map((x, i) => { - return { ...x, toString(this: any) { if (!this.content && this.description) { - if (this.title) { + if (this.title || x.textRepresentation) { + const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : ''; return `[${i + 1}] Title: ${this.title} [${i + 1}] URL Source: ${this.url} -[${i + 1}] Description: ${this.description} +[${i + 1}] Description: ${this.description}${textRep} `; } @@ -445,6 +444,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`; return formattedPage.title && formattedPage.content || formattedPage.screenshotUrl || + formattedPage.pageshotUrl || formattedPage.text || formattedPage.html; } diff --git a/backend/functions/src/services/jsdom.ts b/backend/functions/src/services/jsdom.ts index 594c41d..ffcaaa8 100644 --- a/backend/functions/src/services/jsdom.ts +++ b/backend/functions/src/services/jsdom.ts @@ -199,7 +199,7 @@ export class JSDomControl extends AsyncService { } @Threaded() - inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot { + async inferSnapshot(snapshot: PageSnapshot) { const t0 = Date.now(); const extendedSnapshot = { ...snapshot } as ExtendedSnapshot; try { diff --git a/backend/functions/src/services/snapshot-formatter.ts b/backend/functions/src/services/snapshot-formatter.ts index 74c8e64..f52af58 100644 --- a/backend/functions/src/services/snapshot-formatter.ts +++ b/backend/functions/src/services/snapshot-formatter.ts @@ -101,7 +101,7 @@ export class SnapshotFormatter extends AsyncService { }, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) { const t0 = Date.now(); const f = { - ...this.getGeneralSnapshotMixins(snapshot), + ...(await this.getGeneralSnapshotMixins(snapshot)), }; let modeOK = false; @@ -190,6 +190,16 @@ export class SnapshotFormatter extends AsyncService { const dt = Date.now() - t0; this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt }); + const formatted: FormattedPage = { + title: (snapshot.parsed?.title || snapshot.title || '').trim(), + description: (snapshot.description || '').trim(), + url: nominalUrl?.toString() || snapshot.href?.trim(), + publishedTime: snapshot.parsed?.publishedTime || undefined, + [Symbol.dispose]: () => { }, + }; + + Object.assign(f, formatted); + return f; } @@ -412,7 +422,7 @@ export class SnapshotFormatter extends AsyncService { .value(); } if (this.threadLocal.get('withLinksSummary')) { - const links = this.jsdomControl.inferSnapshot(snapshot).links; + const links = (await this.jsdomControl.inferSnapshot(snapshot)).links; if (this.threadLocal.get('withLinksSummary') === 'all') { formatted.links = links; @@ -482,11 +492,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; return f as FormattedPage; } - getGeneralSnapshotMixins(snapshot: PageSnapshot) { + async getGeneralSnapshotMixins(snapshot: PageSnapshot) { let inferred; const mixin: any = {}; if (this.threadLocal.get('withImagesSummary')) { - inferred ??= this.jsdomControl.inferSnapshot(snapshot); + inferred ??= await this.jsdomControl.inferSnapshot(snapshot); const imageSummary = {} as { [k: string]: string; }; const imageIdxTrack = new Map(); @@ -511,7 +521,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; .value(); } if (this.threadLocal.get('withLinksSummary')) { - inferred ??= this.jsdomControl.inferSnapshot(snapshot); + inferred ??= await this.jsdomControl.inferSnapshot(snapshot); if (this.threadLocal.get('withLinksSummary') === 'all') { mixin.links = inferred.links; } else { diff --git a/backend/functions/src/stand-alone/search.ts b/backend/functions/src/stand-alone/search.ts index 180cc9b..6d7e7d0 100644 --- a/backend/functions/src/stand-alone/search.ts +++ b/backend/functions/src/stand-alone/search.ts @@ -15,7 +15,7 @@ import { Logger, CloudFunctionRegistry, AsyncContext } from '../shared'; import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc'; import { ExpressServer } from 'civkit/civ-rpc/express'; import http2 from 'http2'; -import { SearcherHost } from '../cloud-functions/sercher-serper'; +import { SearcherHost } from '../cloud-functions/searcher'; import { FsWalk, WalkOutEntity } from 'civkit/fswalk'; import path from 'path'; import fs from 'fs'; diff --git a/thinapps-shared b/thinapps-shared index 5e25cdd..b5e6883 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 5e25cdd295bdbc41422055491532ea713c142b45 +Subproject commit b5e688359eaa87538ef5f43c1323ab92eca8ea33