fix: scrapMany and searcher

Yanlong Wang 2025-02-24 23:44:29 +08:00
parent 380bbffc0c
commit 29774ac637
7 changed files with 32 additions and 20 deletions

View File

@@ -752,7 +752,7 @@ export class CrawlerHost extends RPCHost {
             }
         };
-        Promise.all(
+        Promise.allSettled(
             iterators.map((it, idx) => handler(it, idx))
         ).finally(() => {
             concluded = true;
@@ -767,6 +767,7 @@ export class CrawlerHost extends RPCHost {
                 yield results;
             }
+            yield results;
         } finally {
             for (const x of iterators) {
                 x.return();
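Note on the Promise.all → Promise.allSettled swap above: scrapMany drives several scrape iterators concurrently, and with Promise.all a single rejected iterator rejects the combined promise and runs the finally() cleanup (setting `concluded`) while other scrapes are still producing results; allSettled waits for every iterator to finish either way. The trailing `yield results` added in the second hunk appears to flush the last batch collected after the loop exits. A minimal standalone sketch of the allSettled behavior (hypothetical tasks, not the actual scrapMany code):

async function demo() {
    const tasks = [
        Promise.resolve('page A'),
        Promise.reject(new Error('page B timed out')),
        Promise.resolve('page C'),
    ];

    // Promise.all rejects on the first failure, so a finally() chained on it
    // would fire before the surviving scrapes are consumed.
    await Promise.all(tasks).catch((err) => console.log('all rejected:', err.message));

    // Promise.allSettled always resolves, so cleanup chained on it runs only
    // after every task has either fulfilled or failed.
    const settled = await Promise.allSettled(tasks);
    console.log(settled.map((s) => s.status)); // [ 'fulfilled', 'rejected', 'fulfilled' ]
}

demo();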

View File

@@ -154,7 +154,7 @@ export class SearcherHost extends RPCHost {
             delete crawlOpts.timeoutMs;
         }
-        const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic, crawlOpts,
+        const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, count + 2), crawlOpts,
             CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
             count,
         );
@@ -324,7 +324,7 @@ export class SearcherHost extends RPCHost {
         for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
             const mapped = scrapped.map((x, i) => {
                 const upstreamSearchResult = searchResults[i];
-                if (!x || (!x.parsed && mode !== 'markdown')) {
+                if (!x) {
                     return {
                         url: upstreamSearchResult.link,
                         title: upstreamSearchResult.title,
@@ -370,7 +370,6 @@ export class SearcherHost extends RPCHost {
         }
         const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
-        filtered.toString = searchResults.toString;
         const resultArray = filtered.map((x, i) => {
@@ -378,10 +377,11 @@ export class SearcherHost extends RPCHost {
                 ...x,
                 toString(this: any) {
                     if (!this.content && this.description) {
-                        if (this.title) {
+                        if (this.title || x.textRepresentation) {
+                            const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
                             return `[${i + 1}] Title: ${this.title}
 [${i + 1}] URL Source: ${this.url}
-[${i + 1}] Description: ${this.description}
+[${i + 1}] Description: ${this.description}${textRep}
 `;
                         }
@@ -444,6 +444,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
         return formattedPage.title &&
             formattedPage.content ||
             formattedPage.screenshotUrl ||
+            formattedPage.pageshotUrl ||
             formattedPage.text ||
             formattedPage.html;
     }
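The toString() change above lets a result that produced no full content still surface any scraped text: when x.textRepresentation is present it is appended as a numbered Content block, and the branch now fires even when the upstream result has no title. A self-contained sketch of the new formatting path (simplified fields, hypothetical values):

const i = 0;
const x = { textRepresentation: 'Plain-text body recovered from the page.' };

const entry = {
    title: 'Example Domain',
    url: 'https://example.com/',
    description: 'An illustrative page.',
    content: '',
    toString(this: any) {
        if (!this.content && this.description) {
            if (this.title || x.textRepresentation) {
                // Same shape as the added line: an optional "[n] Content:" block.
                const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
                return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url}
[${i + 1}] Description: ${this.description}${textRep}
`;
            }
        }
        return this.content;
    },
};

console.log(String(entry));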

View File

@@ -155,7 +155,7 @@ export class SearcherHost extends RPCHost {
             delete crawlOpts.timeoutMs;
         }
-        const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
+        const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results.slice(0, count + 2), crawlOpts,
             CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
             count,
         );
@@ -325,7 +325,7 @@ export class SearcherHost extends RPCHost {
         for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
             const mapped = scrapped.map((x, i) => {
                 const upstreamSearchResult = searchResults[i];
-                if (!x || (!x.parsed && mode !== 'markdown')) {
+                if (!x) {
                     return {
                         url: upstreamSearchResult.url,
                         title: upstreamSearchResult.title,
@@ -371,18 +371,17 @@ export class SearcherHost extends RPCHost {
         }
         const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
-        filtered.toString = searchResults.toString;
         const resultArray = filtered.map((x, i) => {
             return {
                 ...x,
                 toString(this: any) {
                     if (!this.content && this.description) {
-                        if (this.title) {
+                        if (this.title || x.textRepresentation) {
+                            const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
                             return `[${i + 1}] Title: ${this.title}
 [${i + 1}] URL Source: ${this.url}
-[${i + 1}] Description: ${this.description}
+[${i + 1}] Description: ${this.description}${textRep}
 `;
                         }
@@ -445,6 +444,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
         return formattedPage.title &&
             formattedPage.content ||
             formattedPage.screenshotUrl ||
+            formattedPage.pageshotUrl ||
             formattedPage.text ||
             formattedPage.html;
     }
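Both searcher variants now cap the upstream hit list at count + 2 before handing it to fetchSearchResults, presumably so the crawler scrapes only a couple of spare candidates beyond the requested count rather than every hit the search API returned; the later filter(...).slice(0, targetResultCount) still trims the final answer. A rough sketch of the headroom arithmetic (hypothetical data, not the actual searcher code):

const count = 5;
const upstreamHits = Array.from({ length: 20 }, (_, i) => ({ link: `https://site-${i}.example/` }));

// Scrape at most count + 2 candidates instead of all 20 hits.
const candidates = upstreamHits.slice(0, count + 2);

// Even if a scrape or two fails or gets filtered out, there is usually
// still enough headroom left to fill the requested `count` results.
const survived = candidates.filter((_, i) => i !== 3); // pretend one scrape failed
const finalResults = survived.slice(0, count);

console.log(candidates.length, survived.length, finalResults.length); // 7 6 5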

View File

@@ -199,7 +199,7 @@ export class JSDomControl extends AsyncService {
     }
     @Threaded()
-    inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
+    async inferSnapshot(snapshot: PageSnapshot) {
         const t0 = Date.now();
         const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
         try {
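inferSnapshot is decorated with civkit's @Threaded(), which presumably moves the call off the main thread, so its observable return type is a promise; making the signature async and dropping the synchronous ExtendedSnapshot annotation matches that, and every call site in the formatter below gains an await. A sketch of the calling convention with a stand-in decorator (the real @Threaded comes from civkit and is not reproduced here):

// Stand-in decorator: merely guarantees the method returns a Promise.
// A real threaded dispatch would additionally run the body on a worker.
function Threaded() {
    return (_target: any, _key: string, descriptor: PropertyDescriptor) => {
        const original = descriptor.value;
        descriptor.value = async function (...args: any[]) {
            return original.apply(this, args);
        };
    };
}

class JSDomControlSketch {
    @Threaded()
    async inferSnapshot(snapshot: { html: string }) {
        return { ...snapshot, links: [] as string[] };
    }
}

(async () => {
    const ctl = new JSDomControlSketch();
    // Callers must await now — exactly what the SnapshotFormatter hunks add.
    const { links } = await ctl.inferSnapshot({ html: '<p>hi</p>' });
    console.log(links);
})();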

View File

@@ -101,7 +101,7 @@ export class SnapshotFormatter extends AsyncService {
     }, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
         const t0 = Date.now();
         const f = {
-            ...this.getGeneralSnapshotMixins(snapshot),
+            ...(await this.getGeneralSnapshotMixins(snapshot)),
         };
         let modeOK = false;
@@ -190,6 +190,16 @@ export class SnapshotFormatter extends AsyncService {
         const dt = Date.now() - t0;
         this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
+        const formatted: FormattedPage = {
+            title: (snapshot.parsed?.title || snapshot.title || '').trim(),
+            description: (snapshot.description || '').trim(),
+            url: nominalUrl?.toString() || snapshot.href?.trim(),
+            publishedTime: snapshot.parsed?.publishedTime || undefined,
+            [Symbol.dispose]: () => { },
+        };
+        Object.assign(f, formatted);
         return f;
     }
@@ -412,7 +422,7 @@ export class SnapshotFormatter extends AsyncService {
                 .value();
         }
         if (this.threadLocal.get('withLinksSummary')) {
-            const links = this.jsdomControl.inferSnapshot(snapshot).links;
+            const links = (await this.jsdomControl.inferSnapshot(snapshot)).links;
             if (this.threadLocal.get('withLinksSummary') === 'all') {
                 formatted.links = links;
@@ -482,11 +492,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         return f as FormattedPage;
     }
-    getGeneralSnapshotMixins(snapshot: PageSnapshot) {
+    async getGeneralSnapshotMixins(snapshot: PageSnapshot) {
         let inferred;
         const mixin: any = {};
         if (this.threadLocal.get('withImagesSummary')) {
-            inferred ??= this.jsdomControl.inferSnapshot(snapshot);
+            inferred ??= await this.jsdomControl.inferSnapshot(snapshot);
             const imageSummary = {} as { [k: string]: string; };
             const imageIdxTrack = new Map<string, number[]>();
@@ -511,7 +521,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
                 .value();
         }
         if (this.threadLocal.get('withLinksSummary')) {
-            inferred ??= this.jsdomControl.inferSnapshot(snapshot);
+            inferred ??= await this.jsdomControl.inferSnapshot(snapshot);
             if (this.threadLocal.get('withLinksSummary') === 'all') {
                 mixin.links = inferred.links;
             } else {
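The parenthesized await in `...(await this.getGeneralSnapshotMixins(snapshot))` is the load-bearing part of this refactor: once the method is async it returns a Promise, and spreading a Promise object copies no enumerable properties, so the mixins would silently disappear from `f`. A tiny demonstration (illustrative mixin shape):

async function getMixins() {
    return { imageSummary: { 'https://a.example/x.png': 'Image 1: diagram' } };
}

(async () => {
    const wrong = { ...getMixins() };         // spreads the Promise object itself
    const right = { ...(await getMixins()) }; // spreads the resolved mixin object

    console.log(Object.keys(wrong)); // [] — a Promise has no enumerable own props
    console.log(Object.keys(right)); // [ 'imageSummary' ]
})();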

View File

@@ -15,7 +15,7 @@ import { Logger, CloudFunctionRegistry, AsyncContext } from '../shared';
 import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
 import { ExpressServer } from 'civkit/civ-rpc/express';
 import http2 from 'http2';
-import { SearcherHost } from '../cloud-functions/sercher-serper';
+import { SearcherHost } from '../cloud-functions/searcher';
 import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
 import path from 'path';
 import fs from 'fs';

@@ -1 +1 @@
-Subproject commit 5e25cdd295bdbc41422055491532ea713c142b45
+Subproject commit b5e688359eaa87538ef5f43c1323ab92eca8ea33