mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-18 01:05:59 +08:00
fix: scrapMany and searcher
This commit is contained in:
parent
380bbffc0c
commit
29774ac637
@ -752,7 +752,7 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
};
|
||||
|
||||
Promise.all(
|
||||
Promise.allSettled(
|
||||
iterators.map((it, idx) => handler(it, idx))
|
||||
).finally(() => {
|
||||
concluded = true;
|
||||
@ -767,6 +767,7 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
yield results;
|
||||
}
|
||||
yield results;
|
||||
} finally {
|
||||
for (const x of iterators) {
|
||||
x.return();
|
||||
|
@ -154,7 +154,7 @@ export class SearcherHost extends RPCHost {
|
||||
delete crawlOpts.timeoutMs;
|
||||
}
|
||||
|
||||
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic, crawlOpts,
|
||||
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, count + 2), crawlOpts,
|
||||
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
||||
count,
|
||||
);
|
||||
@ -324,7 +324,7 @@ export class SearcherHost extends RPCHost {
|
||||
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
||||
const mapped = scrapped.map((x, i) => {
|
||||
const upstreamSearchResult = searchResults[i];
|
||||
if (!x || (!x.parsed && mode !== 'markdown')) {
|
||||
if (!x) {
|
||||
return {
|
||||
url: upstreamSearchResult.link,
|
||||
title: upstreamSearchResult.title,
|
||||
@ -370,7 +370,6 @@ export class SearcherHost extends RPCHost {
|
||||
}
|
||||
|
||||
const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
|
||||
filtered.toString = searchResults.toString;
|
||||
|
||||
const resultArray = filtered.map((x, i) => {
|
||||
|
||||
@ -378,10 +377,11 @@ export class SearcherHost extends RPCHost {
|
||||
...x,
|
||||
toString(this: any) {
|
||||
if (!this.content && this.description) {
|
||||
if (this.title) {
|
||||
if (this.title || x.textRepresentation) {
|
||||
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
|
||||
return `[${i + 1}] Title: ${this.title}
|
||||
[${i + 1}] URL Source: ${this.url}
|
||||
[${i + 1}] Description: ${this.description}
|
||||
[${i + 1}] Description: ${this.description}${textRep}
|
||||
`;
|
||||
}
|
||||
|
||||
@ -444,6 +444,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
||||
return formattedPage.title &&
|
||||
formattedPage.content ||
|
||||
formattedPage.screenshotUrl ||
|
||||
formattedPage.pageshotUrl ||
|
||||
formattedPage.text ||
|
||||
formattedPage.html;
|
||||
}
|
@ -155,7 +155,7 @@ export class SearcherHost extends RPCHost {
|
||||
delete crawlOpts.timeoutMs;
|
||||
}
|
||||
|
||||
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
||||
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results.slice(0, count + 2), crawlOpts,
|
||||
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
||||
count,
|
||||
);
|
||||
@ -325,7 +325,7 @@ export class SearcherHost extends RPCHost {
|
||||
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
||||
const mapped = scrapped.map((x, i) => {
|
||||
const upstreamSearchResult = searchResults[i];
|
||||
if (!x || (!x.parsed && mode !== 'markdown')) {
|
||||
if (!x) {
|
||||
return {
|
||||
url: upstreamSearchResult.url,
|
||||
title: upstreamSearchResult.title,
|
||||
@ -371,18 +371,17 @@ export class SearcherHost extends RPCHost {
|
||||
}
|
||||
|
||||
const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
|
||||
filtered.toString = searchResults.toString;
|
||||
|
||||
const resultArray = filtered.map((x, i) => {
|
||||
|
||||
return {
|
||||
...x,
|
||||
toString(this: any) {
|
||||
if (!this.content && this.description) {
|
||||
if (this.title) {
|
||||
if (this.title || x.textRepresentation) {
|
||||
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
|
||||
return `[${i + 1}] Title: ${this.title}
|
||||
[${i + 1}] URL Source: ${this.url}
|
||||
[${i + 1}] Description: ${this.description}
|
||||
[${i + 1}] Description: ${this.description}${textRep}
|
||||
`;
|
||||
}
|
||||
|
||||
@ -445,6 +444,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
||||
return formattedPage.title &&
|
||||
formattedPage.content ||
|
||||
formattedPage.screenshotUrl ||
|
||||
formattedPage.pageshotUrl ||
|
||||
formattedPage.text ||
|
||||
formattedPage.html;
|
||||
}
|
||||
|
@ -199,7 +199,7 @@ export class JSDomControl extends AsyncService {
|
||||
}
|
||||
|
||||
@Threaded()
|
||||
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
|
||||
async inferSnapshot(snapshot: PageSnapshot) {
|
||||
const t0 = Date.now();
|
||||
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
|
||||
try {
|
||||
|
@ -101,7 +101,7 @@ export class SnapshotFormatter extends AsyncService {
|
||||
}, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
|
||||
const t0 = Date.now();
|
||||
const f = {
|
||||
...this.getGeneralSnapshotMixins(snapshot),
|
||||
...(await this.getGeneralSnapshotMixins(snapshot)),
|
||||
};
|
||||
let modeOK = false;
|
||||
|
||||
@ -190,6 +190,16 @@ export class SnapshotFormatter extends AsyncService {
|
||||
const dt = Date.now() - t0;
|
||||
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
||||
|
||||
const formatted: FormattedPage = {
|
||||
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
||||
description: (snapshot.description || '').trim(),
|
||||
url: nominalUrl?.toString() || snapshot.href?.trim(),
|
||||
publishedTime: snapshot.parsed?.publishedTime || undefined,
|
||||
[Symbol.dispose]: () => { },
|
||||
};
|
||||
|
||||
Object.assign(f, formatted);
|
||||
|
||||
return f;
|
||||
}
|
||||
|
||||
@ -412,7 +422,7 @@ export class SnapshotFormatter extends AsyncService {
|
||||
.value();
|
||||
}
|
||||
if (this.threadLocal.get('withLinksSummary')) {
|
||||
const links = this.jsdomControl.inferSnapshot(snapshot).links;
|
||||
const links = (await this.jsdomControl.inferSnapshot(snapshot)).links;
|
||||
|
||||
if (this.threadLocal.get('withLinksSummary') === 'all') {
|
||||
formatted.links = links;
|
||||
@ -482,11 +492,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
return f as FormattedPage;
|
||||
}
|
||||
|
||||
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
|
||||
async getGeneralSnapshotMixins(snapshot: PageSnapshot) {
|
||||
let inferred;
|
||||
const mixin: any = {};
|
||||
if (this.threadLocal.get('withImagesSummary')) {
|
||||
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
||||
inferred ??= await this.jsdomControl.inferSnapshot(snapshot);
|
||||
const imageSummary = {} as { [k: string]: string; };
|
||||
const imageIdxTrack = new Map<string, number[]>();
|
||||
|
||||
@ -511,7 +521,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
.value();
|
||||
}
|
||||
if (this.threadLocal.get('withLinksSummary')) {
|
||||
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
||||
inferred ??= await this.jsdomControl.inferSnapshot(snapshot);
|
||||
if (this.threadLocal.get('withLinksSummary') === 'all') {
|
||||
mixin.links = inferred.links;
|
||||
} else {
|
||||
|
@ -15,7 +15,7 @@ import { Logger, CloudFunctionRegistry, AsyncContext } from '../shared';
|
||||
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
|
||||
import { ExpressServer } from 'civkit/civ-rpc/express';
|
||||
import http2 from 'http2';
|
||||
import { SearcherHost } from '../cloud-functions/sercher-serper';
|
||||
import { SearcherHost } from '../cloud-functions/searcher';
|
||||
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
|
||||
import path from 'path';
|
||||
import fs from 'fs';
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit 5e25cdd295bdbc41422055491532ea713c142b45
|
||||
Subproject commit b5e688359eaa87538ef5f43c1323ab92eca8ea33
|
Loading…
x
Reference in New Issue
Block a user