mirror of https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-18 15:35:55 +08:00

fix: scrapMany and searcher

parent 380bbffc0c
commit 29774ac637
@@ -752,7 +752,7 @@ export class CrawlerHost extends RPCHost {
             }
         };

-        Promise.all(
+        Promise.allSettled(
             iterators.map((it, idx) => handler(it, idx))
         ).finally(() => {
             concluded = true;
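
Note on the hunk above: Promise.all rejects as soon as any handler rejects, which would fire the .finally() (and flip `concluded`) while other iterators are still running; Promise.allSettled resolves only once every handler has settled. A minimal sketch of the timing difference, with hypothetical tasks standing in for `handler(it, idx)`:

    // Hypothetical tasks; only the settlement timing matters here.
    const tasks: Promise<string>[] = [
        new Promise((resolve) => setTimeout(() => resolve('slow crawl'), 100)),
        Promise.reject(new Error('one crawl failed')),
    ];

    // Settles (rejects) on the first failure, while work is still in flight:
    Promise.all(tasks).catch(() => console.log('all: concluded early'));

    // Resolves only after *every* task settles, so .finally() marks the true end:
    Promise.allSettled(tasks).finally(() => console.log('allSettled: concluded at the end'));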
@@ -767,6 +767,7 @@ export class CrawlerHost extends RPCHost {

                 yield results;
             }
+            yield results;
         } finally {
             for (const x of iterators) {
                 x.return();
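
The added trailing `yield results;` emits the accumulated results once more after the loop exits, so a consumer that keeps only the last value it receives still sees the final state. A hypothetical reduction of the pattern (the loop condition and delay here are illustrative, not the actual method body):

    // Hypothetical reduction: yield progress while the producers run,
    // then yield the final accumulated state once more after they conclude.
    async function* withFinalSnapshot<T>(results: T[], done: Promise<void>) {
        let concluded = false;
        done.finally(() => { concluded = true; });
        while (!concluded) {
            yield results;                               // intermediate snapshot
            await new Promise((r) => setTimeout(r, 10)); // let producers make progress
        }
        yield results;                                   // the added final yield
    }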
@@ -154,7 +154,7 @@ export class SearcherHost extends RPCHost {
             delete crawlOpts.timeoutMs;
         }

-        const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic, crawlOpts,
+        const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, count + 2), crawlOpts,
             CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
             count,
         );
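
Slicing the organic hits to `count + 2` caps the crawler fan-out at the requested count plus two spares, so entries that later fail to scrape (and get filtered out) can still be backfilled without crawling the entire result page. With hypothetical numbers:

    // With count = 5 and 10 organic hits, only 7 URLs reach the crawler;
    // the two spares cover results that fail and are filtered out later.
    const count = 5;
    const organic = ['u1', 'u2', 'u3', 'u4', 'u5', 'u6', 'u7', 'u8', 'u9', 'u10'];
    const toScrape = organic.slice(0, count + 2);
    console.log(toScrape.length); // 7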
@@ -324,7 +324,7 @@ export class SearcherHost extends RPCHost {
         for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
             const mapped = scrapped.map((x, i) => {
                 const upstreamSearchResult = searchResults[i];
-                if (!x || (!x.parsed && mode !== 'markdown')) {
+                if (!x) {
                     return {
                         url: upstreamSearchResult.link,
                         title: upstreamSearchResult.title,
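
The relaxed guard keeps any snapshot the crawler returned, not just fully parsed ones: previously a page that yielded text but no `parsed` article was replaced by the bare search-result stub in every mode except markdown. A sketch with a hypothetical snapshot shape:

    // Hypothetical snapshot: text was extracted, but nothing parsed.
    const x: { parsed?: object; textRepresentation?: string; } | undefined =
        { textRepresentation: 'page text' };
    const mode: string = 'html';

    const oldFallback = !x || (!x.parsed && mode !== 'markdown'); // true: stubbed out
    const newFallback = !x;                                       // false: snapshot kept
    console.log(oldFallback, newFallback);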
@@ -370,7 +370,6 @@ export class SearcherHost extends RPCHost {
         }

         const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
-        filtered.toString = searchResults.toString;

         const resultArray = filtered.map((x, i) => {

@@ -378,10 +377,11 @@ export class SearcherHost extends RPCHost {
                 ...x,
                 toString(this: any) {
                     if (!this.content && this.description) {
-                        if (this.title) {
+                        if (this.title || x.textRepresentation) {
+                            const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
                             return `[${i + 1}] Title: ${this.title}
 [${i + 1}] URL Source: ${this.url}
-[${i + 1}] Description: ${this.description}
+[${i + 1}] Description: ${this.description}${textRep}
 `;
                         }

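
With `x.textRepresentation` in the condition, a hit can render even without a title, and its text body is appended as a numbered `Content:` section. A sketch of the string the amended toString produces, with hypothetical field values:

    // Hypothetical field values for result i = 0.
    const i = 0;
    const x = { textRepresentation: 'Full text of the page.' };
    const title = 'Example Domain';
    const url = 'https://example.com/';
    const description = 'An illustrative page.';

    const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
    const rendered = `[${i + 1}] Title: ${title}\n[${i + 1}] URL Source: ${url}\n[${i + 1}] Description: ${description}${textRep}\n`;
    console.log(rendered);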
@@ -444,6 +444,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
         return formattedPage.title &&
             formattedPage.content ||
             formattedPage.screenshotUrl ||
+            formattedPage.pageshotUrl ||
             formattedPage.text ||
             formattedPage.html;
     }
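
`formattedPage.pageshotUrl` joins the truthiness chain, so a response whose only payload is a full-page shot now counts as a usable page. Because `&&` binds tighter than `||`, the predicate reads `(title && content) || screenshotUrl || pageshotUrl || text || html`; a title alone is not enough. A quick check with hypothetical values:

    // `&&` binds tighter than `||`, so this reads
    // (title && content) || screenshotUrl || pageshotUrl || text || html.
    const formattedPage = {
        title: 'T', content: '', screenshotUrl: '',
        pageshotUrl: 'https://example.com/pageshot.png', text: '', html: '',
    };
    const ok = formattedPage.title && formattedPage.content ||
        formattedPage.screenshotUrl || formattedPage.pageshotUrl ||
        formattedPage.text || formattedPage.html;
    console.log(Boolean(ok)); // true: the pageshot alone satisfies the predicate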
@@ -155,7 +155,7 @@ export class SearcherHost extends RPCHost {
             delete crawlOpts.timeoutMs;
         }

-        const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
+        const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results.slice(0, count + 2), crawlOpts,
             CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
             count,
         );
@@ -325,7 +325,7 @@ export class SearcherHost extends RPCHost {
         for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
             const mapped = scrapped.map((x, i) => {
                 const upstreamSearchResult = searchResults[i];
-                if (!x || (!x.parsed && mode !== 'markdown')) {
+                if (!x) {
                     return {
                         url: upstreamSearchResult.url,
                         title: upstreamSearchResult.title,
@@ -371,18 +371,17 @@ export class SearcherHost extends RPCHost {
         }

         const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
-        filtered.toString = searchResults.toString;

         const resultArray = filtered.map((x, i) => {

             return {
                 ...x,
                 toString(this: any) {
                     if (!this.content && this.description) {
-                        if (this.title) {
+                        if (this.title || x.textRepresentation) {
+                            const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
                             return `[${i + 1}] Title: ${this.title}
 [${i + 1}] URL Source: ${this.url}
-[${i + 1}] Description: ${this.description}
+[${i + 1}] Description: ${this.description}${textRep}
 `;
                         }

@@ -445,6 +444,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
         return formattedPage.title &&
             formattedPage.content ||
             formattedPage.screenshotUrl ||
+            formattedPage.pageshotUrl ||
             formattedPage.text ||
             formattedPage.html;
     }
@@ -199,7 +199,7 @@ export class JSDomControl extends AsyncService {
     }

     @Threaded()
-    inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
+    async inferSnapshot(snapshot: PageSnapshot) {
         const t0 = Date.now();
         const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
         try {
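
`inferSnapshot` now returns a promise (under `@Threaded()` the work is dispatched off the main thread and resolves asynchronously), so every caller must await it; the SnapshotFormatter hunks below are exactly that call-site propagation. A self-contained sketch of the shape change, with a hypothetical standalone signature:

    // Hypothetical standalone signature mirroring the method's new shape.
    interface ExtendedSnapshot { links: { [href: string]: string; }; }
    declare function inferSnapshot(snapshot: object): Promise<ExtendedSnapshot>;

    async function demo(snapshot: object) {
        // const links = inferSnapshot(snapshot).links; // no longer valid: it's a Promise
        const links = (await inferSnapshot(snapshot)).links; // await, then dereference
        return links;
    }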
@@ -101,7 +101,7 @@ export class SnapshotFormatter extends AsyncService {
     }, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
         const t0 = Date.now();
         const f = {
-            ...this.getGeneralSnapshotMixins(snapshot),
+            ...(await this.getGeneralSnapshotMixins(snapshot)),
         };
         let modeOK = false;

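
The parenthesized `await` matters inside a spread: spreading the promise itself copies nothing, since a pending Promise has no own enumerable properties. A minimal demonstration with a hypothetical mixin producer:

    // Hypothetical mixin producer.
    async function mixins() { return { imgs: 2, links: 5 }; }

    async function demo() {
        const wrong = { ...mixins() };         // {}: spreads the Promise object itself
        const right = { ...(await mixins()) }; // { imgs: 2, links: 5 }
        console.log(wrong, right);
    }
    demo();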
@@ -190,6 +190,16 @@ export class SnapshotFormatter extends AsyncService {
         const dt = Date.now() - t0;
         this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });

+        const formatted: FormattedPage = {
+            title: (snapshot.parsed?.title || snapshot.title || '').trim(),
+            description: (snapshot.description || '').trim(),
+            url: nominalUrl?.toString() || snapshot.href?.trim(),
+            publishedTime: snapshot.parsed?.publishedTime || undefined,
+            [Symbol.dispose]: () => { },
+        };
+
+        Object.assign(f, formatted);
+
         return f;
     }

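
The new `formatted` block seeds `f` with a trimmed title and description, a canonical URL, and an optional published time, and stubs `[Symbol.dispose]` so the page object satisfies the explicit-resource-management protocol (TS 5.2+ `using`). A hedged sketch of what that stub enables:

    // Assumes TS 5.2+ targeting a runtime (or polyfill) that defines Symbol.dispose.
    const page = {
        title: 'Example',
        [Symbol.dispose]: () => { /* no-op: nothing to release */ },
    };

    {
        using p = page; // p[Symbol.dispose]() runs automatically at block exit
        console.log(p.title);
    }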
@@ -412,7 +422,7 @@ export class SnapshotFormatter extends AsyncService {
             .value();
         }
         if (this.threadLocal.get('withLinksSummary')) {
-            const links = this.jsdomControl.inferSnapshot(snapshot).links;
+            const links = (await this.jsdomControl.inferSnapshot(snapshot)).links;

             if (this.threadLocal.get('withLinksSummary') === 'all') {
                 formatted.links = links;
@@ -482,11 +492,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         return f as FormattedPage;
     }

-    getGeneralSnapshotMixins(snapshot: PageSnapshot) {
+    async getGeneralSnapshotMixins(snapshot: PageSnapshot) {
         let inferred;
         const mixin: any = {};
         if (this.threadLocal.get('withImagesSummary')) {
-            inferred ??= this.jsdomControl.inferSnapshot(snapshot);
+            inferred ??= await this.jsdomControl.inferSnapshot(snapshot);
             const imageSummary = {} as { [k: string]: string; };
             const imageIdxTrack = new Map<string, number[]>();

@@ -511,7 +521,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
             .value();
         }
         if (this.threadLocal.get('withLinksSummary')) {
-            inferred ??= this.jsdomControl.inferSnapshot(snapshot);
+            inferred ??= await this.jsdomControl.inferSnapshot(snapshot);
             if (this.threadLocal.get('withLinksSummary') === 'all') {
                 mixin.links = inferred.links;
             } else {
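
`inferred ??= await this.jsdomControl.inferSnapshot(snapshot)` evaluates its right-hand side only when `inferred` is still nullish, so the (now worker-dispatched) inference runs at most once even when both the image and link branches execute. A standalone sketch of the caching behavior, with a hypothetical `infer`:

    let inferred: { links: object; } | undefined;

    // Hypothetical stand-in for the worker-dispatched inference.
    async function infer() {
        console.log('expensive inference runs');
        return { links: {} };
    }

    async function demo() {
        inferred ??= await infer(); // runs the inference once
        inferred ??= await infer(); // short-circuits: the RHS is never evaluated
    }
    demo();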
@@ -15,7 +15,7 @@ import { Logger, CloudFunctionRegistry, AsyncContext } from '../shared';
 import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
 import { ExpressServer } from 'civkit/civ-rpc/express';
 import http2 from 'http2';
-import { SearcherHost } from '../cloud-functions/sercher-serper';
+import { SearcherHost } from '../cloud-functions/searcher';
 import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
 import path from 'path';
 import fs from 'fs';
@@ -1 +1 @@
-Subproject commit 5e25cdd295bdbc41422055491532ea713c142b45
+Subproject commit b5e688359eaa87538ef5f43c1323ab92eca8ea33