mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-18 21:55:53 +08:00
chore: improve type definitions and type safety (#1153)
* chore: improve type definitions and type safety
* chore: format search result
* chore: use 'x-details'
* chore: remove 'domain' from search result
* chore: use 'x-respond-with'
* chore: optimize logic to return content
* chore: cleanup
* chore: optimize logic
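For context, a client-side sketch of how the new header could be exercised against the search endpoint (the endpoint URL, the token placeholder, and the header value 'favicon' are assumptions for illustration; only the 'X-Respond-With' header name and its content-skipping behaviour come from this commit):

// Hypothetical client call; endpoint URL and header value are illustrative.
// Per this commit, any 'X-Respond-With' value that does not contain the
// substring 'content' makes the searcher skip crawling the result pages.
const response = await fetch('https://s.jina.ai/?q=jina+reader', {
    headers: {
        'Authorization': 'Bearer <YOUR_TOKEN>',
        'Accept': 'application/json',
        'X-Respond-With': 'favicon',    // no 'content' substring -> search results only, flat charge
    },
});
const results = await response.json();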
This commit is contained in:
parent
4873578c62
commit
51109e01d2
@@ -3,6 +3,7 @@ import {
     RPCHost, RPCReflection,
     AssertionFailureError,
     objHashMd5B64Of,
+    assignMeta,
 } from 'civkit';
 import { singleton } from 'tsyringe';
 import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared';
@@ -88,8 +89,9 @@ export class SearcherHost extends RPCHost {
         @Param('q') q?: string,
     ) {
         const uid = await auth.solveUID();
-        const version = ctx?.req.get('x-version');
-        const isVersion2 = version?.replace('v', '') === '2';
+        // Return content by default
+        const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
+        const crawlWithoutContent = !respondWith.includes('content');

         let chargeAmount = 0;
         const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
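A minimal standalone sketch of the header handling added above (the helper function name and the sample values are illustrative; the defaulting to 'content' and the substring check are taken from the diff):

// Hypothetical helper reproducing the parsing logic from the hunk above.
function resolveRespondWith(headerValue: string | undefined) {
    const respondWith = headerValue ?? 'content';                 // default: return page content
    const crawlWithoutContent = !respondWith.includes('content'); // skip crawling when 'content' is absent
    return { respondWith, crawlWithoutContent };
}

resolveRespondWith(undefined);           // { respondWith: 'content', crawlWithoutContent: false }
resolveRespondWith('favicon');           // crawlWithoutContent: true  -> search results only
resolveRespondWith('markdown,content');  // crawlWithoutContent: false -> pages are crawled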
@@ -157,31 +159,26 @@ export class SearcherHost extends RPCHost {
             delete crawlOpts.timeoutMs;
         }

-        if (isVersion2) {
-            chargeAmount = 10000;
-            const result = [];
-            for (const x of r.organic.slice(0, count)) {
-                const url = new URL(x.link);
-                const favicon = await this.getFavicon(url.origin);
-
-                result.push({
-                    url: x.link,
-                    title: x.title,
-                    snippet: x.snippet,
-                    domain: url.origin,
-                    favicon: favicon,
-                });
-            }
-
-            return {
-                result,
-                usage: {
-                    tokens: chargeAmount,
-                }
-            };
+        let lastScrapped: any[] | undefined;
+        const targetResultCount = crawlWithoutContent ? count : count + 2;
+        const organicSearchResults = r.organic.slice(0, targetResultCount);
+        if (crawlWithoutContent || count === 0) {
+            const fakeResults = await this.fakeResult(crawlerOptions.respondWith, organicSearchResults, !crawlWithoutContent);
+            lastScrapped = fakeResults;
+            if (!crawlWithoutContent) {
+                chargeAmount = this.assignChargeAmount(lastScrapped);
+            } else {
+                chargeAmount = 10000;
+            }
+            this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent);
+            if ((!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) || count === 0) {
+                return lastScrapped;
+            }
+            return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
         }

-        const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, count + 2), crawlOpts,
+        const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
             CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
             count,
         );
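A condensed, hypothetical view of what the branch above decides (the wrapper function is not part of the commit; the flat 10000-token charge and the assignChargeAmount path are):

// Hypothetical condensation of the billing rule in the hunk above:
// content-less searches cost a flat 10000 tokens, full searches are
// billed from the scrapped pages via assignChargeAmount.
function chargeForSearch(
    crawlWithoutContent: boolean,
    scrapped: object[],
    assignChargeAmount: (pages: object[]) => number,
): number {
    return crawlWithoutContent ? 10000 : assignChargeAmount(scrapped);
}

The response format is then negotiated: clients accepting JSON (or requests with count === 0) get the result array directly, everyone else gets the plain-text rendering via assignTransferProtocolMeta.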
@@ -217,7 +214,6 @@ export class SearcherHost extends RPCHost {
             return sseStream;
         }

-        let lastScrapped: any[] | undefined;
         let earlyReturn = false;
         if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
             let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
@@ -248,6 +244,7 @@ export class SearcherHost extends RPCHost {
                 }
                 chargeAmount = this.assignChargeAmount(scrapped);

+                this.assignTokenUsage(scrapped, chargeAmount, crawlWithoutContent);
                 return scrapped;
             }

@@ -263,6 +260,7 @@ export class SearcherHost extends RPCHost {
                 chargeAmount = this.assignChargeAmount(lastScrapped);
             }

+            this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent);
             return lastScrapped;
         }

@@ -316,9 +314,70 @@ export class SearcherHost extends RPCHost {
         return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
     }

-    async *fetchSearchResults(
+    assignTokenUsage(result: FormattedPage[], chargeAmount: number, crawlWithoutContent: boolean) {
+        if (crawlWithoutContent) {
+            chargeAmount = 10000;
+            if (result) {
+                result.forEach((x) => {
+                    delete x.usage;
+                });
+            }
+        }
+
+        assignMeta(result, { usage: { tokens: chargeAmount } });
+    }
+
+    async fakeResult(
         mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
         searchResults?: SerperSearchResponse['organic'],
+        withContent: boolean = false
+    ) {
+        if (!searchResults) {
+            return [];
+        }
+
+        const resultArray = await Promise.all(searchResults.map(async (upstreamSearchResult, index) => {
+            const result = {
+                url: upstreamSearchResult.link,
+                title: upstreamSearchResult.title,
+                description: upstreamSearchResult.snippet,
+            } as FormattedPage;
+
+            const dataItems = [
+                { key: 'title', label: 'Title' },
+                { key: 'url', label: 'URL Source' },
+                { key: 'description', label: 'Description'},
+            ]
+
+            if (withContent) {
+                result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
+            }
+            if (mode.includes('favicon')) {
+                const url = new URL(upstreamSearchResult.link);
+                result.favicon = await this.getFavicon(url.origin);
+                dataItems.push({
+                    key: 'favicon',
+                    label: 'Favicon',
+                });
+            }
+
+            result.toString = function () {
+                const self = this as any;
+                return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n';
+            }
+            return result;
+        }));
+
+        resultArray.toString = function () {
+            return this.map((x, i) => x ? x.toString() : '').join('\n\n').trimEnd() + '\n';
+        };
+
+        return resultArray;
+    }
+
+    async *fetchSearchResults(
+        mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'favicon' | 'content',
+        searchResults?: SerperSearchResponse['organic'],
         options?: ExtraScrappingOptions,
         crawlerOptions?: CrawlerOptions,
         count?: number,
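To illustrate the plain-text rendering that the new fakeResult attaches to each entry, here is a self-contained sketch (the dataItems and the output format come from the diff above; the sample record is invented):

// Reproduces the per-result toString added in fakeResult, with made-up data.
const dataItems = [
    { key: 'title', label: 'Title' },
    { key: 'url', label: 'URL Source' },
    { key: 'description', label: 'Description' },
];
const sample: Record<string, string> = {
    title: 'Example Domain',
    url: 'https://example.com/',
    description: 'An example snippet',
};
const index = 0;
const text = dataItems.map((x) => `[${index + 1}] ${x.label}: ${sample[x.key]}`).join('\n') + '\n';
// text ===
// "[1] Title: Example Domain\n[1] URL Source: https://example.com/\n[1] Description: An example snippet\n"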
@@ -326,26 +385,7 @@ export class SearcherHost extends RPCHost {
         if (!searchResults) {
             return;
         }
-        if (count === 0) {
-            const resultArray = searchResults.map((upstreamSearchResult, i) => ({
-                url: upstreamSearchResult.link,
-                title: upstreamSearchResult.title,
-                description: upstreamSearchResult.snippet,
-                content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : '',
-                toString() {
-                    return `[${i + 1}] Title: ${this.title}
-[${i + 1}] URL Source: ${this.url}
-[${i + 1}] Description: ${this.description}
-`;
-                }
-
-            })) as FormattedPage[];
-            resultArray.toString = function () {
-                return this.map((x, i) => x ? x.toString() : '').join('\n\n').trimEnd() + '\n';
-            };
-            yield resultArray;
-            return;
-        }
         const urls = searchResults.map((x) => new URL(x.link));
         const snapshotMap = new WeakMap();
         for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
@@ -31,6 +31,7 @@ export interface FormattedPage {
     links?: { [k: string]: string; } | [string, string][];
     images?: { [k: string]: string; } | [string, string][];
     warning?: string;
+    favicon?: string;
     usage?: {
         total_tokens?: number;
         totalTokens?: number;
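For reference, a hypothetical object shaped like the extended interface, showing the new optional favicon field next to the usage metadata (field names follow the interface and the diff above; all values are invented):

// Hypothetical FormattedPage-shaped search entry for illustration only.
const entry = {
    title: 'Example Domain',
    url: 'https://example.com/',
    description: 'An example snippet',
    favicon: 'https://example.com/favicon.ico',  // new optional field added by this commit
    usage: { total_tokens: 1000 },
};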