mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-20 21:39:14 +08:00
fix
This commit is contained in:
parent
5ed3f90b9c
commit
5199b00eeb
@ -1,4 +1,4 @@
|
|||||||
import { assignTransferProtocolMeta, marshalErrorLike, RPCHost, RPCReflection } from 'civkit';
|
import { assignTransferProtocolMeta, marshalErrorLike, RPCHost, RPCReflection, AssertionFailureError } from 'civkit';
|
||||||
import { singleton } from 'tsyringe';
|
import { singleton } from 'tsyringe';
|
||||||
import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
@ -90,10 +90,6 @@ ${this.content}
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
||||||
if (!scrapped) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
const formatted = this.formatSnapshot(scrapped);
|
const formatted = this.formatSnapshot(scrapped);
|
||||||
|
|
||||||
if (scrapped.screenshot && screenshotEnabled) {
|
if (scrapped.screenshot && screenshotEnabled) {
|
||||||
@ -134,6 +130,10 @@ ${this.content}
|
|||||||
return formatted;
|
return formatted;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!lastScrapped) {
|
||||||
|
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
||||||
|
}
|
||||||
|
|
||||||
return this.formatSnapshot(lastScrapped);
|
return this.formatSnapshot(lastScrapped);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -148,6 +148,10 @@ ${this.content}
|
|||||||
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!lastScrapped) {
|
||||||
|
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
||||||
|
}
|
||||||
|
|
||||||
return `${this.formatSnapshot(lastScrapped)}`;
|
return `${this.formatSnapshot(lastScrapped)}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -153,7 +153,7 @@ function giveSnapshot() {
|
|||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
async *scrap(url: string, noCache: string | boolean = false) {
|
async *scrap(url: string, noCache: string | boolean = false): AsyncGenerator<PageSnapshot> {
|
||||||
const parsedUrl = new URL(url);
|
const parsedUrl = new URL(url);
|
||||||
// parsedUrl.search = '';
|
// parsedUrl.search = '';
|
||||||
parsedUrl.hash = '';
|
parsedUrl.hash = '';
|
||||||
|
Loading…
x
Reference in New Issue
Block a user