mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-20 01:09:06 +08:00
fix: favor nominal url over real url
This commit is contained in:
parent
bda7e76e50
commit
11a5a90611
@ -67,7 +67,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
this.emit('ready');
|
this.emit('ready');
|
||||||
}
|
}
|
||||||
|
|
||||||
async formatSnapshot(snapshot: PageSnapshot) {
|
async formatSnapshot(snapshot: PageSnapshot, nominalUrl?: string) {
|
||||||
const toBeTurnedToMd = snapshot.parsed?.content;
|
const toBeTurnedToMd = snapshot.parsed?.content;
|
||||||
let turnDownService = new TurndownService();
|
let turnDownService = new TurndownService();
|
||||||
for (const plugin of this.turnDownPlugins) {
|
for (const plugin of this.turnDownPlugins) {
|
||||||
@ -121,7 +121,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
const formatted = {
|
const formatted = {
|
||||||
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
||||||
url: snapshot.href?.trim(),
|
url: nominalUrl || snapshot.href?.trim(),
|
||||||
content: cleanText,
|
content: cleanText,
|
||||||
|
|
||||||
toString() {
|
toString() {
|
||||||
@ -188,7 +188,7 @@ ${this.content}
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(scrapped);
|
const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
|
||||||
|
|
||||||
if (scrapped.screenshot && screenshotEnabled) {
|
if (scrapped.screenshot && screenshotEnabled) {
|
||||||
sseStream.write({
|
sseStream.write({
|
||||||
@ -223,7 +223,7 @@ ${this.content}
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(scrapped);
|
const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
|
||||||
|
|
||||||
return formatted;
|
return formatted;
|
||||||
}
|
}
|
||||||
@ -232,7 +232,7 @@ ${this.content}
|
|||||||
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return await this.formatSnapshot(lastScrapped);
|
return await this.formatSnapshot(lastScrapped, urlToCrawl?.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
||||||
@ -241,7 +241,7 @@ ${this.content}
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(scrapped);
|
const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString());
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
||||||
}
|
}
|
||||||
@ -250,7 +250,7 @@ ${this.content}
|
|||||||
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return `${await this.formatSnapshot(lastScrapped)}`;
|
return `${await this.formatSnapshot(lastScrapped, urlToCrawl?.toString())}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user