mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 17:35:58 +08:00
fix
This commit is contained in:
parent
2dc0850c8c
commit
664d4b1c9f
@ -32,13 +32,21 @@ export class CrawlerHost extends RPCHost {
|
|||||||
const toBeTurnedToMd = snapshot.parsed?.content;
|
const toBeTurnedToMd = snapshot.parsed?.content;
|
||||||
const contentText = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd) : snapshot.text;
|
const contentText = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd) : snapshot.text;
|
||||||
|
|
||||||
const formatted = `Title: ${(snapshot.parsed?.title || snapshot.title || '').trim()}
|
const formatted = {
|
||||||
|
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
||||||
|
urlSource: snapshot.href.trim(),
|
||||||
|
markdownContent: contentText.trim(),
|
||||||
|
|
||||||
URL Source: ${snapshot.href.trim()}
|
toString() {
|
||||||
|
return `Title: ${this.title}
|
||||||
|
|
||||||
|
URL Source: ${this.urlSource}
|
||||||
|
|
||||||
Markdown Content:
|
Markdown Content:
|
||||||
${contentText.trim()}
|
${contentText}
|
||||||
`;
|
`;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
return formatted;
|
return formatted;
|
||||||
}
|
}
|
||||||
@ -47,6 +55,7 @@ ${contentText.trim()}
|
|||||||
runtime: {
|
runtime: {
|
||||||
memory: '4GiB',
|
memory: '4GiB',
|
||||||
timeoutSeconds: 540,
|
timeoutSeconds: 540,
|
||||||
|
concurrency: 4,
|
||||||
},
|
},
|
||||||
httpMethod: ['get', 'post'],
|
httpMethod: ['get', 'post'],
|
||||||
returnType: [String, OutputServerEventStream],
|
returnType: [String, OutputServerEventStream],
|
||||||
@ -60,20 +69,22 @@ ${contentText.trim()}
|
|||||||
) {
|
) {
|
||||||
const noSlashURL = ctx.req.url.slice(1);
|
const noSlashURL = ctx.req.url.slice(1);
|
||||||
const urlToCrawl = new URL(normalizeUrl(noSlashURL));
|
const urlToCrawl = new URL(normalizeUrl(noSlashURL));
|
||||||
|
const screenshotEnabled = Boolean(ctx.req.headers['x-screenshot']);
|
||||||
|
const noCache = Boolean(ctx.req.headers['x-no-cache']);
|
||||||
|
|
||||||
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
||||||
const sseStream = new OutputServerEventStream();
|
const sseStream = new OutputServerEventStream();
|
||||||
rpcReflect.return(sseStream);
|
rpcReflect.return(sseStream);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) {
|
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
||||||
if (!scrapped) {
|
if (!scrapped) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = this.formatSnapshot(scrapped);
|
const formatted = this.formatSnapshot(scrapped);
|
||||||
|
|
||||||
if (scrapped.screenshot) {
|
if (scrapped.screenshot && screenshotEnabled) {
|
||||||
sseStream.write({
|
sseStream.write({
|
||||||
event: 'screenshot',
|
event: 'screenshot',
|
||||||
data: scrapped.screenshot.toString('base64'),
|
data: scrapped.screenshot.toString('base64'),
|
||||||
@ -99,37 +110,25 @@ ${contentText.trim()}
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||||
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) {
|
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
||||||
if (!scrapped?.parsed?.content) {
|
if (!scrapped?.parsed?.content) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = this.formatSnapshot(scrapped);
|
const formatted = this.formatSnapshot(scrapped);
|
||||||
|
|
||||||
if (scrapped.screenshot) {
|
|
||||||
|
|
||||||
return [
|
|
||||||
{
|
|
||||||
type: 'image_url', image_url: {
|
|
||||||
url: `data:image/jpeg;base64,${scrapped.screenshot.toString('base64')}`,
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ type: 'text', content: formatted },
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
return formatted;
|
return formatted;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) {
|
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
||||||
if (!scrapped?.parsed?.content) {
|
if (!scrapped?.parsed?.content) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = this.formatSnapshot(scrapped);
|
const formatted = this.formatSnapshot(scrapped);
|
||||||
|
|
||||||
return assignTransferProtocolMeta(formatted, { contentType: 'text/plain', envelope: null });
|
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
||||||
}
|
}
|
||||||
|
|
||||||
throw new Error('Unreachable');
|
throw new Error('Unreachable');
|
||||||
|
@ -49,9 +49,9 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
return page.browser().connected && !page.isClosed();
|
return page.browser().connected && !page.isClosed();
|
||||||
}
|
}
|
||||||
}, {
|
}, {
|
||||||
max: 1 + Math.floor(os.freemem() / 1024 * 1024 * 1024),
|
max: Math.max(1 + Math.floor(os.freemem() / 1024 * 1024 * 1024), 4),
|
||||||
min: 1,
|
min: 1,
|
||||||
acquireTimeoutMillis: 15_000,
|
acquireTimeoutMillis: 60_000,
|
||||||
testOnBorrow: true,
|
testOnBorrow: true,
|
||||||
testOnReturn: true,
|
testOnReturn: true,
|
||||||
});
|
});
|
||||||
@ -72,7 +72,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
this.browser = await puppeteer.launch({
|
this.browser = await puppeteer.launch({
|
||||||
headless: true,
|
headless: true,
|
||||||
timeout: 300_000
|
timeout: 60_000
|
||||||
});
|
});
|
||||||
this.browser.once('disconnected', () => {
|
this.browser.once('disconnected', () => {
|
||||||
this.logger.warn(`Browser disconnected`);
|
this.logger.warn(`Browser disconnected`);
|
||||||
@ -91,6 +91,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
const preparations = [];
|
const preparations = [];
|
||||||
|
|
||||||
preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
|
preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
|
||||||
|
preparations.push(page.setBypassCSP(true));
|
||||||
preparations.push(page.setViewport({ width: 1920, height: 1080 }));
|
preparations.push(page.setViewport({ width: 1920, height: 1080 }));
|
||||||
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
|
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
|
||||||
page.emit('snapshot', snapshot);
|
page.emit('snapshot', snapshot);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user