This commit is contained in:
Yanlong Wang 2024-04-12 09:25:19 +08:00
parent 2dc0850c8c
commit 664d4b1c9f
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 23 additions and 23 deletions

View File

@@ -32,13 +32,21 @@ export class CrawlerHost extends RPCHost {
const toBeTurnedToMd = snapshot.parsed?.content; const toBeTurnedToMd = snapshot.parsed?.content;
const contentText = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd) : snapshot.text; const contentText = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd) : snapshot.text;
const formatted = `Title: ${(snapshot.parsed?.title || snapshot.title || '').trim()} const formatted = {
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
urlSource: snapshot.href.trim(),
markdownContent: contentText.trim(),
URL Source: ${snapshot.href.trim()} toString() {
return `Title: ${this.title}
URL Source: ${this.urlSource}
Markdown Content: Markdown Content:
${contentText.trim()} ${contentText}
`; `;
}
};
return formatted; return formatted;
} }
@@ -47,6 +55,7 @@ ${contentText.trim()}
runtime: { runtime: {
memory: '4GiB', memory: '4GiB',
timeoutSeconds: 540, timeoutSeconds: 540,
concurrency: 4,
}, },
httpMethod: ['get', 'post'], httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream], returnType: [String, OutputServerEventStream],
@@ -60,20 +69,22 @@ ${contentText.trim()}
) { ) {
const noSlashURL = ctx.req.url.slice(1); const noSlashURL = ctx.req.url.slice(1);
const urlToCrawl = new URL(normalizeUrl(noSlashURL)); const urlToCrawl = new URL(normalizeUrl(noSlashURL));
const screenshotEnabled = Boolean(ctx.req.headers['x-screenshot']);
const noCache = Boolean(ctx.req.headers['x-no-cache']);
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
const sseStream = new OutputServerEventStream(); const sseStream = new OutputServerEventStream();
rpcReflect.return(sseStream); rpcReflect.return(sseStream);
try { try {
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) { for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
if (!scrapped) { if (!scrapped) {
continue; continue;
} }
const formatted = this.formatSnapshot(scrapped); const formatted = this.formatSnapshot(scrapped);
if (scrapped.screenshot) { if (scrapped.screenshot && screenshotEnabled) {
sseStream.write({ sseStream.write({
event: 'screenshot', event: 'screenshot',
data: scrapped.screenshot.toString('base64'), data: scrapped.screenshot.toString('base64'),
@@ -99,37 +110,25 @@ ${contentText.trim()}
} }
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) { for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
if (!scrapped?.parsed?.content) { if (!scrapped?.parsed?.content) {
continue; continue;
} }
const formatted = this.formatSnapshot(scrapped); const formatted = this.formatSnapshot(scrapped);
if (scrapped.screenshot) {
return [
{
type: 'image_url', image_url: {
url: `data:image/jpeg;base64,${scrapped.screenshot.toString('base64')}`,
}
},
{ type: 'text', content: formatted },
];
}
return formatted; return formatted;
} }
} }
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) { for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
if (!scrapped?.parsed?.content) { if (!scrapped?.parsed?.content) {
continue; continue;
} }
const formatted = this.formatSnapshot(scrapped); const formatted = this.formatSnapshot(scrapped);
return assignTransferProtocolMeta(formatted, { contentType: 'text/plain', envelope: null }); return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
} }
throw new Error('Unreachable'); throw new Error('Unreachable');

View File

@@ -49,9 +49,9 @@ export class PuppeteerControl extends AsyncService {
return page.browser().connected && !page.isClosed(); return page.browser().connected && !page.isClosed();
} }
}, { }, {
max: 1 + Math.floor(os.freemem() / 1024 * 1024 * 1024), max: Math.max(1 + Math.floor(os.freemem() / 1024 * 1024 * 1024), 4),
min: 1, min: 1,
acquireTimeoutMillis: 15_000, acquireTimeoutMillis: 60_000,
testOnBorrow: true, testOnBorrow: true,
testOnReturn: true, testOnReturn: true,
}); });
@@ -72,7 +72,7 @@ export class PuppeteerControl extends AsyncService {
} }
this.browser = await puppeteer.launch({ this.browser = await puppeteer.launch({
headless: true, headless: true,
timeout: 300_000 timeout: 60_000
}); });
this.browser.once('disconnected', () => { this.browser.once('disconnected', () => {
this.logger.warn(`Browser disconnected`); this.logger.warn(`Browser disconnected`);
@@ -91,6 +91,7 @@ export class PuppeteerControl extends AsyncService {
const preparations = []; const preparations = [];
preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`)); preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
preparations.push(page.setBypassCSP(true));
preparations.push(page.setViewport({ width: 1920, height: 1080 })); preparations.push(page.setViewport({ width: 1920, height: 1080 }));
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => { preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
page.emit('snapshot', snapshot); page.emit('snapshot', snapshot);