From 664d4b1c9f65f1712a7545a484b35d6fc795dda5 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Fri, 12 Apr 2024 09:25:19 +0800 Subject: [PATCH] fix --- .../functions/src/cloud-functions/crawler.ts | 39 +++++++++---------- backend/functions/src/services/puppeteer.ts | 7 ++-- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index f58f8ee..d8e754e 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -32,13 +32,21 @@ export class CrawlerHost extends RPCHost { const toBeTurnedToMd = snapshot.parsed?.content; const contentText = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd) : snapshot.text; - const formatted = `Title: ${(snapshot.parsed?.title || snapshot.title || '').trim()} + const formatted = { + title: (snapshot.parsed?.title || snapshot.title || '').trim(), + urlSource: snapshot.href.trim(), + markdownContent: contentText.trim(), -URL Source: ${snapshot.href.trim()} + toString() { + return `Title: ${this.title} + +URL Source: ${this.urlSource} Markdown Content: -${contentText.trim()} +${contentText} `; + } + }; return formatted; } @@ -47,6 +55,7 @@ ${contentText.trim()} runtime: { memory: '4GiB', timeoutSeconds: 540, + concurrency: 4, }, httpMethod: ['get', 'post'], returnType: [String, OutputServerEventStream], @@ -60,20 +69,22 @@ ${contentText.trim()} ) { const noSlashURL = ctx.req.url.slice(1); const urlToCrawl = new URL(normalizeUrl(noSlashURL)); + const screenshotEnabled = Boolean(ctx.req.headers['x-screenshot']); + const noCache = Boolean(ctx.req.headers['x-no-cache']); if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { const sseStream = new OutputServerEventStream(); rpcReflect.return(sseStream); try { - for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) { + for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { if (!scrapped) { continue; } const formatted = this.formatSnapshot(scrapped); - if (scrapped.screenshot) { + if (scrapped.screenshot && screenshotEnabled) { sseStream.write({ event: 'screenshot', data: scrapped.screenshot.toString('base64'), @@ -99,37 +110,25 @@ ${contentText.trim()} } if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { - for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) { + for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { if (!scrapped?.parsed?.content) { continue; } const formatted = this.formatSnapshot(scrapped); - if (scrapped.screenshot) { - - return [ - { - type: 'image_url', image_url: { - url: `data:image/jpeg;base64,${scrapped.screenshot.toString('base64')}`, - } - }, - { type: 'text', content: formatted }, - ]; - } - return formatted; } } - for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) { + for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { if (!scrapped?.parsed?.content) { continue; } const formatted = this.formatSnapshot(scrapped); - return assignTransferProtocolMeta(formatted, { contentType: 'text/plain', envelope: null }); + return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null }); } throw new Error('Unreachable'); diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 6fd24fd..8545f25 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -49,9 +49,9 @@ export class PuppeteerControl extends AsyncService { return page.browser().connected && !page.isClosed(); } }, { - max: 1 + Math.floor(os.freemem() / 1024 * 1024 * 1024), + max: Math.max(1 + Math.floor(os.freemem() / 1024 * 1024 * 1024), 4), min: 1, - acquireTimeoutMillis: 15_000, + acquireTimeoutMillis: 60_000, testOnBorrow: true, testOnReturn: true, }); @@ -72,7 +72,7 @@ export class PuppeteerControl extends AsyncService { } this.browser = await puppeteer.launch({ headless: true, - timeout: 300_000 + timeout: 60_000 }); this.browser.once('disconnected', () => { this.logger.warn(`Browser disconnected`); @@ -91,6 +91,7 @@ export class PuppeteerControl extends AsyncService { const preparations = []; preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`)); + preparations.push(page.setBypassCSP(true)); preparations.push(page.setViewport({ width: 1920, height: 1080 })); preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => { page.emit('snapshot', snapshot);