diff --git a/backend/functions/package.json b/backend/functions/package.json index 6a6b277..8f81e54 100644 --- a/backend/functions/package.json +++ b/backend/functions/package.json @@ -18,8 +18,7 @@ "from-preset": "npm run build && npm run emu:reset && npm run emu:start", "start": "npm run shell", "deploy": "firebase deploy --only functions", - "logs": "firebase functions:log", - "gcp-build": "node node_modules/puppeteer/install.js" + "logs": "firebase functions:log" }, "engines": { "node": "18" diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index d8e754e..320251f 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -30,7 +30,9 @@ export class CrawlerHost extends RPCHost { formatSnapshot(snapshot: PageSnapshot) { const toBeTurnedToMd = snapshot.parsed?.content; - const contentText = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd) : snapshot.text; + const turnedDown = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd).trim() : ''; + + const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text.trim(); const formatted = { title: (snapshot.parsed?.title || snapshot.title || '').trim(), @@ -51,6 +53,16 @@ ${contentText} return formatted; } + @CloudHTTPv2({ + name: 'crawl2', + runtime: { + memory: '4GiB', + timeoutSeconds: 540, + concurrency: 4, + }, + httpMethod: ['get', 'post'], + returnType: [String, OutputServerEventStream], + }) @CloudHTTPv2({ runtime: { memory: '4GiB', diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 8545f25..7205d8c 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -71,8 +71,7 @@ export class PuppeteerControl extends AsyncService { } } this.browser = await puppeteer.launch({ - headless: true, - timeout: 60_000 + headless: true }); this.browser.once('disconnected', () => { this.logger.warn(`Browser disconnected`);