This commit is contained in:
Yanlong Wang 2024-04-12 10:59:37 +08:00
parent 629ab270be
commit 78c8444096
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 18 additions and 17 deletions

View File

@ -18,7 +18,8 @@
"from-preset": "npm run build && npm run emu:reset && npm run emu:start", "from-preset": "npm run build && npm run emu:reset && npm run emu:start",
"start": "npm run shell", "start": "npm run shell",
"deploy": "firebase deploy --only functions", "deploy": "firebase deploy --only functions",
"logs": "firebase functions:log" "logs": "firebase functions:log",
"gcp-build": "npx puppeteer browsers install chrome"
}, },
"engines": { "engines": {
"node": "18" "node": "18"

View File

@ -36,16 +36,16 @@ export class CrawlerHost extends RPCHost {
const formatted = { const formatted = {
title: (snapshot.parsed?.title || snapshot.title || '').trim(), title: (snapshot.parsed?.title || snapshot.title || '').trim(),
urlSource: snapshot.href.trim(), url: snapshot.href.trim(),
markdownContent: contentText.trim(), content: contentText.trim(),
toString() { toString() {
return `Title: ${this.title} return `Title: ${this.title}
URL Source: ${this.urlSource} URL Source: ${this.url}
Markdown Content: Markdown Content:
${contentText} ${this.content}
`; `;
} }
}; };

View File

@ -145,7 +145,7 @@ function giveSnapshot() {
async *scrap(url: string, noCache: string | boolean = false) { async *scrap(url: string, noCache: string | boolean = false) {
const parsedUrl = new URL(url); const parsedUrl = new URL(url);
parsedUrl.search = ''; // parsedUrl.search = '';
parsedUrl.hash = ''; parsedUrl.hash = '';
const normalizedUrl = parsedUrl.toString().toLowerCase(); const normalizedUrl = parsedUrl.toString().toLowerCase();
const digest = md5Hasher.hash(normalizedUrl); const digest = md5Hasher.hash(normalizedUrl);
@ -191,7 +191,17 @@ function giveSnapshot() {
page.on('snapshot', hdl); page.on('snapshot', hdl);
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 }) const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
.then(async (r) => { .catch((err) => {
this.logger.warn(`Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
return Promise.reject(new AssertionFailureError({
message: `Failed to goto ${url}: ${err}`,
cause: err,
}));
}).finally(async () => {
finalized = true;
if (!snapshot?.html) {
return;
}
screenshot = await page.screenshot({ screenshot = await page.screenshot({
type: 'jpeg', type: 'jpeg',
quality: 85, quality: 85,
@ -210,16 +220,6 @@ function giveSnapshot() {
).catch((err) => { ).catch((err) => {
this.logger.warn(`Failed to save snapshot`, { err: marshalErrorLike(err) }); this.logger.warn(`Failed to save snapshot`, { err: marshalErrorLike(err) });
}); });
return r;
}).catch((err) => {
this.logger.warn(`Failed to goto ${url}`, { err: marshalErrorLike(err) });
return Promise.reject(new AssertionFailureError({
message: `Failed to goto ${url}: ${err}`,
cause: err,
}));
}).finally(() => {
finalized = true;
}); });
try { try {