mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-16 18:25:58 +08:00
fix
This commit is contained in:
parent
629ab270be
commit
78c8444096
@ -18,7 +18,8 @@
|
|||||||
"from-preset": "npm run build && npm run emu:reset && npm run emu:start",
|
"from-preset": "npm run build && npm run emu:reset && npm run emu:start",
|
||||||
"start": "npm run shell",
|
"start": "npm run shell",
|
||||||
"deploy": "firebase deploy --only functions",
|
"deploy": "firebase deploy --only functions",
|
||||||
"logs": "firebase functions:log"
|
"logs": "firebase functions:log",
|
||||||
|
"gcp-build": "npx puppeteer browsers install chrome"
|
||||||
},
|
},
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": "18"
|
"node": "18"
|
||||||
|
@ -36,16 +36,16 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
const formatted = {
|
const formatted = {
|
||||||
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
||||||
urlSource: snapshot.href.trim(),
|
url: snapshot.href.trim(),
|
||||||
markdownContent: contentText.trim(),
|
content: contentText.trim(),
|
||||||
|
|
||||||
toString() {
|
toString() {
|
||||||
return `Title: ${this.title}
|
return `Title: ${this.title}
|
||||||
|
|
||||||
URL Source: ${this.urlSource}
|
URL Source: ${this.url}
|
||||||
|
|
||||||
Markdown Content:
|
Markdown Content:
|
||||||
${contentText}
|
${this.content}
|
||||||
`;
|
`;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -145,7 +145,7 @@ function giveSnapshot() {
|
|||||||
|
|
||||||
async *scrap(url: string, noCache: string | boolean = false) {
|
async *scrap(url: string, noCache: string | boolean = false) {
|
||||||
const parsedUrl = new URL(url);
|
const parsedUrl = new URL(url);
|
||||||
parsedUrl.search = '';
|
// parsedUrl.search = '';
|
||||||
parsedUrl.hash = '';
|
parsedUrl.hash = '';
|
||||||
const normalizedUrl = parsedUrl.toString().toLowerCase();
|
const normalizedUrl = parsedUrl.toString().toLowerCase();
|
||||||
const digest = md5Hasher.hash(normalizedUrl);
|
const digest = md5Hasher.hash(normalizedUrl);
|
||||||
@ -191,7 +191,17 @@ function giveSnapshot() {
|
|||||||
page.on('snapshot', hdl);
|
page.on('snapshot', hdl);
|
||||||
|
|
||||||
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
|
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
|
||||||
.then(async (r) => {
|
.catch((err) => {
|
||||||
|
this.logger.warn(`Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
|
||||||
|
return Promise.reject(new AssertionFailureError({
|
||||||
|
message: `Failed to goto ${url}: ${err}`,
|
||||||
|
cause: err,
|
||||||
|
}));
|
||||||
|
}).finally(async () => {
|
||||||
|
finalized = true;
|
||||||
|
if (!snapshot?.html) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
screenshot = await page.screenshot({
|
screenshot = await page.screenshot({
|
||||||
type: 'jpeg',
|
type: 'jpeg',
|
||||||
quality: 85,
|
quality: 85,
|
||||||
@ -210,16 +220,6 @@ function giveSnapshot() {
|
|||||||
).catch((err) => {
|
).catch((err) => {
|
||||||
this.logger.warn(`Failed to save snapshot`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Failed to save snapshot`, { err: marshalErrorLike(err) });
|
||||||
});
|
});
|
||||||
|
|
||||||
return r;
|
|
||||||
}).catch((err) => {
|
|
||||||
this.logger.warn(`Failed to goto ${url}`, { err: marshalErrorLike(err) });
|
|
||||||
return Promise.reject(new AssertionFailureError({
|
|
||||||
message: `Failed to goto ${url}: ${err}`,
|
|
||||||
cause: err,
|
|
||||||
}));
|
|
||||||
}).finally(() => {
|
|
||||||
finalized = true;
|
|
||||||
});
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user