fix: encoding of from-file snapshots

This commit is contained in:
Yanlong Wang 2025-04-11 11:50:13 +08:00
parent b6ac1782dc
commit 5f83d862dd
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 39 additions and 4 deletions

View File

@ -14,7 +14,7 @@ import { cleanAttribute } from '../utils/misc';
import _ from 'lodash'; import _ from 'lodash';
import { STATUS_CODES } from 'http'; import { STATUS_CODES } from 'http';
import type { CrawlerOptions } from '../dto/crawler-options'; import type { CrawlerOptions } from '../dto/crawler-options';
import { readFile } from 'fs/promises'; import { readFile } from '../utils/encoding';
import { pathToFileURL } from 'url'; import { pathToFileURL } from 'url';
import { countGPTToken } from '../shared/utils/openai'; import { countGPTToken } from '../shared/utils/openai';
@ -804,7 +804,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
overrideContentType = undefined; overrideContentType = undefined;
} }
const contentType = (overrideContentType || await file.mimeType).toLowerCase(); const contentType: string = (overrideContentType || await file.mimeType).toLowerCase();
const fileName = overrideFileName || `${url.origin}${url.pathname}`; const fileName = overrideFileName || `${url.origin}${url.pathname}`;
const snapshot: PageSnapshot = { const snapshot: PageSnapshot = {
title: '', title: '',
@ -821,11 +821,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return snapshot; return snapshot;
} }
try { try {
const encoding: string | undefined = contentType.includes('charset=') ? contentType.split('charset=')[1]?.trim().toLowerCase() : 'utf-8';
if (contentType.startsWith('text/html')) { if (contentType.startsWith('text/html')) {
if ((await file.size) > 1024 * 1024 * 32) { if ((await file.size) > 1024 * 1024 * 32) {
throw new AssertionFailureError(`Failed to access ${url}: file too large`); throw new AssertionFailureError(`Failed to access ${url}: file too large`);
} }
snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' }); snapshot.html = await readFile(await file.filePath, encoding);
return snapshot; return snapshot;
} }
@ -833,7 +834,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
if ((await file.size) > 1024 * 1024 * 32) { if ((await file.size) > 1024 * 1024 * 32) {
throw new AssertionFailureError(`Failed to access ${url}: file too large`); throw new AssertionFailureError(`Failed to access ${url}: file too large`);
} }
snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' }); snapshot.text = await readFile(await file.filePath, encoding);
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`; snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
return snapshot; return snapshot;

34
src/utils/encoding.ts Normal file
View File

@ -0,0 +1,34 @@
import { createReadStream } from 'fs';
import { Readable } from 'stream';
import { TextDecoderStream } from 'stream/web';
/**
 * Decode a byte stream into a string using the given text encoding.
 *
 * The encoding label typically originates from an untrusted remote
 * `Content-Type` charset parameter. Unknown/invalid labels make the
 * `TextDecoderStream` constructor throw a `RangeError`, so we fall back
 * to UTF-8 instead of failing the whole snapshot.
 *
 * @param fileStream - Node readable stream yielding raw bytes.
 * @param encoding - WHATWG encoding label (e.g. 'utf-8', 'gbk'); defaults to UTF-8.
 * @returns The fully decoded text. Malformed byte sequences are replaced
 *          with U+FFFD rather than thrown, because `fatal` is false.
 */
export async function decodeFileStream(
    fileStream: Readable,
    encoding: string = 'utf-8',
): Promise<string> {
    let decodeStream: TextDecoderStream;
    try {
        decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
    } catch {
        // Unknown/invalid encoding label — degrade to UTF-8 rather than crash.
        decodeStream = new TextDecoderStream('utf-8', { fatal: false, ignoreBOM: false });
    }
    const chunks: string[] = [];
    // pipeThrough() returns the decoder's readable (string) side; iterate it directly.
    for await (const chunk of Readable.toWeb(fileStream).pipeThrough(decodeStream)) {
        chunks.push(chunk);
    }
    return chunks.join('');
}
/**
 * Read an entire file from disk and decode it with the given encoding.
 *
 * Mirrors the stream-decoding logic above but sources the bytes from a
 * file path. Unknown/invalid encoding labels (often taken from a remote
 * `Content-Type` charset) fall back to UTF-8 instead of letting the
 * `TextDecoderStream` constructor throw a `RangeError`.
 *
 * @param filePath - Path of the file to read.
 * @param encoding - WHATWG encoding label; defaults to UTF-8.
 * @returns Decoded file contents; malformed sequences become U+FFFD
 *          because `fatal` is false.
 */
export async function readFile(
    filePath: string,
    encoding: string = 'utf-8',
): Promise<string> {
    let decodeStream: TextDecoderStream;
    try {
        decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
    } catch {
        // Unknown/invalid encoding label — degrade to UTF-8 rather than crash.
        decodeStream = new TextDecoderStream('utf-8', { fatal: false, ignoreBOM: false });
    }
    const chunks: string[] = [];
    // pipeThrough() returns the decoder's readable (string) side; iterate it directly.
    for await (const chunk of Readable.toWeb(createReadStream(filePath)).pipeThrough(decodeStream)) {
        chunks.push(chunk);
    }
    return chunks.join('');
}