diff --git a/src/services/snapshot-formatter.ts b/src/services/snapshot-formatter.ts index ffb6792..ab7eb32 100644 --- a/src/services/snapshot-formatter.ts +++ b/src/services/snapshot-formatter.ts @@ -14,7 +14,7 @@ import { cleanAttribute } from '../utils/misc'; import _ from 'lodash'; import { STATUS_CODES } from 'http'; import type { CrawlerOptions } from '../dto/crawler-options'; -import { readFile } from 'fs/promises'; +import { readFile } from '../utils/encoding'; import { pathToFileURL } from 'url'; import { countGPTToken } from '../shared/utils/openai'; @@ -804,7 +804,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; overrideContentType = undefined; } - const contentType = (overrideContentType || await file.mimeType).toLowerCase(); + const contentType: string = (overrideContentType || await file.mimeType).toLowerCase(); const fileName = overrideFileName || `${url.origin}${url.pathname}`; const snapshot: PageSnapshot = { title: '', @@ -821,11 +821,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; return snapshot; } try { + const encoding: string | undefined = contentType.includes('charset=') ? contentType.split('charset=')[1]?.trim().toLowerCase() : 'utf-8'; if (contentType.startsWith('text/html')) { if ((await file.size) > 1024 * 1024 * 32) { throw new AssertionFailureError(`Failed to access ${url}: file too large`); } - snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' }); + snapshot.html = await readFile(await file.filePath, encoding); return snapshot; } @@ -833,7 +834,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; if ((await file.size) > 1024 * 1024 * 32) { throw new AssertionFailureError(`Failed to access ${url}: file too large`); } - snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' }); + snapshot.text = await readFile(await file.filePath, encoding); snapshot.html = `
${snapshot.text}`; return snapshot; diff --git a/src/utils/encoding.ts b/src/utils/encoding.ts new file mode 100644 index 0000000..0f34a9b --- /dev/null +++ b/src/utils/encoding.ts @@ -0,0 +1,34 @@ +import { createReadStream } from 'fs'; +import { Readable } from 'stream'; +import { TextDecoderStream } from 'stream/web'; + +export async function decodeFileStream( + fileStream: Readable, + encoding: string = 'utf-8', +): Promise