fix: encoding of snapshots created from files

This commit is contained in:
Yanlong Wang 2025-04-11 11:50:13 +08:00
parent b6ac1782dc
commit 5f83d862dd
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 39 additions and 4 deletions

View File

@ -14,7 +14,7 @@ import { cleanAttribute } from '../utils/misc';
import _ from 'lodash';
import { STATUS_CODES } from 'http';
import type { CrawlerOptions } from '../dto/crawler-options';
import { readFile } from 'fs/promises';
import { readFile } from '../utils/encoding';
import { pathToFileURL } from 'url';
import { countGPTToken } from '../shared/utils/openai';
@ -804,7 +804,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
overrideContentType = undefined;
}
const contentType = (overrideContentType || await file.mimeType).toLowerCase();
const contentType: string = (overrideContentType || await file.mimeType).toLowerCase();
const fileName = overrideFileName || `${url.origin}${url.pathname}`;
const snapshot: PageSnapshot = {
title: '',
@ -821,11 +821,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return snapshot;
}
try {
const encoding: string | undefined = contentType.includes('charset=') ? contentType.split('charset=')[1]?.trim().toLowerCase() : 'utf-8';
if (contentType.startsWith('text/html')) {
if ((await file.size) > 1024 * 1024 * 32) {
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
}
snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });
snapshot.html = await readFile(await file.filePath, encoding);
return snapshot;
}
@ -833,7 +834,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
if ((await file.size) > 1024 * 1024 * 32) {
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
}
snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
snapshot.text = await readFile(await file.filePath, encoding);
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
return snapshot;

34
src/utils/encoding.ts Normal file
View File

@ -0,0 +1,34 @@
import { createReadStream } from 'fs';
import { Readable } from 'stream';
import { TextDecoderStream } from 'stream/web';
/**
 * Creates a `TextDecoderStream` for a charset label that may come straight
 * from a `Content-Type` header.
 *
 * The label is cleaned up before use: anything after the first `;` is dropped,
 * surrounding whitespace and quote characters are removed, and an empty result
 * is treated as `utf-8`. If the runtime rejects the cleaned label with a
 * `RangeError` (unsupported encoding), a UTF-8 decoder is returned instead so
 * that a bad label does not make the whole file unreadable.
 *
 * Decoding is non-fatal (malformed bytes become U+FFFD) and a leading BOM is
 * stripped, matching the options the callers have always used.
 */
function createDecoderStream(encoding: string): TextDecoderStream {
    const options = { fatal: false, ignoreBOM: false };
    const rawLabel = encoding.split(';', 1)[0] ?? '';
    const label = rawLabel.trim().replace(/^["']+|["']+$/g, '').trim() || 'utf-8';

    try {
        return new TextDecoderStream(label, options);
    } catch (err: unknown) {
        // Only an unsupported-label error is recoverable; anything else is a real failure.
        if (err instanceof RangeError) {
            return new TextDecoderStream('utf-8', options);
        }
        throw err;
    }
}

/**
 * Reads a Node.js byte stream to completion and decodes it into a string.
 *
 * @param fileStream - Readable stream of raw bytes; it is consumed entirely.
 * @param encoding - Charset label (default `utf-8`). Quoted labels and labels
 *   followed by `;`-separated parameters are accepted; unsupported labels fall
 *   back to UTF-8.
 * @returns The full decoded text. Multi-byte sequences split across chunks are
 *   decoded correctly because decoding is done by a streaming decoder.
 * @throws Whatever error the underlying stream emits while being read.
 */
export async function decodeFileStream(
    fileStream: Readable,
    encoding: string = 'utf-8',
): Promise<string> {
    const decoder = createDecoderStream(encoding);
    Readable.toWeb(fileStream).pipeThrough(decoder);

    const chunks: string[] = [];
    for await (const chunk of decoder.readable) {
        chunks.push(chunk);
    }

    return chunks.join('');
}

/**
 * Reads a file from disk and decodes it into a string using the given charset.
 *
 * @param filePath - Path of the file to read.
 * @param encoding - Charset label (default `utf-8`); handled exactly as in
 *   {@link decodeFileStream}.
 * @returns The decoded file contents.
 * @throws If the file cannot be opened or read.
 */
export async function readFile(
    filePath: string,
    encoding: string = 'utf-8',
): Promise<string> {
    // Delegate so both entry points share a single decoding implementation.
    return decodeFileStream(createReadStream(filePath), encoding);
}