mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-20 02:29:13 +08:00
fix: encoding of from file snapshots
This commit is contained in:
parent
b6ac1782dc
commit
5f83d862dd
@ -14,7 +14,7 @@ import { cleanAttribute } from '../utils/misc';
|
||||
import _ from 'lodash';
|
||||
import { STATUS_CODES } from 'http';
|
||||
import type { CrawlerOptions } from '../dto/crawler-options';
|
||||
import { readFile } from 'fs/promises';
|
||||
import { readFile } from '../utils/encoding';
|
||||
import { pathToFileURL } from 'url';
|
||||
import { countGPTToken } from '../shared/utils/openai';
|
||||
|
||||
@ -804,7 +804,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
overrideContentType = undefined;
|
||||
}
|
||||
|
||||
const contentType = (overrideContentType || await file.mimeType).toLowerCase();
|
||||
const contentType: string = (overrideContentType || await file.mimeType).toLowerCase();
|
||||
const fileName = overrideFileName || `${url.origin}${url.pathname}`;
|
||||
const snapshot: PageSnapshot = {
|
||||
title: '',
|
||||
@ -821,11 +821,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
return snapshot;
|
||||
}
|
||||
try {
|
||||
const encoding: string | undefined = contentType.includes('charset=') ? contentType.split('charset=')[1]?.trim().toLowerCase() : 'utf-8';
|
||||
if (contentType.startsWith('text/html')) {
|
||||
if ((await file.size) > 1024 * 1024 * 32) {
|
||||
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
||||
}
|
||||
snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });
|
||||
snapshot.html = await readFile(await file.filePath, encoding);
|
||||
|
||||
return snapshot;
|
||||
}
|
||||
@ -833,7 +834,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
if ((await file.size) > 1024 * 1024 * 32) {
|
||||
throw new AssertionFailureError(`Failed to access ${url}: file too large`);
|
||||
}
|
||||
snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
|
||||
snapshot.text = await readFile(await file.filePath, encoding);
|
||||
snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
|
||||
|
||||
return snapshot;
|
||||
|
34
src/utils/encoding.ts
Normal file
34
src/utils/encoding.ts
Normal file
@ -0,0 +1,34 @@
|
||||
import { createReadStream } from 'fs';
|
||||
import { Readable } from 'stream';
|
||||
import { TextDecoderStream } from 'stream/web';
|
||||
|
||||
/**
 * Pipes a Node.js byte stream through an already-constructed decoder and
 * concatenates every decoded chunk into one string.
 *
 * Private helper shared by `decodeFileStream` and `readFile` so the
 * pipe-and-collect logic exists in exactly one place.
 */
async function collectDecodedText(
    source: Readable,
    decodeStream: TextDecoderStream,
): Promise<string> {
    // pipeThrough() returns the readable side of the transform, i.e. the decoded text.
    const decoded = Readable.toWeb(source).pipeThrough(decodeStream);
    const chunks: string[] = [];

    for await (const chunk of decoded) {
        chunks.push(chunk);
    }

    return chunks.join('');
}

/**
 * Decodes a Node.js readable byte stream into a string.
 *
 * The decoder is created with `fatal: false` and `ignoreBOM: false`.
 *
 * @param fileStream - Byte stream to consume. It is read to the end on success.
 * @param encoding - Encoding label passed to `TextDecoderStream`; defaults to `'utf-8'`.
 * @returns The full decoded text.
 * @throws Whatever `TextDecoderStream` throws for an unsupported `encoding`
 *   label. In that case `fileStream` is destroyed first so it is not left
 *   open and unconsumed.
 */
export async function decodeFileStream(
    fileStream: Readable,
    encoding: string = 'utf-8',
): Promise<string> {
    let decodeStream: TextDecoderStream;
    try {
        decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
    } catch (err) {
        // Nobody will ever read the stream now; release its underlying resource.
        fileStream.destroy();
        throw err;
    }

    return collectDecodedText(fileStream, decodeStream);
}

/**
 * Reads a file from disk and decodes its bytes into a string.
 *
 * @param filePath - Path of the file to read.
 * @param encoding - Encoding label passed to `TextDecoderStream`; defaults to `'utf-8'`.
 * @returns The full decoded file content.
 * @throws Whatever `TextDecoderStream` throws for an unsupported `encoding`
 *   label (raised before the file is opened), or the read stream's error if
 *   the file cannot be read.
 */
export async function readFile(
    filePath: string,
    encoding: string = 'utf-8',
): Promise<string> {
    // Build the decoder first: an invalid label must fail before any file handle is opened.
    const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });

    return collectDecodedText(createReadStream(filePath), decodeStream);
}
|
Loading…
x
Reference in New Issue
Block a user