mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-18 17:45:55 +08:00
fix: compressed response from curl
This commit is contained in:
parent
6a58de590c
commit
0f36fe81a6
@ -10,6 +10,7 @@ import { AssertionFailureError, FancyFile } from 'civkit';
|
|||||||
import { TempFileManager } from '../shared';
|
import { TempFileManager } from '../shared';
|
||||||
import { readFile } from 'fs/promises';
|
import { readFile } from 'fs/promises';
|
||||||
import { pathToFileURL } from 'url';
|
import { pathToFileURL } from 'url';
|
||||||
|
import { createBrotliDecompress, createInflate, createGunzip } from 'zlib';
|
||||||
|
|
||||||
@singleton()
|
@singleton()
|
||||||
export class CurlControl extends AsyncService {
|
export class CurlControl extends AsyncService {
|
||||||
@ -59,6 +60,7 @@ export class CurlControl extends AsyncService {
|
|||||||
text: '',
|
text: '',
|
||||||
} as PageSnapshot;
|
} as PageSnapshot;
|
||||||
|
|
||||||
|
let contentType = '';
|
||||||
const result = await new Promise<{
|
const result = await new Promise<{
|
||||||
statusCode: number,
|
statusCode: number,
|
||||||
data?: FancyFile,
|
data?: FancyFile,
|
||||||
@ -102,14 +104,20 @@ export class CurlControl extends AsyncService {
|
|||||||
});
|
});
|
||||||
curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB
|
curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB
|
||||||
let status = -1;
|
let status = -1;
|
||||||
let contentType = '';
|
let contentEncoding = '';
|
||||||
curl.on('stream', (stream, statusCode, headers) => {
|
curl.on('stream', (stream, statusCode, headers) => {
|
||||||
status = statusCode;
|
status = statusCode;
|
||||||
outerLoop:
|
outerLoop:
|
||||||
for (const headerVec of headers) {
|
for (const headerVec of headers) {
|
||||||
for (const [k, v] of Object.entries(headerVec)) {
|
for (const [k, v] of Object.entries(headerVec)) {
|
||||||
if (k.toLowerCase() === 'content-type') {
|
const kl = k.toLowerCase();
|
||||||
|
if (kl === 'content-type') {
|
||||||
contentType = v.toLowerCase();
|
contentType = v.toLowerCase();
|
||||||
|
}
|
||||||
|
if (kl === 'content-encoding') {
|
||||||
|
contentEncoding = v.toLowerCase();
|
||||||
|
}
|
||||||
|
if (contentType && contentEncoding) {
|
||||||
break outerLoop;
|
break outerLoop;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -130,6 +138,30 @@ export class CurlControl extends AsyncService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
switch (contentEncoding) {
|
||||||
|
case 'gzip': {
|
||||||
|
const decompressed = createGunzip();
|
||||||
|
stream.pipe(decompressed);
|
||||||
|
stream = decompressed;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'deflate': {
|
||||||
|
const decompressed = createInflate();
|
||||||
|
stream.pipe(decompressed);
|
||||||
|
stream = decompressed;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'br': {
|
||||||
|
const decompressed = createBrotliDecompress();
|
||||||
|
stream.pipe(decompressed);
|
||||||
|
stream = decompressed;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default: {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const fpath = this.tempFileManager.alloc();
|
const fpath = this.tempFileManager.alloc();
|
||||||
const fancyFile = FancyFile.auto(stream, fpath);
|
const fancyFile = FancyFile.auto(stream, fpath);
|
||||||
this.tempFileManager.bindPathTo(fancyFile, fpath);
|
this.tempFileManager.bindPathTo(fancyFile, fpath);
|
||||||
@ -147,8 +179,13 @@ export class CurlControl extends AsyncService {
|
|||||||
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`);
|
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (contentType === 'application/octet-stream') {
|
||||||
|
// Content declared as generic binary (application/octet-stream) is treated the same as an unknown content type.
|
||||||
|
contentType = '';
|
||||||
|
}
|
||||||
|
|
||||||
if (result.data) {
|
if (result.data) {
|
||||||
const mimeType: string = await result.data.mimeType;
|
const mimeType: string = contentType || await result.data.mimeType;
|
||||||
if (mimeType.startsWith('text/html')) {
|
if (mimeType.startsWith('text/html')) {
|
||||||
if ((await result.data.size) > 1024 * 1024 * 32) {
|
if ((await result.data.size) > 1024 * 1024 * 32) {
|
||||||
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
|
throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user