fix(scrapeURL/engines/fetch): discover charset and re-decode (#1221)

* fix(scrapeURL/engines/fetch): discover charset and re-decode

* fix(snips/scrape): allow more time for stealth proxy
This commit is contained in:
Gergő Móricz 2025-02-20 18:56:15 +01:00 committed by GitHub
parent e417f83c28
commit 283a3bfef3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 22 additions and 2 deletions

View File

@ -49,6 +49,14 @@ describe("Scrape tests", () => {
expect(response.markdown).toContain("Firecrawl"); expect(response.markdown).toContain("Firecrawl");
}, 10000); }, 10000);
// Regression test: a page served in a non-UTF-8 encoding must still come back
// as readable text. (The fixture is presumably Shift_JIS, going by its file
// name -- confirm against the live page.)
it.concurrent("handles non-UTF-8 encodings", async () => {
  const { markdown } = await scrape({
    url: "https://www.rtpro.yamaha.co.jp/RT/docs/misc/kanji-sjis.html",
  });

  // These kana only appear intact if the body was decoded with the right charset.
  expect(markdown).toContain("ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た");
}, 15000);
if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) { if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
it.concurrent("self-hosted proxy works", async () => { it.concurrent("self-hosted proxy works", async () => {
const response = await scrape({ const response = await scrape({
@ -158,7 +166,7 @@ describe("Scrape tests", () => {
url: "http://firecrawl.dev", url: "http://firecrawl.dev",
proxy: "stealth", proxy: "stealth",
}); });
}, 15000); }, 30000);
}); });
describe("PDF (f-e dependant)", () => { describe("PDF (f-e dependant)", () => {

View File

@ -8,6 +8,7 @@ import {
makeSecureDispatcher, makeSecureDispatcher,
} from "../utils/safeFetch"; } from "../utils/safeFetch";
import { MockState, saveMock } from "../../lib/mock"; import { MockState, saveMock } from "../../lib/mock";
import { TextDecoder } from "util";
export async function scrapeURLWithFetch( export async function scrapeURLWithFetch(
meta: Meta, meta: Meta,
@ -71,9 +72,20 @@ export async function scrapeURLWithFetch(
})(), })(),
]); ]);
// Keep the raw bytes around so the body can be decoded a second time if the
// document declares a charset other than UTF-8.
const buf = Buffer.from(await x.arrayBuffer());

// First pass: decode as UTF-8. A charset declaration is plain ASCII, so it
// stays readable here even when the rest of the text is mis-decoded.
let text = buf.toString("utf8");

// Capture only the encoding label itself. Stopping at whitespace, quotes, `;`,
// `/` or `>` keeps the label clean for `<meta charset="x" />`, for
// `content="text/html; charset=x"`, and for tags with attributes after the
// charset. The match is case-insensitive so `CHARSET=` is found too.
const charset = (text.match(/charset\s*=\s*["']?\s*([^\s"'>\/;]+)/i) ?? [])[1];

try {
  if (charset) {
    text = new TextDecoder(charset).decode(buf);
  }
} catch (error) {
  // TextDecoder rejects labels it does not know; in that case keep the UTF-8
  // decoding from the first pass rather than failing the whole scrape.
  meta.logger.warn("Failed to re-parse with correct charset", { charset, error });
}
response = { response = {
url: x.url, url: x.url,
body: await x.text(), body: text,
status: x.status, status: x.status,
headers: [...x.headers], headers: [...x.headers],
}; };