fix(scrapeURL/engines/fetch): discover charset and re-decode (#1221)

* fix(scrapeURL/engines/fetch): discover charset and re-decode

* fix(snips/scrape): allow more time for stealth proxy
This commit is contained in:
Gergő Móricz 2025-02-20 18:56:15 +01:00 committed by GitHub
parent e417f83c28
commit 283a3bfef3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 22 additions and 2 deletions

View File

@ -49,6 +49,14 @@ describe("Scrape tests", () => {
expect(response.markdown).toContain("Firecrawl");
}, 10000);
// Scrapes a fixture page for the "non-UTF-8 encodings" case and checks that a
// known run of hiragana appears intact in the resulting markdown.
// NOTE(review): the URL's file name suggests the page is Shift_JIS — confirm.
it.concurrent("handles non-UTF-8 encodings", async () => {
  const fixtureUrl = "https://www.rtpro.yamaha.co.jp/RT/docs/misc/kanji-sjis.html";
  const expectedKana = "ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た";

  const scraped = await scrape({ url: fixtureUrl });

  // The assertion reads only the markdown field of the scrape result.
  expect(scraped.markdown).toContain(expectedKana);
}, 15000);
if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
it.concurrent("self-hosted proxy works", async () => {
const response = await scrape({
@ -158,7 +166,7 @@ describe("Scrape tests", () => {
url: "http://firecrawl.dev",
proxy: "stealth",
});
}, 15000);
}, 30000);
});
describe("PDF (f-e dependant)", () => {

View File

@ -8,6 +8,7 @@ import {
makeSecureDispatcher,
} from "../utils/safeFetch";
import { MockState, saveMock } from "../../lib/mock";
import { TextDecoder } from "util";
export async function scrapeURLWithFetch(
meta: Meta,
@ -71,9 +72,20 @@ export async function scrapeURLWithFetch(
})(),
]);
// Read the raw bytes once so they can be decoded a second time if the document
// declares a charset other than UTF-8.
const buf = Buffer.from(await x.arrayBuffer());

// First pass: decode as UTF-8. Charset declarations are plain ASCII, so they
// remain searchable in this text even when the rest of the page is garbled.
let text = buf.toString("utf8");

// Look for a charset declared on a <meta> tag, covering both
//   <meta charset="shift_jis" />
//   <meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS">
// The match is case-insensitive, may span line breaks inside the tag, and the
// captured label stops at the first quote, whitespace, `;`, `/` or `>` so that
// self-closing tags and trailing attributes never leak into the label.
const charset = (text.match(/<meta\b[^>]*?charset\s*=\s*["']?\s*([^"'\s;\/>]+)/i) ?? [])[1];

try {
  if (charset) {
    // TextDecoder throws on labels it does not recognise; that case is handled
    // below and the UTF-8 text from the first pass is kept.
    text = new TextDecoder(charset).decode(buf);
  }
} catch (error) {
  // Best effort: a bad or unsupported label must not fail the scrape.
  meta.logger.warn("Failed to re-parse with correct charset", { charset, error });
}
response = {
url: x.url,
body: await x.text(),
body: text,
status: x.status,
headers: [...x.headers],
};