mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 17:59:00 +08:00
fix(scrapeURL/engines/fetch): discover charset and re-decode (#1221)
* fix(scrapeURL/engines/fetch): discover charset and re-decode
* fix(snips/scrape): allow more time for stealth proxy
This commit is contained in:
parent
e417f83c28
commit
283a3bfef3
@ -49,6 +49,14 @@ describe("Scrape tests", () => {
|
||||
expect(response.markdown).toContain("Firecrawl");
|
||||
}, 10000);
|
||||
|
||||
it.concurrent("handles non-UTF-8 encodings", async () => {
|
||||
const response = await scrape({
|
||||
url: "https://www.rtpro.yamaha.co.jp/RT/docs/misc/kanji-sjis.html",
|
||||
});
|
||||
|
||||
expect(response.markdown).toContain("ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た");
|
||||
}, 15000);
|
||||
|
||||
if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
|
||||
it.concurrent("self-hosted proxy works", async () => {
|
||||
const response = await scrape({
|
||||
@ -158,7 +166,7 @@ describe("Scrape tests", () => {
|
||||
url: "http://firecrawl.dev",
|
||||
proxy: "stealth",
|
||||
});
|
||||
}, 15000);
|
||||
}, 30000);
|
||||
});
|
||||
|
||||
describe("PDF (f-e dependant)", () => {
|
||||
|
@ -8,6 +8,7 @@ import {
|
||||
makeSecureDispatcher,
|
||||
} from "../utils/safeFetch";
|
||||
import { MockState, saveMock } from "../../lib/mock";
|
||||
import { TextDecoder } from "util";
|
||||
|
||||
export async function scrapeURLWithFetch(
|
||||
meta: Meta,
|
||||
@ -71,9 +72,20 @@ export async function scrapeURLWithFetch(
|
||||
})(),
|
||||
]);
|
||||
|
||||
const buf = Buffer.from(await x.arrayBuffer());
|
||||
let text = buf.toString("utf8");
|
||||
const charset = (text.match(/charset=["']?(.+?)["']?>/) ?? [])[1]
|
||||
try {
|
||||
if (charset) {
|
||||
text = new TextDecoder(charset.trim()).decode(buf);
|
||||
}
|
||||
} catch (error) {
|
||||
meta.logger.warn("Failed to re-parse with correct charset", { charset, error })
|
||||
}
|
||||
|
||||
response = {
|
||||
url: x.url,
|
||||
body: await x.text(),
|
||||
body: text,
|
||||
status: x.status,
|
||||
headers: [...x.headers],
|
||||
};
|
||||
|
Loading…
x
Reference in New Issue
Block a user