mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 20:39:00 +08:00
fix(scrapeURL/engines/fetch): discover charset and re-decode (#1221)
* fix(scrapeURL/engines/fetch): discover charset and re-decode
* fix(snips/scrape): allow more time for stealth proxy
This commit is contained in:
parent
e417f83c28
commit
283a3bfef3
@ -49,6 +49,14 @@ describe("Scrape tests", () => {
|
|||||||
expect(response.markdown).toContain("Firecrawl");
|
expect(response.markdown).toContain("Firecrawl");
|
||||||
}, 10000);
|
}, 10000);
|
||||||
|
|
||||||
|
it.concurrent("handles non-UTF-8 encodings", async () => {
|
||||||
|
const response = await scrape({
|
||||||
|
url: "https://www.rtpro.yamaha.co.jp/RT/docs/misc/kanji-sjis.html",
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(response.markdown).toContain("ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た");
|
||||||
|
}, 15000);
|
||||||
|
|
||||||
if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
|
if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
|
||||||
it.concurrent("self-hosted proxy works", async () => {
|
it.concurrent("self-hosted proxy works", async () => {
|
||||||
const response = await scrape({
|
const response = await scrape({
|
||||||
@ -158,7 +166,7 @@ describe("Scrape tests", () => {
|
|||||||
url: "http://firecrawl.dev",
|
url: "http://firecrawl.dev",
|
||||||
proxy: "stealth",
|
proxy: "stealth",
|
||||||
});
|
});
|
||||||
}, 15000);
|
}, 30000);
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("PDF (f-e dependant)", () => {
|
describe("PDF (f-e dependant)", () => {
|
||||||
|
@ -8,6 +8,7 @@ import {
|
|||||||
makeSecureDispatcher,
|
makeSecureDispatcher,
|
||||||
} from "../utils/safeFetch";
|
} from "../utils/safeFetch";
|
||||||
import { MockState, saveMock } from "../../lib/mock";
|
import { MockState, saveMock } from "../../lib/mock";
|
||||||
|
import { TextDecoder } from "util";
|
||||||
|
|
||||||
export async function scrapeURLWithFetch(
|
export async function scrapeURLWithFetch(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
@ -71,9 +72,20 @@ export async function scrapeURLWithFetch(
|
|||||||
})(),
|
})(),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
const buf = Buffer.from(await x.arrayBuffer());
|
||||||
|
let text = buf.toString("utf8");
|
||||||
|
const charset = (text.match(/charset=["']?(.+?)["']?>/) ?? [])[1]
|
||||||
|
try {
|
||||||
|
if (charset) {
|
||||||
|
text = new TextDecoder(charset.trim()).decode(buf);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
meta.logger.warn("Failed to re-parse with correct charset", { charset, error })
|
||||||
|
}
|
||||||
|
|
||||||
response = {
|
response = {
|
||||||
url: x.url,
|
url: x.url,
|
||||||
body: await x.text(),
|
body: text,
|
||||||
status: x.status,
|
status: x.status,
|
||||||
headers: [...x.headers],
|
headers: [...x.headers],
|
||||||
};
|
};
|
||||||
|
Loading…
x
Reference in New Issue
Block a user