NOTE(review): this patch was re-flowed from a whitespace-collapsed paste.
Indentation of context lines is reconstructed (2-space style) and should be
confirmed against the tree; the post-image blob hash recorded for
fetch/index.ts predates the charset-detection fix below. Regenerate with
`git diff` before applying strictly.

Charset-detection fix in fetch/index.ts: the previous pattern
/charset=["']?(.+?)["']?>/ captured everything up to the next ">" on the
line, so `<meta charset="shift_jis" />` produced the label `shift_jis" /`,
which TextDecoder rejects; it was also case-sensitive and ignored the HTTP
Content-Type header.

diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts
index 57fd92e0..5d9ca5c9 100644
--- a/apps/api/src/__tests__/snips/scrape.test.ts
+++ b/apps/api/src/__tests__/snips/scrape.test.ts
@@ -49,6 +49,14 @@ describe("Scrape tests", () => {
     expect(response.markdown).toContain("Firecrawl");
   }, 10000);
 
+  it.concurrent("handles non-UTF-8 encodings", async () => {
+    const response = await scrape({
+      url: "https://www.rtpro.yamaha.co.jp/RT/docs/misc/kanji-sjis.html",
+    });
+
+    expect(response.markdown).toContain("ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た");
+  }, 15000);
+
   if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
     it.concurrent("self-hosted proxy works", async () => {
       const response = await scrape({
@@ -158,7 +166,7 @@ describe("Scrape tests", () => {
           url: "http://firecrawl.dev",
           proxy: "stealth",
         });
-      }, 15000);
+      }, 30000);
     });
 
     describe("PDF (f-e dependant)", () => {
diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
index ab100478..64269048 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
@@ -8,6 +8,7 @@ import {
   makeSecureDispatcher,
 } from "../utils/safeFetch";
 import { MockState, saveMock } from "../../lib/mock";
+import { TextDecoder } from "util";
 
 export async function scrapeURLWithFetch(
   meta: Meta,
@@ -71,9 +72,35 @@ export async function scrapeURLWithFetch(
         })(),
       ]);
 
+      const buf = Buffer.from(await x.arrayBuffer());
+      // Decode as UTF-8 first so a <meta> charset declaration can be sniffed.
+      let text = buf.toString("utf8");
+      // The Content-Type header takes precedence over an in-document <meta>.
+      // Both patterns capture only the charset label itself, so forms such as
+      // `<meta charset="shift_jis" />` no longer yield a label with a trailing
+      // quote or slash that TextDecoder rejects.
+      const headerCharset = (x.headers.get("content-type") ?? "").match(
+        /charset\s*=\s*["']?([^"'\s;]+)/i,
+      )?.[1];
+      const metaCharset = text.match(
+        /<meta[^>]*?charset\s*=\s*["']?([^"'\s;\/>]+)/i,
+      )?.[1];
+      const charset = headerCharset ?? metaCharset;
+      if (charset) {
+        try {
+          text = new TextDecoder(charset).decode(buf);
+        } catch (error) {
+          // Unknown or unsupported label: keep the UTF-8 decoding.
+          meta.logger.warn("Failed to re-parse with correct charset", {
+            charset,
+            error,
+          });
+        }
+      }
+
       response = {
         url: x.url,
-        body: await x.text(),
+        body: text,
         status: x.status,
         headers: [...x.headers],
       };