fix(scrapeURL/engines/fetch): discover charset and re-decode (#1221)

* fix(scrapeURL/engines/fetch): discover charset and re-decode * fix(snips/scrape): allow more time for stealth proxy
2025-08-12 20:39:00 +08:00 · 2025-02-20 18:56:15 +01:00 · 2025-02-20 18:56:15 +01:00 · 283a3bfef3
commit 283a3bfef3
parent e417f83c28
2 changed files with 22 additions and 2 deletions
--- a/apps/api/src/tests/snips/scrape.test.ts
+++ b/apps/api/src/tests/snips/scrape.test.ts
@ -49,6 +49,14 @@ describe("Scrape tests", () => {
    expect(response.markdown).toContain("Firecrawl");
  }, 10000);
  // Scrapes a page whose URL suggests a Shift_JIS kanji table and checks that
  // the hiragana run survives into the markdown, i.e. the body was not left
  // mis-decoded as UTF-8.
  it.concurrent("handles non-UTF-8 encodings", async () => {
    const url = "https://www.rtpro.yamaha.co.jp/RT/docs/misc/kanji-sjis.html";
    const { markdown } = await scrape({ url });

    expect(markdown).toContain("ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た");
  }, 15000);
  if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
    it.concurrent("self-hosted proxy works", async () => {
      const response = await scrape({
@ -158,7 +166,7 @@ describe("Scrape tests", () => {
          url: "http://firecrawl.dev",
          proxy: "stealth",
        });
-      }, 15000);
+      }, 30000);
    });
    describe("PDF (f-e dependant)", () => {
--- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
@ -8,6 +8,7 @@ import {
  makeSecureDispatcher,
 } from "../utils/safeFetch";
 import { MockState, saveMock } from "../../lib/mock";
 import { TextDecoder } from "util";
 export async function scrapeURLWithFetch(
  meta: Meta,
@ -71,9 +72,20 @@ export async function scrapeURLWithFetch(
        })(),
      ]);
      // Keep the raw bytes so the body can be decoded a second time once the
      // document's declared encoding is known.
      const buf = Buffer.from(await x.arrayBuffer());

      // First pass: decode as UTF-8. The <meta> charset declaration is plain
      // ASCII, so it stays readable even when the rest of the text is garbled.
      let text = buf.toString("utf8");

      // Capture only the encoding label of a <meta ... charset=...> declaration.
      // The label ends at the first quote, whitespace, ';', '/' or '>', so
      // self-closing tags (`<meta charset="x" />`) and tags with attributes
      // after the charset no longer leak trailing markup into the label.
      // Matching is case-insensitive (`<META CHARSET=...>`).
      const charset = text.match(
        /<meta\b[^>]*?charset\s*=\s*["']?\s*([^\s"'\/>;]+)/i,
      )?.[1];

      if (charset) {
        try {
          // TextDecoder throws a RangeError for labels it does not know.
          text = new TextDecoder(charset).decode(buf);
        } catch (error) {
          // Best effort: keep the UTF-8 decoding and record why re-decoding failed.
          meta.logger.warn("Failed to re-parse with correct charset", { charset, error });
        }
      }
      response = {
        url: x.url,
-        body: await x.text(),
+        body: text,
        status: x.status,
        headers: [...x.headers],
      };