From 283a3bfef3dc3bd5fcb63e90e5c17487472e3b08 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= <mo.geryy@gmail.com>
Date: Thu, 20 Feb 2025 18:56:15 +0100
Subject: [PATCH] fix(scrapeURL/engines/fetch): discover charset and re-decode
 (#1221)

* fix(scrapeURL/engines/fetch): discover charset and re-decode

* fix(snips/scrape): allow more time for stealth proxy
---
 apps/api/src/__tests__/snips/scrape.test.ts        | 10 +++++++++-
 .../src/scraper/scrapeURL/engines/fetch/index.ts   | 14 +++++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts
index 57fd92e0..5d9ca5c9 100644
--- a/apps/api/src/__tests__/snips/scrape.test.ts
+++ b/apps/api/src/__tests__/snips/scrape.test.ts
@@ -49,6 +49,14 @@ describe("Scrape tests", () => {
     expect(response.markdown).toContain("Firecrawl");
   }, 10000);
 
+  it.concurrent("handles non-UTF-8 encodings", async () => {
+    const response = await scrape({
+      url: "https://www.rtpro.yamaha.co.jp/RT/docs/misc/kanji-sjis.html", // presumably Shift_JIS, per the "sjis" file name — confirm
+    });
+    // NOTE(review): this hiragana run presumably only survives if the body was decoded with the page's own charset.
+    expect(response.markdown).toContain("ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た");
+  }, 15000);
+
   if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
     it.concurrent("self-hosted proxy works", async () => {
       const response = await scrape({
@@ -158,7 +166,7 @@ describe("Scrape tests", () => {
           url: "http://firecrawl.dev",
           proxy: "stealth",
         });
-      }, 15000);
+      }, 30000);
     });
     
     describe("PDF (f-e dependant)", () => {
diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
index ab100478..64269048 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
@@ -8,6 +8,7 @@ import {
   makeSecureDispatcher,
 } from "../utils/safeFetch";
 import { MockState, saveMock } from "../../lib/mock";
+import { TextDecoder } from "util";
 
 export async function scrapeURLWithFetch(
   meta: Meta,
@@ -71,9 +72,20 @@ export async function scrapeURLWithFetch(
         })(),
       ]);
 
+      // Decode as UTF-8 first so the markup can be searched for a declared charset.
+      const buf = Buffer.from(await x.arrayBuffer());
+      let text = buf.toString("utf8");
+      // Capture only the label: stop at quote, whitespace, ';', '/' or '>' so `charset="x" />` works.
+      const charset = text.match(/charset\s*=\s*["']?([^"'\s;\/>]+)/i)?.[1];
+      try {
+        if (charset) text = new TextDecoder(charset).decode(buf);
+      } catch (error) {
+        meta.logger.warn("Failed to re-parse with correct charset", { charset, error });
+      }
+
       response = {
         url: x.url,
-        body: await x.text(),
+        body: text,
         status: x.status,
         headers: [...x.headers],
       };