From 283a3bfef3dc3bd5fcb63e90e5c17487472e3b08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 20 Feb 2025 18:56:15 +0100 Subject: [PATCH] fix(scrapeURL/engines/fetch): discover charset and re-decode (#1221) * fix(scrapeURL/engines/fetch): discover charset and re-decode * fix(snips/scrape): allow more time for stealth proxy --- apps/api/src/__tests__/snips/scrape.test.ts | 10 +++++++++- .../src/scraper/scrapeURL/engines/fetch/index.ts | 14 +++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 57fd92e0..5d9ca5c9 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -49,6 +49,14 @@ describe("Scrape tests", () => { expect(response.markdown).toContain("Firecrawl"); }, 10000); + it.concurrent("handles non-UTF-8 encodings", async () => { + const response = await scrape({ + url: "https://www.rtpro.yamaha.co.jp/RT/docs/misc/kanji-sjis.html", + }); + + expect(response.markdown).toContain("ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た"); + }, 15000); + if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) { it.concurrent("self-hosted proxy works", async () => { const response = await scrape({ @@ -158,7 +166,7 @@ describe("Scrape tests", () => { url: "http://firecrawl.dev", proxy: "stealth", }); - }, 15000); + }, 30000); }); describe("PDF (f-e dependant)", () => { diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts index ab100478..64269048 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts @@ -8,6 +8,7 @@ import { makeSecureDispatcher, } from "../utils/safeFetch"; import { MockState, saveMock } from "../../lib/mock"; +import { TextDecoder } from "util"; export async function scrapeURLWithFetch( meta: Meta, @@ -71,9 +72,20 @@ export async function scrapeURLWithFetch( })(), ]); + const buf = Buffer.from(await x.arrayBuffer()); + let text = buf.toString("utf8"); + const charset = (text.match(/charset=["']?(.+?)["']?>/) ?? [])[1] + try { + if (charset) { + text = new TextDecoder(charset.trim()).decode(buf); + } + } catch (error) { + meta.logger.warn("Failed to re-parse with correct charset", { charset, error }) + } + response = { url: x.url, - body: await x.text(), + body: text, status: x.status, headers: [...x.headers], };