From c38dcd043212786896874947726cfdbc9756dfc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 20 Feb 2025 14:20:03 +0100 Subject: [PATCH] feat(self-host): proxy support (FIR-1111) (#1212) * feat(self-host): proxy support * fix(playwright-service-ts): return untreated text/plain --- .github/workflows/test-server-self-host.yml | 10 +++++++--- SELF_HOST.md | 7 +++++++ apps/api/src/__tests__/snips/scrape.test.ts | 10 ++++++++++ .../scraper/scrapeURL/engines/utils/safeFetch.ts | 14 ++++++++++++-- apps/playwright-service-ts/api.ts | 2 +- docker-compose.yaml | 16 +++++++++++----- 6 files changed, 48 insertions(+), 11 deletions(-) diff --git a/.github/workflows/test-server-self-host.yml b/.github/workflows/test-server-self-host.yml index de9e7d92..301353df 100644 --- a/.github/workflows/test-server-self-host.yml +++ b/.github/workflows/test-server-self-host.yml @@ -24,6 +24,7 @@ jobs: ai: ["openai", "no-ai"] search: ["searxng", "google"] engine: ["playwright", "fetch"] + proxy: ["proxy", "no-proxy"] fail-fast: false runs-on: ubuntu-latest services: @@ -35,6 +36,9 @@ jobs: OPENAI_API_KEY: ${{ matrix.ai == 'openai' && secrets.OPENAI_API_KEY || '' }} SEARXNG_ENDPOINT: ${{ matrix.search == 'searxng' && 'http://localhost:3434' || '' }} PLAYWRIGHT_MICROSERVICE_URL: ${{ matrix.engine == 'playwright' && 'http://localhost:3003/scrape' || '' }} + PROXY_SERVER: ${{ matrix.proxy == 'proxy' && secrets.PROXY_SERVER || '' }} + PROXY_USERNAME: ${{ matrix.proxy == 'proxy' && secrets.PROXY_USERNAME || '' }} + PROXY_PASSWORD: ${{ matrix.proxy == 'proxy' && secrets.PROXY_PASSWORD || '' }} steps: - uses: actions/checkout@v3 - name: Install pnpm @@ -115,20 +119,20 @@ jobs: - uses: actions/upload-artifact@v4 if: always() with: - name: Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.engine }}) + name: Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.engine }}, ${{ matrix.proxy }}) path: | ./apps/api/api.log ./apps/api/worker.log - uses: actions/upload-artifact@v4 if: always() && matrix.playwright with: - name: Playwright Logs (${{ matrix.ai }}, ${{ matrix.search }}) + name: Playwright Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.proxy }}) path: | ./apps/playwright-service-ts/playwright.log - uses: actions/upload-artifact@v4 if: always() && matrix.search == 'searxng' with: - name: SearXNG (${{ matrix.ai }}, ${{ matrix.engine }}) + name: SearXNG (${{ matrix.ai }}, ${{ matrix.engine }}, ${{ matrix.proxy }}) path: | ./searxng/searxng.log ./searxng/settings.yml diff --git a/SELF_HOST.md b/SELF_HOST.md index 136e0d00..a5620176 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -51,6 +51,13 @@ USE_DB_AUTHENTICATION=false # Provide your OpenAI API key here to enable AI features # OPENAI_API_KEY= +## === Proxy === +# PROXY_SERVER can be a full URL (e.g. http://0.1.2.3:1234) or just an IP and port combo (e.g. 0.1.2.3:1234) +# Do not uncomment PROXY_USERNAME and PROXY_PASSWORD if your proxy is unauthenticated +# PROXY_SERVER= +# PROXY_USERNAME= +# PROXY_PASSWORD= + ## === /search API === # By default, the /search API will use Google search. diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 685118a1..57fd92e0 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -49,6 +49,16 @@ describe("Scrape tests", () => { expect(response.markdown).toContain("Firecrawl"); }, 10000); + if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) { + it.concurrent("self-hosted proxy works", async () => { + const response = await scrape({ + url: "https://icanhazip.com" + }); + + expect(response.markdown?.trim()).toBe(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]); + }); + } + if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.PLAYWRIGHT_MICROSERVICE_URL) { it.concurrent("waitFor works", async () => { const response = await scrape({ diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts b/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts index 6cae9bd8..351a7742 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts @@ -43,14 +43,24 @@ export function makeSecureDispatcher( url: string, options?: undici.Agent.Options, ) { - const agent = new undici.Agent({ + const agentOpts: undici.Agent.Options = { connect: { rejectUnauthorized: false, // bypass SSL failures -- this is fine // lookup: secureLookup, }, maxRedirections: 5000, ...options, - }); + }; + + const agent = process.env.PROXY_SERVER + ? new undici.ProxyAgent({ + uri: process.env.PROXY_SERVER.includes("://") ? process.env.PROXY_SERVER : ("http://" + process.env.PROXY_SERVER), + token: process.env.PROXY_USERNAME + ? `Basic ${Buffer.from(process.env.PROXY_USERNAME + ":" + (process.env.PROXY_PASSWORD ?? "")).toString("base64")}` + : undefined, + ...agentOpts, + }) + : new undici.Agent(agentOpts); agent.on("connect", (_, targets) => { const client: undici.Client = targets.slice(-1)[0] as undici.Client; diff --git a/apps/playwright-service-ts/api.ts b/apps/playwright-service-ts/api.ts index 894b09d9..f6dd25a1 100644 --- a/apps/playwright-service-ts/api.ts +++ b/apps/playwright-service-ts/api.ts @@ -139,7 +139,7 @@ const scrapePage = async (page: Page, url: string, waitUntil: 'load' | 'networki if (response) { headers = await response.allHeaders(); const ct = Object.entries(headers).find(x => x[0].toLowerCase() === "content-type"); - if (ct && ct[1].includes("application/json")) { + if (ct && (ct[1].includes("application/json") || ct[1].includes("text/plain"))) { content = (await response.body()).toString("utf8"); // TODO: determine real encoding } } diff --git a/docker-compose.yaml b/docker-compose.yaml index 920b56e3..0fc22fca 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -15,11 +15,11 @@ services: playwright-service: build: apps/playwright-service-ts environment: - - PORT=3000 - - PROXY_SERVER=${PROXY_SERVER} - - PROXY_USERNAME=${PROXY_USERNAME} - - PROXY_PASSWORD=${PROXY_PASSWORD} - - BLOCK_MEDIA=${BLOCK_MEDIA} + PORT: 3000 + PROXY_SERVER: ${PROXY_SERVER} + PROXY_USERNAME: ${PROXY_USERNAME} + PROXY_PASSWORD: ${PROXY_PASSWORD} + BLOCK_MEDIA: ${BLOCK_MEDIA} networks: - backend @@ -51,6 +51,9 @@ services: SERPER_API_KEY: ${SERPER_API_KEY} SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY} LOGGING_LEVEL: ${LOGGING_LEVEL} + PROXY_SERVER: ${PROXY_SERVER} + PROXY_USERNAME: ${PROXY_USERNAME} + PROXY_PASSWORD: ${PROXY_PASSWORD} FLY_PROCESS_GROUP: app depends_on: - redis @@ -85,6 +88,9 @@ services: HOST: ${HOST:-0.0.0.0} SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL} LOGGING_LEVEL: ${LOGGING_LEVEL} + PROXY_SERVER: ${PROXY_SERVER} + PROXY_USERNAME: ${PROXY_USERNAME} + PROXY_PASSWORD: ${PROXY_PASSWORD} FLY_PROCESS_GROUP: worker depends_on: - redis