mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-04-18 12:09:42 +08:00
feat(self-host): proxy support (FIR-1111) (#1212)
* feat(self-host): proxy support * fix(playwright-service-ts): return untreated text/plain
This commit is contained in:
parent
c75522f535
commit
c38dcd0432
10
.github/workflows/test-server-self-host.yml
vendored
10
.github/workflows/test-server-self-host.yml
vendored
@ -24,6 +24,7 @@ jobs:
|
||||
ai: ["openai", "no-ai"]
|
||||
search: ["searxng", "google"]
|
||||
engine: ["playwright", "fetch"]
|
||||
proxy: ["proxy", "no-proxy"]
|
||||
fail-fast: false
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
@ -35,6 +36,9 @@ jobs:
|
||||
OPENAI_API_KEY: ${{ matrix.ai == 'openai' && secrets.OPENAI_API_KEY || '' }}
|
||||
SEARXNG_ENDPOINT: ${{ matrix.search == 'searxng' && 'http://localhost:3434' || '' }}
|
||||
PLAYWRIGHT_MICROSERVICE_URL: ${{ matrix.engine == 'playwright' && 'http://localhost:3003/scrape' || '' }}
|
||||
PROXY_SERVER: ${{ matrix.proxy == 'proxy' && secrets.PROXY_SERVER || '' }}
|
||||
PROXY_USERNAME: ${{ matrix.proxy == 'proxy' && secrets.PROXY_USERNAME || '' }}
|
||||
PROXY_PASSWORD: ${{ matrix.proxy == 'proxy' && secrets.PROXY_PASSWORD || '' }}
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Install pnpm
|
||||
@ -115,20 +119,20 @@ jobs:
|
||||
- uses: actions/upload-artifact@v4
|
||||
if: always()
|
||||
with:
|
||||
name: Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.engine }})
|
||||
name: Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.engine }}, ${{ matrix.proxy }})
|
||||
path: |
|
||||
./apps/api/api.log
|
||||
./apps/api/worker.log
|
||||
- uses: actions/upload-artifact@v4
|
||||
if: always() && matrix.playwright
|
||||
with:
|
||||
name: Playwright Logs (${{ matrix.ai }}, ${{ matrix.search }})
|
||||
name: Playwright Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.proxy }})
|
||||
path: |
|
||||
./apps/playwright-service-ts/playwright.log
|
||||
- uses: actions/upload-artifact@v4
|
||||
if: always() && matrix.search == 'searxng'
|
||||
with:
|
||||
name: SearXNG (${{ matrix.ai }}, ${{ matrix.engine }})
|
||||
name: SearXNG (${{ matrix.ai }}, ${{ matrix.engine }}, ${{ matrix.proxy }})
|
||||
path: |
|
||||
./searxng/searxng.log
|
||||
./searxng/settings.yml
|
||||
|
@ -51,6 +51,13 @@ USE_DB_AUTHENTICATION=false
|
||||
# Provide your OpenAI API key here to enable AI features
|
||||
# OPENAI_API_KEY=
|
||||
|
||||
## === Proxy ===
|
||||
# PROXY_SERVER can be a full URL (e.g. http://0.1.2.3:1234) or just an IP:port pair (e.g. 0.1.2.3:1234), in which case http:// is assumed
|
||||
# Leave PROXY_USERNAME and PROXY_PASSWORD commented out if your proxy does not require authentication
|
||||
# PROXY_SERVER=
|
||||
# PROXY_USERNAME=
|
||||
# PROXY_PASSWORD=
|
||||
|
||||
## === /search API ===
|
||||
# By default, the /search API will use Google search.
|
||||
|
||||
|
@ -49,6 +49,16 @@ describe("Scrape tests", () => {
|
||||
expect(response.markdown).toContain("Firecrawl");
|
||||
}, 10000);
|
||||
|
||||
if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
|
||||
it.concurrent("self-hosted proxy works", async () => {
|
||||
const response = await scrape({
|
||||
url: "https://icanhazip.com"
|
||||
});
|
||||
|
||||
expect(response.markdown?.trim()).toBe(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
|
||||
});
|
||||
}
|
||||
|
||||
if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.PLAYWRIGHT_MICROSERVICE_URL) {
|
||||
it.concurrent("waitFor works", async () => {
|
||||
const response = await scrape({
|
||||
|
@ -43,14 +43,24 @@ export function makeSecureDispatcher(
|
||||
url: string,
|
||||
options?: undici.Agent.Options,
|
||||
) {
|
||||
const agent = new undici.Agent({
|
||||
const agentOpts: undici.Agent.Options = {
|
||||
connect: {
|
||||
rejectUnauthorized: false, // bypass SSL failures -- this is fine
|
||||
// lookup: secureLookup,
|
||||
},
|
||||
maxRedirections: 5000,
|
||||
...options,
|
||||
});
|
||||
};
|
||||
|
||||
const agent = process.env.PROXY_SERVER
|
||||
? new undici.ProxyAgent({
|
||||
uri: process.env.PROXY_SERVER.includes("://") ? process.env.PROXY_SERVER : ("http://" + process.env.PROXY_SERVER),
|
||||
token: process.env.PROXY_USERNAME
|
||||
? `Basic ${Buffer.from(process.env.PROXY_USERNAME + ":" + (process.env.PROXY_PASSWORD ?? "")).toString("base64")}`
|
||||
: undefined,
|
||||
...agentOpts,
|
||||
})
|
||||
: new undici.Agent(agentOpts);
|
||||
|
||||
agent.on("connect", (_, targets) => {
|
||||
const client: undici.Client = targets.slice(-1)[0] as undici.Client;
|
||||
|
@ -139,7 +139,7 @@ const scrapePage = async (page: Page, url: string, waitUntil: 'load' | 'networki
|
||||
if (response) {
|
||||
headers = await response.allHeaders();
|
||||
const ct = Object.entries(headers).find(x => x[0].toLowerCase() === "content-type");
|
||||
if (ct && ct[1].includes("application/json")) {
|
||||
if (ct && (ct[1].includes("application/json") || ct[1].includes("text/plain"))) {
|
||||
content = (await response.body()).toString("utf8"); // TODO: determine real encoding
|
||||
}
|
||||
}
|
||||
|
@ -15,11 +15,11 @@ services:
|
||||
playwright-service:
|
||||
build: apps/playwright-service-ts
|
||||
environment:
|
||||
- PORT=3000
|
||||
- PROXY_SERVER=${PROXY_SERVER}
|
||||
- PROXY_USERNAME=${PROXY_USERNAME}
|
||||
- PROXY_PASSWORD=${PROXY_PASSWORD}
|
||||
- BLOCK_MEDIA=${BLOCK_MEDIA}
|
||||
PORT: 3000
|
||||
PROXY_SERVER: ${PROXY_SERVER}
|
||||
PROXY_USERNAME: ${PROXY_USERNAME}
|
||||
PROXY_PASSWORD: ${PROXY_PASSWORD}
|
||||
BLOCK_MEDIA: ${BLOCK_MEDIA}
|
||||
networks:
|
||||
- backend
|
||||
|
||||
@ -51,6 +51,9 @@ services:
|
||||
SERPER_API_KEY: ${SERPER_API_KEY}
|
||||
SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY}
|
||||
LOGGING_LEVEL: ${LOGGING_LEVEL}
|
||||
PROXY_SERVER: ${PROXY_SERVER}
|
||||
PROXY_USERNAME: ${PROXY_USERNAME}
|
||||
PROXY_PASSWORD: ${PROXY_PASSWORD}
|
||||
FLY_PROCESS_GROUP: app
|
||||
depends_on:
|
||||
- redis
|
||||
@ -85,6 +88,9 @@ services:
|
||||
HOST: ${HOST:-0.0.0.0}
|
||||
SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL}
|
||||
LOGGING_LEVEL: ${LOGGING_LEVEL}
|
||||
PROXY_SERVER: ${PROXY_SERVER}
|
||||
PROXY_USERNAME: ${PROXY_USERNAME}
|
||||
PROXY_PASSWORD: ${PROXY_PASSWORD}
|
||||
FLY_PROCESS_GROUP: worker
|
||||
depends_on:
|
||||
- redis
|
||||
|
Loading…
x
Reference in New Issue
Block a user