mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-07-31 21:21:59 +08:00
feat(self-host): proxy support (FIR-1111) (#1212)
* feat(self-host): proxy support * fix(playwright-service-ts): return untreated text/plain
This commit is contained in:
parent
c75522f535
commit
c38dcd0432
10
.github/workflows/test-server-self-host.yml
vendored
10
.github/workflows/test-server-self-host.yml
vendored
@@ -24,6 +24,7 @@ jobs:
|
|||||||
ai: ["openai", "no-ai"]
|
ai: ["openai", "no-ai"]
|
||||||
search: ["searxng", "google"]
|
search: ["searxng", "google"]
|
||||||
engine: ["playwright", "fetch"]
|
engine: ["playwright", "fetch"]
|
||||||
|
proxy: ["proxy", "no-proxy"]
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
services:
|
services:
|
||||||
@@ -35,6 +36,9 @@ jobs:
|
|||||||
OPENAI_API_KEY: ${{ matrix.ai == 'openai' && secrets.OPENAI_API_KEY || '' }}
|
OPENAI_API_KEY: ${{ matrix.ai == 'openai' && secrets.OPENAI_API_KEY || '' }}
|
||||||
SEARXNG_ENDPOINT: ${{ matrix.search == 'searxng' && 'http://localhost:3434' || '' }}
|
SEARXNG_ENDPOINT: ${{ matrix.search == 'searxng' && 'http://localhost:3434' || '' }}
|
||||||
PLAYWRIGHT_MICROSERVICE_URL: ${{ matrix.engine == 'playwright' && 'http://localhost:3003/scrape' || '' }}
|
PLAYWRIGHT_MICROSERVICE_URL: ${{ matrix.engine == 'playwright' && 'http://localhost:3003/scrape' || '' }}
|
||||||
|
PROXY_SERVER: ${{ matrix.proxy == 'proxy' && secrets.PROXY_SERVER || '' }}
|
||||||
|
PROXY_USERNAME: ${{ matrix.proxy == 'proxy' && secrets.PROXY_USERNAME || '' }}
|
||||||
|
PROXY_PASSWORD: ${{ matrix.proxy == 'proxy' && secrets.PROXY_PASSWORD || '' }}
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
- name: Install pnpm
|
- name: Install pnpm
|
||||||
@@ -115,20 +119,20 @@ jobs:
|
|||||||
- uses: actions/upload-artifact@v4
|
- uses: actions/upload-artifact@v4
|
||||||
if: always()
|
if: always()
|
||||||
with:
|
with:
|
||||||
name: Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.engine }})
|
name: Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.engine }}, ${{ matrix.proxy }})
|
||||||
path: |
|
path: |
|
||||||
./apps/api/api.log
|
./apps/api/api.log
|
||||||
./apps/api/worker.log
|
./apps/api/worker.log
|
||||||
- uses: actions/upload-artifact@v4
|
- uses: actions/upload-artifact@v4
|
||||||
if: always() && matrix.playwright
|
# Upload Playwright logs only for jobs that ran the Playwright engine; the matrix key is "engine" (there is no "playwright" key, so the old condition was always false).
if: always() && matrix.engine == 'playwright'
|
||||||
with:
|
with:
|
||||||
name: Playwright Logs (${{ matrix.ai }}, ${{ matrix.search }})
|
name: Playwright Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.proxy }})
|
||||||
path: |
|
path: |
|
||||||
./apps/playwright-service-ts/playwright.log
|
./apps/playwright-service-ts/playwright.log
|
||||||
- uses: actions/upload-artifact@v4
|
- uses: actions/upload-artifact@v4
|
||||||
if: always() && matrix.search == 'searxng'
|
if: always() && matrix.search == 'searxng'
|
||||||
with:
|
with:
|
||||||
name: SearXNG (${{ matrix.ai }}, ${{ matrix.engine }})
|
name: SearXNG (${{ matrix.ai }}, ${{ matrix.engine }}, ${{ matrix.proxy }})
|
||||||
path: |
|
path: |
|
||||||
./searxng/searxng.log
|
./searxng/searxng.log
|
||||||
./searxng/settings.yml
|
./searxng/settings.yml
|
||||||
|
@@ -51,6 +51,13 @@ USE_DB_AUTHENTICATION=false
|
|||||||
# Provide your OpenAI API key here to enable AI features
|
# Provide your OpenAI API key here to enable AI features
|
||||||
# OPENAI_API_KEY=
|
# OPENAI_API_KEY=
|
||||||
|
|
||||||
|
## === Proxy ===
|
||||||
|
# PROXY_SERVER can be a full URL (e.g. http://0.1.2.3:1234) or just an IP and port combo (e.g. 0.1.2.3:1234)
|
||||||
|
# Do not uncomment PROXY_USERNAME and PROXY_PASSWORD if your proxy is unauthenticated
|
||||||
|
# PROXY_SERVER=
|
||||||
|
# PROXY_USERNAME=
|
||||||
|
# PROXY_PASSWORD=
|
||||||
|
|
||||||
## === /search API ===
|
## === /search API ===
|
||||||
# By default, the /search API will use Google search.
|
# By default, the /search API will use Google search.
|
||||||
|
|
||||||
|
@@ -49,6 +49,16 @@ describe("Scrape tests", () => {
|
|||||||
expect(response.markdown).toContain("Firecrawl");
|
expect(response.markdown).toContain("Firecrawl");
|
||||||
}, 10000);
|
}, 10000);
|
||||||
|
|
||||||
|
if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
|
||||||
|
it.concurrent("self-hosted proxy works", async () => {
|
||||||
|
const response = await scrape({
|
||||||
|
url: "https://icanhazip.com"
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(response.markdown?.trim()).toBe(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.PLAYWRIGHT_MICROSERVICE_URL) {
|
if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.PLAYWRIGHT_MICROSERVICE_URL) {
|
||||||
it.concurrent("waitFor works", async () => {
|
it.concurrent("waitFor works", async () => {
|
||||||
const response = await scrape({
|
const response = await scrape({
|
||||||
|
@@ -43,14 +43,24 @@ export function makeSecureDispatcher(
|
|||||||
url: string,
|
url: string,
|
||||||
options?: undici.Agent.Options,
|
options?: undici.Agent.Options,
|
||||||
) {
|
) {
|
||||||
const agent = new undici.Agent({
|
const agentOpts: undici.Agent.Options = {
|
||||||
connect: {
|
connect: {
|
||||||
rejectUnauthorized: false, // bypass SSL failures -- this is fine
|
rejectUnauthorized: false, // bypass SSL failures -- this is fine
|
||||||
// lookup: secureLookup,
|
// lookup: secureLookup,
|
||||||
},
|
},
|
||||||
maxRedirections: 5000,
|
maxRedirections: 5000,
|
||||||
...options,
|
...options,
|
||||||
});
|
};
|
||||||
|
|
||||||
|
const agent = process.env.PROXY_SERVER
|
||||||
|
? new undici.ProxyAgent({
|
||||||
|
uri: process.env.PROXY_SERVER.includes("://") ? process.env.PROXY_SERVER : ("http://" + process.env.PROXY_SERVER),
|
||||||
|
token: process.env.PROXY_USERNAME
|
||||||
|
? `Basic ${Buffer.from(process.env.PROXY_USERNAME + ":" + (process.env.PROXY_PASSWORD ?? "")).toString("base64")}`
|
||||||
|
: undefined,
|
||||||
|
...agentOpts,
|
||||||
|
})
|
||||||
|
: new undici.Agent(agentOpts);
|
||||||
|
|
||||||
agent.on("connect", (_, targets) => {
|
agent.on("connect", (_, targets) => {
|
||||||
const client: undici.Client = targets.slice(-1)[0] as undici.Client;
|
const client: undici.Client = targets.slice(-1)[0] as undici.Client;
|
||||||
|
@@ -139,7 +139,7 @@ const scrapePage = async (page: Page, url: string, waitUntil: 'load' | 'networki
|
|||||||
if (response) {
|
if (response) {
|
||||||
headers = await response.allHeaders();
|
headers = await response.allHeaders();
|
||||||
const ct = Object.entries(headers).find(x => x[0].toLowerCase() === "content-type");
|
const ct = Object.entries(headers).find(x => x[0].toLowerCase() === "content-type");
|
||||||
if (ct && ct[1].includes("application/json")) {
|
if (ct && (ct[1].includes("application/json") || ct[1].includes("text/plain"))) {
|
||||||
content = (await response.body()).toString("utf8"); // TODO: determine real encoding
|
content = (await response.body()).toString("utf8"); // TODO: determine real encoding
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -15,11 +15,11 @@ services:
|
|||||||
playwright-service:
|
playwright-service:
|
||||||
build: apps/playwright-service-ts
|
build: apps/playwright-service-ts
|
||||||
environment:
|
environment:
|
||||||
- PORT=3000
|
PORT: 3000
|
||||||
- PROXY_SERVER=${PROXY_SERVER}
|
PROXY_SERVER: ${PROXY_SERVER}
|
||||||
- PROXY_USERNAME=${PROXY_USERNAME}
|
PROXY_USERNAME: ${PROXY_USERNAME}
|
||||||
- PROXY_PASSWORD=${PROXY_PASSWORD}
|
PROXY_PASSWORD: ${PROXY_PASSWORD}
|
||||||
- BLOCK_MEDIA=${BLOCK_MEDIA}
|
BLOCK_MEDIA: ${BLOCK_MEDIA}
|
||||||
networks:
|
networks:
|
||||||
- backend
|
- backend
|
||||||
|
|
||||||
@@ -51,6 +51,9 @@ services:
|
|||||||
SERPER_API_KEY: ${SERPER_API_KEY}
|
SERPER_API_KEY: ${SERPER_API_KEY}
|
||||||
SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY}
|
SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY}
|
||||||
LOGGING_LEVEL: ${LOGGING_LEVEL}
|
LOGGING_LEVEL: ${LOGGING_LEVEL}
|
||||||
|
PROXY_SERVER: ${PROXY_SERVER}
|
||||||
|
PROXY_USERNAME: ${PROXY_USERNAME}
|
||||||
|
PROXY_PASSWORD: ${PROXY_PASSWORD}
|
||||||
FLY_PROCESS_GROUP: app
|
FLY_PROCESS_GROUP: app
|
||||||
depends_on:
|
depends_on:
|
||||||
- redis
|
- redis
|
||||||
@@ -85,6 +88,9 @@ services:
|
|||||||
HOST: ${HOST:-0.0.0.0}
|
HOST: ${HOST:-0.0.0.0}
|
||||||
SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL}
|
SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL}
|
||||||
LOGGING_LEVEL: ${LOGGING_LEVEL}
|
LOGGING_LEVEL: ${LOGGING_LEVEL}
|
||||||
|
PROXY_SERVER: ${PROXY_SERVER}
|
||||||
|
PROXY_USERNAME: ${PROXY_USERNAME}
|
||||||
|
PROXY_PASSWORD: ${PROXY_PASSWORD}
|
||||||
FLY_PROCESS_GROUP: worker
|
FLY_PROCESS_GROUP: worker
|
||||||
depends_on:
|
depends_on:
|
||||||
- redis
|
- redis
|
||||||
|
Loading…
x
Reference in New Issue
Block a user