From e1455cecd8a863a97ab04a37b77cb2d5f94f8dd8 Mon Sep 17 00:00:00 2001
From: crazywoola <100913391+crazywoola@users.noreply.github.com>
Date: Wed, 16 Apr 2025 15:50:15 +0800
Subject: [PATCH] feat: add switches for jina firecrawl watercrawl (#18153)
---
docker/.env.example | 6 ++++++
docker/docker-compose-template.yaml | 4 +++-
docker/docker-compose.yaml | 7 ++++++-
web/.env.example | 5 +++++
.../datasets/create/step-one/index.tsx | 14 ++++++-------
.../datasets/create/website/index.tsx | 13 ++++++------
.../datasets/create/website/no-data.tsx | 20 ++++++++++---------
.../data-source-page/index.tsx | 7 ++++---
web/config/index.ts | 12 +++++++++++
web/docker/entrypoint.sh | 4 +++-
10 files changed, 64 insertions(+), 28 deletions(-)
diff --git a/docker/.env.example b/docker/.env.example
index acb09c0d4f..e49e8fee89 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -174,6 +174,12 @@ CELERY_MIN_WORKERS=
API_TOOL_DEFAULT_CONNECT_TIMEOUT=10
API_TOOL_DEFAULT_READ_TIMEOUT=60
+# ------------------------------
+# Datasource Configuration
+# ------------------------------
+ENABLE_WEBSITE_JINAREADER=true
+ENABLE_WEBSITE_FIRECRAWL=true
+ENABLE_WEBSITE_WATERCRAWL=true
# ------------------------------
# Database Configuration
diff --git a/docker/docker-compose-template.yaml b/docker/docker-compose-template.yaml
index 86976063c3..a8f7b755fb 100644
--- a/docker/docker-compose-template.yaml
+++ b/docker/docker-compose-template.yaml
@@ -75,7 +75,9 @@ services:
MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10}
MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10}
MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5}
-
+ ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true}
+ ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true}
+ ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true}
# The postgres database.
db:
image: postgres:15-alpine
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
index e9c8c8715a..25b0c56561 100644
--- a/docker/docker-compose.yaml
+++ b/docker/docker-compose.yaml
@@ -43,6 +43,9 @@ x-shared-env: &shared-api-worker-env
CELERY_MIN_WORKERS: ${CELERY_MIN_WORKERS:-}
API_TOOL_DEFAULT_CONNECT_TIMEOUT: ${API_TOOL_DEFAULT_CONNECT_TIMEOUT:-10}
API_TOOL_DEFAULT_READ_TIMEOUT: ${API_TOOL_DEFAULT_READ_TIMEOUT:-60}
+ ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true}
+ ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true}
+ ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true}
DB_USERNAME: ${DB_USERNAME:-postgres}
DB_PASSWORD: ${DB_PASSWORD:-difyai123456}
DB_HOST: ${DB_HOST:-db}
@@ -543,7 +546,9 @@ services:
MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10}
MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10}
MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5}
-
+ ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true}
+ ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true}
+ ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true}
# The postgres database.
db:
image: postgres:15-alpine
diff --git a/web/.env.example b/web/.env.example
index 51dc3d6b3c..1c3f42ddfc 100644
--- a/web/.env.example
+++ b/web/.env.example
@@ -49,3 +49,8 @@ NEXT_PUBLIC_MAX_PARALLEL_LIMIT=10
# The maximum number of iterations for agent setting
NEXT_PUBLIC_MAX_ITERATIONS_NUM=5
+
+# Enable or disable the website data source providers (Jina Reader, Firecrawl, WaterCrawl)
+NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER=true
+NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL=true
+NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL=true
diff --git a/web/app/components/datasets/create/step-one/index.tsx b/web/app/components/datasets/create/step-one/index.tsx
index 6f4231bb1f..38c885ebe2 100644
--- a/web/app/components/datasets/create/step-one/index.tsx
+++ b/web/app/components/datasets/create/step-one/index.tsx
@@ -20,7 +20,7 @@ import { useProviderContext } from '@/context/provider-context'
import VectorSpaceFull from '@/app/components/billing/vector-space-full'
import classNames from '@/utils/classnames'
import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
-
+import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'
type IStepOneProps = {
datasetId?: string
dataSourceType?: DataSourceType
@@ -126,9 +126,7 @@ const StepOne = ({
return true
if (files.some(file => !file.file.id))
return true
- if (isShowVectorSpaceFull)
- return true
- return false
+ return isShowVectorSpaceFull
}, [files, isShowVectorSpaceFull])
return (
@@ -193,7 +191,8 @@ const StepOne = ({
{t('datasetCreation.stepOne.dataSourceType.notion')}
-
changeType(DataSourceType.WEB)}
- >
+ >
{t('datasetCreation.stepOne.dataSourceType.web')}
-
+
+ )}
)
}
diff --git a/web/app/components/datasets/create/website/index.tsx b/web/app/components/datasets/create/website/index.tsx
index 5122ef6ed2..e2d0e2df99 100644
--- a/web/app/components/datasets/create/website/index.tsx
+++ b/web/app/components/datasets/create/website/index.tsx
@@ -12,6 +12,7 @@ import { useModalContext } from '@/context/modal-context'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import { fetchDataSources } from '@/service/datasets'
import { type DataSourceItem, DataSourceProvider } from '@/models/common'
+import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'
type Props = {
onPreview: (payload: CrawlResultItem) => void
@@ -84,7 +85,7 @@ const Website: FC = ({
{t('datasetCreation.stepOne.website.chooseProvider')}
-
-
{source && selectedProvider === DataSourceProvider.fireCrawl && (
diff --git a/web/app/components/datasets/create/website/no-data.tsx b/web/app/components/datasets/create/website/no-data.tsx
index 14be2e29f6..65a314f516 100644
--- a/web/app/components/datasets/create/website/no-data.tsx
+++ b/web/app/components/datasets/create/website/no-data.tsx
@@ -6,6 +6,7 @@ import s from './index.module.css'
import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
import Button from '@/app/components/base/button'
import { DataSourceProvider } from '@/models/common'
+import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
@@ -16,29 +17,30 @@ type Props = {
const NoData: FC = ({
onConfig,
- provider,
}) => {
const { t } = useTranslation()
const providerConfig = {
- [DataSourceProvider.jinaReader]: {
+ [DataSourceProvider.jinaReader]: ENABLE_WEBSITE_JINAREADER ? {
emoji: ,
title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`),
description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`),
- },
- [DataSourceProvider.fireCrawl]: {
+ } : null,
+ [DataSourceProvider.fireCrawl]: ENABLE_WEBSITE_FIRECRAWL ? {
emoji: '🔥',
title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
- },
- [DataSourceProvider.waterCrawl]: {
- emoji: ,
+ } : null,
+ [DataSourceProvider.waterCrawl]: ENABLE_WEBSITE_WATERCRAWL ? {
+ emoji: '💧',
title: t(`${I18N_PREFIX}.waterCrawlNotConfigured`),
description: t(`${I18N_PREFIX}.waterCrawlNotConfiguredDescription`),
- },
+ } : null,
}
- const currentProvider = providerConfig[provider]
+ const currentProvider = Object.values(providerConfig).find(provider => provider !== null) || providerConfig[DataSourceProvider.jinaReader]
+
+ if (!currentProvider) return null
return (
<>
diff --git a/web/app/components/header/account-setting/data-source-page/index.tsx b/web/app/components/header/account-setting/data-source-page/index.tsx
index d99bd25e02..fb13813d70 100644
--- a/web/app/components/header/account-setting/data-source-page/index.tsx
+++ b/web/app/components/header/account-setting/data-source-page/index.tsx
@@ -3,6 +3,7 @@ import DataSourceNotion from './data-source-notion'
import DataSourceWebsite from './data-source-website'
import { fetchDataSource } from '@/service/common'
import { DataSourceProvider } from '@/models/common'
+import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'
export default function DataSourcePage() {
const { data } = useSWR({ url: 'data-source/integrates' }, fetchDataSource)
@@ -11,9 +12,9 @@ export default function DataSourcePage() {
return (
-
-
-
+ {ENABLE_WEBSITE_JINAREADER && }
+ {ENABLE_WEBSITE_FIRECRAWL && }
+ {ENABLE_WEBSITE_WATERCRAWL && }
)
}
diff --git a/web/config/index.ts b/web/config/index.ts
index 2b81adb095..b164392c52 100644
--- a/web/config/index.ts
+++ b/web/config/index.ts
@@ -302,3 +302,15 @@ else if (globalThis.document?.body?.getAttribute('data-public-max-iterations-num
maxIterationsNum = Number.parseInt(globalThis.document.body.getAttribute('data-public-max-iterations-num') as string)
export const MAX_ITERATIONS_NUM = maxIterationsNum
+
+export const ENABLE_WEBSITE_JINAREADER = process.env.NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER !== undefined // switch for the Jina Reader provider
+ ? process.env.NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER === 'true' // when set, only the exact string 'true' enables it
+ : true // when unset, the provider is enabled by default
+
+export const ENABLE_WEBSITE_FIRECRAWL = process.env.NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL !== undefined // switch for the Firecrawl provider
+ ? process.env.NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL === 'true' // when set, only the exact string 'true' enables it
+ : true // when unset, the provider is enabled by default
+
+export const ENABLE_WEBSITE_WATERCRAWL = process.env.NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL !== undefined // switch for the WaterCrawl provider
+ ? process.env.NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL === 'true' // when set, only the exact string 'true' enables it
+ : true // when unset, the provider is enabled by default
diff --git a/web/docker/entrypoint.sh b/web/docker/entrypoint.sh
index 797b61081a..8395ac5f4d 100755
--- a/web/docker/entrypoint.sh
+++ b/web/docker/entrypoint.sh
@@ -28,5 +28,7 @@ export NEXT_PUBLIC_CSP_WHITELIST=${CSP_WHITELIST}
export NEXT_PUBLIC_TOP_K_MAX_VALUE=${TOP_K_MAX_VALUE}
export NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH}
export NEXT_PUBLIC_MAX_TOOLS_NUM=${MAX_TOOLS_NUM}
-
+export NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER=${ENABLE_WEBSITE_JINAREADER:-true}
+export NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL=${ENABLE_WEBSITE_FIRECRAWL:-true}
+export NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL=${ENABLE_WEBSITE_WATERCRAWL:-true}
pm2 start /app/web/server.js --name dify-web --cwd /app/web -i ${PM2_INSTANCES} --no-daemon