From e1455cecd8a863a97ab04a37b77cb2d5f94f8dd8 Mon Sep 17 00:00:00 2001 From: crazywoola <100913391+crazywoola@users.noreply.github.com> Date: Wed, 16 Apr 2025 15:50:15 +0800 Subject: [PATCH] feat: add switches for jina firecrawl watercrawl (#18153) --- docker/.env.example | 6 ++++++ docker/docker-compose-template.yaml | 4 +++- docker/docker-compose.yaml | 7 ++++++- web/.env.example | 5 +++++ .../datasets/create/step-one/index.tsx | 14 ++++++------- .../datasets/create/website/index.tsx | 13 ++++++------ .../datasets/create/website/no-data.tsx | 20 ++++++++++--------- .../data-source-page/index.tsx | 7 ++++--- web/config/index.ts | 12 +++++++++++ web/docker/entrypoint.sh | 4 +++- 10 files changed, 64 insertions(+), 28 deletions(-) diff --git a/docker/.env.example b/docker/.env.example index acb09c0d4f..e49e8fee89 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -174,6 +174,12 @@ CELERY_MIN_WORKERS= API_TOOL_DEFAULT_CONNECT_TIMEOUT=10 API_TOOL_DEFAULT_READ_TIMEOUT=60 +# ------------------------------- +# Datasource Configuration +# -------------------------------- +ENABLE_WEBSITE_JINAREADER=true +ENABLE_WEBSITE_FIRECRAWL=true +ENABLE_WEBSITE_WATERCRAWL=true # ------------------------------ # Database Configuration diff --git a/docker/docker-compose-template.yaml b/docker/docker-compose-template.yaml index 86976063c3..a8f7b755fb 100644 --- a/docker/docker-compose-template.yaml +++ b/docker/docker-compose-template.yaml @@ -75,7 +75,9 @@ services: MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10} MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10} MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5} - + ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true} + ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true} + ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true} # The postgres database. db: image: postgres:15-alpine diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index e9c8c8715a..25b0c56561 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -43,6 +43,9 @@ x-shared-env: &shared-api-worker-env CELERY_MIN_WORKERS: ${CELERY_MIN_WORKERS:-} API_TOOL_DEFAULT_CONNECT_TIMEOUT: ${API_TOOL_DEFAULT_CONNECT_TIMEOUT:-10} API_TOOL_DEFAULT_READ_TIMEOUT: ${API_TOOL_DEFAULT_READ_TIMEOUT:-60} + ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true} + ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true} + ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true} DB_USERNAME: ${DB_USERNAME:-postgres} DB_PASSWORD: ${DB_PASSWORD:-difyai123456} DB_HOST: ${DB_HOST:-db} @@ -543,7 +546,9 @@ services: MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10} MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10} MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5} - + ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true} + ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true} + ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true} # The postgres database. db: image: postgres:15-alpine diff --git a/web/.env.example b/web/.env.example index 51dc3d6b3c..1c3f42ddfc 100644 --- a/web/.env.example +++ b/web/.env.example @@ -49,3 +49,8 @@ NEXT_PUBLIC_MAX_PARALLEL_LIMIT=10 # The maximum number of iterations for agent setting NEXT_PUBLIC_MAX_ITERATIONS_NUM=5 + +NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER=true +NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL=true +NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL=true + diff --git a/web/app/components/datasets/create/step-one/index.tsx b/web/app/components/datasets/create/step-one/index.tsx index 6f4231bb1f..38c885ebe2 100644 --- a/web/app/components/datasets/create/step-one/index.tsx +++ b/web/app/components/datasets/create/step-one/index.tsx @@ -20,7 +20,7 @@ import { useProviderContext } from '@/context/provider-context' import VectorSpaceFull from '@/app/components/billing/vector-space-full' import classNames from '@/utils/classnames' import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others' - +import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config' type IStepOneProps = { datasetId?: string dataSourceType?: DataSourceType @@ -126,9 +126,7 @@ const StepOne = ({ return true if (files.some(file => !file.file.id)) return true - if (isShowVectorSpaceFull) - return true - return false + return isShowVectorSpaceFull }, [files, isShowVectorSpaceFull]) return ( @@ -193,7 +191,8 @@ const StepOne = ({ {t('datasetCreation.stepOne.dataSourceType.notion')} -
changeType(DataSourceType.WEB)} - > + > {t('datasetCreation.stepOne.dataSourceType.web')} -
+ + )} ) } diff --git a/web/app/components/datasets/create/website/index.tsx b/web/app/components/datasets/create/website/index.tsx index 5122ef6ed2..e2d0e2df99 100644 --- a/web/app/components/datasets/create/website/index.tsx +++ b/web/app/components/datasets/create/website/index.tsx @@ -12,6 +12,7 @@ import { useModalContext } from '@/context/modal-context' import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' import { fetchDataSources } from '@/service/datasets' import { type DataSourceItem, DataSourceProvider } from '@/models/common' +import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config' type Props = { onPreview: (payload: CrawlResultItem) => void @@ -84,7 +85,7 @@ const Website: FC = ({ {t('datasetCreation.stepOne.website.chooseProvider')}
- - - + }
{source && selectedProvider === DataSourceProvider.fireCrawl && ( diff --git a/web/app/components/datasets/create/website/no-data.tsx b/web/app/components/datasets/create/website/no-data.tsx index 14be2e29f6..65a314f516 100644 --- a/web/app/components/datasets/create/website/no-data.tsx +++ b/web/app/components/datasets/create/website/no-data.tsx @@ -6,6 +6,7 @@ import s from './index.module.css' import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others' import Button from '@/app/components/base/button' import { DataSourceProvider } from '@/models/common' +import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config' const I18N_PREFIX = 'datasetCreation.stepOne.website' @@ -16,29 +17,30 @@ type Props = { const NoData: FC = ({ onConfig, - provider, }) => { const { t } = useTranslation() const providerConfig = { - [DataSourceProvider.jinaReader]: { + [DataSourceProvider.jinaReader]: ENABLE_WEBSITE_JINAREADER ? { emoji: , title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`), description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`), - }, - [DataSourceProvider.fireCrawl]: { + } : null, + [DataSourceProvider.fireCrawl]: ENABLE_WEBSITE_FIRECRAWL ? { emoji: '🔥', title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`), description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`), - }, - [DataSourceProvider.waterCrawl]: { - emoji: , + } : null, + [DataSourceProvider.waterCrawl]: ENABLE_WEBSITE_WATERCRAWL ? { + emoji: '💧', title: t(`${I18N_PREFIX}.waterCrawlNotConfigured`), description: t(`${I18N_PREFIX}.waterCrawlNotConfiguredDescription`), - }, + } : null, } - const currentProvider = providerConfig[provider] + const currentProvider = Object.values(providerConfig).find(provider => provider !== null) || providerConfig[DataSourceProvider.jinaReader] + + if (!currentProvider) return null return ( <> diff --git a/web/app/components/header/account-setting/data-source-page/index.tsx b/web/app/components/header/account-setting/data-source-page/index.tsx index d99bd25e02..fb13813d70 100644 --- a/web/app/components/header/account-setting/data-source-page/index.tsx +++ b/web/app/components/header/account-setting/data-source-page/index.tsx @@ -3,6 +3,7 @@ import DataSourceNotion from './data-source-notion' import DataSourceWebsite from './data-source-website' import { fetchDataSource } from '@/service/common' import { DataSourceProvider } from '@/models/common' +import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config' export default function DataSourcePage() { const { data } = useSWR({ url: 'data-source/integrates' }, fetchDataSource) @@ -11,9 +12,9 @@ export default function DataSourcePage() { return (
- - - + {ENABLE_WEBSITE_JINAREADER && } + {ENABLE_WEBSITE_FIRECRAWL && } + {ENABLE_WEBSITE_WATERCRAWL && }
) } diff --git a/web/config/index.ts b/web/config/index.ts index 2b81adb095..b164392c52 100644 --- a/web/config/index.ts +++ b/web/config/index.ts @@ -302,3 +302,15 @@ else if (globalThis.document?.body?.getAttribute('data-public-max-iterations-num maxIterationsNum = Number.parseInt(globalThis.document.body.getAttribute('data-public-max-iterations-num') as string) export const MAX_ITERATIONS_NUM = maxIterationsNum + +export const ENABLE_WEBSITE_JINAREADER = process.env.NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER !== undefined + ? process.env.NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER === 'true' + : true + +export const ENABLE_WEBSITE_FIRECRAWL = process.env.NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL !== undefined + ? process.env.NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL === 'true' + : true + +export const ENABLE_WEBSITE_WATERCRAWL = process.env.NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL !== undefined + ? process.env.NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL === 'true' + : true diff --git a/web/docker/entrypoint.sh b/web/docker/entrypoint.sh index 797b61081a..8395ac5f4d 100755 --- a/web/docker/entrypoint.sh +++ b/web/docker/entrypoint.sh @@ -28,5 +28,7 @@ export NEXT_PUBLIC_CSP_WHITELIST=${CSP_WHITELIST} export NEXT_PUBLIC_TOP_K_MAX_VALUE=${TOP_K_MAX_VALUE} export NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH} export NEXT_PUBLIC_MAX_TOOLS_NUM=${MAX_TOOLS_NUM} - +export NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER=${ENABLE_WEBSITE_JINAREADER:-true} +export NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL=${ENABLE_WEBSITE_FIRECRAWL:-true} +export NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL=${ENABLE_WEBSITE_WATERCRAWL:-true} pm2 start /app/web/server.js --name dify-web --cwd /app/web -i ${PM2_INSTANCES} --no-daemon