feat: add switches for jina firecrawl watercrawl (#18153)

This commit is contained in:
crazywoola 2025-04-16 15:50:15 +08:00 committed by GitHub
parent b247ef85bf
commit e1455cecd8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 64 additions and 28 deletions

View File

@ -174,6 +174,12 @@ CELERY_MIN_WORKERS=
API_TOOL_DEFAULT_CONNECT_TIMEOUT=10 API_TOOL_DEFAULT_CONNECT_TIMEOUT=10
API_TOOL_DEFAULT_READ_TIMEOUT=60 API_TOOL_DEFAULT_READ_TIMEOUT=60
# ------------------------------
# Datasource Configuration
# ------------------------------
ENABLE_WEBSITE_JINAREADER=true
ENABLE_WEBSITE_FIRECRAWL=true
ENABLE_WEBSITE_WATERCRAWL=true
# ------------------------------ # ------------------------------
# Database Configuration # Database Configuration

View File

@ -75,7 +75,9 @@ services:
MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10} MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10}
MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10} MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10}
MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5} MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5}
ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true}
ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true}
ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true}
# The postgres database. # The postgres database.
db: db:
image: postgres:15-alpine image: postgres:15-alpine

View File

@ -43,6 +43,9 @@ x-shared-env: &shared-api-worker-env
CELERY_MIN_WORKERS: ${CELERY_MIN_WORKERS:-} CELERY_MIN_WORKERS: ${CELERY_MIN_WORKERS:-}
API_TOOL_DEFAULT_CONNECT_TIMEOUT: ${API_TOOL_DEFAULT_CONNECT_TIMEOUT:-10} API_TOOL_DEFAULT_CONNECT_TIMEOUT: ${API_TOOL_DEFAULT_CONNECT_TIMEOUT:-10}
API_TOOL_DEFAULT_READ_TIMEOUT: ${API_TOOL_DEFAULT_READ_TIMEOUT:-60} API_TOOL_DEFAULT_READ_TIMEOUT: ${API_TOOL_DEFAULT_READ_TIMEOUT:-60}
ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true}
ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true}
ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true}
DB_USERNAME: ${DB_USERNAME:-postgres} DB_USERNAME: ${DB_USERNAME:-postgres}
DB_PASSWORD: ${DB_PASSWORD:-difyai123456} DB_PASSWORD: ${DB_PASSWORD:-difyai123456}
DB_HOST: ${DB_HOST:-db} DB_HOST: ${DB_HOST:-db}
@ -543,7 +546,9 @@ services:
MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10} MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10}
MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10} MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10}
MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5} MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5}
ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true}
ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true}
ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true}
# The postgres database. # The postgres database.
db: db:
image: postgres:15-alpine image: postgres:15-alpine

View File

@ -49,3 +49,8 @@ NEXT_PUBLIC_MAX_PARALLEL_LIMIT=10
# The maximum number of iterations for agent setting # The maximum number of iterations for agent setting
NEXT_PUBLIC_MAX_ITERATIONS_NUM=5 NEXT_PUBLIC_MAX_ITERATIONS_NUM=5
NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER=true
NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL=true
NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL=true

View File

@ -20,7 +20,7 @@ import { useProviderContext } from '@/context/provider-context'
import VectorSpaceFull from '@/app/components/billing/vector-space-full' import VectorSpaceFull from '@/app/components/billing/vector-space-full'
import classNames from '@/utils/classnames' import classNames from '@/utils/classnames'
import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others' import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'
type IStepOneProps = { type IStepOneProps = {
datasetId?: string datasetId?: string
dataSourceType?: DataSourceType dataSourceType?: DataSourceType
@ -126,9 +126,7 @@ const StepOne = ({
return true return true
if (files.some(file => !file.file.id)) if (files.some(file => !file.file.id))
return true return true
if (isShowVectorSpaceFull) return isShowVectorSpaceFull
return true
return false
}, [files, isShowVectorSpaceFull]) }, [files, isShowVectorSpaceFull])
return ( return (
@ -193,7 +191,8 @@ const StepOne = ({
{t('datasetCreation.stepOne.dataSourceType.notion')} {t('datasetCreation.stepOne.dataSourceType.notion')}
</span> </span>
</div> </div>
<div {(ENABLE_WEBSITE_FIRECRAWL || ENABLE_WEBSITE_JINAREADER || ENABLE_WEBSITE_WATERCRAWL) && (
<div
className={cn( className={cn(
s.dataSourceItem, s.dataSourceItem,
'system-sm-medium', 'system-sm-medium',
@ -201,7 +200,7 @@ const StepOne = ({
dataSourceTypeDisable && dataSourceType !== DataSourceType.WEB && s.disabled, dataSourceTypeDisable && dataSourceType !== DataSourceType.WEB && s.disabled,
)} )}
onClick={() => changeType(DataSourceType.WEB)} onClick={() => changeType(DataSourceType.WEB)}
> >
<span className={cn(s.datasetIcon, s.web)} /> <span className={cn(s.datasetIcon, s.web)} />
<span <span
title={t('datasetCreation.stepOne.dataSourceType.web')} title={t('datasetCreation.stepOne.dataSourceType.web')}
@ -209,7 +208,8 @@ const StepOne = ({
> >
{t('datasetCreation.stepOne.dataSourceType.web')} {t('datasetCreation.stepOne.dataSourceType.web')}
</span> </span>
</div> </div>
)}
</div> </div>
) )
} }

View File

@ -12,6 +12,7 @@ import { useModalContext } from '@/context/modal-context'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import { fetchDataSources } from '@/service/datasets' import { fetchDataSources } from '@/service/datasets'
import { type DataSourceItem, DataSourceProvider } from '@/models/common' import { type DataSourceItem, DataSourceProvider } from '@/models/common'
import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'
type Props = { type Props = {
onPreview: (payload: CrawlResultItem) => void onPreview: (payload: CrawlResultItem) => void
@ -84,7 +85,7 @@ const Website: FC<Props> = ({
{t('datasetCreation.stepOne.website.chooseProvider')} {t('datasetCreation.stepOne.website.chooseProvider')}
</div> </div>
<div className="flex space-x-2"> <div className="flex space-x-2">
<button {ENABLE_WEBSITE_JINAREADER && <button
className={cn('flex items-center justify-center rounded-lg px-4 py-2', className={cn('flex items-center justify-center rounded-lg px-4 py-2',
selectedProvider === DataSourceProvider.jinaReader selectedProvider === DataSourceProvider.jinaReader
? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary' ? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary'
@ -95,8 +96,8 @@ const Website: FC<Props> = ({
> >
<span className={cn(s.jinaLogo, 'mr-2')}/> <span className={cn(s.jinaLogo, 'mr-2')}/>
<span>Jina Reader</span> <span>Jina Reader</span>
</button> </button>}
<button {ENABLE_WEBSITE_FIRECRAWL && <button
className={cn('rounded-lg px-4 py-2', className={cn('rounded-lg px-4 py-2',
selectedProvider === DataSourceProvider.fireCrawl selectedProvider === DataSourceProvider.fireCrawl
? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary' ? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary'
@ -106,8 +107,8 @@ const Website: FC<Props> = ({
onClick={() => setSelectedProvider(DataSourceProvider.fireCrawl)} onClick={() => setSelectedProvider(DataSourceProvider.fireCrawl)}
> >
🔥 Firecrawl 🔥 Firecrawl
</button> </button>}
<button {ENABLE_WEBSITE_WATERCRAWL && <button
className={cn('flex items-center justify-center rounded-lg px-4 py-2', className={cn('flex items-center justify-center rounded-lg px-4 py-2',
selectedProvider === DataSourceProvider.waterCrawl selectedProvider === DataSourceProvider.waterCrawl
? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary' ? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary'
@ -118,7 +119,7 @@ const Website: FC<Props> = ({
> >
<span className={cn(s.watercrawlLogo, 'mr-2')}/> <span className={cn(s.watercrawlLogo, 'mr-2')}/>
<span>WaterCrawl</span> <span>WaterCrawl</span>
</button> </button>}
</div> </div>
</div> </div>
{source && selectedProvider === DataSourceProvider.fireCrawl && ( {source && selectedProvider === DataSourceProvider.fireCrawl && (

View File

@ -6,6 +6,7 @@ import s from './index.module.css'
import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others' import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
import Button from '@/app/components/base/button' import Button from '@/app/components/base/button'
import { DataSourceProvider } from '@/models/common' import { DataSourceProvider } from '@/models/common'
import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'
const I18N_PREFIX = 'datasetCreation.stepOne.website' const I18N_PREFIX = 'datasetCreation.stepOne.website'
@ -16,29 +17,30 @@ type Props = {
const NoData: FC<Props> = ({ const NoData: FC<Props> = ({
onConfig, onConfig,
provider,
}) => { }) => {
const { t } = useTranslation() const { t } = useTranslation()
const providerConfig = { const providerConfig = {
[DataSourceProvider.jinaReader]: { [DataSourceProvider.jinaReader]: ENABLE_WEBSITE_JINAREADER ? {
emoji: <span className={s.jinaLogo} />, emoji: <span className={s.jinaLogo} />,
title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`), title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`),
description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`), description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`),
}, } : null,
[DataSourceProvider.fireCrawl]: { [DataSourceProvider.fireCrawl]: ENABLE_WEBSITE_FIRECRAWL ? {
emoji: '🔥', emoji: '🔥',
title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`), title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`), description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
}, } : null,
[DataSourceProvider.waterCrawl]: { [DataSourceProvider.waterCrawl]: ENABLE_WEBSITE_WATERCRAWL ? {
emoji: <span className={s.watercrawlLogo} />, emoji: '💧',
title: t(`${I18N_PREFIX}.waterCrawlNotConfigured`), title: t(`${I18N_PREFIX}.waterCrawlNotConfigured`),
description: t(`${I18N_PREFIX}.waterCrawlNotConfiguredDescription`), description: t(`${I18N_PREFIX}.waterCrawlNotConfiguredDescription`),
}, } : null,
} }
const currentProvider = providerConfig[provider] const currentProvider = Object.values(providerConfig).find(provider => provider !== null) || providerConfig[DataSourceProvider.jinaReader]
if (!currentProvider) return null
return ( return (
<> <>

View File

@ -3,6 +3,7 @@ import DataSourceNotion from './data-source-notion'
import DataSourceWebsite from './data-source-website' import DataSourceWebsite from './data-source-website'
import { fetchDataSource } from '@/service/common' import { fetchDataSource } from '@/service/common'
import { DataSourceProvider } from '@/models/common' import { DataSourceProvider } from '@/models/common'
import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'
export default function DataSourcePage() { export default function DataSourcePage() {
const { data } = useSWR({ url: 'data-source/integrates' }, fetchDataSource) const { data } = useSWR({ url: 'data-source/integrates' }, fetchDataSource)
@ -11,9 +12,9 @@ export default function DataSourcePage() {
return ( return (
<div className='mb-8'> <div className='mb-8'>
<DataSourceNotion workspaces={notionWorkspaces} /> <DataSourceNotion workspaces={notionWorkspaces} />
<DataSourceWebsite provider={DataSourceProvider.jinaReader} /> {ENABLE_WEBSITE_JINAREADER && <DataSourceWebsite provider={DataSourceProvider.jinaReader} />}
<DataSourceWebsite provider={DataSourceProvider.fireCrawl} /> {ENABLE_WEBSITE_FIRECRAWL && <DataSourceWebsite provider={DataSourceProvider.fireCrawl} />}
<DataSourceWebsite provider={DataSourceProvider.waterCrawl} /> {ENABLE_WEBSITE_WATERCRAWL && <DataSourceWebsite provider={DataSourceProvider.waterCrawl} />}
</div> </div>
) )
} }

View File

@ -302,3 +302,15 @@ else if (globalThis.document?.body?.getAttribute('data-public-max-iterations-num
maxIterationsNum = Number.parseInt(globalThis.document.body.getAttribute('data-public-max-iterations-num') as string) maxIterationsNum = Number.parseInt(globalThis.document.body.getAttribute('data-public-max-iterations-num') as string)
export const MAX_ITERATIONS_NUM = maxIterationsNum export const MAX_ITERATIONS_NUM = maxIterationsNum
/**
 * Feature switches for the website-crawl data source providers.
 *
 * Each switch reads its NEXT_PUBLIC_ENABLE_WEBSITE_* environment variable.
 * When the variable is unset the provider is enabled; when it is set, the
 * provider is enabled only if the value is exactly the string 'true'.
 */

/** Whether the Jina Reader provider is enabled. */
export const ENABLE_WEBSITE_JINAREADER = (process.env.NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER ?? 'true') === 'true'

/** Whether the Firecrawl provider is enabled. */
export const ENABLE_WEBSITE_FIRECRAWL = (process.env.NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL ?? 'true') === 'true'

/** Whether the WaterCrawl provider is enabled. */
export const ENABLE_WEBSITE_WATERCRAWL = (process.env.NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL ?? 'true') === 'true'

View File

@ -28,5 +28,7 @@ export NEXT_PUBLIC_CSP_WHITELIST=${CSP_WHITELIST}
export NEXT_PUBLIC_TOP_K_MAX_VALUE=${TOP_K_MAX_VALUE} export NEXT_PUBLIC_TOP_K_MAX_VALUE=${TOP_K_MAX_VALUE}
export NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH} export NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH}
export NEXT_PUBLIC_MAX_TOOLS_NUM=${MAX_TOOLS_NUM} export NEXT_PUBLIC_MAX_TOOLS_NUM=${MAX_TOOLS_NUM}
export NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER=${ENABLE_WEBSITE_JINAREADER:-true}
export NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL=${ENABLE_WEBSITE_FIRECRAWL:-true}
export NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL=${ENABLE_WEBSITE_WATERCRAWL:-true}
pm2 start /app/web/server.js --name dify-web --cwd /app/web -i ${PM2_INSTANCES} --no-daemon pm2 start /app/web/server.js --name dify-web --cwd /app/web -i ${PM2_INSTANCES} --no-daemon