feat: add switches for jina firecrawl watercrawl (#18153)

This commit is contained in:
crazywoola 2025-04-16 15:50:15 +08:00 committed by GitHub
parent b247ef85bf
commit e1455cecd8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 64 additions and 28 deletions

View File

@ -174,6 +174,12 @@ CELERY_MIN_WORKERS=
API_TOOL_DEFAULT_CONNECT_TIMEOUT=10
API_TOOL_DEFAULT_READ_TIMEOUT=60
# -------------------------------
# Datasource Configuration
# --------------------------------
ENABLE_WEBSITE_JINAREADER=true
ENABLE_WEBSITE_FIRECRAWL=true
ENABLE_WEBSITE_WATERCRAWL=true
# ------------------------------
# Database Configuration

View File

@ -75,7 +75,9 @@ services:
MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10}
MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10}
MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5}
ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true}
ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true}
ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true}
# The postgres database.
db:
image: postgres:15-alpine

View File

@ -43,6 +43,9 @@ x-shared-env: &shared-api-worker-env
CELERY_MIN_WORKERS: ${CELERY_MIN_WORKERS:-}
API_TOOL_DEFAULT_CONNECT_TIMEOUT: ${API_TOOL_DEFAULT_CONNECT_TIMEOUT:-10}
API_TOOL_DEFAULT_READ_TIMEOUT: ${API_TOOL_DEFAULT_READ_TIMEOUT:-60}
ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true}
ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true}
ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true}
DB_USERNAME: ${DB_USERNAME:-postgres}
DB_PASSWORD: ${DB_PASSWORD:-difyai123456}
DB_HOST: ${DB_HOST:-db}
@ -543,7 +546,9 @@ services:
MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10}
MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10}
MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5}
ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true}
ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true}
ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true}
# The postgres database.
db:
image: postgres:15-alpine

View File

@ -49,3 +49,8 @@ NEXT_PUBLIC_MAX_PARALLEL_LIMIT=10
# The maximum number of iterations for agent setting
NEXT_PUBLIC_MAX_ITERATIONS_NUM=5
NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER=true
NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL=true
NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL=true

View File

@ -20,7 +20,7 @@ import { useProviderContext } from '@/context/provider-context'
import VectorSpaceFull from '@/app/components/billing/vector-space-full'
import classNames from '@/utils/classnames'
import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'
type IStepOneProps = {
datasetId?: string
dataSourceType?: DataSourceType
@ -126,9 +126,7 @@ const StepOne = ({
return true
if (files.some(file => !file.file.id))
return true
if (isShowVectorSpaceFull)
return true
return false
return isShowVectorSpaceFull
}, [files, isShowVectorSpaceFull])
return (
@ -193,7 +191,8 @@ const StepOne = ({
{t('datasetCreation.stepOne.dataSourceType.notion')}
</span>
</div>
<div
{(ENABLE_WEBSITE_FIRECRAWL || ENABLE_WEBSITE_JINAREADER || ENABLE_WEBSITE_WATERCRAWL) && (
<div
className={cn(
s.dataSourceItem,
'system-sm-medium',
@ -201,7 +200,7 @@ const StepOne = ({
dataSourceTypeDisable && dataSourceType !== DataSourceType.WEB && s.disabled,
)}
onClick={() => changeType(DataSourceType.WEB)}
>
>
<span className={cn(s.datasetIcon, s.web)} />
<span
title={t('datasetCreation.stepOne.dataSourceType.web')}
@ -209,7 +208,8 @@ const StepOne = ({
>
{t('datasetCreation.stepOne.dataSourceType.web')}
</span>
</div>
</div>
)}
</div>
)
}

View File

@ -12,6 +12,7 @@ import { useModalContext } from '@/context/modal-context'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import { fetchDataSources } from '@/service/datasets'
import { type DataSourceItem, DataSourceProvider } from '@/models/common'
import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'
type Props = {
onPreview: (payload: CrawlResultItem) => void
@ -84,7 +85,7 @@ const Website: FC<Props> = ({
{t('datasetCreation.stepOne.website.chooseProvider')}
</div>
<div className="flex space-x-2">
<button
{ENABLE_WEBSITE_JINAREADER && <button
className={cn('flex items-center justify-center rounded-lg px-4 py-2',
selectedProvider === DataSourceProvider.jinaReader
? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary'
@ -95,8 +96,8 @@ const Website: FC<Props> = ({
>
<span className={cn(s.jinaLogo, 'mr-2')}/>
<span>Jina Reader</span>
</button>
<button
</button>}
{ENABLE_WEBSITE_FIRECRAWL && <button
className={cn('rounded-lg px-4 py-2',
selectedProvider === DataSourceProvider.fireCrawl
? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary'
@ -106,8 +107,8 @@ const Website: FC<Props> = ({
onClick={() => setSelectedProvider(DataSourceProvider.fireCrawl)}
>
🔥 Firecrawl
</button>
<button
</button>}
{ENABLE_WEBSITE_WATERCRAWL && <button
className={cn('flex items-center justify-center rounded-lg px-4 py-2',
selectedProvider === DataSourceProvider.waterCrawl
? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary'
@ -118,7 +119,7 @@ const Website: FC<Props> = ({
>
<span className={cn(s.watercrawlLogo, 'mr-2')}/>
<span>WaterCrawl</span>
</button>
</button>}
</div>
</div>
{source && selectedProvider === DataSourceProvider.fireCrawl && (

View File

@ -6,6 +6,7 @@ import s from './index.module.css'
import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
import Button from '@/app/components/base/button'
import { DataSourceProvider } from '@/models/common'
import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
@ -16,29 +17,30 @@ type Props = {
const NoData: FC<Props> = ({
onConfig,
provider,
}) => {
const { t } = useTranslation()
const providerConfig = {
[DataSourceProvider.jinaReader]: {
[DataSourceProvider.jinaReader]: ENABLE_WEBSITE_JINAREADER ? {
emoji: <span className={s.jinaLogo} />,
title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`),
description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`),
},
[DataSourceProvider.fireCrawl]: {
} : null,
[DataSourceProvider.fireCrawl]: ENABLE_WEBSITE_FIRECRAWL ? {
emoji: '🔥',
title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
},
[DataSourceProvider.waterCrawl]: {
emoji: <span className={s.watercrawlLogo} />,
} : null,
[DataSourceProvider.waterCrawl]: ENABLE_WEBSITE_WATERCRAWL ? {
emoji: '💧',
title: t(`${I18N_PREFIX}.waterCrawlNotConfigured`),
description: t(`${I18N_PREFIX}.waterCrawlNotConfiguredDescription`),
},
} : null,
}
const currentProvider = providerConfig[provider]
const currentProvider = Object.values(providerConfig).find(provider => provider !== null) || providerConfig[DataSourceProvider.jinaReader]
if (!currentProvider) return null
return (
<>

View File

@ -3,6 +3,7 @@ import DataSourceNotion from './data-source-notion'
import DataSourceWebsite from './data-source-website'
import { fetchDataSource } from '@/service/common'
import { DataSourceProvider } from '@/models/common'
import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'
export default function DataSourcePage() {
const { data } = useSWR({ url: 'data-source/integrates' }, fetchDataSource)
@ -11,9 +12,9 @@ export default function DataSourcePage() {
return (
<div className='mb-8'>
<DataSourceNotion workspaces={notionWorkspaces} />
<DataSourceWebsite provider={DataSourceProvider.jinaReader} />
<DataSourceWebsite provider={DataSourceProvider.fireCrawl} />
<DataSourceWebsite provider={DataSourceProvider.waterCrawl} />
{ENABLE_WEBSITE_JINAREADER && <DataSourceWebsite provider={DataSourceProvider.jinaReader} />}
{ENABLE_WEBSITE_FIRECRAWL && <DataSourceWebsite provider={DataSourceProvider.fireCrawl} />}
{ENABLE_WEBSITE_WATERCRAWL && <DataSourceWebsite provider={DataSourceProvider.waterCrawl} />}
</div>
)
}

View File

@ -302,3 +302,15 @@ else if (globalThis.document?.body?.getAttribute('data-public-max-iterations-num
maxIterationsNum = Number.parseInt(globalThis.document.body.getAttribute('data-public-max-iterations-num') as string)
export const MAX_ITERATIONS_NUM = maxIterationsNum
export const ENABLE_WEBSITE_JINAREADER = process.env.NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER !== undefined
? process.env.NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER === 'true'
: true
export const ENABLE_WEBSITE_FIRECRAWL = process.env.NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL !== undefined
? process.env.NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL === 'true'
: true
export const ENABLE_WEBSITE_WATERCRAWL = process.env.NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL !== undefined
? process.env.NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL === 'true'
: true

View File

@ -28,5 +28,7 @@ export NEXT_PUBLIC_CSP_WHITELIST=${CSP_WHITELIST}
export NEXT_PUBLIC_TOP_K_MAX_VALUE=${TOP_K_MAX_VALUE}
export NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH}
export NEXT_PUBLIC_MAX_TOOLS_NUM=${MAX_TOOLS_NUM}
export NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER=${ENABLE_WEBSITE_JINAREADER:-true}
export NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL=${ENABLE_WEBSITE_FIRECRAWL:-true}
export NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL=${ENABLE_WEBSITE_WATERCRAWL:-true}
pm2 start /app/web/server.js --name dify-web --cwd /app/web -i ${PM2_INSTANCES} --no-daemon