Wu Tianwei 2b2263a349
Feat/parent child retrieval (#12086)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Signed-off-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: AkaraChen <akarachen@outlook.com>
Co-authored-by: nite-knite <nkCoding@gmail.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Warren Chen <warren.chen830@gmail.com>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: Yi Xiao <54782454+YIXIAO0@users.noreply.github.com>
Co-authored-by: yihong <zouzou0208@gmail.com>
Co-authored-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: KVOJJJin <jzongcode@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: JzoNgKVO <27049666+JzoNgKVO@users.noreply.github.com>
Co-authored-by: Charlie.Wei <luowei@cvte.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: huayaoyue6 <huayaoyue@163.com>
Co-authored-by: kurokobo <kuro664@gmail.com>
Co-authored-by: Matsuda <yiyth.fcb6@gmail.com>
Co-authored-by: shirochan <s.yusuke0711@gmail.com>
Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: Huỳnh Gia Bôi <boihuynh147@gmail.com>
Co-authored-by: Julian Huynh <julian.huynh@immersio.io>
Co-authored-by: Hash Brown <hi@xzd.me>
Co-authored-by: 非法操作 <hjlarry@163.com>
Co-authored-by: Kazuki Takamatsu <kazuki.takamatsu@chowagiken.co.jp>
Co-authored-by: Trey Dong <1346650911@qq.com>
Co-authored-by: VoidIsVoid <343750470@qq.com>
Co-authored-by: Gimling <huangjl@ruyi.ai>
Co-authored-by: xiandan-erizo <xiandan.erizo@gmail.com>
Co-authored-by: Muneyuki Noguchi <nogu.dev@gmail.com>
Co-authored-by: zhaobingshuang <1475195565@qq.com>
Co-authored-by: zhaobs <zhaobs@cailian.net>
Co-authored-by: suzuki.sh <s2terminal@users.noreply.github.com>
Co-authored-by: Yingchun Lai <laiyingchun@apache.org>
Co-authored-by: huanshare <huanshare@live.com>
Co-authored-by: huanshare <liuhuan101@longfor.com>
Co-authored-by: orangeclk <orangeclk@users.noreply.github.com>
Co-authored-by: 문정현 <120004247+JungHyunMoon@users.noreply.github.com>
Co-authored-by: barabicu <kztk533@gmail.com>
Co-authored-by: Wei Mingzhi <whistler_wmz@users.sf.net>
Co-authored-by: Paul van Oorschot <20116814+pvoo@users.noreply.github.com>
Co-authored-by: zkyTech <zhangkunyuan@hotmail.com>
Co-authored-by: zhangkunyuan <zhangkunyuan@cmhi.chinamobile.com>
Co-authored-by: Tommy <34446820+Asterovim@users.noreply.github.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Novice <857526207@qq.com>
Co-authored-by: Novice Lee <novicelee@NovicedeMacBook-Pro.local>
Co-authored-by: Novice Lee <novicelee@NoviPro.local>
Co-authored-by: zxhlyh <16177003+zxhlyh@users.noreply.github.com>
Co-authored-by: liuzhenghua <1090179900@qq.com>
Co-authored-by: Jiang <65766008+AlwaysBluer@users.noreply.github.com>
Co-authored-by: jiangzhijie <jiangzhijie.jzj@alibaba-inc.com>
Co-authored-by: Joe <79627742+ZhouhaoJiang@users.noreply.github.com>
Co-authored-by: Alok Shrivastwa <alok.shrivastwa@gmail.com>
Co-authored-by: Alok Shrivastwa <Alok.Shrivastwa@microland.com>
Co-authored-by: JasonVV <jasonwangiii@outlook.com>
Co-authored-by: Hiroshi Fujita <fujita-h@users.noreply.github.com>
Co-authored-by: Kevin9703 <51311316+Kevin9703@users.noreply.github.com>
Co-authored-by: NFish <douxc512@gmail.com>
Co-authored-by: Junyan Qin <1010553892@qq.com>
Co-authored-by: IWAI, Masaharu <iwaim.sub@gmail.com>
Co-authored-by: IWAI, Masaharu <iwai_masaharu@funkit.co.jp>
Co-authored-by: Bowen Liang <liangbowen@gf.com.cn>
Co-authored-by: luckylhb90 <luckylhb90@gmail.com>
Co-authored-by: hobo.l <hobo.l@binance.com>
Co-authored-by: douxc <7553076+douxc@users.noreply.github.com>
2024-12-25 18:17:15 +08:00

314 lines
12 KiB
TypeScript

import type { FC } from 'react'
import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react'
import useSWR from 'swr'
import { useContext } from 'use-context-selector'
import { useTranslation } from 'react-i18next'
import { omit } from 'lodash-es'
import { RiLoader2Line, RiPauseCircleLine, RiPlayCircleLine } from '@remixicon/react'
import Image from 'next/image'
import { FieldInfo } from '../metadata'
import { useDocumentContext } from '../index'
import { IndexingType } from '../../../create/step-two'
import { indexMethodIcon, retrievalIcon } from '../../../create/icons'
import EmbeddingSkeleton from './skeleton'
import { RETRIEVE_METHOD } from '@/types/app'
import cn from '@/utils/classnames'
import Divider from '@/app/components/base/divider'
import { ToastContext } from '@/app/components/base/toast'
import { ProcessMode, type ProcessRuleResponse } from '@/models/datasets'
import type { CommonResponse } from '@/models/common'
import { asyncRunSafe, sleep } from '@/utils'
import {
fetchIndexingStatus as doFetchIndexingStatus,
fetchProcessRule,
pauseDocIndexing,
resumeDocIndexing,
} from '@/service/datasets'
type IEmbeddingDetailProps = {
datasetId?: string
documentId?: string
indexingType?: IndexingType
retrievalMethod?: RETRIEVE_METHOD
detailUpdate: VoidFunction
}
type IRuleDetailProps = {
sourceData?: ProcessRuleResponse
indexingType?: IndexingType
retrievalMethod?: RETRIEVE_METHOD
}
const RuleDetail: FC<IRuleDetailProps> = React.memo(({
sourceData,
indexingType,
retrievalMethod,
}) => {
const { t } = useTranslation()
const segmentationRuleMap = {
mode: t('datasetDocuments.embedding.mode'),
segmentLength: t('datasetDocuments.embedding.segmentLength'),
textCleaning: t('datasetDocuments.embedding.textCleaning'),
}
const getRuleName = (key: string) => {
if (key === 'remove_extra_spaces')
return t('datasetCreation.stepTwo.removeExtraSpaces')
if (key === 'remove_urls_emails')
return t('datasetCreation.stepTwo.removeUrlEmails')
if (key === 'remove_stopwords')
return t('datasetCreation.stepTwo.removeStopwords')
}
const isNumber = (value: unknown) => {
return typeof value === 'number'
}
const getValue = useCallback((field: string) => {
let value: string | number | undefined = '-'
const maxTokens = isNumber(sourceData?.rules?.segmentation?.max_tokens)
? sourceData.rules.segmentation.max_tokens
: value
const childMaxTokens = isNumber(sourceData?.rules?.subchunk_segmentation?.max_tokens)
? sourceData.rules.subchunk_segmentation.max_tokens
: value
switch (field) {
case 'mode':
value = !sourceData?.mode
? value
: sourceData.mode === ProcessMode.general
? (t('datasetDocuments.embedding.custom') as string)
: `${t('datasetDocuments.embedding.hierarchical')} · ${sourceData?.rules?.parent_mode === 'paragraph'
? t('dataset.parentMode.paragraph')
: t('dataset.parentMode.fullDoc')}`
break
case 'segmentLength':
value = !sourceData?.mode
? value
: sourceData.mode === ProcessMode.general
? maxTokens
: `${t('datasetDocuments.embedding.parentMaxTokens')} ${maxTokens}; ${t('datasetDocuments.embedding.childMaxTokens')} ${childMaxTokens}`
break
default:
value = !sourceData?.mode
? value
: sourceData?.rules?.pre_processing_rules?.filter(rule =>
rule.enabled).map(rule => getRuleName(rule.id)).join(',')
break
}
return value
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [sourceData])
return <div className='py-3'>
<div className='flex flex-col gap-y-1'>
{Object.keys(segmentationRuleMap).map((field) => {
return <FieldInfo
key={field}
label={segmentationRuleMap[field as keyof typeof segmentationRuleMap]}
displayedValue={String(getValue(field))}
/>
})}
</div>
<Divider type='horizontal' className='bg-divider-subtle' />
<FieldInfo
label={t('datasetCreation.stepTwo.indexMode')}
displayedValue={t(`datasetCreation.stepTwo.${indexingType === IndexingType.ECONOMICAL ? 'economical' : 'qualified'}`) as string}
valueIcon={
<Image
className='size-4'
src={
indexingType === IndexingType.ECONOMICAL
? indexMethodIcon.economical
: indexMethodIcon.high_quality
}
alt=''
/>
}
/>
<FieldInfo
label={t('datasetSettings.form.retrievalSetting.title')}
displayedValue={t(`dataset.retrieval.${indexingType === IndexingType.ECONOMICAL ? 'invertedIndex' : retrievalMethod}.title`) as string}
valueIcon={
<Image
className='size-4'
src={
retrievalMethod === RETRIEVE_METHOD.fullText
? retrievalIcon.fullText
: retrievalMethod === RETRIEVE_METHOD.hybrid
? retrievalIcon.hybrid
: retrievalIcon.vector
}
alt=''
/>
}
/>
</div>
})
RuleDetail.displayName = 'RuleDetail'
const EmbeddingDetail: FC<IEmbeddingDetailProps> = ({
datasetId: dstId,
documentId: docId,
detailUpdate,
indexingType,
retrievalMethod,
}) => {
const { t } = useTranslation()
const { notify } = useContext(ToastContext)
const datasetId = useDocumentContext(s => s.datasetId)
const documentId = useDocumentContext(s => s.documentId)
const localDatasetId = dstId ?? datasetId
const localDocumentId = docId ?? documentId
const [indexingStatusDetail, setIndexingStatusDetail] = useState<any>(null)
const fetchIndexingStatus = async () => {
const status = await doFetchIndexingStatus({ datasetId: localDatasetId, documentId: localDocumentId })
setIndexingStatusDetail(status)
return status
}
const isStopQuery = useRef(false)
const stopQueryStatus = useCallback(() => {
isStopQuery.current = true
}, [])
const startQueryStatus = useCallback(async () => {
if (isStopQuery.current)
return
try {
const indexingStatusDetail = await fetchIndexingStatus()
if (['completed', 'error', 'paused'].includes(indexingStatusDetail?.indexing_status)) {
stopQueryStatus()
detailUpdate()
return
}
await sleep(2500)
await startQueryStatus()
}
catch (e) {
await sleep(2500)
await startQueryStatus()
}
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [stopQueryStatus])
useEffect(() => {
isStopQuery.current = false
startQueryStatus()
return () => {
stopQueryStatus()
}
}, [startQueryStatus, stopQueryStatus])
const { data: ruleDetail } = useSWR({
action: 'fetchProcessRule',
params: { documentId: localDocumentId },
}, apiParams => fetchProcessRule(omit(apiParams, 'action')), {
revalidateOnFocus: false,
})
const isEmbedding = useMemo(() => ['indexing', 'splitting', 'parsing', 'cleaning'].includes(indexingStatusDetail?.indexing_status || ''), [indexingStatusDetail])
const isEmbeddingCompleted = useMemo(() => ['completed'].includes(indexingStatusDetail?.indexing_status || ''), [indexingStatusDetail])
const isEmbeddingPaused = useMemo(() => ['paused'].includes(indexingStatusDetail?.indexing_status || ''), [indexingStatusDetail])
const isEmbeddingError = useMemo(() => ['error'].includes(indexingStatusDetail?.indexing_status || ''), [indexingStatusDetail])
const percent = useMemo(() => {
const completedCount = indexingStatusDetail?.completed_segments || 0
const totalCount = indexingStatusDetail?.total_segments || 0
if (totalCount === 0)
return 0
const percent = Math.round(completedCount * 100 / totalCount)
return percent > 100 ? 100 : percent
}, [indexingStatusDetail])
const handleSwitch = async () => {
const opApi = isEmbedding ? pauseDocIndexing : resumeDocIndexing
const [e] = await asyncRunSafe<CommonResponse>(opApi({ datasetId: localDatasetId, documentId: localDocumentId }) as Promise<CommonResponse>)
if (!e) {
notify({ type: 'success', message: t('common.actionMsg.modifiedSuccessfully') })
// if the embedding is resumed from paused, we need to start the query status
if (isEmbeddingPaused) {
isStopQuery.current = false
startQueryStatus()
detailUpdate()
}
setIndexingStatusDetail(null)
}
else {
notify({ type: 'error', message: t('common.actionMsg.modifiedUnsuccessfully') })
}
}
return (
<>
<div className='py-12 px-16 flex flex-col gap-y-2'>
<div className='flex items-center gap-x-1 h-6'>
{isEmbedding && <RiLoader2Line className='h-4 w-4 text-text-secondary animate-spin' />}
<span className='grow text-text-secondary system-md-semibold-uppercase'>
{isEmbedding && t('datasetDocuments.embedding.processing')}
{isEmbeddingCompleted && t('datasetDocuments.embedding.completed')}
{isEmbeddingPaused && t('datasetDocuments.embedding.paused')}
{isEmbeddingError && t('datasetDocuments.embedding.error')}
</span>
{isEmbedding && (
<button
type='button'
className={`px-1.5 py-1 border-[0.5px] border-components-button-secondary-border bg-components-button-secondary-bg
shadow-xs shadow-shadow-shadow-3 backdrop-blur-[5px] flex items-center gap-x-1 rounded-md`}
onClick={handleSwitch}
>
<RiPauseCircleLine className='w-3.5 h-3.5 text-components-button-secondary-text' />
<span className='pr-[3px] text-components-button-secondary-text system-xs-medium'>
{t('datasetDocuments.embedding.pause')}
</span>
</button>
)}
{isEmbeddingPaused && (
<button
type='button'
className={`px-1.5 py-1 border-[0.5px] border-components-button-secondary-border bg-components-button-secondary-bg
shadow-xs shadow-shadow-shadow-3 backdrop-blur-[5px] flex items-center gap-x-1 rounded-md`}
onClick={handleSwitch}
>
<RiPlayCircleLine className='w-3.5 h-3.5 text-components-button-secondary-text' />
<span className='pr-[3px] text-components-button-secondary-text system-xs-medium'>
{t('datasetDocuments.embedding.resume')}
</span>
</button>
)}
</div>
{/* progress bar */}
<div className={cn(
'flex items-center w-full h-2 rounded-md border border-components-progress-bar-border overflow-hidden',
isEmbedding ? 'bg-components-progress-bar-bg bg-opacity-50' : 'bg-components-progress-bar-bg',
)}>
<div
className={cn(
'h-full',
(isEmbedding || isEmbeddingCompleted) && 'bg-components-progress-bar-progress-solid',
(isEmbeddingPaused || isEmbeddingError) && 'bg-components-progress-bar-progress-highlight',
)}
style={{ width: `${percent}%` }}
/>
</div>
<div className={'w-full flex items-center'}>
<span className='text-text-secondary system-xs-medium'>
{`${t('datasetDocuments.embedding.segments')} ${indexingStatusDetail?.completed_segments || '--'}/${indexingStatusDetail?.total_segments || '--'} · ${percent}%`}
</span>
</div>
<RuleDetail sourceData={ruleDetail} indexingType={indexingType} retrievalMethod={retrievalMethod} />
</div>
<EmbeddingSkeleton />
</>
)
}
export default React.memo(EmbeddingDetail)