mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-06-04 11:14:10 +08:00
refactor(file extractor): file extractor (#1059)
This commit is contained in:
parent
d3f8ea2df0
commit
d33a269548
@ -83,7 +83,7 @@ class FileApi(Resource):
|
|||||||
raise FileTooLargeError(message)
|
raise FileTooLargeError(message)
|
||||||
|
|
||||||
extension = file.filename.split('.')[-1]
|
extension = file.filename.split('.')[-1]
|
||||||
if extension not in ALLOWED_EXTENSIONS:
|
if extension.lower() not in ALLOWED_EXTENSIONS:
|
||||||
raise UnsupportedFileTypeError()
|
raise UnsupportedFileTypeError()
|
||||||
|
|
||||||
# user uuid as file name
|
# user uuid as file name
|
||||||
@ -136,7 +136,7 @@ class FilePreviewApi(Resource):
|
|||||||
|
|
||||||
# extract text from file
|
# extract text from file
|
||||||
extension = upload_file.extension
|
extension = upload_file.extension
|
||||||
if extension not in ALLOWED_EXTENSIONS:
|
if extension.lower() not in ALLOWED_EXTENSIONS:
|
||||||
raise UnsupportedFileTypeError()
|
raise UnsupportedFileTypeError()
|
||||||
|
|
||||||
text = FileExtractor.load(upload_file, return_text=True)
|
text = FileExtractor.load(upload_file, return_text=True)
|
||||||
|
@ -47,17 +47,18 @@ class FileExtractor:
|
|||||||
upload_file: Optional[UploadFile] = None) -> Union[List[Document] | str]:
|
upload_file: Optional[UploadFile] = None) -> Union[List[Document] | str]:
|
||||||
input_file = Path(file_path)
|
input_file = Path(file_path)
|
||||||
delimiter = '\n'
|
delimiter = '\n'
|
||||||
if input_file.suffix == '.xlsx':
|
file_extension = input_file.suffix.lower()
|
||||||
|
if file_extension == '.xlsx':
|
||||||
loader = ExcelLoader(file_path)
|
loader = ExcelLoader(file_path)
|
||||||
elif input_file.suffix == '.pdf':
|
elif file_extension == '.pdf':
|
||||||
loader = PdfLoader(file_path, upload_file=upload_file)
|
loader = PdfLoader(file_path, upload_file=upload_file)
|
||||||
elif input_file.suffix in ['.md', '.markdown']:
|
elif file_extension in ['.md', '.markdown']:
|
||||||
loader = MarkdownLoader(file_path, autodetect_encoding=True)
|
loader = MarkdownLoader(file_path, autodetect_encoding=True)
|
||||||
elif input_file.suffix in ['.htm', '.html']:
|
elif file_extension in ['.htm', '.html']:
|
||||||
loader = HTMLLoader(file_path)
|
loader = HTMLLoader(file_path)
|
||||||
elif input_file.suffix == '.docx':
|
elif file_extension == '.docx':
|
||||||
loader = Docx2txtLoader(file_path)
|
loader = Docx2txtLoader(file_path)
|
||||||
elif input_file.suffix == '.csv':
|
elif file_extension == '.csv':
|
||||||
loader = CSVLoader(file_path, autodetect_encoding=True)
|
loader = CSVLoader(file_path, autodetect_encoding=True)
|
||||||
else:
|
else:
|
||||||
# txt
|
# txt
|
||||||
|
@ -78,7 +78,7 @@ const FileUploader = ({
|
|||||||
const isValid = useCallback((file: File) => {
|
const isValid = useCallback((file: File) => {
|
||||||
const { size } = file
|
const { size } = file
|
||||||
const ext = `.${getFileType(file)}`
|
const ext = `.${getFileType(file)}`
|
||||||
const isValidType = ACCEPTS.includes(ext)
|
const isValidType = ACCEPTS.includes(ext.toLowerCase())
|
||||||
if (!isValidType)
|
if (!isValidType)
|
||||||
notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.typeError') })
|
notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.typeError') })
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user