Fix parser selection for pptx files that come from the file manager (#2482)

### What problem does this PR solve?

#2467

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Kevin Hu 2024-09-18 19:13:37 +08:00 committed by GitHub
parent 2b0dc01a88
commit 2324b88579
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 10 additions and 11 deletions

View File

@ -77,7 +77,7 @@ def convert():
doc = DocumentService.insert({
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"parser_id": FileService.get_parser(file.type, file.name, kb.parser_id),
"parser_config": kb.parser_config,
"created_by": current_user.id,
"type": file.type,
@ -85,7 +85,6 @@ def convert():
"location": file.location,
"size": file.size
})
FileService.set_constant_parser(doc, file.name)
file2document = File2DocumentService.insert({
"id": get_uuid(),
"file_id": id,

View File

@ -357,7 +357,7 @@ class FileService(CommonService):
doc = {
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"parser_id": self.get_parser(filetype, filename, kb.parser_id),
"parser_config": kb.parser_config,
"created_by": user_id,
"type": filetype,
@ -366,7 +366,6 @@ class FileService(CommonService):
"size": len(blob),
"thumbnail": thumbnail(filename, blob)
}
self.set_constant_parser(doc, filename)
DocumentService.insert(doc)
FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
@ -377,12 +376,13 @@ class FileService(CommonService):
return err, files
@staticmethod
def set_constant_parser(doc, filename):
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
if doc["type"] == FileType.AURAL:
doc["parser_id"] = ParserType.AUDIO.value
def get_parser(doc_type, filename, default):
if doc_type == FileType.VISUAL:
return ParserType.PICTURE.value
if doc_type == FileType.AURAL:
return ParserType.AUDIO.value
if re.search(r"\.(ppt|pptx|pages)$", filename):
doc["parser_id"] = ParserType.PRESENTATION.value
return ParserType.PRESENTATION.value
if re.search(r"\.(eml)$", filename):
doc["parser_id"] = ParserType.EMAIL.value
return ParserType.EMAIL.value
return default