mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-04-23 22:50:17 +08:00
Add .doc
file parser. (#497)
### What problem does this PR solve? Add `.doc` file parser, using tika. ``` pip install tika ``` ``` from tika import parser from io import BytesIO def extract_text_from_doc_bytes(doc_bytes): file_like_object = BytesIO(doc_bytes) parsed = parser.from_buffer(file_like_object) return parsed["content"] ``` ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: chrysanthemum-boy <fannc@qq.com>
This commit is contained in:
parent
0dfc8ddc0f
commit
72384b191d
@ -147,7 +147,7 @@ def filename_type(filename):
|
||||
return FileType.PDF.value
|
||||
|
||||
if re.match(
|
||||
r".*\.(docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
|
||||
r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
|
||||
return FileType.DOC.value
|
||||
|
||||
if re.match(
|
||||
|
@ -11,6 +11,7 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
import copy
|
||||
from tika import parser
|
||||
import re
|
||||
from io import BytesIO
|
||||
|
||||
@ -103,9 +104,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
random_choices([t for t, _ in sections], k=200)))
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.doc$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
binary = BytesIO(binary)
|
||||
doc_parsed = parser.from_buffer(binary)
|
||||
sections = doc_parsed['content'].split('\n')
|
||||
sections = [(l, "") for l in sections if l]
|
||||
remove_contents_table(sections, eng=is_english(
|
||||
random_choices([t for t, _ in sections], k=200)))
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"file type not supported yet(docx, pdf, txt supported)")
|
||||
"file type not supported yet(doc, docx, pdf, txt supported)")
|
||||
|
||||
make_colon_as_title(sections)
|
||||
bull = bullets_category(
|
||||
|
@ -11,6 +11,7 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
import copy
|
||||
from tika import parser
|
||||
import re
|
||||
from io import BytesIO
|
||||
from docx import Document
|
||||
@ -123,9 +124,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
sections = txt.split("\n")
|
||||
sections = [l for l in sections if l]
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.doc$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
binary = BytesIO(binary)
|
||||
doc_parsed = parser.from_buffer(binary)
|
||||
sections = doc_parsed['content'].split('\n')
|
||||
sections = [l for l in sections if l]
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"file type not supported yet(docx, pdf, txt supported)")
|
||||
"file type not supported yet(doc, docx, pdf, txt supported)")
|
||||
|
||||
# is it English
|
||||
eng = lang.lower() == "english" # is_english(sections)
|
||||
|
@ -10,6 +10,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from tika import parser
|
||||
from io import BytesIO
|
||||
from docx import Document
|
||||
import re
|
||||
@ -154,9 +155,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
sections = [(l, "") for l in sections if l]
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.doc$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
binary = BytesIO(binary)
|
||||
doc_parsed = parser.from_buffer(binary)
|
||||
sections = doc_parsed['content'].split('\n')
|
||||
sections = [(l, "") for l in sections if l]
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"file type not supported yet(docx, pdf, txt supported)")
|
||||
"file type not supported yet(doc, docx, pdf, txt supported)")
|
||||
|
||||
chunks = naive_merge(
|
||||
sections, parser_config.get(
|
||||
|
@ -10,6 +10,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from tika import parser
|
||||
from io import BytesIO
|
||||
import re
|
||||
from rag.app import laws
|
||||
from rag.nlp import huqie, tokenize, find_codec
|
||||
@ -95,9 +97,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
sections = [s for s in sections if s]
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.doc$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
binary = BytesIO(binary)
|
||||
doc_parsed = parser.from_buffer(binary)
|
||||
sections = doc_parsed['content'].split('\n')
|
||||
sections = [l for l in sections if l]
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"file type not supported yet(docx, pdf, txt supported)")
|
||||
"file type not supported yet(doc, docx, pdf, txt supported)")
|
||||
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
|
@ -116,6 +116,7 @@ sniffio==1.3.1
|
||||
StrEnum==0.4.15
|
||||
sympy==1.12
|
||||
threadpoolctl==3.3.0
|
||||
tika==2.6.0
|
||||
tiktoken==0.6.0
|
||||
tokenizers==0.15.2
|
||||
torch==2.2.1
|
||||
@ -133,4 +134,4 @@ xxhash==3.4.1
|
||||
yarl==1.9.4
|
||||
zhipuai==2.0.1
|
||||
BCEmbedding
|
||||
loguru==0.7.2
|
||||
loguru==0.7.2
|
Loading…
x
Reference in New Issue
Block a user