mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-07-12 06:42:06 +08:00
Feat: text file support position retaining. (#6231)
### What problem does this PR solve?

#5832

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
parent
6e5cbd0196
commit
a087d13ccb
@ -258,7 +258,7 @@ def tokenize(d, t, eng):
|
|||||||
def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
|
def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
|
||||||
res = []
|
res = []
|
||||||
# wrap up as es documents
|
# wrap up as es documents
|
||||||
for ck in chunks:
|
for ii, ck in enumerate(chunks):
|
||||||
if len(ck.strip()) == 0:
|
if len(ck.strip()) == 0:
|
||||||
continue
|
continue
|
||||||
logging.debug("-- {}".format(ck))
|
logging.debug("-- {}".format(ck))
|
||||||
@ -270,6 +270,8 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
|
|||||||
ck = pdf_parser.remove_tag(ck)
|
ck = pdf_parser.remove_tag(ck)
|
||||||
except NotImplementedError:
|
except NotImplementedError:
|
||||||
pass
|
pass
|
||||||
|
else:
|
||||||
|
add_positions(d, [[ii]*5])
|
||||||
tokenize(d, ck, eng)
|
tokenize(d, ck, eng)
|
||||||
res.append(d)
|
res.append(d)
|
||||||
return res
|
return res
|
||||||
|
Loading…
x
Reference in New Issue
Block a user