Feat: support position retaining for text files. (#6231)

### What problem does this PR solve?

#5832

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu 2025-03-18 16:55:11 +08:00 committed by GitHub
parent 6e5cbd0196
commit a087d13ccb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -258,7 +258,7 @@ def tokenize(d, t, eng):
def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
res = []
# wrap up as es documents
for ck in chunks:
for ii, ck in enumerate(chunks):
if len(ck.strip()) == 0:
continue
logging.debug("-- {}".format(ck))
@@ -270,6 +270,8 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
ck = pdf_parser.remove_tag(ck)
except NotImplementedError:
pass
else:
add_positions(d, [[ii]*5])
tokenize(d, ck, eng)
res.append(d)
return res