Fix: order chunks from docx by positions. (#7979)

### What problem does this PR solve?

Resolves #7934: chunks extracted from docx files are now ordered by their positions.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Kevin Hu 2025-05-30 17:20:53 +08:00 committed by GitHub
parent 9f38b22a3f
commit 93f5df716f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -279,12 +279,13 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
def tokenize_chunks_with_images(chunks, doc, eng, images):
    """Build one tokenized document per non-blank chunk, paired with its image.

    Args:
        chunks: Text chunks. Entries that are empty or whitespace-only are
            skipped.
        doc: Base document. It is deep-copied for every emitted chunk, and
            only the copy is handed to ``add_positions`` and ``tokenize``.
        eng: Passed through unchanged to ``tokenize``.
        images: Images iterated in lockstep with ``chunks``; each one is
            stored under the ``"image"`` key of its chunk's document.

    Returns:
        A list holding one document per non-blank chunk, in input order.
    """
    res = []
    # Wrap up as es documents. zip() stops at the shorter input, so any
    # surplus chunks or images are silently ignored.
    for ii, (ck, image) in enumerate(zip(chunks, images)):
        if not ck.strip():
            continue
        logging.debug("-- %s", ck)
        d = copy.deepcopy(doc)
        d["image"] = image
        # The chunk's index in the input (blank chunks still consume an
        # index) is recorded as a synthetic position so chunks can later be
        # ordered by position. NOTE(review): the five-slot layout is dictated
        # by add_positions, which is defined elsewhere -- confirm the meaning
        # of each slot there.
        add_positions(d, [[ii] * 5])
        tokenize(d, ck, eng)
        res.append(d)
    return res