def tokenize_chunks_with_images(chunks, doc, eng, images):
    """Build one document per non-blank chunk, each paired with its image.

    ``chunks`` and ``images`` are walked in lockstep with ``zip``, so
    iteration stops at the shorter of the two. Chunks that are empty after
    ``strip()`` are skipped, but they still consume an index, so the index
    recorded for a document is the chunk's position in the zipped input,
    not its position in the returned list.

    Args:
        chunks: Iterable of chunk texts (each must support ``strip()``).
        doc: Template document; deep-copied for every emitted chunk so the
            caller's object is never mutated here.
        eng: Passed through unchanged to ``tokenize``.
        images: Iterable of images, aligned one-to-one with ``chunks``.

    Returns:
        A list of the per-chunk document copies, in input order.
    """
    documents = []
    # Wrap each chunk up as an ES document.
    for index, pair in enumerate(zip(chunks, images)):
        text, picture = pair
        if not text.strip():
            # Blank chunk: nothing to store, but its index stays consumed.
            continue
        logging.debug("-- {}".format(text))
        entry = copy.deepcopy(doc)
        entry["image"] = picture
        # One position record holding the chunk index five times.
        # NOTE(review): presumably add_positions expects a 5-field record
        # per position and the index stands in for all of them here —
        # confirm against add_positions.
        add_positions(entry, [[index] * 5])
        tokenize(entry, text, eng)
        documents.append(entry)
    return documents