ragflow/rag/parser/__init__.py

225 lines
6.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import copy
import random
from .pdf_parser import HuParser as PdfParser
from .docx_parser import HuDocxParser as DocxParser
from .excel_parser import HuExcelParser as ExcelParser
import re
from nltk import word_tokenize
from rag.nlp import stemmer, huqie
from ..utils import num_tokens_from_string
BULLET_PATTERN = [[
r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
r"第[零一二三四五六七八九十百0-9]+章",
r"第[零一二三四五六七八九十百0-9]+节",
r"第[零一二三四五六七八九十百0-9]+条",
r"[\(][零一二三四五六七八九十百]+[\)]",
], [
r"第[0-9]+章",
r"第[0-9]+节",
r"[0-9]{,3}[\. 、]",
r"[0-9]{,2}\.[0-9]{,2}",
r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
r"第[零一二三四五六七八九十百0-9]+章",
r"第[零一二三四五六七八九十百0-9]+节",
r"[零一二三四五六七八九十百]+[ 、]",
r"[\(][零一二三四五六七八九十百]+[\)]",
r"[\(][0-9]{,2}[\)]",
], [
r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
r"Chapter (I+V?|VI*|XI|IX|X)",
r"Section [0-9]+",
r"Article [0-9]+"
]
]
def random_choices(arr, k):
k = min(len(arr), k)
return random.choices(arr, k=k)
def bullets_category(sections):
global BULLET_PATTERN
hits = [0] * len(BULLET_PATTERN)
for i, pro in enumerate(BULLET_PATTERN):
for sec in sections:
for p in pro:
if re.match(p, sec):
hits[i] += 1
break
maxium = 0
res = -1
for i, h in enumerate(hits):
if h <= maxium: continue
res = i
maxium = h
return res
def is_english(texts):
eng = 0
for t in texts:
if re.match(r"[a-zA-Z]{2,}", t.strip()):
eng += 1
if eng / len(texts) > 0.8:
return True
return False
def tokenize(d, t, eng):
d["content_with_weight"] = t
if eng:
t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
else:
d["content_ltks"] = huqie.qie(t)
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
def remove_contents_table(sections, eng=False):
i = 0
while i < len(sections):
def get(i):
nonlocal sections
return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
i += 1
continue
sections.pop(i)
if i >= len(sections): break
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
while not prefix:
sections.pop(i)
if i >= len(sections): break
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
sections.pop(i)
if i >= len(sections) or not prefix: break
for j in range(i, min(i + 128, len(sections))):
if not re.match(prefix, get(j)):
continue
for _ in range(i, j): sections.pop(i)
break
def make_colon_as_title(sections):
if not sections: return []
if type(sections[0]) == type(""): return sections
i = 0
while i < len(sections):
txt, layout = sections[i]
i += 1
txt = txt.split("@")[0].strip()
if not txt:
continue
if txt[-1] not in ":":
continue
txt = txt[::-1]
arr = re.split(r"([。?!!?;]| .)", txt)
if len(arr) < 2 or len(arr[1]) < 32:
continue
sections.insert(i - 1, (arr[0][::-1], "title"))
i += 1
def hierarchical_merge(bull, sections, depth):
if not sections or bull < 0: return []
if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
sections = [(t,o) for t, o in sections if t and len(t.split("@")[0].strip()) > 1 and not re.match(r"[0-9]+$", t.split("@")[0].strip())]
bullets_size = len(BULLET_PATTERN[bull])
levels = [[] for _ in range(bullets_size + 2)]
def not_title(txt):
if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt): return False
if len(txt) >= 128: return True
return re.search(r"[,;,。;!!]", txt)
for i, (txt, layout) in enumerate(sections):
for j, p in enumerate(BULLET_PATTERN[bull]):
if re.match(p, txt.strip()) and not not_title(txt):
levels[j].append(i)
break
else:
if re.search(r"(title|head)", layout):
levels[bullets_size].append(i)
else:
levels[bullets_size + 1].append(i)
sections = [t for t, _ in sections]
for s in sections: print("--", s)
def binary_search(arr, target):
if not arr: return -1
if target > arr[-1]: return len(arr) - 1
if target < arr[0]: return -1
s, e = 0, len(arr)
while e - s > 1:
i = (e + s) // 2
if target > arr[i]:
s = i
continue
elif target < arr[i]:
e = i
continue
else:
assert False
return s
cks = []
readed = [False] * len(sections)
levels = levels[::-1]
for i, arr in enumerate(levels[:depth]):
for j in arr:
if readed[j]: continue
readed[j] = True
cks.append([j])
if i + 1 == len(levels) - 1: continue
for ii in range(i + 1, len(levels)):
jj = binary_search(levels[ii], j)
if jj < 0: continue
if jj > cks[-1][-1]: cks[-1].pop(-1)
cks[-1].append(levels[ii][jj])
for ii in cks[-1]: readed[ii] = True
for i in range(len(cks)):
cks[i] = [sections[j] for j in cks[i][::-1]]
print("--------------\n", "\n* ".join(cks[i]))
return cks
def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
if not sections: return []
if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
cks = [""]
tk_nums = [0]
def add_chunk(t, pos):
nonlocal cks, tk_nums, delimiter
tnum = num_tokens_from_string(t)
if tnum < 8: pos = ""
if tk_nums[-1] > chunk_token_num:
cks.append(t + pos)
tk_nums.append(tnum)
else:
cks[-1] += t + pos
tk_nums[-1] += tnum
for sec, pos in sections:
s, e = 0, 1
while e < len(sec):
if sec[e] in delimiter:
add_chunk(sec[s: e+1], pos)
s = e + 1
e = s + 1
else:
e += 1
if s < e: add_chunk(sec[s: e], pos)
return cks