refine for English corpus (#135)

2025-08-12 06:49:00 +08:00 · 2024-03-20 16:56:16 +08:00 · 2024-03-20 16:56:16 +08:00 · 6999598101
commit 6999598101
parent 78727c8809
12 changed files with 216 additions and 125 deletions
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@ -5,6 +5,27 @@ from io import BytesIO


 class HuExcelParser:
+    def html(self, fnm):
+        """Render each non-empty worksheet of a workbook as an HTML table.
+
+        `fnm` is either a path (str) or the raw bytes of the workbook.
+        For every sheet the sheet name becomes the <caption>, the first row
+        becomes the <th> header row and the remaining rows become <td> rows.
+        The tables are concatenated, each followed by a newline, and returned
+        as one string. Sheets without any row are skipped.
+        """
+        if isinstance(fnm, str):
+            wb = load_workbook(fnm)
+        else:
+            wb = load_workbook(BytesIO(fnm))
+
+        def cell_text(cell):
+            # Empty cells carry None; render them blank instead of "None".
+            return "" if cell.value is None else cell.value
+
+        # NOTE(review): sheet names and cell values are interpolated as-is;
+        # confirm whether downstream consumers expect them HTML-escaped.
+        tables = []
+        for sheetname in wb.sheetnames:
+            rows = list(wb[sheetname].rows)
+            # An empty sheet has no header row; rows[0] would raise IndexError.
+            if not rows:
+                continue
+            parts = [f"<table><caption>{sheetname}</caption><tr>"]
+            parts.extend(f"<th>{cell_text(t)}</th>" for t in rows[0])
+            parts.append("</tr>")
+            for r in rows[1:]:
+                parts.append("<tr>")
+                parts.extend(f"<td>{cell_text(c)}</td>" for c in r)
+                parts.append("</tr>")
+            parts.append("</table>\n")
+            tables.append("".join(parts))
+        return "".join(tables)
+
    def __call__(self, fnm):
        if isinstance(fnm, str):
            wb = load_workbook(fnm)
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@ -17,7 +17,6 @@ from rag.nlp import huqie
 from copy import deepcopy
 from huggingface_hub import hf_hub_download

-
 logging.getLogger("pdfminer").setLevel(logging.WARNING)


@ -25,7 +24,7 @@ class HuParser:
    def __init__(self):
        self.ocr = OCR()
        if hasattr(self, "model_speciess"):
-            self.layouter = LayoutRecognizer("layout."+self.model_speciess)
+            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()
@ -141,7 +140,7 @@ class HuParser:
            for j in range(i, -1, -1):
                # restore the order using th
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
-                        and arr[j + 1]["top"] < arr[j]["top"]\
+                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    tmp = arr[j]
                    arr[j] = arr[j + 1]
@ -278,8 +277,10 @@ class HuParser:

        for b in bxs:
            if not b["text"]:
-                left, right, top, bott = b["x0"]*ZM, b["x1"]*ZM, b["top"]*ZM, b["bottom"]*ZM
-                b["text"] = self.ocr.recognize(np.array(img), np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32))
+                left, right, top, bott = b["x0"] * ZM, b["x1"] * ZM, b["top"] * ZM, b["bottom"] * ZM
+                b["text"] = self.ocr.recognize(np.array(img),
+                                               np.array([[left, top], [right, top], [right, bott], [left, bott]],
+                                                        dtype=np.float32))
            del b["txt"]
        bxs = [b for b in bxs if b["text"]]
        if self.mean_height[-1] == 0:
@ -315,7 +316,8 @@ class HuParser:
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
-            if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]:
+            if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure",
+                                                                                                 "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
@ -376,9 +378,13 @@ class HuParser:
                b["page_number"] == b_["page_number"] and b_["top"] - \
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
-                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
+                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
-            if any(feats) and not any(concatting_feats):
+            # split features
+            detach_feats = [b["x1"] < b_["x0"],
+                            b["x0"] > b_["x1"]]
+            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
+                print(b["text"], b_["text"], any(feats), any(concatting_feats), any(detach_feats))
                i += 1
                continue
            # merge up and down
@ -503,18 +509,21 @@ class HuParser:
        findit = False
        i = 0
        while i < len(self.boxes):
-            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
+            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
+                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes): break
-            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
+            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
+                self.boxes[i]["text"].strip().split(" ")[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes): break
-                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
+                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
+                    self.boxes[i]["text"].strip().split(" ")[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix: break
            for j in range(i, min(i + 128, len(self.boxes))):
@ -522,13 +531,13 @@ class HuParser:
                    continue
                for k in range(i, j): self.boxes.pop(i)
                break
-        if findit:return
+        if findit: return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
-                page_dirty[b["page_number"]-1] += 1
-        page_dirty = set([i+1 for i, t in enumerate(page_dirty) if t > 3])
+                page_dirty[b["page_number"] - 1] += 1
+        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty: return
        i = 0
        while i < len(self.boxes):
@ -546,7 +555,7 @@ class HuParser:
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
-                self.boxes.pop(i+1)
+                self.boxes.pop(i + 1)
                continue

            if b["text"].strip()[0] != b_["text"].strip()[0] \
@ -574,8 +583,10 @@ class HuParser:
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                      "-" + str(self.boxes[i]["layoutno"])
-            if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
-                                                                                  "figure caption", "reference"]:
+            if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
+                                                                                                      "title",
+                                                                                                      "figure caption",
+                                                                                                      "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:： ]", self.boxes[i]["text"]):
@ -654,7 +665,7 @@ class HuParser:

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
-            #if min(tv, fv) > 2000:
+            # if min(tv, fv) > 2000:
            #    i += 1
            #    continue
            if tv < fv and tk:
@ -699,7 +710,7 @@ class HuParser:
                            "layoutno", "")))

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
-                poss.append((pn+self.page_from, left, right, top, bott))
+                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))
@ -738,7 +749,7 @@ class HuParser:
        for k, bxs in tables.items():
            if not bxs:
                continue
-            bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"]-b["top"])/2 for b in bxs]))
+            bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
@ -879,7 +890,8 @@ class HuParser:
            self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                enumerate(self.pdf.pages[page_from:page_to])]
-            self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
+            self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in
+                               self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
        except Exception as e:
            self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
@ -888,8 +900,8 @@ class HuParser:
            mat = fitz.Matrix(zoomin, zoomin)
            self.total_page = len(self.pdf)
            for i, page in enumerate(self.pdf):
-                if i < page_from:continue
-                if i >= page_to:break
+                if i < page_from: continue
+                if i >= page_to: break
                pix = page.get_pixmap(matrix=mat)
                img = Image.frombytes("RGB", [pix.width, pix.height],
                                      pix.samples)
@ -897,7 +909,9 @@ class HuParser:
                self.page_chars.append([])

        logging.info("Images converted.")
-        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars))]
+        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
+            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
+                           range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
            self.is_english = True
        else:
@ -927,11 +941,12 @@ class HuParser:
            #         self.page_cum_height.append(
            #             np.max([c["bottom"] for c in chars]))
            self.__ocr(i + 1, img, chars, zoomin)
-            if callback: callback(prog=(i+1)*0.6/len(self.page_images), msg="")
+            if callback: callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")

        if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
-            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
+            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
+                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))

        logging.info("Is it English:", self.is_english)

@ -964,12 +979,13 @@ class HuParser:
            if need_position: return None, None
            return

-        max_width = np.max([right-left for (_, left, right, _, _) in poss])
+        max_width = np.max([right - left for (_, left, right, _, _) in poss])
        GAP = 6
        pos = poss[0]
-        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3]-120), max(pos[3]-GAP, 0)))
+        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
        pos = poss[-1]
-        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+GAP), min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+120)))
+        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
+                     min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
@ -984,9 +1000,9 @@ class HuParser:
                    bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
-            if 0 < ii < len(poss)-1:
-                positions.append((pns[0]+self.page_from, left, right, top, min(
-                    bottom, self.page_images[pns[0]].size[1])/ZM))
+            if 0 < ii < len(poss) - 1:
+                positions.append((pns[0] + self.page_from, left, right, top, min(
+                    bottom, self.page_images[pns[0]].size[1]) / ZM))
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
@ -997,7 +1013,7 @@ class HuParser:
                                               ))
                )
                if 0 < ii < len(poss) - 1:
-                    positions.append((pn+self.page_from, left, right, 0, min(
+                    positions.append((pn + self.page_from, left, right, 0, min(
                        bottom, self.page_images[pn].size[1]) / ZM))
                bottom -= self.page_images[pn].size[1]

@ -1026,6 +1042,19 @@ class HuParser:
            return pic, positions
        return pic

+    def get_position(self, bx, ZM):
+        """Split a box's vertical span into one position tuple per page.
+
+        `bx` supplies "page_number", "top", "bottom", "x0" and "x1"; `ZM` is
+        the factor by which the page image heights are divided to compare
+        them with box coordinates. Returns a list of
+        (page_number, x0, x1, top, bottom) tuples: the first for the box's own
+        page and one more for every following page the box spills onto.
+        """
+        poss = []
+        pn = bx["page_number"]
+        # Box coordinates are offset by page_cum_height of the box's page.
+        top = bx["top"] - self.page_cum_height[pn - 1]
+        bott = bx["bottom"] - self.page_cum_height[pn - 1]
+        poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM)))
+        # Carry the overflow onto following pages, but never past the last
+        # page image: indexing page_images beyond it would raise IndexError.
+        while bott * ZM > self.page_images[pn - 1].size[1] and pn < len(self.page_images):
+            bott -= self.page_images[pn - 1].size[1] / ZM
+            top = 0
+            pn += 1
+            poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM)))
+        return poss
+

 if __name__ == "__main__":
    pass
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@ -30,19 +30,6 @@ class Pdf(PdfParser):
        #        print(b)
        print("OCR:", timer()-start)

-        def get_position(bx):
-            poss = []
-            pn = bx["page_number"]
-            top = bx["top"] - self.page_cum_height[pn - 1]
-            bott = bx["bottom"] - self.page_cum_height[pn - 1]
-            poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn-1].size[1]/zoomin)))
-            while bott * zoomin > self.page_images[pn - 1].size[1]:
-                bott -= self.page_images[pn- 1].size[1] / zoomin
-                top = 0
-                pn += 1
-                poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / zoomin)))
-            return poss
-
        def tag(pn, left, right, top, bottom):
            return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
                .format(pn, left, right, top, bottom)
@ -54,7 +41,7 @@ class Pdf(PdfParser):
        callback(0.67, "Table analysis finished.")
        self._text_merge()
        tbls = self._extract_table_figure(True, zoomin, True, True)
-        self._naive_vertical_merge()
+        self._concat_downward()
        self._filter_forpages()
        callback(0.68, "Text merging finished")

@ -74,7 +61,7 @@ class Pdf(PdfParser):
            sec_ids.append(sid)
            #print(lvl, self.boxes[i]["text"], most_level)

-        sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
+        sections = [(b["text"], sec_ids[i], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
        for (img, rows), poss in tbls:
            sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss]))

--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -14,7 +14,7 @@ import copy
 import re
 from rag.app import laws
 from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions
-from deepdoc.parser import PdfParser
+from deepdoc.parser import PdfParser, ExcelParser
 from rag.settings import cron_logger


@ -74,6 +74,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        sections, tbls = pdf_parser(filename if not binary else binary,
                              from_page=from_page, to_page=to_page, callback=callback)
        res = tokenize_table(tbls, doc, eng)
+    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        excel_parser = ExcelParser()
+        sections = [(excel_parser.html(binary), "")]
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@ -15,7 +15,7 @@ import re
 from collections import Counter

 from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table, add_positions
+from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency
 from deepdoc.parser import PdfParser
 import numpy as np
 from rag.utils import num_tokens_from_string
@ -46,11 +46,11 @@ class Pdf(PdfParser):
        self._table_transformer_job(zoomin)
        callback(0.68, "Table analysis finished")
        self._text_merge()
+        tbls = self._extract_table_figure(True, zoomin, True, True)
        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
-        self._concat_downward(concat_between_pages=False)
+        self._concat_downward()
        self._filter_forpages()
        callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin, True, True)

        # clean mess
        if column_width < self.page_images[0].size[0] / zoomin / 2:
@ -59,24 +59,24 @@ class Pdf(PdfParser):
            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
        for b in self.boxes:
            b["text"] = re.sub(r"([\t 　]|\u3000){2,}", " ", b["text"].strip())
-        freq = Counter([b["text"] for b in self.boxes])
-        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
-        i = 0
-        while i < len(self.boxes):
-            if self.boxes[i]["text"] in garbage \
-                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
-                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
-                self.boxes.pop(i)
-            elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno",
-                                                                                                         '1'):
-                # merge within same layouts
-                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
-                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
-                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
-                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
-                self.boxes.pop(i)
-            else:
-                i += 1
+        # freq = Counter([b["text"] for b in self.boxes])
+        # garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
+        # i = 0
+        # while i < len(self.boxes):
+        #     if self.boxes[i]["text"] in garbage \
+        #             or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
+        #             or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
+        #         self.boxes.pop(i)
+        #     elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno",
+        #                                                                                                  '1'):
+        #         # merge within same layouts
+        #         self.boxes[i + 1]["top"] = self.boxes[i]["top"]
+        #         self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
+        #         self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
+        #         self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
+        #         self.boxes.pop(i)
+        #     else:
+        #         i += 1

        def _begin(txt):
            return re.match(
@ -88,7 +88,7 @@ class Pdf(PdfParser):
                "title":"",
                "authors": "",
                "abstract": "",
-                "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
+                "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
                          re.match(r"(text|title)", b.get("layoutno", "text"))],
                "tables": tbls
            }
@ -119,11 +119,10 @@ class Pdf(PdfParser):
            if re.match("(abstract|摘要)", txt):
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    abstr = txt + self._line_tag(b, zoomin)
-                    i += 1
                    break
-                txt = self.boxes[i + 1]["text"].lower().strip()
+                txt = self.boxes[i]["text"].lower().strip()
                if len(txt.split(" ")) > 32 or len(txt) > 64:
-                    abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
+                    abstr = txt + self._line_tag(self.boxes[i], zoomin)
                i += 1
                break
        if not abstr: i = 0
@ -136,7 +135,7 @@ class Pdf(PdfParser):
            "title": title if title else filename,
            "authors": " ".join(authors),
            "abstract": abstr,
-            "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
+            "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
                      re.match(r"(text|title)", b.get("layoutno", "text"))],
            "tables": tbls
        }
@ -153,7 +152,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        paper = pdf_parser(filename if not binary else binary,
                           from_page=from_page, to_page=to_page, callback=callback)
    else: raise NotImplementedError("file type not supported yet(pdf supported)")
-    doc = {"docnm_kwd": filename, "authors_tks": paper["authors"],
+
+    doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
           "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
@ -173,6 +173,38 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        tokenize(d, txt, eng)
        res.append(d)

+    sorted_sections = paper["sections"]
+    # set pivot using the most frequent type of title,
+    # then merge between 2 pivot
+    bull = bullets_category([txt for txt, _ in sorted_sections])
+    most_level, levels = title_frequency(bull, sorted_sections)
+    assert len(sorted_sections) == len(levels)
+    sec_ids = []
+    sid = 0
+    for i, lvl in enumerate(levels):
+        if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1
+        sec_ids.append(sid)
+        print(lvl, sorted_sections[i][0], most_level, sid)
+
+    chunks = []
+    last_sid = -2
+    for (txt, _), sec_id in zip(sorted_sections, sec_ids):
+        if sec_id == last_sid:
+            if chunks:
+                chunks[-1] += "\n" + txt
+                continue
+        chunks.append(txt)
+        last_sid = sec_id
+    for txt in chunks:
+        d = copy.deepcopy(doc)
+        d["image"], poss = pdf_parser.crop(txt, need_position=True)
+        add_positions(d, poss)
+        tokenize(d, pdf_parser.remove_tag(txt), eng)
+        res.append(d)
+        print("----------------------\n", pdf_parser.remove_tag(txt))
+
+    return res
+
    readed = [0] * len(paper["lines"])
    # find colon firstly
    i = 0
@ -252,6 +284,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca

 if __name__ == "__main__":
    import sys
-    def dummy(a, b):
+    def dummy(prog=None, msg=""):
        pass
    chunk(sys.argv[1], callback=dummy)
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@ -16,7 +16,7 @@ from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
 from rag.nlp import is_english, random_choices
-from rag.nlp import huqie, stemmer
+from rag.nlp import huqie
 from deepdoc.parser import ExcelParser


@ -73,12 +73,8 @@ def beAdoc(d, q, a, eng):
    aprefix = "Answer: " if eng else "回答："
    d["content_with_weight"] = "\t".join(
        [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
-    if eng:
-        d["content_ltks"] = " ".join([stemmer.stem(w)
-                                     for w in word_tokenize(q)])
-    else:
-        d["content_ltks"] = huqie.qie(q)
-        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    d["content_ltks"] = huqie.qie(q)
+    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
    return d


--- a/rag/app/table.py
+++ b/rag/app/table.py
@ -74,9 +74,9 @@ def trans_datatime(s):

 def trans_bool(s):
+    """Map a truthy/falsy marker to the string "yes" or "no".
+
+    `s` is stringified and stripped, then matched case-insensitively against
+    the literal markers listed in the two patterns below. When neither
+    pattern matches the function falls through and returns None.
+    """
    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
-        return ["yes", "是"]
+        return "yes"
    if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
-        return ["no", "否"]
+        return "no"


 def column_data_type(arr):
@ -92,7 +92,7 @@ def column_data_type(arr):
            counts["int"] += 1
        elif re.match(r"[+-]?[0-9.]+$", str(a).replace("%%", "")):
            counts["float"] += 1
-        elif re.match(r"(true|false|yes|no|是|否)$", str(a), flags=re.IGNORECASE):
+        elif re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√|false|no|否|⍻|×)$", str(a), flags=re.IGNORECASE):
            counts["bool"] += 1
        elif trans_datatime(str(a)):
            counts["datetime"] += 1
--- a/rag/nlp/init.py
+++ b/rag/nlp/init.py
@ -3,14 +3,9 @@ from collections import Counter

 from rag.utils import num_tokens_from_string
 from . import huqie
-from nltk import word_tokenize
 import re
 import copy

-from nltk.stem import PorterStemmer
-
-stemmer = PorterStemmer()
-

 BULLET_PATTERN = [[
    r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
@ -77,13 +72,8 @@ def is_english(texts):
 def tokenize(d, t, eng):
+    """Fill `d` with the raw text and its token fields.
+
+    Stores `t` unchanged under "content_with_weight", replaces
+    table/td/caption/tr/th tags in it with spaces, then stores the result of
+    `huqie.qie` under "content_ltks" and `huqie.qieqie` of that under
+    "content_sm_ltks". After this change `eng` is no longer read here.
+    """
    d["content_with_weight"] = t
    t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
-    if eng:
-        t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
-        d["content_ltks"] = " ".join([stemmer.stem(w)
-                                     for w in word_tokenize(t)])
-    else:
-        d["content_ltks"] = huqie.qie(t)
-        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    d["content_ltks"] = huqie.qie(t)
+    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])


 def tokenize_table(tbls, doc, eng, batch_size=10):
@ -94,8 +84,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
            continue
        if isinstance(rows, str):
            d = copy.deepcopy(doc)
-            r = re.sub(r"<[^<>]{,12}>", "", rows)
-            tokenize(d, r, eng)
+            tokenize(d, rows, eng)
            d["content_with_weight"] = rows
            d["image"] = img
            add_positions(d, poss)
--- a/rag/nlp/huqie.py
+++ b/rag/nlp/huqie.py
@ -8,7 +8,8 @@ import re
 import string
 import sys
 from hanziconv import HanziConv
-
+from nltk import word_tokenize
+from nltk.stem import PorterStemmer, WordNetLemmatizer
 from api.utils.file_utils import get_project_base_directory


@ -45,6 +46,9 @@ class Huqie:
        self.trie_ = datrie.Trie(string.printable)
        self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")

+        self.stemmer = PorterStemmer()
+        self.lemmatizer = WordNetLemmatizer()
+
        self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》，。？、；‘’：“”【】~！￥%……（）——-]+|[a-z\.-]+|[0-9,\.-]+)"
        try:
            self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
@ -239,6 +243,10 @@ class Huqie:
    def qie(self, line):
        line = self._strQ2B(line).lower()
        line = self._tradi2simp(line)
+        zh_num = len([1 for c in line if is_chinese(c)])
+        if zh_num < len(line) * 0.2:
+            return " ".join([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(line)])
+
        arr = re.split(self.SPLIT_CHAR, line)
        res = []
        for L in arr:
@ -290,8 +298,12 @@ class Huqie:
        return self.merge_(res)

    def qieqie(self, tks):
+        tks = tks.split(" ")
+        zh_num = len([1 for c in tks if c and is_chinese(c[0])])
+        if zh_num < len(tks) * 0.2:return " ".join(tks)
+
        res = []
-        for tk in tks.split(" "):
+        for tk in tks:
            if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk):
                res.append(tk)
                continue
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@ -4,8 +4,8 @@ import json
 import re
 import logging
 import copy
-import math
-from elasticsearch_dsl import Q, Search
+from elasticsearch_dsl import Q
+
 from rag.nlp import huqie, term_weight, synonym


@ -33,12 +33,14 @@ class EsQueryer:

    @staticmethod
    def rmWWW(txt):
-        txt = re.sub(
-            r"是*(什么样的|哪家|那家|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀)是*",
-            "",
-            txt)
-        return re.sub(
-            r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was|to)*", "", txt, re.IGNORECASE)
+        """Strip question words and filler words from a query string.
+
+        Applies three case-insensitive substitutions in order: Chinese
+        interrogatives/particles are removed, English question words
+        (optionally with 's / 're) are replaced by a space, and English
+        filler words are replaced by a space. Returns the rewritten text.
+        """
+        patts = [
+            (r"是*(什么样的|哪家|那家|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀)是*", ""),
+            (r"(^| )(what|who|how|which|where|why)('re|'s)? ", " "),
+            # The lookahead requires the filler word to end at a space or at
+            # the end of the text; without it the pattern eats the first
+            # letters of ordinary words (" apple" -> " pple").
+            (r"(^| )('s|'re|is|are|were|was|do|does|did|don't|doesn't|didn't|has|have|be|there|you|me|your|my|mine|just|please|may|i|should|would|wouldn't|will|won't|done|go|for|with|so|the|a|an|by|i'm|it's|he's|she's|they|they're|you're|as|on|in|at|up|out|down)(?= |$)", " ")
+        ]
+        for r, p in patts:
+            txt = re.sub(r, p, txt, flags=re.IGNORECASE)
+        return txt

    def question(self, txt, tbl="qa", min_match="60%"):
        txt = re.sub(
@ -50,7 +52,7 @@ class EsQueryer:
        txt = EsQueryer.rmWWW(txt)

        if not self.isChinese(txt):
-            tks = [t for t in txt.split(" ") if t.strip()]
+            tks = huqie.qie(txt).split(" ")
            q = tks
            for i in range(1, len(tks)):
                q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
@ -58,9 +60,9 @@ class EsQueryer:
                q.append(txt)
            return Q("bool",
                     must=Q("query_string", fields=self.flds,
-                            type="best_fields", query=" OR ".join(q),
+                            type="best_fields", query=" ".join(q),
                            boost=1, minimum_should_match=min_match)
-                     ), txt.split(" ")
+                     ), tks

        def needQieqie(tk):
            if len(tk) < 4:
@ -160,8 +162,8 @@ class EsQueryer:
                s += v# * dtwt[k]
        q = 1e-9
        for k, v in qtwt.items():
-            q += v * v
-        d = 1e-9
-        for k, v in dtwt.items():
-            d += v * v
-        return s / q#math.sqrt(q) / math.sqrt(d)
+            q += v #* v
+        #d = 1e-9
+        #for k, v in dtwt.items():
+        #    d += v * v
+        return s / q #math.sqrt(q) / math.sqrt(d)
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@ -196,7 +196,24 @@ class Dealer:
    def insert_citations(self, answer, chunks, chunk_v,
                         embd_mdl, tkweight=0.7, vtweight=0.3):
        assert len(chunks) == len(chunk_v)
-        pieces = re.split(r"([；。？!！\n]|[a-z][.?;!][ \n])", answer)
+        pieces = re.split(r"(```)", answer)
+        if len(pieces) >= 3:
+            i = 0
+            pieces_ = []
+            while i < len(pieces):
+                if pieces[i] == "```":
+                    st = i
+                    i += 1
+                    while i<len(pieces) and pieces[i] != "```":
+                        i += 1
+                    if i < len(pieces): i += 1
+                    pieces_.append("".join(pieces[st: i])+"\n")
+                else:
+                    pieces_.extend(re.split(r"([^\|][；。？!！\n]|[a-z][.?;!][ \n])", pieces[i]))
+                    i += 1
+            pieces = pieces_
+        else:
+            pieces = re.split(r"([^\|][；。？!！\n]|[a-z][.?;!][ \n])", answer)
        for i in range(1, len(pieces)):
            if re.match(r"[a-z][.?;!][ \n]", pieces[i]):
                pieces[i - 1] += pieces[i][0]
@ -226,7 +243,7 @@ class Dealer:
                                                            chunks_tks,
                                                            tkweight, vtweight)
            mx = np.max(sim) * 0.99
-            if mx < 0.66:
+            if mx < 0.7:
                continue
            cites[idx[i]] = list(
                set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
@ -249,6 +266,7 @@ class Dealer:

    def rerank(self, sres, query, tkweight=0.3,
               vtweight=0.7, cfield="content_ltks"):
+        _, keywords = self.qryr.question(query)
        ins_embd = [
            Dealer.trans2floats(
                sres.field[i].get("q_%d_vec" % len(sres.query_vector), "\t".join(["0"] * len(sres.query_vector)))) for i in sres.ids]
@ -258,8 +276,7 @@ class Dealer:
                  for i in sres.ids]
        sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
                                                        ins_embd,
-                                                        huqie.qie(
-                                                            query).split(" "),
+                                                        keywords,
                                                        ins_tw, tkweight, vtweight)
        return sim, tksim, vtsim

--- a/rag/svr/task_broker.py
+++ b/rag/svr/task_broker.py
@ -82,12 +82,14 @@ def dispatch():
        tsks = []
        if r["type"] == FileType.PDF.value:
            pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
+            page_size = 5
+            if r["parser_id"] == "paper": page_size = 12
            for s,e in r["parser_config"].get("pages", [(0,100000)]):
                e = min(e, pages)
-                for p in range(s, e, 5):
+                for p in range(s, e, page_size):
                    task = new_task()
                    task["from_page"] = p
-                    task["to_page"] = min(p + 5, e)
+                    task["to_page"] = min(p + page_size, e)
                    tsks.append(task)
        elif r["parser_id"] == "table":
                rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"]))