Fix too-long query exception (#1195)

### What problem does this PR solve?

#1161

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-08-12 06:38:59 +08:00 · 2024-06-18 09:50:59 +08:00 · 2024-06-18 09:50:59 +08:00 · e35f7610e7
commit e35f7610e7
parent 7920a5c78d
4 changed files with 19 additions and 14 deletions
--- a/deepdoc/parser/docx_parser.py
+++ b/deepdoc/parser/docx_parser.py
@ -113,19 +113,24 @@ class RAGFlowDocxParser:
    def __call__(self, fnm, from_page=0, to_page=100000):
        self.doc = Document(fnm) if isinstance(
            fnm, str) else Document(BytesIO(fnm))
-        pn = 0
-        secs = []
+        pn = 0 # parsed page
+        secs = [] # parsed contents
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
-            if from_page <= pn < to_page and p.text.strip():
-                secs.append((p.text, p.style.name))
+
+            runs_within_single_paragraph = [] # save runs within the range of pages
            for run in p.runs:
-                if 'lastRenderedPageBreak' in run._element.xml:
-                    pn += 1
-                    continue
-                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                if pn > to_page:
+                    break
+                if from_page <= pn < to_page and p.text.strip():
+                    runs_within_single_paragraph.append(run.text) # append run.text first
+
+                # wrap page break checker into a static method
+                if RAGFlowDocxParser.has_page_break(run._element.xml):
                    pn += 1

+            secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph
+
        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@ -145,6 +145,7 @@ def beAdoc(d, q, a, eng):
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    return d

+
 def mdQuestionLevel(s):
    match = re.match(r'#*', s)
    return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
@ -244,7 +245,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                        break
                    txt += l
        lines = txt.split("\n")
-        comma, tab = 0, 0
        last_question, last_answer = "", ""
        question_stack, level_stack = [], []
        code_block = False
@ -262,7 +262,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                last_answer = f'{last_answer}\n{l}'
            else:   # is a question
                if last_answer:
-                    sum_question = ('\n').join(question_stack)
+                    sum_question = '\n'.join(question_stack)
                    if sum_question:
                        res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
                    last_answer = ''
@ -274,12 +274,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                question_stack.append(question)
                level_stack.append(question_level)
        if last_answer:
-            sum_question = ('\n').join(question_stack)
+            sum_question = '\n'.join(question_stack)
            if sum_question:
                res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
        return res

-
    raise NotImplementedError(
        "Excel, csv(txt), pdf and markdown format files are supported.")

--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@ -110,6 +110,7 @@ class EsQueryer:
                    sm = []

                keywords.append(re.sub(r"[ \\\"']+", "", tk))
+                if len(keywords) >= 12: break

                tk_syns = self.syn.lookup(tk)
                tk = EsQueryer.subSpecialChar(tk)
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@ -98,7 +98,7 @@ class Dealer:
        if not qst:
            if not req.get("sort"):
                s = s.sort(
-                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
+                    #{"create_time": {"order": "desc", "unmapped_type": "date"}},
                    {"create_timestamp_flt": {
                        "order": "desc", "unmapped_type": "float"}}
                )
@ -108,7 +108,7 @@ class Dealer:
                                      "mode": "avg", "numeric_type": "double"}},
                    {"top_int": {"order": "asc", "unmapped_type": "float",
                                 "mode": "avg", "numeric_type": "double"}},
-                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
+                    #{"create_time": {"order": "desc", "unmapped_type": "date"}},
                    {"create_timestamp_flt": {
                        "order": "desc", "unmapped_type": "float"}}
                )