Fix the exception raised by overly long queries (#1195)

### What problem does this PR solve?

#1161 
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
KevinHuSh 2024-06-18 09:50:59 +08:00 committed by GitHub
parent 7920a5c78d
commit e35f7610e7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 19 additions and 14 deletions

View File

@ -113,19 +113,24 @@ class RAGFlowDocxParser:
def __call__(self, fnm, from_page=0, to_page=100000):
    """Parse a .docx file into paragraph sections and tables.

    Args:
        fnm: path to a .docx file (str) or the raw file bytes.
        from_page: first page (inclusive) whose text is collected.
        to_page: page bound; paragraphs past this page are skipped.

    Returns:
        (secs, tbls): secs is a list of (paragraph_text, style_name)
        tuples for paragraphs within [from_page, to_page); tbls is the
        extracted content of every table in the document.
    """
    # Accept either a filesystem path or in-memory bytes.
    self.doc = Document(fnm) if isinstance(
        fnm, str) else Document(BytesIO(fnm))
    pn = 0  # current page number while walking rendered page breaks
    secs = []  # collected (text, style) sections
    for p in self.doc.paragraphs:
        if pn > to_page:
            break
        runs_within_single_paragraph = []  # run texts within the page range
        for run in p.runs:
            if pn > to_page:
                break
            if from_page <= pn < to_page and p.text.strip():
                # Append run.text first; the paragraph is joined afterwards.
                runs_within_single_paragraph.append(run.text)
            # Page-break detection is wrapped in a static helper so that
            # both lastRenderedPageBreak and explicit w:br page breaks
            # are handled in one place.
            if RAGFlowDocxParser.has_page_break(run._element.xml):
                pn += 1
        # Concatenate the collected run texts as this paragraph's text.
        secs.append(("".join(runs_within_single_paragraph), p.style.name))
    tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
    return secs, tbls

View File

@ -145,6 +145,7 @@ def beAdoc(d, q, a, eng):
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
return d return d
def mdQuestionLevel(s):
    """Return the markdown heading level of *s* and its stripped text.

    Args:
        s: a single markdown line, possibly starting with '#' characters.

    Returns:
        (level, text): level is the number of leading '#' characters
        (0 for a non-heading line) and text is the line with leading
        '#' characters and following whitespace removed.
    """
    # r'#*' matches zero or more '#', so match is never None in practice;
    # the fallback branch is kept for safety.
    match = re.match(r'#*', s)
    return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
@ -244,7 +245,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
break break
txt += l txt += l
lines = txt.split("\n") lines = txt.split("\n")
comma, tab = 0, 0
last_question, last_answer = "", "" last_question, last_answer = "", ""
question_stack, level_stack = [], [] question_stack, level_stack = [], []
code_block = False code_block = False
@ -262,7 +262,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
last_answer = f'{last_answer}\n{l}' last_answer = f'{last_answer}\n{l}'
else: # is a question else: # is a question
if last_answer: if last_answer:
sum_question = ('\n').join(question_stack) sum_question = '\n'.join(question_stack)
if sum_question: if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
last_answer = '' last_answer = ''
@ -274,12 +274,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
question_stack.append(question) question_stack.append(question)
level_stack.append(question_level) level_stack.append(question_level)
if last_answer: if last_answer:
sum_question = ('\n').join(question_stack) sum_question = '\n'.join(question_stack)
if sum_question: if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
return res return res
raise NotImplementedError( raise NotImplementedError(
"Excel, csv(txt), pdf and markdown format files are supported.") "Excel, csv(txt), pdf and markdown format files are supported.")

View File

@ -110,6 +110,7 @@ class EsQueryer:
sm = [] sm = []
keywords.append(re.sub(r"[ \\\"']+", "", tk)) keywords.append(re.sub(r"[ \\\"']+", "", tk))
if len(keywords) >= 12: break
tk_syns = self.syn.lookup(tk) tk_syns = self.syn.lookup(tk)
tk = EsQueryer.subSpecialChar(tk) tk = EsQueryer.subSpecialChar(tk)

View File

@ -98,7 +98,7 @@ class Dealer:
if not qst: if not qst:
if not req.get("sort"): if not req.get("sort"):
s = s.sort( s = s.sort(
{"create_time": {"order": "desc", "unmapped_type": "date"}}, #{"create_time": {"order": "desc", "unmapped_type": "date"}},
{"create_timestamp_flt": { {"create_timestamp_flt": {
"order": "desc", "unmapped_type": "float"}} "order": "desc", "unmapped_type": "float"}}
) )
@ -108,7 +108,7 @@ class Dealer:
"mode": "avg", "numeric_type": "double"}}, "mode": "avg", "numeric_type": "double"}},
{"top_int": {"order": "asc", "unmapped_type": "float", {"top_int": {"order": "asc", "unmapped_type": "float",
"mode": "avg", "numeric_type": "double"}}, "mode": "avg", "numeric_type": "double"}},
{"create_time": {"order": "desc", "unmapped_type": "date"}}, #{"create_time": {"order": "desc", "unmapped_type": "date"}},
{"create_timestamp_flt": { {"create_timestamp_flt": {
"order": "desc", "unmapped_type": "float"}} "order": "desc", "unmapped_type": "float"}}
) )