Mirror of https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git (synced 2025-08-13 08:39:01 +08:00)
fix too long query exception (#1195)
### What problem does this PR solve?

#1161

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
commit e35f7610e7
parent 7920a5c78d
@@ -113,19 +113,24 @@ class RAGFlowDocxParser:
     def __call__(self, fnm, from_page=0, to_page=100000):
         self.doc = Document(fnm) if isinstance(
             fnm, str) else Document(BytesIO(fnm))
-        pn = 0
-        secs = []
+        pn = 0 # parsed page
+        secs = [] # parsed contents
         for p in self.doc.paragraphs:
             if pn > to_page:
                 break
-            if from_page <= pn < to_page and p.text.strip():
-                secs.append((p.text, p.style.name))
 
+            runs_within_single_paragraph = [] # save runs within the range of pages
             for run in p.runs:
-                if 'lastRenderedPageBreak' in run._element.xml:
-                    pn += 1
-                    continue
-                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                if pn > to_page:
+                    break
+                if from_page <= pn < to_page and p.text.strip():
+                    runs_within_single_paragraph.append(run.text) # append run.text first
+
+                # wrap page break checker into a static method
+                if RAGFlowDocxParser.has_page_break(run._element.xml):
                     pn += 1
+
+            secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph
+
         tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
         return secs, tbls
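The rewritten loop collects runs per page range and delegates page-break detection to a static method, `RAGFlowDocxParser.has_page_break`, whose body lies outside this hunk. A minimal sketch of what such a checker could look like, assuming it simply consolidates the two XML tests the removed inline code performed:

```python
def has_page_break(xml: str) -> bool:
    """Hypothetical checker: True if a run's XML marks the start of a new page."""
    # Word signals page boundaries either with a rendered page-break hint or
    # with an explicit <w:br w:type="page"/> element, which is exactly what
    # the removed inline checks tested for.
    return ('lastRenderedPageBreak' in xml
            or ('w:br' in xml and 'type="page"' in xml))
```

Counting breaks per run rather than per paragraph is what lets the new code keep `pn` accurate while only appending runs that fall inside the requested `from_page`/`to_page` window.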
@@ -145,6 +145,7 @@ def beAdoc(d, q, a, eng):
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     return d
 
+
 def mdQuestionLevel(s):
     match = re.match(r'#*', s)
     return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
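`mdQuestionLevel` (unchanged here apart from the added blank line) maps a markdown line to its heading depth plus the text with the leading `#` characters stripped. A small self-contained example with made-up input lines:

```python
import re

def mdQuestionLevel(s):
    match = re.match(r'#*', s)
    return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)

print(mdQuestionLevel("## How do I install RAGFlow?"))  # (2, 'How do I install RAGFlow?')
print(mdQuestionLevel("Plain answer text"))             # (0, 'Plain answer text')
```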
@@ -244,7 +245,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                     break
                 txt += l
         lines = txt.split("\n")
-        comma, tab = 0, 0
         last_question, last_answer = "", ""
         question_stack, level_stack = [], []
         code_block = False
@@ -262,7 +262,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                 last_answer = f'{last_answer}\n{l}'
             else:   # is a question
                 if last_answer:
-                    sum_question = ('\n').join(question_stack)
+                    sum_question = '\n'.join(question_stack)
                     if sum_question:
                         res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
                     last_answer = ''
@@ -274,12 +274,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                 question_stack.append(question)
                 level_stack.append(question_level)
         if last_answer:
-            sum_question = ('\n').join(question_stack)
+            sum_question = '\n'.join(question_stack)
             if sum_question:
                 res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
         return res
 
-
     raise NotImplementedError(
         "Excel, csv(txt), pdf and markdown format files are supported.")
 
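The two hunks above only drop the redundant parentheses around the newline literal; `('\n').join(...)` and `'\n'.join(...)` are equivalent. For context, a self-contained sketch (not the repository's exact code) of how stacked headings end up in `sum_question`; the rule that headings at the same or deeper level are popped before a new one is pushed is an assumption inferred from the visible `append` calls:

```python
# Made-up (level, question) pairs, as mdQuestionLevel would return them.
headings = [(1, "Setup"), (2, "Install"), (3, "On Linux")]

question_stack, level_stack = [], []
for level, question in headings:
    # Assumed trimming rule: pop headings at the same or deeper level first.
    while level_stack and level <= level_stack[-1]:
        question_stack.pop()
        level_stack.pop()
    question_stack.append(question)
    level_stack.append(level)

sum_question = '\n'.join(question_stack)
print(sum_question)  # prints "Setup", "Install", "On Linux" on three lines
```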
@@ -110,6 +110,7 @@ class EsQueryer:
                     sm = []
 
                 keywords.append(re.sub(r"[ \\\"']+", "", tk))
+                if len(keywords) >= 12: break
 
                 tk_syns = self.syn.lookup(tk)
                 tk = EsQueryer.subSpecialChar(tk)
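This one-line cap is the core of the fix: every keyword collected in this loop feeds the full-text query that `EsQueryer` assembles, so a very long question could previously expand into a query long enough to trigger the exception reported in #1161. A minimal, stand-alone sketch of the guard (not ragflow code, with made-up tokens):

```python
def cap_keywords(tokens, limit=12):
    """Collect at most `limit` keywords, mirroring the new early break."""
    keywords = []
    for tk in tokens:
        keywords.append(tk)
        if len(keywords) >= limit:
            break  # stop expanding the query once the cap is reached
    return keywords

print(len(cap_keywords([f"tok{i}" for i in range(40)])))  # 12
```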
@@ -98,7 +98,7 @@ class Dealer:
         if not qst:
             if not req.get("sort"):
                 s = s.sort(
-                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
+                    #{"create_time": {"order": "desc", "unmapped_type": "date"}},
                     {"create_timestamp_flt": {
                         "order": "desc", "unmapped_type": "float"}}
                 )
@@ -108,7 +108,7 @@ class Dealer:
                         "mode": "avg", "numeric_type": "double"}},
                     {"top_int": {"order": "asc", "unmapped_type": "float",
                         "mode": "avg", "numeric_type": "double"}},
-                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
+                    #{"create_time": {"order": "desc", "unmapped_type": "date"}},
                     {"create_timestamp_flt": {
                         "order": "desc", "unmapped_type": "float"}}
                 )
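Both default-sort branches now comment out the `create_time` entry, leaving `create_timestamp_flt` as the only timestamp-based ordering. A minimal sketch of the resulting sort specification, assuming `s` is an `elasticsearch_dsl` `Search` object (which the chained `s.sort(...)` call suggests) and using a hypothetical index name:

```python
from elasticsearch_dsl import Search

s = Search(index="ragflow_chunks")  # hypothetical index name, for illustration only
s = s.sort(
    # {"create_time": {"order": "desc", "unmapped_type": "date"}},  # the dropped entry
    {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
)
print(s.to_dict()["sort"])
# [{'create_timestamp_flt': {'order': 'desc', 'unmapped_type': 'float'}}]
```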