mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-13 11:29:01 +08:00
Fix some bugs in text2sql. (#4279) (#4281) ### What problem does this PR solve? - Fix the incorrect results produced when parsing CSV files of the QA knowledge base in the text2sql scenario: process CSV files using the csv library, and decouple CSV parsing from TXT parsing. - Most LLMs return results in markdown format (```sql query ```); fix the execution error caused by LLM output wrapped in SQL markdown format. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
parent
8cdf10148d
commit
dd13a5d05c
@ -65,10 +65,8 @@ class ExeSQL(ComponentBase, ABC):
|
|||||||
self._loop += 1
|
self._loop += 1
|
||||||
|
|
||||||
ans = self.get_input()
|
ans = self.get_input()
|
||||||
|
|
||||||
|
|
||||||
ans = "".join([str(a) for a in ans["content"]]) if "content" in ans else ""
|
ans = "".join([str(a) for a in ans["content"]]) if "content" in ans else ""
|
||||||
if self._param.db_type == 'mssql':
|
|
||||||
# improve the information extraction, most llm return results in markdown format ```sql query ```
|
# improve the information extraction, most llm return results in markdown format ```sql query ```
|
||||||
match = re.search(r"```sql\s*(.*?)\s*```", ans, re.DOTALL)
|
match = re.search(r"```sql\s*(.*?)\s*```", ans, re.DOTALL)
|
||||||
if match:
|
if match:
|
||||||
@ -77,8 +75,6 @@ class ExeSQL(ComponentBase, ABC):
|
|||||||
else:
|
else:
|
||||||
print("no markdown")
|
print("no markdown")
|
||||||
ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE)
|
ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE)
|
||||||
else:
|
|
||||||
ans = re.sub(r'^.*?SELECT ', 'SELECT ', repr(ans), flags=re.IGNORECASE)
|
|
||||||
ans = re.sub(r';.*?SELECT ', '; SELECT ', ans, flags=re.IGNORECASE)
|
ans = re.sub(r';.*?SELECT ', '; SELECT ', ans, flags=re.IGNORECASE)
|
||||||
ans = re.sub(r';[^;]*$', r';', ans)
|
ans = re.sub(r';[^;]*$', r';', ans)
|
||||||
if not ans:
|
if not ans:
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
#
|
#
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import csv
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
@ -25,7 +26,6 @@ from docx import Document
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
from markdown import markdown
|
from markdown import markdown
|
||||||
|
|
||||||
|
|
||||||
class Excel(ExcelParser):
|
class Excel(ExcelParser):
|
||||||
def __call__(self, fnm, binary=None, callback=None):
|
def __call__(self, fnm, binary=None, callback=None):
|
||||||
if not binary:
|
if not binary:
|
||||||
@ -320,7 +320,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|||||||
res.append(beAdoc(deepcopy(doc), q, a, eng))
|
res.append(beAdoc(deepcopy(doc), q, a, eng))
|
||||||
return res
|
return res
|
||||||
|
|
||||||
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
|
elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
txt = get_text(filename, binary)
|
txt = get_text(filename, binary)
|
||||||
lines = txt.split("\n")
|
lines = txt.split("\n")
|
||||||
@ -359,6 +359,38 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|||||||
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
|
||||||
|
callback(0.1, "Start to parse.")
|
||||||
|
txt = get_text(filename, binary)
|
||||||
|
lines = txt.split("\n")
|
||||||
|
delimiter = "\t" if any("\t" in line for line in lines) else ","
|
||||||
|
|
||||||
|
fails = []
|
||||||
|
question, answer = "", ""
|
||||||
|
res = []
|
||||||
|
reader = csv.reader(lines, delimiter=delimiter)
|
||||||
|
|
||||||
|
for i, row in enumerate(reader):
|
||||||
|
if len(row) != 2:
|
||||||
|
if question:
|
||||||
|
answer += "\n" + lines[i]
|
||||||
|
else:
|
||||||
|
fails.append(str(i + 1))
|
||||||
|
elif len(row) == 2:
|
||||||
|
if question and answer:
|
||||||
|
res.append(beAdoc(deepcopy(doc), question, answer, eng))
|
||||||
|
question, answer = row
|
||||||
|
if len(res) % 999 == 0:
|
||||||
|
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
|
||||||
|
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
||||||
|
|
||||||
|
if question:
|
||||||
|
res.append(beAdoc(deepcopy(doc), question, answer, eng))
|
||||||
|
|
||||||
|
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
|
||||||
|
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
||||||
|
return res
|
||||||
|
|
||||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
pdf_parser = Pdf()
|
pdf_parser = Pdf()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user