From dd13a5d05c3eb57f24639917ac759e6f844aa5a3 Mon Sep 17 00:00:00 2001 From: TeslaZY Date: Mon, 30 Dec 2024 10:32:19 +0800 Subject: [PATCH] Fix some bugs in text2sql.(#4279)(#4281) (#4280) Fix some bugs in text2sql.(#4279)(#4281) ### What problem does this PR solve? - Fix incorrect results when parsing CSV files of the QA knowledge base in the text2sql scenario: process CSV files using the csv library and decouple CSV parsing from TXT parsing. - Most LLMs return results in markdown format (```sql query ```); fix the execution error caused by LLM output in SQL markdown format. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- agent/component/exesql.py | 20 ++++++++------------ rag/app/qa.py | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/agent/component/exesql.py b/agent/component/exesql.py index 1b48b6b60..253dac5af 100644 --- a/agent/component/exesql.py +++ b/agent/component/exesql.py @@ -65,20 +65,16 @@ class ExeSQL(ComponentBase, ABC): self._loop += 1 ans = self.get_input() - - ans = "".join([str(a) for a in ans["content"]]) if "content" in ans else "" - if self._param.db_type == 'mssql': - # improve the information extraction, most llm return results in markdown format ```sql query ``` - match = re.search(r"```sql\s*(.*?)\s*```", ans, re.DOTALL) - if match: - ans = match.group(1) # Query content - print(ans) - else: - print("no markdown") - ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE) + + # improve the information extraction, most llm return results in markdown format ```sql query ``` + match = re.search(r"```sql\s*(.*?)\s*```", ans, re.DOTALL) + if match: + ans = match.group(1) # Query content + print(ans) else: - ans = re.sub(r'^.*?SELECT ', 'SELECT ', repr(ans), flags=re.IGNORECASE) + print("no markdown") + ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE) ans = re.sub(r';.*?SELECT ', '; SELECT ', ans, flags=re.IGNORECASE) ans = re.sub(r';[^;]*$', 
r';', ans) if not ans: diff --git a/rag/app/qa.py b/rag/app/qa.py index d77daebd6..824a398bd 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -12,6 +12,7 @@ # import logging import re +import csv from copy import deepcopy from io import BytesIO from timeit import default_timer as timer @@ -25,7 +26,6 @@ from docx import Document from PIL import Image from markdown import markdown - class Excel(ExcelParser): def __call__(self, fnm, binary=None, callback=None): if not binary: @@ -320,7 +320,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): res.append(beAdoc(deepcopy(doc), q, a, eng)) return res - elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE): + elif re.search(r"\.(txt)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") txt = get_text(filename, binary) lines = txt.split("\n") @@ -359,6 +359,38 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): return res + elif re.search(r"\.(csv)$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + txt = get_text(filename, binary) + lines = txt.split("\n") + delimiter = "\t" if any("\t" in line for line in lines) else "," + + fails = [] + question, answer = "", "" + res = [] + reader = csv.reader(lines, delimiter=delimiter) + + for i, row in enumerate(reader): + if len(row) != 2: + if question: + answer += "\n" + lines[i] + else: + fails.append(str(i + 1)) + elif len(row) == 2: + if question and answer: + res.append(beAdoc(deepcopy(doc), question, answer, eng)) + question, answer = row + if len(res) % 999 == 0: + callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + ( + f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) + + if question: + res.append(beAdoc(deepcopy(doc), question, answer, eng)) + + callback(0.6, ("Extract Q&A: {}".format(len(res)) + ( + f"{len(fails)} failure, line: %s..." 
% (",".join(fails[:3])) if fails else ""))) + return res + elif re.search(r"\.pdf$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") pdf_parser = Pdf()