Fix some bugs in text2sql.(#4279)(#4281) (#4280)

Fix some bugs in text2sql.(#4279)(#4281) ### What problem does this PR solve? - CSV files of the QA knowledge base were parsed incorrectly in the text2sql scenario. CSV files are now processed using the csv library, and CSV parsing is decoupled from TXT parsing. - Most LLMs return results in markdown format (```sql query ```); fix the execution error caused by LLM output that wraps the SQL in markdown format. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
2025-08-13 11:29:01 +08:00 · 2024-12-30 10:32:19 +08:00 · 2024-12-30 10:32:19 +08:00 · dd13a5d05c
commit dd13a5d05c
parent 8cdf10148d
2 changed files with 42 additions and 14 deletions
--- a/agent/component/exesql.py
+++ b/agent/component/exesql.py
@ -65,10 +65,8 @@ class ExeSQL(ComponentBase, ABC):
        self._loop += 1
        ans = self.get_input()
        ans = "".join([str(a) for a in ans["content"]]) if "content" in ans else ""
-        if self._param.db_type == 'mssql':
+
        # improve the information extraction, most llm return results in markdown format ```sql query ```
        match = re.search(r"```sql\s*(.*?)\s*```", ans, re.DOTALL)
        if match:
@ -77,8 +75,6 @@ class ExeSQL(ComponentBase, ABC):
        else:
            print("no markdown")
        ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE)
        else:
            ans = re.sub(r'^.*?SELECT ', 'SELECT ', repr(ans), flags=re.IGNORECASE)
        ans = re.sub(r';.*?SELECT ', '; SELECT ', ans, flags=re.IGNORECASE)
        ans = re.sub(r';[^;]*$', r';', ans)
        if not ans:
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@ -12,6 +12,7 @@
 #
 import logging
 import re
 import csv
 from copy import deepcopy
 from io import BytesIO
 from timeit import default_timer as timer
@ -25,7 +26,6 @@ from docx import Document
 from PIL import Image
 from markdown import markdown
 class Excel(ExcelParser):
    def __call__(self, fnm, binary=None, callback=None):
        if not binary:
@ -320,7 +320,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
            res.append(beAdoc(deepcopy(doc), q, a, eng))
        return res
-    elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
+    elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = get_text(filename, binary)
        lines = txt.split("\n")
@ -359,6 +359,38 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
        return res
    elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = get_text(filename, binary)
        lines = txt.split("\n")
        delimiter = "\t" if any("\t" in line for line in lines) else ","
        fails = []
        question, answer = "", ""
        res = []
        reader = csv.reader(lines, delimiter=delimiter)
        for i, row in enumerate(reader):
            if len(row) != 2:
                if question:
                    answer += "\n" + lines[i]
                else:
                    fails.append(str(i + 1))
            elif len(row) == 2:
                if question and answer:
                    res.append(beAdoc(deepcopy(doc), question, answer, eng))
                question, answer = row
            if len(res) % 999 == 0:
                callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
                    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
        if question:
            res.append(beAdoc(deepcopy(doc), question, answer, eng))
        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
        return res
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        pdf_parser = Pdf()