From dd13a5d05c3eb57f24639917ac759e6f844aa5a3 Mon Sep 17 00:00:00 2001
From: TeslaZY <TeslaZY@outlook.com>
Date: Mon, 30 Dec 2024 10:32:19 +0800
Subject: [PATCH] Fix some bugs in text2sql.(#4279)(#4281) (#4280)

Fix some bugs in text2sql.(#4279)(#4281)

### What problem does this PR solve?
- Fix incorrect results when parsing CSV files of the QA knowledge base in
the text2sql scenario: CSV files are now processed with the csv library,
and CSV parsing is decoupled from TXT parsing.
- Most LLMs return results in markdown format (```sql query ```). Fix the
execution error caused by LLM output wrapped in SQL markdown format.

### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
---
 agent/component/exesql.py | 20 ++++++++------------
 rag/app/qa.py             | 36 ++++++++++++++++++++++++++++++++++--
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/agent/component/exesql.py b/agent/component/exesql.py
index 1b48b6b60..253dac5af 100644
--- a/agent/component/exesql.py
+++ b/agent/component/exesql.py
@@ -65,20 +65,16 @@ class ExeSQL(ComponentBase, ABC):
         self._loop += 1
 
         ans = self.get_input()
-      
-
         ans = "".join([str(a) for a in ans["content"]]) if "content" in ans else ""
-        if self._param.db_type == 'mssql':
-            # improve the information extraction, most llm return results in markdown format ```sql query ```
-            match = re.search(r"```sql\s*(.*?)\s*```", ans, re.DOTALL)
-            if match:
-                ans = match.group(1)  # Query content
-                print(ans)
-            else:
-                print("no markdown")
-            ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE)
+
+        # improve the information extraction, most llm return results in markdown format ```sql query ```
+        match = re.search(r"```sql\s*(.*?)\s*```", ans, re.DOTALL)
+        if match:
+            ans = match.group(1)  # Query content
+            print(ans)
         else:
-            ans = re.sub(r'^.*?SELECT ', 'SELECT ', repr(ans), flags=re.IGNORECASE)
+            print("no markdown")
+        ans = re.sub(r'^.*?SELECT ', 'SELECT ', (ans), flags=re.IGNORECASE)
         ans = re.sub(r';.*?SELECT ', '; SELECT ', ans, flags=re.IGNORECASE)
         ans = re.sub(r';[^;]*$', r';', ans)
         if not ans:
diff --git a/rag/app/qa.py b/rag/app/qa.py
index d77daebd6..824a398bd 100644
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@@ -12,6 +12,7 @@
 #
 import logging
 import re
+import csv
 from copy import deepcopy
 from io import BytesIO
 from timeit import default_timer as timer
@@ -25,7 +26,6 @@ from docx import Document
 from PIL import Image
 from markdown import markdown
 
-
 class Excel(ExcelParser):
     def __call__(self, fnm, binary=None, callback=None):
         if not binary:
@@ -320,7 +320,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
             res.append(beAdoc(deepcopy(doc), q, a, eng))
         return res
 
-    elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
+    elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = get_text(filename, binary)
         lines = txt.split("\n")
@@ -359,6 +359,38 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
 
         return res
 
+    elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        lines = txt.split("\n")
+        delimiter = "\t" if any("\t" in line for line in lines) else ","
+
+        fails = []
+        question, answer = "", ""
+        res = []
+        reader = csv.reader(lines, delimiter=delimiter)
+
+        for i, row in enumerate(reader):
+            if len(row) != 2:
+                if question:
+                    answer += "\n" + lines[i]
+                else:
+                    fails.append(str(i + 1))
+            elif len(row) == 2:
+                if question and answer:
+                    res.append(beAdoc(deepcopy(doc), question, answer, eng))
+                question, answer = row
+            if len(res) % 999 == 0:
+                callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
+                    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+
+        if question:
+            res.append(beAdoc(deepcopy(doc), question, answer, eng))
+
+        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
+            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        return res
+
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         pdf_parser = Pdf()