From 58db719a2c82d6e8d12fd0750873dd625a80a1dd Mon Sep 17 00:00:00 2001
From: Bowen Liang <liangbowen@gf.com.cn>
Date: Tue, 4 Jun 2024 13:24:28 +0800
Subject: [PATCH] dep: bump pandas from 1.x to 2.x (#4820)

---
 api/core/rag/extractor/csv_extractor.py  |  2 +-
 api/requirements.txt                     |  4 +-
 api/tests/unit_tests/libs/test_pandas.py | 62 ++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 4 deletions(-)
 create mode 100644 api/tests/unit_tests/libs/test_pandas.py

diff --git a/api/core/rag/extractor/csv_extractor.py b/api/core/rag/extractor/csv_extractor.py
index 09a1cddd1e..0470569f39 100644
--- a/api/core/rag/extractor/csv_extractor.py
+++ b/api/core/rag/extractor/csv_extractor.py
@@ -57,7 +57,7 @@ class CSVExtractor(BaseExtractor):
         docs = []
         try:
             # load csv file into pandas dataframe
-            df = pd.read_csv(csvfile, error_bad_lines=False, **self.csv_args)
+            df = pd.read_csv(csvfile, on_bad_lines='skip', **self.csv_args)
 
             # check source column exists
             if self.source_column and self.source_column not in df.columns:
diff --git a/api/requirements.txt b/api/requirements.txt
index 76c6b2bace..84ad31bb88 100644
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -26,7 +26,6 @@ sympy==1.12
 jieba==0.42.1
 celery~=5.3.6
 redis[hiredis]~=5.0.3
-openpyxl==3.1.2
 chardet~=5.1.0
 python-docx~=1.1.0
 pypdfium2~=4.17.0
@@ -51,7 +50,7 @@ dashscope[tokenizer]~=1.17.0
 huggingface_hub~=0.16.4
 transformers~=4.35.0
 tokenizers~=0.15.0
-pandas==1.5.3
+pandas[performance,excel]~=2.2.2
 xinference-client==0.9.4
 safetensors~=0.4.3
 zhipuai==1.0.7
@@ -78,7 +77,6 @@ qrcode~=7.4.2
 azure-storage-blob==12.13.0
 azure-identity==1.15.0
 lxml==5.1.0
-xlrd~=2.0.1
 pydantic~=1.10.0
 pgvecto-rs==0.1.4
 firecrawl-py==0.0.5
diff --git a/api/tests/unit_tests/libs/test_pandas.py b/api/tests/unit_tests/libs/test_pandas.py
new file mode 100644
index 0000000000..bbc372ed61
--- /dev/null
+++ b/api/tests/unit_tests/libs/test_pandas.py
@@ -0,0 +1,62 @@
+import pandas as pd
+
+
+def test_pandas_csv(tmp_path, monkeypatch):
+    """Round-trip a two-column frame through a CSV file under tmp_path."""
+    monkeypatch.chdir(tmp_path)
+    expected = {'col1': [1, 2.2, -3.3, 4.0, 5],
+                'col2': ['A', 'B', 'C', 'D', 'E']}
+    target = tmp_path / 'example.csv'
+
+    # Persist without the index so only the two data columns are written.
+    pd.DataFrame(expected).to_csv(target, index=False)
+
+    # Load the file back (bad lines skipped) and compare columns by position.
+    loaded = pd.read_csv(target, on_bad_lines='skip')
+    assert loaded[loaded.columns[0]].to_list() == expected['col1']
+    assert loaded[loaded.columns[1]].to_list() == expected['col2']
+
+
+def test_pandas_xlsx(tmp_path, monkeypatch):
+    """Round-trip a two-column frame through an xlsx file under tmp_path."""
+    monkeypatch.chdir(tmp_path)
+    expected = {'col1': [1, 2.2, -3.3, 4.0, 5],
+                'col2': ['A', 'B', 'C', 'D', 'E']}
+    workbook = tmp_path / 'example.xlsx'
+
+    # Write the frame without its index column.
+    pd.DataFrame(expected).to_excel(workbook, index=False)
+
+    # Read the workbook back and compare each column by position.
+    loaded = pd.read_excel(workbook)
+    assert loaded[loaded.columns[0]].to_list() == expected['col1']
+    assert loaded[loaded.columns[1]].to_list() == expected['col2']
+
+
+def test_pandas_xlsx_with_sheets(tmp_path, monkeypatch):
+    """Write two named sheets into one workbook and read each one back.
+
+    Every sheet is compared against the data that was written to it.
+    """
+    monkeypatch.chdir(tmp_path)
+    # Expected contents keyed by sheet name; dict order is the write order.
+    sheets = {
+        'Sheet1': {'col1': [1, 2, 3, 4, 5],
+                   'col2': ['A', 'B', 'C', 'D', 'E']},
+        'Sheet2': {'col1': [6, 7, 8, 9, 10],
+                   'col2': ['F', 'G', 'H', 'I', 'J']},
+    }
+    frames = {name: pd.DataFrame(cols) for name, cols in sheets.items()}
+    workbook = tmp_path / 'example_with_sheets.xlsx'
+
+    # A single writer is shared so both frames land in the same file.
+    with pd.ExcelWriter(workbook) as writer:
+        for name, frame in frames.items():
+            frame.to_excel(writer, sheet_name=name, index=False)
+
+    # Open the workbook once and pull each sheet out by its name.
+    with pd.ExcelFile(workbook) as reader:
+        for name, cols in sheets.items():
+            loaded = pd.read_excel(reader, sheet_name=name)
+            assert loaded[loaded.columns[0]].to_list() == cols['col1']
+            assert loaded[loaded.columns[1]].to_list() == cols['col2']