From 58db719a2c82d6e8d12fd0750873dd625a80a1dd Mon Sep 17 00:00:00 2001 From: Bowen Liang Date: Tue, 4 Jun 2024 13:24:28 +0800 Subject: [PATCH] dep: bump pandas from 1.x to 2.x (#4820) --- api/core/rag/extractor/csv_extractor.py | 2 +- api/requirements.txt | 4 +- api/tests/unit_tests/libs/test_pandas.py | 62 ++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 4 deletions(-) create mode 100644 api/tests/unit_tests/libs/test_pandas.py diff --git a/api/core/rag/extractor/csv_extractor.py b/api/core/rag/extractor/csv_extractor.py index 09a1cddd1e..0470569f39 100644 --- a/api/core/rag/extractor/csv_extractor.py +++ b/api/core/rag/extractor/csv_extractor.py @@ -57,7 +57,7 @@ class CSVExtractor(BaseExtractor): docs = [] try: # load csv file into pandas dataframe - df = pd.read_csv(csvfile, error_bad_lines=False, **self.csv_args) + df = pd.read_csv(csvfile, on_bad_lines='skip', **self.csv_args) # check source column exists if self.source_column and self.source_column not in df.columns: diff --git a/api/requirements.txt b/api/requirements.txt index 76c6b2bace..84ad31bb88 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -26,7 +26,6 @@ sympy==1.12 jieba==0.42.1 celery~=5.3.6 redis[hiredis]~=5.0.3 -openpyxl==3.1.2 chardet~=5.1.0 python-docx~=1.1.0 pypdfium2~=4.17.0 @@ -51,7 +50,7 @@ dashscope[tokenizer]~=1.17.0 huggingface_hub~=0.16.4 transformers~=4.35.0 tokenizers~=0.15.0 -pandas==1.5.3 +pandas[performance,excel]~=2.2.2 xinference-client==0.9.4 safetensors~=0.4.3 zhipuai==1.0.7 @@ -78,7 +77,6 @@ qrcode~=7.4.2 azure-storage-blob==12.13.0 azure-identity==1.15.0 lxml==5.1.0 -xlrd~=2.0.1 pydantic~=1.10.0 pgvecto-rs==0.1.4 firecrawl-py==0.0.5 diff --git a/api/tests/unit_tests/libs/test_pandas.py b/api/tests/unit_tests/libs/test_pandas.py new file mode 100644 index 0000000000..bbc372ed61 --- /dev/null +++ b/api/tests/unit_tests/libs/test_pandas.py @@ -0,0 +1,62 @@ +import pandas as pd + + +def test_pandas_csv(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + data = {'col1': [1, 2.2, -3.3, 4.0, 5], + 'col2': ['A', 'B', 'C', 'D', 'E']} + df1 = pd.DataFrame(data) + + # write to csv file + csv_file_path = tmp_path.joinpath('example.csv') + df1.to_csv(csv_file_path, index=False) + + # read from csv file + df2 = pd.read_csv(csv_file_path, on_bad_lines='skip') + assert df2[df2.columns[0]].to_list() == data['col1'] + assert df2[df2.columns[1]].to_list() == data['col2'] + + +def test_pandas_xlsx(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + data = {'col1': [1, 2.2, -3.3, 4.0, 5], + 'col2': ['A', 'B', 'C', 'D', 'E']} + df1 = pd.DataFrame(data) + + # write to xlsx file + xlsx_file_path = tmp_path.joinpath('example.xlsx') + df1.to_excel(xlsx_file_path, index=False) + + # read from xlsx file + df2 = pd.read_excel(xlsx_file_path) + assert df2[df2.columns[0]].to_list() == data['col1'] + assert df2[df2.columns[1]].to_list() == data['col2'] + + +def test_pandas_xlsx_with_sheets(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + data1 = {'col1': [1, 2, 3, 4, 5], + 'col2': ['A', 'B', 'C', 'D', 'E']} + df1 = pd.DataFrame(data1) + + data2 = {'col1': [6, 7, 8, 9, 10], + 'col2': ['F', 'G', 'H', 'I', 'J']} + df2 = pd.DataFrame(data2) + + # write to xlsx file with sheets + xlsx_file_path = tmp_path.joinpath('example_with_sheets.xlsx') + sheet1 = 'Sheet1' + sheet2 = 'Sheet2' + with pd.ExcelWriter(xlsx_file_path) as excel_writer: + df1.to_excel(excel_writer, sheet_name=sheet1, index=False) + df2.to_excel(excel_writer, sheet_name=sheet2, index=False) + + # read from xlsx file with sheets + with pd.ExcelFile(xlsx_file_path) as excel_file: + df1 = pd.read_excel(excel_file, sheet_name=sheet1) + assert df1[df1.columns[0]].to_list() == data1['col1'] + assert df1[df1.columns[1]].to_list() == data1['col2'] + + df2 = pd.read_excel(excel_file, sheet_name=sheet2) + assert df2[df2.columns[0]].to_list() == data2['col1'] + assert df2[df2.columns[1]].to_list() == data2['col2']