rename get_txt to get_text (#2649)

### What problem does this PR solve?

Renames the parser utility `get_txt` to `get_text` for consistent naming across the code base, and drops imports that become unused along the way (`copy`, `find_codec`, `is_english`).

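For downstream code the change is purely mechanical; a hypothetical caller before and after the rename (the file name and bytes below are illustrative only):

```python
# Before: from deepdoc.parser.utils import get_txt
from deepdoc.parser.utils import get_text

# Same signature and behavior, only the name changes.
txt = get_text("example.txt")                       # read from disk
txt = get_text("example.txt", binary=b"hello\n")    # decode in-memory bytes
```
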
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Kevin Hu committed on 2024-09-29 12:47:09 +08:00
(parent fb694143ee, commit fc867cb959)

6 changed files with 13 additions and 12 deletions


```diff
@@ -14,7 +14,7 @@
 from rag.nlp import find_codec
 
 
-def get_txt(fnm: str, binary=None) -> str:
+def get_text(fnm: str, binary=None) -> str:
     txt = ""
     if binary:
         encoding = find_codec(binary)
```
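
The rest of `get_text` is elided from the hunk above; as a rough sketch (the `find_codec` decode step is shown in the diff, while the plain-file fallback here is an assumption about the elided body):

```python
from rag.nlp import find_codec  # codec detection helper used in the hunk above


def get_text(fnm: str, binary=None) -> str:
    """Return decoded text from raw bytes, or read it from the file at fnm."""
    txt = ""
    if binary:
        # Detect the encoding of the in-memory bytes, then decode leniently.
        encoding = find_codec(binary)
        txt = binary.decode(encoding, errors="ignore")
    else:
        # Assumed fallback: read the file from disk when no bytes are passed.
        with open(fnm, "r", encoding="utf-8") as f:
            txt = f.read()
    return txt
```

With the helper renamed, every call site in the hunks below is a one-line change: `txt = get_text(filename, binary)`.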


```diff
@@ -10,7 +10,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import copy
 from tika import parser
 import re
 from io import BytesIO
```


```diff
@@ -17,7 +17,7 @@ from io import BytesIO
 from docx import Document
 from api.db import ParserType
-from deepdoc.parser.utils import get_txt
+from deepdoc.parser.utils import get_text
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
     make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
 from rag.nlp import rag_tokenizer
@@ -166,7 +166,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = get_txt(filename, binary)
+        txt = get_text(filename, binary)
         sections = txt.split("\n")
         sections = [l for l in sections if l]
         callback(0.8, "Finish parsing.")
```


```diff
@@ -14,9 +14,9 @@ from tika import parser
 from io import BytesIO
 import re
-from deepdoc.parser.utils import get_txt
+from deepdoc.parser.utils import get_text
 from rag.app import laws
-from rag.nlp import rag_tokenizer, tokenize, find_codec
+from rag.nlp import rag_tokenizer, tokenize
 from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
@@ -84,7 +84,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = get_txt(filename, binary)
+        txt = get_text(filename, binary)
         sections = txt.split("\n")
         sections = [s for s in sections if s]
         callback(0.8, "Finish parsing.")
```


```diff
@@ -17,14 +17,16 @@ from timeit import default_timer as timer
 from nltk import word_tokenize
 from openpyxl import load_workbook
-from deepdoc.parser.utils import get_txt
+from deepdoc.parser.utils import get_text
-from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
+from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
 from rag.nlp import rag_tokenizer, tokenize_table, concat_img
 from rag.settings import cron_logger
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from docx import Document
 from PIL import Image
 from markdown import markdown
+
+
 class Excel(ExcelParser):
     def __call__(self, fnm, binary=None, callback=None):
         if not binary:
@@ -307,7 +309,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         return res
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = get_txt(filename, binary)
+        txt = get_text(filename, binary)
         lines = txt.split("\n")
         comma, tab = 0, 0
         for l in lines:
@@ -350,7 +352,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         return res
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = get_txt(filename, binary)
+        txt = get_text(filename, binary)
         lines = txt.split("\n")
         last_question, last_answer = "", ""
         question_stack, level_stack = [], []
```
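
In the `.txt|.csv` branch above, the `comma, tab` counters sniff the column delimiter before the lines are split into question/answer pairs. A minimal sketch of that idea (the counter names come from the hunk; the decision rule and the helper name `sniff_delimiter` are assumptions):

```python
def sniff_delimiter(lines):
    # Count how many lines contain each candidate separator
    # (the comma/tab counters appear in the hunk above).
    comma, tab = 0, 0
    for l in lines:
        if not l:
            continue
        if "," in l:
            comma += 1
        if "\t" in l:
            tab += 1
    # Assumed rule: prefer the separator that appears on more lines.
    return "\t" if tab > comma else ","


# Usage sketch: split each line once on the winning delimiter into a Q/A pair.
# lines = get_text(filename, binary).split("\n")
# delim = sniff_delimiter(lines)
# pairs = [l.split(delim, 1) for l in lines if l]
```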


```diff
@@ -21,7 +21,7 @@ from dateutil.parser import parse as datetime_parse
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from deepdoc.parser.utils import get_text
-from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
+from rag.nlp import rag_tokenizer, tokenize
 from deepdoc.parser import ExcelParser
```