diff --git a/rag/app/audio.py b/rag/app/audio.py index 397e4e75e..ec7ffeea3 100644 --- a/rag/app/audio.py +++ b/rag/app/audio.py @@ -10,9 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import io import re -import numpy as np from api.db import LLMType from rag.nlp import rag_tokenizer diff --git a/rag/app/book.py b/rag/app/book.py index 716c3b4eb..cf29bb119 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -15,9 +15,9 @@ import re from io import BytesIO from deepdoc.parser.utils import get_text -from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \ - hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \ - tokenize_chunks, find_codec +from rag.nlp import bullets_category, is_english,remove_contents_table, \ + hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \ + tokenize_chunks from rag.nlp import rag_tokenizer from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser diff --git a/rag/app/laws.py b/rag/app/laws.py index c1f2740d0..74aac8a29 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -10,7 +10,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import copy from tika import parser import re from io import BytesIO @@ -18,8 +17,8 @@ from docx import Document from api.db import ParserType from deepdoc.parser.utils import get_text -from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \ - make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level +from rag.nlp import bullets_category, remove_contents_table, hierarchical_merge, \ + make_colon_as_title, tokenize_chunks, docx_question_level from rag.nlp import rag_tokenizer from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser from rag.settings import cron_logger diff --git a/rag/app/manual.py b/rag/app/manual.py index 29c7cd7f6..8c663e787 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -19,13 +19,13 @@ import re from api.db import ParserType from io import BytesIO -from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, docx_question_level -from deepdoc.parser import PdfParser, PlainParser +from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level from rag.utils import num_tokens_from_string -from deepdoc.parser import PdfParser, ExcelParser, DocxParser +from deepdoc.parser import PdfParser, PlainParser, DocxParser from docx import Document from PIL import Image + class Pdf(PdfParser): def __init__(self): self.model_speciess = ParserType.MANUAL.value diff --git a/rag/app/naive.py b/rag/app/naive.py index c5a7f0ba8..54d10e463 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -25,6 +25,7 @@ from functools import reduce from markdown import markdown from docx.image.exceptions import UnrecognizedImageError + class Docx(DocxParser): def __init__(self): pass @@ -93,7 +94,7 @@ class Docx(DocxParser): tbls = [] for tb in self.doc.tables: - html= "" + html = "
" for r in tb.rows: html += "" i = 0 @@ -146,8 +147,6 @@ class Pdf(PdfParser): class Markdown(MarkdownParser): def __call__(self, filename, binary=None): - txt = "" - tbls = [] if binary: encoding = find_codec(binary) txt = binary.decode(encoding, errors="ignore") diff --git a/rag/app/paper.py b/rag/app/paper.py index 19185e962..2092581bb 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -12,13 +12,11 @@ # import copy import re -from collections import Counter from api.db import ParserType from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks from deepdoc.parser import PdfParser, PlainParser import numpy as np -from rag.utils import num_tokens_from_string class Pdf(PdfParser): @@ -135,7 +133,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, Only pdf is supported. The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly. """ - pdf_parser = None if re.search(r"\.pdf$", filename, re.IGNORECASE): if not kwargs.get("parser_config", {}).get("layout_recognize", True): pdf_parser = PlainParser() diff --git a/rag/app/qa.py b/rag/app/qa.py index 0a7757579..8994b4f0f 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -14,7 +14,6 @@ import re from copy import deepcopy from io import BytesIO from timeit import default_timer as timer -from nltk import word_tokenize from openpyxl import load_workbook from deepdoc.parser.utils import get_text