remove unused import (#2679)

### What problem does this PR solve?

Removes imports that are no longer used in several of the parser and chunking modules.

### Type of change

- [x] Refactoring
yqkcn committed on 2024-09-30 16:59:39 +08:00 (via GitHub)
parent ae5a877ed4
commit 570ad420a8
7 changed files with 10 additions and 18 deletions
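
For reference, the imports dropped here are the kind a linter reports as "imported but unused" (pyflakes/flake8 code F401). Below is a minimal sketch of the same check using only the standard library's `ast` module; the function name and the example path are illustrative, not part of this change:

```python
import ast

def unused_imports(source: str) -> list[str]:
    """List names bound by import statements but never referenced."""
    tree = ast.parse(source)
    imported = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            # "import numpy as np" binds "np"; a bare "import io" binds "io"
            imported += [a.asname or a.name.split(".")[0] for a in node.names]
        elif isinstance(node, ast.ImportFrom):
            imported += [a.asname or a.name for a in node.names]
    # Every bare-name reference in the module; attribute access such as
    # np.array still produces a Name node for "np", so it counts as a use.
    used = {n.id for n in ast.walk(tree) if isinstance(n, ast.Name)}
    return [name for name in imported if name not in used]

# Hypothetical usage on one of the touched modules:
# print(unused_imports(open("some_module.py").read()))
```

A real linter additionally handles re-exports, `__all__`, and string annotations; this sketch does not.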

```diff
@@ -10,9 +10,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import io
 import re
-import numpy as np
 from api.db import LLMType
 from rag.nlp import rag_tokenizer
```

```diff
@@ -15,9 +15,9 @@ import re
 from io import BytesIO
 from deepdoc.parser.utils import get_text
-from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
-    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
-    tokenize_chunks, find_codec
+from rag.nlp import bullets_category, is_english,remove_contents_table, \
+    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
+    tokenize_chunks
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
```

```diff
@@ -10,7 +10,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import copy
 from tika import parser
 import re
 from io import BytesIO
@@ -18,8 +17,8 @@ from docx import Document
 from api.db import ParserType
 from deepdoc.parser.utils import get_text
-from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
-    make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
+from rag.nlp import bullets_category, remove_contents_table, hierarchical_merge, \
+    make_colon_as_title, tokenize_chunks, docx_question_level
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
 from rag.settings import cron_logger
```

```diff
@@ -19,13 +19,13 @@ import re
 from api.db import ParserType
 from io import BytesIO
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, docx_question_level
-from deepdoc.parser import PdfParser, PlainParser
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
 from rag.utils import num_tokens_from_string
-from deepdoc.parser import PdfParser, ExcelParser, DocxParser
+from deepdoc.parser import PdfParser, PlainParser, DocxParser
 from docx import Document
 from PIL import Image
 
 
 class Pdf(PdfParser):
     def __init__(self):
         self.model_speciess = ParserType.MANUAL.value
```

```diff
@@ -25,6 +25,7 @@ from functools import reduce
 from markdown import markdown
 from docx.image.exceptions import UnrecognizedImageError
 
+
 class Docx(DocxParser):
     def __init__(self):
         pass
@@ -93,7 +94,7 @@ class Docx(DocxParser):
         tbls = []
         for tb in self.doc.tables:
-            html= "<table>"
+            html = "<table>"
             for r in tb.rows:
                 html += "<tr>"
                 i = 0
@@ -146,8 +147,6 @@ class Pdf(PdfParser):
 class Markdown(MarkdownParser):
     def __call__(self, filename, binary=None):
-        txt = ""
-        tbls = []
         if binary:
             encoding = find_codec(binary)
             txt = binary.decode(encoding, errors="ignore")
```

```diff
@@ -12,13 +12,11 @@
 #
 import copy
 import re
-from collections import Counter
 from api.db import ParserType
 from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
 from deepdoc.parser import PdfParser, PlainParser
 import numpy as np
-from rag.utils import num_tokens_from_string
 
 
 class Pdf(PdfParser):
@@ -135,7 +133,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     Only pdf is supported.
     The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
     """
-    pdf_parser = None
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
         if not kwargs.get("parser_config", {}).get("layout_recognize", True):
             pdf_parser = PlainParser()
```

```diff
@@ -14,7 +14,6 @@ import re
 from copy import deepcopy
 from io import BytesIO
 from timeit import default_timer as timer
-from nltk import word_tokenize
 from openpyxl import load_workbook
 from deepdoc.parser.utils import get_text
```