From 4ff609b6a848643ebda852192fcf2e1c7ccf385c Mon Sep 17 00:00:00 2001 From: Yongteng Lei Date: Thu, 13 Mar 2025 18:48:32 +0800 Subject: [PATCH] Fix: optimize OCR garbage identification to reduce unnecessary filtering (#6027) ### What problem does this PR solve? Optimize OCR garbage identification to reduce unnecessary filtering. #5713 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/vision/layout_recognizer.py | 13 ++++++------- rag/app/naive.py | 23 ++++++++++++----------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/deepdoc/vision/layout_recognizer.py b/deepdoc/vision/layout_recognizer.py index 467a22ee9..01e6af6c9 100644 --- a/deepdoc/vision/layout_recognizer.py +++ b/deepdoc/vision/layout_recognizer.py @@ -46,8 +46,8 @@ class LayoutRecognizer(Recognizer): def __init__(self, domain): try: model_dir = os.path.join( - get_project_base_directory(), - "rag/res/deepdoc") + get_project_base_directory(), + "rag/res/deepdoc") super().__init__(self.labels, domain, model_dir) except Exception: model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc", @@ -60,9 +60,8 @@ class LayoutRecognizer(Recognizer): def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True): def __is_garbage(b): - patt = [r"^•+$", r"(版权归©|免责条款|地址[::])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$", + patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$", r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}", - "(资料|数据)来源[::]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}", "\\(cid *: *[0-9]+ *\\)" ] return any([re.search(p, b["text"]) for p in patt]) @@ -160,6 +159,7 @@ class LayoutRecognizer(Recognizer): def forward(self, image_list, thr=0.7, batch_size=16): return super().__call__(image_list, thr, batch_size) + class LayoutRecognizer4YOLOv10(LayoutRecognizer): labels = [ "title", @@ -185,9 +185,9 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer): def preprocess(self, image_list): inputs = [] - new_shape = self.input_shape # height, width + new_shape = self.input_shape # height, width for img in image_list: - shape = img.shape[:2]# current shape [height, width] + shape = img.shape[:2] # current shape [height, width] # Scale ratio (new / old) r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) # Compute padding @@ -242,4 +242,3 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer): "bbox": [float(t) for t in boxes[i].tolist()], "score": float(scores[i]) } for i in indices] - diff --git a/rag/app/naive.py b/rag/app/naive.py index 5efcacf68..ff4f63f8c 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -15,20 +15,21 @@ # import logging -from tika import parser -from io import BytesIO -from docx import Document -from timeit import default_timer as timer import re -from deepdoc.parser.pdf_parser import PlainParser -from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \ - naive_merge_docx, tokenize_chunks_docx -from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser -from rag.utils import num_tokens_from_string -from PIL import Image from functools import reduce +from io import BytesIO +from timeit import default_timer as timer + +from docx import Document +from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError from markdown import markdown -from docx.image.exceptions import UnrecognizedImageError, UnexpectedEndOfFileError, InvalidImageStreamError +from PIL import Image +from tika import parser + +from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser +from deepdoc.parser.pdf_parser import PlainParser +from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table +from rag.utils import num_tokens_from_string class Docx(DocxParser):