Fix: optimize OCR garbage identification to reduce unnecessary filtering (#6027)

### What problem does this PR solve?

Optimize OCR garbage identification to reduce unnecessary filtering.
#5713

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Yongteng Lei 2025-03-13 18:48:32 +08:00 committed by GitHub
parent 0a877941f4
commit 4ff609b6a8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 18 additions and 18 deletions

View File

@ -46,8 +46,8 @@ class LayoutRecognizer(Recognizer):
def __init__(self, domain):
try:
model_dir = os.path.join(
get_project_base_directory(),
"rag/res/deepdoc")
get_project_base_directory(),
"rag/res/deepdoc")
super().__init__(self.labels, domain, model_dir)
except Exception:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc",
@ -60,9 +60,8 @@ class LayoutRecognizer(Recognizer):
def __call__(self, image_list, ocr_res, scale_factor=3,
thr=0.2, batch_size=16, drop=True):
def __is_garbage(b):
patt = [r"^•+$", r"(版权归©|免责条款|地址[:])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$",
r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
"(资料|数据)来源[:]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}",
"\\(cid *: *[0-9]+ *\\)"
]
return any([re.search(p, b["text"]) for p in patt])
@ -160,6 +159,7 @@ class LayoutRecognizer(Recognizer):
def forward(self, image_list, thr=0.7, batch_size=16):
return super().__call__(image_list, thr, batch_size)
class LayoutRecognizer4YOLOv10(LayoutRecognizer):
labels = [
"title",
@ -185,9 +185,9 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer):
def preprocess(self, image_list):
inputs = []
new_shape = self.input_shape # height, width
new_shape = self.input_shape # height, width
for img in image_list:
shape = img.shape[:2]# current shape [height, width]
shape = img.shape[:2] # current shape [height, width]
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
# Compute padding
@ -242,4 +242,3 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer):
"bbox": [float(t) for t in boxes[i].tolist()],
"score": float(scores[i])
} for i in indices]

View File

@ -15,20 +15,21 @@
#
import logging
from tika import parser
from io import BytesIO
from docx import Document
from timeit import default_timer as timer
import re
from deepdoc.parser.pdf_parser import PlainParser
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \
naive_merge_docx, tokenize_chunks_docx
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
from rag.utils import num_tokens_from_string
from PIL import Image
from functools import reduce
from io import BytesIO
from timeit import default_timer as timer
from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
from markdown import markdown
from docx.image.exceptions import UnrecognizedImageError, UnexpectedEndOfFileError, InvalidImageStreamError
from PIL import Image
from tika import parser
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.pdf_parser import PlainParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table
from rag.utils import num_tokens_from_string
class Docx(DocxParser):