mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-06-04 11:24:00 +08:00
Fix: optimize OCR garbage identification to reduce unnecessary filtering (#6027)
### What problem does this PR solve? Optimize OCR garbage identification to reduce unnecessary filtering. #5713 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
parent
0a877941f4
commit
4ff609b6a8
@ -46,8 +46,8 @@ class LayoutRecognizer(Recognizer):
|
||||
def __init__(self, domain):
|
||||
try:
|
||||
model_dir = os.path.join(
|
||||
get_project_base_directory(),
|
||||
"rag/res/deepdoc")
|
||||
get_project_base_directory(),
|
||||
"rag/res/deepdoc")
|
||||
super().__init__(self.labels, domain, model_dir)
|
||||
except Exception:
|
||||
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc",
|
||||
@ -60,9 +60,8 @@ class LayoutRecognizer(Recognizer):
|
||||
def __call__(self, image_list, ocr_res, scale_factor=3,
|
||||
thr=0.2, batch_size=16, drop=True):
|
||||
def __is_garbage(b):
|
||||
patt = [r"^•+$", r"(版权归©|免责条款|地址[::])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
|
||||
patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$",
|
||||
r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
|
||||
"(资料|数据)来源[::]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}",
|
||||
"\\(cid *: *[0-9]+ *\\)"
|
||||
]
|
||||
return any([re.search(p, b["text"]) for p in patt])
|
||||
@ -160,6 +159,7 @@ class LayoutRecognizer(Recognizer):
|
||||
def forward(self, image_list, thr=0.7, batch_size=16):
|
||||
return super().__call__(image_list, thr, batch_size)
|
||||
|
||||
|
||||
class LayoutRecognizer4YOLOv10(LayoutRecognizer):
|
||||
labels = [
|
||||
"title",
|
||||
@ -185,9 +185,9 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer):
|
||||
|
||||
def preprocess(self, image_list):
|
||||
inputs = []
|
||||
new_shape = self.input_shape # height, width
|
||||
new_shape = self.input_shape # height, width
|
||||
for img in image_list:
|
||||
shape = img.shape[:2]# current shape [height, width]
|
||||
shape = img.shape[:2] # current shape [height, width]
|
||||
# Scale ratio (new / old)
|
||||
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
|
||||
# Compute padding
|
||||
@ -242,4 +242,3 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer):
|
||||
"bbox": [float(t) for t in boxes[i].tolist()],
|
||||
"score": float(scores[i])
|
||||
} for i in indices]
|
||||
|
||||
|
@ -15,20 +15,21 @@
|
||||
#
|
||||
|
||||
import logging
|
||||
from tika import parser
|
||||
from io import BytesIO
|
||||
from docx import Document
|
||||
from timeit import default_timer as timer
|
||||
import re
|
||||
from deepdoc.parser.pdf_parser import PlainParser
|
||||
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \
|
||||
naive_merge_docx, tokenize_chunks_docx
|
||||
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
|
||||
from rag.utils import num_tokens_from_string
|
||||
from PIL import Image
|
||||
from functools import reduce
|
||||
from io import BytesIO
|
||||
from timeit import default_timer as timer
|
||||
|
||||
from docx import Document
|
||||
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
|
||||
from markdown import markdown
|
||||
from docx.image.exceptions import UnrecognizedImageError, UnexpectedEndOfFileError, InvalidImageStreamError
|
||||
from PIL import Image
|
||||
from tika import parser
|
||||
|
||||
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
|
||||
from deepdoc.parser.pdf_parser import PlainParser
|
||||
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table
|
||||
from rag.utils import num_tokens_from_string
|
||||
|
||||
|
||||
class Docx(DocxParser):
|
||||
|
Loading…
x
Reference in New Issue
Block a user