mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-06-04 11:24:00 +08:00
Optimized Recognizer.sort_X_firstly and Recognizer.sort_Y_firstly (#5182)
### What problem does this PR solve?

Optimized Recognizer.sort_X_firstly and Recognizer.sort_Y_firstly

### Type of change

- [x] Performance Improvement
This commit is contained in:
parent
07ddb8fcff
commit
c326f14fed
@ -957,7 +957,7 @@ class RAGFlowPdfParser:
|
|||||||
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
||||||
enumerate(self.pdf.pages[page_from:page_to])]
|
enumerate(self.pdf.pages[page_from:page_to])]
|
||||||
try:
|
try:
|
||||||
self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
|
self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
|
logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
|
||||||
self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead.
|
self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead.
|
||||||
|
@ -19,7 +19,7 @@ import os
|
|||||||
import math
|
import math
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import cv2
|
import cv2
|
||||||
from copy import deepcopy
|
from functools import cmp_to_key
|
||||||
|
|
||||||
import onnxruntime as ort
|
import onnxruntime as ort
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
@ -99,30 +99,22 @@ class Recognizer(object):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def sort_Y_firstly(arr, threashold):
|
def sort_Y_firstly(arr, threashold):
|
||||||
# sort using y1 first and then x1
|
def cmp(c1, c2):
|
||||||
arr = sorted(arr, key=lambda r: (r["top"], r["x0"]))
|
diff = c1["top"] - c2["top"]
|
||||||
for i in range(len(arr) - 1):
|
if abs(diff) < threashold:
|
||||||
for j in range(i, -1, -1):
|
diff = c1["x0"] - c2["x0"]
|
||||||
# restore the order using th
|
return diff
|
||||||
if abs(arr[j + 1]["top"] - arr[j]["top"]) < threashold \
|
arr = sorted(arr, key=cmp_to_key(cmp))
|
||||||
and arr[j + 1]["x0"] < arr[j]["x0"]:
|
|
||||||
tmp = deepcopy(arr[j])
|
|
||||||
arr[j] = deepcopy(arr[j + 1])
|
|
||||||
arr[j + 1] = deepcopy(tmp)
|
|
||||||
return arr
|
return arr
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def sort_X_firstly(arr, threashold, copy=True):
|
def sort_X_firstly(arr, threashold):
|
||||||
# sort using y1 first and then x1
|
def cmp(c1, c2):
|
||||||
arr = sorted(arr, key=lambda r: (r["x0"], r["top"]))
|
diff = c1["x0"] - c2["x0"]
|
||||||
for i in range(len(arr) - 1):
|
if abs(diff) < threashold:
|
||||||
for j in range(i, -1, -1):
|
diff = c1["top"] - c2["top"]
|
||||||
# restore the order using th
|
return diff
|
||||||
if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
|
arr = sorted(arr, key=cmp_to_key(cmp))
|
||||||
and arr[j + 1]["top"] < arr[j]["top"]:
|
|
||||||
tmp = deepcopy(arr[j]) if copy else arr[j]
|
|
||||||
arr[j] = deepcopy(arr[j + 1]) if copy else arr[j + 1]
|
|
||||||
arr[j + 1] = deepcopy(tmp) if copy else tmp
|
|
||||||
return arr
|
return arr
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -145,8 +137,6 @@ class Recognizer(object):
|
|||||||
arr[j + 1] = tmp
|
arr[j + 1] = tmp
|
||||||
return arr
|
return arr
|
||||||
|
|
||||||
return sorted(arr, key=lambda r: (r.get("C", r["x0"]), r["top"]))
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def sort_R_firstly(arr, thr=0):
|
def sort_R_firstly(arr, thr=0):
|
||||||
# sort using y1 first and then x1
|
# sort using y1 first and then x1
|
||||||
|
@ -177,7 +177,7 @@ class TableStructureRecognizer(Recognizer):
|
|||||||
colwm = np.min(colwm) if colwm else 0
|
colwm = np.min(colwm) if colwm else 0
|
||||||
crosspage = len(set([b["page_number"] for b in boxes])) > 1
|
crosspage = len(set([b["page_number"] for b in boxes])) > 1
|
||||||
if crosspage:
|
if crosspage:
|
||||||
boxes = Recognizer.sort_X_firstly(boxes, colwm / 2, False)
|
boxes = Recognizer.sort_X_firstly(boxes, colwm / 2)
|
||||||
else:
|
else:
|
||||||
boxes = Recognizer.sort_C_firstly(boxes, colwm / 2)
|
boxes = Recognizer.sort_C_firstly(boxes, colwm / 2)
|
||||||
boxes[0]["cn"] = 0
|
boxes[0]["cn"] = 0
|
||||||
|
Loading…
x
Reference in New Issue
Block a user