diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 990c0355c..114253e1d 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -957,7 +957,7 @@ class RAGFlowPdfParser: self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in enumerate(self.pdf.pages[page_from:page_to])] try: - self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]] + self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]] except Exception as e: logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}") self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead. diff --git a/deepdoc/vision/recognizer.py b/deepdoc/vision/recognizer.py index f6052f33b..a1d88693f 100644 --- a/deepdoc/vision/recognizer.py +++ b/deepdoc/vision/recognizer.py @@ -19,7 +19,7 @@ import os import math import numpy as np import cv2 -from copy import deepcopy +from functools import cmp_to_key import onnxruntime as ort from huggingface_hub import snapshot_download @@ -99,30 +99,22 @@ class Recognizer(object): @staticmethod def sort_Y_firstly(arr, threashold): - # sort using y1 first and then x1 - arr = sorted(arr, key=lambda r: (r["top"], r["x0"])) - for i in range(len(arr) - 1): - for j in range(i, -1, -1): - # restore the order using th - if abs(arr[j + 1]["top"] - arr[j]["top"]) < threashold \ - and arr[j + 1]["x0"] < arr[j]["x0"]: - tmp = deepcopy(arr[j]) - arr[j] = deepcopy(arr[j + 1]) - arr[j + 1] = deepcopy(tmp) + def cmp(c1, c2): + diff = c1["top"] - c2["top"] + if abs(diff) < threashold: + diff = c1["x0"] - c2["x0"] + return diff + arr = sorted(arr, key=cmp_to_key(cmp)) return arr @staticmethod - def sort_X_firstly(arr, threashold, copy=True): - # sort using y1 first and then x1 - arr = sorted(arr, key=lambda r: (r["x0"], r["top"])) - for i in range(len(arr) - 1): - for j in range(i, -1, -1): - # restore the order using th - if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \ - and arr[j + 1]["top"] < arr[j]["top"]: - tmp = deepcopy(arr[j]) if copy else arr[j] - arr[j] = deepcopy(arr[j + 1]) if copy else arr[j + 1] - arr[j + 1] = deepcopy(tmp) if copy else tmp + def sort_X_firstly(arr, threashold): + def cmp(c1, c2): + diff = c1["x0"] - c2["x0"] + if abs(diff) < threashold: + diff = c1["top"] - c2["top"] + return diff + arr = sorted(arr, key=cmp_to_key(cmp)) return arr @staticmethod @@ -145,8 +137,6 @@ class Recognizer(object): arr[j + 1] = tmp return arr - return sorted(arr, key=lambda r: (r.get("C", r["x0"]), r["top"])) - @staticmethod def sort_R_firstly(arr, thr=0): # sort using y1 first and then x1 diff --git a/deepdoc/vision/table_structure_recognizer.py b/deepdoc/vision/table_structure_recognizer.py index e31fd901e..7d0e1b4d6 100644 --- a/deepdoc/vision/table_structure_recognizer.py +++ b/deepdoc/vision/table_structure_recognizer.py @@ -177,7 +177,7 @@ class TableStructureRecognizer(Recognizer): colwm = np.min(colwm) if colwm else 0 crosspage = len(set([b["page_number"] for b in boxes])) > 1 if crosspage: - boxes = Recognizer.sort_X_firstly(boxes, colwm / 2, False) + boxes = Recognizer.sort_X_firstly(boxes, colwm / 2) else: boxes = Recognizer.sort_C_firstly(boxes, colwm / 2) boxes[0]["cn"] = 0