mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-04-17 19:50:01 +08:00
Fix RAGFlowPdfParser AttributeError: 'PdfReader' object has no attribute 'close' (#6859)
I use PdfParser locally (following this example: https://github.com/infiniflow/ragflow/blob/main/rag/app/paper.py) like this:

```
import re
import openpyxl
from ragflow.api.db import ParserType
from ragflow.rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, \
    title_frequency, \
    tokenize_chunks
from ragflow.rag.utils import num_tokens_from_string
from ragflow.deepdoc.parser import PdfParser, ExcelParser, DocxParser,PlainParser


def logger(prog=None, msg=""):
    print(msg)


class Pdf(PdfParser):
    def __init__(self):
        self.model_speciess = ParserType.MANUAL.value
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
        from timeit import default_timer as timer
        start = timer()
        callback(msg="OCR is running...")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished.")
        print("OCR:", timer() - start)
        self._layouts_rec(zoomin)
        callback(0.65, "Layout analysis finished.")
        print("layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.67, "Table analysis finished.")
        self._text_merge()
        tbls = self._extract_table_figure(True, zoomin, True, True)
        self._concat_downward()
        self._filter_forpages()
        callback(0.68, "Text merging finished")

        # clean mess
        for b in self.boxes:
            b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())

        return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin))
                for i, b in enumerate(self.boxes)], tbls
```

It fails with an error like this:

```
File "xxxxx/third_party/ragflow/deepdoc/parser/pdf_parser.py", line 1039, in __images__
    self.pdf.close()
AttributeError: 'PdfReader' object has no attribute 'close'
```

I found that the ragflow source code first uses `pdfplumber.open` (https://github.com/infiniflow/ragflow/blob/main/deepdoc/parser/pdf_parser.py#L1007C28-L1007C43) and then replaces `self.pdf` with a `pdf2_read` object (`from pypdf import PdfReader as pdf2_read`) at line 1024
(https://github.com/infiniflow/ragflow/blob/main/deepdoc/parser/pdf_parser.py#L1024):

```
self.pdf = pdf2_read
```

---

I also found that `pdfplumber` can be used in this way:

```
file_path="xxx.pdf"
res = pdfplumber.open(file_path)
res.close()
```

but the `pypdf.PdfReader` source code does not have a `close` method; the source code handles the file like this:

```
with open(stream, "rb") as fh:
    stream = BytesIO(fh.read())
self._stream_opened = True
```

> https://github.com/py-pdf/pypdf/blob/main/pypdf/_reader.py#L156

So I moved the `self.pdf.close` call, which fixed this problem. Hoping this helps the project 😊
This commit is contained in:
parent
b70abe52b2
commit
53c653b099
@ -1004,38 +1004,42 @@ class RAGFlowPdfParser:
|
||||
start = timer()
|
||||
try:
|
||||
with sys.modules[LOCK_KEY_pdfplumber]:
|
||||
self.pdf = pdfplumber.open(fnm) if isinstance(
|
||||
fnm, str) else pdfplumber.open(BytesIO(fnm))
|
||||
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
||||
enumerate(self.pdf.pages[page_from:page_to])]
|
||||
try:
|
||||
self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
|
||||
self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead.
|
||||
with (pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))) as pdf:
|
||||
self.pdf = pdf
|
||||
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
||||
enumerate(self.pdf.pages[page_from:page_to])]
|
||||
|
||||
try:
|
||||
self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
|
||||
self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead.
|
||||
|
||||
self.total_page = len(self.pdf.pages)
|
||||
|
||||
self.total_page = len(self.pdf.pages)
|
||||
except Exception:
|
||||
logging.exception("RAGFlowPdfParser __images__")
|
||||
logging.info(f"__images__ dedupe_chars cost {timer() - start}s")
|
||||
|
||||
self.outlines = []
|
||||
try:
|
||||
self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
|
||||
outlines = self.pdf.outline
|
||||
with (pdf2_read(fnm if isinstance(fnm, str)
|
||||
else BytesIO(fnm))) as pdf:
|
||||
self.pdf = pdf
|
||||
|
||||
def dfs(arr, depth):
|
||||
for a in arr:
|
||||
if isinstance(a, dict):
|
||||
self.outlines.append((a["/Title"], depth))
|
||||
continue
|
||||
dfs(a, depth + 1)
|
||||
outlines = self.pdf.outline
|
||||
def dfs(arr, depth):
|
||||
for a in arr:
|
||||
if isinstance(a, dict):
|
||||
self.outlines.append((a["/Title"], depth))
|
||||
continue
|
||||
dfs(a, depth + 1)
|
||||
|
||||
dfs(outlines, 0)
|
||||
|
||||
dfs(outlines, 0)
|
||||
except Exception as e:
|
||||
logging.warning(f"Outlines exception: {e}")
|
||||
finally:
|
||||
self.pdf.close()
|
||||
|
||||
if not self.outlines:
|
||||
logging.warning("Miss outlines")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user