From 100b3165d8baa6a86ab9c9a0055f2fae52e7e671 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Wed, 24 Jul 2024 12:38:48 +0800 Subject: [PATCH] pypdf2 to pypdf (#1684) ### What problem does this PR solve? Replaces the PyPDF2 import with pypdf to address the reported issue "pypdf and PyPDF2 possible Infinite Loop when a comment isn't followed by a character" (#59). ### Type of change - [x] Refactoring --- deepdoc/parser/pdf_parser.py | 2 +- requirements.txt | 1 + requirements_arm.txt | 1 + requirements_dev.txt | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 29947f7d9..909b89d5c 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -23,7 +23,7 @@ import logging from PIL import Image, ImageDraw import numpy as np from timeit import default_timer as timer -from PyPDF2 import PdfReader as pdf2_read +from pypdf import PdfReader as pdf2_read from api.utils.file_utils import get_project_base_directory from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer diff --git a/requirements.txt b/requirements.txt index e50d7f86c..1752b7759 100644 --- a/requirements.txt +++ b/requirements.txt @@ -79,3 +79,4 @@ word2number==1.1 xgboost==2.1.0 xpinyin==0.7.6 zhipuai==2.0.1 +pypdf==4.3.0 diff --git a/requirements_arm.txt b/requirements_arm.txt index 3650ca538..1a8f6a137 100644 --- a/requirements_arm.txt +++ b/requirements_arm.txt @@ -153,3 +153,4 @@ groq==0.9.0 wikipedia==1.4.0 Bio==1.7.1 arxiv==2.1.3 +pypdf==4.3.0 diff --git a/requirements_dev.txt b/requirements_dev.txt index ee0829737..f518cab64 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -138,3 +138,4 @@ groq==0.9.0 wikipedia==1.4.0 Bio==1.7.1 arxiv==2.1.3 +pypdf==4.3.0