Feat: automatically repair corrupted PDF files on upload (#7693)

### What problem does this PR solve?

Make a best-effort attempt to automatically repair corrupted PDF files on upload.
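
For context, a minimal usage sketch of the new helper this PR adds in `api/utils/file_utils.py` (the sample file path is hypothetical; the real call site is `FileService.upload_document`, shown in the diff below):

```python
# Hedged sketch: exercises the read_potential_broken_pdf helper introduced by
# this PR in api/utils/file_utils.py. The sample path is hypothetical.
from api.utils.file_utils import read_potential_broken_pdf

with open("maybe_corrupted.pdf", "rb") as fh:  # hypothetical input file
    blob = fh.read()

# The helper first tries to open the bytes with pdfplumber; if that fails, it
# rewrites them with Ghostscript (gs -o out.pdf -sDEVICE=pdfwrite
# -dPDFSETTINGS=/prepress in.pdf) and returns the repaired bytes, falling back
# to the original bytes if gs is missing or the repair fails.
safe_blob = read_potential_broken_pdf(blob)
```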

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Yongteng Lei 2025-05-19 14:54:06 +08:00 committed by GitHub
parent 7df1bd4b4a
commit 0ebf05440e
5 changed files with 251 additions and 323 deletions

View File

@ -50,9 +50,9 @@ jobs:
# https://github.com/astral-sh/ruff-action
- name: Static check with Ruff
uses: astral-sh/ruff-action@v2
uses: astral-sh/ruff-action@v3
with:
version: ">=0.8.2"
version: ">=0.11.x"
args: "check"
- name: Build ragflow:nightly-slim

View File

@ -59,7 +59,8 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
apt install -y libatk-bridge2.0-0 && \
apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \
apt install -y libjemalloc-dev && \
apt install -y python3-pip pipx nginx unzip curl wget git vim less
apt install -y python3-pip pipx nginx unzip curl wget git vim less && \
apt install -y ghostscript
RUN if [ "$NEED_MIRROR" == "1" ]; then \
pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple && \

View File

@ -20,79 +20,73 @@ import re
import flask
from flask import request
from flask_login import login_required, current_user
from flask_login import current_user, login_required
from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import search
from api.db import VALID_FILE_TYPES, VALID_TASK_STATUS, FileType, TaskStatus, ParserType, FileSource
from api import settings
from api.constants import IMG_BASE64_PREFIX
from api.db import VALID_FILE_TYPES, VALID_TASK_STATUS, FileSource, FileType, ParserType, TaskStatus
from api.db.db_models import File, Task
from api.db.services import duplicate_name
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.task_service import queue_tasks
from api.db.services.user_service import UserTenantService
from api.db.services import duplicate_name
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.task_service import TaskService
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.db.services.task_service import TaskService, queue_tasks
from api.db.services.user_service import UserTenantService
from api.utils import get_uuid
from api.utils.api_utils import (
server_error_response,
get_data_error_result,
get_json_result,
server_error_response,
validate_request,
)
from api.utils import get_uuid
from api import settings
from api.utils.api_utils import get_json_result
from rag.utils.storage_factory import STORAGE_IMPL
from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
from api.utils.file_utils import filename_type, get_project_base_directory, thumbnail
from api.utils.web_utils import html2pdf, is_valid_url
from api.constants import IMG_BASE64_PREFIX
from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import search
from rag.utils.storage_factory import STORAGE_IMPL
@manager.route('/upload', methods=['POST']) # noqa: F821
@manager.route("/upload", methods=["POST"]) # noqa: F821
@login_required
@validate_request("kb_id")
def upload():
kb_id = request.form.get("kb_id")
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
if 'file' not in request.files:
return get_json_result(
data=False, message='No file part!', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
if "file" not in request.files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)
file_objs = request.files.getlist('file')
file_objs = request.files.getlist("file")
for file_obj in file_objs:
if file_obj.filename == '':
return get_json_result(
data=False, message='No file selected!', code=settings.RetCode.ARGUMENT_ERROR)
if file_obj.filename == "":
return get_json_result(data=False, message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR)
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
raise LookupError("Can't find this knowledgebase!")
err, files = FileService.upload_document(kb, file_objs, current_user.id)
if not files:
return get_json_result(data=files, message="There seems to be an issue with your file format. Please verify it is correct and not corrupted.", code=settings.RetCode.DATA_ERROR)
files = [f[0] for f in files] # remove the blob
if err:
return get_json_result(
data=files, message="\n".join(err), code=settings.RetCode.SERVER_ERROR)
return get_json_result(data=files, message="\n".join(err), code=settings.RetCode.SERVER_ERROR)
return get_json_result(data=files)
@manager.route('/web_crawl', methods=['POST']) # noqa: F821
@manager.route("/web_crawl", methods=["POST"]) # noqa: F821
@login_required
@validate_request("kb_id", "name", "url")
def web_crawl():
kb_id = request.form.get("kb_id")
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
name = request.form.get("name")
url = request.form.get("url")
if not is_valid_url(url):
return get_json_result(
data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message="The URL format is invalid", code=settings.RetCode.ARGUMENT_ERROR)
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
raise LookupError("Can't find this knowledgebase!")
@ -108,10 +102,7 @@ def web_crawl():
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
try:
filename = duplicate_name(
DocumentService.query,
name=name + ".pdf",
kb_id=kb.id)
filename = duplicate_name(DocumentService.query, name=name + ".pdf", kb_id=kb.id)
filetype = filename_type(filename)
if filetype == FileType.OTHER.value:
raise RuntimeError("This type of file has not been supported yet!")
@ -130,7 +121,7 @@ def web_crawl():
"name": filename,
"location": location,
"size": len(blob),
"thumbnail": thumbnail(filename, blob)
"thumbnail": thumbnail(filename, blob),
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
@ -147,27 +138,25 @@ def web_crawl():
return get_json_result(data=True)
@manager.route('/create', methods=['POST']) # noqa: F821
@manager.route("/create", methods=["POST"]) # noqa: F821
@login_required
@validate_request("name", "kb_id")
def create():
req = request.json
kb_id = req["kb_id"]
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
try:
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
return get_data_error_result(
message="Can't find this knowledgebase!")
return get_data_error_result(message="Can't find this knowledgebase!")
if DocumentService.query(name=req["name"], kb_id=kb_id):
return get_data_error_result(
message="Duplicated document name in the same knowledgebase.")
return get_data_error_result(message="Duplicated document name in the same knowledgebase.")
doc = DocumentService.insert({
doc = DocumentService.insert(
{
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
@ -176,29 +165,26 @@ def create():
"type": FileType.VIRTUAL,
"name": req["name"],
"location": "",
"size": 0
})
"size": 0,
}
)
return get_json_result(data=doc.to_json())
except Exception as e:
return server_error_response(e)
@manager.route('/list', methods=['POST']) # noqa: F821
@manager.route("/list", methods=["POST"]) # noqa: F821
@login_required
def list_docs():
kb_id = request.args.get("kb_id")
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
tenants = UserTenantService.query(user_id=current_user.id)
for tenant in tenants:
if KnowledgebaseService.query(
tenant_id=tenant.tenant_id, id=kb_id):
if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id):
break
else:
return get_json_result(
data=False, message='Only owner of knowledgebase authorized for this operation.',
code=settings.RetCode.OPERATING_ERROR)
return get_json_result(data=False, message="Only owner of knowledgebase authorized for this operation.", code=settings.RetCode.OPERATING_ERROR)
keywords = request.args.get("keywords", "")
page_number = int(request.args.get("page", 0))
@ -212,83 +198,67 @@ def list_docs():
if run_status:
invalid_status = {s for s in run_status if s not in VALID_TASK_STATUS}
if invalid_status:
return get_data_error_result(
message=f"Invalid filter run status conditions: {', '.join(invalid_status)}"
)
return get_data_error_result(message=f"Invalid filter run status conditions: {', '.join(invalid_status)}")
types = req.get("types", [])
if types:
invalid_types = {t for t in types if t not in VALID_FILE_TYPES}
if invalid_types:
return get_data_error_result(
message=f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}"
)
return get_data_error_result(message=f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}")
try:
docs, tol = DocumentService.get_by_kb_id(
kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types)
docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types)
for doc_item in docs:
if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX):
doc_item['thumbnail'] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
doc_item["thumbnail"] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"
return get_json_result(data={"total": tol, "docs": docs})
except Exception as e:
return server_error_response(e)
@manager.route('/infos', methods=['POST']) # noqa: F821
@manager.route("/infos", methods=["POST"]) # noqa: F821
@login_required
def docinfos():
req = request.json
doc_ids = req["doc_ids"]
for doc_id in doc_ids:
if not DocumentService.accessible(doc_id, current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
docs = DocumentService.get_by_ids(doc_ids)
return get_json_result(data=list(docs.dicts()))
@manager.route('/thumbnails', methods=['GET']) # noqa: F821
@manager.route("/thumbnails", methods=["GET"]) # noqa: F821
# @login_required
def thumbnails():
doc_ids = request.args.get("doc_ids").split(",")
if not doc_ids:
return get_json_result(
data=False, message='Lack of "Document ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "Document ID"', code=settings.RetCode.ARGUMENT_ERROR)
try:
docs = DocumentService.get_thumbnails(doc_ids)
for doc_item in docs:
if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX):
doc_item['thumbnail'] = f"/v1/document/image/{doc_item['kb_id']}-{doc_item['thumbnail']}"
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
doc_item["thumbnail"] = f"/v1/document/image/{doc_item['kb_id']}-{doc_item['thumbnail']}"
return get_json_result(data={d["id"]: d["thumbnail"] for d in docs})
except Exception as e:
return server_error_response(e)
@manager.route('/change_status', methods=['POST']) # noqa: F821
@manager.route("/change_status", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "status")
def change_status():
req = request.json
if str(req["status"]) not in ["0", "1"]:
return get_json_result(
data=False,
message='"Status" must be either 0 or 1!',
code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='"Status" must be either 0 or 1!', code=settings.RetCode.ARGUMENT_ERROR)
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
@ -296,23 +266,19 @@ def change_status():
return get_data_error_result(message="Document not found!")
e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
if not e:
return get_data_error_result(
message="Can't find this knowledgebase!")
return get_data_error_result(message="Can't find this knowledgebase!")
if not DocumentService.update_by_id(
req["doc_id"], {"status": str(req["status"])}):
return get_data_error_result(
message="Database error (Document update)!")
if not DocumentService.update_by_id(req["doc_id"], {"status": str(req["status"])}):
return get_data_error_result(message="Database error (Document update)!")
status = int(req["status"])
settings.docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status},
search.index_name(kb.tenant_id), doc.kb_id)
settings.docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id)
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
@manager.route('/rm', methods=['POST']) # noqa: F821
@manager.route("/rm", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id")
def rm():
@ -323,11 +289,7 @@ def rm():
for doc_id in doc_ids:
if not DocumentService.accessible4deletion(doc_id, current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
root_folder = FileService.get_root_folder(current_user.id)
pf_id = root_folder["id"]
@ -347,8 +309,7 @@ def rm():
TaskService.filter_delete([Task.doc_id == doc_id])
if not DocumentService.remove_document(doc, tenant_id):
return get_data_error_result(
message="Database error (Document removal)!")
return get_data_error_result(message="Database error (Document removal)!")
f2d = File2DocumentService.get_by_document_id(doc_id)
deleted_file_count = 0
@ -376,18 +337,14 @@ def rm():
return get_json_result(data=True)
@manager.route('/run', methods=['POST']) # noqa: F821
@manager.route("/run", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_ids", "run")
def run():
req = request.json
for doc_id in req["doc_ids"]:
if not DocumentService.accessible(doc_id, current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
kb_table_num_map = {}
for id in req["doc_ids"]:
@ -421,7 +378,7 @@ def run():
if kb_id not in kb_table_num_map:
count = DocumentService.count_by_kb_id(kb_id=kb_id, keywords="", run_status=[TaskStatus.DONE], types=[])
kb_table_num_map[kb_id] = count
if kb_table_num_map[kb_id] <=0:
if kb_table_num_map[kb_id] <= 0:
KnowledgebaseService.delete_field_map(kb_id)
bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
queue_tasks(doc, bucket, name, 0)
@ -431,36 +388,25 @@ def run():
return server_error_response(e)
@manager.route('/rename', methods=['POST']) # noqa: F821
@manager.route("/rename", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "name")
def rename():
req = request.json
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
doc.name.lower()).suffix:
return get_json_result(
data=False,
message="The extension of file can't be changed",
code=settings.RetCode.ARGUMENT_ERROR)
if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
return get_json_result(data=False, message="The extension of file can't be changed", code=settings.RetCode.ARGUMENT_ERROR)
for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
if d.name == req["name"]:
return get_data_error_result(
message="Duplicated document name in the same knowledgebase.")
return get_data_error_result(message="Duplicated document name in the same knowledgebase.")
if not DocumentService.update_by_id(
req["doc_id"], {"name": req["name"]}):
return get_data_error_result(
message="Database error (Document rename)!")
if not DocumentService.update_by_id(req["doc_id"], {"name": req["name"]}):
return get_data_error_result(message="Database error (Document rename)!")
informs = File2DocumentService.get_by_document_id(req["doc_id"])
if informs:
@ -472,7 +418,7 @@ def rename():
return server_error_response(e)
@manager.route('/get/<doc_id>', methods=['GET']) # noqa: F821
@manager.route("/get/<doc_id>", methods=["GET"]) # noqa: F821
# @login_required
def get(doc_id):
try:
@ -486,29 +432,22 @@ def get(doc_id):
ext = re.search(r"\.([^.]+)$", doc.name)
if ext:
if doc.type == FileType.VISUAL.value:
response.headers.set('Content-Type', 'image/%s' % ext.group(1))
response.headers.set("Content-Type", "image/%s" % ext.group(1))
else:
response.headers.set(
'Content-Type',
'application/%s' %
ext.group(1))
response.headers.set("Content-Type", "application/%s" % ext.group(1))
return response
except Exception as e:
return server_error_response(e)
@manager.route('/change_parser', methods=['POST']) # noqa: F821
@manager.route("/change_parser", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "parser_id")
def change_parser():
req = request.json
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
@ -520,21 +459,16 @@ def change_parser():
else:
return get_json_result(data=True)
if ((doc.type == FileType.VISUAL and req["parser_id"] != "picture")
or (re.search(
r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")):
if (doc.type == FileType.VISUAL and req["parser_id"] != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation"):
return get_data_error_result(message="Not supported yet!")
e = DocumentService.update_by_id(doc.id,
{"parser_id": req["parser_id"], "progress": 0, "progress_msg": "",
"run": TaskStatus.UNSTART.value})
e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": TaskStatus.UNSTART.value})
if not e:
return get_data_error_result(message="Document not found!")
if "parser_config" in req:
DocumentService.update_parser_config(doc.id, req["parser_config"])
if doc.token_num > 0:
e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
doc.process_duation * -1)
e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duation * -1)
if not e:
return get_data_error_result(message="Document not found!")
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
@ -548,7 +482,7 @@ def change_parser():
return server_error_response(e)
@manager.route('/image/<image_id>', methods=['GET']) # noqa: F821
@manager.route("/image/<image_id>", methods=["GET"]) # noqa: F821
# @login_required
def get_image(image_id):
try:
@ -557,53 +491,46 @@ def get_image(image_id):
return get_data_error_result(message="Image not found.")
bkt, nm = image_id.split("-")
response = flask.make_response(STORAGE_IMPL.get(bkt, nm))
response.headers.set('Content-Type', 'image/JPEG')
response.headers.set("Content-Type", "image/JPEG")
return response
except Exception as e:
return server_error_response(e)
@manager.route('/upload_and_parse', methods=['POST']) # noqa: F821
@manager.route("/upload_and_parse", methods=["POST"]) # noqa: F821
@login_required
@validate_request("conversation_id")
def upload_and_parse():
if 'file' not in request.files:
return get_json_result(
data=False, message='No file part!', code=settings.RetCode.ARGUMENT_ERROR)
if "file" not in request.files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)
file_objs = request.files.getlist('file')
file_objs = request.files.getlist("file")
for file_obj in file_objs:
if file_obj.filename == '':
return get_json_result(
data=False, message='No file selected!', code=settings.RetCode.ARGUMENT_ERROR)
if file_obj.filename == "":
return get_json_result(data=False, message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR)
doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)
return get_json_result(data=doc_ids)
@manager.route('/parse', methods=['POST']) # noqa: F821
@manager.route("/parse", methods=["POST"]) # noqa: F821
@login_required
def parse():
url = request.json.get("url") if request.json else ""
if url:
if not is_valid_url(url):
return get_json_result(
data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message="The URL format is invalid", code=settings.RetCode.ARGUMENT_ERROR)
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
os.makedirs(download_path, exist_ok=True)
from seleniumwire.webdriver import Chrome, ChromeOptions
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_experimental_option('prefs', {
'download.default_directory': download_path,
'download.prompt_for_download': False,
'download.directory_upgrade': True,
'safebrowsing.enabled': True
})
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_experimental_option("prefs", {"download.default_directory": download_path, "download.prompt_for_download": False, "download.directory_upgrade": True, "safebrowsing.enabled": True})
driver = Chrome(options=options)
driver.get(url)
res_headers = [r.response.headers for r in driver.requests if r and r.response]
@ -626,51 +553,41 @@ def parse():
r = re.search(r"filename=\"([^\"]+)\"", str(res_headers))
if not r or not r.group(1):
return get_json_result(
data=False, message="Can't not identify downloaded file", code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message="Can't not identify downloaded file", code=settings.RetCode.ARGUMENT_ERROR)
f = File(r.group(1), os.path.join(download_path, r.group(1)))
txt = FileService.parse_docs([f], current_user.id)
return get_json_result(data=txt)
if 'file' not in request.files:
return get_json_result(
data=False, message='No file part!', code=settings.RetCode.ARGUMENT_ERROR)
if "file" not in request.files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)
file_objs = request.files.getlist('file')
file_objs = request.files.getlist("file")
txt = FileService.parse_docs(file_objs, current_user.id)
return get_json_result(data=txt)
@manager.route('/set_meta', methods=['POST']) # noqa: F821
@manager.route("/set_meta", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "meta")
def set_meta():
req = request.json
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
meta = json.loads(req["meta"])
except Exception as e:
return get_json_result(
data=False, message=f'Json syntax error: {e}', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message=f"Json syntax error: {e}", code=settings.RetCode.ARGUMENT_ERROR)
if not isinstance(meta, dict):
return get_json_result(
data=False, message='Meta data should be in Json map format, like {"key": "value"}', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Meta data should be in Json map format, like {"key": "value"}', code=settings.RetCode.ARGUMENT_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if not DocumentService.update_by_id(
req["doc_id"], {"meta_fields": meta}):
return get_data_error_result(
message="Database error (meta updates)!")
if not DocumentService.update_by_id(req["doc_id"], {"meta_fields": meta}):
return get_data_error_result(message="Database error (meta updates)!")
return get_json_result(data=True)
except Exception as e:

View File

@ -14,22 +14,21 @@
# limitations under the License.
#
import logging
import re
import os
import re
from concurrent.futures import ThreadPoolExecutor
from flask_login import current_user
from peewee import fn
from api.db import FileType, KNOWLEDGEBASE_FOLDER_NAME, FileSource, ParserType
from api.db.db_models import DB, File2Document, Knowledgebase
from api.db.db_models import File, Document
from api.db import KNOWLEDGEBASE_FOLDER_NAME, FileSource, FileType, ParserType
from api.db.db_models import DB, Document, File, File2Document, Knowledgebase
from api.db.services import duplicate_name
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.utils import get_uuid
from api.utils.file_utils import filename_type, thumbnail_img
from api.utils.file_utils import filename_type, read_potential_broken_pdf, thumbnail_img
from rag.utils.storage_factory import STORAGE_IMPL
@ -39,8 +38,7 @@ class FileService(CommonService):
@classmethod
@DB.connection_context()
def get_by_pf_id(cls, tenant_id, pf_id, page_number, items_per_page,
orderby, desc, keywords):
def get_by_pf_id(cls, tenant_id, pf_id, page_number, items_per_page, orderby, desc, keywords):
# Get files by parent folder ID with pagination and filtering
# Args:
# tenant_id: ID of the tenant
@ -53,17 +51,9 @@ class FileService(CommonService):
# Returns:
# Tuple of (file_list, total_count)
if keywords:
files = cls.model.select().where(
(cls.model.tenant_id == tenant_id),
(cls.model.parent_id == pf_id),
(fn.LOWER(cls.model.name).contains(keywords.lower())),
~(cls.model.id == pf_id)
)
files = cls.model.select().where((cls.model.tenant_id == tenant_id), (cls.model.parent_id == pf_id), (fn.LOWER(cls.model.name).contains(keywords.lower())), ~(cls.model.id == pf_id))
else:
files = cls.model.select().where((cls.model.tenant_id == tenant_id),
(cls.model.parent_id == pf_id),
~(cls.model.id == pf_id)
)
files = cls.model.select().where((cls.model.tenant_id == tenant_id), (cls.model.parent_id == pf_id), ~(cls.model.id == pf_id))
count = files.count()
if desc:
files = files.order_by(cls.model.getter_by(orderby).desc())
@ -76,16 +66,20 @@ class FileService(CommonService):
for file in res_files:
if file["type"] == FileType.FOLDER.value:
file["size"] = cls.get_folder_size(file["id"])
file['kbs_info'] = []
children = list(cls.model.select().where(
file["kbs_info"] = []
children = list(
cls.model.select()
.where(
(cls.model.tenant_id == tenant_id),
(cls.model.parent_id == file["id"]),
~(cls.model.id == file["id"]),
).dicts())
)
.dicts()
)
file["has_child_folder"] = any(value["type"] == FileType.FOLDER.value for value in children)
continue
kbs_info = cls.get_kb_id_by_file_id(file['id'])
file['kbs_info'] = kbs_info
kbs_info = cls.get_kb_id_by_file_id(file["id"])
file["kbs_info"] = kbs_info
return res_files, count
@ -97,16 +91,18 @@ class FileService(CommonService):
# file_id: File ID
# Returns:
# List of dictionaries containing knowledge base IDs and names
kbs = (cls.model.select(*[Knowledgebase.id, Knowledgebase.name])
kbs = (
cls.model.select(*[Knowledgebase.id, Knowledgebase.name])
.join(File2Document, on=(File2Document.file_id == file_id))
.join(Document, on=(File2Document.document_id == Document.id))
.join(Knowledgebase, on=(Knowledgebase.id == Document.kb_id))
.where(cls.model.id == file_id))
.where(cls.model.id == file_id)
)
if not kbs:
return []
kbs_info_list = []
for kb in list(kbs.dicts()):
kbs_info_list.append({"kb_id": kb['id'], "kb_name": kb['name']})
kbs_info_list.append({"kb_id": kb["id"], "kb_name": kb["name"]})
return kbs_info_list
@classmethod
@ -178,16 +174,9 @@ class FileService(CommonService):
if count > len(name) - 2:
return file
else:
file = cls.insert({
"id": get_uuid(),
"parent_id": parent_id,
"tenant_id": current_user.id,
"created_by": current_user.id,
"name": name[count],
"location": "",
"size": 0,
"type": FileType.FOLDER.value
})
file = cls.insert(
{"id": get_uuid(), "parent_id": parent_id, "tenant_id": current_user.id, "created_by": current_user.id, "name": name[count], "location": "", "size": 0, "type": FileType.FOLDER.value}
)
return cls.create_folder(file, file.id, name, count + 1)
@classmethod
@ -212,9 +201,7 @@ class FileService(CommonService):
# tenant_id: Tenant ID
# Returns:
# Root folder dictionary
for file in cls.model.select().where((cls.model.tenant_id == tenant_id),
(cls.model.parent_id == cls.model.id)
):
for file in cls.model.select().where((cls.model.tenant_id == tenant_id), (cls.model.parent_id == cls.model.id)):
return file.to_dict()
file_id = get_uuid()
@ -239,11 +226,8 @@ class FileService(CommonService):
# tenant_id: Tenant ID
# Returns:
# Knowledge base folder dictionary
for root in cls.model.select().where(
(cls.model.tenant_id == tenant_id), (cls.model.parent_id == cls.model.id)):
for folder in cls.model.select().where(
(cls.model.tenant_id == tenant_id), (cls.model.parent_id == root.id),
(cls.model.name == KNOWLEDGEBASE_FOLDER_NAME)):
for root in cls.model.select().where((cls.model.tenant_id == tenant_id), (cls.model.parent_id == cls.model.id)):
for folder in cls.model.select().where((cls.model.tenant_id == tenant_id), (cls.model.parent_id == root.id), (cls.model.name == KNOWLEDGEBASE_FOLDER_NAME)):
return folder.to_dict()
assert False, "Can't find the KB folder. Database init error."
@ -271,7 +255,7 @@ class FileService(CommonService):
"type": ty,
"size": size,
"location": location,
"source_type": FileSource.KNOWLEDGEBASE
"source_type": FileSource.KNOWLEDGEBASE,
}
cls.save(**file)
return file
@ -283,12 +267,11 @@ class FileService(CommonService):
# Args:
# root_id: Root folder ID
# tenant_id: Tenant ID
for _ in cls.model.select().where((cls.model.name == KNOWLEDGEBASE_FOLDER_NAME)\
& (cls.model.parent_id == root_id)):
for _ in cls.model.select().where((cls.model.name == KNOWLEDGEBASE_FOLDER_NAME) & (cls.model.parent_id == root_id)):
return
folder = cls.new_a_file_from_kb(tenant_id, KNOWLEDGEBASE_FOLDER_NAME, root_id)
for kb in Knowledgebase.select(*[Knowledgebase.id, Knowledgebase.name]).where(Knowledgebase.tenant_id==tenant_id):
for kb in Knowledgebase.select(*[Knowledgebase.id, Knowledgebase.name]).where(Knowledgebase.tenant_id == tenant_id):
kb_folder = cls.new_a_file_from_kb(tenant_id, kb.name, folder["id"])
for doc in DocumentService.query(kb_id=kb.id):
FileService.add_file_from_kb(doc.to_dict(), kb_folder["id"], tenant_id)
@ -357,12 +340,10 @@ class FileService(CommonService):
@DB.connection_context()
def delete_folder_by_pf_id(cls, user_id, folder_id):
try:
files = cls.model.select().where((cls.model.tenant_id == user_id)
& (cls.model.parent_id == folder_id))
files = cls.model.select().where((cls.model.tenant_id == user_id) & (cls.model.parent_id == folder_id))
for file in files:
cls.delete_folder_by_pf_id(user_id, file.id)
return cls.model.delete().where((cls.model.tenant_id == user_id)
& (cls.model.id == folder_id)).execute(),
return (cls.model.delete().where((cls.model.tenant_id == user_id) & (cls.model.id == folder_id)).execute(),)
except Exception:
logging.exception("delete_folder_by_pf_id")
raise RuntimeError("Database error (File retrieval)!")
@ -380,8 +361,7 @@ class FileService(CommonService):
def dfs(parent_id):
nonlocal size
for f in cls.model.select(*[cls.model.id, cls.model.size, cls.model.type]).where(
cls.model.parent_id == parent_id, cls.model.id != parent_id):
for f in cls.model.select(*[cls.model.id, cls.model.size, cls.model.type]).where(cls.model.parent_id == parent_id, cls.model.id != parent_id):
size += f.size
if f.type == FileType.FOLDER.value:
dfs(f.id)
@ -403,7 +383,7 @@ class FileService(CommonService):
"type": doc["type"],
"size": doc["size"],
"location": doc["location"],
"source_type": FileSource.KNOWLEDGEBASE
"source_type": FileSource.KNOWLEDGEBASE,
}
cls.save(**file)
File2DocumentService.save(**{"id": get_uuid(), "file_id": file["id"], "document_id": doc["id"]})
@ -412,7 +392,7 @@ class FileService(CommonService):
@DB.connection_context()
def move_file(cls, file_ids, folder_id):
try:
cls.filter_update((cls.model.id << file_ids, ), { 'parent_id': folder_id })
cls.filter_update((cls.model.id << file_ids,), {"parent_id": folder_id})
except Exception:
logging.exception("move_file")
raise RuntimeError("Database error (File move)!")
@ -429,16 +409,13 @@ class FileService(CommonService):
err, files = [], []
for file in file_objs:
try:
MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
MAX_FILE_NUM_PER_USER = int(os.environ.get("MAX_FILE_NUM_PER_USER", 0))
if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(kb.tenant_id) >= MAX_FILE_NUM_PER_USER:
raise RuntimeError("Exceed the maximum file number of a free user!")
if len(file.filename.encode("utf-8")) >= 128:
raise RuntimeError("Exceed the maximum length of file name!")
filename = duplicate_name(
DocumentService.query,
name=file.filename,
kb_id=kb.id)
filename = duplicate_name(DocumentService.query, name=file.filename, kb_id=kb.id)
filetype = filename_type(filename)
if filetype == FileType.OTHER.value:
raise RuntimeError("This type of file has not been supported yet!")
@ -446,15 +423,18 @@ class FileService(CommonService):
location = filename
while STORAGE_IMPL.obj_exist(kb.id, location):
location += "_"
blob = file.read()
if filetype == FileType.PDF.value:
blob = read_potential_broken_pdf(blob)
STORAGE_IMPL.put(kb.id, location, blob)
doc_id = get_uuid()
img = thumbnail_img(filename, blob)
thumbnail_location = ''
thumbnail_location = ""
if img is not None:
thumbnail_location = f'thumbnail_{doc_id}.png'
thumbnail_location = f"thumbnail_{doc_id}.png"
STORAGE_IMPL.put(kb.id, thumbnail_location, img)
doc = {
@ -467,7 +447,7 @@ class FileService(CommonService):
"name": filename,
"location": location,
"size": len(blob),
"thumbnail": thumbnail_location
"thumbnail": thumbnail_location,
}
DocumentService.insert(doc)
@ -480,29 +460,17 @@ class FileService(CommonService):
@staticmethod
def parse_docs(file_objs, user_id):
from rag.app import presentation, picture, naive, audio, email
from rag.app import audio, email, naive, picture, presentation
def dummy(prog=None, msg=""):
pass
FACTORY = {
ParserType.PRESENTATION.value: presentation,
ParserType.PICTURE.value: picture,
ParserType.AUDIO.value: audio,
ParserType.EMAIL.value: email
}
FACTORY = {ParserType.PRESENTATION.value: presentation, ParserType.PICTURE.value: picture, ParserType.AUDIO.value: audio, ParserType.EMAIL.value: email}
parser_config = {"chunk_token_num": 16096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text"}
exe = ThreadPoolExecutor(max_workers=12)
threads = []
for file in file_objs:
kwargs = {
"lang": "English",
"callback": dummy,
"parser_config": parser_config,
"from_page": 0,
"to_page": 100000,
"tenant_id": user_id
}
kwargs = {"lang": "English", "callback": dummy, "parser_config": parser_config, "from_page": 0, "to_page": 100000, "tenant_id": user_id}
filetype = filename_type(file.filename)
blob = file.read()
threads.append(exe.submit(FACTORY.get(FileService.get_parser(filetype, file.filename, ""), naive).chunk, file.filename, blob, **kwargs))
@ -524,3 +492,4 @@ class FileService(CommonService):
if re.search(r"\.(eml)$", filename):
return ParserType.EMAIL.value
return default

View File

@ -17,17 +17,20 @@ import base64
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
from io import BytesIO
import pdfplumber
from PIL import Image
from cachetools import LRUCache, cached
from PIL import Image
from ruamel.yaml import YAML
from api.db import FileType
from api.constants import IMG_BASE64_PREFIX
from api.db import FileType
PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE")
RAG_BASE = os.getenv("RAG_BASE")
@ -74,7 +77,7 @@ def get_rag_python_directory(*args):
def get_home_cache_dir():
dir = os.path.join(os.path.expanduser('~'), ".ragflow")
dir = os.path.join(os.path.expanduser("~"), ".ragflow")
try:
os.mkdir(dir)
except OSError:
@ -92,9 +95,7 @@ def load_json_conf(conf_path):
with open(json_conf_path) as f:
return json.load(f)
except BaseException:
raise EnvironmentError(
"loading json file config from '{}' failed!".format(json_conf_path)
)
raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))
def dump_json_conf(config_data, conf_path):
@ -106,9 +107,7 @@ def dump_json_conf(config_data, conf_path):
with open(json_conf_path, "w") as f:
json.dump(config_data, f, indent=4)
except BaseException:
raise EnvironmentError(
"loading json file config from '{}' failed!".format(json_conf_path)
)
raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))
def load_json_conf_real_time(conf_path):
@ -120,9 +119,7 @@ def load_json_conf_real_time(conf_path):
with open(json_conf_path) as f:
return json.load(f)
except BaseException:
raise EnvironmentError(
"loading json file config from '{}' failed!".format(json_conf_path)
)
raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))
def load_yaml_conf(conf_path):
@ -130,12 +127,10 @@ def load_yaml_conf(conf_path):
conf_path = os.path.join(get_project_base_directory(), conf_path)
try:
with open(conf_path) as f:
yaml = YAML(typ='safe', pure=True)
yaml = YAML(typ="safe", pure=True)
return yaml.load(f)
except Exception as e:
raise EnvironmentError(
"loading yaml file config from {} failed:".format(conf_path), e
)
raise EnvironmentError("loading yaml file config from {} failed:".format(conf_path), e)
def rewrite_yaml_conf(conf_path, config):
@ -146,13 +141,11 @@ def rewrite_yaml_conf(conf_path, config):
yaml = YAML(typ="safe")
yaml.dump(config, f)
except Exception as e:
raise EnvironmentError(
"rewrite yaml file config {} failed:".format(conf_path), e
)
raise EnvironmentError("rewrite yaml file config {} failed:".format(conf_path), e)
def rewrite_json_file(filepath, json_data):
with open(filepath, "w", encoding='utf-8') as f:
with open(filepath, "w", encoding="utf-8") as f:
json.dump(json_data, f, indent=4, separators=(",", ": "))
f.close()
@ -162,12 +155,10 @@ def filename_type(filename):
if re.match(r".*\.pdf$", filename):
return FileType.PDF.value
if re.match(
r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
if re.match(r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
return FileType.DOC.value
if re.match(
r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename):
if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename):
return FileType.AURAL.value
if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
@ -175,6 +166,7 @@ def filename_type(filename):
return FileType.OTHER.value
def thumbnail_img(filename, blob):
"""
MySQL LongText max length is 65535
@ -183,6 +175,7 @@ def thumbnail_img(filename, blob):
if re.match(r".*\.pdf$", filename):
with sys.modules[LOCK_KEY_pdfplumber]:
pdf = pdfplumber.open(BytesIO(blob))
buffered = BytesIO()
resolution = 32
img = None
@ -206,8 +199,9 @@ def thumbnail_img(filename, blob):
return buffered.getvalue()
elif re.match(r".*\.(ppt|pptx)$", filename):
import aspose.slides as slides
import aspose.pydrawing as drawing
import aspose.slides as slides
try:
with slides.Presentation(BytesIO(blob)) as presentation:
buffered = BytesIO()
@ -215,8 +209,7 @@ def thumbnail_img(filename, blob):
img = None
for _ in range(10):
# https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float
presentation.slides[0].get_thumbnail(scale, scale).save(
buffered, drawing.imaging.ImageFormat.png)
presentation.slides[0].get_thumbnail(scale, scale).save(buffered, drawing.imaging.ImageFormat.png)
img = buffered.getvalue()
if len(img) >= 64000:
scale = scale / 2.0
@ -232,10 +225,9 @@ def thumbnail_img(filename, blob):
def thumbnail(filename, blob):
img = thumbnail_img(filename, blob)
if img is not None:
return IMG_BASE64_PREFIX + \
base64.b64encode(img).decode("utf-8")
return IMG_BASE64_PREFIX + base64.b64encode(img).decode("utf-8")
else:
return ''
return ""
def traversal_files(base):
@ -243,3 +235,52 @@ def traversal_files(base):
for f in fs:
fullname = os.path.join(root, f)
yield fullname
def repair_pdf_with_ghostscript(input_bytes):
if shutil.which("gs") is None:
return input_bytes
with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_in, tempfile.NamedTemporaryFile(suffix=".pdf") as temp_out:
temp_in.write(input_bytes)
temp_in.flush()
cmd = [
"gs",
"-o",
temp_out.name,
"-sDEVICE=pdfwrite",
"-dPDFSETTINGS=/prepress",
temp_in.name,
]
try:
proc = subprocess.run(cmd, capture_output=True, text=True)
if proc.returncode != 0:
return input_bytes
except Exception:
return input_bytes
temp_out.seek(0)
repaired_bytes = temp_out.read()
return repaired_bytes
def read_potential_broken_pdf(blob):
def try_open(blob):
try:
with pdfplumber.open(BytesIO(blob)) as pdf:
if pdf.pages:
return True
except Exception:
return False
return False
if try_open(blob):
return blob
repaired = repair_pdf_with_ghostscript(blob)
if try_open(repaired):
return repaired
return blob