Feat: repair corrupted PDF files on upload automatically (#7693)

### What problem does this PR solve?

Automatically try to repair corrupted PDF files on upload (best effort). A condensed sketch of the repair flow is shown below.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
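A condensed sketch of the repair flow this PR adds (names taken from the diff below; logging, the pdfplumber lock, and Ghostscript stderr handling are trimmed): `FileService.upload_document` now passes every uploaded PDF through `read_potential_broken_pdf` (in `api/utils/file_utils.py`), which keeps the original bytes when pdfplumber can already open them, otherwise rewrites the file with Ghostscript and keeps the repaired bytes only if they open cleanly.

```python
# Sketch only: condensed from the api/utils/file_utils.py changes in this commit.
# Requires the Ghostscript binary ("gs"), which the Dockerfile change below installs.
import shutil
import subprocess
import tempfile
from io import BytesIO

import pdfplumber


def repair_pdf_with_ghostscript(input_bytes: bytes) -> bytes:
    if shutil.which("gs") is None:  # no Ghostscript available: return the blob unchanged
        return input_bytes
    with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_in, tempfile.NamedTemporaryFile(suffix=".pdf") as temp_out:
        temp_in.write(input_bytes)
        temp_in.flush()
        # Rewrite the PDF; Ghostscript tolerates many kinds of structural corruption.
        cmd = ["gs", "-o", temp_out.name, "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/prepress", temp_in.name]
        try:
            if subprocess.run(cmd, capture_output=True, text=True).returncode != 0:
                return input_bytes
        except Exception:
            return input_bytes
        temp_out.seek(0)
        return temp_out.read()


def read_potential_broken_pdf(blob: bytes) -> bytes:
    def try_open(candidate: bytes) -> bool:
        try:
            with pdfplumber.open(BytesIO(candidate)) as pdf:
                return bool(pdf.pages)
        except Exception:
            return False

    if try_open(blob):
        return blob  # healthy PDF: store as-is
    repaired = repair_pdf_with_ghostscript(blob)
    return repaired if try_open(repaired) else blob  # fall back to the original bytes
```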
This commit is contained in:
parent 7df1bd4b4a
commit 0ebf05440e
4 .github/workflows/tests.yml (vendored)
@@ -50,9 +50,9 @@ jobs:

# https://github.com/astral-sh/ruff-action
- name: Static check with Ruff
uses: astral-sh/ruff-action@v2
uses: astral-sh/ruff-action@v3
with:
version: ">=0.8.2"
version: ">=0.11.x"
args: "check"

- name: Build ragflow:nightly-slim
@@ -59,7 +59,8 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
apt install -y libatk-bridge2.0-0 && \
apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \
apt install -y libjemalloc-dev && \
apt install -y python3-pip pipx nginx unzip curl wget git vim less
apt install -y python3-pip pipx nginx unzip curl wget git vim less && \
apt install -y ghostscript

RUN if [ "$NEED_MIRROR" == "1" ]; then \
pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple && \
@@ -20,79 +20,73 @@ import re

import flask
from flask import request
from flask_login import login_required, current_user
from flask_login import current_user, login_required

from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import search

from api.db import VALID_FILE_TYPES, VALID_TASK_STATUS, FileType, TaskStatus, ParserType, FileSource
from api import settings
from api.constants import IMG_BASE64_PREFIX
from api.db import VALID_FILE_TYPES, VALID_TASK_STATUS, FileSource, FileType, ParserType, TaskStatus
from api.db.db_models import File, Task
from api.db.services import duplicate_name
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.task_service import queue_tasks
from api.db.services.user_service import UserTenantService
from api.db.services import duplicate_name
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.task_service import TaskService
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.db.services.task_service import TaskService, queue_tasks
from api.db.services.user_service import UserTenantService
from api.utils import get_uuid
from api.utils.api_utils import (
server_error_response,
get_data_error_result,
get_json_result,
server_error_response,
validate_request,
)
from api.utils import get_uuid
from api import settings
from api.utils.api_utils import get_json_result
from rag.utils.storage_factory import STORAGE_IMPL
from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
from api.utils.file_utils import filename_type, get_project_base_directory, thumbnail
from api.utils.web_utils import html2pdf, is_valid_url
from api.constants import IMG_BASE64_PREFIX
from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import search
from rag.utils.storage_factory import STORAGE_IMPL


@manager.route('/upload', methods=['POST']) # noqa: F821
@manager.route("/upload", methods=["POST"]) # noqa: F821
@login_required
@validate_request("kb_id")
def upload():
kb_id = request.form.get("kb_id")
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
if 'file' not in request.files:
return get_json_result(
data=False, message='No file part!', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
if "file" not in request.files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)

file_objs = request.files.getlist('file')
file_objs = request.files.getlist("file")
for file_obj in file_objs:
if file_obj.filename == '':
return get_json_result(
data=False, message='No file selected!', code=settings.RetCode.ARGUMENT_ERROR)
if file_obj.filename == "":
return get_json_result(data=False, message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR)

e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
raise LookupError("Can't find this knowledgebase!")

err, files = FileService.upload_document(kb, file_objs, current_user.id)

if not files:
return get_json_result(data=files, message="There seems to be an issue with your file format. Please verify it is correct and not corrupted.", code=settings.RetCode.DATA_ERROR)
files = [f[0] for f in files] # remove the blob

if err:
return get_json_result(
data=files, message="\n".join(err), code=settings.RetCode.SERVER_ERROR)
return get_json_result(data=files, message="\n".join(err), code=settings.RetCode.SERVER_ERROR)
return get_json_result(data=files)


@manager.route('/web_crawl', methods=['POST']) # noqa: F821
@manager.route("/web_crawl", methods=["POST"]) # noqa: F821
@login_required
@validate_request("kb_id", "name", "url")
def web_crawl():
kb_id = request.form.get("kb_id")
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
name = request.form.get("name")
url = request.form.get("url")
if not is_valid_url(url):
return get_json_result(
data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message="The URL format is invalid", code=settings.RetCode.ARGUMENT_ERROR)
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
raise LookupError("Can't find this knowledgebase!")
@@ -108,10 +102,7 @@ def web_crawl():
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])

try:
filename = duplicate_name(
DocumentService.query,
name=name + ".pdf",
kb_id=kb.id)
filename = duplicate_name(DocumentService.query, name=name + ".pdf", kb_id=kb.id)
filetype = filename_type(filename)
if filetype == FileType.OTHER.value:
raise RuntimeError("This type of file has not been supported yet!")
@@ -130,7 +121,7 @@ def web_crawl():
"name": filename,
"location": location,
"size": len(blob),
"thumbnail": thumbnail(filename, blob)
"thumbnail": thumbnail(filename, blob),
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
@@ -147,27 +138,25 @@ def web_crawl():
return get_json_result(data=True)


@manager.route('/create', methods=['POST']) # noqa: F821
@manager.route("/create", methods=["POST"]) # noqa: F821
@login_required
@validate_request("name", "kb_id")
def create():
req = request.json
kb_id = req["kb_id"]
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)

try:
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
return get_data_error_result(
message="Can't find this knowledgebase!")
return get_data_error_result(message="Can't find this knowledgebase!")

if DocumentService.query(name=req["name"], kb_id=kb_id):
return get_data_error_result(
message="Duplicated document name in the same knowledgebase.")
return get_data_error_result(message="Duplicated document name in the same knowledgebase.")

doc = DocumentService.insert({
doc = DocumentService.insert(
{
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
@@ -176,29 +165,26 @@ def create():
"type": FileType.VIRTUAL,
"name": req["name"],
"location": "",
"size": 0
})
"size": 0,
}
)
return get_json_result(data=doc.to_json())
except Exception as e:
return server_error_response(e)


@manager.route('/list', methods=['POST']) # noqa: F821
@manager.route("/list", methods=["POST"]) # noqa: F821
@login_required
def list_docs():
kb_id = request.args.get("kb_id")
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
tenants = UserTenantService.query(user_id=current_user.id)
for tenant in tenants:
if KnowledgebaseService.query(
tenant_id=tenant.tenant_id, id=kb_id):
if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id):
break
else:
return get_json_result(
data=False, message='Only owner of knowledgebase authorized for this operation.',
code=settings.RetCode.OPERATING_ERROR)
return get_json_result(data=False, message="Only owner of knowledgebase authorized for this operation.", code=settings.RetCode.OPERATING_ERROR)
keywords = request.args.get("keywords", "")

page_number = int(request.args.get("page", 0))
@@ -212,83 +198,67 @@ def list_docs():
if run_status:
invalid_status = {s for s in run_status if s not in VALID_TASK_STATUS}
if invalid_status:
return get_data_error_result(
message=f"Invalid filter run status conditions: {', '.join(invalid_status)}"
)
return get_data_error_result(message=f"Invalid filter run status conditions: {', '.join(invalid_status)}")

types = req.get("types", [])
if types:
invalid_types = {t for t in types if t not in VALID_FILE_TYPES}
if invalid_types:
return get_data_error_result(
message=f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}"
)
return get_data_error_result(message=f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}")

try:
docs, tol = DocumentService.get_by_kb_id(
kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types)
docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types)

for doc_item in docs:
if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX):
doc_item['thumbnail'] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
doc_item["thumbnail"] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"

return get_json_result(data={"total": tol, "docs": docs})
except Exception as e:
return server_error_response(e)


@manager.route('/infos', methods=['POST']) # noqa: F821
@manager.route("/infos", methods=["POST"]) # noqa: F821
@login_required
def docinfos():
req = request.json
doc_ids = req["doc_ids"]
for doc_id in doc_ids:
if not DocumentService.accessible(doc_id, current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
docs = DocumentService.get_by_ids(doc_ids)
return get_json_result(data=list(docs.dicts()))


@manager.route('/thumbnails', methods=['GET']) # noqa: F821
@manager.route("/thumbnails", methods=["GET"]) # noqa: F821
# @login_required
def thumbnails():
doc_ids = request.args.get("doc_ids").split(",")
if not doc_ids:
return get_json_result(
data=False, message='Lack of "Document ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "Document ID"', code=settings.RetCode.ARGUMENT_ERROR)

try:
docs = DocumentService.get_thumbnails(doc_ids)

for doc_item in docs:
if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX):
doc_item['thumbnail'] = f"/v1/document/image/{doc_item['kb_id']}-{doc_item['thumbnail']}"
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
doc_item["thumbnail"] = f"/v1/document/image/{doc_item['kb_id']}-{doc_item['thumbnail']}"

return get_json_result(data={d["id"]: d["thumbnail"] for d in docs})
except Exception as e:
return server_error_response(e)


@manager.route('/change_status', methods=['POST']) # noqa: F821
@manager.route("/change_status", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "status")
def change_status():
req = request.json
if str(req["status"]) not in ["0", "1"]:
return get_json_result(
data=False,
message='"Status" must be either 0 or 1!',
code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='"Status" must be either 0 or 1!', code=settings.RetCode.ARGUMENT_ERROR)

if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)

try:
e, doc = DocumentService.get_by_id(req["doc_id"])
@@ -296,23 +266,19 @@ def change_status():
return get_data_error_result(message="Document not found!")
e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
if not e:
return get_data_error_result(
message="Can't find this knowledgebase!")
return get_data_error_result(message="Can't find this knowledgebase!")

if not DocumentService.update_by_id(
req["doc_id"], {"status": str(req["status"])}):
return get_data_error_result(
message="Database error (Document update)!")
if not DocumentService.update_by_id(req["doc_id"], {"status": str(req["status"])}):
return get_data_error_result(message="Database error (Document update)!")

status = int(req["status"])
settings.docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status},
search.index_name(kb.tenant_id), doc.kb_id)
settings.docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id)
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)


@manager.route('/rm', methods=['POST']) # noqa: F821
@manager.route("/rm", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id")
def rm():
@@ -323,11 +289,7 @@ def rm():

for doc_id in doc_ids:
if not DocumentService.accessible4deletion(doc_id, current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)

root_folder = FileService.get_root_folder(current_user.id)
pf_id = root_folder["id"]
@@ -347,8 +309,7 @@ def rm():

TaskService.filter_delete([Task.doc_id == doc_id])
if not DocumentService.remove_document(doc, tenant_id):
return get_data_error_result(
message="Database error (Document removal)!")
return get_data_error_result(message="Database error (Document removal)!")

f2d = File2DocumentService.get_by_document_id(doc_id)
deleted_file_count = 0
@@ -376,18 +337,14 @@ def rm():
return get_json_result(data=True)


@manager.route('/run', methods=['POST']) # noqa: F821
@manager.route("/run", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_ids", "run")
def run():
req = request.json
for doc_id in req["doc_ids"]:
if not DocumentService.accessible(doc_id, current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
kb_table_num_map = {}
for id in req["doc_ids"]:
@@ -421,7 +378,7 @@ def run():
if kb_id not in kb_table_num_map:
count = DocumentService.count_by_kb_id(kb_id=kb_id, keywords="", run_status=[TaskStatus.DONE], types=[])
kb_table_num_map[kb_id] = count
if kb_table_num_map[kb_id] <=0:
if kb_table_num_map[kb_id] <= 0:
KnowledgebaseService.delete_field_map(kb_id)
bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
queue_tasks(doc, bucket, name, 0)
@@ -431,36 +388,25 @@ def run():
return server_error_response(e)


@manager.route('/rename', methods=['POST']) # noqa: F821
@manager.route("/rename", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "name")
def rename():
req = request.json
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
doc.name.lower()).suffix:
return get_json_result(
data=False,
message="The extension of file can't be changed",
code=settings.RetCode.ARGUMENT_ERROR)
if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
return get_json_result(data=False, message="The extension of file can't be changed", code=settings.RetCode.ARGUMENT_ERROR)
for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
if d.name == req["name"]:
return get_data_error_result(
message="Duplicated document name in the same knowledgebase.")
return get_data_error_result(message="Duplicated document name in the same knowledgebase.")

if not DocumentService.update_by_id(
req["doc_id"], {"name": req["name"]}):
return get_data_error_result(
message="Database error (Document rename)!")
if not DocumentService.update_by_id(req["doc_id"], {"name": req["name"]}):
return get_data_error_result(message="Database error (Document rename)!")

informs = File2DocumentService.get_by_document_id(req["doc_id"])
if informs:
@@ -472,7 +418,7 @@ def rename():
return server_error_response(e)


@manager.route('/get/<doc_id>', methods=['GET']) # noqa: F821
@manager.route("/get/<doc_id>", methods=["GET"]) # noqa: F821
# @login_required
def get(doc_id):
try:
@@ -486,29 +432,22 @@ def get(doc_id):
ext = re.search(r"\.([^.]+)$", doc.name)
if ext:
if doc.type == FileType.VISUAL.value:
response.headers.set('Content-Type', 'image/%s' % ext.group(1))
response.headers.set("Content-Type", "image/%s" % ext.group(1))
else:
response.headers.set(
'Content-Type',
'application/%s' %
ext.group(1))
response.headers.set("Content-Type", "application/%s" % ext.group(1))
return response
except Exception as e:
return server_error_response(e)


@manager.route('/change_parser', methods=['POST']) # noqa: F821
@manager.route("/change_parser", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "parser_id")
def change_parser():
req = request.json

if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
@@ -520,21 +459,16 @@ def change_parser():
else:
return get_json_result(data=True)

if ((doc.type == FileType.VISUAL and req["parser_id"] != "picture")
or (re.search(
r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")):
if (doc.type == FileType.VISUAL and req["parser_id"] != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation"):
return get_data_error_result(message="Not supported yet!")

e = DocumentService.update_by_id(doc.id,
{"parser_id": req["parser_id"], "progress": 0, "progress_msg": "",
"run": TaskStatus.UNSTART.value})
e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": TaskStatus.UNSTART.value})
if not e:
return get_data_error_result(message="Document not found!")
if "parser_config" in req:
DocumentService.update_parser_config(doc.id, req["parser_config"])
if doc.token_num > 0:
e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
doc.process_duation * -1)
e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duation * -1)
if not e:
return get_data_error_result(message="Document not found!")
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
@@ -548,7 +482,7 @@ def change_parser():
return server_error_response(e)


@manager.route('/image/<image_id>', methods=['GET']) # noqa: F821
@manager.route("/image/<image_id>", methods=["GET"]) # noqa: F821
# @login_required
def get_image(image_id):
try:
@@ -557,53 +491,46 @@ def get_image(image_id):
return get_data_error_result(message="Image not found.")
bkt, nm = image_id.split("-")
response = flask.make_response(STORAGE_IMPL.get(bkt, nm))
response.headers.set('Content-Type', 'image/JPEG')
response.headers.set("Content-Type", "image/JPEG")
return response
except Exception as e:
return server_error_response(e)


@manager.route('/upload_and_parse', methods=['POST']) # noqa: F821
@manager.route("/upload_and_parse", methods=["POST"]) # noqa: F821
@login_required
@validate_request("conversation_id")
def upload_and_parse():
if 'file' not in request.files:
return get_json_result(
data=False, message='No file part!', code=settings.RetCode.ARGUMENT_ERROR)
if "file" not in request.files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)

file_objs = request.files.getlist('file')
file_objs = request.files.getlist("file")
for file_obj in file_objs:
if file_obj.filename == '':
return get_json_result(
data=False, message='No file selected!', code=settings.RetCode.ARGUMENT_ERROR)
if file_obj.filename == "":
return get_json_result(data=False, message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR)

doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)

return get_json_result(data=doc_ids)


@manager.route('/parse', methods=['POST']) # noqa: F821
@manager.route("/parse", methods=["POST"]) # noqa: F821
@login_required
def parse():
url = request.json.get("url") if request.json else ""
if url:
if not is_valid_url(url):
return get_json_result(
data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message="The URL format is invalid", code=settings.RetCode.ARGUMENT_ERROR)
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
os.makedirs(download_path, exist_ok=True)
from seleniumwire.webdriver import Chrome, ChromeOptions

options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_experimental_option('prefs', {
'download.default_directory': download_path,
'download.prompt_for_download': False,
'download.directory_upgrade': True,
'safebrowsing.enabled': True
})
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_experimental_option("prefs", {"download.default_directory": download_path, "download.prompt_for_download": False, "download.directory_upgrade": True, "safebrowsing.enabled": True})
driver = Chrome(options=options)
driver.get(url)
res_headers = [r.response.headers for r in driver.requests if r and r.response]
@@ -626,51 +553,41 @@ def parse():

r = re.search(r"filename=\"([^\"]+)\"", str(res_headers))
if not r or not r.group(1):
return get_json_result(
data=False, message="Can't not identify downloaded file", code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message="Can't not identify downloaded file", code=settings.RetCode.ARGUMENT_ERROR)
f = File(r.group(1), os.path.join(download_path, r.group(1)))
txt = FileService.parse_docs([f], current_user.id)
return get_json_result(data=txt)

if 'file' not in request.files:
return get_json_result(
data=False, message='No file part!', code=settings.RetCode.ARGUMENT_ERROR)
if "file" not in request.files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)

file_objs = request.files.getlist('file')
file_objs = request.files.getlist("file")
txt = FileService.parse_docs(file_objs, current_user.id)

return get_json_result(data=txt)


@manager.route('/set_meta', methods=['POST']) # noqa: F821
@manager.route("/set_meta", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "meta")
def set_meta():
req = request.json
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
meta = json.loads(req["meta"])
except Exception as e:
return get_json_result(
data=False, message=f'Json syntax error: {e}', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message=f"Json syntax error: {e}", code=settings.RetCode.ARGUMENT_ERROR)
if not isinstance(meta, dict):
return get_json_result(
data=False, message='Meta data should be in Json map format, like {"key": "value"}', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Meta data should be in Json map format, like {"key": "value"}', code=settings.RetCode.ARGUMENT_ERROR)

try:
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")

if not DocumentService.update_by_id(
req["doc_id"], {"meta_fields": meta}):
return get_data_error_result(
message="Database error (meta updates)!")
if not DocumentService.update_by_id(req["doc_id"], {"meta_fields": meta}):
return get_data_error_result(message="Database error (meta updates)!")

return get_json_result(data=True)
except Exception as e:
@@ -14,22 +14,21 @@
# limitations under the License.
#
import logging
import re
import os
import re
from concurrent.futures import ThreadPoolExecutor

from flask_login import current_user
from peewee import fn

from api.db import FileType, KNOWLEDGEBASE_FOLDER_NAME, FileSource, ParserType
from api.db.db_models import DB, File2Document, Knowledgebase
from api.db.db_models import File, Document
from api.db import KNOWLEDGEBASE_FOLDER_NAME, FileSource, FileType, ParserType
from api.db.db_models import DB, Document, File, File2Document, Knowledgebase
from api.db.services import duplicate_name
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.utils import get_uuid
from api.utils.file_utils import filename_type, thumbnail_img
from api.utils.file_utils import filename_type, read_potential_broken_pdf, thumbnail_img
from rag.utils.storage_factory import STORAGE_IMPL


@@ -39,8 +38,7 @@ class FileService(CommonService):

@classmethod
@DB.connection_context()
def get_by_pf_id(cls, tenant_id, pf_id, page_number, items_per_page,
orderby, desc, keywords):
def get_by_pf_id(cls, tenant_id, pf_id, page_number, items_per_page, orderby, desc, keywords):
# Get files by parent folder ID with pagination and filtering
# Args:
# tenant_id: ID of the tenant
@@ -53,17 +51,9 @@ class FileService(CommonService):
# Returns:
# Tuple of (file_list, total_count)
if keywords:
files = cls.model.select().where(
(cls.model.tenant_id == tenant_id),
(cls.model.parent_id == pf_id),
(fn.LOWER(cls.model.name).contains(keywords.lower())),
~(cls.model.id == pf_id)
)
files = cls.model.select().where((cls.model.tenant_id == tenant_id), (cls.model.parent_id == pf_id), (fn.LOWER(cls.model.name).contains(keywords.lower())), ~(cls.model.id == pf_id))
else:
files = cls.model.select().where((cls.model.tenant_id == tenant_id),
(cls.model.parent_id == pf_id),
~(cls.model.id == pf_id)
)
files = cls.model.select().where((cls.model.tenant_id == tenant_id), (cls.model.parent_id == pf_id), ~(cls.model.id == pf_id))
count = files.count()
if desc:
files = files.order_by(cls.model.getter_by(orderby).desc())
@@ -76,16 +66,20 @@ class FileService(CommonService):
for file in res_files:
if file["type"] == FileType.FOLDER.value:
file["size"] = cls.get_folder_size(file["id"])
file['kbs_info'] = []
children = list(cls.model.select().where(
file["kbs_info"] = []
children = list(
cls.model.select()
.where(
(cls.model.tenant_id == tenant_id),
(cls.model.parent_id == file["id"]),
~(cls.model.id == file["id"]),
).dicts())
)
.dicts()
)
file["has_child_folder"] = any(value["type"] == FileType.FOLDER.value for value in children)
continue
kbs_info = cls.get_kb_id_by_file_id(file['id'])
file['kbs_info'] = kbs_info
kbs_info = cls.get_kb_id_by_file_id(file["id"])
file["kbs_info"] = kbs_info

return res_files, count

@@ -97,16 +91,18 @@ class FileService(CommonService):
# file_id: File ID
# Returns:
# List of dictionaries containing knowledge base IDs and names
kbs = (cls.model.select(*[Knowledgebase.id, Knowledgebase.name])
kbs = (
cls.model.select(*[Knowledgebase.id, Knowledgebase.name])
.join(File2Document, on=(File2Document.file_id == file_id))
.join(Document, on=(File2Document.document_id == Document.id))
.join(Knowledgebase, on=(Knowledgebase.id == Document.kb_id))
.where(cls.model.id == file_id))
.where(cls.model.id == file_id)
)
if not kbs:
return []
kbs_info_list = []
for kb in list(kbs.dicts()):
kbs_info_list.append({"kb_id": kb['id'], "kb_name": kb['name']})
kbs_info_list.append({"kb_id": kb["id"], "kb_name": kb["name"]})
return kbs_info_list

@classmethod
@@ -178,16 +174,9 @@ class FileService(CommonService):
if count > len(name) - 2:
return file
else:
file = cls.insert({
"id": get_uuid(),
"parent_id": parent_id,
"tenant_id": current_user.id,
"created_by": current_user.id,
"name": name[count],
"location": "",
"size": 0,
"type": FileType.FOLDER.value
})
file = cls.insert(
{"id": get_uuid(), "parent_id": parent_id, "tenant_id": current_user.id, "created_by": current_user.id, "name": name[count], "location": "", "size": 0, "type": FileType.FOLDER.value}
)
return cls.create_folder(file, file.id, name, count + 1)

@classmethod
@@ -212,9 +201,7 @@ class FileService(CommonService):
# tenant_id: Tenant ID
# Returns:
# Root folder dictionary
for file in cls.model.select().where((cls.model.tenant_id == tenant_id),
(cls.model.parent_id == cls.model.id)
):
for file in cls.model.select().where((cls.model.tenant_id == tenant_id), (cls.model.parent_id == cls.model.id)):
return file.to_dict()

file_id = get_uuid()
@@ -239,11 +226,8 @@ class FileService(CommonService):
# tenant_id: Tenant ID
# Returns:
# Knowledge base folder dictionary
for root in cls.model.select().where(
(cls.model.tenant_id == tenant_id), (cls.model.parent_id == cls.model.id)):
for folder in cls.model.select().where(
(cls.model.tenant_id == tenant_id), (cls.model.parent_id == root.id),
(cls.model.name == KNOWLEDGEBASE_FOLDER_NAME)):
for root in cls.model.select().where((cls.model.tenant_id == tenant_id), (cls.model.parent_id == cls.model.id)):
for folder in cls.model.select().where((cls.model.tenant_id == tenant_id), (cls.model.parent_id == root.id), (cls.model.name == KNOWLEDGEBASE_FOLDER_NAME)):
return folder.to_dict()
assert False, "Can't find the KB folder. Database init error."

@@ -271,7 +255,7 @@ class FileService(CommonService):
"type": ty,
"size": size,
"location": location,
"source_type": FileSource.KNOWLEDGEBASE
"source_type": FileSource.KNOWLEDGEBASE,
}
cls.save(**file)
return file
@@ -283,12 +267,11 @@ class FileService(CommonService):
# Args:
# root_id: Root folder ID
# tenant_id: Tenant ID
for _ in cls.model.select().where((cls.model.name == KNOWLEDGEBASE_FOLDER_NAME)\
& (cls.model.parent_id == root_id)):
for _ in cls.model.select().where((cls.model.name == KNOWLEDGEBASE_FOLDER_NAME) & (cls.model.parent_id == root_id)):
return
folder = cls.new_a_file_from_kb(tenant_id, KNOWLEDGEBASE_FOLDER_NAME, root_id)

for kb in Knowledgebase.select(*[Knowledgebase.id, Knowledgebase.name]).where(Knowledgebase.tenant_id==tenant_id):
for kb in Knowledgebase.select(*[Knowledgebase.id, Knowledgebase.name]).where(Knowledgebase.tenant_id == tenant_id):
kb_folder = cls.new_a_file_from_kb(tenant_id, kb.name, folder["id"])
for doc in DocumentService.query(kb_id=kb.id):
FileService.add_file_from_kb(doc.to_dict(), kb_folder["id"], tenant_id)
@@ -357,12 +340,10 @@ class FileService(CommonService):
@DB.connection_context()
def delete_folder_by_pf_id(cls, user_id, folder_id):
try:
files = cls.model.select().where((cls.model.tenant_id == user_id)
& (cls.model.parent_id == folder_id))
files = cls.model.select().where((cls.model.tenant_id == user_id) & (cls.model.parent_id == folder_id))
for file in files:
cls.delete_folder_by_pf_id(user_id, file.id)
return cls.model.delete().where((cls.model.tenant_id == user_id)
& (cls.model.id == folder_id)).execute(),
return (cls.model.delete().where((cls.model.tenant_id == user_id) & (cls.model.id == folder_id)).execute(),)
except Exception:
logging.exception("delete_folder_by_pf_id")
raise RuntimeError("Database error (File retrieval)!")
@@ -380,8 +361,7 @@ class FileService(CommonService):

def dfs(parent_id):
nonlocal size
for f in cls.model.select(*[cls.model.id, cls.model.size, cls.model.type]).where(
cls.model.parent_id == parent_id, cls.model.id != parent_id):
for f in cls.model.select(*[cls.model.id, cls.model.size, cls.model.type]).where(cls.model.parent_id == parent_id, cls.model.id != parent_id):
size += f.size
if f.type == FileType.FOLDER.value:
dfs(f.id)
@@ -403,7 +383,7 @@ class FileService(CommonService):
"type": doc["type"],
"size": doc["size"],
"location": doc["location"],
"source_type": FileSource.KNOWLEDGEBASE
"source_type": FileSource.KNOWLEDGEBASE,
}
cls.save(**file)
File2DocumentService.save(**{"id": get_uuid(), "file_id": file["id"], "document_id": doc["id"]})
@@ -412,7 +392,7 @@ class FileService(CommonService):
@DB.connection_context()
def move_file(cls, file_ids, folder_id):
try:
cls.filter_update((cls.model.id << file_ids, ), { 'parent_id': folder_id })
cls.filter_update((cls.model.id << file_ids,), {"parent_id": folder_id})
except Exception:
logging.exception("move_file")
raise RuntimeError("Database error (File move)!")
@@ -429,16 +409,13 @@ class FileService(CommonService):
err, files = [], []
for file in file_objs:
try:
MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
MAX_FILE_NUM_PER_USER = int(os.environ.get("MAX_FILE_NUM_PER_USER", 0))
if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(kb.tenant_id) >= MAX_FILE_NUM_PER_USER:
raise RuntimeError("Exceed the maximum file number of a free user!")
if len(file.filename.encode("utf-8")) >= 128:
raise RuntimeError("Exceed the maximum length of file name!")

filename = duplicate_name(
DocumentService.query,
name=file.filename,
kb_id=kb.id)
filename = duplicate_name(DocumentService.query, name=file.filename, kb_id=kb.id)
filetype = filename_type(filename)
if filetype == FileType.OTHER.value:
raise RuntimeError("This type of file has not been supported yet!")
@@ -446,15 +423,18 @@ class FileService(CommonService):
location = filename
while STORAGE_IMPL.obj_exist(kb.id, location):
location += "_"

blob = file.read()
if filetype == FileType.PDF.value:
blob = read_potential_broken_pdf(blob)
STORAGE_IMPL.put(kb.id, location, blob)

doc_id = get_uuid()

img = thumbnail_img(filename, blob)
thumbnail_location = ''
thumbnail_location = ""
if img is not None:
thumbnail_location = f'thumbnail_{doc_id}.png'
thumbnail_location = f"thumbnail_{doc_id}.png"
STORAGE_IMPL.put(kb.id, thumbnail_location, img)

doc = {
@@ -467,7 +447,7 @@ class FileService(CommonService):
"name": filename,
"location": location,
"size": len(blob),
"thumbnail": thumbnail_location
"thumbnail": thumbnail_location,
}
DocumentService.insert(doc)

@@ -480,29 +460,17 @@ class FileService(CommonService):

@staticmethod
def parse_docs(file_objs, user_id):
from rag.app import presentation, picture, naive, audio, email
from rag.app import audio, email, naive, picture, presentation

def dummy(prog=None, msg=""):
pass

FACTORY = {
ParserType.PRESENTATION.value: presentation,
ParserType.PICTURE.value: picture,
ParserType.AUDIO.value: audio,
ParserType.EMAIL.value: email
}
FACTORY = {ParserType.PRESENTATION.value: presentation, ParserType.PICTURE.value: picture, ParserType.AUDIO.value: audio, ParserType.EMAIL.value: email}
parser_config = {"chunk_token_num": 16096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text"}
exe = ThreadPoolExecutor(max_workers=12)
threads = []
for file in file_objs:
kwargs = {
"lang": "English",
"callback": dummy,
"parser_config": parser_config,
"from_page": 0,
"to_page": 100000,
"tenant_id": user_id
}
kwargs = {"lang": "English", "callback": dummy, "parser_config": parser_config, "from_page": 0, "to_page": 100000, "tenant_id": user_id}
filetype = filename_type(file.filename)
blob = file.read()
threads.append(exe.submit(FACTORY.get(FileService.get_parser(filetype, file.filename, ""), naive).chunk, file.filename, blob, **kwargs))
@@ -524,3 +492,4 @@ class FileService(CommonService):
if re.search(r"\.(eml)$", filename):
return ParserType.EMAIL.value
return default
@@ -17,17 +17,20 @@ import base64
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
from io import BytesIO

import pdfplumber
from PIL import Image
from cachetools import LRUCache, cached
from PIL import Image
from ruamel.yaml import YAML

from api.db import FileType
from api.constants import IMG_BASE64_PREFIX
from api.db import FileType

PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE")
RAG_BASE = os.getenv("RAG_BASE")
@@ -74,7 +77,7 @@ def get_rag_python_directory(*args):


def get_home_cache_dir():
dir = os.path.join(os.path.expanduser('~'), ".ragflow")
dir = os.path.join(os.path.expanduser("~"), ".ragflow")
try:
os.mkdir(dir)
except OSError:
@@ -92,9 +95,7 @@ def load_json_conf(conf_path):
with open(json_conf_path) as f:
return json.load(f)
except BaseException:
raise EnvironmentError(
"loading json file config from '{}' failed!".format(json_conf_path)
)
raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))


def dump_json_conf(config_data, conf_path):
@@ -106,9 +107,7 @@ def dump_json_conf(config_data, conf_path):
with open(json_conf_path, "w") as f:
json.dump(config_data, f, indent=4)
except BaseException:
raise EnvironmentError(
"loading json file config from '{}' failed!".format(json_conf_path)
)
raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))


def load_json_conf_real_time(conf_path):
@@ -120,9 +119,7 @@ def load_json_conf_real_time(conf_path):
with open(json_conf_path) as f:
return json.load(f)
except BaseException:
raise EnvironmentError(
"loading json file config from '{}' failed!".format(json_conf_path)
)
raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))


def load_yaml_conf(conf_path):
@@ -130,12 +127,10 @@ def load_yaml_conf(conf_path):
conf_path = os.path.join(get_project_base_directory(), conf_path)
try:
with open(conf_path) as f:
yaml = YAML(typ='safe', pure=True)
yaml = YAML(typ="safe", pure=True)
return yaml.load(f)
except Exception as e:
raise EnvironmentError(
"loading yaml file config from {} failed:".format(conf_path), e
)
raise EnvironmentError("loading yaml file config from {} failed:".format(conf_path), e)


def rewrite_yaml_conf(conf_path, config):
@@ -146,13 +141,11 @@ def rewrite_yaml_conf(conf_path, config):
yaml = YAML(typ="safe")
yaml.dump(config, f)
except Exception as e:
raise EnvironmentError(
"rewrite yaml file config {} failed:".format(conf_path), e
)
raise EnvironmentError("rewrite yaml file config {} failed:".format(conf_path), e)


def rewrite_json_file(filepath, json_data):
with open(filepath, "w", encoding='utf-8') as f:
with open(filepath, "w", encoding="utf-8") as f:
json.dump(json_data, f, indent=4, separators=(",", ": "))
f.close()

@@ -162,12 +155,10 @@ def filename_type(filename):
if re.match(r".*\.pdf$", filename):
return FileType.PDF.value

if re.match(
r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
if re.match(r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
return FileType.DOC.value

if re.match(
r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename):
if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename):
return FileType.AURAL.value

if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
@@ -175,6 +166,7 @@ def filename_type(filename):

return FileType.OTHER.value


def thumbnail_img(filename, blob):
"""
MySQL LongText max length is 65535
@@ -183,6 +175,7 @@ def thumbnail_img(filename, blob):
if re.match(r".*\.pdf$", filename):
with sys.modules[LOCK_KEY_pdfplumber]:
pdf = pdfplumber.open(BytesIO(blob))

buffered = BytesIO()
resolution = 32
img = None
@@ -206,8 +199,9 @@ def thumbnail_img(filename, blob):
return buffered.getvalue()

elif re.match(r".*\.(ppt|pptx)$", filename):
import aspose.slides as slides
import aspose.pydrawing as drawing
import aspose.slides as slides

try:
with slides.Presentation(BytesIO(blob)) as presentation:
buffered = BytesIO()
@@ -215,8 +209,7 @@ def thumbnail_img(filename, blob):
img = None
for _ in range(10):
# https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float
presentation.slides[0].get_thumbnail(scale, scale).save(
buffered, drawing.imaging.ImageFormat.png)
presentation.slides[0].get_thumbnail(scale, scale).save(buffered, drawing.imaging.ImageFormat.png)
img = buffered.getvalue()
if len(img) >= 64000:
scale = scale / 2.0
@@ -232,10 +225,9 @@ def thumbnail_img(filename, blob):
def thumbnail(filename, blob):
img = thumbnail_img(filename, blob)
if img is not None:
return IMG_BASE64_PREFIX + \
base64.b64encode(img).decode("utf-8")
return IMG_BASE64_PREFIX + base64.b64encode(img).decode("utf-8")
else:
return ''
return ""


def traversal_files(base):
@@ -243,3 +235,52 @@ def traversal_files(base):
for f in fs:
fullname = os.path.join(root, f)
yield fullname


def repair_pdf_with_ghostscript(input_bytes):
if shutil.which("gs") is None:
return input_bytes

with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_in, tempfile.NamedTemporaryFile(suffix=".pdf") as temp_out:
temp_in.write(input_bytes)
temp_in.flush()

cmd = [
"gs",
"-o",
temp_out.name,
"-sDEVICE=pdfwrite",
"-dPDFSETTINGS=/prepress",
temp_in.name,
]
try:
proc = subprocess.run(cmd, capture_output=True, text=True)
if proc.returncode != 0:
return input_bytes
except Exception:
return input_bytes

temp_out.seek(0)
repaired_bytes = temp_out.read()

return repaired_bytes


def read_potential_broken_pdf(blob):
def try_open(blob):
try:
with pdfplumber.open(BytesIO(blob)) as pdf:
if pdf.pages:
return True
except Exception:
return False
return False

if try_open(blob):
return blob

repaired = repair_pdf_with_ghostscript(blob)
if try_open(repaired):
return repaired

return blob