ragflow/python/svr/add_thumbnail2file.py

import sys, datetime, random, re, cv2
from os.path import dirname, realpath
sys.path.append(dirname(realpath(__file__)) + "/../")
from util.db_conn import Postgres
from util.minio_conn import HuMinio
from util import findMaxDt
import base64
from io import BytesIO
import pandas as pd
from PIL import Image
import pdfplumber


PG = Postgres("infiniflow", "docgpt")
MINIO = HuMinio("infiniflow")
def set_thumbnail(did, base64):
    sql = f"""
    update doc_info set thumbnail_base64='{base64}'
    where
    did={did}
    """
    PG.update(sql)


def collect(comm, mod, tm):
    sql = f"""
    select
    did, uid, doc_name, location, updated_at
    from doc_info
    where
    updated_at >= '{tm}'
    and MOD(did, {comm}) = {mod}
    and is_deleted=false
    and type <> 'folder'
    and thumbnail_base64=''
    order by updated_at asc
    limit 10
    """
    docs = PG.select(sql)
    if len(docs) == 0:return pd.DataFrame()

    mtm = str(docs["updated_at"].max())[:19]
    print("TOTAL:", len(docs), "To: ", mtm)
    return docs


def build(row):
    if not re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|apng|icon|ico|webp|mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$",
              row["doc_name"].lower().strip()):
        set_thumbnail(row["did"], "_")
        return

    def thumbnail(img, SIZE=128):
        w,h = img.size
        p = SIZE/max(w, h)
        w, h = int(w*p), int(h*p)
        img.thumbnail((w, h))
        buffered = BytesIO()
        try:
            img.save(buffered, format="JPEG")
        except Exception as e:
            try:
                img.save(buffered, format="PNG")
            except Exception as ee:
                pass
        return base64.b64encode(buffered.getvalue()).decode("utf-8")


    iobytes = BytesIO(MINIO.get("%s-upload"%str(row["uid"]), row["location"]))
    if re.search(r"\.pdf$", row["doc_name"].lower().strip()):
        pdf = pdfplumber.open(iobytes)
        img = pdf.pages[0].to_image().annotated
        set_thumbnail(row["did"], thumbnail(img))

    if re.search(r"\.(jpg|jpeg|png|gif|svg|apng|webp|icon|ico)$", row["doc_name"].lower().strip()):
        img = Image.open(iobytes)
        set_thumbnail(row["did"], thumbnail(img))

    if re.search(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$", row["doc_name"].lower().strip()):
        url  = MINIO.get_presigned_url("%s-upload"%str(row["uid"]),
                                       row["location"],
                                       expires=datetime.timedelta(seconds=60)
                                      )
        cap = cv2.VideoCapture(url)
        succ = cap.isOpened()
        i = random.randint(1, 11)
        while succ:
            ret, frame = cap.read()
            if not ret: break
            if i > 0:
                i -= 1
                continue
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            print(img.size)
            set_thumbnail(row["did"], thumbnail(img))
        cap.release()
        cv2.destroyAllWindows()


def main(comm, mod):
    global model
    tm_fnm = f"res/thumbnail-{comm}-{mod}.tm"
    tm = findMaxDt(tm_fnm)
    rows = collect(comm, mod, tm)
    if len(rows) == 0:return

    tmf = open(tm_fnm, "a+")
    for _, r in rows.iterrows():
        build(r)
        tmf.write(str(r["updated_at"]) + "\n")
    tmf.close()


if __name__ == "__main__":
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    main(comm.Get_size(), comm.Get_rank())