mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-07-31 15:12:01 +08:00
Finish adding thumbnails to video, image, and PDF files (#18)
This commit is contained in:
parent
3fc700a1d4
commit
cdd956568d
@ -32,4 +32,4 @@ regex = "1.10.2"
|
|||||||
name = "doc_gpt"
|
name = "doc_gpt"
|
||||||
|
|
||||||
[workspace]
|
[workspace]
|
||||||
members = [".", "migration"]
|
members = [".", "migration"]
|
||||||
|
@ -201,7 +201,7 @@ impl MigrationTrait for Migration {
|
|||||||
.col(ColumnDef::new(DocInfo::Location).string().not_null())
|
.col(ColumnDef::new(DocInfo::Location).string().not_null())
|
||||||
.col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
|
.col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
|
||||||
.col(ColumnDef::new(DocInfo::Type).string().not_null())
|
.col(ColumnDef::new(DocInfo::Type).string().not_null())
|
||||||
.col(ColumnDef::new(DocInfo::ThumbnailBase64).string().not_null())
|
.col(ColumnDef::new(DocInfo::ThumbnailBase64).string().default(""))
|
||||||
.comment("doc type|folder")
|
.comment("doc type|folder")
|
||||||
.col(
|
.col(
|
||||||
ColumnDef::new(DocInfo::CreatedAt)
|
ColumnDef::new(DocInfo::CreatedAt)
|
||||||
@ -274,28 +274,28 @@ impl MigrationTrait for Migration {
|
|||||||
.values_panic([
|
.values_panic([
|
||||||
(1).into(),
|
(1).into(),
|
||||||
"Video".into(),
|
"Video".into(),
|
||||||
".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)".into(),
|
".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)".into(),
|
||||||
(1).into(),
|
(1).into(),
|
||||||
(1).into(),
|
(1).into(),
|
||||||
])
|
])
|
||||||
.values_panic([
|
.values_panic([
|
||||||
(1).into(),
|
(1).into(),
|
||||||
"Picture".into(),
|
"Picture".into(),
|
||||||
".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)".into(),
|
".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)".into(),
|
||||||
(2).into(),
|
(2).into(),
|
||||||
(2).into(),
|
(2).into(),
|
||||||
])
|
])
|
||||||
.values_panic([
|
.values_panic([
|
||||||
(1).into(),
|
(1).into(),
|
||||||
"Music".into(),
|
"Music".into(),
|
||||||
".*\\.(WAV|FLAC|APE|ALAC|WavPack|WV|MP3|AAC|Ogg|Vorbis|Opus)".into(),
|
".*\\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)".into(),
|
||||||
(3).into(),
|
(3).into(),
|
||||||
(3).into(),
|
(3).into(),
|
||||||
])
|
])
|
||||||
.values_panic([
|
.values_panic([
|
||||||
(1).into(),
|
(1).into(),
|
||||||
"Document".into(),
|
"Document".into(),
|
||||||
".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)".into(),
|
".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)".into(),
|
||||||
(3).into(),
|
(3).into(),
|
||||||
(3).into(),
|
(3).into(),
|
||||||
])
|
])
|
||||||
|
118
python/svr/add_thumbnail2file.py
Normal file
118
python/svr/add_thumbnail2file.py
Normal file
@ -0,0 +1,118 @@
|
|||||||
|
import sys, datetime, random, re, cv2
|
||||||
|
from os.path import dirname, realpath
|
||||||
|
sys.path.append(dirname(realpath(__file__)) + "/../")
|
||||||
|
from util.db_conn import Postgres
|
||||||
|
from util.minio_conn import HuMinio
|
||||||
|
from util import findMaxDt
|
||||||
|
import base64
|
||||||
|
from io import BytesIO
|
||||||
|
import pandas as pd
|
||||||
|
from PIL import Image
|
||||||
|
import pdfplumber
|
||||||
|
|
||||||
|
|
||||||
|
PG = Postgres("infiniflow", "docgpt")
|
||||||
|
MINIO = HuMinio("infiniflow")
|
||||||
|
def set_thumbnail(did, base64):
    """Write a thumbnail string onto one ``doc_info`` row.

    Args:
        did: Document id of the row to update. It is coerced with ``int()``
            so that only a number can reach the SQL text.
        base64: Value stored in ``thumbnail_base64``. Note that this
            parameter name shadows the ``base64`` module inside this function.

    Raises:
        TypeError, ValueError: If ``did`` cannot be converted to ``int``.
    """
    # PG.update is only ever seen being handed a finished SQL string, so the
    # statement is still assembled here; both values are neutralised first so
    # a stray quote cannot terminate the literal or alter the WHERE clause.
    doc_id = int(did)
    payload = str(base64).replace("'", "''")
    sql = f"""
    update doc_info set thumbnail_base64='{payload}'
    where
    did={doc_id}
    """
    PG.update(sql)
|
||||||
|
|
||||||
|
|
||||||
|
def collect(comm, mod, tm):
    """Select the next batch of documents that still have no thumbnail.

    Args:
        comm: Divisor used in ``MOD(did, comm)``; the caller passes the number
            of workers.
        mod: Remainder this worker is responsible for.
        tm: Lower bound (inclusive) for ``updated_at``.

    Returns:
        Whatever ``PG.select`` produced (at most 10 rows, oldest first), or an
        empty ``pandas.DataFrame`` when nothing matched.
        NOTE(review): the result is indexed by column name and iterated with
        ``iterrows`` by the caller, so it is presumably a DataFrame - confirm.
    """
    query = f"""
    select
    did, uid, doc_name, location, updated_at
    from doc_info
    where
    updated_at >= '{tm}'
    and MOD(did, {comm}) = {mod}
    and is_deleted=false
    and type <> 'folder'
    and thumbnail_base64=''
    order by updated_at asc
    limit 10
    """
    pending = PG.select(query)
    if len(pending) == 0:
        return pd.DataFrame()

    # Trim the newest timestamp to its first 19 characters for the progress line.
    newest = str(pending["updated_at"].max())[:19]
    print("TOTAL:", len(pending), "To: ", newest)
    return pending
|
||||||
|
|
||||||
|
|
||||||
|
def build(row):
    """Create and store a thumbnail for one document row.

    The file extension of ``row["doc_name"]`` decides what happens:

    * unsupported extension: the marker ``"_"`` is stored instead of a thumbnail;
    * PDF: the first page is rendered;
    * image: the image itself is scaled down;
    * video: one early frame is grabbed through a short-lived presigned URL.

    Args:
        row: Mapping with ``did``, ``uid``, ``doc_name`` and ``location``.

    Returns:
        None. If the object cannot be downloaded or the URL cannot be signed,
        a message is printed and nothing is stored.
    """
    name = row["doc_name"].lower().strip()
    if not re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|apng|icon|ico|webp|mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$",
                     name):
        set_thumbnail(row["did"], "_")
        return

    def thumbnail(img, SIZE=128):
        """Scale ``img`` so its longer side is at most SIZE; return it base64-encoded."""
        w, h = img.size
        p = SIZE / max(w, h)
        # Never request a zero-pixel side for extremely elongated images.
        img.thumbnail((max(1, int(w * p)), max(1, int(h * p))))
        # JPEG first; fall back to PNG (e.g. for modes JPEG cannot store).
        # A fresh buffer per attempt keeps a half-written JPEG out of the result.
        for fmt in ("JPEG", "PNG"):
            buffered = BytesIO()
            try:
                img.save(buffered, format=fmt)
            except Exception:
                continue
            return base64.b64encode(buffered.getvalue()).decode("utf-8")
        # Both encoders failed: best effort, same empty result as before.
        return ""

    bucket = "%s-upload" % str(row["uid"])

    def fetch():
        """Download the object; returns None (after printing) when MINIO.get gave nothing."""
        raw = MINIO.get(bucket, row["location"])
        if raw is None:
            print("Fail to fetch", bucket, row["location"])
            return None
        return BytesIO(raw)

    if re.search(r"\.pdf$", name):
        iobytes = fetch()
        if iobytes is None:
            return
        # Context manager closes the PDF handle even if rendering fails.
        with pdfplumber.open(iobytes) as pdf:
            encoded = thumbnail(pdf.pages[0].to_image().annotated)
        set_thumbnail(row["did"], encoded)
        return

    if re.search(r"\.(jpg|jpeg|png|gif|svg|apng|webp|icon|ico)$", name):
        iobytes = fetch()
        if iobytes is None:
            return
        set_thumbnail(row["did"], thumbnail(Image.open(iobytes)))
        return

    if re.search(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$", name):
        # Videos are read through a presigned URL, so the object itself is not
        # downloaded into memory here.
        url = MINIO.get_presigned_url(bucket,
                                      row["location"],
                                      expires=datetime.timedelta(seconds=60)
                                      )
        if url is None:
            print("Fail to presign", bucket, row["location"])
            return
        cap = cv2.VideoCapture(url)
        try:
            if not cap.isOpened():
                return
            # Skip a random number (1-11) of leading frames and use the next
            # one; for clips shorter than that, use the last frame available.
            frame = None
            for _ in range(random.randint(1, 11) + 1):
                ret, grabbed = cap.read()
                if not ret:
                    break
                frame = grabbed
            if frame is None:
                return
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            print(img.size)
            # Exactly one thumbnail is stored per video.
            set_thumbnail(row["did"], thumbnail(img))
        finally:
            cap.release()
            cv2.destroyAllWindows()
|
||||||
|
|
||||||
|
|
||||||
|
def main(comm, mod):
    """Process one batch of documents for this worker and record progress.

    Args:
        comm: Total number of workers (used to shard documents by ``did``).
        mod: Index of this worker among ``comm`` workers.

    The file ``res/thumbnail-{comm}-{mod}.tm`` receives one ``updated_at``
    line per processed row; its content is handed to ``findMaxDt`` to obtain
    the lower time bound for the next query.
    NOTE(review): ``findMaxDt`` is defined elsewhere - presumably it returns
    the latest timestamp in that file; confirm.
    """
    tm_fnm = f"res/thumbnail-{comm}-{mod}.tm"
    tm = findMaxDt(tm_fnm)
    rows = collect(comm, mod, tm)
    if len(rows) == 0:
        return

    # `with` closes the checkpoint file even when build() raises, so the
    # timestamps already written for finished rows are not lost.
    with open(tm_fnm, "a+") as tmf:
        for _, r in rows.iterrows():
            build(r)
            tmf.write(str(r["updated_at"]) + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Launched under MPI: the world size and this process's rank are passed
    # to main() as the shard count and shard index.
    from mpi4py import MPI

    world = MPI.COMM_WORLD
    main(world.Get_size(), world.Get_rank())
|
||||||
|
|
@ -54,11 +54,24 @@ class HuMinio(object):
|
|||||||
r = self.conn.get_object(bucket, fnm)
|
r = self.conn.get_object(bucket, fnm)
|
||||||
return r.read()
|
return r.read()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Fail get {bucket}/{fnm}: "+str(e))
|
logging.error(f"fail get {bucket}/{fnm}: "+str(e))
|
||||||
self.__open__()
|
self.__open__()
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
def get_presigned_url(self, bucket, fnm, expires):
    """Ask the connection for a presigned GET URL, retrying on failure.

    Args:
        bucket: Bucket name.
        fnm: Object name inside the bucket.
        expires: Validity period, forwarded unchanged to the client.

    Returns:
        The URL from ``self.conn.get_presigned_url``, or ``None`` once all
        10 attempts have failed.
    """
    attempts_left = 10
    while attempts_left > 0:
        attempts_left -= 1
        try:
            return self.conn.get_presigned_url("GET", bucket, fnm, expires)
        except Exception as err:
            logging.error(f"fail get {bucket}/{fnm}: " + str(err))
            # Re-open the connection and pause before the next attempt.
            self.__open__()
            time.sleep(1)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
conn = HuMinio("infiniflow")
|
conn = HuMinio("infiniflow")
|
||||||
fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
|
fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::{HashMap};
|
||||||
use std::io::BufReader;
|
use std::io::BufReader;
|
||||||
use actix_multipart_extract::{ File, Multipart, MultipartForm };
|
use actix_multipart_extract::{ File, Multipart, MultipartForm };
|
||||||
|
use actix_web::web::Bytes;
|
||||||
use actix_web::{ HttpResponse, post, web };
|
use actix_web::{ HttpResponse, post, web };
|
||||||
use chrono::{ Utc, FixedOffset };
|
use chrono::{ Utc, FixedOffset };
|
||||||
use minio::s3::args::{ BucketExistsArgs, MakeBucketArgs, PutObjectArgs };
|
use minio::s3::args::{ BucketExistsArgs, MakeBucketArgs, PutObjectArgs };
|
||||||
@ -68,7 +69,7 @@ pub struct UploadForm {
|
|||||||
fn file_type(filename: &String) -> String {
|
fn file_type(filename: &String) -> String {
|
||||||
let fnm = filename.to_lowercase();
|
let fnm = filename.to_lowercase();
|
||||||
if
|
if
|
||||||
let Some(_) = Regex::new(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)$")
|
let Some(_) = Regex::new(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$")
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.captures(&fnm)
|
.captures(&fnm)
|
||||||
{
|
{
|
||||||
@ -76,7 +77,7 @@ fn file_type(filename: &String) -> String {
|
|||||||
}
|
}
|
||||||
if
|
if
|
||||||
let Some(_) = Regex::new(
|
let Some(_) = Regex::new(
|
||||||
r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)$"
|
r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)$"
|
||||||
)
|
)
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.captures(&fnm)
|
.captures(&fnm)
|
||||||
@ -84,14 +85,14 @@ fn file_type(filename: &String) -> String {
|
|||||||
return "Picture".to_owned();
|
return "Picture".to_owned();
|
||||||
}
|
}
|
||||||
if
|
if
|
||||||
let Some(_) = Regex::new(r"\.(WAV|FLAC|APE|ALAC|WavPack|WV|MP3|AAC|Ogg|Vorbis|Opus)$")
|
let Some(_) = Regex::new(r"\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$")
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.captures(&fnm)
|
.captures(&fnm)
|
||||||
{
|
{
|
||||||
return "Music".to_owned();
|
return "Music".to_owned();
|
||||||
}
|
}
|
||||||
if
|
if
|
||||||
let Some(_) = Regex::new(r"\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)$")
|
let Some(_) = Regex::new(r"\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)$")
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.captures(&fnm)
|
.captures(&fnm)
|
||||||
{
|
{
|
||||||
@ -100,6 +101,7 @@ fn file_type(filename: &String) -> String {
|
|||||||
"Other".to_owned()
|
"Other".to_owned()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#[post("/v1.0/upload")]
|
#[post("/v1.0/upload")]
|
||||||
async fn upload(
|
async fn upload(
|
||||||
payload: Multipart<UploadForm>,
|
payload: Multipart<UploadForm>,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user