build dialog server; add thumbnail to docinfo; (#17)

This commit is contained in:
KevinHuSh 2023-12-26 19:32:06 +08:00 committed by GitHub
parent 3245107dc7
commit 3fc700a1d4
12 changed files with 94 additions and 42 deletions

View File

@ -26,6 +26,7 @@ migration = { path = "./migration" }
minio = "0.1.0"
futures-util = "0.3.29"
actix-multipart-extract = "0.1.5"
regex = "1.10.2"
[[bin]]
name = "doc_gpt"

View File

@ -201,7 +201,8 @@ impl MigrationTrait for Migration {
.col(ColumnDef::new(DocInfo::Location).string().not_null())
.col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
.col(ColumnDef::new(DocInfo::Type).string().not_null())
.comment("doc|folder")
.col(ColumnDef::new(DocInfo::ThumbnailBase64).string().not_null())
.comment("doc type|folder")
.col(
ColumnDef::new(DocInfo::CreatedAt)
.timestamp_with_time_zone()
@ -249,7 +250,6 @@ impl MigrationTrait for Migration {
.to_owned()
).await?;
let tm = now();
let root_insert = Query::insert()
.into_table(UserInfo::Table)
.columns([UserInfo::Email, UserInfo::Nickname, UserInfo::Password])
@ -273,28 +273,28 @@ impl MigrationTrait for Migration {
.columns([TagInfo::Uid, TagInfo::TagName, TagInfo::Regx, TagInfo::Color, TagInfo::Icon])
.values_panic([
(1).into(),
"视频".into(),
"Video".into(),
".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)".into(),
(1).into(),
(1).into(),
])
.values_panic([
(1).into(),
"图片".into(),
".*\\.(png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)".into(),
"Picture".into(),
".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)".into(),
(2).into(),
(2).into(),
])
.values_panic([
(1).into(),
"音乐".into(),
"Music".into(),
".*\\.(WAV|FLAC|APE|ALAC|WavPack|WV|MP3|AAC|Ogg|Vorbis|Opus)".into(),
(3).into(),
(3).into(),
])
.values_panic([
(1).into(),
"文档".into(),
"Document".into(),
".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)".into(),
(3).into(),
(3).into(),
@ -419,6 +419,7 @@ enum DocInfo {
Location,
Size,
Type,
ThumbnailBase64,
CreatedAt,
UpdatedAt,
IsDeleted,

View File

@ -1,10 +1,10 @@
[infiniflow]
es=http://127.0.0.1:9200
es=http://es01:9200
pgdb_usr=root
pgdb_pwd=infiniflow_docgpt
pgdb_host=127.0.0.1
pgdb_port=5455
minio_host=127.0.0.1:9000
pgdb_host=postgres
pgdb_port=5432
minio_host=minio:9000
minio_usr=infiniflow
minio_pwd=infiniflow_docgpt

View File

@ -24,6 +24,7 @@ class QWen(Base):
from http import HTTPStatus
from dashscope import Generation
from dashscope.api_entities.dashscope_response import Role
# export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
response = Generation.call(
Generation.Models.qwen_turbo,
messages=messages,

View File

@ -9,6 +9,8 @@ from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
import numpy as np
from copy import deepcopy
def index_name(uid):return f"docgpt_{uid}"
class Dealer:
def __init__(self, es, emb_mdl):
self.qryr = query.EsQueryer(es)

View File

@ -6,11 +6,10 @@ from tornado.ioloop import IOLoop
from tornado.httpserver import HTTPServer
from tornado.options import define,options
from util import es_conn, setup_logging
from svr import sec_search as search
from svr.rpc_proxy import RPCProxy
from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
from nlp import huqie
from nlp import query as Query
from nlp import search
from llm import HuEmbedding, GptTurbo
import numpy as np
from io import BytesIO
@ -38,7 +37,7 @@ def get_QA_pairs(hists):
def get_instruction(sres, top_i, max_len=8096 fld="content_ltks"):
def get_instruction(sres, top_i, max_len=8096, fld="content_ltks"):
max_len //= len(top_i)
# add instruction to prompt
instructions = [re.sub(r"[\r\n]+", " ", sres.field[sres.ids[i]][fld]) for i in top_i]
@ -98,8 +97,9 @@ class Handler(RequestHandler):
res = SE.search({
"question": question,
"kb_ids": param.get("kb_ids", []),
"size": param.get("topn", 15)
})
"size": param.get("topn", 15)},
search.index_name(param["uid"])
)
sim = SE.rerank(res, question)
rk_idx = np.argsort(sim*-1)
@ -112,12 +112,12 @@ class Handler(RequestHandler):
refer = OrderedDict()
docnms = {}
for i in rk_idx:
did = res.field[res.ids[i]]["doc_id"])
if did not in docnms: docnms[did] = res.field[res.ids[i]]["docnm_kwd"])
did = res.field[res.ids[i]]["doc_id"]
if did not in docnms: docnms[did] = res.field[res.ids[i]]["docnm_kwd"]
if did not in refer: refer[did] = []
refer[did].append({
"chunk_id": res.ids[i],
"content": res.field[res.ids[i]]["content_ltks"]),
"content": res.field[res.ids[i]]["content_ltks"],
"image": ""
})
@ -128,7 +128,7 @@ class Handler(RequestHandler):
"data":{
"uid": param["uid"],
"dialog_id": param["dialog_id"],
"assistant": ans
"assistant": ans,
"refer": [{
"did": did,
"doc_name": docnms[did],
@ -153,7 +153,7 @@ if __name__ == '__main__':
parser.add_argument("--port", default=4455, type=int, help="Port used for service")
ARGS = parser.parse_args()
SE = search.ResearchReportSearch(es_conn.HuEs("infiniflow"), EMBEDDING)
SE = search.Dealer(es_conn.HuEs("infiniflow"), EMBEDDING)
app = Application([(r'/v1/chat/completions', Handler)],debug=False)
http_server = HTTPServer(app)

View File

@ -6,7 +6,7 @@ from util.db_conn import Postgres
from util.minio_conn import HuMinio
from util import rmSpace, findMaxDt
from FlagEmbedding import FlagModel
from nlp import huchunk, huqie
from nlp import huchunk, huqie, search
import base64, hashlib
from io import BytesIO
import pandas as pd
@ -103,7 +103,7 @@ def build(row):
if(!ctx._source.kb_id.contains('%s'))
ctx._source.kb_id.add('%s');
"""%(str(row["kb_id"]), str(row["kb_id"])),
idxnm = index_name(row["uid"])
idxnm = search.index_name(row["uid"])
)
set_progress(row["kb2doc_id"], 1, "Done")
return []
@ -171,10 +171,8 @@ def build(row):
return docs
def index_name(uid):return f"docgpt_{uid}"
def init_kb(row):
idxnm = index_name(row["uid"])
idxnm = search.index_name(row["uid"])
if ES.indexExist(idxnm): return
return ES.createIdx(idxnm, json.load(open("conf/mapping.json", "r")))
@ -199,7 +197,7 @@ def rm_doc_from_kb(df):
ctx._source.kb_id.indexOf('%s')
);
"""%(str(r["kb_id"]),str(r["kb_id"])),
idxnm = index_name(r["uid"])
idxnm = search.index_name(r["uid"])
)
if len(df) == 0:return
sql = """
@ -233,7 +231,7 @@ def main(comm, mod):
set_progress(r["kb2doc_id"], random.randint(70, 95)/100.,
"Finished embedding! Start to build index!")
init_kb(r)
es_r = ES.bulk(cks, index_name(r["uid"]))
es_r = ES.bulk(cks, search.index_name(r["uid"]))
if es_r:
set_progress(r["kb2doc_id"], -1, "Index failure!")
print(es_r)

View File

@ -11,6 +11,7 @@ use crate::entity::doc_info::Model;
use crate::errors::AppError;
use crate::service::doc_info::{ Mutation, Query };
use serde::Deserialize;
use regex::Regex;
fn now() -> chrono::DateTime<FixedOffset> {
Utc::now().with_timezone(&FixedOffset::east_opt(3600 * 8).unwrap())
@ -64,6 +65,41 @@ pub struct UploadForm {
did: i64,
}
/// Classify a file into one of the tag categories ("Video", "Picture",
/// "Music", "Document") by its extension, falling back to "Other".
///
/// The previous regex-based version lowercased the filename but kept
/// uppercase alternatives (e.g. `WAV|FLAC|MP3`, `WMF`) in the patterns;
/// since the `regex` crate is case-sensitive by default, the "Music"
/// category and `.wmf` pictures could never match. Comparing the
/// lowercased extension against fully lowercased lists fixes that, and
/// avoids recompiling four regexes on every call.
fn file_type(filename: &String) -> String {
    // Extension lists mirror the original regex alternatives, lowercased.
    const VIDEO: &[&str] = &[
        "mpg", "mpeg", "avi", "rm", "rmvb", "mov", "wmv", "asf", "dat", "asx", "wvx", "mpe",
        "mpa",
    ];
    const PICTURE: &[&str] = &[
        "jpg", "jpeg", "png", "tif", "gif", "pcx", "tga", "exif", "fpx", "svg", "psd", "cdr",
        "pcd", "dxf", "ufo", "eps", "ai", "raw", "wmf", "webp", "avif", "apng",
    ];
    const MUSIC: &[&str] = &[
        "wav", "flac", "ape", "alac", "wavpack", "wv", "mp3", "aac", "ogg", "vorbis", "opus",
    ];
    const DOCUMENT: &[&str] = &[
        "pdf", "doc", "ppt", "yml", "xml", "htm", "json", "csv", "txt", "ini", "xsl", "wps",
        "rtf", "hlp",
    ];

    // Like the original `\.(ext)$` patterns, a category requires a dot:
    // the extension is everything after the last '.'.
    let fnm = filename.to_lowercase();
    let ext = match fnm.rsplit_once('.') {
        Some((_, ext)) => ext,
        None => return "Other".to_owned(),
    };

    // Preserve the original check order: Video, Picture, Music, Document.
    for (exts, label) in [
        (VIDEO, "Video"),
        (PICTURE, "Picture"),
        (MUSIC, "Music"),
        (DOCUMENT, "Document"),
    ] {
        if exts.contains(&ext) {
            return label.to_owned();
        }
    }
    "Other".to_owned()
}
#[post("/v1.0/upload")]
async fn upload(
payload: Multipart<UploadForm>,
@ -114,7 +150,13 @@ async fn upload(
print!("Existing bucket: {}", bucket_name.clone());
}
let location = format!("/{}/{}", payload.did, fnm);
let location = format!("/{}/{}", payload.did, fnm)
.as_bytes()
.to_vec()
.iter()
.map(|b| format!("{:02x}", b).to_string())
.collect::<Vec<String>>()
.join("");
print!("===>{}", location.clone());
s3_client.put_object(
&mut PutObjectArgs::new(
@ -129,10 +171,11 @@ async fn upload(
let doc = Mutation::create_doc_info(&data.conn, Model {
did: Default::default(),
uid: uid,
doc_name: fnm,
doc_name: fnm.clone(),
size: payload.file_field.bytes.len() as i64,
location,
r#type: "doc".to_string(),
r#type: file_type(&fnm),
thumbnail_base64: Default::default(),
created_at: now(),
updated_at: now(),
is_deleted: Default::default(),
@ -214,6 +257,7 @@ async fn new_folder(
size: 0,
r#type: "folder".to_string(),
location: "".to_owned(),
thumbnail_base64: Default::default(),
created_at: now(),
updated_at: now(),
is_deleted: Default::default(),

View File

@ -90,12 +90,13 @@ async fn register(
doc_name: "/".into(),
size: 0,
location: "".into(),
thumbnail_base64: "".into(),
r#type: "folder".to_string(),
created_at: now(),
updated_at: now(),
is_deleted: Default::default(),
}).await?;
let tnm = vec!["视频", "图片", "音乐", "文档"];
let tnm = vec!["Video", "Picture", "Music", "Document"];
let tregx = vec![
".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)",
".*\\.(png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)",

View File

@ -17,6 +17,8 @@ pub struct Model {
#[serde(skip_deserializing)]
pub location: String,
#[serde(skip_deserializing)]
pub thumbnail_base64: String,
#[serde(skip_deserializing)]
pub created_at: DateTime<FixedOffset>,
#[serde(skip_deserializing)]
pub updated_at: DateTime<FixedOffset>,

View File

@ -9,28 +9,28 @@ pub struct Model {
#[sea_orm(index)]
pub tag_id: i64,
#[sea_orm(index)]
pub uid: i64,
pub did: i64,
}
#[derive(Debug, Clone, Copy, EnumIter)]
pub enum Relation {
DocInfo,
Tag,
DocInfo,
}
impl RelationTrait for Relation {
fn def(&self) -> sea_orm::RelationDef {
match self {
Self::DocInfo =>
Entity::belongs_to(super::doc_info::Entity)
.from(Column::Uid)
.to(super::doc_info::Column::Uid)
.into(),
Self::Tag =>
Entity::belongs_to(super::tag_info::Entity)
.from(Column::TagId)
.to(super::tag_info::Column::Tid)
.into(),
Self::DocInfo =>
Entity::belongs_to(super::doc_info::Entity)
.from(Column::Did)
.to(super::doc_info::Column::Did)
.into(),
}
}
}

View File

@ -163,7 +163,7 @@ impl Query {
);
}
if tag.regx.len() > 0 {
cond.push_str(&format!(" and doc_name ~ '{}'", tag.regx));
cond.push_str(&format!(" and (type='{}' or doc_name ~ '{}') ", tag.tag_name, tag.regx));
}
}
@ -254,6 +254,7 @@ impl Mutation {
size: Set(form_data.size.to_owned()),
r#type: Set(form_data.r#type.to_owned()),
location: Set(form_data.location.to_owned()),
thumbnail_base64: Default::default(),
created_at: Set(form_data.created_at.to_owned()),
updated_at: Set(form_data.updated_at.to_owned()),
is_deleted: Default::default(),
@ -277,6 +278,7 @@ impl Mutation {
size: Set(form_data.size.to_owned()),
r#type: Set(form_data.r#type.to_owned()),
location: Set(form_data.location.to_owned()),
thumbnail_base64: doc_info.thumbnail_base64,
created_at: doc_info.created_at,
updated_at: Set(now()),
is_deleted: Default::default(),