build dialog server; add thumbnail to docinfo; (#17)

This commit is contained in:
KevinHuSh 2023-12-26 19:32:06 +08:00 committed by GitHub
parent 3245107dc7
commit 3fc700a1d4
12 changed files with 94 additions and 42 deletions

View File

@ -26,6 +26,7 @@ migration = { path = "./migration" }
minio = "0.1.0" minio = "0.1.0"
futures-util = "0.3.29" futures-util = "0.3.29"
actix-multipart-extract = "0.1.5" actix-multipart-extract = "0.1.5"
regex = "1.10.2"
[[bin]] [[bin]]
name = "doc_gpt" name = "doc_gpt"

View File

@ -201,7 +201,8 @@ impl MigrationTrait for Migration {
.col(ColumnDef::new(DocInfo::Location).string().not_null()) .col(ColumnDef::new(DocInfo::Location).string().not_null())
.col(ColumnDef::new(DocInfo::Size).big_integer().not_null()) .col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
.col(ColumnDef::new(DocInfo::Type).string().not_null()) .col(ColumnDef::new(DocInfo::Type).string().not_null())
.comment("doc|folder") .col(ColumnDef::new(DocInfo::ThumbnailBase64).string().not_null())
.comment("doc type|folder")
.col( .col(
ColumnDef::new(DocInfo::CreatedAt) ColumnDef::new(DocInfo::CreatedAt)
.timestamp_with_time_zone() .timestamp_with_time_zone()
@ -249,7 +250,6 @@ impl MigrationTrait for Migration {
.to_owned() .to_owned()
).await?; ).await?;
let tm = now();
let root_insert = Query::insert() let root_insert = Query::insert()
.into_table(UserInfo::Table) .into_table(UserInfo::Table)
.columns([UserInfo::Email, UserInfo::Nickname, UserInfo::Password]) .columns([UserInfo::Email, UserInfo::Nickname, UserInfo::Password])
@ -273,28 +273,28 @@ impl MigrationTrait for Migration {
.columns([TagInfo::Uid, TagInfo::TagName, TagInfo::Regx, TagInfo::Color, TagInfo::Icon]) .columns([TagInfo::Uid, TagInfo::TagName, TagInfo::Regx, TagInfo::Color, TagInfo::Icon])
.values_panic([ .values_panic([
(1).into(), (1).into(),
"视频".into(), "Video".into(),
".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)".into(), ".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)".into(),
(1).into(), (1).into(),
(1).into(), (1).into(),
]) ])
.values_panic([ .values_panic([
(1).into(), (1).into(),
"图片".into(), "Picture".into(),
".*\\.(png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)".into(), ".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)".into(),
(2).into(), (2).into(),
(2).into(), (2).into(),
]) ])
.values_panic([ .values_panic([
(1).into(), (1).into(),
"音乐".into(), "Music".into(),
".*\\.(WAV|FLAC|APE|ALAC|WavPack|WV|MP3|AAC|Ogg|Vorbis|Opus)".into(), ".*\\.(WAV|FLAC|APE|ALAC|WavPack|WV|MP3|AAC|Ogg|Vorbis|Opus)".into(),
(3).into(), (3).into(),
(3).into(), (3).into(),
]) ])
.values_panic([ .values_panic([
(1).into(), (1).into(),
"文档".into(), "Document".into(),
".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)".into(), ".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)".into(),
(3).into(), (3).into(),
(3).into(), (3).into(),
@ -419,6 +419,7 @@ enum DocInfo {
Location, Location,
Size, Size,
Type, Type,
ThumbnailBase64,
CreatedAt, CreatedAt,
UpdatedAt, UpdatedAt,
IsDeleted, IsDeleted,

View File

@ -1,10 +1,10 @@
[infiniflow] [infiniflow]
es=http://127.0.0.1:9200 es=http://es01:9200
pgdb_usr=root pgdb_usr=root
pgdb_pwd=infiniflow_docgpt pgdb_pwd=infiniflow_docgpt
pgdb_host=127.0.0.1 pgdb_host=postgres
pgdb_port=5455 pgdb_port=5432
minio_host=127.0.0.1:9000 minio_host=minio:9000
minio_usr=infiniflow minio_usr=infiniflow
minio_pwd=infiniflow_docgpt minio_pwd=infiniflow_docgpt

View File

@ -24,6 +24,7 @@ class QWen(Base):
from http import HTTPStatus from http import HTTPStatus
from dashscope import Generation from dashscope import Generation
from dashscope.api_entities.dashscope_response import Role from dashscope.api_entities.dashscope_response import Role
# export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
response = Generation.call( response = Generation.call(
Generation.Models.qwen_turbo, Generation.Models.qwen_turbo,
messages=messages, messages=messages,

View File

@ -9,6 +9,8 @@ from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
import numpy as np import numpy as np
from copy import deepcopy from copy import deepcopy
def index_name(uid):return f"docgpt_{uid}"
class Dealer: class Dealer:
def __init__(self, es, emb_mdl): def __init__(self, es, emb_mdl):
self.qryr = query.EsQueryer(es) self.qryr = query.EsQueryer(es)

View File

@ -6,11 +6,10 @@ from tornado.ioloop import IOLoop
from tornado.httpserver import HTTPServer from tornado.httpserver import HTTPServer
from tornado.options import define,options from tornado.options import define,options
from util import es_conn, setup_logging from util import es_conn, setup_logging
from svr import sec_search as search
from svr.rpc_proxy import RPCProxy
from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
from nlp import huqie from nlp import huqie
from nlp import query as Query from nlp import query as Query
from nlp import search
from llm import HuEmbedding, GptTurbo from llm import HuEmbedding, GptTurbo
import numpy as np import numpy as np
from io import BytesIO from io import BytesIO
@ -38,7 +37,7 @@ def get_QA_pairs(hists):
def get_instruction(sres, top_i, max_len=8096 fld="content_ltks"): def get_instruction(sres, top_i, max_len=8096, fld="content_ltks"):
max_len //= len(top_i) max_len //= len(top_i)
# add instruction to prompt # add instruction to prompt
instructions = [re.sub(r"[\r\n]+", " ", sres.field[sres.ids[i]][fld]) for i in top_i] instructions = [re.sub(r"[\r\n]+", " ", sres.field[sres.ids[i]][fld]) for i in top_i]
@ -98,8 +97,9 @@ class Handler(RequestHandler):
res = SE.search({ res = SE.search({
"question": question, "question": question,
"kb_ids": param.get("kb_ids", []), "kb_ids": param.get("kb_ids", []),
"size": param.get("topn", 15) "size": param.get("topn", 15)},
}) search.index_name(param["uid"])
)
sim = SE.rerank(res, question) sim = SE.rerank(res, question)
rk_idx = np.argsort(sim*-1) rk_idx = np.argsort(sim*-1)
@ -112,12 +112,12 @@ class Handler(RequestHandler):
refer = OrderedDict() refer = OrderedDict()
docnms = {} docnms = {}
for i in rk_idx: for i in rk_idx:
did = res.field[res.ids[i]]["doc_id"]) did = res.field[res.ids[i]]["doc_id"]
if did not in docnms: docnms[did] = res.field[res.ids[i]]["docnm_kwd"]) if did not in docnms: docnms[did] = res.field[res.ids[i]]["docnm_kwd"]
if did not in refer: refer[did] = [] if did not in refer: refer[did] = []
refer[did].append({ refer[did].append({
"chunk_id": res.ids[i], "chunk_id": res.ids[i],
"content": res.field[res.ids[i]]["content_ltks"]), "content": res.field[res.ids[i]]["content_ltks"],
"image": "" "image": ""
}) })
@ -128,7 +128,7 @@ class Handler(RequestHandler):
"data":{ "data":{
"uid": param["uid"], "uid": param["uid"],
"dialog_id": param["dialog_id"], "dialog_id": param["dialog_id"],
"assistant": ans "assistant": ans,
"refer": [{ "refer": [{
"did": did, "did": did,
"doc_name": docnms[did], "doc_name": docnms[did],
@ -153,7 +153,7 @@ if __name__ == '__main__':
parser.add_argument("--port", default=4455, type=int, help="Port used for service") parser.add_argument("--port", default=4455, type=int, help="Port used for service")
ARGS = parser.parse_args() ARGS = parser.parse_args()
SE = search.ResearchReportSearch(es_conn.HuEs("infiniflow"), EMBEDDING) SE = search.Dealer(es_conn.HuEs("infiniflow"), EMBEDDING)
app = Application([(r'/v1/chat/completions', Handler)],debug=False) app = Application([(r'/v1/chat/completions', Handler)],debug=False)
http_server = HTTPServer(app) http_server = HTTPServer(app)

View File

@ -6,7 +6,7 @@ from util.db_conn import Postgres
from util.minio_conn import HuMinio from util.minio_conn import HuMinio
from util import rmSpace, findMaxDt from util import rmSpace, findMaxDt
from FlagEmbedding import FlagModel from FlagEmbedding import FlagModel
from nlp import huchunk, huqie from nlp import huchunk, huqie, search
import base64, hashlib import base64, hashlib
from io import BytesIO from io import BytesIO
import pandas as pd import pandas as pd
@ -103,7 +103,7 @@ def build(row):
if(!ctx._source.kb_id.contains('%s')) if(!ctx._source.kb_id.contains('%s'))
ctx._source.kb_id.add('%s'); ctx._source.kb_id.add('%s');
"""%(str(row["kb_id"]), str(row["kb_id"])), """%(str(row["kb_id"]), str(row["kb_id"])),
idxnm = index_name(row["uid"]) idxnm = search.index_name(row["uid"])
) )
set_progress(row["kb2doc_id"], 1, "Done") set_progress(row["kb2doc_id"], 1, "Done")
return [] return []
@ -171,10 +171,8 @@ def build(row):
return docs return docs
def index_name(uid):return f"docgpt_{uid}"
def init_kb(row): def init_kb(row):
idxnm = index_name(row["uid"]) idxnm = search.index_name(row["uid"])
if ES.indexExist(idxnm): return if ES.indexExist(idxnm): return
return ES.createIdx(idxnm, json.load(open("conf/mapping.json", "r"))) return ES.createIdx(idxnm, json.load(open("conf/mapping.json", "r")))
@ -199,7 +197,7 @@ def rm_doc_from_kb(df):
ctx._source.kb_id.indexOf('%s') ctx._source.kb_id.indexOf('%s')
); );
"""%(str(r["kb_id"]),str(r["kb_id"])), """%(str(r["kb_id"]),str(r["kb_id"])),
idxnm = index_name(r["uid"]) idxnm = search.index_name(r["uid"])
) )
if len(df) == 0:return if len(df) == 0:return
sql = """ sql = """
@ -233,7 +231,7 @@ def main(comm, mod):
set_progress(r["kb2doc_id"], random.randint(70, 95)/100., set_progress(r["kb2doc_id"], random.randint(70, 95)/100.,
"Finished embedding! Start to build index!") "Finished embedding! Start to build index!")
init_kb(r) init_kb(r)
es_r = ES.bulk(cks, index_name(r["uid"])) es_r = ES.bulk(cks, search.index_name(r["uid"]))
if es_r: if es_r:
set_progress(r["kb2doc_id"], -1, "Index failure!") set_progress(r["kb2doc_id"], -1, "Index failure!")
print(es_r) print(es_r)

View File

@ -11,6 +11,7 @@ use crate::entity::doc_info::Model;
use crate::errors::AppError; use crate::errors::AppError;
use crate::service::doc_info::{ Mutation, Query }; use crate::service::doc_info::{ Mutation, Query };
use serde::Deserialize; use serde::Deserialize;
use regex::Regex;
fn now() -> chrono::DateTime<FixedOffset> { fn now() -> chrono::DateTime<FixedOffset> {
Utc::now().with_timezone(&FixedOffset::east_opt(3600 * 8).unwrap()) Utc::now().with_timezone(&FixedOffset::east_opt(3600 * 8).unwrap())
@ -64,6 +65,41 @@ pub struct UploadForm {
did: i64, did: i64,
} }
fn file_type(filename: &String) -> String {
let fnm = filename.to_lowercase();
if
let Some(_) = Regex::new(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)$")
.unwrap()
.captures(&fnm)
{
return "Video".to_owned();
}
if
let Some(_) = Regex::new(
r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)$"
)
.unwrap()
.captures(&fnm)
{
return "Picture".to_owned();
}
if
let Some(_) = Regex::new(r"\.(WAV|FLAC|APE|ALAC|WavPack|WV|MP3|AAC|Ogg|Vorbis|Opus)$")
.unwrap()
.captures(&fnm)
{
return "Music".to_owned();
}
if
let Some(_) = Regex::new(r"\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)$")
.unwrap()
.captures(&fnm)
{
return "Document".to_owned();
}
"Other".to_owned()
}
#[post("/v1.0/upload")] #[post("/v1.0/upload")]
async fn upload( async fn upload(
payload: Multipart<UploadForm>, payload: Multipart<UploadForm>,
@ -114,7 +150,13 @@ async fn upload(
print!("Existing bucket: {}", bucket_name.clone()); print!("Existing bucket: {}", bucket_name.clone());
} }
let location = format!("/{}/{}", payload.did, fnm); let location = format!("/{}/{}", payload.did, fnm)
.as_bytes()
.to_vec()
.iter()
.map(|b| format!("{:02x}", b).to_string())
.collect::<Vec<String>>()
.join("");
print!("===>{}", location.clone()); print!("===>{}", location.clone());
s3_client.put_object( s3_client.put_object(
&mut PutObjectArgs::new( &mut PutObjectArgs::new(
@ -129,10 +171,11 @@ async fn upload(
let doc = Mutation::create_doc_info(&data.conn, Model { let doc = Mutation::create_doc_info(&data.conn, Model {
did: Default::default(), did: Default::default(),
uid: uid, uid: uid,
doc_name: fnm, doc_name: fnm.clone(),
size: payload.file_field.bytes.len() as i64, size: payload.file_field.bytes.len() as i64,
location, location,
r#type: "doc".to_string(), r#type: file_type(&fnm),
thumbnail_base64: Default::default(),
created_at: now(), created_at: now(),
updated_at: now(), updated_at: now(),
is_deleted: Default::default(), is_deleted: Default::default(),
@ -214,6 +257,7 @@ async fn new_folder(
size: 0, size: 0,
r#type: "folder".to_string(), r#type: "folder".to_string(),
location: "".to_owned(), location: "".to_owned(),
thumbnail_base64: Default::default(),
created_at: now(), created_at: now(),
updated_at: now(), updated_at: now(),
is_deleted: Default::default(), is_deleted: Default::default(),

View File

@ -90,12 +90,13 @@ async fn register(
doc_name: "/".into(), doc_name: "/".into(),
size: 0, size: 0,
location: "".into(), location: "".into(),
thumbnail_base64: "".into(),
r#type: "folder".to_string(), r#type: "folder".to_string(),
created_at: now(), created_at: now(),
updated_at: now(), updated_at: now(),
is_deleted: Default::default(), is_deleted: Default::default(),
}).await?; }).await?;
let tnm = vec!["视频", "图片", "音乐", "文档"]; let tnm = vec!["Video", "Picture", "Music", "Document"];
let tregx = vec![ let tregx = vec![
".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)", ".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)",
".*\\.(png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)", ".*\\.(png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)",

View File

@ -17,6 +17,8 @@ pub struct Model {
#[serde(skip_deserializing)] #[serde(skip_deserializing)]
pub location: String, pub location: String,
#[serde(skip_deserializing)] #[serde(skip_deserializing)]
pub thumbnail_base64: String,
#[serde(skip_deserializing)]
pub created_at: DateTime<FixedOffset>, pub created_at: DateTime<FixedOffset>,
#[serde(skip_deserializing)] #[serde(skip_deserializing)]
pub updated_at: DateTime<FixedOffset>, pub updated_at: DateTime<FixedOffset>,

View File

@ -9,28 +9,28 @@ pub struct Model {
#[sea_orm(index)] #[sea_orm(index)]
pub tag_id: i64, pub tag_id: i64,
#[sea_orm(index)] #[sea_orm(index)]
pub uid: i64, pub did: i64,
} }
#[derive(Debug, Clone, Copy, EnumIter)] #[derive(Debug, Clone, Copy, EnumIter)]
pub enum Relation { pub enum Relation {
DocInfo,
Tag, Tag,
DocInfo,
} }
impl RelationTrait for Relation { impl RelationTrait for Relation {
fn def(&self) -> sea_orm::RelationDef { fn def(&self) -> sea_orm::RelationDef {
match self { match self {
Self::DocInfo =>
Entity::belongs_to(super::doc_info::Entity)
.from(Column::Uid)
.to(super::doc_info::Column::Uid)
.into(),
Self::Tag => Self::Tag =>
Entity::belongs_to(super::tag_info::Entity) Entity::belongs_to(super::tag_info::Entity)
.from(Column::TagId) .from(Column::TagId)
.to(super::tag_info::Column::Tid) .to(super::tag_info::Column::Tid)
.into(), .into(),
Self::DocInfo =>
Entity::belongs_to(super::doc_info::Entity)
.from(Column::Did)
.to(super::doc_info::Column::Did)
.into(),
} }
} }
} }

View File

@ -163,7 +163,7 @@ impl Query {
); );
} }
if tag.regx.len() > 0 { if tag.regx.len() > 0 {
cond.push_str(&format!(" and doc_name ~ '{}'", tag.regx)); cond.push_str(&format!(" and (type='{}' or doc_name ~ '{}') ", tag.tag_name, tag.regx));
} }
} }
@ -254,6 +254,7 @@ impl Mutation {
size: Set(form_data.size.to_owned()), size: Set(form_data.size.to_owned()),
r#type: Set(form_data.r#type.to_owned()), r#type: Set(form_data.r#type.to_owned()),
location: Set(form_data.location.to_owned()), location: Set(form_data.location.to_owned()),
thumbnail_base64: Default::default(),
created_at: Set(form_data.created_at.to_owned()), created_at: Set(form_data.created_at.to_owned()),
updated_at: Set(form_data.updated_at.to_owned()), updated_at: Set(form_data.updated_at.to_owned()),
is_deleted: Default::default(), is_deleted: Default::default(),
@ -277,6 +278,7 @@ impl Mutation {
size: Set(form_data.size.to_owned()), size: Set(form_data.size.to_owned()),
r#type: Set(form_data.r#type.to_owned()), r#type: Set(form_data.r#type.to_owned()),
location: Set(form_data.location.to_owned()), location: Set(form_data.location.to_owned()),
thumbnail_base64: doc_info.thumbnail_base64,
created_at: doc_info.created_at, created_at: doc_info.created_at,
updated_at: Set(now()), updated_at: Set(now()),
is_deleted: Default::default(), is_deleted: Default::default(),