add docker compose (#8)

* add docker compose

* add docker compose
This commit is contained in:
KevinHuSh 2023-12-15 19:38:32 +08:00 committed by GitHub
parent f4456af464
commit 738c322508
6 changed files with 153 additions and 3 deletions

68
docker/docker-compose.yml Normal file
View File

@ -0,0 +1,68 @@
version: '2.2'
services:
  es01:
    container_name: docass-es-01
    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
    volumes:
      - esdata01:/usr/share/elasticsearch/data
    ports:
      # Quote port mappings so YAML never misparses them (sexagesimal trap).
      - "${ES_PORT}:9200"
    environment:
      - node.name=es01
      - cluster.name=${CLUSTER_NAME}
      - cluster.initial_master_nodes=es01
      - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
      - bootstrap.memory_lock=false
      - xpack.security.enabled=false
    mem_limit: ${MEM_LIMIT}
    ulimits:
      memlock:
        soft: -1
        hard: -1
    networks:
      - docass
    restart: always
  kibana:
    depends_on:
      - es01
    image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
    container_name: docass-kibana
    volumes:
      - kibanadata:/usr/share/kibana/data
    ports:
      - "${KIBANA_PORT}:5601"
    environment:
      - SERVERNAME=kibana
      - ELASTICSEARCH_HOSTS=http://es01:9200
    mem_limit: ${MEM_LIMIT}
    networks:
      - docass
  postgres:
    image: postgres
    container_name: docass-postgres
    environment:
      - POSTGRES_USER=${POSTGRES_USER}
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
      - POSTGRES_DB=${POSTGRES_DB}
    ports:
      # fix: postgres listens on 5432 inside the container; 5455:5455 mapped
      # the host port to nothing (the README uses -p 5455:5432).
      - "5455:5432"
    volumes:
      # fix: persist postgres data at its real data directory, not the
      # elasticsearch path copy-pasted from es01.
      - pg_data:/var/lib/postgresql/data
    networks:
      - docass
    restart: always
volumes:
  esdata01:
    driver: local
  kibanadata:
    driver: local
  pg_data:
    driver: local
networks:
  docass:
    driver: bridge

22
python/README.md Normal file
View File

@ -0,0 +1,22 @@
```shell
docker pull postgres
LOCAL_POSTGRES_DATA=./postgres-data
docker run \
  --name docass-postgres \
  -p 5455:5432 \
  -v $LOCAL_POSTGRES_DATA:/var/lib/postgresql/data \
  -e POSTGRES_USER=root \
  -e POSTGRES_PASSWORD=infiniflow_docass \
  -e POSTGRES_DB=docass \
  -d \
  postgres
docker network create elastic
docker pull elasticsearch:8.11.3;
docker pull docker.elastic.co/kibana/kibana:8.11.3
```

View File

@ -1,4 +1,8 @@
[online] [online]
es=127.0.0.1:9200 es=127.0.0.1:9200
idx_nm=toxic idx_nm=toxic
pgdb_usr=root
pgdb_pwd=infiniflow_docass
pgdb_host=127.0.0.1
pgdb_port=5432

View File

@ -291,6 +291,12 @@ class PdfChunker(HuChunker):
class DocxChunker(HuChunker): class DocxChunker(HuChunker):
@dataclass
class Fields:
text_chunks: List = None
table_chunks: List = None
def __init__(self, doc_parser): def __init__(self, doc_parser):
self.doc = doc_parser self.doc = doc_parser
super().__init__() super().__init__()
@ -336,6 +342,12 @@ class DocxChunker(HuChunker):
class ExcelChunker(HuChunker): class ExcelChunker(HuChunker):
@dataclass
class Fields:
text_chunks: List = None
table_chunks: List = None
def __init__(self, excel_parser): def __init__(self, excel_parser):
self.excel = excel_parser self.excel = excel_parser
super().__init__() super().__init__()
@ -354,10 +366,10 @@ if __name__ == "__main__":
from parser import PdfParser from parser import PdfParser
ckr = PdfChunker(PdfParser()) ckr = PdfChunker(PdfParser())
if sys.argv[1].split(".")[-1].lower().find("doc") >= 0: if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
from .parser import DocxParser from parser import DocxParser
ckr = DocxChunker(DocxParser()) ckr = DocxChunker(DocxParser())
if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0: if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
from .parser import ExcelParser from parser import ExcelParser
ckr = ExcelChunker(ExcelParser()) ckr = ExcelChunker(ExcelParser())
# ckr.html(sys.argv[1]) # ckr.html(sys.argv[1])

View File

@ -323,7 +323,7 @@ class HuParser:
return layouts return layouts
def __table_paddle(self, images): def __table_paddle(self, images):
tbls = self.tbl_det([np.array(img) for img in images], thr=0.5) tbls = self.tbl_det([img for img in images], threshold=0.5)
res = [] res = []
# align left&right for rows, align top&bottom for columns # align left&right for rows, align top&bottom for columns
for tbl in tbls: for tbl in tbls:

44
python/util/db_conn.py Normal file
View File

@ -0,0 +1,44 @@
import logging
import time
from util import config
import pandas as pd
class Postgre(object):
    """Thin wrapper around a psycopg2 connection with retrying SELECTs.

    Connection parameters are read from the project config module
    (keys ``pgdb_usr``, ``pgdb_pwd``, ``pgdb_host``, ``pgdb_port``);
    the database name is supplied per instance.
    """

    def __init__(self, env, dbnm):
        # env: config environment/section name passed to config.init().
        # dbnm: database name used in the connection string.
        self.config = config.init(env)
        self.conn = None
        self.dbnm = dbnm
        self.__open__()

    def __open__(self):
        """(Re)open the connection, closing any previous handle first.

        Failures are logged, not raised; self.conn may remain None.
        """
        # Imported lazily so the module loads even without psycopg2 installed.
        import psycopg2
        try:
            if self.conn:
                self.__close__()
        except Exception:
            # Best-effort close of a possibly-broken handle; drop and move on.
            pass
        try:
            self.conn = psycopg2.connect(
                f"dbname={self.dbnm} "
                f"user={self.config.get('pgdb_usr')} "
                f"password={self.config.get('pgdb_pwd')} "
                f"host={self.config.get('pgdb_host')} "
                f"port={self.config.get('pgdb_port')}")
        except Exception as e:
            logging.error("Fail to connect %s " % self.config.get("pgdb_host") + str(e))

    def __close__(self):
        """Close the connection; log (don't raise) on failure."""
        try:
            self.conn.close()
        except Exception as e:
            logging.error("Fail to close %s " % self.config.get("pgdb_host") + str(e))

    def select(self, sql):
        """Execute a SELECT and return the result as a DataFrame.

        Retries up to 10 times, reconnecting and sleeping 1s between
        attempts; returns an empty DataFrame if every attempt fails.
        """
        for _ in range(10):
            try:
                return pd.read_sql(sql, self.conn)
            except Exception as e:
                # fixed stray 'l' that was fused onto the SQL text in the log
                logging.error(f"Fail to exec {sql} " + str(e))
                self.__open__()
                time.sleep(1)
        return pd.DataFrame()