Add Q&A and Book, fix task running bugs (#50)

KevinHuSh 2024-02-01 18:53:56 +08:00 committed by GitHub
parent 6224edcd1b
commit e6acaf6738
21 changed files with 628 additions and 276 deletions

View File

@@ -18,10 +18,12 @@ import datetime
 from flask import request
 from flask_login import login_required, current_user
 from elasticsearch_dsl import Q
+from rag.app.qa import rmPrefix, beAdoc
 from rag.nlp import search, huqie, retrievaler
 from rag.utils import ELASTICSEARCH, rmSpace
-from api.db import LLMType
-from api.db.services.kb_service import KnowledgebaseService
+from api.db import LLMType, ParserType
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import TenantLLMService
 from api.db.services.user_service import UserTenantService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
@@ -89,10 +91,8 @@ def get():
         res["chunk_id"] = id
         k = []
         for n in res.keys():
-            if re.search(r"(_vec$|_sm_)", n):
+            if re.search(r"(_vec$|_sm_|_tks|_ltks)", n):
                 k.append(n)
-            if re.search(r"(_tks|_ltks)", n):
-                res[n] = rmSpace(res[n])
         for n in k:
             del res[n]
@@ -106,12 +106,12 @@ def get():
 @manager.route('/set', methods=['POST'])
 @login_required
-@validate_request("doc_id", "chunk_id", "content_ltks",
+@validate_request("doc_id", "chunk_id", "content_with_weight",
                   "important_kwd")
 def set():
     req = request.json
     d = {"id": req["chunk_id"]}
-    d["content_ltks"] = huqie.qie(req["content_ltks"])
+    d["content_ltks"] = huqie.qie(req["content_with_weight"])
     d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
     d["important_kwd"] = req["important_kwd"]
     d["important_tks"] = huqie.qie(" ".join(req["important_kwd"]))
@@ -127,8 +127,15 @@ def set():
         e, doc = DocumentService.get_by_id(req["doc_id"])
         if not e:
             return get_data_error_result(retmsg="Document not found!")
+        if doc.parser_id == ParserType.QA:
+            arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t) > 1]
+            if len(arr) != 2: return get_data_error_result(retmsg="Q&A must be separated by TAB/ENTER key.")
+            q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
+            d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q + a]))
+
         v, c = embd_mdl.encode([doc.name, req["content_ltks"]])
-        v = 0.1 * v[0] + 0.9 * v[1]
+        v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
         d["q_%d_vec" % len(v)] = v.tolist()
         ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
         return get_json_result(data=True)
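
The new QA branch re-parses an edited chunk back into a question/answer pair before re-embedding it. A minimal sketch of that validation, with rmPrefix inlined from rag/app/qa.py (added later in this commit) so it runs standalone:

import re

# Inlined stand-in for rag.app.qa.rmPrefix; the pattern matches the one added in this commit.
def rm_prefix(txt):
    return re.sub(r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t: ]+",
                  "", txt.strip(), flags=re.IGNORECASE)

def split_qa(content_with_weight):
    # Mirrors the branch above: an edited Q&A chunk must contain exactly
    # one question and one answer separated by TAB or newline.
    arr = [t for t in re.split(r"[\n\t]", content_with_weight) if len(t) > 1]
    if len(arr) != 2:
        raise ValueError("Q&A must be separated by TAB/ENTER key.")
    return rm_prefix(arr[0]), rm_prefix(arr[1])

print(split_qa("Question: What is RAG?\tAnswer: Retrieval-augmented generation."))
# ('What is RAG?', 'Retrieval-augmented generation.')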

View File

@@ -18,7 +18,7 @@ from flask import request
 from flask_login import login_required, current_user
 from api.db.services.dialog_service import DialogService
 from api.db import StatusEnum
-from api.db.services.kb_service import KnowledgebaseService
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.user_service import TenantService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid

View File

@@ -27,10 +27,10 @@ from api.db.services.task_service import TaskService
 from rag.nlp import search
 from rag.utils import ELASTICSEARCH
 from api.db.services import duplicate_name
-from api.db.services.kb_service import KnowledgebaseService
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid
-from api.db import FileType
+from api.db import FileType, TaskStatus
 from api.db.services.document_service import DocumentService
 from api.settings import RetCode
 from api.utils.api_utils import get_json_result
@@ -210,13 +210,12 @@ def rm():
 @manager.route('/run', methods=['POST'])
 @login_required
 @validate_request("doc_ids", "run")
-def rm():
+def run():
     req = request.json
     try:
         for id in req["doc_ids"]:
-            DocumentService.update_by_id(id, {"run": str(req["run"])})
-            if req["run"] == "2":
-                TaskService.filter_delete([Task.doc_id == id])
+            DocumentService.update_by_id(id, {"run": str(req["run"]), "progress": 0})
+            if str(req["run"]) == TaskStatus.CANCEL.value:
                 tenant_id = DocumentService.get_tenant_id(id)
                 if not tenant_id:
                     return get_data_error_result(retmsg="Tenant not found!")
@@ -284,9 +283,10 @@ def change_parser():
         if doc.parser_id.lower() == req["parser_id"].lower():
             return get_json_result(data=True)
-        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": 1})
+        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress": 0, "progress_msg": ""})
         if not e:
             return get_data_error_result(retmsg="Document not found!")
+        if doc.token_num > 0:
             e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duation * -1)
             if not e:
                 return get_data_error_result(retmsg="Document not found!")

View File

@@ -21,7 +21,7 @@ from api.db.services.user_service import TenantService, UserTenantService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid, get_format_time
 from api.db import StatusEnum, UserTenantRole
-from api.db.services.kb_service import KnowledgebaseService
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.db_models import Knowledgebase
 from api.settings import stat_logger, RetCode
 from api.utils.api_utils import get_json_result

View File

@@ -22,7 +22,7 @@ from api.db.services.user_service import TenantService, UserTenantService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid, get_format_time
 from api.db import StatusEnum, UserTenantRole
-from api.db.services.kb_service import KnowledgebaseService
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.db_models import Knowledgebase, TenantLLM
 from api.settings import stat_logger, RetCode
 from api.utils.api_utils import get_json_result

View File

@@ -61,12 +61,19 @@ class ChatStyle(StrEnum):
     CUSTOM = 'Custom'


+class TaskStatus(StrEnum):
+    RUNNING = "1"
+    CANCEL = "2"
+    DONE = "3"
+    FAIL = "4"
+
+
 class ParserType(StrEnum):
     GENERAL = "general"
     PRESENTATION = "presentation"
     LAWS = "laws"
     MANUAL = "manual"
     PAPER = "paper"
-    RESUME = ""
-    BOOK = ""
-    QA = ""
+    RESUME = "resume"
+    BOOK = "book"
+    QA = "qa"

View File

@@ -33,8 +33,8 @@ def bulk_insert_into_db(model, data_source, replace_on_conflict=False):
     DB.create_tables([model])

-    for data in data_source:
-        current_time = current_timestamp()
+    for i, data in enumerate(data_source):
+        current_time = current_timestamp() + i
         current_date = timestamp_to_date(current_time)
         if 'create_time' not in data:
             data['create_time'] = current_time
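
Adding the loop index to the timestamp gives every row of one bulk insert a distinct, strictly increasing create_time, so code that orders or dispatches by create_time (like the modulo sharding in DocumentService) sees a stable order. A tiny sketch, assuming current_timestamp() returns integer milliseconds as in api.utils:

from datetime import datetime

def current_timestamp():
    # assumption: milliseconds, matching api.utils.current_timestamp
    return int(datetime.now().timestamp() * 1000)

rows = [{"name": f"task-{n}"} for n in range(3)]
for i, data in enumerate(rows):
    data["create_time"] = current_timestamp() + i  # distinct even within one clock tick
assert len({r["create_time"] for r in rows}) == len(rows)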

View File

@@ -15,11 +15,11 @@
 #
 from peewee import Expression
-from api.db import TenantPermission, FileType
+from api.db import TenantPermission, FileType, TaskStatus
 from api.db.db_models import DB, Knowledgebase, Tenant
 from api.db.db_models import Document
 from api.db.services.common_service import CommonService
-from api.db.services.kb_service import KnowledgebaseService
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db import StatusEnum
@@ -71,6 +71,7 @@ class DocumentService(CommonService):
                     ~(cls.model.type == FileType.VIRTUAL.value),
                     cls.model.progress == 0,
                     cls.model.update_time >= tm,
+                    cls.model.run == TaskStatus.RUNNING.value,
                     (Expression(cls.model.create_time, "%%", comm) == mod))\
             .order_by(cls.model.update_time.asc())\
             .paginate(1, items_per_page)

View File

@@ -13,13 +13,52 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from api.db.db_models import Knowledgebase, Document
+from api.db import StatusEnum, TenantPermission
+from api.db.db_models import Knowledgebase, DB, Tenant
 from api.db.services.common_service import CommonService


 class KnowledgebaseService(CommonService):
     model = Knowledgebase

-
-class DocumentService(CommonService):
-    model = Document
+    @classmethod
+    @DB.connection_context()
+    def get_by_tenant_ids(cls, joined_tenant_ids, user_id,
+                          page_number, items_per_page, orderby, desc):
+        kbs = cls.model.select().where(
+            ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
+              TenantPermission.TEAM.value)) | (cls.model.tenant_id == user_id))
+            & (cls.model.status == StatusEnum.VALID.value)
+        )
+        if desc:
+            kbs = kbs.order_by(cls.model.getter_by(orderby).desc())
+        else:
+            kbs = kbs.order_by(cls.model.getter_by(orderby).asc())
+
+        kbs = kbs.paginate(page_number, items_per_page)
+
+        return list(kbs.dicts())
+
+    @classmethod
+    @DB.connection_context()
+    def get_detail(cls, kb_id):
+        fields = [
+            cls.model.id,
+            Tenant.embd_id,
+            cls.model.avatar,
+            cls.model.name,
+            cls.model.description,
+            cls.model.permission,
+            cls.model.doc_num,
+            cls.model.token_num,
+            cls.model.chunk_num,
+            cls.model.parser_id]
+        kbs = cls.model.select(*fields).join(Tenant, on=(
+            (Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where(
+            (cls.model.id == kb_id),
+            (cls.model.status == StatusEnum.VALID.value)
+        )
+        if not kbs:
+            return
+        d = kbs[0].to_dict()
+        d["embd_id"] = kbs[0].tenant.embd_id
+        return d

View File

@@ -15,9 +15,10 @@
 #
 from peewee import Expression
 from api.db.db_models import DB
-from api.db import StatusEnum, FileType
+from api.db import StatusEnum, FileType, TaskStatus
 from api.db.db_models import Task, Document, Knowledgebase, Tenant
 from api.db.services.common_service import CommonService
+from api.db.services.document_service import DocumentService


 class TaskService(CommonService):
@@ -46,8 +47,9 @@ class TaskService(CommonService):
     @DB.connection_context()
     def do_cancel(cls, id):
         try:
-            cls.model.get_by_id(id)
-            return False
+            task = cls.model.get_by_id(id)
+            _, doc = DocumentService.get_by_id(task.doc_id)
+            return doc.run == TaskStatus.CANCEL.value
         except Exception as e:
             pass
         return True
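
do_cancel no longer reports every existing task as non-cancelled; it follows the task to its parent document and reads the document's run flag. A hedged sketch of the contract, with in-memory dicts standing in for the ORM lookups:

DOCS = {"doc-1": {"run": "2"}}            # "2" == TaskStatus.CANCEL.value
TASKS = {"task-1": {"doc_id": "doc-1"}}

def do_cancel(task_id):
    try:
        task = TASKS[task_id]             # cls.model.get_by_id(id)
        doc = DOCS[task["doc_id"]]        # DocumentService.get_by_id(task.doc_id)
        return doc["run"] == "2"          # cancelled iff the document was set to CANCEL
    except Exception:
        pass
    return True                           # missing task: safest to treat as cancelled

assert do_cancel("task-1") is True        # parent document was cancelled
assert do_cancel("missing") is True       # unknown task also reads as cancelled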

View File

@@ -143,7 +143,7 @@ def filename_type(filename):
     if re.match(r".*\.pdf$", filename):
         return FileType.PDF.value
-    if re.match(r".*\.(docx|doc|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key|md)$", filename):
+    if re.match(r".*\.(docx|doc|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
         return FileType.DOC.value
     if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename):

View File

@@ -4,14 +4,8 @@ from nltk import word_tokenize
 from rag.nlp import stemmer, huqie


-def callback__(progress, msg, func):
-    if not func: return
-    func(progress, msg)
-
-
 BULLET_PATTERN = [[
-    r"第[零一二三四五六七八九十百]+",
+    r"第[零一二三四五六七八九十百]+(编|部分)",
     r"第[零一二三四五六七八九十百]+章",
     r"第[零一二三四五六七八九十百]+节",
     r"第[零一二三四五六七八九十百]+条",
@@ -22,6 +16,8 @@ BULLET_PATTERN = [[
     r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
     r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
 ], [
+    r"第[零一二三四五六七八九十百]+章",
+    r"第[零一二三四五六七八九十百]+节",
     r"[零一二三四五六七八九十百]+[ 、]",
     r"[\(][零一二三四五六七八九十百]+[\)]",
     r"[\(][0-9]{,2}[\)]",
@@ -54,7 +50,7 @@ def bullets_category(sections):
 def is_english(texts):
     eng = 0
     for t in texts:
-        if re.match(r"[a-zA-Z]", t.strip()):
+        if re.match(r"[a-zA-Z]{2,}", t.strip()):
             eng += 1
     if eng / len(texts) > 0.8:
         return True
@@ -70,3 +66,26 @@ def tokenize(d, t, eng):
     d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])


+def remove_contents_table(sections, eng=False):
+    i = 0
+    while i < len(sections):
+        def get(i):
+            nonlocal sections
+            return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
+        if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
+                        re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0]), re.IGNORECASE):
+            i += 1
+            continue
+        sections.pop(i)
+        if i >= len(sections): break
+        prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
+        while not prefix:
+            sections.pop(i)
+            if i >= len(sections): break
+            prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
+        sections.pop(i)
+        if i >= len(sections) or not prefix: break
+        for j in range(i, min(i + 128, len(sections))):
+            if not re.match(prefix, get(j)):
+                continue
+            for _ in range(i, j): sections.pop(i)
+            break
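
remove_contents_table drops a table-of-contents block: it pops the ToC heading, derives a short prefix from the first entry, then skips ahead until that prefix reappears as the real chapter heading. Roughly, on the Chinese (eng=False) path:

from rag.app import remove_contents_table  # the helper added above

sections = [
    "目录",
    "第一章 绪论 ....... 1",
    "第二章 方法 ....... 9",
    "第一章 绪论",
    "正文从这里开始。",
]
remove_contents_table(sections)
print(sections)  # ['第一章 绪论', '正文从这里开始。']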

rag/app/book.py (new file, 156 lines)
View File

@@ -0,0 +1,156 @@
+import copy
+import random
+import re
+from io import BytesIO
+from docx import Document
+import numpy as np
+from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize, remove_contents_table
+from rag.nlp import huqie
+from rag.parser.docx_parser import HuDocxParser
+from rag.parser.pdf_parser import HuParser
+
+
+class Pdf(HuParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page)
+        callback(0.1, "OCR finished")
+
+        from timeit import default_timer as timer
+        start = timer()
+        self._layouts_paddle(zoomin)
+        callback(0.47, "Layout analysis finished")
+        print("paddle layouts:", timer() - start)
+        self._table_transformer_job(zoomin)
+        callback(0.68, "Table analysis finished")
+        self._text_merge()
+        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
+        self._concat_downward(concat_between_pages=False)
+        self._filter_forpages()
+        self._merge_with_same_bullet()
+        callback(0.75, "Text merging finished.")
+        tbls = self._extract_table_figure(True, zoomin, False)
+
+        callback(0.8, "Text extraction finished")
+
+        # the caller unpacks two values, so the tables must be returned too
+        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes], tbls
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    pdf_parser = None
+    sections, tbls = [], []
+    if re.search(r"\.docx?$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        doc_parser = HuDocxParser()
+        # TODO: table of contents need to be removed
+        sections, tbls = doc_parser(binary if binary else filename)
+        remove_contents_table(sections, eng=is_english(random.choices([t for t, _ in sections], k=200)))
+        callback(0.8, "Finish parsing.")
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        sections, tbls = pdf_parser(filename if not binary else binary,
+                                    from_page=from_page, to_page=to_page, callback=callback)
+    elif re.search(r"\.txt$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = ""
+        if binary:
+            txt = binary.decode("utf-8")
+        else:
+            with open(filename, "r") as f:
+                while True:
+                    l = f.readline()
+                    if not l: break
+                    txt += l
+        sections = txt.split("\n")
+        sections = [(l, "") for l in sections if l]
+        remove_contents_table(sections, eng=is_english(random.choices([t for t, _ in sections], k=200)))
+        callback(0.8, "Finish parsing.")
+    else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
+
+    bull = bullets_category([t for t, _ in random.choices(sections, k=100)])
+    projs = [len(BULLET_PATTERN[bull]) + 1] * len(sections)
+    # independent lists, not aliases of one shared list
+    levels = [[] for _ in range(len(BULLET_PATTERN[bull]) + 2)]
+    for i, (txt, layout) in enumerate(sections):
+        for j, p in enumerate(BULLET_PATTERN[bull]):
+            if re.match(p, txt.strip()):
+                projs[i] = j
+                levels[j].append(i)
+                break
+        else:
+            if re.search(r"(title|head)", layout):
+                projs[i] = len(BULLET_PATTERN[bull])
+                levels[len(BULLET_PATTERN[bull])].append(i)
+            else:
+                levels[len(BULLET_PATTERN[bull]) + 1].append(i)
+    sections = [t for t, _ in sections]
+
+    def binary_search(arr, target):
+        if not arr: return -1
+        if target > arr[-1]: return len(arr) - 1
+        if target < arr[0]: return -1
+        s, e = 0, len(arr)
+        while e - s > 1:
+            i = (e + s) // 2
+            if target > arr[i]:
+                s = i
+                continue
+            elif target < arr[i]:
+                e = i
+                continue
+            else:
+                assert False
+        return s
+
+    cks = []
+    readed = [False] * len(sections)
+    levels = levels[::-1]
+    for i, arr in enumerate(levels):
+        for j in arr:
+            if readed[j]: continue
+            readed[j] = True
+            cks.append([j])
+            if i + 1 == len(levels) - 1: continue
+            for ii in range(i + 1, len(levels)):
+                jj = binary_search(levels[ii], j)
+                if jj < 0: break
+                if jj > cks[-1][-1]: cks[-1].pop(-1)
+                cks[-1].append(levels[ii][jj])
+
+    # is it English
+    eng = is_english(random.choices(sections, k=218))
+
+    res = []
+    # add tables
+    for img, rows in tbls:
+        bs = 10
+        de = ";" if eng else "；"
+        for i in range(0, len(rows), bs):
+            d = copy.deepcopy(doc)
+            r = de.join(rows[i:i + bs])
+            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
+            tokenize(d, r, eng)
+            d["image"] = img
+            res.append(d)
+
+    # wrap up to es documents
+    for ck in cks:
+        # cks holds section indices collected bottom-up; restore reading order and text
+        ck = [sections[i] for i in ck[::-1]]
+        print("\n-".join(ck))
+        d = copy.deepcopy(doc)
+        ck = "\n".join(ck)
+        if pdf_parser:
+            d["image"] = pdf_parser.crop(ck)
+            ck = pdf_parser.remove_tag(ck)
+        tokenize(d, ck, eng)
+        res.append(d)
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog, msg=""):
+        # chunk() calls the callback unconditionally, so a no-op is needed here
+        pass
+    chunk(sys.argv[1], callback=dummy)
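
A note on the binary_search helper above: on the sorted index lists in levels it returns the position of the nearest element strictly before target, -1 when nothing precedes it, and the last position when everything does. A standalone check (same logic, with the two inner branches collapsed, since the level lists never contain the target itself):

def binary_search(arr, target):
    if not arr: return -1
    if target > arr[-1]: return len(arr) - 1
    if target < arr[0]: return -1
    s, e = 0, len(arr)
    while e - s > 1:
        i = (e + s) // 2
        if target > arr[i]: s = i
        else: e = i
    return s

assert binary_search([2, 5, 9], 7) == 1    # 5 is the nearest heading before section 7
assert binary_search([2, 5, 9], 1) == -1   # no heading precedes section 1
assert binary_search([2, 5, 9], 42) == 2   # every heading precedes section 42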

View File

@@ -3,7 +3,7 @@ import re
 from io import BytesIO
 from docx import Document
 import numpy as np
-from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
+from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize
 from rag.nlp import huqie
 from rag.parser.docx_parser import HuDocxParser
 from rag.parser.pdf_parser import HuParser
@@ -32,12 +32,12 @@ class Pdf(HuParser):
             zoomin,
             from_page,
             to_page)
-        callback__(0.1, "OCR finished", callback)
+        callback(0.1, "OCR finished")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_paddle(zoomin)
-        callback__(0.77, "Layout analysis finished", callback)
+        callback(0.77, "Layout analysis finished")
         print("paddle layouts:", timer() - start)

         bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
         # is it English
@@ -75,7 +75,7 @@ class Pdf(HuParser):
                 b["x1"] = max(b["x1"], b_["x1"])
                 bxs.pop(i + 1)

-        callback__(0.8, "Text extraction finished", callback)
+        callback(0.8, "Text extraction finished")
         return [b["text"] + self._line_tag(b, zoomin) for b in bxs]
@@ -89,17 +89,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     pdf_parser = None
     sections = []
     if re.search(r"\.docx?$", filename, re.IGNORECASE):
-        callback__(0.1, "Start to parse.", callback)
+        callback(0.1, "Start to parse.")
         for txt in Docx()(filename, binary):
             sections.append(txt)
-        callback__(0.8, "Finish parsing.", callback)
+        callback(0.8, "Finish parsing.")
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()
         for txt in pdf_parser(filename if not binary else binary,
                               from_page=from_page, to_page=to_page, callback=callback):
             sections.append(txt)
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
-        callback__(0.1, "Start to parse.", callback)
+        callback(0.1, "Start to parse.")
         txt = ""
         if binary: txt = binary.decode("utf-8")
         else:
@@ -110,7 +110,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
                     txt += l
         sections = txt.split("\n")
         sections = [l for l in sections if l]
-        callback__(0.8, "Finish parsing.", callback)
+        callback(0.8, "Finish parsing.")
     else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
# is it English # is it English
@@ -118,7 +118,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     # Remove 'Contents' part
     i = 0
     while i < len(sections):
-        if not re.match(r"(Contents|目录|目次)$", re.sub(r"( | |\u3000)+", "", sections[i].split("@@")[0])):
+        if not re.match(r"(contents|目录|目次|table of contents)$", re.sub(r"( | |\u3000)+", "", sections[i].split("@@")[0]), re.IGNORECASE):
             i += 1
             continue
         sections.pop(i)
@@ -133,7 +133,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
         for j in range(i, min(i + 128, len(sections))):
             if not re.match(prefix, sections[j]):
                 continue
-            for k in range(i, j): sections.pop(i)
+            for _ in range(i, j): sections.pop(i)
             break

     bull = bullets_category(sections)

View File

@@ -1,6 +1,6 @@
 import copy
 import re
-from rag.app import callback__, tokenize
+from rag.app import tokenize
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
 from rag.utils import num_tokens_from_string
@@ -14,19 +14,19 @@ class Pdf(HuParser):
             zoomin,
             from_page,
             to_page)
-        callback__(0.2, "OCR finished.", callback)
+        callback(0.2, "OCR finished.")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_paddle(zoomin)
-        callback__(0.5, "Layout analysis finished.", callback)
+        callback(0.5, "Layout analysis finished.")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
-        callback__(0.7, "Table analysis finished.", callback)
+        callback(0.7, "Table analysis finished.")
         self._text_merge()
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
-        callback__(0.77, "Text merging finished", callback)
+        callback(0.77, "Text merging finished")
         tbls = self._extract_table_figure(True, zoomin, False)

         # clean mess
@@ -34,20 +34,8 @@ class Pdf(HuParser):
             b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())

         # merge chunks with the same bullets
-        i = 0
-        while i + 1 < len(self.boxes):
-            b = self.boxes[i]
-            b_ = self.boxes[i + 1]
-            if b["text"].strip()[0] != b_["text"].strip()[0] \
-                    or b["page_number"] != b_["page_number"] \
-                    or b["top"] > b_["bottom"]:
-                i += 1
-                continue
-            b_["text"] = b["text"] + "\n" + b_["text"]
-            b_["x0"] = min(b["x0"], b_["x0"])
-            b_["x1"] = max(b["x1"], b_["x1"])
-            b_["top"] = b["top"]
-            self.boxes.pop(i)
+        self._merge_with_same_bullet()

         # merge title with decent chunk
         i = 0
         while i + 1 < len(self.boxes):
@@ -62,7 +50,7 @@ class Pdf(HuParser):
             b_["top"] = b["top"]
             self.boxes.pop(i)

-        callback__(0.8, "Parsing finished", callback)
+        callback(0.8, "Parsing finished")
         for b in self.boxes: print(b["text"], b.get("layoutno"))
         print(tbls)

View File

@@ -1,11 +1,9 @@
 import copy
 import re
 from collections import Counter
-from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
-from rag.nlp import huqie, stemmer
-from rag.parser.docx_parser import HuDocxParser
+from rag.app import tokenize
+from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
-from nltk.tokenize import word_tokenize
 import numpy as np
 from rag.utils import num_tokens_from_string
@@ -18,20 +16,20 @@ class Pdf(HuParser):
             zoomin,
             from_page,
             to_page)
-        callback__(0.2, "OCR finished.", callback)
+        callback(0.2, "OCR finished.")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_paddle(zoomin)
-        callback__(0.47, "Layout analysis finished", callback)
+        callback(0.47, "Layout analysis finished")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
-        callback__(0.68, "Table analysis finished", callback)
+        callback(0.68, "Table analysis finished")
         self._text_merge()
         column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
-        callback__(0.75, "Text merging finished.", callback)
+        callback(0.75, "Text merging finished.")
         tbls = self._extract_table_figure(True, zoomin, False)

         # clean mess
@@ -101,7 +99,7 @@ class Pdf(HuParser):
                 break
         if not abstr: i = 0

-        callback__(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)))
         for b in self.boxes: print(b["text"], b.get("layoutno"))
         print(tbls)

View File

@@ -3,7 +3,7 @@ import re
 from io import BytesIO
 from pptx import Presentation

-from rag.app import callback__, tokenize, is_english
+from rag.app import tokenize, is_english
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
@@ -43,7 +43,7 @@ class Ppt(object):
                 if txt: texts.append(txt)
             txts.append("\n".join(texts))

-        callback__(0.5, "Text extraction finished.", callback)
+        callback(0.5, "Text extraction finished.")
         import aspose.slides as slides
         import aspose.pydrawing as drawing
         imgs = []
@@ -53,7 +53,7 @@ class Ppt(object):
                 slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
                 imgs.append(buffered.getvalue())
         assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
-        callback__(0.9, "Image extraction finished", callback)
+        callback(0.9, "Image extraction finished")
         self.is_english = is_english(txts)
         return [(txts[i], imgs[i]) for i in range(len(txts))]
@@ -70,7 +70,7 @@ class Pdf(HuParser):
     def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
         self.__images__(filename if not binary else binary, zoomin, from_page, to_page)
-        callback__(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
         assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
         res = []
         #################### More precisely ###################
@@ -89,7 +89,7 @@ class Pdf(HuParser):
         for i in range(len(self.boxes)):
             lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])])
             res.append((lines, self.page_images[i]))
-        callback__(0.9, "Page {}~{}: Parsing finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback(0.9, "Page {}~{}: Parsing finished".format(from_page, min(to_page, self.total_page)))
         return res

rag/app/qa.py (new file, 104 lines)
View File

@@ -0,0 +1,104 @@
+import random
+import re
+from io import BytesIO
+from nltk import word_tokenize
+from openpyxl import load_workbook
+from rag.app import is_english
+from rag.nlp import huqie, stemmer
+
+
+class Excel(object):
+    def __call__(self, fnm, binary=None, callback=None):
+        if not binary:
+            wb = load_workbook(fnm)
+        else:
+            wb = load_workbook(BytesIO(binary))
+        total = 0
+        for sheetname in wb.sheetnames:
+            total += len(list(wb[sheetname].rows))
+
+        res, fails = [], []
+        for sheetname in wb.sheetnames:
+            ws = wb[sheetname]
+            rows = list(ws.rows)
+            for i, r in enumerate(rows):
+                q, a = "", ""
+                for cell in r:
+                    if not cell.value: continue
+                    if not q: q = str(cell.value)
+                    elif not a: a = str(cell.value)
+                    else: break
+                if q and a: res.append((q, a))
+                else: fails.append(str(i + 1))
+                if len(res) % 999 == 0:
+                    callback(len(res) * 0.6 / total, ("Extract Q&A: {}".format(len(res)) + (
+                        f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+
+        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
+            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        self.is_english = is_english([rmPrefix(q) for q, _ in random.choices(res, k=30) if len(q) > 1])
+        return res
+
+
+def rmPrefix(txt):
+    return re.sub(r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t: ]+", "", txt.strip(), flags=re.IGNORECASE)
+
+
+def beAdoc(d, q, a, eng):
+    qprefix = "Question: " if eng else "问题:"
+    aprefix = "Answer: " if eng else "回答:"
+    d["content_with_weight"] = "\t".join([qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
+    if eng:
+        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(q)])
+    else:
+        d["content_ltks"] = huqie.qie(q)
+    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    return d
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    res = []
+    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        excel_parser = Excel()
+        for q, a in excel_parser(filename, binary, callback):
+            res.append(beAdoc({}, q, a, excel_parser.is_english))
+        return res
+    elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = ""
+        if binary:
+            txt = binary.decode("utf-8")
+        else:
+            with open(filename, "r") as f:
+                while True:
+                    l = f.readline()
+                    if not l: break
+                    txt += l
+        lines = txt.split("\n")
+        eng = is_english([rmPrefix(l) for l in lines[:100]])
+        fails = []
+        for i, line in enumerate(lines):
+            arr = [l for l in line.split("\t") if len(l) > 1]
+            if len(arr) != 2:
+                fails.append(str(i))
+                continue
+            res.append(beAdoc({}, arr[0], arr[1], eng))
+            if len(res) % 999 == 0:
+                callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
+                    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+
+        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
+            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        return res
+
+    raise NotImplementedError("file type not supported yet(xlsx, txt, csv supported)")
+
+
+if __name__ == "__main__":
+    import sys
+
+    def kk(rat, ss):
+        pass
+    print(chunk(sys.argv[1], callback=kk))
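
Assuming the module is importable as rag.app.qa, a quick end-to-end run of the new text path on an in-memory TSV (the lambda just swallows progress updates):

from rag.app import qa  # the module added above

tsv = ("Q: 什么是RAG?\tA: 检索增强生成。\n"
       "Q: What is RAG?\tA: Retrieval-augmented generation.\n")
docs = qa.chunk("faq.txt", binary=tsv.encode("utf-8"),
                callback=lambda prog, msg="": None)
for d in docs:
    print(d["content_with_weight"])   # one "问题:...\t回答:..." pair per line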

View File

@@ -763,7 +763,7 @@ class HuParser:
             return
         i = 0
         while i < len(self.boxes):
-            if not re.match(r"(contents|目录|目次|table of contents)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
+            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                 i += 1
                 continue
             eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
@@ -782,6 +782,22 @@ class HuParser:
                 for k in range(i, j): self.boxes.pop(i)
                 break

+    def _merge_with_same_bullet(self):
+        i = 0
+        while i + 1 < len(self.boxes):
+            b = self.boxes[i]
+            b_ = self.boxes[i + 1]
+            if b["text"].strip()[0] != b_["text"].strip()[0] \
+                    or b["text"].strip()[0].lower() in set("qwertyuiopasdfghjklzxcvbnm") \
+                    or b["top"] > b_["bottom"]:
+                i += 1
+                continue
+            b_["text"] = b["text"] + "\n" + b_["text"]
+            b_["x0"] = min(b["x0"], b_["x0"])
+            b_["x1"] = max(b["x1"], b_["x1"])
+            b_["top"] = b["top"]
+            self.boxes.pop(i)
+
     def _blockType(self, b):
         patt = [
             ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),

View File

@@ -17,7 +17,7 @@ import logging
 import os
 import time
 import random
-from timeit import default_timer as timer
+from datetime import datetime
 from api.db.db_models import Task
 from api.db.db_utils import bulk_insert_into_db
 from api.db.services.task_service import TaskService
@@ -26,7 +26,7 @@ from rag.settings import cron_logger
 from rag.utils import MINIO
 from rag.utils import findMaxTm
 import pandas as pd
-from api.db import FileType
+from api.db import FileType, TaskStatus
 from api.db.services.document_service import DocumentService
 from api.settings import database_logger
 from api.utils import get_format_time, get_uuid
@@ -105,15 +105,23 @@ def update_progress():
             prg = 0
             finished = True
             bad = 0
+            status = TaskStatus.RUNNING.value
             for t in tsks:
                 if 0 <= t.progress < 1: finished = False
                 prg += t.progress if t.progress >= 0 else 0
                 msg.append(t.progress_msg)
                 if t.progress == -1: bad += 1
             prg /= len(tsks)
-            if finished and bad: prg = -1
+            if finished and bad:
+                prg = -1
+                status = TaskStatus.FAIL.value
+            elif finished: status = TaskStatus.DONE.value
+
             msg = "\n".join(msg)
-            DocumentService.update_by_id(d["id"], {"progress": prg, "progress_msg": msg, "process_duation": timer() - d["process_begin_at"].timestamp()})
+            info = {"process_duation": datetime.timestamp(datetime.now()) - d["process_begin_at"].timestamp(), "run": status}
+            if prg != 0: info["progress"] = prg
+            if msg: info["progress_msg"] = msg
+            DocumentService.update_by_id(d["id"], info)
         except Exception as e:
             cron_logger.error("fetch task exception:" + str(e))
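
The aggregation rule above, reduced to plain numbers: a document's progress is the mean of its tasks' progress, and the run flag flips to FAIL only once every task has finished and at least one failed (task progress -1 means failed, 1 means done):

def aggregate(task_progress):
    finished = all(not (0 <= p < 1) for p in task_progress)
    bad = sum(1 for p in task_progress if p == -1)
    prg = sum(p for p in task_progress if p >= 0) / len(task_progress)
    if finished and bad:
        return -1, "4"        # TaskStatus.FAIL
    if finished:
        return prg, "3"       # TaskStatus.DONE
    return prg, "1"           # TaskStatus.RUNNING

assert aggregate([1, 1]) == (1.0, "3")
assert aggregate([1, -1]) == (-1, "4")
assert aggregate([0.5, 1])[1] == "1"      # still running, even with one task done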

View File

@@ -24,8 +24,9 @@ import sys
 from functools import partial
 from timeit import default_timer as timer

+from elasticsearch_dsl import Q
 from api.db.services.task_service import TaskService
-from rag.llm import EmbeddingModel, CvModel
 from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
 from rag.utils import ELASTICSEARCH
 from rag.utils import MINIO
@@ -35,7 +36,7 @@ from rag.nlp import search
 from io import BytesIO
 import pandas as pd

-from rag.app import laws, paper, presentation, manual
+from rag.app import laws, paper, presentation, manual, qa

 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
@@ -51,13 +52,14 @@ FACTORY = {
     ParserType.PRESENTATION.value: presentation,
     ParserType.MANUAL.value: manual,
     ParserType.LAWS.value: laws,
+    ParserType.QA.value: qa,
 }


 def set_progress(task_id, from_page, to_page, prog=None, msg="Processing..."):
     cancel = TaskService.do_cancel(task_id)
     if cancel:
-        msg = "Canceled."
+        msg += " [Canceled]"
         prog = -1

     if to_page > 0: msg = f"Page({from_page}~{to_page}): " + msg
@@ -166,13 +168,16 @@ def init_kb(row):
 def embedding(docs, mdl):
-    tts, cnts = [d["docnm_kwd"] for d in docs], [d["content_with_weight"] for d in docs]
+    tts, cnts = [d["docnm_kwd"] for d in docs if d.get("docnm_kwd")], [d["content_with_weight"] for d in docs]
     tk_count = 0
-    tts, c = mdl.encode(tts)
-    tk_count += c
+    if len(tts) == len(cnts):
+        tts, c = mdl.encode(tts)
+        tk_count += c
     cnts, c = mdl.encode(cnts)
     tk_count += c
-    vects = 0.1 * tts + 0.9 * cnts
+    vects = (0.1 * tts + 0.9 * cnts) if len(tts) == len(cnts) else cnts
     assert len(vects) == len(docs)
     for i, d in enumerate(docs):
         v = vects[i].tolist()
@@ -215,12 +220,14 @@ def main(comm, mod):
         callback(msg="Finished embedding! Start to build index!")
         init_kb(r)
         chunk_count = len(set([c["_id"] for c in cks]))
-        callback(1., "Done!")
         es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
         if es_r:
             callback(-1, "Index failure!")
             cron_logger.error(str(es_r))
         else:
+            if TaskService.do_cancel(r["id"]):
+                ELASTICSEARCH.deleteByQuery(Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
+            callback(1., "Done!")
             DocumentService.increment_chunk_num(r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
             cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks)))
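
The embedding change above guards the title/content blend: Q&A chunks carry no docnm_kwd, so tts can be shorter than cnts, in which case the executor now falls back to content-only vectors. Sketched with random arrays standing in for mdl.encode output:

import numpy as np

cnts = np.random.rand(4, 8)   # one content vector per chunk
tts = np.random.rand(0, 8)    # e.g. Q&A chunks: no docnm_kwd, so no title vectors

# New rule: 10% title + 90% content when every chunk has a title, else content only.
vects = (0.1 * tts + 0.9 * cnts) if len(tts) == len(cnts) else cnts
assert vects.shape == cnts.shape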