refine manual parser (#140)

KevinHuSh 2024-03-21 18:17:32 +08:00 committed by GitHub
parent f4ec7cfa76
commit 6c6b144de2
11 changed files with 77 additions and 47 deletions


@@ -118,14 +118,13 @@ def message_fit_in(msg, max_length=4000):
     c = count()
     if c < max_length: return c, msg
-    msg = [m for m in msg if m.role in ["system", "user"]]
-    c = count()
-    if c < max_length: return c, msg
     msg_ = [m for m in msg[:-1] if m.role == "system"]
     msg_.append(msg[-1])
     msg = msg_
     c = count()
     if c < max_length: return c, msg
     ll = num_tokens_from_string(msg_[0].content)
     l = num_tokens_from_string(msg_[-1].content)
     if ll / (ll + l) > 0.8:
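Note on the logic above: message_fit_in() squeezes a chat history into a token budget. It returns the history unchanged if it already fits, otherwise keeps only the system prompts plus the latest message, and finally truncates whichever of the two dominates (the ll / (ll + l) > 0.8 test). A minimal sketch of that idea follows; the dict-shaped messages, the whitespace split standing in for num_tokens_from_string(), and the truncation branches (the hunk is cut off right after the ratio test) are assumptions, not the repository's code.

# Minimal sketch of the budget-fitting idea, not the repository's implementation.
# Assumptions: messages are dicts with "role"/"content"; a whitespace split
# stands in for num_tokens_from_string().
def fit_in(messages, max_tokens=4000):
    def total(msgs):
        return sum(len(m["content"].split()) for m in msgs)

    if total(messages) < max_tokens:
        return messages

    # Keep only system prompts plus the latest turn.
    kept = [m for m in messages[:-1] if m["role"] == "system"]
    kept.append(messages[-1])
    if total(kept) < max_tokens:
        return kept

    # Truncate whichever side dominates the budget (assumed behaviour).
    sys_len = len(kept[0]["content"].split())
    last_len = len(kept[-1]["content"].split())
    if sys_len / (sys_len + last_len) > 0.8:
        kept[0]["content"] = " ".join(kept[0]["content"].split()[:max_tokens - last_len])
    else:
        kept[-1]["content"] = " ".join(kept[-1]["content"].split()[:max_tokens - sys_len])
    return kept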


@@ -218,7 +218,7 @@ def rm():
         ELASTICSEARCH.deleteByQuery(Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
         DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
-        if not DocumentService.delete_by_id(req["doc_id"]):
+        if not DocumentService.delete(doc):
             return get_data_error_result(
                 retmsg="Database error (Document removal)!")


@@ -353,7 +353,7 @@ class User(DataBaseModel, UserMixin):
     email = CharField(max_length=255, null=False, help_text="email", index=True)
     avatar = TextField(null=True, help_text="avatar base64 string")
     language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese")
-    color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark")
+    color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Bright")
     timezone = CharField(max_length=64, null=True, help_text="Timezone", default="UTC+8\tAsia/Shanghai")
     last_login_time = DateTimeField(null=True)
     is_authenticated = CharField(max_length=1, null=False, default="1")


@@ -223,7 +223,7 @@ def init_llm_factory():
         "fid": factory_infos[3]["name"],
         "llm_name": "qwen-14B-chat",
         "tags": "LLM,CHAT,",
-        "max_tokens": 8191,
+        "max_tokens": 4096,
         "model_type": LLMType.CHAT.value
     }, {
         "fid": factory_infos[3]["name"],

@@ -271,11 +271,15 @@ def init_llm_factory():
             pass
     """
+    modify service_config
     drop table llm;
-    drop table factories;
+    drop table llm_factories;
     update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问';
     update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI';
     update tenant set parser_ids='naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture';
+    alter table knowledgebase modify avatar longtext;
+    alter table user modify avatar longtext;
+    alter table dialog modify icon longtext;
     """


@@ -60,6 +60,15 @@ class DocumentService(CommonService):
             raise RuntimeError("Database error (Knowledgebase)!")
         return doc

+    @classmethod
+    @DB.connection_context()
+    def delete(cls, doc):
+        e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
+        if not KnowledgebaseService.update_by_id(
+                kb.id, {"doc_num": kb.doc_num - 1}):
+            raise RuntimeError("Database error (Knowledgebase)!")
+        return cls.delete_by_id(doc.id)
+
     @classmethod
     @DB.connection_context()
     def get_newly_uploaded(cls, tm, mod=0, comm=1, items_per_page=64):
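The delete() method above keeps the knowledgebase's doc_num in step with the rows it owns before removing the document itself, which is why rm() now calls DocumentService.delete(doc) rather than delete_by_id(). A toy, self-contained sketch of that pattern; the dict-backed tables are stand-ins for the peewee models, not the repository's API:

# Toy sketch of the counter-then-delete pattern; dicts stand in for the
# knowledgebase and document tables.
knowledgebases = {"kb1": {"doc_num": 3}}
documents = {"doc1": {"kb_id": "kb1"}}

def delete_document(doc_id):
    kb_id = documents[doc_id]["kb_id"]
    knowledgebases[kb_id]["doc_num"] -= 1   # mirrors KnowledgebaseService.update_by_id(...)
    del documents[doc_id]                   # mirrors cls.delete_by_id(doc.id)
    return True

assert delete_document("doc1")
assert knowledgebases["kb1"]["doc_num"] == 2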


@@ -11,7 +11,7 @@ import logging
 from PIL import Image, ImageDraw
 import numpy as np
-from api.db import ParserType
+from PyPDF2 import PdfReader as pdf2_read
 from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
 from rag.nlp import huqie
 from copy import deepcopy

@@ -288,9 +288,9 @@ class HuParser:
                           for b in bxs])
             self.boxes.append(bxs)

-    def _layouts_rec(self, ZM):
+    def _layouts_rec(self, ZM, drop=True):
         assert len(self.page_images) == len(self.boxes)
-        self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM)
+        self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM, drop=drop)
         # cumlative Y
         for i in range(len(self.boxes)):
             self.boxes[i]["top"] += \

@@ -908,6 +908,23 @@ class HuParser:
                 self.page_images.append(img)
                 self.page_chars.append([])

+        self.outlines = []
+        try:
+            self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
+            outlines = self.pdf.outline
+            def dfs(arr, depth):
+                for a in arr:
+                    if isinstance(a, dict):
+                        self.outlines.append((a["/Title"], depth))
+                        continue
+                    dfs(a, depth+1)
+            dfs(outlines, 0)
+        except Exception as e:
+            logging.warning(f"Outlines exception: {e}")
+        if not self.outlines:
+            logging.warning(f"Miss outlines")
+
         logging.info("Images converted.")
         self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
             random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
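The added block above is where the parser collects the PDF's bookmark tree: PyPDF2's reader.outline is a nested structure in which dict entries are bookmarks and list entries hold the children of the preceding bookmark, and the DFS flattens it into (title, depth) pairs stored on self.outlines. A standalone sketch of the same traversal; read_outlines() is a hypothetical helper, not part of the repository:

# Standalone sketch of the outline flattening above.
from io import BytesIO
from PyPDF2 import PdfReader

def read_outlines(path_or_bytes):
    reader = PdfReader(path_or_bytes if isinstance(path_or_bytes, str) else BytesIO(path_or_bytes))
    flat = []

    def dfs(items, depth):
        for item in items:
            if isinstance(item, dict):      # a bookmark entry
                flat.append((item["/Title"], depth))
                continue
            dfs(item, depth + 1)            # a list of children, one level deeper

    dfs(reader.outline, 0)
    return flat

# For a typical manual this yields something like:
# [("1 Introduction", 0), ("1.1 Scope", 1), ("2 Installation", 0), ...]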


@@ -39,7 +39,7 @@ class LayoutRecognizer(Recognizer):
         super().__init__(self.labels, domain, os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
         self.garbage_layouts = ["footer", "header", "reference"]

-    def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16):
+    def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
         def __is_garbage(b):
             patt = [r"^•+$", r"(版权归©|免责条款|地址[:])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
                     r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",

@@ -88,7 +88,11 @@ class LayoutRecognizer(Recognizer):
                     i += 1
                     continue
                 lts_[ii]["visited"] = True
-                if lts_[ii]["type"] in self.garbage_layouts:
+                keep_feats = [
+                    lts_[ii]["type"] == "footer" and bxs[i]["bottom"] < image_list[pn].size[1]*0.9/scale_factor,
+                    lts_[ii]["type"] == "header" and bxs[i]["top"] > image_list[pn].size[1]*0.1/scale_factor,
+                ]
+                if drop and lts_[ii]["type"] in self.garbage_layouts and not any(keep_feats):
                     if lts_[ii]["type"] not in garbages:
                         garbages[lts_[ii]["type"]] = []
                     garbages[lts_[ii]["type"]].append(bxs[i]["text"])
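With the drop flag and keep_feats above, footer and header regions are no longer discarded unconditionally: a box labelled footer is kept when it sits above the bottom 10% of the page, a header is kept when it sits below the top 10%, and drop=False (as the one.py parser further down passes via _layouts_rec) disables the filtering entirely. The same decision written as a self-contained function; page_height is assumed to already be divided by scale_factor so it is in the same units as the box coordinates:

# Sketch of the keep/drop decision above as a pure function. page_height is
# assumed to be image height / scale_factor, i.e. in box-coordinate units.
def should_drop(layout_type, box, page_height, drop=True,
                garbage_layouts=("footer", "header", "reference")):
    if not drop or layout_type not in garbage_layouts:
        return False
    keep_feats = [
        # a "footer" that is not actually in the bottom 10% of the page
        layout_type == "footer" and box["bottom"] < page_height * 0.9,
        # a "header" that is not actually in the top 10% of the page
        layout_type == "header" and box["top"] > page_height * 0.1,
    ]
    return not any(keep_feats)

# should_drop("header", {"top": 30, "bottom": 45}, page_height=800)  -> True
# should_drop("header", {"top": 300, "bottom": 315}, page_height=800) -> False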


@@ -51,15 +51,30 @@ class Pdf(PdfParser):
         # set pivot using the most frequent type of title,
         # then merge between 2 pivot
-        bull = bullets_category([b["text"] for b in self.boxes])
-        most_level, levels = title_frequency(bull, [(b["text"], b.get("layout_no","")) for b in self.boxes])
+        if len(self.boxes)>0 and len(self.outlines)/len(self.boxes) > 0.1:
+            max_lvl = max([lvl for _, lvl in self.outlines])
+            most_level = max(0, max_lvl-1)
+            levels = []
+            for b in self.boxes:
+                for t,lvl in self.outlines:
+                    tks = set([t[i]+t[i+1] for i in range(len(t)-1)])
+                    tks_ = set([b["text"][i]+b["text"][i+1] for i in range(min(len(t), len(b["text"])-1))])
+                    if len(set(tks & tks_))/max([len(tks), len(tks_), 1]) > 0.8:
+                        levels.append(lvl)
+                        break
+                else:
+                    levels.append(max_lvl + 1)
+        else:
+            bull = bullets_category([b["text"] for b in self.boxes])
+            most_level, levels = title_frequency(bull, [(b["text"], b.get("layout_no","")) for b in self.boxes])

         assert len(self.boxes) == len(levels)
         sec_ids = []
         sid = 0
         for i, lvl in enumerate(levels):
             if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1
             sec_ids.append(sid)
-            #print(lvl, self.boxes[i]["text"], most_level)
+            #print(lvl, self.boxes[i]["text"], most_level, sid)

         sections = [(b["text"], sec_ids[i], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
         for (img, rows), poss in tbls:
@@ -67,13 +82,16 @@ class Pdf(PdfParser):
         chunks = []
         last_sid = -2
+        tk_cnt = 0
         for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])):
             poss = "\t".join([tag(*pos) for pos in poss])
-            if sec_id == last_sid or sec_id == -1:
+            if tk_cnt < 2048 and (sec_id == last_sid or sec_id == -1):
                 if chunks:
                     chunks[-1] += "\n" + txt + poss
+                    tk_cnt += num_tokens_from_string(txt)
                     continue
             chunks.append(txt + poss)
+            tk_cnt = num_tokens_from_string(txt)
             if sec_id >-1: last_sid = sec_id
         return chunks, tbls
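Two mechanisms are at work in the two hunks above. First, when the PDF carries enough bookmarks (outlines covering at least roughly 10% of the boxes), each text box is assigned the level of the first outline title whose character-bigram overlap with the box text exceeds 0.8, falling back to max_lvl + 1, and to bullets_category()/title_frequency() when there are too few outlines. Second, consecutive sections are merged into one chunk only while the running token count stays under 2048. A self-contained sketch of the bigram overlap, mirroring the tks/tks_ sets above:

# Sketch of the character-bigram overlap used to match a box against an
# outline title (threshold 0.8 above).
def bigram_overlap(title, text):
    tks = {title[i] + title[i + 1] for i in range(len(title) - 1)}
    tks_ = {text[i] + text[i + 1] for i in range(min(len(title), len(text) - 1))}
    return len(tks & tks_) / max(len(tks), len(tks_), 1)

# A heading box that repeats an outline title, plus dot leaders and a page
# number, still scores well above the 0.8 threshold:
# bigram_overlap("2.3 Safety Precautions", "2.3 Safety Precautions ....... 17")  ≈ 0.95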
@@ -97,37 +115,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     # is it English
     eng = lang.lower() == "english"#pdf_parser.is_english

-    i = 0
-    chunk = []
-    tk_cnt = 0
     res = tokenize_table(tbls, doc, eng)

-    def add_chunk():
-        nonlocal chunk, res, doc, pdf_parser, tk_cnt
+    for ck in cks:
         d = copy.deepcopy(doc)
-        ck = "\n".join(chunk)
-        tokenize(d, pdf_parser.remove_tag(ck), eng)
         d["image"], poss = pdf_parser.crop(ck, need_position=True)
         add_positions(d, poss)
+        tokenize(d, pdf_parser.remove_tag(ck), eng)
         res.append(d)
-        chunk = []
-        tk_cnt = 0
-
-    while i < len(cks):
-        if tk_cnt > 256: add_chunk()
-        txt = cks[i]
-        txt_ = pdf_parser.remove_tag(txt)
-        i += 1
-        cnt = num_tokens_from_string(txt_)
-        chunk.append(txt)
-        tk_cnt += cnt
-    if chunk: add_chunk()
-
-    for i, d in enumerate(res):
-        print(d)
-        # d["image"].save(f"./logs/{i}.jpg")
     return res


 if __name__ == "__main__":
     import sys

     def dummy(prog=None, msg=""):


@@ -10,12 +10,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import copy
 import re

 from rag.app import laws
-from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions
+from rag.nlp import huqie, tokenize
 from deepdoc.parser import PdfParser, ExcelParser
-from rag.settings import cron_logger


 class Pdf(PdfParser):

@@ -33,7 +31,7 @@ class Pdf(PdfParser):
         from timeit import default_timer as timer
         start = timer()
-        self._layouts_rec(zoomin)
+        self._layouts_rec(zoomin, drop=False)
         callback(0.63, "Layout analysis finished.")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)


@@ -215,7 +215,7 @@ class Dealer:
         else:
             pieces = re.split(r"([^\|][;。?!\n]|[a-z][.?;!][ \n])", answer)
         for i in range(1, len(pieces)):
-            if re.match(r"[a-z][.?;!][ \n]", pieces[i]):
+            if re.match(r"([^\|][;。?!\n]|[a-z][.?;!][ \n])", pieces[i]):
                 pieces[i - 1] += pieces[i][0]
                 pieces[i] = pieces[i][1:]
         idx = []

@@ -243,7 +243,8 @@ class Dealer:
                 chunks_tks,
                 tkweight, vtweight)
             mx = np.max(sim) * 0.99
-            if mx < 0.65:
+            es_logger.info("{} SIM: {}".format(pieces_[i], mx))
+            if mx < 0.63:
                 continue
             cites[idx[i]] = list(
                 set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
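The citation step above works per answer piece: mx is set to 99% of its best similarity score against the chunks, the piece is skipped when that value is below 0.63 (after logging it via es_logger), and every chunk within 1% of the best is cited, capped at four. A small sketch of that selection; the sim values are illustrative, not produced by the repository's scorer:

import numpy as np

# Sketch of the citation pick above. `sim` is assumed to be one answer piece's
# similarity against every candidate chunk.
def pick_citations(sim, floor=0.63, top_k=4):
    sim = np.asarray(sim, dtype=float)
    mx = np.max(sim) * 0.99                 # within 1% of the best match
    if mx < floor:                          # best match too weak: cite nothing
        return []
    return [ii for ii in range(len(sim)) if sim[ii] > mx][:top_k]

# pick_citations([0.20, 0.71, 0.708, 0.40]) -> [1, 2]
# pick_citations([0.20, 0.31, 0.45])        -> []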


@@ -82,8 +82,8 @@ def dispatch():
         tsks = []
         if r["type"] == FileType.PDF.value:
             pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
-            page_size = 5
-            if r["parser_id"] == "paper": page_size = 12
+            page_size = 12
+            if r["parser_id"] == "paper": page_size = 22
             if r["parser_id"] == "one": page_size = 1000000000
             for s,e in r["parser_config"].get("pages", [(0,100000)]):
                 e = min(e, pages)
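For PDFs the dispatcher above fans a document out into page-range tasks: a 12-page window by default, 22 pages for the "paper" parser, and an effectively unbounded window for the "one" parser. The hunk is cut off right after e = min(e, pages), so the sketch below of how a range is likely windowed is an assumption, not the repository's loop body:

# Hedged sketch of splitting page ranges into page_size windows; the actual
# loop body after "e = min(e, pages)" is not shown in the hunk above.
def split_pages(ranges, total_pages, page_size=12):
    tasks = []
    for s, e in ranges:
        e = min(e, total_pages)
        for p in range(s, e, page_size):
            tasks.append({"from_page": p, "to_page": min(p + page_size, e)})
    return tasks

# split_pages([(0, 100000)], total_pages=30, page_size=12)
# -> [{'from_page': 0, 'to_page': 12}, {'from_page': 12, 'to_page': 24}, {'from_page': 24, 'to_page': 30}]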