Code refactor. (#4291)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
Kevin Hu 2024-12-30 18:38:51 +08:00 committed by GitHub
parent f619d5a9b6
commit 8fb18f37f6
10 changed files with 33 additions and 18 deletions

View File

@@ -16,6 +16,7 @@
 import random
 from abc import ABC
 from functools import partial
+from typing import Tuple, Union
 import pandas as pd
@@ -76,4 +77,13 @@ class Answer(ComponentBase, ABC):
     def set_exception(self, e):
         self.exception = e
+    def output(self, allow_partial=True) -> Tuple[str, Union[pd.DataFrame, partial]]:
+        if allow_partial:
+            return super.output()
+        for r, c in self._canvas.history[::-1]:
+            if r == "user":
+                return self._param.output_var_name, pd.DataFrame([{"content": c}])
+        self._param.output_var_name, pd.DataFrame([])
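The intent of the new `output()` fallback is easiest to see in isolation: when a partial answer is not allowed, the component walks the conversation history backwards and returns the most recent user turn as a one-row DataFrame. A minimal sketch, assuming a plain list of `(role, content)` tuples in place of the real canvas history:

```python
# Minimal, self-contained sketch of the non-partial fallback above.
# "history" and "output_var_name" are hypothetical stand-ins for the
# component's canvas state, not the real classes.
import pandas as pd

def latest_user_output(history, output_var_name="output"):
    # Scan from the newest turn backwards and wrap the last user message.
    for role, content in history[::-1]:
        if role == "user":
            return output_var_name, pd.DataFrame([{"content": content}])
    return output_var_name, pd.DataFrame([])

print(latest_user_output([("user", "hi"), ("assistant", "hello"), ("user", "bye")]))
```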

View File

@@ -146,12 +146,16 @@ def run():
                 canvas.messages.append({"role": "assistant", "content": final_ans["content"], "id": message_id})
                 canvas.history.append(("assistant", final_ans["content"]))
+                if not canvas.path[-1]:
+                    canvas.path.pop(-1)
                 if final_ans.get("reference"):
                     canvas.reference.append(final_ans["reference"])
                 cvs.dsl = json.loads(str(canvas))
                 UserCanvasService.update_by_id(req["id"], cvs.to_dict())
             except Exception as e:
                 cvs.dsl = json.loads(str(canvas))
+                if not canvas.path[-1]:
+                    canvas.path.pop(-1)
                 UserCanvasService.update_by_id(req["id"], cvs.to_dict())
                 traceback.print_exc()
                 yield "data:" + json.dumps({"code": 500, "message": str(e),

View File

@@ -103,10 +103,7 @@ def set_dialog():
             }
             if not DialogService.save(**dia):
                 return get_data_error_result(message="Fail to new a dialog!")
-            e, dia = DialogService.get_by_id(dia["id"])
-            if not e:
-                return get_data_error_result(message="Fail to new a dialog!")
-            return get_json_result(data=dia.to_json())
+            return get_json_result(data=dia)
         else:
             del req["dialog_id"]
             if "kb_names" in req:
@@ -117,6 +114,7 @@ def set_dialog():
             if not e:
                 return get_data_error_result(message="Fail to update a dialog!")
             dia = dia.to_dict()
+            dia.update(req)
             dia["kb_ids"], dia["kb_names"] = get_kb_names(dia["kb_ids"])
             return get_json_result(data=dia)
     except Exception as e:
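In the update branch, `dia.update(req)` overlays the incoming request fields onto the stored dialog record before it is returned, so the response reflects the values the caller just set. A small sketch with plain dicts standing in for the ORM objects:

```python
# Sketch of the response-merging step added above; plain dicts stand in
# for the real dialog record and request payload.
stored = {"dialog_id": "d1", "name": "old name", "kb_ids": ["kb1"]}
req = {"name": "new name"}

dia = dict(stored)   # dia = dia.to_dict() in the handler
dia.update(req)      # fields from the request win over stored values
print(dia)           # {'dialog_id': 'd1', 'name': 'new name', 'kb_ids': ['kb1']}
```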

View File

@@ -185,7 +185,8 @@ def rm():
             return get_data_error_result(
                 message="Database error (Document removal)!")
         f2d = File2DocumentService.get_by_document_id(doc.id)
-        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
+        if f2d:
+            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
         File2DocumentService.delete_by_document_id(doc.id)
         FileService.filter_delete(
             [File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])

View File

@@ -120,6 +120,10 @@ def server_error_response(e):
     if len(e.args) > 1:
         return get_json_result(
             code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
+    if repr(e).find("index_not_found_exception") >= 0:
+        return get_json_result(code=settings.RetCode.EXCEPTION_ERROR,
+                               message="No chunk found, please upload file and parse it.")
     return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e))
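The added branch maps a low-level `index_not_found_exception` from the search backend to an actionable message instead of echoing the raw `repr()`. A minimal sketch of that substring-based mapping; the returned dict is a simplified stand-in for `get_json_result`:

```python
# Simplified stand-in for the handler above: translate a known low-level
# error into a user-facing hint, otherwise fall back to repr(e).
def to_error_payload(e: Exception) -> dict:
    if "index_not_found_exception" in repr(e):
        return {"code": 500, "message": "No chunk found, please upload file and parse it."}
    return {"code": 500, "message": repr(e)}

print(to_error_payload(RuntimeError("index_not_found_exception: no such index")))
```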

View File

@@ -11,20 +11,20 @@ Given a text document that is potentially relevant to this activity and a list o
 -Steps-
 1. Identify all entities. For each identified entity, extract the following information:
-- entity_name: Name of the entity, capitalized
+- entity_name: Name of the entity, capitalized, in language of 'Text'
 - entity_type: One of the following types: [{entity_types}]
-- entity_description: Comprehensive description of the entity's attributes and activities
+- entity_description: Comprehensive description of the entity's attributes and activities in language of 'Text'
 Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>
 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
 For each pair of related entities, extract the following information:
 - source_entity: name of the source entity, as identified in step 1
 - target_entity: name of the target entity, as identified in step 1
-- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
+- relationship_description: explanation as to why you think the source entity and the target entity are related to each other in language of 'Text'
 - relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
 Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_strength>)
-3. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+3. Return output as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
 4. When finished, output {completion_delimiter}
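For anyone unfamiliar with the template placeholders, here is a purely illustrative sketch of what a conforming record list could look like once the delimiters are substituted; the delimiter strings and entity values below are hypothetical, not the ones the pipeline actually uses:

```python
# Hypothetical delimiter values chosen only for illustration.
tuple_delimiter = "<|>"
record_delimiter = "##"
completion_delimiter = "<END>"

records = [
    f'("entity"{tuple_delimiter}ACME CORP{tuple_delimiter}organization{tuple_delimiter}A fictional widget maker mentioned in the text)',
    f'("relationship"{tuple_delimiter}ACME CORP{tuple_delimiter}JANE DOE{tuple_delimiter}Jane Doe founded Acme Corp{tuple_delimiter}8)',
]
print(record_delimiter.join(records) + completion_delimiter)
```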

View File

@@ -81,7 +81,7 @@ def get_llm_cache(llmnm, txt, history, genconf):
     return bin
-def set_llm_cache(llmnm, txt, v: str, history, genconf):
+def set_llm_cache(llmnm, txt, v, history, genconf):
     hasher = xxhash.xxh64()
     hasher.update(str(llmnm).encode("utf-8"))
     hasher.update(str(txt).encode("utf-8"))
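`set_llm_cache` drops the overly narrow `v: str` annotation; as the context lines show, the cache key is derived by hashing the model name and prompt (and, judging by the signature, presumably the history and generation config) with xxhash. A rough, self-contained sketch of that kind of key derivation; `llm_cache_key` is an illustrative name, not the project's helper:

```python
# Rough sketch, not the project's exact implementation: build a deterministic
# cache key from the model name, prompt, history and generation config.
import xxhash

def llm_cache_key(llmnm, txt, history, genconf) -> str:
    hasher = xxhash.xxh64()
    for part in (llmnm, txt, history, genconf):
        hasher.update(str(part).encode("utf-8"))  # str() keeps non-string parts hashable
    return hasher.hexdigest()

print(llm_cache_key("gpt-4o", "Hello", [("user", "Hello")], {"temperature": 0.1}))
```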

View File

@@ -153,11 +153,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        for txt in Docx()(filename, binary):
-            sections.append(txt)
-        callback(0.8, "Finish parsing.")
-        chunks = sections
-        return tokenize_chunks(chunks, doc, eng, pdf_parser)
+        chunks = Docx()(filename, binary)
+        callback(0.7, "Finish parsing.")
+        return tokenize_chunks(chunks, doc, eng, None)
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf() if kwargs.get(

View File

@@ -193,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
         # set pivot using the most frequent type of title,
         # then merge between 2 pivot
-        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
+        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
             max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
             most_level = max(0, max_lvl - 1)
             levels = []
@@ -256,7 +256,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
         return res
-    if re.search(r"\.docx$", filename, re.IGNORECASE):
+    elif re.search(r"\.docx$", filename, re.IGNORECASE):
         docx_parser = Docx()
         ti_list, tbls = docx_parser(filename, binary,
                                     from_page=0, to_page=10000, callback=callback)
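The first hunk above lowers the threshold at which PDF outline entries are trusted as section pivots, from 10% of the detected sections to 3%. A self-contained sketch of that gating decision, with a plain list of outline entries standing in for `pdf_parser.outlines`:

```python
# Illustrative gate only: decide whether outline titles are plentiful enough
# (relative to detected sections) to be used as merge pivots.
def use_outline_as_pivot(outlines, sections, min_ratio=0.03):
    return len(sections) > 0 and len(outlines) / len(sections) > min_ratio

outlines = [("1 Introduction", 1), ("2 Methods", 1)]
sections = [("some text", 0)] * 40
print(use_outline_as_pivot(outlines, sections))  # True: 2/40 = 0.05 > 0.03
```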

View File

@@ -185,7 +185,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
              "datetime": "_dt",
              "bool": "_kwd"}
     for df in dfs:
-        for n in ["id", "index", "idx"]:
+        for n in ["id", "_id", "index", "idx"]:
             if n in df.columns:
                 del df[n]
         clmns = df.columns.values
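The only change here is that `_id` joins the index-like columns stripped from each DataFrame before the remaining columns are typed. A standalone illustration of the same loop:

```python
# Standalone illustration of stripping index-like columns, now including "_id".
import pandas as pd

df = pd.DataFrame({"_id": [1, 2], "name": ["a", "b"], "score": [0.9, 0.5]})
for n in ["id", "_id", "index", "idx"]:
    if n in df.columns:
        del df[n]
print(df.columns.tolist())  # ['name', 'score']
```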