diff --git a/api/ragflow_server.py b/api/ragflow_server.py index 69218d6f0..e28347383 100644 --- a/api/ragflow_server.py +++ b/api/ragflow_server.py @@ -47,6 +47,7 @@ from api.db.db_models import init_database_tables as init_web_db from api.db.init_data import init_web_data from api.versions import get_ragflow_version from api.utils import show_configs +from rag.settings import print_rag_settings def update_progress(): @@ -75,6 +76,7 @@ if __name__ == '__main__': ) show_configs() settings.init_settings() + print_rag_settings() # init db init_web_db() diff --git a/rag/app/book.py b/rag/app/book.py index efd78c18e..65de875a1 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -26,30 +26,33 @@ from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): - callback(msg="OCR is running...") + from timeit import default_timer as timer + start = timer() + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, from_page, to_page, callback) - callback(msg="OCR finished") + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) - from timeit import default_timer as timer start = timer() self._layouts_rec(zoomin) - callback(0.67, "Layout analysis finished") + callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start)) logging.debug("layouts: {}".format(timer() - start)) + + start = timer() self._table_transformer_job(zoomin) - callback(0.68, "Table analysis finished") + callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._text_merge() tbls = self._extract_table_figure(True, zoomin, True, True) self._naive_vertical_merge() self._filter_forpages() self._merge_with_same_bullet() - callback(0.75, "Text merging finished.") - - callback(0.8, "Text extraction finished") + callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start)) return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes], tbls diff --git a/rag/app/laws.py b/rag/app/laws.py index a021e339f..3ee41c7a7 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -108,7 +108,9 @@ class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): - callback(msg="OCR is running...") + from timeit import default_timer as timer + start = timer() + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, @@ -116,17 +118,16 @@ class Pdf(PdfParser): to_page, callback ) - callback(msg="OCR finished") + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) - from timeit import default_timer as timer start = timer() self._layouts_rec(zoomin) - callback(0.67, "Layout analysis finished") + callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start)) logging.debug("layouts:".format( )) self._naive_vertical_merge() - callback(0.8, "Text extraction finished") + callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start)) return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None diff --git a/rag/app/manual.py b/rag/app/manual.py index 4efb92986..1ea5e9633 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -36,7 +36,7 @@ class Pdf(PdfParser): to_page=100000, zoomin=3, callback=None): from timeit import default_timer as timer start = timer() - callback(msg="OCR is running...") + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, @@ -44,22 +44,27 @@ class Pdf(PdfParser): to_page, callback ) - callback(msg="OCR finished.") + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) # for bb in self.boxes: # for b in bb: # print(b) logging.debug("OCR: {}".format(timer() - start)) + start = timer() self._layouts_rec(zoomin) - callback(0.65, "Layout analysis finished.") + callback(0.65, "Layout analysis ({:.2f}s)".format(timer() - start)) logging.debug("layouts: {}".format(timer() - start)) + + start = timer() self._table_transformer_job(zoomin) - callback(0.67, "Table analysis finished.") + callback(0.67, "Table analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._text_merge() tbls = self._extract_table_figure(True, zoomin, True, True) self._concat_downward() self._filter_forpages() - callback(0.68, "Text merging finished") + callback(0.68, "Text merged ({:.2f}s)".format(timer() - start)) # clean mess for b in self.boxes: diff --git a/rag/app/naive.py b/rag/app/naive.py index 2628fd546..4127e88f3 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -124,7 +124,8 @@ class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): start = timer() - callback(msg="OCR is running...") + first_start = start + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, @@ -132,22 +133,26 @@ class Pdf(PdfParser): to_page, callback ) - callback(msg="OCR finished") - logging.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) + logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start)) start = timer() self._layouts_rec(zoomin) - callback(0.63, "Layout analysis finished.") + callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._table_transformer_job(zoomin) - callback(0.65, "Table analysis finished.") + callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._text_merge() - callback(0.67, "Text merging finished") + callback(0.67, "Text merged ({:.2f}s)".format(timer() - start)) tbls = self._extract_table_figure(True, zoomin, True, True) # self._naive_vertical_merge() self._concat_downward() # self._filter_forpages() - logging.info("layouts cost: {}s".format(timer() - start)) + logging.info("layouts cost: {}s".format(timer() - first_start)) return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls @@ -170,7 +175,7 @@ class Markdown(MarkdownParser): else: if sections and sections[-1][0].strip().find("#") == 0: sec_, _ = sections.pop(-1) - sections.append((sec_+"\n"+sec, "")) + sections.append((sec_ + "\n" + sec, "")) else: sections.append((sec, "")) diff --git a/rag/app/one.py b/rag/app/one.py index 76dc45893..c7fe0314c 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -24,7 +24,9 @@ from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): - callback(msg="OCR is running...") + from timeit import default_timer as timer + start = timer() + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, @@ -32,17 +34,20 @@ class Pdf(PdfParser): to_page, callback ) - callback(msg="OCR finished") + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) - from timeit import default_timer as timer start = timer() self._layouts_rec(zoomin, drop=False) - callback(0.63, "Layout analysis finished.") + callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) logging.debug("layouts cost: {}s".format(timer() - start)) + + start = timer() self._table_transformer_job(zoomin) - callback(0.65, "Table analysis finished.") + callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._text_merge() - callback(0.67, "Text merging finished") + callback(0.67, "Text merged ({:.2f}s)".format(timer() - start)) tbls = self._extract_table_figure(True, zoomin, True, True) self._concat_downward() diff --git a/rag/app/paper.py b/rag/app/paper.py index 23483cc04..89af09b90 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -27,7 +27,9 @@ class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): - callback(msg="OCR is running...") + from timeit import default_timer as timer + start = timer() + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, @@ -35,21 +37,24 @@ class Pdf(PdfParser): to_page, callback ) - callback(msg="OCR finished.") + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) - from timeit import default_timer as timer start = timer() self._layouts_rec(zoomin) - callback(0.63, "Layout analysis finished") + callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) logging.debug(f"layouts cost: {timer() - start}s") + + start = timer() self._table_transformer_job(zoomin) - callback(0.68, "Table analysis finished") + callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._text_merge() tbls = self._extract_table_figure(True, zoomin, True, True) column_width = np.median([b["x1"] - b["x0"] for b in self.boxes]) self._concat_downward() self._filter_forpages() - callback(0.75, "Text merging finished.") + callback(0.75, "Text merged ({:.2f}s)".format(timer() - start)) # clean mess if column_width < self.page_images[0].size[0] / zoomin / 2: diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 54d897616..bebf64bff 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -59,11 +59,12 @@ class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): - callback(msg="OCR is running...") + from timeit import default_timer as timer + start = timer() + callback(msg="OCR started") self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback) - callback(0.8, "Page {}~{}: OCR finished".format( - from_page, min(to_page, self.total_page))) + callback(msg="Page {}~{}: OCR finished ({:.2f}s)".format(from_page, min(to_page, self.total_page), timer() - start)) assert len(self.boxes) == len(self.page_images), "{} vs. {}".format( len(self.boxes), len(self.page_images)) res = [] diff --git a/rag/app/qa.py b/rag/app/qa.py index d9756eecd..0fd7a932b 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -73,7 +73,7 @@ class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): start = timer() - callback(msg="OCR is running...") + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, @@ -81,15 +81,19 @@ class Pdf(PdfParser): to_page, callback ) - callback(msg="OCR finished") - logging.debug("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) + logging.debug("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start)) start = timer() self._layouts_rec(zoomin, drop=False) - callback(0.63, "Layout analysis finished.") + callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._table_transformer_job(zoomin) - callback(0.65, "Table analysis finished.") + callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._text_merge() - callback(0.67, "Text merging finished") + callback(0.67, "Text merged ({:.2f}s)".format(timer() - start)) tbls = self._extract_table_figure(True, zoomin, True, True) #self._naive_vertical_merge() # self._concat_downward() @@ -226,7 +230,7 @@ class Docx(DocxParser): sum_question = '\n'.join(question_stack) if sum_question: qai_list.append((sum_question, last_answer, last_image)) - + tbls = [] for tb in self.doc.tables: html= "" diff --git a/rag/settings.py b/rag/settings.py index f31e00fdc..af6075d85 100644 --- a/rag/settings.py +++ b/rag/settings.py @@ -14,6 +14,7 @@ # limitations under the License. # import os +import logging from api.utils import get_base_config, decrypt_database_config from api.utils.file_utils import get_project_base_directory @@ -37,3 +38,9 @@ SVR_QUEUE_RETENTION = 60*60 SVR_QUEUE_MAX_LEN = 1024 SVR_CONSUMER_NAME = "rag_flow_svr_consumer" SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_consumer_group" + +def print_rag_settings(): + logging.info(f"MAX_CONTENT_LENGTH: {DOC_MAXIMUM_SIZE}") + logging.info(f"SERVER_QUEUE_MAX_LEN: {SVR_QUEUE_MAX_LEN}") + logging.info(f"SERVER_QUEUE_RETENTION: {SVR_QUEUE_RETENTION}") + logging.info(f"MAX_FILE_COUNT_PER_USER: {int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))}") \ No newline at end of file diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index cc69bdaa6..b68c58317 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -56,12 +56,13 @@ from api.db.services.llm_service import LLMBundle from api.db.services.task_service import TaskService from api.db.services.file2document_service import File2DocumentService from api import settings +from api.versions import get_ragflow_version from api.db.db_models import close_connection from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, \ knowledge_graph, email from rag.nlp import search, rag_tokenizer from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor -from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME +from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME, print_rag_settings from rag.utils import rmSpace, num_tokens_from_string from rag.utils.redis_conn import REDIS_CONN, Payload from rag.utils.storage_factory import STORAGE_IMPL @@ -395,8 +396,7 @@ def do_handle_task(r): # TODO: exception handler ## set_progress(r["did"], -1, "ERROR: ") callback( - msg="Finished slicing files ({} chunks in {:.2f}s). Start to embedding the content.".format(len(cks), - timer() - st) + msg="Generate {} chunks ({:.2f}s). Embedding chunks.".format(len(cks), timer() - st) ) st = timer() try: @@ -407,7 +407,7 @@ def do_handle_task(r): tk_count = 0 raise logging.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st)) - callback(msg="Finished embedding (in {:.2f}s)! Start to build index!".format(timer() - st)) + callback(msg="Finished embedding ({:.2f}s)!".format(timer() - st)) # logging.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}") init_kb(r, vector_size) chunk_count = len(set([c["id"] for c in cks])) @@ -420,7 +420,8 @@ def do_handle_task(r): callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="") logging.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st)) if es_r: - callback(-1, "Insert chunk error, detail info please check log file. Please also check Elasticsearch/Infinity status!") + callback(-1, + "Insert chunk error, detail info please check log file. Please also check Elasticsearch/Infinity status!") settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"]) logging.error('Insert chunk error: ' + str(es_r)) raise Exception('Insert chunk error: ' + str(es_r)) @@ -429,13 +430,12 @@ def do_handle_task(r): settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"]) return - callback(msg="Indexing elapsed in {:.2f}s.".format(timer() - st)) - callback(1., "Done!") + callback(1., msg="Index cost {:.2f}s.".format(timer() - st)) DocumentService.increment_chunk_num( r["doc_id"], r["kb_id"], tk_count, chunk_count, 0) logging.info( "Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format( - r["id"], tk_count, len(cks), timer() - st)) + r["id"], tk_count, len(cks), timer() - st)) def handle_task(): @@ -502,7 +502,7 @@ def analyze_heap(snapshot1: tracemalloc.Snapshot, snapshot2: tracemalloc.Snapsho for stat in stats2[:10]: msg += f"{stat}\n" stats1_vs_2 = snapshot2.compare_to(snapshot1, 'lineno') - msg += f"{CONSUMER_NAME} memory usage increase from snapshot {snapshot_id-1} to snapshot {snapshot_id}:\n" + msg += f"{CONSUMER_NAME} memory usage increase from snapshot {snapshot_id - 1} to snapshot {snapshot_id}:\n" for stat in stats1_vs_2[:10]: msg += f"{stat}\n" msg += f"{CONSUMER_NAME} detailed traceback for the top memory consumers:\n" @@ -512,7 +512,16 @@ def analyze_heap(snapshot1: tracemalloc.Snapshot, snapshot2: tracemalloc.Snapsho def main(): + logging.info(r""" + ______ __ ______ __ + /_ __/___ ______/ /__ / ____/ _____ _______ __/ /_____ _____ + / / / __ `/ ___/ //_/ / __/ | |/_/ _ \/ ___/ / / / __/ __ \/ ___/ + / / / /_/ (__ ) ,< / /____> 0 and num_tasks > 0 and num_tasks % TRACE_MALLOC_DELTA == 0: + if TRACE_MALLOC_DELTA > 0 and num_tasks > 0 and num_tasks % TRACE_MALLOC_DELTA == 0: snapshot2 = tracemalloc.take_snapshot() - analyze_heap(snapshot1, snapshot2, int(num_tasks/TRACE_MALLOC_DELTA), num_tasks % TRACE_MALLOC_FULL == 0) + analyze_heap(snapshot1, snapshot2, int(num_tasks / TRACE_MALLOC_DELTA), num_tasks % TRACE_MALLOC_FULL == 0) snapshot1 = snapshot2 snapshot2 = None + if __name__ == "__main__": main()