diff --git a/api/ragflow_server.py b/api/ragflow_server.py index 69218d6f0..e28347383 100644 --- a/api/ragflow_server.py +++ b/api/ragflow_server.py @@ -47,6 +47,7 @@ from api.db.db_models import init_database_tables as init_web_db from api.db.init_data import init_web_data from api.versions import get_ragflow_version from api.utils import show_configs +from rag.settings import print_rag_settings def update_progress(): @@ -75,6 +76,7 @@ if __name__ == '__main__': ) show_configs() settings.init_settings() + print_rag_settings() # init db init_web_db() diff --git a/rag/app/book.py b/rag/app/book.py index efd78c18e..65de875a1 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -26,30 +26,33 @@ from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): - callback(msg="OCR is running...") + from timeit import default_timer as timer + start = timer() + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, from_page, to_page, callback) - callback(msg="OCR finished") + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) - from timeit import default_timer as timer start = timer() self._layouts_rec(zoomin) - callback(0.67, "Layout analysis finished") + callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start)) logging.debug("layouts: {}".format(timer() - start)) + + start = timer() self._table_transformer_job(zoomin) - callback(0.68, "Table analysis finished") + callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._text_merge() tbls = self._extract_table_figure(True, zoomin, True, True) self._naive_vertical_merge() self._filter_forpages() self._merge_with_same_bullet() - callback(0.75, "Text merging finished.") - - callback(0.8, "Text extraction finished") + callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start)) return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes], tbls diff --git a/rag/app/laws.py b/rag/app/laws.py index a021e339f..3ee41c7a7 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -108,7 +108,9 @@ class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): - callback(msg="OCR is running...") + from timeit import default_timer as timer + start = timer() + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, @@ -116,17 +118,16 @@ class Pdf(PdfParser): to_page, callback ) - callback(msg="OCR finished") + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) - from timeit import default_timer as timer start = timer() self._layouts_rec(zoomin) - callback(0.67, "Layout analysis finished") + callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start)) logging.debug("layouts:".format( )) self._naive_vertical_merge() - callback(0.8, "Text extraction finished") + callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start)) return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None diff --git a/rag/app/manual.py b/rag/app/manual.py index 4efb92986..1ea5e9633 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -36,7 +36,7 @@ class Pdf(PdfParser): to_page=100000, zoomin=3, callback=None): from timeit import default_timer as timer start = timer() - callback(msg="OCR is running...") + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, @@ -44,22 +44,27 @@ class Pdf(PdfParser): to_page, callback ) - callback(msg="OCR finished.") + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) # for bb in self.boxes: # for b in bb: # print(b) logging.debug("OCR: {}".format(timer() - start)) + start = timer() self._layouts_rec(zoomin) - callback(0.65, "Layout analysis finished.") + callback(0.65, "Layout analysis ({:.2f}s)".format(timer() - start)) logging.debug("layouts: {}".format(timer() - start)) + + start = timer() self._table_transformer_job(zoomin) - callback(0.67, "Table analysis finished.") + callback(0.67, "Table analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._text_merge() tbls = self._extract_table_figure(True, zoomin, True, True) self._concat_downward() self._filter_forpages() - callback(0.68, "Text merging finished") + callback(0.68, "Text merged ({:.2f}s)".format(timer() - start)) # clean mess for b in self.boxes: diff --git a/rag/app/naive.py b/rag/app/naive.py index 2628fd546..4127e88f3 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -124,7 +124,8 @@ class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): start = timer() - callback(msg="OCR is running...") + first_start = start + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, @@ -132,22 +133,26 @@ class Pdf(PdfParser): to_page, callback ) - callback(msg="OCR finished") - logging.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) + logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start)) start = timer() self._layouts_rec(zoomin) - callback(0.63, "Layout analysis finished.") + callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._table_transformer_job(zoomin) - callback(0.65, "Table analysis finished.") + callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._text_merge() - callback(0.67, "Text merging finished") + callback(0.67, "Text merged ({:.2f}s)".format(timer() - start)) tbls = self._extract_table_figure(True, zoomin, True, True) # self._naive_vertical_merge() self._concat_downward() # self._filter_forpages() - logging.info("layouts cost: {}s".format(timer() - start)) + logging.info("layouts cost: {}s".format(timer() - first_start)) return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls @@ -170,7 +175,7 @@ class Markdown(MarkdownParser): else: if sections and sections[-1][0].strip().find("#") == 0: sec_, _ = sections.pop(-1) - sections.append((sec_+"\n"+sec, "")) + sections.append((sec_ + "\n" + sec, "")) else: sections.append((sec, "")) diff --git a/rag/app/one.py b/rag/app/one.py index 76dc45893..c7fe0314c 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -24,7 +24,9 @@ from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): - callback(msg="OCR is running...") + from timeit import default_timer as timer + start = timer() + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, @@ -32,17 +34,20 @@ class Pdf(PdfParser): to_page, callback ) - callback(msg="OCR finished") + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) - from timeit import default_timer as timer start = timer() self._layouts_rec(zoomin, drop=False) - callback(0.63, "Layout analysis finished.") + callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) logging.debug("layouts cost: {}s".format(timer() - start)) + + start = timer() self._table_transformer_job(zoomin) - callback(0.65, "Table analysis finished.") + callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._text_merge() - callback(0.67, "Text merging finished") + callback(0.67, "Text merged ({:.2f}s)".format(timer() - start)) tbls = self._extract_table_figure(True, zoomin, True, True) self._concat_downward() diff --git a/rag/app/paper.py b/rag/app/paper.py index 23483cc04..89af09b90 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -27,7 +27,9 @@ class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): - callback(msg="OCR is running...") + from timeit import default_timer as timer + start = timer() + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, @@ -35,21 +37,24 @@ class Pdf(PdfParser): to_page, callback ) - callback(msg="OCR finished.") + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) - from timeit import default_timer as timer start = timer() self._layouts_rec(zoomin) - callback(0.63, "Layout analysis finished") + callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) logging.debug(f"layouts cost: {timer() - start}s") + + start = timer() self._table_transformer_job(zoomin) - callback(0.68, "Table analysis finished") + callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._text_merge() tbls = self._extract_table_figure(True, zoomin, True, True) column_width = np.median([b["x1"] - b["x0"] for b in self.boxes]) self._concat_downward() self._filter_forpages() - callback(0.75, "Text merging finished.") + callback(0.75, "Text merged ({:.2f}s)".format(timer() - start)) # clean mess if column_width < self.page_images[0].size[0] / zoomin / 2: diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 54d897616..bebf64bff 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -59,11 +59,12 @@ class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): - callback(msg="OCR is running...") + from timeit import default_timer as timer + start = timer() + callback(msg="OCR started") self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback) - callback(0.8, "Page {}~{}: OCR finished".format( - from_page, min(to_page, self.total_page))) + callback(msg="Page {}~{}: OCR finished ({:.2f}s)".format(from_page, min(to_page, self.total_page), timer() - start)) assert len(self.boxes) == len(self.page_images), "{} vs. {}".format( len(self.boxes), len(self.page_images)) res = [] diff --git a/rag/app/qa.py b/rag/app/qa.py index d9756eecd..0fd7a932b 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -73,7 +73,7 @@ class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): start = timer() - callback(msg="OCR is running...") + callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, @@ -81,15 +81,19 @@ class Pdf(PdfParser): to_page, callback ) - callback(msg="OCR finished") - logging.debug("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) + callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) + logging.debug("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start)) start = timer() self._layouts_rec(zoomin, drop=False) - callback(0.63, "Layout analysis finished.") + callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._table_transformer_job(zoomin) - callback(0.65, "Table analysis finished.") + callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start)) + + start = timer() self._text_merge() - callback(0.67, "Text merging finished") + callback(0.67, "Text merged ({:.2f}s)".format(timer() - start)) tbls = self._extract_table_figure(True, zoomin, True, True) #self._naive_vertical_merge() # self._concat_downward() @@ -226,7 +230,7 @@ class Docx(DocxParser): sum_question = '\n'.join(question_stack) if sum_question: qai_list.append((sum_question, last_answer, last_image)) - + tbls = [] for tb in self.doc.tables: html= "