diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index b8a69245a..6bda36970 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -11,7 +11,7 @@ import pdfplumber import logging from PIL import Image, ImageDraw import numpy as np - +from timeit import default_timer as timer from PyPDF2 import PdfReader as pdf2_read from api.utils.file_utils import get_project_base_directory @@ -936,6 +936,7 @@ class HuParser: self.page_cum_height = [0] self.page_layout = [] self.page_from = page_from + st = timer() try: self.pdf = pdfplumber.open(fnm) if isinstance( fnm, str) else pdfplumber.open(BytesIO(fnm)) @@ -989,7 +990,9 @@ class HuParser: self.is_english = True else: self.is_english = False + self.is_english = False + st = timer() for i, img in enumerate(self.page_images): chars = self.page_chars[i] if not self.is_english else [] self.mean_height.append( @@ -1007,15 +1010,11 @@ class HuParser: chars[j]["width"]) / 2: chars[j]["text"] += " " j += 1 - # if i > 0: - # if not chars: - # self.page_cum_height.append(img.size[1] / zoomin) - # else: - # self.page_cum_height.append( - # np.max([c["bottom"] for c in chars])) + self.__ocr(i + 1, img, chars, zoomin) - if callback: - callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="") + #if callback: + # callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="") + #print("OCR:", timer()-st) if not self.is_english and not any( [c for c in self.page_chars]) and self.boxes: diff --git a/docker/.env b/docker/.env index ddb181d10..9f028c826 100644 --- a/docker/.env +++ b/docker/.env @@ -11,7 +11,9 @@ ES_PORT=1200 KIBANA_PORT=6601 # Increase or decrease based on the available host memory (in bytes) -MEM_LIMIT=12073741824 + +MEM_LIMIT=8073741824 + MYSQL_PASSWORD=infini_rag_flow MYSQL_PORT=5455 diff --git a/docker/docker-compose-base.yml b/docker/docker-compose-base.yml index b110ce952..24519ebd4 100644 --- a/docker/docker-compose-base.yml +++ b/docker/docker-compose-base.yml @@ -29,23 +29,23 @@ services: - ragflow restart: always - kibana: - depends_on: - es01: - condition: service_healthy - image: docker.elastic.co/kibana/kibana:${STACK_VERSION} - container_name: ragflow-kibana - volumes: - - kibanadata:/usr/share/kibana/data - ports: - - ${KIBANA_PORT}:5601 - environment: - - SERVERNAME=kibana - - ELASTICSEARCH_HOSTS=http://es01:9200 - - TZ=${TIMEZONE} - mem_limit: ${MEM_LIMIT} - networks: - - ragflow + #kibana: + # depends_on: + # es01: + # condition: service_healthy + # image: docker.elastic.co/kibana/kibana:${STACK_VERSION} + # container_name: ragflow-kibana + # volumes: + # - kibanadata:/usr/share/kibana/data + # ports: + # - ${KIBANA_PORT}:5601 + # environment: + # - SERVERNAME=kibana + # - ELASTICSEARCH_HOSTS=http://es01:9200 + # - TZ=${TIMEZONE} + # mem_limit: ${MEM_LIMIT} + # networks: + # - ragflow mysql: image: mysql:5.7.18 diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 6da4a814c..a13616182 100644 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -29,7 +29,7 @@ function task_bro(){ task_bro & -WS=2 +WS=1 for ((i=0;i