perf: optimize figure parser (#7392)
### What problem does this PR solve?

When parsing documents containing images, the current code uses a single-threaded approach to call the VL model, resulting in extremely slow parsing speed (e.g., parsing a Word document with dozens of images takes over 20 minutes). By switching to a multithreaded approach to call the VL model, the parsing speed can be improved to an acceptable level.

### Type of change

- [x] Performance Improvement

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
This commit is contained in:
parent d6cc6453d1
commit 2f768b96e8
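As a rough sanity check on the speedup claimed in the description (illustrative numbers only, assuming the VL calls are independent and network-bound): with n images at t seconds per call, a pool of w workers needs roughly ceil(n / w) * t seconds instead of n * t.

```python
# Back-of-the-envelope only; 40 images at ~30 s/call are illustrative values,
# not measurements from this PR.
import math

def estimated_seconds(n_images, secs_per_call, workers=1):
    return math.ceil(n_images / workers) * secs_per_call

print(estimated_seconds(40, 30, workers=1))   # sequential: 1200 s (~20 min)
print(estimated_seconds(40, 30, workers=10))  # 10-worker pool: 120 s (~2 min)
```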
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from PIL import Image
 
@@ -28,6 +28,7 @@ def vision_figure_parser_figure_data_wraper(figures_data_without_positions):
     ) for figure_data in figures_data_without_positions if isinstance(figure_data[1], Image.Image)]
 
 
+shared_executor = ThreadPoolExecutor(max_workers=10)
 class VisionFigureParser:
     def __init__(self, vision_model, figures_data, *args, **kwargs):
         self.vision_model = vision_model
@@ -73,16 +74,21 @@
     def __call__(self, **kwargs):
         callback = kwargs.get("callback", lambda prog, msg: None)
 
-        for idx, img_binary in enumerate(self.figures or []):
-            figure_num = idx  # 0-based
-            txt = picture_vision_llm_chunk(
-                binary=img_binary,
+        def process(figure_idx, figure_binary):
+            description_text = picture_vision_llm_chunk(
+                binary=figure_binary,
                 vision_model=self.vision_model,
                 prompt=vision_llm_figure_describe_prompt(),
                 callback=callback,
             )
+            return figure_idx, description_text
 
+        futures = []
+        for idx, img_binary in enumerate(self.figures or []):
+            futures.append(shared_executor.submit(process, idx, img_binary))
 
+        for future in as_completed(futures):
+            figure_num, txt = future.result()
             if txt:
                 self.descriptions[figure_num] = txt + "\n".join(self.descriptions[figure_num])
 
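For readers skimming the hunk above, here is a minimal, self-contained sketch of the pattern it adopts: a module-level ThreadPoolExecutor shared by all parser instances, one submitted job per figure, and results gathered with as_completed but keyed by figure index so ordering is preserved. The names `describe_figure` and `describe_all` are stand-ins for the real picture_vision_llm_chunk call and `__call__` method, not project code.

```python
# Minimal sketch (not the project code): fan blocking vision-model calls out to a
# shared thread pool and collect results keyed by figure index, so completion
# order does not matter.
from concurrent.futures import ThreadPoolExecutor, as_completed

shared_executor = ThreadPoolExecutor(max_workers=10)  # shared across all parser instances

def describe_figure(image) -> str:
    # Placeholder for the blocking vision-model request.
    return f"description of {image!r}"

def describe_all(figures):
    def process(idx, image):
        return idx, describe_figure(image)

    futures = [shared_executor.submit(process, idx, img) for idx, img in enumerate(figures)]
    descriptions = [""] * len(figures)
    for future in as_completed(futures):   # yields futures as they finish
        idx, text = future.result()        # re-raises any worker exception here
        descriptions[idx] = text
    return descriptions

print(describe_all(["fig-0", "fig-1", "fig-2"]))
```

Because the executor is shared and capped at max_workers=10, at most ten vision-model requests are in flight at once no matter how many documents are being parsed, and any exception raised inside a worker resurfaces at future.result().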
@@ -99,8 +99,10 @@ CURRENT_TASKS = {}
 
 MAX_CONCURRENT_TASKS = int(os.environ.get('MAX_CONCURRENT_TASKS', "5"))
 MAX_CONCURRENT_CHUNK_BUILDERS = int(os.environ.get('MAX_CONCURRENT_CHUNK_BUILDERS', "1"))
+MAX_CONCURRENT_MINIO = int(os.environ.get('MAX_CONCURRENT_MINIO', '10'))
 task_limiter = trio.CapacityLimiter(MAX_CONCURRENT_TASKS)
 chunk_limiter = trio.CapacityLimiter(MAX_CONCURRENT_CHUNK_BUILDERS)
+minio_limiter = trio.CapacityLimiter(MAX_CONCURRENT_MINIO)
 WORKER_HEARTBEAT_TIMEOUT = int(os.environ.get('WORKER_HEARTBEAT_TIMEOUT', '120'))
 stop_event = threading.Event()
 
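This hunk introduces a trio.CapacityLimiter sized by the new MAX_CONCURRENT_MINIO environment variable (default 10). Below is a minimal sketch of how such a limiter bounds concurrency inside a nursery; the `fake_upload` coroutine is a placeholder, not RAGFlow code.

```python
# Minimal sketch (assumes trio is installed): a CapacityLimiter lets a nursery
# start many tasks while only N of them run inside the limited section at once.
import trio

MAX_CONCURRENT_MINIO = 10
minio_limiter = trio.CapacityLimiter(MAX_CONCURRENT_MINIO)

async def fake_upload(i):
    async with minio_limiter:   # at most MAX_CONCURRENT_MINIO tasks pass this point at once
        await trio.sleep(0.1)   # stands in for the storage call
        print("uploaded chunk", i)

async def main():
    async with trio.open_nursery() as nursery:
        for i in range(100):    # all 100 tasks are started, but run 10 at a time
            nursery.start_soon(fake_upload, i)

trio.run(main)
```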
@@ -270,38 +272,43 @@ async def build_chunks(task, progress_callback):
     }
     if task["pagerank"]:
         doc[PAGERANK_FLD] = int(task["pagerank"])
-    el = 0
-    for ck in cks:
-        d = copy.deepcopy(doc)
-        d.update(ck)
-        d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
+    st = timer()
+
+    async def upload_to_minio(document, chunk):
+        try:
+            async with minio_limiter:
+                d = copy.deepcopy(document)
+                d.update(chunk)
+                d["id"] = xxhash.xxh64((chunk["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
                 d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
                 d["create_timestamp_flt"] = datetime.now().timestamp()
                 if not d.get("image"):
                     _ = d.pop("image", None)
                     d["img_id"] = ""
                     docs.append(d)
-                    continue
+                    return
 
-        try:
                 output_buffer = BytesIO()
                 if isinstance(d["image"], bytes):
                     output_buffer = BytesIO(d["image"])
                 else:
                     d["image"].save(output_buffer, format='JPEG')
 
-            st = timer()
                 await trio.to_thread.run_sync(lambda: STORAGE_IMPL.put(task["kb_id"], d["id"], output_buffer.getvalue()))
-            el += timer() - st
+                d["img_id"] = "{}-{}".format(task["kb_id"], d["id"])
+                del d["image"]
+                docs.append(d)
         except Exception:
             logging.exception(
                 "Saving image of chunk {}/{}/{} got exception".format(task["location"], task["name"], d["id"]))
             raise
 
-        d["img_id"] = "{}-{}".format(task["kb_id"], d["id"])
-        del d["image"]
-        docs.append(d)
-    logging.info("MINIO PUT({}):{}".format(task["name"], el))
+    async with trio.open_nursery() as nursery:
+        for ck in cks:
+            nursery.start_soon(upload_to_minio, doc, ck)
+
+    el = timer() - st
+    logging.info("MINIO PUT({}) cost {:.3f} s".format(task["name"], el))
 
     if task["parser_config"].get("auto_keywords", 0):
         st = timer()
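The build_chunks hunk above moves each chunk's storage write into an `upload_to_minio` task started in a trio nursery; every task acquires `minio_limiter` and hands the blocking storage call to a worker thread via trio.to_thread.run_sync. Note the timing also changes: the old loop summed the per-image put() durations, while the new code measures the wall-clock span of the whole concurrent batch. A minimal sketch of the same pattern with a placeholder storage client (FakeStorage stands in for the project's STORAGE_IMPL):

```python
# Minimal sketch: run a blocking storage client in worker threads so the trio
# event loop keeps scheduling other uploads, and bound the fan-out with a limiter.
import trio

limiter = trio.CapacityLimiter(10)

class FakeStorage:
    def put(self, bucket, key, data):   # blocking call, like a MinIO/S3 client
        print(f"put {bucket}/{key} ({len(data)} bytes)")

storage = FakeStorage()

async def upload_chunk(bucket, key, data):
    async with limiter:
        await trio.to_thread.run_sync(lambda: storage.put(bucket, key, data))

async def main():
    async with trio.open_nursery() as nursery:
        for i in range(5):
            nursery.start_soon(upload_chunk, "kb-1", f"chunk-{i}", b"payload")

trio.run(main)
```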