diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py index 2b251c374..90e4c9ed2 100644 --- a/api/db/services/task_service.py +++ b/api/db/services/task_service.py @@ -133,9 +133,8 @@ class TaskService(CommonService): cls.model.id == id).execute() -def queue_tasks(doc, bucket, name): +def queue_tasks(doc: dict, bucket: str, name: str): def new_task(): - nonlocal doc return { "id": get_uuid(), "doc_id": doc["id"] @@ -149,15 +148,9 @@ def queue_tasks(doc, bucket, name): page_size = doc["parser_config"].get("task_page_size", 12) if doc["parser_id"] == "paper": page_size = doc["parser_config"].get("task_page_size", 22) - if doc["parser_id"] == "one": - page_size = 1000000000 - if doc["parser_id"] == "knowledge_graph": - page_size = 1000000000 - if not do_layout: - page_size = 1000000000 - page_ranges = doc["parser_config"].get("pages") - if not page_ranges: - page_ranges = [(1, 100000)] + if doc["parser_id"] in ["one", "knowledge_graph"] or not do_layout: + page_size = 10 ** 9 + page_ranges = doc["parser_config"].get("pages") or [(1, 10 ** 5)] for s, e in page_ranges: s -= 1 s = max(0, s) @@ -170,8 +163,7 @@ def queue_tasks(doc, bucket, name): elif doc["parser_id"] == "table": file_bin = STORAGE_IMPL.get(bucket, name) - rn = RAGFlowExcelParser.row_number( - doc["name"], file_bin) + rn = RAGFlowExcelParser.row_number(doc["name"], file_bin) for i in range(0, rn, 3000): task = new_task() task["from_page"] = i