From d4a123d6ddef185bd4b9f27672dea47bcf09eec6 Mon Sep 17 00:00:00 2001 From: Emmanuel Ferdman Date: Thu, 22 May 2025 05:06:28 +0300 Subject: [PATCH] Fix: resolve regex library warnings (#7782) ### What problem does this PR solve? This small PR resolves the regex library warnings showing in Python3.11: ```python DeprecationWarning: 'count' is passed as positional argument ``` ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [ ] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [x] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): Signed-off-by: Emmanuel Ferdman --- deepdoc/parser/resume/entities/corporations.py | 6 +++--- deepdoc/parser/resume/step_two.py | 4 ++-- rag/nlp/__init__.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/deepdoc/parser/resume/entities/corporations.py b/deepdoc/parser/resume/entities/corporations.py index 43793668d..0396281de 100644 --- a/deepdoc/parser/resume/entities/corporations.py +++ b/deepdoc/parser/resume/entities/corporations.py @@ -53,14 +53,14 @@ def corpNorm(nm, add_region=True): nm = re.sub(r"&", "&", nm) nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm) nm = re.sub( - r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE + r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, count=10000, flags=re.IGNORECASE ) nm = re.sub( r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, - 10000, - re.IGNORECASE, + count=10000, + flags=re.IGNORECASE, ) if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])): return nm diff --git a/deepdoc/parser/resume/step_two.py b/deepdoc/parser/resume/step_two.py index 6097a0132..0aa3ad383 100644 --- a/deepdoc/parser/resume/step_two.py +++ b/deepdoc/parser/resume/step_two.py @@ -51,7 +51,7 @@ PY = Pinyin() def rmHtmlTag(line): - return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, 100000, re.IGNORECASE) + return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, count=100000, flags=re.IGNORECASE) def highest_degree(dg): @@ -507,7 +507,7 @@ def parse(cv): (r".*国有.*", "国企"), (r"[ ()\(\)人/·0-9-]+", ""), (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]: - cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE) + cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], count=1000, flags=re.IGNORECASE) if len(cv["corporation_type"]) < 2: del cv["corporation_type"] diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 5b0d4ff36..71ca2aca1 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -343,7 +343,7 @@ def remove_contents_table(sections, eng=False): type("")) else sections[i][0]).strip() if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", - re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)): + re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], flags=re.IGNORECASE)): i += 1 continue sections.pop(i)