From d4a123d6ddef185bd4b9f27672dea47bcf09eec6 Mon Sep 17 00:00:00 2001
From: Emmanuel Ferdman <emmanuelferdman@gmail.com>
Date: Thu, 22 May 2025 05:06:28 +0300
Subject: [PATCH] Fix: resolve regex library warnings (#7782)

### What problem does this PR solve?
This small PR resolves the regex library warnings emitted under Python 3.11 when `count` and `flags` are passed to `re.sub` as positional arguments:
```python
DeprecationWarning: 'count' is passed as positional argument
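```

Note: in `rag/nlp/__init__.py` the old call passed `re.IGNORECASE` positionally in the `count` slot, which limited the substitution to two replacements; passing it as `flags=` removes that unintended limit.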

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
---
 deepdoc/parser/resume/entities/corporations.py | 6 +++---
 deepdoc/parser/resume/step_two.py              | 4 ++--
 rag/nlp/__init__.py                            | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/deepdoc/parser/resume/entities/corporations.py b/deepdoc/parser/resume/entities/corporations.py
index 43793668d..0396281de 100644
--- a/deepdoc/parser/resume/entities/corporations.py
+++ b/deepdoc/parser/resume/entities/corporations.py
@@ -53,14 +53,14 @@ def corpNorm(nm, add_region=True):
     nm = re.sub(r"&amp;", "&", nm)
     nm = re.sub(r"[\(\)（）\+'\"\t \*\\【】-]+", " ", nm)
     nm = re.sub(
-        r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE
+        r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, count=10000, flags=re.IGNORECASE
     )
     nm = re.sub(
         r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$",
         "",
         nm,
-        10000,
-        re.IGNORECASE,
+        count=10000,
+        flags=re.IGNORECASE,
     )
     if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
         return nm
diff --git a/deepdoc/parser/resume/step_two.py b/deepdoc/parser/resume/step_two.py
index 6097a0132..0aa3ad383 100644
--- a/deepdoc/parser/resume/step_two.py
+++ b/deepdoc/parser/resume/step_two.py
@@ -51,7 +51,7 @@ PY = Pinyin()
 
 
 def rmHtmlTag(line):
-    return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, 100000, re.IGNORECASE)
+    return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, count=100000, flags=re.IGNORECASE)
 
 
 def highest_degree(dg):
@@ -507,7 +507,7 @@ def parse(cv):
                      (r".*国有.*", "国企"),
                      (r"[ （）\(\)人/·0-9-]+", ""),
                      (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
-            cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE)
+            cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], count=1000, flags=re.IGNORECASE)
         if len(cv["corporation_type"]) < 2:
             del cv["corporation_type"]
 
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 5b0d4ff36..71ca2aca1 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -343,7 +343,7 @@ def remove_contents_table(sections, eng=False):
                                               type("")) else sections[i][0]).strip()
 
         if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
-                        re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
+                        re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], flags=re.IGNORECASE)):
             i += 1
             continue
         sections.pop(i)