mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-06-04 11:24:00 +08:00
Fix: resolve regex library warnings (#7782)
### What problem does this PR solve?

This small PR resolves the regex library warnings shown in Python 3.11:

```python
DeprecationWarning: 'count' is passed as positional argument
```

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
This commit is contained in:
parent
ce816edb5f
commit
d4a123d6dd
@ -53,14 +53,14 @@ def corpNorm(nm, add_region=True):
|
|||||||
nm = re.sub(r"&amp;", "&", nm)
|
nm = re.sub(r"&amp;", "&", nm)
|
||||||
nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
|
nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
|
||||||
nm = re.sub(
|
nm = re.sub(
|
||||||
r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE
|
r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, count=10000, flags=re.IGNORECASE
|
||||||
)
|
)
|
||||||
nm = re.sub(
|
nm = re.sub(
|
||||||
r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$",
|
r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$",
|
||||||
"",
|
"",
|
||||||
nm,
|
nm,
|
||||||
10000,
|
count=10000,
|
||||||
re.IGNORECASE,
|
flags=re.IGNORECASE,
|
||||||
)
|
)
|
||||||
if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
|
if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
|
||||||
return nm
|
return nm
|
||||||
|
@ -51,7 +51,7 @@ PY = Pinyin()
|
|||||||
|
|
||||||
|
|
||||||
def rmHtmlTag(line):
|
def rmHtmlTag(line):
|
||||||
return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, 100000, re.IGNORECASE)
|
return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, count=100000, flags=re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
def highest_degree(dg):
|
def highest_degree(dg):
|
||||||
@ -507,7 +507,7 @@ def parse(cv):
|
|||||||
(r".*国有.*", "国企"),
|
(r".*国有.*", "国企"),
|
||||||
(r"[ ()\(\)人/·0-9-]+", ""),
|
(r"[ ()\(\)人/·0-9-]+", ""),
|
||||||
(r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
|
(r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
|
||||||
cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE)
|
cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], count=1000, flags=re.IGNORECASE)
|
||||||
if len(cv["corporation_type"]) < 2:
|
if len(cv["corporation_type"]) < 2:
|
||||||
del cv["corporation_type"]
|
del cv["corporation_type"]
|
||||||
|
|
||||||
|
@ -343,7 +343,7 @@ def remove_contents_table(sections, eng=False):
|
|||||||
type("")) else sections[i][0]).strip()
|
type("")) else sections[i][0]).strip()
|
||||||
|
|
||||||
if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
|
if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
|
||||||
re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
|
re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], flags=re.IGNORECASE)):
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
sections.pop(i)
|
sections.pop(i)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user