mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-07-24 18:44:28 +08:00
init README of deepdoc, add picture processer. (#71)
* init README of deepdoc, add picture processer. * add resume parsing
This commit is contained in:
parent
d32322c081
commit
7fd1eca582
2
.gitignore
vendored
2
.gitignore
vendored
@ -6,7 +6,7 @@ __pycache__/
|
||||
hudet/
|
||||
cv/
|
||||
layout_app.py
|
||||
resume/
|
||||
api/flask_session
|
||||
|
||||
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
|
||||
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
|
||||
|
@ -163,6 +163,7 @@ def completion():
|
||||
del req["conversation_id"]
|
||||
del req["messages"]
|
||||
ans = chat(dia, msg, **req)
|
||||
if not conv.reference: conv.reference = []
|
||||
conv.reference.append(ans["reference"])
|
||||
conv.message.append({"role": "assistant", "content": ans["answer"]})
|
||||
ConversationService.update_by_id(conv.id, conv.to_dict())
|
||||
|
@ -32,7 +32,6 @@ def set_dialog():
|
||||
dialog_id = req.get("dialog_id")
|
||||
name = req.get("name", "New Dialog")
|
||||
description = req.get("description", "A helpful Dialog")
|
||||
language = req.get("language", "Chinese")
|
||||
top_n = req.get("top_n", 6)
|
||||
similarity_threshold = req.get("similarity_threshold", 0.1)
|
||||
vector_similarity_weight = req.get("vector_similarity_weight", 0.3)
|
||||
@ -80,7 +79,6 @@ def set_dialog():
|
||||
"name": name,
|
||||
"kb_ids": req["kb_ids"],
|
||||
"description": description,
|
||||
"language": language,
|
||||
"llm_id": llm_id,
|
||||
"llm_setting": llm_setting,
|
||||
"prompt_config": prompt_config,
|
||||
|
@ -272,7 +272,9 @@ def get(doc_id):
|
||||
response = flask.make_response(MINIO.get(doc.kb_id, doc.location))
|
||||
ext = re.search(r"\.([^.]+)$", doc.name)
|
||||
if ext:
|
||||
response.headers.set('Content-Type', 'application/%s'%ext.group(1))
|
||||
if doc.type == FileType.VISUAL.value:
|
||||
response.headers.set('Content-Type', 'image/%s'%ext.group(1))
|
||||
else: response.headers.set('Content-Type', 'application/%s'%ext.group(1))
|
||||
return response
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
@ -464,6 +464,7 @@ class Knowledgebase(DataBaseModel):
|
||||
avatar = TextField(null=True, help_text="avatar base64 string")
|
||||
tenant_id = CharField(max_length=32, null=False)
|
||||
name = CharField(max_length=128, null=False, help_text="KB name", index=True)
|
||||
language = CharField(max_length=32, null=True, default="Chinese", help_text="English|Chinese")
|
||||
description = TextField(null=True, help_text="KB description")
|
||||
embd_id = CharField(max_length=128, null=False, help_text="default embedding model ID")
|
||||
permission = CharField(max_length=16, null=False, help_text="me|team", default="me")
|
||||
|
@ -57,7 +57,7 @@ class TenantLLMService(CommonService):
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def model_instance(cls, tenant_id, llm_type, llm_name=None):
|
||||
def model_instance(cls, tenant_id, llm_type, llm_name=None, lang="Chinese"):
|
||||
e, tenant = TenantService.get_by_id(tenant_id)
|
||||
if not e:
|
||||
raise LookupError("Tenant not found")
|
||||
@ -87,7 +87,7 @@ class TenantLLMService(CommonService):
|
||||
if model_config["llm_factory"] not in CvModel:
|
||||
return
|
||||
return CvModel[model_config["llm_factory"]](
|
||||
model_config["api_key"], model_config["llm_name"])
|
||||
model_config["api_key"], model_config["llm_name"], lang)
|
||||
|
||||
if llm_type == LLMType.CHAT.value:
|
||||
if model_config["llm_factory"] not in ChatModel:
|
||||
@ -120,11 +120,11 @@ class TenantLLMService(CommonService):
|
||||
|
||||
|
||||
class LLMBundle(object):
|
||||
def __init__(self, tenant_id, llm_type, llm_name=None):
|
||||
def __init__(self, tenant_id, llm_type, llm_name=None, lang="Chinese"):
|
||||
self.tenant_id = tenant_id
|
||||
self.llm_type = llm_type
|
||||
self.llm_name = llm_name
|
||||
self.mdl = TenantLLMService.model_instance(tenant_id, llm_type, llm_name)
|
||||
self.mdl = TenantLLMService.model_instance(tenant_id, llm_type, llm_name, lang=lang)
|
||||
assert self.mdl, "Can't find mole for {}/{}/{}".format(tenant_id, llm_type, llm_name)
|
||||
|
||||
def encode(self, texts: list, batch_size=32):
|
||||
|
@ -27,7 +27,24 @@ class TaskService(CommonService):
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_tasks(cls, tm, mod=0, comm=1, items_per_page=64):
|
||||
fields = [cls.model.id, cls.model.doc_id, cls.model.from_page,cls.model.to_page, Document.kb_id, Document.parser_id, Document.parser_config, Document.name, Document.type, Document.location, Document.size, Knowledgebase.tenant_id, Tenant.embd_id, Tenant.img2txt_id, Tenant.asr_id, cls.model.update_time]
|
||||
fields = [
|
||||
cls.model.id,
|
||||
cls.model.doc_id,
|
||||
cls.model.from_page,
|
||||
cls.model.to_page,
|
||||
Document.kb_id,
|
||||
Document.parser_id,
|
||||
Document.parser_config,
|
||||
Document.name,
|
||||
Document.type,
|
||||
Document.location,
|
||||
Document.size,
|
||||
Knowledgebase.tenant_id,
|
||||
Knowledgebase.language,
|
||||
Tenant.embd_id,
|
||||
Tenant.img2txt_id,
|
||||
Tenant.asr_id,
|
||||
cls.model.update_time]
|
||||
docs = cls.model.select(*fields) \
|
||||
.join(Document, on=(cls.model.doc_id == Document.id)) \
|
||||
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
|
||||
@ -42,7 +59,6 @@ class TaskService(CommonService):
|
||||
.paginate(1, items_per_page)
|
||||
return list(docs.dicts())
|
||||
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def do_cancel(cls, id):
|
||||
@ -54,12 +70,11 @@ class TaskService(CommonService):
|
||||
pass
|
||||
return True
|
||||
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def update_progress(cls, id, info):
|
||||
cls.model.update(progress_msg=cls.model.progress_msg + "\n"+info["progress_msg"]).where(
|
||||
cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where(
|
||||
cls.model.id == id).execute()
|
||||
if "progress" in info:
|
||||
cls.model.update(progress=info["progress"]).where(
|
||||
cls.model.id == id).execute()
|
||||
cls.model.id == id).execute()
|
||||
|
Binary file not shown.
@ -167,7 +167,11 @@ def thumbnail(filename, blob):
|
||||
return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")
|
||||
|
||||
if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
|
||||
return ("data:image/%s;base64,"%filename.split(".")[-1]) + base64.b64encode(Image.open(BytesIO(blob)).thumbnail((30, 30)).tobytes()).decode("utf-8")
|
||||
image = Image.open(BytesIO(blob))
|
||||
image.thumbnail((30, 30))
|
||||
buffered = BytesIO()
|
||||
image.save(buffered, format="png")
|
||||
return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")
|
||||
|
||||
if re.match(r".*\.(ppt|pptx)$", filename):
|
||||
import aspose.slides as slides
|
||||
|
82
deepdoc/README.md
Normal file
82
deepdoc/README.md
Normal file
@ -0,0 +1,82 @@
|
||||
English | [简体中文](./README_zh.md)
|
||||
|
||||
# *Deep*Doc
|
||||
|
||||
---
|
||||
|
||||
- [1. Introduction](#1)
|
||||
- [2. Vision](#2)
|
||||
- [3. Parser](#3)
|
||||
|
||||
<a name="1"></a>
|
||||
## 1. Introduction
|
||||
|
||||
---
|
||||
Given a large collection of documents from various domains, in various formats, and with diverse retrieval requirements,
|
||||
an accurate analysis becomes a very challenging task. *Deep*Doc was born for that purpose.
|
||||
There are two parts in *Deep*Doc so far: vision and parser.
|
||||
|
||||
<a name="2"></a>
|
||||
## 2. Vision
|
||||
|
||||
---
|
||||
|
||||
We use visual information to solve problems the way human beings do.
|
||||
- OCR. Since many documents are presented as images, or can at least be converted to images,
|
||||
OCR is an essential, fundamental, and even universal solution for text extraction.
|
||||
|
||||
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
||||
<img src="https://lh6.googleusercontent.com/2xdiSjaGWkZ71YdORc71Ujf7jCHmO6G-6ONklzGiUYEh3QZpjPo6MQ9eqEFX20am_cdW4Ck0YRraXEetXWnM08kJd99yhik13Cy0_YKUAq2zVGR15LzkovRAmK9iT4o3hcJ8dTpspaJKUwt6R4gN7So" width="300"/>
|
||||
</div>
|
||||
|
||||
- Layout recognition. Documents from different domains may have various layouts;
|
||||
for example, newspapers, magazines, books and résumés are distinct in terms of layout.
|
||||
Only when the machine has an accurate layout analysis can it decide whether text parts are successive or not,
|
||||
or whether a part needs Table Structure Recognition (TSR) to be processed, or whether a part is a figure described by a caption.
|
||||
We have 10 basic layout components which cover most cases:
|
||||
- Text
|
||||
- Title
|
||||
- Figure
|
||||
- Figure caption
|
||||
- Table
|
||||
- Table caption
|
||||
- Header
|
||||
- Footer
|
||||
- Reference
|
||||
- Equation
|
||||
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
||||
<img src="https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/ppstructure/docs/layout/layout.png?raw=true" width="900"/>
|
||||
</div>
|
||||
|
||||
- Table Structure Recognition (TSR). A data table is a frequently used structure for presenting data, including numbers or text.
|
||||
The structure of a table can be very complex, with hierarchical headers, spanning cells and projected row headers.
|
||||
Along with TSR, we also reassemble the content into sentences that can be well comprehended by an LLM.
|
||||
We have five labels for the TSR task:
|
||||
- Column
|
||||
- Row
|
||||
- Column header
|
||||
- Projected row header
|
||||
- Spanning cell
|
||||
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
||||
<img src="https://user-images.githubusercontent.com/10793386/139559159-cd23c972-8731-48ed-91df-f3f27e9f4d79.jpg" width="900"/>
|
||||
</div>
|
||||
|
||||
<a name="3"></a>
|
||||
## 3. Parser
|
||||
|
||||
---
|
||||
|
||||
Four document formats (PDF, DOCX, EXCEL and PPT) have their corresponding parsers.
|
||||
The most complex one is the PDF parser, because of PDF's flexibility. The output of the PDF parser includes:
|
||||
- Text chunks with their own positions in the PDF (page number and rectangular positions).
|
||||
- Tables with a cropped image from the PDF, and contents which have already been translated into natural language sentences.
|
||||
- Figures with caption and text in the figures.
|
||||
|
||||
### Résumé
|
||||
|
||||
---
|
||||
The résumé is a very complicated kind of document. A résumé which is composed of unstructured text
|
||||
with various layouts could be resolved into structured data composed of nearly a hundred fields.
|
||||
We haven't open-sourced the parser itself yet; for now we only open the processing method that is applied after the parsing procedure.
|
||||
|
||||
|
1
deepdoc/README_zh.md
Normal file
1
deepdoc/README_zh.md
Normal file
@ -0,0 +1 @@
|
||||
[English](./README.md) | 简体中文
|
@ -1,223 +1,8 @@
|
||||
import random
|
||||
|
||||
|
||||
from .pdf_parser import HuParser as PdfParser
|
||||
from .docx_parser import HuDocxParser as DocxParser
|
||||
from .excel_parser import HuExcelParser as ExcelParser
|
||||
|
||||
import re
|
||||
|
||||
from nltk import word_tokenize
|
||||
|
||||
from rag.nlp import stemmer, huqie
|
||||
from rag.utils import num_tokens_from_string
|
||||
|
||||
# Families of heading/bullet regexes. Each inner list is one numbering style;
# bullets_category() picks the family that matches the most sections, and
# hierarchical_merge() uses the position of a pattern inside its family as the
# heading level of a matching section.
BULLET_PATTERN = [[
    # Chinese legal-style numbering: part, chapter, section, article, "(一)".
    r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
    r"第[零一二三四五六七八九十百0-9]+章",
    r"第[零一二三四五六七八九十百0-9]+节",
    r"第[零一二三四五六七八九十百0-9]+条",
    r"[\((][零一二三四五六七八九十百]+[\))]",
], [
    # Arabic-digit numbering: chapter, section, then "1.", "1.1", "1.1.1", "1.1.1.1".
    r"第[0-9]+章",
    r"第[0-9]+节",
    r"[0-9]{,3}[\. 、]",
    r"[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
    # Chapter/section followed by Chinese-numeral bullets and parenthesised items.
    r"第[零一二三四五六七八九十百0-9]+章",
    r"第[零一二三四五六七八九十百0-9]+节",
    r"[零一二三四五六七八九十百]+[ 、]",
    r"[\((][零一二三四五六七八九十百]+[\))]",
    r"[\((][0-9]{,2}[\))]",
], [
    # English headings: PART / Chapter (roman numerals) / Section / Article.
    r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
    r"Chapter (I+V?|VI*|XI|IX|X)",
    r"Section [0-9]+",
    r"Article [0-9]+"
]
]
|
||||
|
||||
def random_choices(arr, k):
    """Return at most ``k`` random picks from ``arr``.

    The number of picks is capped at ``len(arr)``. Sampling is delegated to
    ``random.choices``, so the same element may be picked more than once.
    """
    sample_size = min(len(arr), k)
    return random.choices(arr, k=sample_size)
|
||||
|
||||
def bullets_category(sections):
    """Return the index of the BULLET_PATTERN family that fits ``sections`` best.

    A section counts as a hit for a family when it matches (``re.match``) at
    least one pattern of that family. The family with the most hits wins, the
    earliest family wins ties, and ``-1`` is returned when nothing matches.
    """
    hits = [
        sum(1 for sec in sections if any(re.match(p, sec) for p in family))
        for family in BULLET_PATTERN
    ]
    best_index, best_hits = -1, 0
    for index, count in enumerate(hits):
        # Strictly greater: an equal count never displaces an earlier family.
        if count > best_hits:
            best_index, best_hits = index, count
    return best_index
|
||||
|
||||
|
||||
def is_english(texts):
    """Tell whether a collection of text snippets is predominantly English.

    A snippet counts as English when, after stripping, it starts with at
    least two ASCII letters.

    Args:
        texts: sized collection of strings.

    Returns:
        True when more than 80% of the snippets count as English, otherwise
        False. An empty collection yields False instead of raising
        ``ZeroDivisionError``.
    """
    if not texts:
        return False
    eng = sum(1 for t in texts if re.match(r"[a-zA-Z]{2,}", t.strip()))
    return eng / len(texts) > 0.8
|
||||
|
||||
|
||||
def tokenize(d, t, eng):
    """Fill the token fields of chunk dict ``d`` from text ``t`` (in place).

    Writes three keys:
      * ``content_with_weight``: the text exactly as given;
      * ``content_ltks``: for English, hyphens between lowercase letters are
        dropped, then the text is split with ``word_tokenize`` and every
        token is stemmed; otherwise the output of ``huqie.qie``;
      * ``content_sm_ltks``: ``huqie.qieqie`` applied to ``content_ltks``.
    """
    d["content_with_weight"] = t
    if not eng:
        d["content_ltks"] = huqie.qie(t)
    else:
        dehyphenated = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
        stems = (stemmer.stem(word) for word in word_tokenize(dehyphenated))
        d["content_ltks"] = " ".join(stems)
    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
|
||||
|
||||
|
||||
def remove_contents_table(sections, eng=False):
    """Strip table-of-contents style blocks from ``sections`` in place.

    When a section is a contents/acknowledgement heading, the heading and the
    listing below it are removed: the prefix of the first listed entry is
    remembered, and everything up to the next section (within 128 sections)
    that starts with the same prefix is dropped, on the assumption that this
    is where the real body begins.

    Args:
        sections: list of strings, or of sequences whose first item is the
            text. Mutated in place.
        eng: when true the prefix is the first two space-separated words of
            the entry, otherwise its first three characters.

    Returns:
        None.

    Fixes over the previous revision:
      * ``re.IGNORECASE`` was passed as the ``count`` argument of ``re.sub``,
        so headings were matched case-sensitively and only two whitespace
        runs were removed. The flag now goes to ``re.match``.
      * running out of sections while skipping blank entries raised
        ``IndexError``.
      * the entry prefix was used as a regular expression without escaping;
        it is now compared literally with ``str.startswith``.
    """

    def get(idx):
        item = sections[idx]
        return (item if isinstance(item, str) else item[0]).strip()

    def entry_prefix(idx):
        text = get(idx)
        return " ".join(text.split(" ")[:2]) if eng else text[:3]

    i = 0
    while i < len(sections):
        # Whitespace is removed before matching, hence the extra
        # "tableofcontents" spelling next to the original alternatives.
        heading = re.sub(r"[ \t\u00a0\u3000]+", "", get(i).split("@@")[0])
        if not re.match(r"(contents|目录|目次|table of contents|tableofcontents|致谢|acknowledge)$",
                        heading, re.IGNORECASE):
            i += 1
            continue

        sections.pop(i)  # the heading itself
        if i >= len(sections):
            break

        # Skip blank entries until one yields a usable prefix.
        prefix = entry_prefix(i)
        while not prefix:
            sections.pop(i)
            if i >= len(sections):
                break
            prefix = entry_prefix(i)
        if i >= len(sections) or not prefix:
            break

        sections.pop(i)  # the first listed entry
        if i >= len(sections):
            break

        # Drop the rest of the listing: everything before the next section
        # that starts with the same prefix.
        for j in range(i, min(i + 128, len(sections))):
            if not get(j).startswith(prefix):
                continue
            del sections[i:j]
            break
|
||||
|
||||
|
||||
def make_colon_as_title(sections):
    """Insert a "title" entry before sections whose text ends with a colon.

    Returns ``[]`` for empty input and the input itself when it is a list of
    plain strings. For a list of ``(text, layout)`` pairs the list is
    processed in place and nothing is returned.
    """
    if not sections:
        return []
    if type(sections[0]) is str:
        return sections

    idx = 0
    while idx < len(sections):
        text, _layout = sections[idx]
        idx += 1
        text = text.split("@")[0].strip()
        if not text or text[-1] not in "::":
            continue
        # Split the reversed text so that the first piece is the trailing
        # clause (the one that ends with the colon).
        pieces = re.split(r"([。?!!?;;]| .)", text[::-1])
        # NOTE(review): pieces[1] is the captured separator, which is at most
        # two characters long, so this length test looks like it can never
        # pass and the insert below appears unreachable. Confirm the intent.
        if len(pieces) >= 2 and len(pieces[1]) >= 32:
            sections.insert(idx - 1, (pieces[0][::-1], "title"))
            idx += 1
|
||||
|
||||
|
||||
def hierarchical_merge(bull, sections, depth):
    """Group sections into chunks made of a section plus its enclosing headings.

    Every section is assigned a level: the index of the first pattern of
    ``BULLET_PATTERN[bull]`` it matches, or one of two extra levels for
    sections whose layout mentions "title"/"head" and for everything else.
    Starting from the deepest levels (``depth`` of them), each section not
    yet consumed opens a chunk and collects, for every shallower level, the
    nearest preceding section of that level.

    Args:
        bull: index of the pattern family in ``BULLET_PATTERN`` (as returned
            by ``bullets_category``); a negative value yields ``[]``.
        sections: list of strings or of ``(text, layout)`` pairs.
        depth: how many of the deepest levels may open a chunk.

    Returns:
        A list of chunks, each a list of section texts ordered from the
        outermost heading to the innermost section.

    Changes over the previous revision:
      * debug ``print`` calls were removed;
      * the check that discards a heading which is not an ancestor compared
        a position inside a level list with a section index; it now compares
        section indices;
      * the hand-written binary search (which hit ``assert False`` on an
        equal element) is replaced by ``bisect.bisect_left``.
    """
    import bisect  # local import: only this function needs it

    if not sections or bull < 0:
        return []
    if isinstance(sections[0], str):
        sections = [(s, "") for s in sections]
    # Drop empty/one-character sections and bare numbers such as page numbers.
    sections = [(t, o) for t, o in sections
                if t and len(t.split("@")[0].strip()) > 1
                and not re.match(r"[0-9]+$", t.split("@")[0].strip())]
    patterns = BULLET_PATTERN[bull]
    bullets_size = len(patterns)
    levels = [[] for _ in range(bullets_size + 2)]

    def not_title(txt):
        # An article marker always counts as a title; long text or text with
        # sentence punctuation does not.
        if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
            return False
        if len(txt) >= 128:
            return True
        return re.search(r"[,;,。;!!]", txt)

    for i, (txt, layout) in enumerate(sections):
        for j, p in enumerate(patterns):
            if re.match(p, txt.strip()) and not not_title(txt):
                levels[j].append(i)
                break
        else:
            # No bullet pattern matched: fall back on the layout label.
            if re.search(r"(title|head)", layout):
                levels[bullets_size].append(i)
            else:
                levels[bullets_size + 1].append(i)
    sections = [t for t, _ in sections]

    def nearest_before(arr, target):
        # Position of the greatest element of sorted ``arr`` below ``target``,
        # or -1 when there is none.
        return bisect.bisect_left(arr, target) - 1

    cks = []
    readed = [False] * len(sections)
    levels = levels[::-1]  # deepest level first
    for i, arr in enumerate(levels[:depth]):
        for j in arr:
            if readed[j]:
                continue
            readed[j] = True
            cks.append([j])
            if i + 1 == len(levels) - 1:
                continue
            for ii in range(i + 1, len(levels)):
                jj = nearest_before(levels[ii], j)
                if jj < 0:
                    continue
                # A shallower heading located after the last collected one
                # means that one belongs to an earlier branch: discard it.
                if levels[ii][jj] > cks[-1][-1]:
                    cks[-1].pop(-1)
                cks[-1].append(levels[ii][jj])
            for ii in cks[-1]:
                readed[ii] = True

    return [[sections[j] for j in ck[::-1]] for ck in cks]
|
||||
|
||||
|
||||
def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
    """Cut sections at delimiter characters and pack the pieces into chunks.

    Pieces are appended to the current chunk until its token count (as
    reported by ``num_tokens_from_string``) exceeds ``chunk_token_num``; the
    next piece then opens a new chunk. The position tag of a section is
    appended after each of its pieces, except pieces shorter than 8 tokens.

    Args:
        sections: list of strings or of ``(text, position_tag)`` pairs.
        chunk_token_num: token count above which a new chunk is started.
        delimiter: characters after which a piece ends.

    Returns:
        The list of chunk strings (``[]`` for empty input).
    """
    if not sections:
        return []
    if type(sections[0]) is str:
        sections = [(text, "") for text in sections]

    chunks = [""]
    token_counts = [0]

    def append_piece(piece, position_tag):
        n_tokens = num_tokens_from_string(piece)
        tagged = piece if n_tokens < 8 else piece + position_tag
        if token_counts[-1] > chunk_token_num:
            chunks.append(tagged)
            token_counts.append(n_tokens)
            return
        chunks[-1] += tagged
        token_counts[-1] += n_tokens

    for text, position_tag in sections:
        start, cursor = 0, 1
        while cursor < len(text):
            if text[cursor] not in delimiter:
                cursor += 1
                continue
            append_piece(text[start:cursor + 1], position_tag)
            start = cursor + 1
            # The first character of a new piece is never tested as a delimiter.
            cursor = start + 1
        if start < cursor:
            append_piece(text[start:cursor], position_tag)

    return chunks
|
||||
from .ppt_parser import HuPptParser as PptParser
|
||||
|
||||
|
||||
|
52
deepdoc/parser/ppt_parser.py
Normal file
52
deepdoc/parser/ppt_parser.py
Normal file
@ -0,0 +1,52 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from io import BytesIO
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
class HuPptParser(object):
    """Extract plain text from PowerPoint slides using python-pptx."""

    # Numeric shape-type codes compared against ``shape.shape_type``; per the
    # python-pptx MSO_SHAPE_TYPE enumeration these are GROUP and TABLE.
    _SHAPE_TYPE_GROUP = 6
    _SHAPE_TYPE_TABLE = 19

    def __init__(self):
        super().__init__()

    def __extract(self, shape):
        """Return the text carried by ``shape``, or None when it has none.

        Tables are rendered one line per data row as "header: value" pairs
        joined by "; ", using the first row as the header row. Group shapes
        are flattened recursively, one member per line.
        """
        if shape.shape_type == self._SHAPE_TYPE_TABLE:
            tb = shape.table
            n_cols = len(tb.columns)
            rows = []
            for i in range(1, len(tb.rows)):
                rows.append("; ".join(
                    tb.cell(0, j).text + ": " + tb.cell(i, j).text
                    for j in range(n_cols) if tb.cell(i, j)))
            return "\n".join(rows)

        if shape.has_text_frame:
            return shape.text_frame.text

        if shape.shape_type == self._SHAPE_TYPE_GROUP:
            texts = []
            for member in shape.shapes:
                t = self.__extract(member)
                if t:
                    texts.append(t)
            return "\n".join(texts)

        return None

    def __call__(self, fnm, from_page, to_page, callback=None):
        """Return the text of slides ``from_page`` (inclusive) to ``to_page`` (exclusive).

        Args:
            fnm: path of the presentation, or its raw bytes.
            from_page: zero-based index of the first slide to read.
            to_page: zero-based index at which to stop.
            callback: accepted for interface compatibility; not used here.

        Returns:
            One string per selected slide, the texts of its shapes joined by
            newlines. ``self.total_page`` is set to the total slide count.
        """
        ppt = Presentation(fnm) if isinstance(
            fnm, str) else Presentation(
            BytesIO(fnm))
        txts = []
        self.total_page = len(ppt.slides)
        # The slide collection is walked by index instead of being sliced:
        # python-pptx's Slides object offers integer indexing and iteration,
        # not slice access.
        for i, slide in enumerate(ppt.slides):
            if i < from_page:
                continue
            if i >= to_page:
                break
            texts = []
            for shape in slide.shapes:
                txt = self.__extract(shape)
                if txt:
                    texts.append(txt)
            txts.append("\n".join(texts))

        return txts
|
52
deepdoc/parser/resume/__init__.py
Normal file
52
deepdoc/parser/resume/__init__.py
Normal file
@ -0,0 +1,52 @@
|
||||
import datetime
|
||||
|
||||
|
||||
def refactor(cv):
    """Normalise a parsed résumé dict in place and return it.

    Steps, in order:
      * drop bookkeeping keys of the parser (raw text, timings, ...) when
        they hold a value;
      * set ``is_deleted`` to 0 and make sure ``basic`` exists;
      * turn each list-like section (education, work, ...) into a dict keyed
        by "0", "1", ... after removing the ``external`` field of its items;
        sections that are neither dict nor list are removed;
      * rename two salary keys inside ``basic``;
      * copy summary fields from the latest job and the latest education
        entry (ordered by ``start_time``) into ``basic``;
      * stamp ``basic["updated_at"]`` and make sure ``contact["name"]`` is set.

    A section that is present but ``None`` is left untouched and treated as
    empty; the previous revision raised ``AttributeError`` on it.
    """
    for n in ["raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"]:
        if n in cv and cv[n] is not None:
            del cv[n]
    cv["is_deleted"] = 0
    if "basic" not in cv:
        cv["basic"] = {}
    if cv["basic"].get("photo2"):
        del cv["basic"]["photo2"]

    for n in ["education", "work", "certificate", "project", "language", "skill", "training"]:
        if n not in cv or cv[n] is None:
            continue
        if isinstance(cv[n], dict):
            cv[n] = list(cv[n].values())
        if not isinstance(cv[n], list):
            del cv[n]
            continue
        vv = []
        for v in cv[n]:
            if "external" in v and v["external"] is not None:
                del v["external"]
            vv.append(v)
        cv[n] = {str(i): v for i, v in enumerate(vv)}

    basics = [
        ("basic_salary_month", "salary_month"),
        ("expect_annual_salary_from", "expect_annual_salary"),
    ]
    for n, t in basics:
        if cv["basic"].get(n):
            cv["basic"][t] = cv["basic"][n]
            del cv["basic"][n]

    # ``or {}`` covers both a missing section and one explicitly set to None.
    work = sorted((cv.get("work") or {}).values(), key=lambda x: x.get("start_time", ""))
    edu = sorted((cv.get("education") or {}).values(), key=lambda x: x.get("start_time", ""))

    if work:
        cv["basic"]["work_start_time"] = work[0].get("start_time", "")
        cv["basic"]["management_experience"] = 'Y' if any(
            w.get("management_experience", '') == 'Y' for w in work) else 'N'
        cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0")

        # Summary fields come from the job with the latest start_time.
        for n in ["annual_salary_from", "annual_salary_to", "industry_name", "position_name", "responsibilities",
                  "corporation_type", "scale", "corporation_name"]:
            cv["basic"][n] = work[-1].get(n, "")

    if edu:
        for n in ["school_name", "discipline_name"]:
            if n in edu[-1]:
                cv["basic"][n] = edu[-1][n]

    cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    if "contact" not in cv:
        cv["contact"] = {}
    if not cv["contact"].get("name"):
        cv["contact"]["name"] = cv["basic"].get("name", "")
    return cv
|
0
deepdoc/parser/resume/entities/__init__.py
Normal file
0
deepdoc/parser/resume/entities/__init__.py
Normal file
80
deepdoc/parser/resume/entities/corporations.py
Normal file
80
deepdoc/parser/resume/entities/corporations.py
Normal file
@ -0,0 +1,80 @@
|
||||
import re,json,os
|
||||
import pandas as pd
|
||||
from rag.nlp import huqie
|
||||
from . import regions
|
||||
current_file_path = os.path.dirname(os.path.abspath(__file__))


def _load_json_resource(name):
    """Load one JSON file from the ``res`` directory next to this module.

    The file is opened in a ``with`` block so the handle is closed right
    after parsing, and is read as UTF-8 instead of the platform default.
    """
    with open(os.path.join(current_file_path, "res", name), "r", encoding="utf-8") as f:
        return json.load(f)


# Table loaded from a tab-separated file, indexed by the corporation id as a string.
GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
GOODS["cid"] = GOODS["cid"].astype(str)
GOODS = GOODS.set_index(["cid"])
CORP_TKS = _load_json_resource("corp.tks.freq.json")
GOOD_CORP = _load_json_resource("good_corp.json")
CORP_TAG = _load_json_resource("corp_tag.json")
|
||||
|
||||
def baike(cid, default_v=0):
    """Look up the ``len`` column of ``GOODS`` for corporation ``cid``.

    Args:
        cid: corporation id; converted to ``str`` because ``GOODS`` is
            indexed by string ids.
        default_v: value returned when the id is not in the table.

    Returns:
        The stored value, or ``default_v`` for an unknown id.
    """
    try:
        return GOODS.loc[str(cid), "len"]
    except KeyError:
        # Unknown id: this is the expected miss, everything else propagates.
        return default_v
|
||||
|
||||
|
||||
def corpNorm(nm, add_region=True):
    """Normalise a corporation name to a short canonical form.

    The name is converted with ``huqie.strQ2B`` / ``huqie.tradi2simp`` and
    lower-cased, brackets and similar punctuation become spaces, and legal
    suffixes ("co.", "inc.", "ltd", "有限公司", ...) are cut off. Tokens that
    are region names or listed in ``CORP_TKS`` are then dropped. When the
    remainder mixes a non-Latin part with a Latin part, only the leading
    part is kept.

    Args:
        nm: raw corporation name; anything falsy or not a string gives "".
        add_region: when true and a region token was found, append it as
            "(region)".

    Returns:
        The normalised name.
    """
    if not nm or not isinstance(nm, str):
        return ""
    nm = huqie.tradi2simp(huqie.strQ2B(nm)).lower()
    # The previous pattern replaced "&" with "&", which does nothing; decoding
    # the HTML entity is the apparent intent. NOTE(review): confirm.
    nm = re.sub(r"&amp;", "&", nm)
    nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
    # count/flags are passed by keyword: positional use is deprecated.
    nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, count=10000, flags=re.IGNORECASE)
    nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, count=10000, flags=re.IGNORECASE)
    # Very short names are returned as they are unless they start with a region.
    if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
        return nm

    tks = huqie.qie(nm).split(" ")
    # A leading "中国" is not treated as the region of the company.
    reg = [t for i, t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
    nm = ""
    for t in tks:
        if regions.isName(t) or t in CORP_TKS:
            continue
        # Keep a space between two consecutive alphanumeric tokens.
        if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):
            nm += " "
        nm += t

    r = re.search(r"^([^a-z0-9 \(\)&]{2,})[a-z ]{4,}$", nm.strip())
    if r:
        nm = r.group(1)
    r = re.search(r"^([a-z ]{3,})[^a-z0-9 \(\)&]{2,}$", nm.strip())
    if r:
        nm = r.group(1)
    return nm.strip() + (("" if not reg else "(%s)" % reg[0]) if add_region else "")
|
||||
|
||||
|
||||
def rmNoise(n):
    """Remove parenthesised fragments, then commas, dots, spaces, ampersands and stray parentheses."""
    without_brackets = re.sub(r"[\((][^()()]+[))]", "", n)
    return re.sub(r"[,. &()()]+", "", without_brackets)
|
||||
|
||||
# Re-key both lookup tables by normalised corporation name, so that lookups
# can normalise their argument the same way.
GOOD_CORP = {corpNorm(rmNoise(name), False) for name in GOOD_CORP}

_tags_by_norm_name = {}
for raw_name, tags in CORP_TAG.items():
    norm_name = corpNorm(rmNoise(raw_name), False)
    # Names that normalise to nothing are reported on stdout.
    if not norm_name: print (raw_name)
    _tags_by_norm_name[norm_name] = tags
CORP_TAG = _tags_by_norm_name
|
||||
|
||||
def is_good(nm):
    """Tell whether ``nm`` refers to one of the corporations in ``GOOD_CORP``.

    Names containing "外派" are rejected outright. Otherwise the name is
    normalised like the entries of ``GOOD_CORP``; purely alphanumeric
    entries must match exactly, all others match as a substring.

    Entries that normalised to an empty string are skipped: an empty string
    is a substring of every name and would make every corporation "good".
    """
    if "外派" in nm:
        return False
    nm = corpNorm(rmNoise(nm), False)
    for n in GOOD_CORP:
        if not n:
            continue
        if re.match(r"[0-9a-zA-Z]+$", n):
            if n == nm:
                return True
        elif n in nm:
            return True
    return False
|
||||
|
||||
def corp_tag(nm):
    """Return the tags stored in ``CORP_TAG`` for corporation ``nm``.

    The name is normalised like the keys of ``CORP_TAG``. Keys made only of
    ASCII letters, digits, dots, commas and spaces must match exactly; other
    keys match as a substring, except that a key shorter than 3 characters
    is ignored when the name is at least twice as long.

    Returns:
        The tag value of the first matching key, or ``[]``.

    Keys that normalised to an empty string are skipped; previously such a
    key reached ``len(nm) / len(n)`` and raised ``ZeroDivisionError``.
    """
    nm = corpNorm(rmNoise(nm), False)
    for n, tags in CORP_TAG.items():
        if not n:
            continue
        if re.match(r"[0-9a-zA-Z., ]+$", n):
            if n == nm:
                return tags
        elif n in nm:
            if len(n) < 3 and len(nm) / len(n) >= 2:
                continue
            return tags
    return []
|
||||
|
24
deepdoc/parser/resume/entities/degrees.py
Normal file
24
deepdoc/parser/resume/entities/degrees.py
Normal file
@ -0,0 +1,24 @@
|
||||
# Degree id (as a string) -> degree name.
TBL = {
    "94": "EMBA",
    "6": "MBA",
    "95": "MPA",
    "92": "专升本",
    "4": "专科",
    "90": "中专",
    "91": "中技",
    "86": "初中",
    "3": "博士",
    "10": "博士后",
    "1": "本科",
    "2": "硕士",
    "87": "职高",
    "89": "高中",
}

# Reverse lookup: degree name -> degree id.
TBL_ = dict(zip(TBL.values(), TBL.keys()))


def get_name(id):
    """Return the degree name for ``id`` (any value whose ``str()`` is a key), or ""."""
    key = str(id)
    return TBL[key] if key in TBL else ""


def get_id(nm):
    """Return the degree id for name ``nm`` (upper-cased and stripped), or ""."""
    if nm:
        return TBL_.get(nm.upper().strip(), "")
    return ""
|
692
deepdoc/parser/resume/entities/industries.py
Normal file
692
deepdoc/parser/resume/entities/industries.py
Normal file
@ -0,0 +1,692 @@
|
||||
|
||||
TBL = {"1":{"name":"IT/通信/电子","parent":"0"},
|
||||
"2":{"name":"互联网","parent":"0"},
|
||||
"3":{"name":"电子商务","parent":"2"},
|
||||
"4":{"name":"互联网金融","parent":"2"},
|
||||
"5":{"name":"网络游戏","parent":"2"},
|
||||
"6":{"name":"社交网络平台","parent":"2"},
|
||||
"7":{"name":"视频音乐","parent":"2"},
|
||||
"9":{"name":"安全","parent":"2"},
|
||||
"10":{"name":"云计算","parent":"2"},
|
||||
"12":{"name":"工具类客户端应用","parent":"2"},
|
||||
"13":{"name":"互联网广告","parent":"2"},
|
||||
"14":{"name":"企业互联网服务","parent":"2"},
|
||||
"16":{"name":"在线教育","parent":"2"},
|
||||
"17":{"name":"在线医疗","parent":"2"},
|
||||
"19":{"name":"B2B","parent":"3"},
|
||||
"20":{"name":"B2C","parent":"3"},
|
||||
"21":{"name":"C2C","parent":"3"},
|
||||
"22":{"name":"生活信息本地化","parent":"3"},
|
||||
"23":{"name":"在线旅游","parent":"2"},
|
||||
"24":{"name":"第三方支付","parent":"4"},
|
||||
"26":{"name":"客户端游戏","parent":"5"},
|
||||
"27":{"name":"网页游戏","parent":"5"},
|
||||
"28":{"name":"手机游戏","parent":"5"},
|
||||
"29":{"name":"微博","parent":"6"},
|
||||
"30":{"name":"社交网站","parent":"6"},
|
||||
"31":{"name":"在线视频","parent":"7"},
|
||||
"32":{"name":"在线音乐","parent":"7"},
|
||||
"35":{"name":"企业安全","parent":"9"},
|
||||
"36":{"name":"个人安全","parent":"9"},
|
||||
"37":{"name":"企业级云服务","parent":"10"},
|
||||
"38":{"name":"个人级云服务","parent":"10"},
|
||||
"43":{"name":"输入法","parent":"12"},
|
||||
"44":{"name":"浏览器","parent":"12"},
|
||||
"45":{"name":"词典","parent":"12"},
|
||||
"46":{"name":"播放器","parent":"12"},
|
||||
"47":{"name":"下载器","parent":"12"},
|
||||
"48":{"name":"IM","parent":"12"},
|
||||
"49":{"name":"广告服务","parent":"13"},
|
||||
"50":{"name":"第三方广告网络平台","parent":"13"},
|
||||
"51":{"name":"媒体代理","parent":"13"},
|
||||
"52":{"name":"创意代理","parent":"13"},
|
||||
"53":{"name":"IT-综合","parent":"1"},
|
||||
"71":{"name":"团购","parent":"3"},
|
||||
"72":{"name":"地图","parent":"2"},
|
||||
"73":{"name":"数据存储","parent":"2"},
|
||||
"414":{"name":"计算机软件","parent":"1"},
|
||||
"415":{"name":"计算机硬件","parent":"1"},
|
||||
"416":{"name":"计算机服务(系统、数据服务、维修)","parent":"1"},
|
||||
"417":{"name":"通信/电信/网络设备","parent":"1"},
|
||||
"418":{"name":"通信/电信运营、增值服务","parent":"1"},
|
||||
"419":{"name":"电子技术/半导体/集成电路","parent":"1"},
|
||||
"472":{"name":"P2P网贷","parent":"4"},
|
||||
"473":{"name":"互联网理财","parent":"4"},
|
||||
"474":{"name":"婚恋","parent":"6"},
|
||||
"476":{"name":"虚拟化","parent":"10"},
|
||||
"477":{"name":"邮箱","parent":"12"},
|
||||
"478":{"name":"商业智能","parent":"14"},
|
||||
"479":{"name":"企业建站","parent":"14"},
|
||||
"480":{"name":"安防","parent":"14"},
|
||||
"481":{"name":"网络营销","parent":"2"},
|
||||
"487":{"name":"智能终端","parent":"2"},
|
||||
"488":{"name":"移动互联网","parent":"2"},
|
||||
"489":{"name":"数字城市","parent":"2"},
|
||||
"490":{"name":"大数据","parent":"2"},
|
||||
"491":{"name":"互联网人力资源","parent":"2"},
|
||||
"492":{"name":"舆情监控","parent":"2"},
|
||||
"493":{"name":"移动营销","parent":"481"},
|
||||
"494":{"name":"微博营销","parent":"481"},
|
||||
"495":{"name":"精准营销","parent":"481"},
|
||||
"496":{"name":"海外营销","parent":"481"},
|
||||
"497":{"name":"微信营销","parent":"481"},
|
||||
"498":{"name":"智能手机","parent":"487"},
|
||||
"499":{"name":"可穿戴设备","parent":"487"},
|
||||
"500":{"name":"智能电视","parent":"487"},
|
||||
"501":{"name":"WAP","parent":"488"},
|
||||
"502":{"name":"物联网","parent":"489"},
|
||||
"503":{"name":"O2O","parent":"489"},
|
||||
"504":{"name":"数字出版","parent":"489"},
|
||||
"505":{"name":"搜索","parent":"2"},
|
||||
"506":{"name":"垂直搜索","parent":"505"},
|
||||
"507":{"name":"无线搜索","parent":"505"},
|
||||
"508":{"name":"网页搜索","parent":"505"},
|
||||
"509":{"name":"网址导航","parent":"2"},
|
||||
"510":{"name":"门户","parent":"2"},
|
||||
"511":{"name":"网络文学","parent":"2"},
|
||||
"512":{"name":"自媒体","parent":"2"},
|
||||
"513":{"name":"金融","parent":"0"},
|
||||
"514":{"name":"建筑与房地产","parent":"0"},
|
||||
"515":{"name":"专业服务","parent":"0"},
|
||||
"516":{"name":"教育培训","parent":"0"},
|
||||
"517":{"name":"文化传媒","parent":"0"},
|
||||
"518":{"name":"消费品","parent":"0"},
|
||||
"519":{"name":"工业","parent":"0"},
|
||||
"520":{"name":"交通物流","parent":"0"},
|
||||
"521":{"name":"贸易","parent":"0"},
|
||||
"522":{"name":"医药","parent":"0"},
|
||||
"523":{"name":"医疗器械","parent":"522"},
|
||||
"524":{"name":"保健品","parent":"518"},
|
||||
"525":{"name":"服务业","parent":"0"},
|
||||
"526":{"name":"能源/矿产/环保","parent":"0"},
|
||||
"527":{"name":"化工","parent":"0"},
|
||||
"528":{"name":"政府","parent":"0"},
|
||||
"529":{"name":"公共事业","parent":"0"},
|
||||
"530":{"name":"非盈利机构","parent":"0"},
|
||||
"531":{"name":"农业","parent":"1131"},
|
||||
"532":{"name":"林业","parent":"1131"},
|
||||
"533":{"name":"畜牧业","parent":"1131"},
|
||||
"534":{"name":"渔业","parent":"1131"},
|
||||
"535":{"name":"学术科研","parent":"0"},
|
||||
"536":{"name":"零售","parent":"0"},
|
||||
"537":{"name":"银行","parent":"513"},
|
||||
"538":{"name":"保险","parent":"513"},
|
||||
"539":{"name":"证券","parent":"513"},
|
||||
"540":{"name":"基金","parent":"513"},
|
||||
"541":{"name":"信托","parent":"513"},
|
||||
"542":{"name":"担保","parent":"513"},
|
||||
"543":{"name":"典当","parent":"513"},
|
||||
"544":{"name":"拍卖","parent":"513"},
|
||||
"545":{"name":"投资/融资","parent":"513"},
|
||||
"546":{"name":"期货","parent":"513"},
|
||||
"547":{"name":"房地产开发","parent":"514"},
|
||||
"548":{"name":"工程施工","parent":"514"},
|
||||
"549":{"name":"建筑设计","parent":"514"},
|
||||
"550":{"name":"房地产代理","parent":"514"},
|
||||
"551":{"name":"物业管理","parent":"514"},
|
||||
"552":{"name":"室内设计","parent":"514"},
|
||||
"553":{"name":"装修装潢","parent":"514"},
|
||||
"554":{"name":"市政工程","parent":"514"},
|
||||
"555":{"name":"工程造价","parent":"514"},
|
||||
"556":{"name":"工程监理","parent":"514"},
|
||||
"557":{"name":"环境工程","parent":"514"},
|
||||
"558":{"name":"园林景观","parent":"514"},
|
||||
"559":{"name":"法律","parent":"515"},
|
||||
"560":{"name":"人力资源","parent":"515"},
|
||||
"561":{"name":"会计","parent":"1125"},
|
||||
"562":{"name":"审计","parent":"515"},
|
||||
"563":{"name":"检测认证","parent":"515"},
|
||||
"565":{"name":"翻译","parent":"515"},
|
||||
"566":{"name":"中介","parent":"515"},
|
||||
"567":{"name":"咨询","parent":"515"},
|
||||
"568":{"name":"外包服务","parent":"515"},
|
||||
"569":{"name":"家教","parent":"516"},
|
||||
"570":{"name":"早教","parent":"516"},
|
||||
"571":{"name":"职业技能培训","parent":"516"},
|
||||
"572":{"name":"外语培训","parent":"516"},
|
||||
"573":{"name":"设计培训","parent":"516"},
|
||||
"574":{"name":"IT培训","parent":"516"},
|
||||
"575":{"name":"文艺体育培训","parent":"516"},
|
||||
"576":{"name":"学历教育","parent":"516"},
|
||||
"577":{"name":"管理培训","parent":"516"},
|
||||
"578":{"name":"民办基础教育","parent":"516"},
|
||||
"579":{"name":"广告","parent":"517"},
|
||||
"580":{"name":"媒体","parent":"517"},
|
||||
"581":{"name":"会展","parent":"517"},
|
||||
"582":{"name":"公关","parent":"517"},
|
||||
"583":{"name":"影视","parent":"517"},
|
||||
"584":{"name":"艺术","parent":"517"},
|
||||
"585":{"name":"文化传播","parent":"517"},
|
||||
"586":{"name":"娱乐","parent":"517"},
|
||||
"587":{"name":"体育","parent":"517"},
|
||||
"588":{"name":"出版","parent":"517"},
|
||||
"589":{"name":"休闲","parent":"517"},
|
||||
"590":{"name":"动漫","parent":"517"},
|
||||
"591":{"name":"市场推广","parent":"517"},
|
||||
"592":{"name":"市场研究","parent":"517"},
|
||||
"593":{"name":"食品","parent":"1129"},
|
||||
"594":{"name":"饮料","parent":"1129"},
|
||||
"595":{"name":"烟草","parent":"1129"},
|
||||
"596":{"name":"酒品","parent":"518"},
|
||||
"597":{"name":"服饰","parent":"518"},
|
||||
"598":{"name":"纺织","parent":"518"},
|
||||
"599":{"name":"化妆品","parent":"1129"},
|
||||
"600":{"name":"日用品","parent":"1129"},
|
||||
"601":{"name":"家电","parent":"518"},
|
||||
"602":{"name":"家具","parent":"518"},
|
||||
"603":{"name":"办公用品","parent":"518"},
|
||||
"604":{"name":"奢侈品","parent":"518"},
|
||||
"605":{"name":"珠宝","parent":"518"},
|
||||
"606":{"name":"数码产品","parent":"518"},
|
||||
"607":{"name":"玩具","parent":"518"},
|
||||
"608":{"name":"图书","parent":"518"},
|
||||
"609":{"name":"音像","parent":"518"},
|
||||
"610":{"name":"钟表","parent":"518"},
|
||||
"611":{"name":"箱包","parent":"518"},
|
||||
"612":{"name":"母婴","parent":"518"},
|
||||
"613":{"name":"营养保健","parent":"518"},
|
||||
"614":{"name":"户外用品","parent":"518"},
|
||||
"615":{"name":"健身器材","parent":"518"},
|
||||
"616":{"name":"乐器","parent":"518"},
|
||||
"617":{"name":"汽车用品","parent":"518"},
|
||||
"619":{"name":"厨具","parent":"518"},
|
||||
"620":{"name":"机械制造","parent":"519"},
|
||||
"621":{"name":"流体控制","parent":"519"},
|
||||
"622":{"name":"自动化控制","parent":"519"},
|
||||
"623":{"name":"仪器仪表","parent":"519"},
|
||||
"624":{"name":"航空/航天","parent":"519"},
|
||||
"625":{"name":"交通设施","parent":"519"},
|
||||
"626":{"name":"工业电子","parent":"519"},
|
||||
"627":{"name":"建材","parent":"519"},
|
||||
"628":{"name":"五金材料","parent":"519"},
|
||||
"629":{"name":"汽车","parent":"519"},
|
||||
"630":{"name":"印刷","parent":"519"},
|
||||
"631":{"name":"造纸","parent":"519"},
|
||||
"632":{"name":"包装","parent":"519"},
|
||||
"633":{"name":"原材料及加工","parent":"519"},
|
||||
"634":{"name":"物流","parent":"520"},
|
||||
"635":{"name":"仓储","parent":"520"},
|
||||
"636":{"name":"客运","parent":"520"},
|
||||
"637":{"name":"快递","parent":"520"},
|
||||
"638":{"name":"化学药","parent":"522"},
|
||||
"639":{"name":"中药","parent":"522"},
|
||||
"640":{"name":"生物制药","parent":"522"},
|
||||
"641":{"name":"兽药","parent":"522"},
|
||||
"642":{"name":"农药","parent":"522"},
|
||||
"643":{"name":"CRO","parent":"522"},
|
||||
"644":{"name":"消毒","parent":"522"},
|
||||
"645":{"name":"医药商业","parent":"522"},
|
||||
"646":{"name":"医疗服务","parent":"522"},
|
||||
"647":{"name":"医疗器械","parent":"523"},
|
||||
"648":{"name":"制药设备","parent":"523"},
|
||||
"649":{"name":"医用耗材","parent":"523"},
|
||||
"650":{"name":"手术器械","parent":"523"},
|
||||
"651":{"name":"保健器材","parent":"524"},
|
||||
"652":{"name":"性保健品","parent":"524"},
|
||||
"653":{"name":"医药保养","parent":"524"},
|
||||
"654":{"name":"医用保健","parent":"524"},
|
||||
"655":{"name":"酒店","parent":"525"},
|
||||
"656":{"name":"餐饮","parent":"525"},
|
||||
"657":{"name":"旅游","parent":"525"},
|
||||
"658":{"name":"生活服务","parent":"525"},
|
||||
"659":{"name":"保健服务","parent":"525"},
|
||||
"660":{"name":"运动健身","parent":"525"},
|
||||
"661":{"name":"家政服务","parent":"525"},
|
||||
"662":{"name":"婚庆服务","parent":"525"},
|
||||
"663":{"name":"租赁服务","parent":"525"},
|
||||
"664":{"name":"维修服务","parent":"525"},
|
||||
"665":{"name":"石油天然气","parent":"526"},
|
||||
"666":{"name":"电力","parent":"526"},
|
||||
"667":{"name":"新能源","parent":"526"},
|
||||
"668":{"name":"水利","parent":"526"},
|
||||
"669":{"name":"矿产","parent":"526"},
|
||||
"670":{"name":"采掘业","parent":"526"},
|
||||
"671":{"name":"冶炼","parent":"526"},
|
||||
"672":{"name":"环保","parent":"526"},
|
||||
"673":{"name":"无机化工原料","parent":"527"},
|
||||
"674":{"name":"有机化工原料","parent":"527"},
|
||||
"675":{"name":"精细化学品","parent":"527"},
|
||||
"676":{"name":"化工设备","parent":"527"},
|
||||
"677":{"name":"化工工程","parent":"527"},
|
||||
"678":{"name":"资产管理","parent":"513"},
|
||||
"679":{"name":"金融租赁","parent":"513"},
|
||||
"680":{"name":"征信及信评机构","parent":"513"},
|
||||
"681":{"name":"资产评估机构","parent":"513"},
|
||||
"683":{"name":"金融监管机构","parent":"513"},
|
||||
"684":{"name":"国际贸易","parent":"521"},
|
||||
"685":{"name":"海关","parent":"521"},
|
||||
"686":{"name":"购物中心","parent":"536"},
|
||||
"687":{"name":"超市","parent":"536"},
|
||||
"688":{"name":"便利店","parent":"536"},
|
||||
"689":{"name":"专卖店","parent":"536"},
|
||||
"690":{"name":"专业店","parent":"536"},
|
||||
"691":{"name":"百货店","parent":"536"},
|
||||
"692":{"name":"杂货店","parent":"536"},
|
||||
"693":{"name":"个人银行","parent":"537"},
|
||||
"695":{"name":"私人银行","parent":"537"},
|
||||
"696":{"name":"公司银行","parent":"537"},
|
||||
"697":{"name":"投资银行","parent":"537"},
|
||||
"698":{"name":"政策性银行","parent":"537"},
|
||||
"699":{"name":"中央银行","parent":"537"},
|
||||
"700":{"name":"人寿险","parent":"538"},
|
||||
"701":{"name":"财产险","parent":"538"},
|
||||
"702":{"name":"再保险","parent":"538"},
|
||||
"703":{"name":"养老险","parent":"538"},
|
||||
"704":{"name":"保险代理公司","parent":"538"},
|
||||
"705":{"name":"公募基金","parent":"540"},
|
||||
"707":{"name":"私募基金","parent":"540"},
|
||||
"708":{"name":"第三方理财","parent":"679"},
|
||||
"709":{"name":"资产管理公司","parent":"679"},
|
||||
"711":{"name":"房产中介","parent":"566"},
|
||||
"712":{"name":"职业中介","parent":"566"},
|
||||
"713":{"name":"婚姻中介","parent":"566"},
|
||||
"714":{"name":"战略咨询","parent":"567"},
|
||||
"715":{"name":"投资咨询","parent":"567"},
|
||||
"716":{"name":"心理咨询","parent":"567"},
|
||||
"717":{"name":"留学移民咨询","parent":"567"},
|
||||
"718":{"name":"工商注册代理","parent":"568"},
|
||||
"719":{"name":"商标专利代理","parent":"568"},
|
||||
"720":{"name":"财务代理","parent":"568"},
|
||||
"721":{"name":"工程机械","parent":"620"},
|
||||
"722":{"name":"农业机械","parent":"620"},
|
||||
"723":{"name":"海工设备","parent":"620"},
|
||||
"724":{"name":"包装机械","parent":"620"},
|
||||
"725":{"name":"印刷机械","parent":"620"},
|
||||
"726":{"name":"数控机床","parent":"620"},
|
||||
"727":{"name":"矿山机械","parent":"620"},
|
||||
"728":{"name":"水泵","parent":"621"},
|
||||
"729":{"name":"管道","parent":"621"},
|
||||
"730":{"name":"阀门","parent":"621"},
|
||||
"732":{"name":"压缩机","parent":"621"},
|
||||
"733":{"name":"集散控制系统","parent":"622"},
|
||||
"734":{"name":"远程控制","parent":"622"},
|
||||
"735":{"name":"液压系统","parent":"622"},
|
||||
"736":{"name":"楼宇智能化","parent":"622"},
|
||||
"737":{"name":"飞机制造","parent":"624"},
|
||||
"738":{"name":"航空公司","parent":"624"},
|
||||
"739":{"name":"发动机","parent":"624"},
|
||||
"740":{"name":"复合材料","parent":"624"},
|
||||
"741":{"name":"高铁","parent":"625"},
|
||||
"742":{"name":"地铁","parent":"625"},
|
||||
"743":{"name":"信号传输","parent":"625"},
|
||||
"745":{"name":"结构材料","parent":"627"},
|
||||
"746":{"name":"装饰材料","parent":"627"},
|
||||
"747":{"name":"专用材料","parent":"627"},
|
||||
"749":{"name":"经销商集团","parent":"629"},
|
||||
"750":{"name":"整车制造","parent":"629"},
|
||||
"751":{"name":"汽车零配件","parent":"629"},
|
||||
"752":{"name":"外型设计","parent":"629"},
|
||||
"753":{"name":"平版印刷","parent":"630"},
|
||||
"754":{"name":"凸版印刷","parent":"630"},
|
||||
"755":{"name":"凹版印刷","parent":"630"},
|
||||
"756":{"name":"孔版印刷","parent":"630"},
|
||||
"757":{"name":"印刷用纸","parent":"631"},
|
||||
"758":{"name":"书写、制图及复制用纸","parent":"631"},
|
||||
"759":{"name":"包装用纸","parent":"631"},
|
||||
"760":{"name":"生活、卫生及装饰用纸","parent":"631"},
|
||||
"761":{"name":"技术用纸","parent":"631"},
|
||||
"762":{"name":"加工纸原纸","parent":"631"},
|
||||
"763":{"name":"食品包装","parent":"632"},
|
||||
"764":{"name":"医药包装","parent":"632"},
|
||||
"765":{"name":"日化包装","parent":"632"},
|
||||
"766":{"name":"物流包装","parent":"632"},
|
||||
"767":{"name":"礼品包装","parent":"632"},
|
||||
"768":{"name":"电子五金包装","parent":"632"},
|
||||
"769":{"name":"汽车服务","parent":"525"},
|
||||
"770":{"name":"汽车保养","parent":"769"},
|
||||
"771":{"name":"租车","parent":"769"},
|
||||
"773":{"name":"出租车","parent":"769"},
|
||||
"774":{"name":"代驾","parent":"769"},
|
||||
"775":{"name":"发电","parent":"666"},
|
||||
"777":{"name":"输配电","parent":"666"},
|
||||
"779":{"name":"风电","parent":"667"},
|
||||
"780":{"name":"光伏/太阳能","parent":"667"},
|
||||
"781":{"name":"生物质发电","parent":"667"},
|
||||
"782":{"name":"煤化工","parent":"667"},
|
||||
"783":{"name":"垃圾发电","parent":"667"},
|
||||
"784":{"name":"核电","parent":"667"},
|
||||
"785":{"name":"能源矿产","parent":"669"},
|
||||
"786":{"name":"金属矿产","parent":"669"},
|
||||
"787":{"name":"非金属矿产","parent":"669"},
|
||||
"788":{"name":"水气矿产","parent":"669"},
|
||||
"789":{"name":"锅炉","parent":"775"},
|
||||
"790":{"name":"发电机","parent":"775"},
|
||||
"791":{"name":"汽轮机","parent":"775"},
|
||||
"792":{"name":"燃机","parent":"775"},
|
||||
"793":{"name":"冷却","parent":"775"},
|
||||
"794":{"name":"电力设计院","parent":"775"},
|
||||
"795":{"name":"高压输配电","parent":"777"},
|
||||
"796":{"name":"中压输配电","parent":"777"},
|
||||
"797":{"name":"低压输配电","parent":"777"},
|
||||
"798":{"name":"继电保护","parent":"777"},
|
||||
"799":{"name":"智能电网","parent":"777"},
|
||||
"800":{"name":"小学","parent":"516"},
|
||||
"801":{"name":"电动车","parent":"519"},
|
||||
"802":{"name":"皮具箱包","parent":"518"},
|
||||
"803":{"name":"医药制造","parent":"522"},
|
||||
"804":{"name":"电器销售","parent":"536"},
|
||||
"805":{"name":"塑料制品","parent":"527"},
|
||||
"806":{"name":"公益基金会","parent":"530"},
|
||||
"807":{"name":"美发服务","parent":"525"},
|
||||
"808":{"name":"农业养殖","parent":"531"},
|
||||
"809":{"name":"金融服务","parent":"513"},
|
||||
"810":{"name":"商业地产综合体","parent":"514"},
|
||||
"811":{"name":"美容服务","parent":"525"},
|
||||
"812":{"name":"灯饰","parent":"518"},
|
||||
"813":{"name":"油墨颜料产品","parent":"527"},
|
||||
"814":{"name":"眼镜制造","parent":"518"},
|
||||
"815":{"name":"农业生物技术","parent":"531"},
|
||||
"816":{"name":"体育用品","parent":"518"},
|
||||
"817":{"name":"保健用品","parent":"524"},
|
||||
"818":{"name":"化学化工产品","parent":"527"},
|
||||
"819":{"name":"饲料","parent":"531"},
|
||||
"821":{"name":"保安服务","parent":"525"},
|
||||
"822":{"name":"干细胞技术","parent":"522"},
|
||||
"824":{"name":"农药化肥","parent":"527"},
|
||||
"825":{"name":"卫生洁具","parent":"518"},
|
||||
"826":{"name":"体育器材、场馆","parent":"518"},
|
||||
"827":{"name":"饲料加工","parent":"531"},
|
||||
"828":{"name":"测绘服务","parent":"529"},
|
||||
"830":{"name":"金属船舶制造","parent":"519"},
|
||||
"831":{"name":"基因工程","parent":"522"},
|
||||
"832":{"name":"花卉服务","parent":"536"},
|
||||
"833":{"name":"农业种植","parent":"531"},
|
||||
"834":{"name":"皮革制品","parent":"518"},
|
||||
"835":{"name":"地理信息加工服务","parent":"529"},
|
||||
"836":{"name":"机器人","parent":"519"},
|
||||
"837":{"name":"礼品","parent":"518"},
|
||||
"838":{"name":"理发及美容服务","parent":"525"},
|
||||
"839":{"name":"其他清洁服务","parent":"525"},
|
||||
"840":{"name":"硅胶材料","parent":"527"},
|
||||
"841":{"name":"茶叶销售","parent":"518"},
|
||||
"842":{"name":"彩票活动","parent":"529"},
|
||||
"843":{"name":"化妆培训","parent":"516"},
|
||||
"844":{"name":"鞋业","parent":"518"},
|
||||
"845":{"name":"酒店用品","parent":"518"},
|
||||
"846":{"name":"复合材料","parent":"527"},
|
||||
"847":{"name":"房地产工程建设","parent":"548"},
|
||||
"848":{"name":"知识产权服务","parent":"559"},
|
||||
"849":{"name":"新型建材","parent":"627"},
|
||||
"850":{"name":"企业投资咨询","parent":"567"},
|
||||
"851":{"name":"含乳饮料和植物蛋白饮料制造","parent":"594"},
|
||||
"852":{"name":"汽车检测设备","parent":"629"},
|
||||
"853":{"name":"手机通讯器材","parent":"417"},
|
||||
"854":{"name":"环保材料","parent":"672"},
|
||||
"855":{"name":"交通设施","parent":"554"},
|
||||
"856":{"name":"电子器件","parent":"419"},
|
||||
"857":{"name":"啤酒","parent":"594"},
|
||||
"858":{"name":"生态旅游","parent":"657"},
|
||||
"859":{"name":"自动化设备","parent":"626"},
|
||||
"860":{"name":"软件开发","parent":"414"},
|
||||
"861":{"name":"葡萄酒销售","parent":"594"},
|
||||
"862":{"name":"钢材","parent":"633"},
|
||||
"863":{"name":"餐饮培训","parent":"656"},
|
||||
"864":{"name":"速冻食品","parent":"593"},
|
||||
"865":{"name":"空气环保","parent":"672"},
|
||||
"866":{"name":"互联网房地产经纪服务","parent":"550"},
|
||||
"867":{"name":"食品添加剂","parent":"593"},
|
||||
"868":{"name":"演艺传播","parent":"585"},
|
||||
"869":{"name":"信用卡","parent":"537"},
|
||||
"870":{"name":"报纸期刊广告","parent":"579"},
|
||||
"871":{"name":"摄影","parent":"525"},
|
||||
"872":{"name":"手机软件","parent":"414"},
|
||||
"873":{"name":"地坪建材","parent":"627"},
|
||||
"874":{"name":"企业管理咨询","parent":"567"},
|
||||
"875":{"name":"幼儿教育","parent":"570"},
|
||||
"876":{"name":"系统集成","parent":"416"},
|
||||
"877":{"name":"皮革服饰","parent":"597"},
|
||||
"878":{"name":"保健食品","parent":"593"},
|
||||
"879":{"name":"叉车","parent":"620"},
|
||||
"880":{"name":"厨卫电器","parent":"601"},
|
||||
"882":{"name":"地暖设备","parent":"627"},
|
||||
"883":{"name":"钢结构制造","parent":"548"},
|
||||
"884":{"name":"投影机","parent":"606"},
|
||||
"885":{"name":"啤酒销售","parent":"594"},
|
||||
"886":{"name":"度假村旅游","parent":"657"},
|
||||
"887":{"name":"电力元件设备","parent":"626"},
|
||||
"888":{"name":"管理软件","parent":"414"},
|
||||
"889":{"name":"轴承","parent":"628"},
|
||||
"890":{"name":"餐饮设备","parent":"656"},
|
||||
"891":{"name":"肉制品及副产品加工","parent":"593"},
|
||||
"892":{"name":"艺术收藏品投资交易","parent":"584"},
|
||||
"893":{"name":"净水器","parent":"601"},
|
||||
"894":{"name":"进口食品","parent":"593"},
|
||||
"895":{"name":"娱乐文化传播","parent":"585"},
|
||||
"896":{"name":"文化传播","parent":"585"},
|
||||
"897":{"name":"商旅传媒","parent":"580"},
|
||||
"898":{"name":"广告设计制作","parent":"579"},
|
||||
"899":{"name":"金属丝绳及其制品制造","parent":"627"},
|
||||
"900":{"name":"建筑涂料","parent":"627"},
|
||||
"901":{"name":"抵押贷款","parent":"543"},
|
||||
"902":{"name":"早教","parent":"570"},
|
||||
"903":{"name":"电影放映","parent":"583"},
|
||||
"904":{"name":"内衣服饰","parent":"597"},
|
||||
"905":{"name":"无线网络通信","parent":"418"},
|
||||
"906":{"name":"记忆卡","parent":"415"},
|
||||
"907":{"name":"女装服饰","parent":"597"},
|
||||
"908":{"name":"建筑机械","parent":"620"},
|
||||
"909":{"name":"制冷电器","parent":"601"},
|
||||
"910":{"name":"通信设备","parent":"417"},
|
||||
"911":{"name":"空调设备","parent":"601"},
|
||||
"912":{"name":"建筑装饰","parent":"553"},
|
||||
"913":{"name":"办公设备","parent":"603"},
|
||||
"916":{"name":"数据处理软件","parent":"414"},
|
||||
"917":{"name":"葡萄酒贸易","parent":"594"},
|
||||
"918":{"name":"通讯器材","parent":"417"},
|
||||
"919":{"name":"铜业","parent":"633"},
|
||||
"920":{"name":"食堂","parent":"656"},
|
||||
"921":{"name":"糖果零食","parent":"593"},
|
||||
"922":{"name":"文化艺术传播","parent":"584"},
|
||||
"923":{"name":"太阳能电器","parent":"601"},
|
||||
"924":{"name":"药品零售","parent":"645"},
|
||||
"925":{"name":"果蔬食品","parent":"593"},
|
||||
"926":{"name":"文化活动策划","parent":"585"},
|
||||
"928":{"name":"汽车广告","parent":"657"},
|
||||
"929":{"name":"条码设备","parent":"630"},
|
||||
"930":{"name":"建筑石材","parent":"627"},
|
||||
"931":{"name":"贵金属","parent":"545"},
|
||||
"932":{"name":"体育","parent":"660"},
|
||||
"933":{"name":"金融信息服务","parent":"414"},
|
||||
"934":{"name":"玻璃建材","parent":"627"},
|
||||
"935":{"name":"家教","parent":"569"},
|
||||
"936":{"name":"歌舞厅娱乐活动","parent":"586"},
|
||||
"937":{"name":"计算机服务器","parent":"415"},
|
||||
"938":{"name":"管道","parent":"627"},
|
||||
"939":{"name":"婴幼儿服饰","parent":"597"},
|
||||
"940":{"name":"热水器","parent":"601"},
|
||||
"941":{"name":"计算机及零部件制造","parent":"415"},
|
||||
"942":{"name":"钢铁贸易","parent":"633"},
|
||||
"944":{"name":"包装材料","parent":"632"},
|
||||
"945":{"name":"计算机办公设备","parent":"603"},
|
||||
"946":{"name":"白酒","parent":"594"},
|
||||
"948":{"name":"发动机","parent":"620"},
|
||||
"949":{"name":"快餐服务","parent":"656"},
|
||||
"950":{"name":"酒类销售","parent":"594"},
|
||||
"951":{"name":"电子产品、机电设备","parent":"626"},
|
||||
"952":{"name":"激光设备","parent":"626"},
|
||||
"953":{"name":"餐饮策划","parent":"656"},
|
||||
"954":{"name":"饮料、食品","parent":"594"},
|
||||
"955":{"name":"文化娱乐经纪","parent":"585"},
|
||||
"956":{"name":"天然气","parent":"665"},
|
||||
"957":{"name":"农副食品","parent":"593"},
|
||||
"958":{"name":"艺术表演","parent":"585"},
|
||||
"959":{"name":"石膏、水泥制品及类似制品制造","parent":"627"},
|
||||
"960":{"name":"橱柜","parent":"602"},
|
||||
"961":{"name":"管理培训","parent":"577"},
|
||||
"962":{"name":"男装服饰","parent":"597"},
|
||||
"963":{"name":"化肥制造","parent":"675"},
|
||||
"964":{"name":"童装服饰","parent":"597"},
|
||||
"965":{"name":"电源电池","parent":"626"},
|
||||
"966":{"name":"家电维修","parent":"664"},
|
||||
"967":{"name":"光电子器件","parent":"419"},
|
||||
"968":{"name":"旅行社服务","parent":"657"},
|
||||
"969":{"name":"电线、电缆制造","parent":"626"},
|
||||
"970":{"name":"软件开发、信息系统集成","parent":"419"},
|
||||
"971":{"name":"白酒制造","parent":"594"},
|
||||
"973":{"name":"甜品服务","parent":"656"},
|
||||
"974":{"name":"糕点、面包制造","parent":"593"},
|
||||
"975":{"name":"木工机械","parent":"620"},
|
||||
"976":{"name":"酒吧服务","parent":"656"},
|
||||
"977":{"name":"火腿肠","parent":"593"},
|
||||
"978":{"name":"广告策划推广","parent":"579"},
|
||||
"979":{"name":"新能源产品和生产装备制造","parent":"667"},
|
||||
"980":{"name":"调味品","parent":"593"},
|
||||
"981":{"name":"礼仪表演","parent":"585"},
|
||||
"982":{"name":"劳务派遣","parent":"560"},
|
||||
"983":{"name":"建材零售","parent":"627"},
|
||||
"984":{"name":"商品交易中心","parent":"545"},
|
||||
"985":{"name":"体育推广","parent":"585"},
|
||||
"986":{"name":"茶饮料及其他饮料制造","parent":"594"},
|
||||
"987":{"name":"金属建材","parent":"627"},
|
||||
"988":{"name":"职业技能培训","parent":"571"},
|
||||
"989":{"name":"网吧活动","parent":"586"},
|
||||
"990":{"name":"洗衣服务","parent":"658"},
|
||||
"991":{"name":"管道工程","parent":"554"},
|
||||
"992":{"name":"通信工程","parent":"417"},
|
||||
"993":{"name":"电子元器件","parent":"626"},
|
||||
"994":{"name":"电子设备","parent":"419"},
|
||||
"995":{"name":"茶馆服务","parent":"656"},
|
||||
"996":{"name":"旅游开发","parent":"657"},
|
||||
"997":{"name":"视频通讯","parent":"417"},
|
||||
"998":{"name":"白酒销售","parent":"594"},
|
||||
"1000":{"name":"咖啡馆服务","parent":"656"},
|
||||
"1001":{"name":"食品零售","parent":"593"},
|
||||
"1002":{"name":"健康疗养旅游","parent":"655"},
|
||||
"1003":{"name":"粮油食品","parent":"593"},
|
||||
"1004":{"name":"儿童教育影视","parent":"583"},
|
||||
"1005":{"name":"新能源发电","parent":"667"},
|
||||
"1006":{"name":"旅游策划","parent":"657"},
|
||||
"1007":{"name":"绘画","parent":"575"},
|
||||
"1008":{"name":"方便面及其他方便食品","parent":"593"},
|
||||
"1009":{"name":"房地产经纪","parent":"550"},
|
||||
"1010":{"name":"母婴家政","parent":"661"},
|
||||
"1011":{"name":"居家养老健康服务","parent":"661"},
|
||||
"1012":{"name":"文化艺术投资","parent":"545"},
|
||||
"1013":{"name":"运动健身","parent":"660"},
|
||||
"1014":{"name":"瓶(罐)装饮用水制造","parent":"594"},
|
||||
"1015":{"name":"金属门窗","parent":"627"},
|
||||
"1016":{"name":"机动车检测","parent":"563"},
|
||||
"1017":{"name":"货物运输","parent":"634"},
|
||||
"1018":{"name":"服饰专卖","parent":"690"},
|
||||
"1019":{"name":"酒店服装","parent":"597"},
|
||||
"1020":{"name":"通讯软件","parent":"417"},
|
||||
"1021":{"name":"消防工程","parent":"554"},
|
||||
"1022":{"name":"嵌入式电子系统","parent":"419"},
|
||||
"1023":{"name":"航空票务","parent":"636"},
|
||||
"1024":{"name":"电气设备","parent":"626"},
|
||||
"1025":{"name":"酒业贸易","parent":"594"},
|
||||
"1027":{"name":"其他饮料及冷饮服务","parent":"656"},
|
||||
"1028":{"name":"乳制品","parent":"593"},
|
||||
"1029":{"name":"新闻期刊出版","parent":"588"},
|
||||
"1030":{"name":"水污染治理","parent":"672"},
|
||||
"1031":{"name":"谷物食品","parent":"593"},
|
||||
"1032":{"name":"数字动漫设计制造服务","parent":"590"},
|
||||
"1033":{"name":"医院","parent":"646"},
|
||||
"1034":{"name":"旅游广告","parent":"657"},
|
||||
"1035":{"name":"办公家具","parent":"602"},
|
||||
"1036":{"name":"房地产营销策划","parent":"550"},
|
||||
"1037":{"name":"保洁家政","parent":"661"},
|
||||
"1038":{"name":"水泥制造","parent":"627"},
|
||||
"1039":{"name":"市场研究咨询","parent":"567"},
|
||||
"1040":{"name":"驾校","parent":"571"},
|
||||
"1041":{"name":"正餐服务","parent":"656"},
|
||||
"1043":{"name":"机动车燃油","parent":"665"},
|
||||
"1044":{"name":"食品","parent":"593"},
|
||||
"1045":{"name":"新能源汽车","parent":"629"},
|
||||
"1046":{"name":"手机无线网络推广","parent":"417"},
|
||||
"1047":{"name":"环保设备","parent":"672"},
|
||||
"1048":{"name":"通讯工程","parent":"418"},
|
||||
"1049":{"name":"半导体集成电路","parent":"419"},
|
||||
"1050":{"name":"航空服务","parent":"636"},
|
||||
"1051":{"name":"电机设备","parent":"626"},
|
||||
"1052":{"name":"档案软件","parent":"414"},
|
||||
"1053":{"name":"冷链物流服务","parent":"634"},
|
||||
"1054":{"name":"小吃服务","parent":"656"},
|
||||
"1055":{"name":"水产品加工","parent":"593"},
|
||||
"1056":{"name":"图书出版","parent":"588"},
|
||||
"1057":{"name":"固体废物治理","parent":"672"},
|
||||
"1059":{"name":"坚果食品","parent":"593"},
|
||||
"1060":{"name":"广告传媒","parent":"579"},
|
||||
"1061":{"name":"电梯","parent":"622"},
|
||||
"1062":{"name":"社区医疗与卫生院","parent":"646"},
|
||||
"1063":{"name":"广告、印刷包装","parent":"630"},
|
||||
"1064":{"name":"婚纱礼服","parent":"662"},
|
||||
"1065":{"name":"地毯","parent":"602"},
|
||||
"1066":{"name":"互联网物业","parent":"551"},
|
||||
"1067":{"name":"跨境电商","parent":"3"},
|
||||
"1068":{"name":"信息安全、系统集成","parent":"9"},
|
||||
"1069":{"name":"专用汽车制造","parent":"750"},
|
||||
"1070":{"name":"商品贸易","parent":"3"},
|
||||
"1071":{"name":"墙壁装饰材料","parent":"746"},
|
||||
"1072":{"name":"窗帘装饰材料","parent":"746"},
|
||||
"1073":{"name":"电子商务、本地生活服务","parent":"3"},
|
||||
"1075":{"name":"白酒电子商务","parent":"3"},
|
||||
"1076":{"name":"商品贸易、电子商务","parent":"3"},
|
||||
"1077":{"name":"木质装饰材料","parent":"746"},
|
||||
"1078":{"name":"电子商务、汽车电商交易平台","parent":"3"},
|
||||
"1079":{"name":"汽车轮胎","parent":"751"},
|
||||
"1080":{"name":"气体压缩机械制造","parent":"732"},
|
||||
"1081":{"name":"家装家具电子商务","parent":"3"},
|
||||
"1082":{"name":"化妆品电子商务","parent":"3"},
|
||||
"1083":{"name":"汽车销售","parent":"749"},
|
||||
"1084":{"name":"新闻资讯网站","parent":"510"},
|
||||
"1085":{"name":"母婴电商","parent":"3"},
|
||||
"1086":{"name":"电商商务、收藏品交易","parent":"3"},
|
||||
"1088":{"name":"电子商务、数码产品","parent":"3"},
|
||||
"1089":{"name":"二手车交易","parent":"749"},
|
||||
"1090":{"name":"游戏制作服务","parent":"5"},
|
||||
"1091":{"name":"母婴服务","parent":"510"},
|
||||
"1092":{"name":"家具电子商务","parent":"3"},
|
||||
"1093":{"name":"汽车配件电子商务","parent":"3"},
|
||||
"1094":{"name":"输配电设备","parent":"777"},
|
||||
"1095":{"name":"矿山设备","parent":"727"},
|
||||
"1096":{"name":"机床机械","parent":"726"},
|
||||
"1097":{"name":"农产品电商","parent":"3"},
|
||||
"1098":{"name":"陶瓷装饰材料","parent":"746"},
|
||||
"1099":{"name":"车载联网设备","parent":"487"},
|
||||
"1100":{"name":"汽车销售电子商务","parent":"3"},
|
||||
"1101":{"name":"石油设备","parent":"730"},
|
||||
"1102":{"name":"智能家居","parent":"487"},
|
||||
"1103":{"name":"散热器","parent":"751"},
|
||||
"1104":{"name":"电力工程","parent":"775"},
|
||||
"1105":{"name":"生鲜电商","parent":"3"},
|
||||
"1106":{"name":"互联网数据服务","parent":"490"},
|
||||
"1107":{"name":"房车、商务车销售","parent":"749"},
|
||||
"1108":{"name":"茶叶电子商务","parent":"3"},
|
||||
"1109":{"name":"酒类电子商务","parent":"3"},
|
||||
"1110":{"name":"阀门","parent":"730"},
|
||||
"1111":{"name":"食品电商","parent":"3"},
|
||||
"1112":{"name":"儿童摄影","parent":"871"},
|
||||
"1113":{"name":"广告摄影","parent":"871"},
|
||||
"1114":{"name":"婚纱摄影","parent":"871"},
|
||||
"1115":{"name":"模具制造","parent":"620"},
|
||||
"1116":{"name":"汽车模具","parent":"629"},
|
||||
"1117":{"name":"认证咨询","parent":"567"},
|
||||
"1118":{"name":"数字视觉制作服务","parent":"590"},
|
||||
"1119":{"name":"牙科及医疗器械","parent":"646"},
|
||||
"1120":{"name":"猎头招聘","parent":"560"},
|
||||
"1121":{"name":"家居","parent":"518"},
|
||||
"1122":{"name":"收藏品","parent":"518"},
|
||||
"1123":{"name":"首饰","parent":"518"},
|
||||
"1124":{"name":"工艺品","parent":"518"},
|
||||
"1125":{"name":"财务","parent":"515"},
|
||||
"1126":{"name":"税务","parent":"515"},
|
||||
"1127":{"name":"分类信息","parent":"2"},
|
||||
"1128":{"name":"宠物","parent":"0"},
|
||||
"1129":{"name":"快消品","parent":"518"},
|
||||
"1130":{"name":"人工智能","parent":"2"},
|
||||
"1131":{"name":"农/林/牧/渔","parent":"0"}
|
||||
}
|
||||
|
||||
def get_names(id):
    """Return the name of industry ``id`` followed by its ancestors' names.

    ``id`` is converted with ``str()`` and looked up in ``TBL``; the walk
    then follows each entry's ``"parent"`` id until an id is not present in
    the table. The result is ordered from the given entry up to the
    top-most ancestor. An unknown ``id`` yields ``[]``.
    """
    entry = TBL.get(str(id))
    if not entry:
        return []
    # Own name first, then whatever the parent chain contributes.
    return [entry["name"]] + get_names(entry["parent"])
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke check: print the name chain for industry id "1119".
    print(
        get_names("1119")
    )
|
762
deepdoc/parser/resume/entities/regions.py
Normal file
762
deepdoc/parser/resume/entities/regions.py
Normal file
@ -0,0 +1,762 @@
|
||||
TBL = {
|
||||
"2":{"name":"北京","parent":"1"},
|
||||
"3":{"name":"天津","parent":"1"},
|
||||
"4":{"name":"河北","parent":"1"},
|
||||
"5":{"name":"山西","parent":"1"},
|
||||
"6":{"name":"内蒙古","parent":"1"},
|
||||
"7":{"name":"辽宁","parent":"1"},
|
||||
"8":{"name":"吉林","parent":"1"},
|
||||
"9":{"name":"黑龙江","parent":"1"},
|
||||
"10":{"name":"上海","parent":"1"},
|
||||
"11":{"name":"江苏","parent":"1"},
|
||||
"12":{"name":"浙江","parent":"1"},
|
||||
"13":{"name":"安徽","parent":"1"},
|
||||
"14":{"name":"福建","parent":"1"},
|
||||
"15":{"name":"江西","parent":"1"},
|
||||
"16":{"name":"山东","parent":"1"},
|
||||
"17":{"name":"河南","parent":"1"},
|
||||
"18":{"name":"湖北","parent":"1"},
|
||||
"19":{"name":"湖南","parent":"1"},
|
||||
"20":{"name":"广东","parent":"1"},
|
||||
"21":{"name":"广西","parent":"1"},
|
||||
"22":{"name":"海南","parent":"1"},
|
||||
"23":{"name":"重庆","parent":"1"},
|
||||
"24":{"name":"四川","parent":"1"},
|
||||
"25":{"name":"贵州","parent":"1"},
|
||||
"26":{"name":"云南","parent":"1"},
|
||||
"27":{"name":"西藏","parent":"1"},
|
||||
"28":{"name":"陕西","parent":"1"},
|
||||
"29":{"name":"甘肃","parent":"1"},
|
||||
"30":{"name":"青海","parent":"1"},
|
||||
"31":{"name":"宁夏","parent":"1"},
|
||||
"32":{"name":"新疆","parent":"1"},
|
||||
"33":{"name":"北京市","parent":"2"},
|
||||
"34":{"name":"天津市","parent":"3"},
|
||||
"35":{"name":"石家庄市","parent":"4"},
|
||||
"36":{"name":"唐山市","parent":"4"},
|
||||
"37":{"name":"秦皇岛市","parent":"4"},
|
||||
"38":{"name":"邯郸市","parent":"4"},
|
||||
"39":{"name":"邢台市","parent":"4"},
|
||||
"40":{"name":"保定市","parent":"4"},
|
||||
"41":{"name":"张家口市","parent":"4"},
|
||||
"42":{"name":"承德市","parent":"4"},
|
||||
"43":{"name":"沧州市","parent":"4"},
|
||||
"44":{"name":"廊坊市","parent":"4"},
|
||||
"45":{"name":"衡水市","parent":"4"},
|
||||
"46":{"name":"太原市","parent":"5"},
|
||||
"47":{"name":"大同市","parent":"5"},
|
||||
"48":{"name":"阳泉市","parent":"5"},
|
||||
"49":{"name":"长治市","parent":"5"},
|
||||
"50":{"name":"晋城市","parent":"5"},
|
||||
"51":{"name":"朔州市","parent":"5"},
|
||||
"52":{"name":"晋中市","parent":"5"},
|
||||
"53":{"name":"运城市","parent":"5"},
|
||||
"54":{"name":"忻州市","parent":"5"},
|
||||
"55":{"name":"临汾市","parent":"5"},
|
||||
"56":{"name":"吕梁市","parent":"5"},
|
||||
"57":{"name":"呼和浩特市","parent":"6"},
|
||||
"58":{"name":"包头市","parent":"6"},
|
||||
"59":{"name":"乌海市","parent":"6"},
|
||||
"60":{"name":"赤峰市","parent":"6"},
|
||||
"61":{"name":"通辽市","parent":"6"},
|
||||
"62":{"name":"鄂尔多斯市","parent":"6"},
|
||||
"63":{"name":"呼伦贝尔市","parent":"6"},
|
||||
"64":{"name":"巴彦淖尔市","parent":"6"},
|
||||
"65":{"name":"乌兰察布市","parent":"6"},
|
||||
"66":{"name":"兴安盟","parent":"6"},
|
||||
"67":{"name":"锡林郭勒盟","parent":"6"},
|
||||
"68":{"name":"阿拉善盟","parent":"6"},
|
||||
"69":{"name":"沈阳市","parent":"7"},
|
||||
"70":{"name":"大连市","parent":"7"},
|
||||
"71":{"name":"鞍山市","parent":"7"},
|
||||
"72":{"name":"抚顺市","parent":"7"},
|
||||
"73":{"name":"本溪市","parent":"7"},
|
||||
"74":{"name":"丹东市","parent":"7"},
|
||||
"75":{"name":"锦州市","parent":"7"},
|
||||
"76":{"name":"营口市","parent":"7"},
|
||||
"77":{"name":"阜新市","parent":"7"},
|
||||
"78":{"name":"辽阳市","parent":"7"},
|
||||
"79":{"name":"盘锦市","parent":"7"},
|
||||
"80":{"name":"铁岭市","parent":"7"},
|
||||
"81":{"name":"朝阳市","parent":"7"},
|
||||
"82":{"name":"葫芦岛市","parent":"7"},
|
||||
"83":{"name":"长春市","parent":"8"},
|
||||
"84":{"name":"吉林市","parent":"8"},
|
||||
"85":{"name":"四平市","parent":"8"},
|
||||
"86":{"name":"辽源市","parent":"8"},
|
||||
"87":{"name":"通化市","parent":"8"},
|
||||
"88":{"name":"白山市","parent":"8"},
|
||||
"89":{"name":"松原市","parent":"8"},
|
||||
"90":{"name":"白城市","parent":"8"},
|
||||
"91":{"name":"延边朝鲜族自治州","parent":"8"},
|
||||
"92":{"name":"哈尔滨市","parent":"9"},
|
||||
"93":{"name":"齐齐哈尔市","parent":"9"},
|
||||
"94":{"name":"鸡西市","parent":"9"},
|
||||
"95":{"name":"鹤岗市","parent":"9"},
|
||||
"96":{"name":"双鸭山市","parent":"9"},
|
||||
"97":{"name":"大庆市","parent":"9"},
|
||||
"98":{"name":"伊春市","parent":"9"},
|
||||
"99":{"name":"佳木斯市","parent":"9"},
|
||||
"100":{"name":"七台河市","parent":"9"},
|
||||
"101":{"name":"牡丹江市","parent":"9"},
|
||||
"102":{"name":"黑河市","parent":"9"},
|
||||
"103":{"name":"绥化市","parent":"9"},
|
||||
"104":{"name":"大兴安岭地区","parent":"9"},
|
||||
"105":{"name":"上海市","parent":"10"},
|
||||
"106":{"name":"南京市","parent":"11"},
|
||||
"107":{"name":"无锡市","parent":"11"},
|
||||
"108":{"name":"徐州市","parent":"11"},
|
||||
"109":{"name":"常州市","parent":"11"},
|
||||
"110":{"name":"苏州市","parent":"11"},
|
||||
"111":{"name":"南通市","parent":"11"},
|
||||
"112":{"name":"连云港市","parent":"11"},
|
||||
"113":{"name":"淮安市","parent":"11"},
|
||||
"114":{"name":"盐城市","parent":"11"},
|
||||
"115":{"name":"扬州市","parent":"11"},
|
||||
"116":{"name":"镇江市","parent":"11"},
|
||||
"117":{"name":"泰州市","parent":"11"},
|
||||
"118":{"name":"宿迁市","parent":"11"},
|
||||
"119":{"name":"杭州市","parent":"12"},
|
||||
"120":{"name":"宁波市","parent":"12"},
|
||||
"121":{"name":"温州市","parent":"12"},
|
||||
"122":{"name":"嘉兴市","parent":"12"},
|
||||
"123":{"name":"湖州市","parent":"12"},
|
||||
"124":{"name":"绍兴市","parent":"12"},
|
||||
"125":{"name":"金华市","parent":"12"},
|
||||
"126":{"name":"衢州市","parent":"12"},
|
||||
"127":{"name":"舟山市","parent":"12"},
|
||||
"128":{"name":"台州市","parent":"12"},
|
||||
"129":{"name":"丽水市","parent":"12"},
|
||||
"130":{"name":"合肥市","parent":"13"},
|
||||
"131":{"name":"芜湖市","parent":"13"},
|
||||
"132":{"name":"蚌埠市","parent":"13"},
|
||||
"133":{"name":"淮南市","parent":"13"},
|
||||
"134":{"name":"马鞍山市","parent":"13"},
|
||||
"135":{"name":"淮北市","parent":"13"},
|
||||
"136":{"name":"铜陵市","parent":"13"},
|
||||
"137":{"name":"安庆市","parent":"13"},
|
||||
"138":{"name":"黄山市","parent":"13"},
|
||||
"139":{"name":"滁州市","parent":"13"},
|
||||
"140":{"name":"阜阳市","parent":"13"},
|
||||
"141":{"name":"宿州市","parent":"13"},
|
||||
"143":{"name":"六安市","parent":"13"},
|
||||
"144":{"name":"亳州市","parent":"13"},
|
||||
"145":{"name":"池州市","parent":"13"},
|
||||
"146":{"name":"宣城市","parent":"13"},
|
||||
"147":{"name":"福州市","parent":"14"},
|
||||
"148":{"name":"厦门市","parent":"14"},
|
||||
"149":{"name":"莆田市","parent":"14"},
|
||||
"150":{"name":"三明市","parent":"14"},
|
||||
"151":{"name":"泉州市","parent":"14"},
|
||||
"152":{"name":"漳州市","parent":"14"},
|
||||
"153":{"name":"南平市","parent":"14"},
|
||||
"154":{"name":"龙岩市","parent":"14"},
|
||||
"155":{"name":"宁德市","parent":"14"},
|
||||
"156":{"name":"南昌市","parent":"15"},
|
||||
"157":{"name":"景德镇市","parent":"15"},
|
||||
"158":{"name":"萍乡市","parent":"15"},
|
||||
"159":{"name":"九江市","parent":"15"},
|
||||
"160":{"name":"新余市","parent":"15"},
|
||||
"161":{"name":"鹰潭市","parent":"15"},
|
||||
"162":{"name":"赣州市","parent":"15"},
|
||||
"163":{"name":"吉安市","parent":"15"},
|
||||
"164":{"name":"宜春市","parent":"15"},
|
||||
"165":{"name":"抚州市","parent":"15"},
|
||||
"166":{"name":"上饶市","parent":"15"},
|
||||
"167":{"name":"济南市","parent":"16"},
|
||||
"168":{"name":"青岛市","parent":"16"},
|
||||
"169":{"name":"淄博市","parent":"16"},
|
||||
"170":{"name":"枣庄市","parent":"16"},
|
||||
"171":{"name":"东营市","parent":"16"},
|
||||
"172":{"name":"烟台市","parent":"16"},
|
||||
"173":{"name":"潍坊市","parent":"16"},
|
||||
"174":{"name":"济宁市","parent":"16"},
|
||||
"175":{"name":"泰安市","parent":"16"},
|
||||
"176":{"name":"威海市","parent":"16"},
|
||||
"177":{"name":"日照市","parent":"16"},
|
||||
"179":{"name":"临沂市","parent":"16"},
|
||||
"180":{"name":"德州市","parent":"16"},
|
||||
"181":{"name":"聊城市","parent":"16"},
|
||||
"182":{"name":"滨州市","parent":"16"},
|
||||
"183":{"name":"菏泽市","parent":"16"},
|
||||
"184":{"name":"郑州市","parent":"17"},
|
||||
"185":{"name":"开封市","parent":"17"},
|
||||
"186":{"name":"洛阳市","parent":"17"},
|
||||
"187":{"name":"平顶山市","parent":"17"},
|
||||
"188":{"name":"安阳市","parent":"17"},
|
||||
"189":{"name":"鹤壁市","parent":"17"},
|
||||
"190":{"name":"新乡市","parent":"17"},
|
||||
"191":{"name":"焦作市","parent":"17"},
|
||||
"192":{"name":"濮阳市","parent":"17"},
|
||||
"193":{"name":"许昌市","parent":"17"},
|
||||
"194":{"name":"漯河市","parent":"17"},
|
||||
"195":{"name":"三门峡市","parent":"17"},
|
||||
"196":{"name":"南阳市","parent":"17"},
|
||||
"197":{"name":"商丘市","parent":"17"},
|
||||
"198":{"name":"信阳市","parent":"17"},
|
||||
"199":{"name":"周口市","parent":"17"},
|
||||
"200":{"name":"驻马店市","parent":"17"},
|
||||
"201":{"name":"武汉市","parent":"18"},
|
||||
"202":{"name":"黄石市","parent":"18"},
|
||||
"203":{"name":"十堰市","parent":"18"},
|
||||
"204":{"name":"宜昌市","parent":"18"},
|
||||
"205":{"name":"襄阳市","parent":"18"},
|
||||
"206":{"name":"鄂州市","parent":"18"},
|
||||
"207":{"name":"荆门市","parent":"18"},
|
||||
"208":{"name":"孝感市","parent":"18"},
|
||||
"209":{"name":"荆州市","parent":"18"},
|
||||
"210":{"name":"黄冈市","parent":"18"},
|
||||
"211":{"name":"咸宁市","parent":"18"},
|
||||
"212":{"name":"随州市","parent":"18"},
|
||||
"213":{"name":"恩施土家族苗族自治州","parent":"18"},
|
||||
"215":{"name":"长沙市","parent":"19"},
|
||||
"216":{"name":"株洲市","parent":"19"},
|
||||
"217":{"name":"湘潭市","parent":"19"},
|
||||
"218":{"name":"衡阳市","parent":"19"},
|
||||
"219":{"name":"邵阳市","parent":"19"},
|
||||
"220":{"name":"岳阳市","parent":"19"},
|
||||
"221":{"name":"常德市","parent":"19"},
|
||||
"222":{"name":"张家界市","parent":"19"},
|
||||
"223":{"name":"益阳市","parent":"19"},
|
||||
"224":{"name":"郴州市","parent":"19"},
|
||||
"225":{"name":"永州市","parent":"19"},
|
||||
"226":{"name":"怀化市","parent":"19"},
|
||||
"227":{"name":"娄底市","parent":"19"},
|
||||
"228":{"name":"湘西土家族苗族自治州","parent":"19"},
|
||||
"229":{"name":"广州市","parent":"20"},
|
||||
"230":{"name":"韶关市","parent":"20"},
|
||||
"231":{"name":"深圳市","parent":"20"},
|
||||
"232":{"name":"珠海市","parent":"20"},
|
||||
"233":{"name":"汕头市","parent":"20"},
|
||||
"234":{"name":"佛山市","parent":"20"},
|
||||
"235":{"name":"江门市","parent":"20"},
|
||||
"236":{"name":"湛江市","parent":"20"},
|
||||
"237":{"name":"茂名市","parent":"20"},
|
||||
"238":{"name":"肇庆市","parent":"20"},
|
||||
"239":{"name":"惠州市","parent":"20"},
|
||||
"240":{"name":"梅州市","parent":"20"},
|
||||
"241":{"name":"汕尾市","parent":"20"},
|
||||
"242":{"name":"河源市","parent":"20"},
|
||||
"243":{"name":"阳江市","parent":"20"},
|
||||
"244":{"name":"清远市","parent":"20"},
|
||||
"245":{"name":"东莞市","parent":"20"},
|
||||
"246":{"name":"中山市","parent":"20"},
|
||||
"247":{"name":"潮州市","parent":"20"},
|
||||
"248":{"name":"揭阳市","parent":"20"},
|
||||
"249":{"name":"云浮市","parent":"20"},
|
||||
"250":{"name":"南宁市","parent":"21"},
|
||||
"251":{"name":"柳州市","parent":"21"},
|
||||
"252":{"name":"桂林市","parent":"21"},
|
||||
"253":{"name":"梧州市","parent":"21"},
|
||||
"254":{"name":"北海市","parent":"21"},
|
||||
"255":{"name":"防城港市","parent":"21"},
|
||||
"256":{"name":"钦州市","parent":"21"},
|
||||
"257":{"name":"贵港市","parent":"21"},
|
||||
"258":{"name":"玉林市","parent":"21"},
|
||||
"259":{"name":"百色市","parent":"21"},
|
||||
"260":{"name":"贺州市","parent":"21"},
|
||||
"261":{"name":"河池市","parent":"21"},
|
||||
"262":{"name":"来宾市","parent":"21"},
|
||||
"263":{"name":"崇左市","parent":"21"},
|
||||
"264":{"name":"海口市","parent":"22"},
|
||||
"265":{"name":"三亚市","parent":"22"},
|
||||
"267":{"name":"重庆市","parent":"23"},
|
||||
"268":{"name":"成都市","parent":"24"},
|
||||
"269":{"name":"自贡市","parent":"24"},
|
||||
"270":{"name":"攀枝花市","parent":"24"},
|
||||
"271":{"name":"泸州市","parent":"24"},
|
||||
"272":{"name":"德阳市","parent":"24"},
|
||||
"273":{"name":"绵阳市","parent":"24"},
|
||||
"274":{"name":"广元市","parent":"24"},
|
||||
"275":{"name":"遂宁市","parent":"24"},
|
||||
"276":{"name":"内江市","parent":"24"},
|
||||
"277":{"name":"乐山市","parent":"24"},
|
||||
"278":{"name":"南充市","parent":"24"},
|
||||
"279":{"name":"眉山市","parent":"24"},
|
||||
"280":{"name":"宜宾市","parent":"24"},
|
||||
"281":{"name":"广安市","parent":"24"},
|
||||
"282":{"name":"达州市","parent":"24"},
|
||||
"283":{"name":"雅安市","parent":"24"},
|
||||
"284":{"name":"巴中市","parent":"24"},
|
||||
"285":{"name":"资阳市","parent":"24"},
|
||||
"286":{"name":"阿坝藏族羌族自治州","parent":"24"},
|
||||
"287":{"name":"甘孜藏族自治州","parent":"24"},
|
||||
"288":{"name":"凉山彝族自治州","parent":"24"},
|
||||
"289":{"name":"贵阳市","parent":"25"},
|
||||
"290":{"name":"六盘水市","parent":"25"},
|
||||
"291":{"name":"遵义市","parent":"25"},
|
||||
"292":{"name":"安顺市","parent":"25"},
|
||||
"293":{"name":"铜仁市","parent":"25"},
|
||||
"294":{"name":"黔西南布依族苗族自治州","parent":"25"},
|
||||
"295":{"name":"毕节市","parent":"25"},
|
||||
"296":{"name":"黔东南苗族侗族自治州","parent":"25"},
|
||||
"297":{"name":"黔南布依族苗族自治州","parent":"25"},
|
||||
"298":{"name":"昆明市","parent":"26"},
|
||||
"299":{"name":"曲靖市","parent":"26"},
|
||||
"300":{"name":"玉溪市","parent":"26"},
|
||||
"301":{"name":"保山市","parent":"26"},
|
||||
"302":{"name":"昭通市","parent":"26"},
|
||||
"303":{"name":"丽江市","parent":"26"},
|
||||
"304":{"name":"普洱市","parent":"26"},
|
||||
"305":{"name":"临沧市","parent":"26"},
|
||||
"306":{"name":"楚雄彝族自治州","parent":"26"},
|
||||
"307":{"name":"红河哈尼族彝族自治州","parent":"26"},
|
||||
"308":{"name":"文山壮族苗族自治州","parent":"26"},
|
||||
"309":{"name":"西双版纳傣族自治州","parent":"26"},
|
||||
"310":{"name":"大理白族自治州","parent":"26"},
|
||||
"311":{"name":"德宏傣族景颇族自治州","parent":"26"},
|
||||
"312":{"name":"怒江傈僳族自治州","parent":"26"},
|
||||
"313":{"name":"迪庆藏族自治州","parent":"26"},
|
||||
"314":{"name":"拉萨市","parent":"27"},
|
||||
"315":{"name":"昌都市","parent":"27"},
|
||||
"316":{"name":"山南市","parent":"27"},
|
||||
"317":{"name":"日喀则市","parent":"27"},
|
||||
"318":{"name":"那曲市","parent":"27"},
|
||||
"319":{"name":"阿里地区","parent":"27"},
|
||||
"320":{"name":"林芝市","parent":"27"},
|
||||
"321":{"name":"西安市","parent":"28"},
|
||||
"322":{"name":"铜川市","parent":"28"},
|
||||
"323":{"name":"宝鸡市","parent":"28"},
|
||||
"324":{"name":"咸阳市","parent":"28"},
|
||||
"325":{"name":"渭南市","parent":"28"},
|
||||
"326":{"name":"延安市","parent":"28"},
|
||||
"327":{"name":"汉中市","parent":"28"},
|
||||
"328":{"name":"榆林市","parent":"28"},
|
||||
"329":{"name":"安康市","parent":"28"},
|
||||
"330":{"name":"商洛市","parent":"28"},
|
||||
"331":{"name":"兰州市","parent":"29"},
|
||||
"332":{"name":"嘉峪关市","parent":"29"},
|
||||
"333":{"name":"金昌市","parent":"29"},
|
||||
"334":{"name":"白银市","parent":"29"},
|
||||
"335":{"name":"天水市","parent":"29"},
|
||||
"336":{"name":"武威市","parent":"29"},
|
||||
"337":{"name":"张掖市","parent":"29"},
|
||||
"338":{"name":"平凉市","parent":"29"},
|
||||
"339":{"name":"酒泉市","parent":"29"},
|
||||
"340":{"name":"庆阳市","parent":"29"},
|
||||
"341":{"name":"定西市","parent":"29"},
|
||||
"342":{"name":"陇南市","parent":"29"},
|
||||
"343":{"name":"临夏回族自治州","parent":"29"},
|
||||
"344":{"name":"甘南藏族自治州","parent":"29"},
|
||||
"345":{"name":"西宁市","parent":"30"},
|
||||
"346":{"name":"海东市","parent":"30"},
|
||||
"347":{"name":"海北藏族自治州","parent":"30"},
|
||||
"348":{"name":"黄南藏族自治州","parent":"30"},
|
||||
"349":{"name":"海南藏族自治州","parent":"30"},
|
||||
"350":{"name":"果洛藏族自治州","parent":"30"},
|
||||
"351":{"name":"玉树藏族自治州","parent":"30"},
|
||||
"352":{"name":"海西蒙古族藏族自治州","parent":"30"},
|
||||
"353":{"name":"银川市","parent":"31"},
|
||||
"354":{"name":"石嘴山市","parent":"31"},
|
||||
"355":{"name":"吴忠市","parent":"31"},
|
||||
"356":{"name":"固原市","parent":"31"},
|
||||
"357":{"name":"中卫市","parent":"31"},
|
||||
"358":{"name":"乌鲁木齐市","parent":"32"},
|
||||
"359":{"name":"克拉玛依市","parent":"32"},
|
||||
"360":{"name":"吐鲁番市","parent":"32"},
|
||||
"361":{"name":"哈密市","parent":"32"},
|
||||
"362":{"name":"昌吉回族自治州","parent":"32"},
|
||||
"363":{"name":"博尔塔拉蒙古自治州","parent":"32"},
|
||||
"364":{"name":"巴音郭楞蒙古自治州","parent":"32"},
|
||||
"365":{"name":"阿克苏地区","parent":"32"},
|
||||
"366":{"name":"克孜勒苏柯尔克孜自治州","parent":"32"},
|
||||
"367":{"name":"喀什地区","parent":"32"},
|
||||
"368":{"name":"和田地区","parent":"32"},
|
||||
"369":{"name":"伊犁哈萨克自治州","parent":"32"},
|
||||
"370":{"name":"塔城地区","parent":"32"},
|
||||
"371":{"name":"阿勒泰地区","parent":"32"},
|
||||
"372":{"name":"新疆省直辖行政单位","parent":"32"},
|
||||
"373":{"name":"可克达拉市","parent":"32"},
|
||||
"374":{"name":"昆玉市","parent":"32"},
|
||||
"375":{"name":"胡杨河市","parent":"32"},
|
||||
"376":{"name":"双河市","parent":"32"},
|
||||
"3560":{"name":"北票市","parent":"7"},
|
||||
"3615":{"name":"高州市","parent":"20"},
|
||||
"3651":{"name":"济源市","parent":"17"},
|
||||
"3662":{"name":"胶南市","parent":"16"},
|
||||
"3683":{"name":"老河口市","parent":"18"},
|
||||
"3758":{"name":"沙河市","parent":"4"},
|
||||
"3822":{"name":"宜城市","parent":"18"},
|
||||
"3842":{"name":"枣阳市","parent":"18"},
|
||||
"3850":{"name":"肇东市","parent":"9"},
|
||||
"3905":{"name":"澳门","parent":"1"},
|
||||
"3906":{"name":"澳门","parent":"3905"},
|
||||
"3907":{"name":"香港","parent":"1"},
|
||||
"3908":{"name":"香港","parent":"3907"},
|
||||
"3947":{"name":"仙桃市","parent":"18"},
|
||||
"3954":{"name":"台湾","parent":"1"},
|
||||
"3955":{"name":"台湾","parent":"3954"},
|
||||
"3956":{"name":"海外","parent":"1"},
|
||||
"3957":{"name":"海外","parent":"3956"},
|
||||
"3958":{"name":"美国","parent":"3956"},
|
||||
"3959":{"name":"加拿大","parent":"3956"},
|
||||
"3961":{"name":"日本","parent":"3956"},
|
||||
"3962":{"name":"韩国","parent":"3956"},
|
||||
"3963":{"name":"德国","parent":"3956"},
|
||||
"3964":{"name":"英国","parent":"3956"},
|
||||
"3965":{"name":"意大利","parent":"3956"},
|
||||
"3966":{"name":"西班牙","parent":"3956"},
|
||||
"3967":{"name":"法国","parent":"3956"},
|
||||
"3968":{"name":"澳大利亚","parent":"3956"},
|
||||
"3969":{"name":"东城区","parent":"2"},
|
||||
"3970":{"name":"西城区","parent":"2"},
|
||||
"3971":{"name":"崇文区","parent":"2"},
|
||||
"3972":{"name":"宣武区","parent":"2"},
|
||||
"3973":{"name":"朝阳区","parent":"2"},
|
||||
"3974":{"name":"海淀区","parent":"2"},
|
||||
"3975":{"name":"丰台区","parent":"2"},
|
||||
"3976":{"name":"石景山区","parent":"2"},
|
||||
"3977":{"name":"门头沟区","parent":"2"},
|
||||
"3978":{"name":"房山区","parent":"2"},
|
||||
"3979":{"name":"通州区","parent":"2"},
|
||||
"3980":{"name":"顺义区","parent":"2"},
|
||||
"3981":{"name":"昌平区","parent":"2"},
|
||||
"3982":{"name":"大兴区","parent":"2"},
|
||||
"3983":{"name":"平谷区","parent":"2"},
|
||||
"3984":{"name":"怀柔区","parent":"2"},
|
||||
"3985":{"name":"密云区","parent":"2"},
|
||||
"3986":{"name":"延庆区","parent":"2"},
|
||||
"3987":{"name":"黄浦区","parent":"10"},
|
||||
"3988":{"name":"徐汇区","parent":"10"},
|
||||
"3989":{"name":"长宁区","parent":"10"},
|
||||
"3990":{"name":"静安区","parent":"10"},
|
||||
"3991":{"name":"普陀区","parent":"10"},
|
||||
"3992":{"name":"闸北区","parent":"10"},
|
||||
"3993":{"name":"虹口区","parent":"10"},
|
||||
"3994":{"name":"杨浦区","parent":"10"},
|
||||
"3995":{"name":"宝山区","parent":"10"},
|
||||
"3996":{"name":"闵行区","parent":"10"},
|
||||
"3997":{"name":"嘉定区","parent":"10"},
|
||||
"3998":{"name":"浦东新区","parent":"10"},
|
||||
"3999":{"name":"松江区","parent":"10"},
|
||||
"4000":{"name":"金山区","parent":"10"},
|
||||
"4001":{"name":"青浦区","parent":"10"},
|
||||
"4002":{"name":"奉贤区","parent":"10"},
|
||||
"4003":{"name":"崇明区","parent":"10"},
|
||||
"4004":{"name":"和平区","parent":"3"},
|
||||
"4005":{"name":"河东区","parent":"3"},
|
||||
"4006":{"name":"河西区","parent":"3"},
|
||||
"4007":{"name":"南开区","parent":"3"},
|
||||
"4008":{"name":"红桥区","parent":"3"},
|
||||
"4009":{"name":"河北区","parent":"3"},
|
||||
"4010":{"name":"滨海新区","parent":"3"},
|
||||
"4011":{"name":"东丽区","parent":"3"},
|
||||
"4012":{"name":"西青区","parent":"3"},
|
||||
"4013":{"name":"北辰区","parent":"3"},
|
||||
"4014":{"name":"津南区","parent":"3"},
|
||||
"4015":{"name":"武清区","parent":"3"},
|
||||
"4016":{"name":"宝坻区","parent":"3"},
|
||||
"4017":{"name":"静海区","parent":"3"},
|
||||
"4018":{"name":"宁河区","parent":"3"},
|
||||
"4019":{"name":"蓟州区","parent":"3"},
|
||||
"4020":{"name":"渝中区","parent":"23"},
|
||||
"4021":{"name":"江北区","parent":"23"},
|
||||
"4022":{"name":"南岸区","parent":"23"},
|
||||
"4023":{"name":"沙坪坝区","parent":"23"},
|
||||
"4024":{"name":"九龙坡区","parent":"23"},
|
||||
"4025":{"name":"大渡口区","parent":"23"},
|
||||
"4026":{"name":"渝北区","parent":"23"},
|
||||
"4027":{"name":"巴南区","parent":"23"},
|
||||
"4028":{"name":"北碚区","parent":"23"},
|
||||
"4029":{"name":"万州区","parent":"23"},
|
||||
"4030":{"name":"黔江区","parent":"23"},
|
||||
"4031":{"name":"永川区","parent":"23"},
|
||||
"4032":{"name":"涪陵区","parent":"23"},
|
||||
"4033":{"name":"江津区","parent":"23"},
|
||||
"4034":{"name":"合川区","parent":"23"},
|
||||
"4035":{"name":"双桥区","parent":"23"},
|
||||
"4036":{"name":"万盛区","parent":"23"},
|
||||
"4037":{"name":"荣昌区","parent":"23"},
|
||||
"4038":{"name":"大足区","parent":"23"},
|
||||
"4039":{"name":"璧山区","parent":"23"},
|
||||
"4040":{"name":"铜梁区","parent":"23"},
|
||||
"4041":{"name":"潼南区","parent":"23"},
|
||||
"4042":{"name":"綦江区","parent":"23"},
|
||||
"4043":{"name":"忠县","parent":"23"},
|
||||
"4044":{"name":"开州区","parent":"23"},
|
||||
"4045":{"name":"云阳县","parent":"23"},
|
||||
"4046":{"name":"梁平区","parent":"23"},
|
||||
"4047":{"name":"垫江县","parent":"23"},
|
||||
"4048":{"name":"丰都县","parent":"23"},
|
||||
"4049":{"name":"奉节县","parent":"23"},
|
||||
"4050":{"name":"巫山县","parent":"23"},
|
||||
"4051":{"name":"巫溪县","parent":"23"},
|
||||
"4052":{"name":"城口县","parent":"23"},
|
||||
"4053":{"name":"武隆区","parent":"23"},
|
||||
"4054":{"name":"石柱土家族自治县","parent":"23"},
|
||||
"4055":{"name":"秀山土家族苗族自治县","parent":"23"},
|
||||
"4056":{"name":"酉阳土家族苗族自治县","parent":"23"},
|
||||
"4057":{"name":"彭水苗族土家族自治县","parent":"23"},
|
||||
"4058":{"name":"潜江市","parent":"18"},
|
||||
"4059":{"name":"三沙市","parent":"22"},
|
||||
"4060":{"name":"石河子市","parent":"32"},
|
||||
"4061":{"name":"阿拉尔市","parent":"32"},
|
||||
"4062":{"name":"图木舒克市","parent":"32"},
|
||||
"4063":{"name":"五家渠市","parent":"32"},
|
||||
"4064":{"name":"北屯市","parent":"32"},
|
||||
"4065":{"name":"铁门关市","parent":"32"},
|
||||
"4066":{"name":"儋州市","parent":"22"},
|
||||
"4067":{"name":"五指山市","parent":"22"},
|
||||
"4068":{"name":"文昌市","parent":"22"},
|
||||
"4069":{"name":"琼海市","parent":"22"},
|
||||
"4070":{"name":"万宁市","parent":"22"},
|
||||
"4072":{"name":"定安县","parent":"22"},
|
||||
"4073":{"name":"屯昌县","parent":"22"},
|
||||
"4074":{"name":"澄迈县","parent":"22"},
|
||||
"4075":{"name":"临高县","parent":"22"},
|
||||
"4076":{"name":"琼中黎族苗族自治县","parent":"22"},
|
||||
"4077":{"name":"保亭黎族苗族自治县","parent":"22"},
|
||||
"4078":{"name":"白沙黎族自治县","parent":"22"},
|
||||
"4079":{"name":"昌江黎族自治县","parent":"22"},
|
||||
"4080":{"name":"乐东黎族自治县","parent":"22"},
|
||||
"4081":{"name":"陵水黎族自治县","parent":"22"},
|
||||
"4082":{"name":"马来西亚","parent":"3956"},
|
||||
"6047":{"name":"长寿区","parent":"23"},
|
||||
"6857":{"name":"阿富汗","parent":"3956"},
|
||||
"6858":{"name":"阿尔巴尼亚","parent":"3956"},
|
||||
"6859":{"name":"阿尔及利亚","parent":"3956"},
|
||||
"6860":{"name":"美属萨摩亚","parent":"3956"},
|
||||
"6861":{"name":"安道尔","parent":"3956"},
|
||||
"6862":{"name":"安哥拉","parent":"3956"},
|
||||
"6863":{"name":"安圭拉","parent":"3956"},
|
||||
"6864":{"name":"南极洲","parent":"3956"},
|
||||
"6865":{"name":"安提瓜和巴布达","parent":"3956"},
|
||||
"6866":{"name":"阿根廷","parent":"3956"},
|
||||
"6867":{"name":"亚美尼亚","parent":"3956"},
|
||||
"6869":{"name":"奥地利","parent":"3956"},
|
||||
"6870":{"name":"阿塞拜疆","parent":"3956"},
|
||||
"6871":{"name":"巴哈马","parent":"3956"},
|
||||
"6872":{"name":"巴林","parent":"3956"},
|
||||
"6873":{"name":"孟加拉国","parent":"3956"},
|
||||
"6874":{"name":"巴巴多斯","parent":"3956"},
|
||||
"6875":{"name":"白俄罗斯","parent":"3956"},
|
||||
"6876":{"name":"比利时","parent":"3956"},
|
||||
"6877":{"name":"伯利兹","parent":"3956"},
|
||||
"6878":{"name":"贝宁","parent":"3956"},
|
||||
"6879":{"name":"百慕大","parent":"3956"},
|
||||
"6880":{"name":"不丹","parent":"3956"},
|
||||
"6881":{"name":"玻利维亚","parent":"3956"},
|
||||
"6882":{"name":"波黑","parent":"3956"},
|
||||
"6883":{"name":"博茨瓦纳","parent":"3956"},
|
||||
"6884":{"name":"布维岛","parent":"3956"},
|
||||
"6885":{"name":"巴西","parent":"3956"},
|
||||
"6886":{"name":"英属印度洋领土","parent":"3956"},
|
||||
"6887":{"name":"文莱","parent":"3956"},
|
||||
"6888":{"name":"保加利亚","parent":"3956"},
|
||||
"6889":{"name":"布基纳法索","parent":"3956"},
|
||||
"6890":{"name":"布隆迪","parent":"3956"},
|
||||
"6891":{"name":"柬埔寨","parent":"3956"},
|
||||
"6892":{"name":"喀麦隆","parent":"3956"},
|
||||
"6893":{"name":"佛得角","parent":"3956"},
|
||||
"6894":{"name":"开曼群岛","parent":"3956"},
|
||||
"6895":{"name":"中非","parent":"3956"},
|
||||
"6896":{"name":"乍得","parent":"3956"},
|
||||
"6897":{"name":"智利","parent":"3956"},
|
||||
"6898":{"name":"圣诞岛","parent":"3956"},
|
||||
"6899":{"name":"科科斯(基林)群岛","parent":"3956"},
|
||||
"6900":{"name":"哥伦比亚","parent":"3956"},
|
||||
"6901":{"name":"科摩罗","parent":"3956"},
|
||||
"6902":{"name":"刚果(布)","parent":"3956"},
|
||||
"6903":{"name":"刚果(金)","parent":"3956"},
|
||||
"6904":{"name":"库克群岛","parent":"3956"},
|
||||
"6905":{"name":"哥斯达黎加","parent":"3956"},
|
||||
"6906":{"name":"科特迪瓦","parent":"3956"},
|
||||
"6907":{"name":"克罗地亚","parent":"3956"},
|
||||
"6908":{"name":"古巴","parent":"3956"},
|
||||
"6909":{"name":"塞浦路斯","parent":"3956"},
|
||||
"6910":{"name":"捷克","parent":"3956"},
|
||||
"6911":{"name":"丹麦","parent":"3956"},
|
||||
"6912":{"name":"吉布提","parent":"3956"},
|
||||
"6913":{"name":"多米尼克","parent":"3956"},
|
||||
"6914":{"name":"多米尼加共和国","parent":"3956"},
|
||||
"6915":{"name":"东帝汶","parent":"3956"},
|
||||
"6916":{"name":"厄瓜多尔","parent":"3956"},
|
||||
"6917":{"name":"埃及","parent":"3956"},
|
||||
"6918":{"name":"萨尔瓦多","parent":"3956"},
|
||||
"6919":{"name":"赤道几内亚","parent":"3956"},
|
||||
"6920":{"name":"厄立特里亚","parent":"3956"},
|
||||
"6921":{"name":"爱沙尼亚","parent":"3956"},
|
||||
"6922":{"name":"埃塞俄比亚","parent":"3956"},
|
||||
"6923":{"name":"福克兰群岛(马尔维纳斯)","parent":"3956"},
|
||||
"6924":{"name":"法罗群岛","parent":"3956"},
|
||||
"6925":{"name":"斐济","parent":"3956"},
|
||||
"6926":{"name":"芬兰","parent":"3956"},
|
||||
"6927":{"name":"法属圭亚那","parent":"3956"},
|
||||
"6928":{"name":"法属波利尼西亚","parent":"3956"},
|
||||
"6929":{"name":"法属南部领土","parent":"3956"},
|
||||
"6930":{"name":"加蓬","parent":"3956"},
|
||||
"6931":{"name":"冈比亚","parent":"3956"},
|
||||
"6932":{"name":"格鲁吉亚","parent":"3956"},
|
||||
"6933":{"name":"加纳","parent":"3956"},
|
||||
"6934":{"name":"直布罗陀","parent":"3956"},
|
||||
"6935":{"name":"希腊","parent":"3956"},
|
||||
"6936":{"name":"格陵兰","parent":"3956"},
|
||||
"6937":{"name":"格林纳达","parent":"3956"},
|
||||
"6938":{"name":"瓜德罗普","parent":"3956"},
|
||||
"6939":{"name":"关岛","parent":"3956"},
|
||||
"6940":{"name":"危地马拉","parent":"3956"},
|
||||
"6941":{"name":"几内亚","parent":"3956"},
|
||||
"6942":{"name":"几内亚比绍","parent":"3956"},
|
||||
"6943":{"name":"圭亚那","parent":"3956"},
|
||||
"6944":{"name":"海地","parent":"3956"},
|
||||
"6945":{"name":"赫德岛和麦克唐纳岛","parent":"3956"},
|
||||
"6946":{"name":"洪都拉斯","parent":"3956"},
|
||||
"6947":{"name":"匈牙利","parent":"3956"},
|
||||
"6948":{"name":"冰岛","parent":"3956"},
|
||||
"6949":{"name":"印度","parent":"3956"},
|
||||
"6950":{"name":"印度尼西亚","parent":"3956"},
|
||||
"6951":{"name":"伊朗","parent":"3956"},
|
||||
"6952":{"name":"伊拉克","parent":"3956"},
|
||||
"6953":{"name":"爱尔兰","parent":"3956"},
|
||||
"6954":{"name":"以色列","parent":"3956"},
|
||||
"6955":{"name":"牙买加","parent":"3956"},
|
||||
"6956":{"name":"约旦","parent":"3956"},
|
||||
"6957":{"name":"哈萨克斯坦","parent":"3956"},
|
||||
"6958":{"name":"肯尼亚","parent":"3956"},
|
||||
"6959":{"name":"基里巴斯","parent":"3956"},
|
||||
"6960":{"name":"朝鲜","parent":"3956"},
|
||||
"6961":{"name":"科威特","parent":"3956"},
|
||||
"6962":{"name":"吉尔吉斯斯坦","parent":"3956"},
|
||||
"6963":{"name":"老挝","parent":"3956"},
|
||||
"6964":{"name":"拉脱维亚","parent":"3956"},
|
||||
"6965":{"name":"黎巴嫩","parent":"3956"},
|
||||
"6966":{"name":"莱索托","parent":"3956"},
|
||||
"6967":{"name":"利比里亚","parent":"3956"},
|
||||
"6968":{"name":"利比亚","parent":"3956"},
|
||||
"6969":{"name":"列支敦士登","parent":"3956"},
|
||||
"6970":{"name":"立陶宛","parent":"3956"},
|
||||
"6971":{"name":"卢森堡","parent":"3956"},
|
||||
"6972":{"name":"前南马其顿","parent":"3956"},
|
||||
"6973":{"name":"马达加斯加","parent":"3956"},
|
||||
"6974":{"name":"马拉维","parent":"3956"},
|
||||
"6975":{"name":"马尔代夫","parent":"3956"},
|
||||
"6976":{"name":"马里","parent":"3956"},
|
||||
"6977":{"name":"马耳他","parent":"3956"},
|
||||
"6978":{"name":"马绍尔群岛","parent":"3956"},
|
||||
"6979":{"name":"马提尼克","parent":"3956"},
|
||||
"6980":{"name":"毛里塔尼亚","parent":"3956"},
|
||||
"6981":{"name":"毛里求斯","parent":"3956"},
|
||||
"6982":{"name":"马约特","parent":"3956"},
|
||||
"6983":{"name":"墨西哥","parent":"3956"},
|
||||
"6984":{"name":"密克罗尼西亚联邦","parent":"3956"},
|
||||
"6985":{"name":"摩尔多瓦","parent":"3956"},
|
||||
"6986":{"name":"摩纳哥","parent":"3956"},
|
||||
"6987":{"name":"蒙古","parent":"3956"},
|
||||
"6988":{"name":"蒙特塞拉特","parent":"3956"},
|
||||
"6989":{"name":"摩洛哥","parent":"3956"},
|
||||
"6990":{"name":"莫桑比克","parent":"3956"},
|
||||
"6991":{"name":"缅甸","parent":"3956"},
|
||||
"6992":{"name":"纳米比亚","parent":"3956"},
|
||||
"6993":{"name":"瑙鲁","parent":"3956"},
|
||||
"6994":{"name":"尼泊尔","parent":"3956"},
|
||||
"6995":{"name":"荷兰","parent":"3956"},
|
||||
"6996":{"name":"荷属安的列斯","parent":"3956"},
|
||||
"6997":{"name":"新喀里多尼亚","parent":"3956"},
|
||||
"6998":{"name":"新西兰","parent":"3956"},
|
||||
"6999":{"name":"尼加拉瓜","parent":"3956"},
|
||||
"7000":{"name":"尼日尔","parent":"3956"},
|
||||
"7001":{"name":"尼日利亚","parent":"3956"},
|
||||
"7002":{"name":"纽埃","parent":"3956"},
|
||||
"7003":{"name":"诺福克岛","parent":"3956"},
|
||||
"7004":{"name":"北马里亚纳","parent":"3956"},
|
||||
"7005":{"name":"挪威","parent":"3956"},
|
||||
"7006":{"name":"阿曼","parent":"3956"},
|
||||
"7007":{"name":"巴基斯坦","parent":"3956"},
|
||||
"7008":{"name":"帕劳","parent":"3956"},
|
||||
"7009":{"name":"巴勒斯坦","parent":"3956"},
|
||||
"7010":{"name":"巴拿马","parent":"3956"},
|
||||
"7011":{"name":"巴布亚新几内亚","parent":"3956"},
|
||||
"7012":{"name":"巴拉圭","parent":"3956"},
|
||||
"7013":{"name":"秘鲁","parent":"3956"},
|
||||
"7014":{"name":"菲律宾","parent":"3956"},
|
||||
"7015":{"name":"皮特凯恩群岛","parent":"3956"},
|
||||
"7016":{"name":"波兰","parent":"3956"},
|
||||
"7017":{"name":"葡萄牙","parent":"3956"},
|
||||
"7018":{"name":"波多黎各","parent":"3956"},
|
||||
"7019":{"name":"卡塔尔","parent":"3956"},
|
||||
"7020":{"name":"留尼汪","parent":"3956"},
|
||||
"7021":{"name":"罗马尼亚","parent":"3956"},
|
||||
"7022":{"name":"俄罗斯联邦","parent":"3956"},
|
||||
"7023":{"name":"卢旺达","parent":"3956"},
|
||||
"7024":{"name":"圣赫勒拿","parent":"3956"},
|
||||
"7025":{"name":"圣基茨和尼维斯","parent":"3956"},
|
||||
"7026":{"name":"圣卢西亚","parent":"3956"},
|
||||
"7027":{"name":"圣皮埃尔和密克隆","parent":"3956"},
|
||||
"7028":{"name":"圣文森特和格林纳丁斯","parent":"3956"},
|
||||
"7029":{"name":"萨摩亚","parent":"3956"},
|
||||
"7030":{"name":"圣马力诺","parent":"3956"},
|
||||
"7031":{"name":"圣多美和普林西比","parent":"3956"},
|
||||
"7032":{"name":"沙特阿拉伯","parent":"3956"},
|
||||
"7033":{"name":"塞内加尔","parent":"3956"},
|
||||
"7034":{"name":"塞舌尔","parent":"3956"},
|
||||
"7035":{"name":"塞拉利昂","parent":"3956"},
|
||||
"7036":{"name":"新加坡","parent":"3956"},
|
||||
"7037":{"name":"斯洛伐克","parent":"3956"},
|
||||
"7038":{"name":"斯洛文尼亚","parent":"3956"},
|
||||
"7039":{"name":"所罗门群岛","parent":"3956"},
|
||||
"7040":{"name":"索马里","parent":"3956"},
|
||||
"7041":{"name":"南非","parent":"3956"},
|
||||
"7042":{"name":"南乔治亚岛和南桑德韦奇岛","parent":"3956"},
|
||||
"7043":{"name":"斯里兰卡","parent":"3956"},
|
||||
"7044":{"name":"苏丹","parent":"3956"},
|
||||
"7045":{"name":"苏里南","parent":"3956"},
|
||||
"7046":{"name":"斯瓦尔巴群岛","parent":"3956"},
|
||||
"7047":{"name":"斯威士兰","parent":"3956"},
|
||||
"7048":{"name":"瑞典","parent":"3956"},
|
||||
"7049":{"name":"瑞士","parent":"3956"},
|
||||
"7050":{"name":"叙利亚","parent":"3956"},
|
||||
"7051":{"name":"塔吉克斯坦","parent":"3956"},
|
||||
"7052":{"name":"坦桑尼亚","parent":"3956"},
|
||||
"7053":{"name":"泰国","parent":"3956"},
|
||||
"7054":{"name":"多哥","parent":"3956"},
|
||||
"7055":{"name":"托克劳","parent":"3956"},
|
||||
"7056":{"name":"汤加","parent":"3956"},
|
||||
"7057":{"name":"特立尼达和多巴哥","parent":"3956"},
|
||||
"7058":{"name":"突尼斯","parent":"3956"},
|
||||
"7059":{"name":"土耳其","parent":"3956"},
|
||||
"7060":{"name":"土库曼斯坦","parent":"3956"},
|
||||
"7061":{"name":"特克斯科斯群岛","parent":"3956"},
|
||||
"7062":{"name":"图瓦卢","parent":"3956"},
|
||||
"7063":{"name":"乌干达","parent":"3956"},
|
||||
"7064":{"name":"乌克兰","parent":"3956"},
|
||||
"7065":{"name":"阿联酋","parent":"3956"},
|
||||
"7066":{"name":"美国本土外小岛屿","parent":"3956"},
|
||||
"7067":{"name":"乌拉圭","parent":"3956"},
|
||||
"7068":{"name":"乌兹别克斯坦","parent":"3956"},
|
||||
"7069":{"name":"瓦努阿图","parent":"3956"},
|
||||
"7070":{"name":"梵蒂冈","parent":"3956"},
|
||||
"7071":{"name":"委内瑞拉","parent":"3956"},
|
||||
"7072":{"name":"越南","parent":"3956"},
|
||||
"7073":{"name":"英属维尔京群岛","parent":"3956"},
|
||||
"7074":{"name":"美属维尔京群岛","parent":"3956"},
|
||||
"7075":{"name":"瓦利斯和富图纳","parent":"3956"},
|
||||
"7076":{"name":"西撒哈拉","parent":"3956"},
|
||||
"7077":{"name":"也门","parent":"3956"},
|
||||
"7078":{"name":"南斯拉夫","parent":"3956"},
|
||||
"7079":{"name":"赞比亚","parent":"3956"},
|
||||
"7080":{"name":"津巴布韦","parent":"3956"},
|
||||
"7081":{"name":"塞尔维亚","parent":"3956"},
|
||||
"7082":{"name":"雄安新区","parent":"4"},
|
||||
"7084":{"name":"天门市","parent":"18"}
|
||||
}
|
||||
|
||||
# Every distinct "name" value stored in TBL, kept as a set for fast
# membership tests (isName checks candidate strings against it).
NM_SET = {entry["name"] for entry in TBL.values()}
|
||||
|
||||
def get_names(id):
    """Return the name of region ``id`` followed by the names of its ancestors.

    Args:
        id: A region identifier (converted with ``str()`` and looked up in
            ``TBL``) or free text. Falsy values and the string "none" (in
            any letter case) mean "no region".

    Returns:
        ``[]`` when there is no region or the identifier is not in ``TBL``;
        ``[str(id)]`` when the value is not purely numeric; otherwise the
        region's own name followed by the names found by walking its
        ``parent`` links, nearest ancestor first.
    """
    if not id or str(id).lower() == "none":
        return []
    id = str(id)
    key = id.strip()
    if not re.match("[0-9]+$", key):
        return [id]
    # Look up the stripped key: the numeric check above ignores surrounding
    # whitespace, so the table lookup has to ignore it as well.
    entry = TBL.get(key)
    if not entry:
        return []
    # The walk stops as soon as a parent is empty or is not a known id.
    return [entry["name"], *get_names(entry["parent"])]
|
||||
|
||||
import re
|
||||
def isName(nm):
    """Tell whether ``nm`` is a region name contained in ``NM_SET``.

    The name is accepted as given, with "市" appended, or with a trailing
    "省" / "自治区" suffix (the latter optionally preceded by 回族, 壮族 or
    维吾尔) removed.
    """
    return (
        nm in NM_SET
        or nm + "市" in NM_SET
        or re.sub(r"(省|(回族|壮族|维吾尔)*自治区)$", "", nm) in NM_SET
    )
|
65
deepdoc/parser/resume/entities/res/corp.tks.freq.json
Normal file
65
deepdoc/parser/resume/entities/res/corp.tks.freq.json
Normal file
@ -0,0 +1,65 @@
|
||||
[
|
||||
"科技",
|
||||
"集团",
|
||||
"网络科技",
|
||||
"技术",
|
||||
"信息",
|
||||
"分公司",
|
||||
"信息技术",
|
||||
"发展",
|
||||
"科技股份",
|
||||
"网络",
|
||||
"贸易",
|
||||
"商贸",
|
||||
"工程",
|
||||
"企业",
|
||||
"集团股份",
|
||||
"商务",
|
||||
"工业",
|
||||
"控股集团",
|
||||
"国际贸易",
|
||||
"软件技术",
|
||||
"数码科技",
|
||||
"软件开发",
|
||||
"有限",
|
||||
"经营",
|
||||
"科技开发",
|
||||
"股份公司",
|
||||
"电子技术",
|
||||
"实业集团",
|
||||
"责任",
|
||||
"无限",
|
||||
"工程技术",
|
||||
"上市公司",
|
||||
"技术开发",
|
||||
"软件系统",
|
||||
"总公司",
|
||||
"网络服务",
|
||||
"ltd.",
|
||||
"technology",
|
||||
"company",
|
||||
"服务公司",
|
||||
"计算机技术",
|
||||
"计算机软件",
|
||||
"电子信息",
|
||||
"corporation",
|
||||
"计算机服务",
|
||||
"计算机系统",
|
||||
"有限公司",
|
||||
"事业部",
|
||||
"公司",
|
||||
"股份",
|
||||
"有限责任",
|
||||
"软件",
|
||||
"控股",
|
||||
"高科技",
|
||||
"房地产",
|
||||
"事业群",
|
||||
"部门",
|
||||
"电子商务",
|
||||
"人力资源顾问",
|
||||
"人力资源",
|
||||
"株式会社",
|
||||
"网络营销"
|
||||
]
|
||||
|
31480
deepdoc/parser/resume/entities/res/corp_baike_len.csv
Normal file
31480
deepdoc/parser/resume/entities/res/corp_baike_len.csv
Normal file
File diff suppressed because it is too large
Load Diff
14939
deepdoc/parser/resume/entities/res/corp_tag.json
Normal file
14939
deepdoc/parser/resume/entities/res/corp_tag.json
Normal file
File diff suppressed because it is too large
Load Diff
911
deepdoc/parser/resume/entities/res/good_corp.json
Normal file
911
deepdoc/parser/resume/entities/res/good_corp.json
Normal file
@ -0,0 +1,911 @@
|
||||
[
|
||||
"google assistant investments",
|
||||
"amazon",
|
||||
"dingtalk china information",
|
||||
"zhejiang alibaba communication",
|
||||
"yunos",
|
||||
"腾讯云",
|
||||
"新浪新闻",
|
||||
"网邻通",
|
||||
"蚂蚁集团",
|
||||
"大疆",
|
||||
"恒生股份",
|
||||
"sf express",
|
||||
"智者天下",
|
||||
"shanghai hema network",
|
||||
"papayamobile",
|
||||
"lexinfintech",
|
||||
"industrial consumer finance",
|
||||
"360搜索",
|
||||
"世纪光速",
|
||||
"迅雷区块链",
|
||||
"赛盒科技",
|
||||
"齐力电子商务",
|
||||
"平安养老险",
|
||||
"平安证券",
|
||||
"平安好贷",
|
||||
"五八新服",
|
||||
"呯嘭智能",
|
||||
"阿里妈妈",
|
||||
"mdt",
|
||||
"tencent",
|
||||
"weibo",
|
||||
"浪潮软件",
|
||||
"阿里巴巴广告",
|
||||
"mashang consumer finance",
|
||||
"维沃",
|
||||
"hqg , limited",
|
||||
"moodys",
|
||||
"搜狐支付",
|
||||
"百度秀",
|
||||
"新浪服务",
|
||||
"零售通",
|
||||
"同城艺龙",
|
||||
"虾米音乐",
|
||||
"贝壳集团",
|
||||
"小米有品",
|
||||
"滴滴自动驾驶",
|
||||
"图记",
|
||||
"阿里影业",
|
||||
"卓联软件",
|
||||
"zhejiang tmall",
|
||||
"谷歌中国",
|
||||
"hithink flush",
|
||||
"时装科技",
|
||||
"程会玩国际旅行社",
|
||||
"amazon china holding limited",
|
||||
"中信消金",
|
||||
"当当比特物流",
|
||||
"新浪新媒体咨询",
|
||||
"tongcheng network",
|
||||
"金山在线",
|
||||
"shopping cart",
|
||||
"犀互动",
|
||||
"五八",
|
||||
"bilibili",
|
||||
"阿里星球",
|
||||
"滴滴金科服务",
|
||||
"美团",
|
||||
"哈啰出行",
|
||||
"face",
|
||||
"平安健康",
|
||||
"招商银行",
|
||||
"连亚",
|
||||
"盒马网络",
|
||||
"b站",
|
||||
"华为机器",
|
||||
"shanghai mdt infotech",
|
||||
"ping an healthkonnect",
|
||||
"beijing home link real estate broker",
|
||||
"花海仓",
|
||||
"beijing jingdong shangke information",
|
||||
"微影智能",
|
||||
"酷狗游戏",
|
||||
"health.pingan.com",
|
||||
"众安",
|
||||
"陌陌",
|
||||
"海康威视数字",
|
||||
"同程网",
|
||||
"艾丁金融",
|
||||
"知乎",
|
||||
" lu",
|
||||
"国际商业机器公司",
|
||||
"捷信消费金融",
|
||||
"恒生利融",
|
||||
"china merchants bank",
|
||||
"企鹅电竞",
|
||||
"捷信信驰",
|
||||
"360智能家居",
|
||||
"小桔车服",
|
||||
"homecredit",
|
||||
"皮皮虾",
|
||||
"畅游",
|
||||
"聚爱聊",
|
||||
"suning.com",
|
||||
"途牛旅游网",
|
||||
"花呗",
|
||||
"盈店通",
|
||||
"sina",
|
||||
"阿里巴巴音乐",
|
||||
"华为技术有限公司",
|
||||
"国付宝",
|
||||
"shanghai lianshang network",
|
||||
"oppo",
|
||||
"华为投资控股",
|
||||
"beijing sohu new media information",
|
||||
"times square",
|
||||
"菜鸟物流",
|
||||
"lingxing",
|
||||
"jd digits",
|
||||
"同程旅游",
|
||||
"分期乐",
|
||||
"火锅视频",
|
||||
"天天快报",
|
||||
"猎豹移动",
|
||||
"五八人力资源",
|
||||
"宝宝树",
|
||||
"顺丰科技",
|
||||
"上海西翠",
|
||||
"诗程文化传播",
|
||||
"dewu",
|
||||
"领星网络",
|
||||
"aliexpress",
|
||||
"贝塔通科技",
|
||||
"链家",
|
||||
"花小猪",
|
||||
"趣输入",
|
||||
"搜狐新媒体",
|
||||
"一淘",
|
||||
"56",
|
||||
"qq阅读",
|
||||
"青桔单车",
|
||||
"iflytek",
|
||||
"每日优鲜电子商务",
|
||||
"腾讯觅影",
|
||||
"微医",
|
||||
"松果网",
|
||||
"paypal",
|
||||
"递瑞供应链管理",
|
||||
"领星",
|
||||
"qunar",
|
||||
"三快",
|
||||
"lu.com",
|
||||
"携程旅行网",
|
||||
"新潮传媒",
|
||||
"链家经纪",
|
||||
"景域文化",
|
||||
"阿里健康",
|
||||
"pingpeng",
|
||||
"聚划算",
|
||||
"零机科技",
|
||||
"街兔电单车",
|
||||
"快乐购",
|
||||
"华为数字能源",
|
||||
"搜狐",
|
||||
"陆家嘴国际金融资产交易市场",
|
||||
"nanjing tuniu",
|
||||
"亚马逊",
|
||||
"苏宁易购",
|
||||
"携程旅游",
|
||||
"苏宁金服",
|
||||
"babytree",
|
||||
"悟空问答",
|
||||
"同花顺",
|
||||
"eastmoney",
|
||||
"浪潮信息",
|
||||
"滴滴智慧交通",
|
||||
"beijing ruixun lingtong",
|
||||
"平安综合金融服务",
|
||||
"爱奇艺",
|
||||
"小米集团",
|
||||
"华为云",
|
||||
"微店",
|
||||
"恒生集团",
|
||||
"网易有道",
|
||||
"boccfc",
|
||||
"世纪思速科技",
|
||||
"海康消防",
|
||||
"beijing xiaomi",
|
||||
"众安科技",
|
||||
"五八同城",
|
||||
"霆程汽车租赁",
|
||||
"云卖分销",
|
||||
"乐信集团",
|
||||
"蚂蚁",
|
||||
"舶乐蜜电子商务",
|
||||
"支付宝中国",
|
||||
"砖块消消消",
|
||||
"vivo",
|
||||
"阿里互娱",
|
||||
"中国平安",
|
||||
"lingxihudong",
|
||||
"百度网盘",
|
||||
"1号店",
|
||||
"字节跳动",
|
||||
"京东科技",
|
||||
"驴妈妈兴旅国际旅行社",
|
||||
"hangzhou alibaba music",
|
||||
"xunlei",
|
||||
"灵犀互动娱乐",
|
||||
"快手",
|
||||
"youtube",
|
||||
"连尚慧眼",
|
||||
"腾讯体育",
|
||||
"爱商在线",
|
||||
"酷我音乐",
|
||||
"金融壹账通",
|
||||
"搜狗服务",
|
||||
"banma information",
|
||||
"a站",
|
||||
"罗汉堂",
|
||||
"薇仕网络",
|
||||
"搜狐新闻",
|
||||
"贝宝",
|
||||
"薇仕",
|
||||
"口袋时尚科技",
|
||||
"穆迪咨询",
|
||||
"新狐投资管理",
|
||||
"hikvision",
|
||||
"alimama china holding limited",
|
||||
"超聚变数字",
|
||||
"腾讯视频",
|
||||
"恒生电子",
|
||||
"百度游戏",
|
||||
"绿洲",
|
||||
"木瓜移动",
|
||||
"红袖添香",
|
||||
"店匠科技",
|
||||
"易贝",
|
||||
"一淘网",
|
||||
"博览群书",
|
||||
"唯品会",
|
||||
"lazglobal",
|
||||
"amap",
|
||||
"芒果网",
|
||||
"口碑",
|
||||
"海康慧影",
|
||||
"腾讯音乐娱乐",
|
||||
"网易严选",
|
||||
"微信",
|
||||
"shenzhen lexin holding",
|
||||
"hangzhou pingpeng intelligent",
|
||||
"连尚网络",
|
||||
"海思",
|
||||
"isunor",
|
||||
"蝉翼",
|
||||
"阿里游戏",
|
||||
"广州优视",
|
||||
"优视",
|
||||
"腾讯征信",
|
||||
"识装",
|
||||
"finserve.pingan.com",
|
||||
"papaya",
|
||||
"阅文",
|
||||
"平安健康保险",
|
||||
"考拉海购",
|
||||
"网易印象",
|
||||
"wifi万能钥匙",
|
||||
"新浪互联服务",
|
||||
"亚马逊云科技",
|
||||
"迅雷看看",
|
||||
"华为朗新科技",
|
||||
"adyen hong kong limited",
|
||||
"谷歌",
|
||||
"得物",
|
||||
"网心",
|
||||
"cainiao network",
|
||||
"沐瞳",
|
||||
"linkedln",
|
||||
"hundsun",
|
||||
"阿里旅行",
|
||||
"珍爱网",
|
||||
"阿里巴巴通信",
|
||||
"金山奇剑",
|
||||
"tongtool",
|
||||
"华为安捷信电气",
|
||||
"快乐时代",
|
||||
"平安寿险",
|
||||
"微博",
|
||||
"微跳蚤",
|
||||
"oppo移动通信",
|
||||
"毒",
|
||||
"alimama",
|
||||
"shoplazza",
|
||||
"shenzhen dianjiang science and",
|
||||
"众鸣世科",
|
||||
"平安金融",
|
||||
"狐友",
|
||||
"维沃移动通信",
|
||||
"tobosoft",
|
||||
"齐力电商",
|
||||
"ali",
|
||||
"诚信通",
|
||||
"行吟",
|
||||
"跳舞的线",
|
||||
"橙心优选",
|
||||
"众安健康",
|
||||
"亚马逊中国投资",
|
||||
"德絮投资管理中心合伙",
|
||||
"招联消费金融",
|
||||
"百度文学",
|
||||
"芝麻信用",
|
||||
"阿里零售通",
|
||||
"时装",
|
||||
"花样直播",
|
||||
"sogou",
|
||||
"uc",
|
||||
"海思半导体",
|
||||
"zhongan online p&c insurance",
|
||||
"新浪数字",
|
||||
"驴妈妈旅游网",
|
||||
"华为数字能源技术",
|
||||
"京东数科",
|
||||
"oracle",
|
||||
"xiaomi",
|
||||
"nyse",
|
||||
"阳光消费金融",
|
||||
"天天动听",
|
||||
"大众点评",
|
||||
"上海瑞家",
|
||||
"trustpass",
|
||||
"hundsun technologies",
|
||||
"美团小贷",
|
||||
"ebay",
|
||||
"通途",
|
||||
"tcl",
|
||||
"鸿蒙",
|
||||
"酷狗计算机",
|
||||
"品诺保险",
|
||||
"capitalg",
|
||||
"康盛创想",
|
||||
"58同城",
|
||||
"闲鱼",
|
||||
"微软",
|
||||
"吉易付科技",
|
||||
"理财通",
|
||||
"ctrip",
|
||||
"yy",
|
||||
"华为数字",
|
||||
"kingsoft",
|
||||
"孙宁金融",
|
||||
"房江湖经纪",
|
||||
"youku",
|
||||
"ant financial services group",
|
||||
"盒马",
|
||||
"sensetime",
|
||||
"伊千网络",
|
||||
"小豹ai翻译棒",
|
||||
"shopify",
|
||||
"前海微众银行",
|
||||
"qd",
|
||||
"gmail",
|
||||
"pingpong",
|
||||
"alibaba group holding limited",
|
||||
"捷信时空电子商务",
|
||||
"orientsec",
|
||||
"乔戈里管理咨询",
|
||||
"ant",
|
||||
"锐讯灵通",
|
||||
"兴业消费金融",
|
||||
"京东叁佰陆拾度电子商务",
|
||||
"新浪",
|
||||
"优酷土豆",
|
||||
"海康机器人",
|
||||
"美团单车",
|
||||
"海康存储",
|
||||
"领英",
|
||||
"阿里全球速卖通",
|
||||
"美菜网",
|
||||
"京邦达",
|
||||
"安居客",
|
||||
"阿里体育",
|
||||
"相互宝",
|
||||
"cloudwalk",
|
||||
"百度智能云",
|
||||
"贝壳",
|
||||
"酷狗",
|
||||
"sunshine consumer finance",
|
||||
"掌宜",
|
||||
"奇酷网",
|
||||
"核新同花顺",
|
||||
"阿里巴巴影业",
|
||||
"节创",
|
||||
"学而思网校",
|
||||
"速途",
|
||||
"途牛",
|
||||
"阿里云计算",
|
||||
"beijing sensetime",
|
||||
"alibaba cloud",
|
||||
"西瓜视频",
|
||||
"美团优选",
|
||||
"orient securities limited",
|
||||
"华为朗新",
|
||||
"店匠",
|
||||
"shanghai weishi network",
|
||||
"友盟",
|
||||
"飞猪旅行",
|
||||
"滴滴出行",
|
||||
"alipay",
|
||||
"mogu",
|
||||
"dangdang",
|
||||
"大麦网",
|
||||
"汉军智能系统",
|
||||
"百度地图",
|
||||
"货车帮",
|
||||
"狐狸金服",
|
||||
"众安在线保险经纪",
|
||||
"华为通信",
|
||||
"新浪支付",
|
||||
"zhihu",
|
||||
"alibaba cloud computing",
|
||||
"沙发视频",
|
||||
"金山软件",
|
||||
"ping an good doctor",
|
||||
"携程",
|
||||
"脉脉",
|
||||
"youku information beijing",
|
||||
"zhongan",
|
||||
"艾丁软件",
|
||||
"乒乓智能",
|
||||
"蘑菇街",
|
||||
"taobao",
|
||||
"华为技术服务",
|
||||
"仕承文化传播",
|
||||
"安捷信",
|
||||
"狐狸互联网小额贷款",
|
||||
"节点迅捷",
|
||||
"中国银行",
|
||||
"搜镇",
|
||||
"众安在线",
|
||||
"dingtalk",
|
||||
"云从科技",
|
||||
"beijing jingbangda trade",
|
||||
"moody s",
|
||||
"滚动的天空",
|
||||
"yl.pingan.com",
|
||||
"奇虎",
|
||||
"alihealth",
|
||||
"芒果tv",
|
||||
"lufax",
|
||||
"美团打车",
|
||||
"小桔",
|
||||
"贝壳找房网",
|
||||
"小米科技",
|
||||
"vips",
|
||||
"kindle",
|
||||
"亚马逊服务",
|
||||
"citic consumer finance",
|
||||
"微众",
|
||||
"搜狗智慧互联网医院",
|
||||
"盒马鲜生",
|
||||
"life.pinan.com",
|
||||
"ph.com.cn",
|
||||
"银联",
|
||||
"cmbchina",
|
||||
"平安金融科技咨询",
|
||||
"微保",
|
||||
"甲骨文中国",
|
||||
"飞书",
|
||||
"koubei shanghai information",
|
||||
"企鹅辅导",
|
||||
"斑马",
|
||||
"平安租赁",
|
||||
"云从",
|
||||
"马上消费",
|
||||
"hangzhou ali baba advertising",
|
||||
"金山",
|
||||
"赛盒",
|
||||
"科大讯飞",
|
||||
"金星创业投资",
|
||||
"平安国际融资租赁",
|
||||
"360你财富",
|
||||
"西山居",
|
||||
"shenzhen qianhai fourth paradigm data",
|
||||
"海思光电子",
|
||||
"猎户星空",
|
||||
"网易公司",
|
||||
"浪潮",
|
||||
"粒粒橙传媒",
|
||||
"招联金融",
|
||||
"100. me",
|
||||
"捷信信驰咨询",
|
||||
"唯品仓",
|
||||
"orient",
|
||||
"趣拿",
|
||||
"摩拜单车",
|
||||
"天猫精灵",
|
||||
"菜鸟",
|
||||
"豹小贩",
|
||||
"去哪儿",
|
||||
"米家",
|
||||
"哈啰单车",
|
||||
"搜狐体育",
|
||||
"shopify payments usa",
|
||||
"高德软件",
|
||||
"讯联智付",
|
||||
"乐信",
|
||||
"唯你搭",
|
||||
"第四范式",
|
||||
"菜鸟网络",
|
||||
"同程",
|
||||
"yy语音",
|
||||
"浪潮云",
|
||||
"东财",
|
||||
"淘宝",
|
||||
"寻梦",
|
||||
"citic securities limited",
|
||||
"青橙之旅",
|
||||
"阿里巴巴",
|
||||
"番茄小说",
|
||||
"上海亿贝",
|
||||
"inspur",
|
||||
"babytree inc",
|
||||
"海康智慧产业股权投资基金合伙合伙",
|
||||
"adyen",
|
||||
"艺龙",
|
||||
"蚂蚁金服",
|
||||
"平安金服",
|
||||
"百度百科",
|
||||
"unionpay",
|
||||
"当当",
|
||||
"阅文集团",
|
||||
"东方财富",
|
||||
"东方证券",
|
||||
"哈罗单车",
|
||||
"优酷",
|
||||
"海康",
|
||||
"alipay china network",
|
||||
"网商银行",
|
||||
"钧正",
|
||||
"property.pingan.com",
|
||||
"豹咖啡",
|
||||
"网易",
|
||||
"我爱cba",
|
||||
"theduapp",
|
||||
"360",
|
||||
"金山数字娱乐",
|
||||
"新浪阅读",
|
||||
"alibabagames",
|
||||
"顺丰",
|
||||
"支点商贸",
|
||||
"同程旅行",
|
||||
"citic securities",
|
||||
"ele.com",
|
||||
"tal",
|
||||
"fresh hema",
|
||||
"运满满",
|
||||
"贝壳网",
|
||||
"酷狗音乐",
|
||||
"鲜城",
|
||||
"360健康",
|
||||
"浪潮世科",
|
||||
"迅雷网络",
|
||||
"哔哩哔哩",
|
||||
"华为电动",
|
||||
"淘友天下",
|
||||
"华多网络",
|
||||
"xunlei networking technologies",
|
||||
"云杉",
|
||||
"当当网电子商务",
|
||||
"津虹网络",
|
||||
"wedoc cloud hangzhou holdings",
|
||||
"alisports shanghai",
|
||||
"旷视金智",
|
||||
"钉钉中国",
|
||||
"微影",
|
||||
"金山快快",
|
||||
"亿贝",
|
||||
"wedoc",
|
||||
"autonavi",
|
||||
"哈啰助力车",
|
||||
"google cloud",
|
||||
"新浪乐居",
|
||||
"京东股票",
|
||||
"搜狗智慧远程医疗中心",
|
||||
"中银消金",
|
||||
"merchants union consumer finance",
|
||||
"王者荣耀",
|
||||
"百度手机",
|
||||
"美团民宿",
|
||||
"kaola",
|
||||
"小屋",
|
||||
"金山网络",
|
||||
"来往",
|
||||
"顺丰速运",
|
||||
"腾讯课堂",
|
||||
"百度在线网络",
|
||||
"美团买菜",
|
||||
"威视汽车",
|
||||
"uc mobile",
|
||||
"来赞达",
|
||||
"平安健康医疗",
|
||||
"豹小秘",
|
||||
"尚网",
|
||||
"哈勃投资",
|
||||
" ping an insurance group of china ,",
|
||||
"小米",
|
||||
"360好药",
|
||||
"qq音乐",
|
||||
"lingxigames",
|
||||
"faceu激萌",
|
||||
"搜狗",
|
||||
"sohu",
|
||||
"满帮",
|
||||
"vipshop",
|
||||
"wishpost",
|
||||
"金山世游",
|
||||
"shanghai yibaimi network",
|
||||
"1688",
|
||||
"海康汽车",
|
||||
"顺丰控股",
|
||||
"华为",
|
||||
"妙镜vr",
|
||||
"paybkj.com",
|
||||
"hellobike",
|
||||
"豹来电",
|
||||
"京东",
|
||||
"驴妈妈",
|
||||
"momo",
|
||||
"平安健康险",
|
||||
"哈勃科技",
|
||||
"美菜",
|
||||
"众安在线财产保险",
|
||||
"海康威视",
|
||||
"east money information",
|
||||
"阿里云",
|
||||
"蝉游记",
|
||||
"余额宝",
|
||||
"屋客",
|
||||
"滴滴",
|
||||
"shopify international limited",
|
||||
"百度",
|
||||
"阿里健康中国",
|
||||
"阿里通信",
|
||||
"微梦创科",
|
||||
"微医云",
|
||||
"轻颜相机",
|
||||
"搜易居",
|
||||
"趣店集团",
|
||||
"美团云",
|
||||
"ant group",
|
||||
"金山云",
|
||||
"beijing express hand",
|
||||
"觅觅",
|
||||
"支付宝",
|
||||
"滴滴承信科技咨询服务",
|
||||
"拼多多",
|
||||
"众安运动",
|
||||
"乞力电商",
|
||||
"youcash",
|
||||
"唯品金融",
|
||||
"陆金所",
|
||||
"本地生活",
|
||||
"sz dji",
|
||||
"海康智能",
|
||||
"魔方网聘",
|
||||
"青藤大学",
|
||||
"international business machines",
|
||||
"学而思",
|
||||
"beijing zhongming century science and",
|
||||
"猎豹清理大师",
|
||||
"asinking",
|
||||
"高德",
|
||||
"苏宁",
|
||||
"优酷网",
|
||||
"艾丁",
|
||||
"中银消费金融",
|
||||
"京东健康",
|
||||
"五八教育",
|
||||
"pingpongx",
|
||||
"搜狐时尚",
|
||||
"阿里广告",
|
||||
"平安财险",
|
||||
"中邮消金",
|
||||
"etao",
|
||||
"怕怕",
|
||||
"nyse:cmcm",
|
||||
"华为培训中心",
|
||||
"高德地图",
|
||||
"云狐天下征信",
|
||||
"大疆创新",
|
||||
"连尚",
|
||||
"壹佰米",
|
||||
"康健公司",
|
||||
"iqiyi.com",
|
||||
"360安全云盘",
|
||||
"馒头直播",
|
||||
"淘友网",
|
||||
"东方赢家",
|
||||
"bank of china",
|
||||
"微众银行",
|
||||
"阿里巴巴国际站",
|
||||
"虾米",
|
||||
"去哪儿网",
|
||||
"ctrip travel network shanghai",
|
||||
"潇湘书院",
|
||||
"腾讯",
|
||||
"快乐阳光互动娱乐传媒",
|
||||
"迅雷",
|
||||
"weidian",
|
||||
"滴滴货运",
|
||||
"ping an puhui enterprise management",
|
||||
"新浪仓石基金销售",
|
||||
"搜狐焦点",
|
||||
"alibaba pictures",
|
||||
"wps",
|
||||
"平安",
|
||||
"lazmall",
|
||||
"百度开放平台",
|
||||
"兴业消金",
|
||||
" 珍爱网",
|
||||
"京东云",
|
||||
"小红书",
|
||||
"1688. com",
|
||||
"如视智数",
|
||||
"missfresh",
|
||||
"pazl.pingan.cn",
|
||||
"平安集团",
|
||||
"kugou",
|
||||
"懂车帝",
|
||||
"斑马智行",
|
||||
"浪潮集团",
|
||||
"netease hangzhou network",
|
||||
"pagd.net",
|
||||
"探探",
|
||||
"chinaliterature",
|
||||
"amazon亚马逊",
|
||||
"alphabet",
|
||||
"当当文创手工艺品电子商务",
|
||||
"五八邦",
|
||||
"shenzhen zhenai network information",
|
||||
"lingshoutong",
|
||||
"字节",
|
||||
"lvmama",
|
||||
"金山办公",
|
||||
"众安保险",
|
||||
"时装信息",
|
||||
"优视科技",
|
||||
"guangzhou kugou",
|
||||
"ibm",
|
||||
"滴滴打车",
|
||||
"beijing sogou information service",
|
||||
"megvii",
|
||||
"健谈哥",
|
||||
"cloudwalk group",
|
||||
"蜂联科技",
|
||||
"冬云",
|
||||
"京东尚科",
|
||||
"钢琴块2",
|
||||
"京东世纪",
|
||||
"商汤",
|
||||
"众鸣世纪",
|
||||
"腾讯音乐",
|
||||
"迅雷网文化",
|
||||
"华为云计算技术",
|
||||
"live.me",
|
||||
"全球速卖通",
|
||||
"快的打车",
|
||||
"hello group inc",
|
||||
"美丽说",
|
||||
"suning",
|
||||
"opengauss",
|
||||
"lazada",
|
||||
"tmall",
|
||||
"acfun",
|
||||
"当当网",
|
||||
"中银",
|
||||
"旷视科技",
|
||||
"百度钱包",
|
||||
"淘宝网",
|
||||
"新浪微博",
|
||||
"迅雷集团",
|
||||
"中信消费金融",
|
||||
"学而思教育",
|
||||
"平安普惠",
|
||||
"悟空跨境",
|
||||
"irobotbox",
|
||||
"平安产险",
|
||||
"inspur group",
|
||||
"世纪卓越快递服务",
|
||||
"奇虎360",
|
||||
"webank",
|
||||
"偶藻",
|
||||
"唯品支付",
|
||||
"腾讯云计算",
|
||||
"众安服务",
|
||||
"亿之唐",
|
||||
"beijing 58 information ttechnology",
|
||||
"平安好医生",
|
||||
"迅雷之锤",
|
||||
"旅行小账本",
|
||||
"芒果游戏",
|
||||
"新浪传媒",
|
||||
"旷镜博煊",
|
||||
"全民k歌",
|
||||
"滴滴支付",
|
||||
"北京网心科技",
|
||||
"挂号网",
|
||||
"萤石",
|
||||
"chinavision media group limited",
|
||||
"猎豹安全大师",
|
||||
"cmcm",
|
||||
"趣店",
|
||||
"蚂蚁财富",
|
||||
"商汤科技",
|
||||
"甲骨文",
|
||||
"百度云",
|
||||
"百度apollo",
|
||||
"19 pay",
|
||||
"stock.pingan.com",
|
||||
"tiktok",
|
||||
"alibaba pictures group limited",
|
||||
"ele",
|
||||
"考拉",
|
||||
"天猫",
|
||||
"腾讯优图",
|
||||
"起点中文网",
|
||||
"百度视频",
|
||||
"shanghai bili bili",
|
||||
"京东物流",
|
||||
"ebay marketplaces gmbh",
|
||||
"alibaba sport",
|
||||
"wish",
|
||||
"阿里巴巴中国",
|
||||
"中国银联",
|
||||
"alibaba china network",
|
||||
"china ping an property insurance",
|
||||
"百度糯米网",
|
||||
"微软中国",
|
||||
"一九付",
|
||||
"4 paradigm",
|
||||
"叮咚买菜",
|
||||
"umeng",
|
||||
"众鸣科技",
|
||||
"平安财富通",
|
||||
"google",
|
||||
"巨量引擎",
|
||||
"百度贴吧",
|
||||
"beijing jingdong century information",
|
||||
"讯飞",
|
||||
"beijing yunshan information",
|
||||
"满运软件",
|
||||
"中邮消费金融",
|
||||
"饿了么",
|
||||
"alios",
|
||||
"腾讯ai实验室",
|
||||
"第四范式智能",
|
||||
"瀚星创业投资",
|
||||
"gradient ventures",
|
||||
"microsoft",
|
||||
"哈啰共享汽车",
|
||||
"乞力电子商务",
|
||||
"mscf",
|
||||
"网易影业文化",
|
||||
"铁友旅游咨询",
|
||||
"kilimall",
|
||||
"云企互联投资",
|
||||
"ping an financial consulting",
|
||||
"beijng jingdong century commerce",
|
||||
"高德威智能交通系统",
|
||||
"中友信息",
|
||||
"平安医疗健康管理",
|
||||
"eciticcfc",
|
||||
"中信证券",
|
||||
"fliggy",
|
||||
"电子湾",
|
||||
"旷云金智",
|
||||
"微粒贷",
|
||||
"rsi",
|
||||
"滴滴云计算",
|
||||
"google ventures",
|
||||
"箐程",
|
||||
"每日优鲜",
|
||||
"音兔",
|
||||
"拉扎斯",
|
||||
"今日头条",
|
||||
"乐信控股",
|
||||
"猎豹浏览器",
|
||||
"细微咨询",
|
||||
"好未来",
|
||||
"我乐",
|
||||
"绘声绘色",
|
||||
"抖音",
|
||||
"搜狐新时代",
|
||||
"飞猪",
|
||||
"鹅厂",
|
||||
"贝壳找房",
|
||||
"tuniu",
|
||||
"红马传媒文化",
|
||||
"钉钉",
|
||||
"马上消费金融",
|
||||
"360手机",
|
||||
"平安医保",
|
||||
"快途",
|
||||
"alibaba",
|
||||
"小哈换电",
|
||||
"大麦",
|
||||
"恒睿人工智能研究院",
|
||||
"谷歌资本",
|
||||
"猎豹",
|
||||
"穆迪信息"
|
||||
]
|
595
deepdoc/parser/resume/entities/res/good_sch.json
Normal file
595
deepdoc/parser/resume/entities/res/good_sch.json
Normal file
@ -0,0 +1,595 @@
|
||||
[
|
||||
"中国科技大学",
|
||||
"国防科学技术大学",
|
||||
"清华大学",
|
||||
"清华",
|
||||
"tsinghua university",
|
||||
"thu",
|
||||
"北京大学",
|
||||
"北大",
|
||||
"beijing university",
|
||||
"pku",
|
||||
"中国科学技术大学",
|
||||
"中国科大",
|
||||
"中科大",
|
||||
"china science & technology university",
|
||||
"ustc",
|
||||
"复旦大学",
|
||||
"复旦",
|
||||
"fudan university",
|
||||
"fdu",
|
||||
"中国人民大学",
|
||||
"人大",
|
||||
"人民大学",
|
||||
"renmin university of china",
|
||||
"ruc",
|
||||
"上海交通大学",
|
||||
"上海交大",
|
||||
"shanghai jiao tong university",
|
||||
"sjtu",
|
||||
"南京大学",
|
||||
"南大",
|
||||
"nanjing university",
|
||||
"nju",
|
||||
"同济大学",
|
||||
"同济",
|
||||
"tongji university",
|
||||
"tongji",
|
||||
"浙江大学",
|
||||
"浙大",
|
||||
"zhejiang university",
|
||||
"zju",
|
||||
"南开大学",
|
||||
"南开",
|
||||
"nankai university",
|
||||
"nku",
|
||||
"北京航空航天大学",
|
||||
"北航",
|
||||
"beihang university",
|
||||
"buaa",
|
||||
"北京师范大学",
|
||||
"北师",
|
||||
"北师大",
|
||||
"beijing normal university",
|
||||
"bnu",
|
||||
"武汉大学",
|
||||
"武大",
|
||||
"wuhan university",
|
||||
"whu",
|
||||
"西安交通大学",
|
||||
"西安交大",
|
||||
"xi’an jiaotong university",
|
||||
"xjtu",
|
||||
"天津大学",
|
||||
"天大",
|
||||
"university of tianjin",
|
||||
"tju",
|
||||
"华中科技大学",
|
||||
"华中大",
|
||||
"central china university science and technology",
|
||||
"hust",
|
||||
"北京理工大学",
|
||||
"北理",
|
||||
"beijing institute of technology",
|
||||
"bit",
|
||||
"东南大学",
|
||||
"东大",
|
||||
"southeast china university",
|
||||
"seu",
|
||||
"中山大学",
|
||||
"中大",
|
||||
"zhongshan university",
|
||||
"sysu",
|
||||
"华东师范大学",
|
||||
"华师大",
|
||||
"east china normal university",
|
||||
"ecnu",
|
||||
"哈尔滨工业大学",
|
||||
"哈工大",
|
||||
"harbin institute of technology",
|
||||
"hit",
|
||||
"厦门大学",
|
||||
"厦大",
|
||||
"xiamen university",
|
||||
"xmu",
|
||||
"西北工业大学",
|
||||
"西工大",
|
||||
"西北工大",
|
||||
"northwestern polytechnical university",
|
||||
"npu",
|
||||
"中南大学",
|
||||
"中南",
|
||||
"middle and southern university",
|
||||
"csu",
|
||||
"大连理工大学",
|
||||
"大工",
|
||||
"institute of technology of dalian",
|
||||
"dut",
|
||||
"四川大学",
|
||||
"川大",
|
||||
"sichuan university",
|
||||
"scu",
|
||||
"电子科技大学",
|
||||
"电子科大",
|
||||
"university of electronic science and technology of china",
|
||||
"uestc",
|
||||
"华南理工大学",
|
||||
"华南理工",
|
||||
"institutes of technology of south china",
|
||||
"scut",
|
||||
"吉林大学",
|
||||
"吉大",
|
||||
"jilin university",
|
||||
"jlu",
|
||||
"湖南大学",
|
||||
"湖大",
|
||||
"hunan university",
|
||||
"hnu",
|
||||
"重庆大学",
|
||||
"重大",
|
||||
"university of chongqing",
|
||||
"cqu",
|
||||
"山东大学",
|
||||
"山大",
|
||||
"shandong university",
|
||||
"sdu",
|
||||
"中国农业大学",
|
||||
"中国农大",
|
||||
"china agricultural university",
|
||||
"cau",
|
||||
"中国海洋大学",
|
||||
"中国海大",
|
||||
"chinese marine university",
|
||||
"ouc",
|
||||
"中央民族大学",
|
||||
"中央民大",
|
||||
"central university for nationalities",
|
||||
"muc",
|
||||
"东北大学",
|
||||
"东北工学院",
|
||||
"northeastern university",
|
||||
"neu 或 nu",
|
||||
"兰州大学",
|
||||
"兰大",
|
||||
"lanzhou university",
|
||||
"lzu",
|
||||
"西北农林科技大学",
|
||||
"西农","西北农大",
|
||||
"northwest a&f university",
|
||||
"nwafu",
|
||||
"中国人民解放军国防科技大学",
|
||||
"国防科技大学","国防科大",
|
||||
"national university of defense technology",
|
||||
"nudt",
|
||||
"郑州大学",
|
||||
"郑大",
|
||||
"zhengzhou university",
|
||||
"zzu",
|
||||
"云南大学",
|
||||
"云大",
|
||||
"yunnan university",
|
||||
"ynu",
|
||||
"新疆大学",
|
||||
"新大",
|
||||
"xinjiang university",
|
||||
"xju",
|
||||
"北京交通大学",
|
||||
"北京交大",
|
||||
"beijing jiaotong university",
|
||||
"bjtu",
|
||||
"北京工业大学",
|
||||
"北工大",
|
||||
"beijing university of technology",
|
||||
"bjut",
|
||||
"北京科技大学",
|
||||
"北科大","北京科大",
|
||||
"university of science and technology beijing",
|
||||
"ustb",
|
||||
"北京化工大学",
|
||||
"北化",
|
||||
"beijing university of chemical technology",
|
||||
"buct",
|
||||
"北京邮电大学",
|
||||
"北邮",
|
||||
"beijing university of posts and telecommunications",
|
||||
"beijing university of post and telecommunications",
|
||||
"beijing university of post and telecommunication",
|
||||
"beijing university of posts and telecommunication",
|
||||
"bupt",
|
||||
"北京林业大学",
|
||||
"北林",
|
||||
"beijing forestry university",
|
||||
"bfu",
|
||||
"北京协和医学院",
|
||||
"协和医学院",
|
||||
"peking union medical college",
|
||||
"pumc",
|
||||
"北京中医药大学",
|
||||
"北中医",
|
||||
"beijing university of chinese medicine",
|
||||
"bucm",
|
||||
"首都师范大学",
|
||||
"首师大",
|
||||
"capital normal university",
|
||||
"cnu",
|
||||
"北京外国语大学",
|
||||
"北外",
|
||||
"beijing foreign studies university",
|
||||
"bfsu",
|
||||
"中国传媒大学",
|
||||
"中媒",
|
||||
"中传",
|
||||
"北京广播学院",
|
||||
"communication university of china",
|
||||
"cuc",
|
||||
"中央财经大学",
|
||||
"中央财大",
|
||||
"中财大",
|
||||
"the central university of finance and economics",
|
||||
"cufe",
|
||||
"对外经济贸易大学",
|
||||
"对外经贸大学",
|
||||
"贸大",
|
||||
"university of international business and economics",
|
||||
"uibe",
|
||||
"外交学院",
|
||||
"外院",
|
||||
"china foreign affairs university",
|
||||
"cfau",
|
||||
"中国人民公安大学",
|
||||
"公安大学",
|
||||
"people's public security university of china",
|
||||
"ppsuc",
|
||||
"北京体育大学",
|
||||
"北体大",
|
||||
"beijing sport university",
|
||||
"bsu",
|
||||
"中央音乐学院",
|
||||
"央音",
|
||||
"中央院",
|
||||
"central conservatory of music",
|
||||
"ccom",
|
||||
"中国音乐学院",
|
||||
"国音",
|
||||
"中国院",
|
||||
"china conservatory of music",
|
||||
"ccmusic",
|
||||
"中央美术学院",
|
||||
"央美",
|
||||
"central academy of fine art",
|
||||
"cafa",
|
||||
"中央戏剧学院",
|
||||
"中戏",
|
||||
"the central academy of drama",
|
||||
"tcad",
|
||||
"中国政法大学",
|
||||
"法大",
|
||||
"china university of political science and law",
|
||||
"zuc",
|
||||
"cupl",
|
||||
"中国科学院大学",
|
||||
"国科大",
|
||||
"科院大",
|
||||
"university of chinese academy of sciences",
|
||||
"ucas",
|
||||
"福州大学",
|
||||
"福大",
|
||||
"university of fuzhou",
|
||||
"fzu",
|
||||
"暨南大学",
|
||||
"暨大",
|
||||
"ji'nan university",
|
||||
"jnu",
|
||||
"广州中医药大学",
|
||||
"广中医",
|
||||
"traditional chinese medicine university of guangzhou",
|
||||
"gucm",
|
||||
"华南师范大学",
|
||||
"华南师大",
|
||||
"south china normal university",
|
||||
"scnu",
|
||||
"广西大学",
|
||||
"西大",
|
||||
"guangxi university",
|
||||
"gxu",
|
||||
"贵州大学",
|
||||
"贵大",
|
||||
"guizhou university",
|
||||
"gzu",
|
||||
"海南大学",
|
||||
"海大",
|
||||
"university of hainan",
|
||||
"hainu",
|
||||
"河南大学",
|
||||
"河大",
|
||||
"he'nan university",
|
||||
"henu",
|
||||
"哈尔滨工程大学",
|
||||
"哈工程",
|
||||
"harbin engineering university",
|
||||
"heu",
|
||||
"东北农业大学",
|
||||
"东北农大",
|
||||
"northeast agricultural university",
|
||||
"neau",
|
||||
"东北林业大学",
|
||||
"东北林大",
|
||||
"northeast forestry university",
|
||||
"nefu",
|
||||
"中国地质大学",
|
||||
"地大",
|
||||
"china university of geosciences",
|
||||
"cug",
|
||||
"武汉理工大学",
|
||||
"武汉理工",
|
||||
"wuhan university of technology",
|
||||
"wut",
|
||||
"华中农业大学",
|
||||
"华中农大",
|
||||
"华农",
|
||||
"central china agricultural university",
|
||||
"hzau",
|
||||
"华中师范大学",
|
||||
"华中师大",
|
||||
"华大",
|
||||
"central china normal university",
|
||||
"ccnu",
|
||||
"中南财经政法大学",
|
||||
"中南大",
|
||||
"zhongnan university of economics & law",
|
||||
"zuel",
|
||||
"湖南师范大学",
|
||||
"湖南师大",
|
||||
"hunan normal university",
|
||||
"hunnu",
|
||||
"延边大学",
|
||||
"延大",
|
||||
"yanbian university",
|
||||
"ybu",
|
||||
"东北师范大学",
|
||||
"东北师大",
|
||||
"northeast normal university",
|
||||
"nenu",
|
||||
"苏州大学",
|
||||
"苏大",
|
||||
"soochow university",
|
||||
"suda",
|
||||
"南京航空航天大学",
|
||||
"南航",
|
||||
"nanjing aero-space university",
|
||||
"nuaa",
|
||||
"南京理工大学",
|
||||
"南理工",
|
||||
"institutes of technology of nanjing",
|
||||
"njust",
|
||||
"中国矿业大学",
|
||||
"中国矿大",
|
||||
"china mining university",
|
||||
"cumt",
|
||||
"南京邮电大学",
|
||||
"南邮",
|
||||
"nanjing university of posts and telecommunications",
|
||||
"njupt",
|
||||
"河海大学",
|
||||
"河海",
|
||||
"river sea university",
|
||||
"hhu",
|
||||
"江南大学",
|
||||
"江南大",
|
||||
"jiangnan university",
|
||||
"jiangnan",
|
||||
"南京林业大学",
|
||||
"南林",
|
||||
"nanjing forestry university",
|
||||
"njfu",
|
||||
"南京信息工程大学",
|
||||
"南信大",
|
||||
"nanjing university of information science and technology",
|
||||
"nuist",
|
||||
"南京农业大学",
|
||||
"南农",
|
||||
"南农大",
|
||||
"南京农大",
|
||||
"agricultural university of nanjing",
|
||||
"njau",
|
||||
"nau",
|
||||
"南京中医药大学",
|
||||
"南中医",
|
||||
"nanjing university of chinese medicine",
|
||||
"njucm",
|
||||
"中国药科大学",
|
||||
"中国药大",
|
||||
"china medicine university",
|
||||
"cpu",
|
||||
"南京师范大学",
|
||||
"南京师大",
|
||||
"南师大",
|
||||
"南师",
|
||||
"nanjing normal university",
|
||||
"nnu",
|
||||
"南昌大学",
|
||||
"昌大",
|
||||
"university of nanchang","nanchang university",
|
||||
"ncu",
|
||||
"辽宁大学",
|
||||
"辽大",
|
||||
"liaoning university",
|
||||
"lnu",
|
||||
"大连海事大学",
|
||||
"大连海大",
|
||||
"海大",
|
||||
"maritime affairs university of dalian",
|
||||
"dmu",
|
||||
"内蒙古大学",
|
||||
"内大",
|
||||
"university of the inner mongol","inner mongolia university",
|
||||
"imu",
|
||||
"宁夏大学",
|
||||
"宁大",
|
||||
"ningxia university",
|
||||
"nxu",
|
||||
"青海大学",
|
||||
"清大",
|
||||
"qinghai university",
|
||||
"qhu",
|
||||
"中国石油大学",
|
||||
"中石大",
|
||||
"china university of petroleum beijing",
|
||||
"upc",
|
||||
"太原理工大学",
|
||||
"太原理工",
|
||||
"institutes of technology of taiyuan","taiyuan university of technology",
|
||||
"tyut",
|
||||
"西北大学",
|
||||
"西大",
|
||||
"northwest university",
|
||||
"nwu",
|
||||
"西安电子科技大学",
|
||||
"西电",
|
||||
"xidian university",
|
||||
"xdu",
|
||||
"长安大学",
|
||||
"长大",
|
||||
"chang`an university",
|
||||
"chu",
|
||||
"陕西师范大学",
|
||||
"陕西师大",
|
||||
"陕师大",
|
||||
"shaanxi normal university",
|
||||
"snnu",
|
||||
"第四军医大学",
|
||||
"空军军医大学","四医大",
|
||||
"air force medical university",
|
||||
"fmmu",
|
||||
"华东理工大学",
|
||||
"华理",
|
||||
"east china university of science",
|
||||
"ecust",
|
||||
"东华大学",
|
||||
"东华",
|
||||
"donghua university",
|
||||
"dhu",
|
||||
"上海海洋大学",
|
||||
"上海海大",
|
||||
"shanghai ocean university",
|
||||
"shou",
|
||||
"上海中医药大学",
|
||||
"上中医",
|
||||
"shanghai university of traditional chinese medicine",
|
||||
"shutcm",
|
||||
"上海外国语大学",
|
||||
"上外",
|
||||
"shanghai international studies university",
|
||||
"sisu",
|
||||
"上海财经大学",
|
||||
"上海财大",
|
||||
"上财",
|
||||
"shanghai university of finance",
|
||||
"sufe",
|
||||
"上海体育学院",
|
||||
"shanghai university of sport",
|
||||
"上海音乐学院",
|
||||
"上音",
|
||||
"shanghai conservatory of music",
|
||||
"shcm",
|
||||
"上海大学",
|
||||
"上大",
|
||||
"shanghai university",
|
||||
"第二军医大学",
|
||||
"海军军医大学",
|
||||
"naval medical university",
|
||||
"西南交通大学",
|
||||
"西南交大",
|
||||
"southwest jiaotong university",
|
||||
"swjtu",
|
||||
"西南石油大学",
|
||||
"西南石大",
|
||||
"southwest petroleum university",
|
||||
"swpu",
|
||||
"成都理工大学",
|
||||
"成都理工",
|
||||
"chengdu university of technology",
|
||||
"cdut ",
|
||||
"四川农业大学",
|
||||
"川农",
|
||||
"川农大",
|
||||
"sichuan agricultural university",
|
||||
"sicau",
|
||||
"成都中医药大学",
|
||||
"成中医",
|
||||
"chengdu university of tcm",
|
||||
"cdutcm",
|
||||
"西南财经大学",
|
||||
"西南财大",
|
||||
"西财",
|
||||
"southwestern university of finance and economics",
|
||||
"swufe",
|
||||
"天津工业大学",
|
||||
"天工大",
|
||||
"tianjin university of technology",
|
||||
"tgu",
|
||||
"天津医科大学",
|
||||
"天津医大",
|
||||
"medical university of tianjin",
|
||||
"tmu",
|
||||
"天津中医药大学",
|
||||
"天中",
|
||||
"tianjin university of traditional chinese medicine",
|
||||
"tutcm",
|
||||
"华北电力大学",
|
||||
"华电",
|
||||
"north china electric power university",
|
||||
"ncepu",
|
||||
"河北工业大学",
|
||||
"河工大",
|
||||
"hebei university of technology",
|
||||
"hebut",
|
||||
"西藏大学",
|
||||
"藏大",
|
||||
"tibet university",
|
||||
"tu",
|
||||
"石河子大学",
|
||||
"石大",
|
||||
"shihezi university",
|
||||
"中国美术学院",
|
||||
"中国美院",
|
||||
"国美",
|
||||
"china academy of art",
|
||||
"caa",
|
||||
"宁波大学",
|
||||
"宁大",
|
||||
"ningbo university",
|
||||
"nbu",
|
||||
"西南大学",
|
||||
"西大",
|
||||
"southwest university",
|
||||
"swu",
|
||||
"安徽大学",
|
||||
"安大",
|
||||
"university of anhui",
|
||||
"ahu",
|
||||
"合肥工业大学",
|
||||
"合肥工大",
|
||||
"合工大",
|
||||
"hefei university of technology",
|
||||
"hfut",
|
||||
"中国地质大学",
|
||||
"地大",
|
||||
"china university of geosciences",
|
||||
"cug",
|
||||
"中国地质大学",
|
||||
"地大",
|
||||
"北京地大",
|
||||
"cugb",
|
||||
"中国矿业大学",
|
||||
"中国矿大",
|
||||
"china university of mining & technology",
|
||||
"cumtb",
|
||||
"中国石油大学",
|
||||
"中石大",
|
||||
"石大",
|
||||
"china university of petroleum",
|
||||
"cup",
|
||||
"中国石油大学",
|
||||
"中石大",
|
||||
"cup"]
|
1627
deepdoc/parser/resume/entities/res/school.rank.csv
Normal file
1627
deepdoc/parser/resume/entities/res/school.rank.csv
Normal file
File diff suppressed because it is too large
Load Diff
5713
deepdoc/parser/resume/entities/res/schools.csv
Normal file
5713
deepdoc/parser/resume/entities/res/schools.csv
Normal file
File diff suppressed because it is too large
Load Diff
62
deepdoc/parser/resume/entities/schools.py
Normal file
62
deepdoc/parser/resume/entities/schools.py
Normal file
@ -0,0 +1,62 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
import os, json,re,copy
|
||||
import pandas as pd
|
||||
# Directory of this module; the resource files live in "res/" next to it.
current_file_path = os.path.dirname(os.path.abspath(__file__))

# School table: tab-separated with a header row; missing cells become "".
TBL = pd.read_csv(os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0).fillna("")
# English names are compared in lower case without surrounding blanks.
TBL["name_en"] = TBL["name_en"].map(lambda x: x.lower().strip())

# The list contains Chinese text, so read it explicitly as UTF-8 (the
# platform default encoding is not UTF-8 everywhere) and close the file.
with open(os.path.join(current_file_path, "res/good_sch.json"), "r", encoding="utf-8") as _good_sch_file:
    GOOD_SCH = json.load(_good_sch_file)
# Normalize entries with the same character set that is_good() strips from
# its query (quotes, comma, period, blank, ampersand, parentheses and
# semicolons, ASCII and full-width); otherwise entries that contain a quote
# such as "ji'nan university" could never be matched.
GOOD_SCH = {
    re.sub(r"['\uff07`\u2018\u2019\u201c\u201d,. &()\uff08\uff09;\uff1b]+", "", c)
    for c in GOOD_SCH
}
|
||||
|
||||
|
||||
def loadRank(fnm):
    """Fill the ``rank`` column of ``TBL`` from a comma-separated rank file.

    Every row first receives the placeholder rank 1000000. Each line of
    ``fnm`` is then split on commas: the first field is a school name,
    compared exactly with ``name_cn`` and ``name_en``, and the second field
    is its integer rank.

    Args:
        fnm: Path of the rank file; it is read as UTF-8.
    """
    TBL["rank"] = 1000000
    with open(fnm, "r", encoding="UTF-8") as f:
        for line in f:
            fields = line.strip("\n").split(",")
            try:
                nm, rk = fields[0].strip(), int(fields[1])
            except (IndexError, ValueError):
                # A line without a second field, or whose second field is
                # not an integer, carries no usable rank: skip it.
                continue
            # A name that matches no row selects nothing, so the table is
            # left unchanged for it.
            TBL.loc[(TBL.name_cn == nm) | (TBL.name_en == nm), "rank"] = rk
|
||||
|
||||
|
||||
loadRank(os.path.join(current_file_path, "res/school.rank.csv"))
|
||||
|
||||
|
||||
def split(txt):
    """Split *txt* on runs of spaces/tabs, keeping Latin-letter phrases together.

    A piece that starts with an ASCII letter is glued (with a single space)
    onto the previous piece when that one ends with an ASCII letter, so an
    English multi-word name stays one token while other pieces are separated.
    """
    pieces = re.sub(r"[ \t]+", " ", txt).split(" ")
    merged = []
    for piece in pieces:
        continues_latin = (
            bool(merged)
            and re.match(r".*[a-zA-Z]$", merged[-1]) is not None
            and re.match(r"[a-zA-Z]", piece) is not None
        )
        if not continues_latin:
            merged.append(piece)
            continue
        merged[-1] = merged[-1] + " " + piece
    return merged
|
||||
|
||||
|
||||
def select(nm):
    """Look up a school in ``TBL`` by Chinese name, English name or alias.

    *nm* may be a string or a list (only its first element is used). The
    name is reduced to its first token from ``split``, lower-cased, stripped
    of parenthesised parts, a leading "the ", punctuation and a leading
    country word, and "大学...学院" is collapsed to "大学".

    Returns the first matching row as a plain dict (it also carries a
    ``hit_alias`` flag telling whether the alias column matched), or ``None``
    when *nm* is empty, normalises to an empty string, or matches nothing.
    """
    global TBL
    if not nm:
        return
    if isinstance(nm, list):
        nm = str(nm[0])
    # Strip first: with leading whitespace the first token would be "".
    nm = split(str(nm).strip())[0]
    nm = nm.lower().strip()
    nm = re.sub(r"[((][^()()]+[))]", "", nm)
    nm = re.sub(r"(^the |[,.&()();;·]+|^(英国|美国|瑞士))", "", nm)
    nm = re.sub(r"大学.*学院", "大学", nm)
    if not nm:
        # An empty key would equal the (empty) alias of every school that has
        # no alias and return an arbitrary row.
        return

    # Build the match mask directly on TBL; no deep copy of the whole table
    # is needed just to attach a temporary column.
    hit_alias = TBL["alias"].map(lambda x: nm in set(x.split("+")))
    matched = (TBL.name_cn == nm) | (TBL.name_en == nm) | hit_alias
    if not matched.any():
        return

    first = TBL[matched].head(1)
    first = first.assign(hit_alias=hit_alias[matched].head(1))
    return json.loads(first.to_json(orient="records"))[0]
|
||||
|
||||
|
||||
def is_good(nm):
    """Return True when *nm*, once normalised, is a member of ``GOOD_SCH``.

    Normalisation lower-cases the name, drops parenthesised parts and then
    removes quotes, punctuation and spaces.
    """
    global GOOD_SCH
    key = nm.lower()
    for pattern in (r"[((][^()()]+[))]", r"[''`‘’“”,. &()();;]+"):
        key = re.sub(pattern, "", key)
    return key in GOOD_SCH
|
||||
|
174
deepdoc/parser/resume/step_one.py
Normal file
174
deepdoc/parser/resume/step_one.py
Normal file
@ -0,0 +1,174 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
from deepdoc.parser.resume.entities import degrees, regions, industries
|
||||
|
||||
FIELDS = [
|
||||
"address STRING",
|
||||
"annual_salary int",
|
||||
"annual_salary_from int",
|
||||
"annual_salary_to int",
|
||||
"birth STRING",
|
||||
"card STRING",
|
||||
"certificate_obj string",
|
||||
"city STRING",
|
||||
"corporation_id int",
|
||||
"corporation_name STRING",
|
||||
"corporation_type STRING",
|
||||
"degree STRING",
|
||||
"discipline_name STRING",
|
||||
"education_obj string",
|
||||
"email STRING",
|
||||
"expect_annual_salary int",
|
||||
"expect_city_names string",
|
||||
"expect_industry_name STRING",
|
||||
"expect_position_name STRING",
|
||||
"expect_salary_from int",
|
||||
"expect_salary_to int",
|
||||
"expect_type STRING",
|
||||
"gender STRING",
|
||||
"industry_name STRING",
|
||||
"industry_names STRING",
|
||||
"is_deleted STRING",
|
||||
"is_fertility STRING",
|
||||
"is_house STRING",
|
||||
"is_management_experience STRING",
|
||||
"is_marital STRING",
|
||||
"is_oversea STRING",
|
||||
"language_obj string",
|
||||
"name STRING",
|
||||
"nation STRING",
|
||||
"phone STRING",
|
||||
"political_status STRING",
|
||||
"position_name STRING",
|
||||
"project_obj string",
|
||||
"responsibilities string",
|
||||
"salary_month int",
|
||||
"scale STRING",
|
||||
"school_name STRING",
|
||||
"self_remark string",
|
||||
"skill_obj string",
|
||||
"title_name STRING",
|
||||
"tob_resume_id STRING",
|
||||
"updated_at Timestamp",
|
||||
"wechat STRING",
|
||||
"work_obj string",
|
||||
"work_experience int",
|
||||
"work_start_time BIGINT"
|
||||
]
|
||||
|
||||
def refactor(df):
    """Flatten the JSON in ``df["resume_content"]`` into plain columns.

    The JSON is decoded into an ``obj`` column, selected keys are copied out
    into their own columns, a few values are translated or normalised, and
    the first row is returned as a dict keyed by the names in ``FIELDS``.

    NOTE(review): the returned dict pairs the ``FIELDS`` names positionally
    with the values of the alphabetically sorted columns, so both sequences
    must line up. Only row 0 is returned - presumably *df* holds a single
    resume; confirm against the caller.
    """
    def deal_obj(obj, k, kk):
        # Return obj[k][kk], or "" when either level is not a dict.
        if not isinstance(obj, type({})):
            return ""
        obj = obj.get(k, {})
        if not isinstance(obj, type({})):
            return ""
        return obj.get(kk, "")

    def loadjson(line):
        # Best-effort decode: anything that is not valid JSON becomes {}.
        try:
            return json.loads(line)
        except Exception as e:
            pass
        return {}

    df["obj"] = df["resume_content"].map(lambda x: loadjson(x))
    df.fillna("", inplace=True)

    # Names of the columns that make up the output.
    clms = ["tob_resume_id", "updated_at"]

    def extract(nms, cc=None):
        # Create one column per name in nms. With cc, the value is read from
        # obj[cc][name]; without it, obj[name] is re-serialised as JSON text.
        nonlocal clms
        clms.extend(nms)
        for c in nms:
            if cc:
                df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
            else:
                df[c] = df["obj"].map(
                    lambda x: json.dumps(
                        x.get(
                            c,
                            {}),
                        ensure_ascii=False) if isinstance(
                        x,
                        type(
                            {})) and (
                        isinstance(
                            x.get(c),
                            type(
                                {})) or not x.get(c)) else str(x).replace(
                        "None",
                        ""))

    extract(["education", "work", "certificate", "project", "language",
             "skill"])
    extract(["wechat", "phone", "is_deleted",
             "name", "tel", "email"], "contact")
    extract(["nation", "expect_industry_name", "salary_month",
             "industry_ids", "is_house", "birth", "annual_salary_from",
             "annual_salary_to", "card",
             "expect_salary_to", "expect_salary_from",
             "expect_position_name", "gender", "city",
             "is_fertility", "expect_city_names",
             "political_status", "title_name", "expect_annual_salary",
             "industry_name", "address", "position_name", "school_name",
             "corporation_id",
             "is_oversea", "responsibilities",
             "work_start_time", "degree", "management_experience",
             "expect_type", "corporation_type", "scale", "corporation_name",
             "self_remark", "annual_salary", "work_experience",
             "discipline_name", "marital", "updated_at"], "basic")

    # Translate stored codes through the entity helpers
    # (presumably code -> display name; verify against those modules).
    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
    df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
    df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in
                                                                       str(x).split(",")]))
    clms.append("industry_names")

    def arr2str(a):
        # Join a list with spaces and replace commas with spaces.
        if not a:
            return ""
        if isinstance(a, list):
            a = " ".join([str(i) for i in a])
        return str(a).replace(",", " ")

    df["expect_industry_name"] = df["expect_industry_name"].map(
        lambda x: arr2str(x))
    df["gender"] = df["gender"].map(
        lambda x: "男" if x == 'M' else (
            "女" if x == 'F' else ""))
    # Y/N flags become the Chinese words for yes/no; anything else becomes "".
    for c in ["is_fertility", "is_oversea", "is_house",
              "management_experience", "marital"]:
        df[c] = df[c].map(
            lambda x: '是' if x == 'Y' else (
                '否' if x == 'N' else ""))
    df["is_management_experience"] = df["management_experience"]
    df["is_marital"] = df["marital"]
    clms.extend(["is_management_experience", "is_marital"])

    df.fillna("", inplace=True)
    # Fall back to "tel" when "phone" is blank.
    # NOTE(review): df.loc[i, ...] with i from range(len(df)) assumes the
    # index labels are 0..len-1.
    for i in range(len(df)):
        if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip():
            df.loc[i, "phone"] = df.loc[i, "tel"].strip()

    # Drop the helper columns (first occurrence of each) from the output list.
    for n in ["industry_ids", "management_experience", "marital", "tel"]:
        for i in range(len(clms)):
            if clms[i] == n:
                del clms[i]
                break

    # "updated_at" was added twice; de-duplicate.
    clms = list(set(clms))

    df = df.reindex(sorted(clms), axis=1)
    #print(json.dumps(list(df.columns.values)), "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL")
    # Make every cell a single-line string: tabs become spaces, CR and LF
    # become a literal backslash-n.
    for c in clms:
        df[c] = df[c].map(
            lambda s: str(s).replace(
                "\t",
                " ").replace(
                "\n",
                "\\n").replace(
                "\r",
                "\\n"))
    # print(df.values.tolist())
    return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
|
580
deepdoc/parser/resume/step_two.py
Normal file
580
deepdoc/parser/resume/step_two.py
Normal file
@ -0,0 +1,580 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import re, copy, time, datetime, demjson, \
|
||||
traceback, signal
|
||||
import numpy as np
|
||||
from deepdoc.parser.resume.entities import degrees, schools, corporations
|
||||
from rag.nlp import huqie, surname
|
||||
from xpinyin import Pinyin
|
||||
from contextlib import contextmanager
|
||||
|
||||
|
||||
class TimeoutException(Exception): pass
|
||||
|
||||
|
||||
@contextmanager
def time_limit(seconds):
    """Context manager that raises ``TimeoutException`` after *seconds*.

    A SIGALRM handler is installed and an alarm scheduled for the duration
    of the ``with`` body. On exit the alarm is cancelled and the handler that
    was active before entry is put back, so the timeout does not leak into
    unrelated code or clobber an outer handler.

    Relies on ``signal.SIGALRM``/``signal.alarm``, which the ``signal``
    module only provides on Unix, and ``signal.signal`` may only be called
    from the main thread.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
        # signal.signal() returns None when the old handler was not installed
        # from Python; there is nothing that can be restored in that case.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
|
||||
|
||||
|
||||
ENV = None
|
||||
PY = Pinyin()
|
||||
|
||||
|
||||
def rmHtmlTag(line):
    """Replace simple HTML-like tags in *line* with a single space.

    A tag is "<", one or more letters, digits or the characters
    ``. " = ' ; , : + _ /`` space and "-", then ">". Matching is
    case-insensitive and at most 100000 replacements are made.
    """
    # count/flags are passed by keyword: the positional form is deprecated
    # and easy to mix up.
    return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, count=100000, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
def highest_degree(dg):
    """Return the highest-ranked degree name in *dg*.

    *dg* is a single degree name or a collection of names. Names missing
    from the ranking table rank below every known one. On ties the earliest
    entry wins. An empty or falsy *dg* gives "".
    """
    if not dg:
        return ""
    if isinstance(dg, str):
        dg = [dg]
    m = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
    # max() keeps the first of equally ranked items, like the stable
    # descending sort it replaces.
    return max(dg, key=lambda d: m.get(d, -1))
|
||||
|
||||
|
||||
def forEdu(cv):
    """Derive education features from ``cv["education_obj"]`` and store them in *cv*.

    Without education entries ``cv["integerity_flt"]`` is multiplied by 0.8
    and *cv* is returned unchanged otherwise. With entries, they are visited
    in ``start_time`` order and the function fills, where data exists:
    per-entry records (``edu_nst``), school / major / degree keyword lists,
    first-degree fields, start and end years, the best school rank and its
    tier (``sch_rank_kwd``), quality tags (``tag_kwd``), an estimated
    ``work_exp_flt`` and tokenised text fields.

    Returns the same *cv* dict.
    """
    if not cv.get("education_obj"):
        cv["integerity_flt"] *= 0.8
        return cv

    first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
    edu_nst = []
    edu_end_dt = ""
    cv["school_rank_int"] = 1000000
    for n in sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3")):
        # Per-entry record. It is deliberately not called `e`: an
        # `except ... as e` clause unbinds that name when it finishes, which
        # used to break every later access to the record.
        edu = {}
        if n.get("end_time"):
            if n["end_time"] > edu_end_dt: edu_end_dt = n["end_time"]
            try:
                dt = n["end_time"]
                if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
                y, m, d = getYMD(dt)
                ed_dt.append(str(y))
                edu["end_dt_kwd"] = str(y)
            except Exception:
                # Best effort: an unparsable date just leaves the year out.
                pass
        if n.get("start_time"):
            try:
                dt = n["start_time"]
                if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
                y, m, d = getYMD(dt)
                st_dt.append(str(y))
                edu["start_dt_kwd"] = str(y)
            except Exception:
                pass

        r = schools.select(n.get("school_name", ""))
        if r:
            if str(r.get("type", "")) in ("1", "2"): fea.append("211")
            if str(r.get("is_abroad", "")) == "1": fea.append("留学")
            if str(r.get("is_double_first", "")) == "1": fea.append("双一流")
            if str(r.get("is_985", "")) == "1": fea.append("985")
            if str(r.get("is_world_known", "")) == "1": fea.append("海外知名")
            if r.get("rank") and cv["school_rank_int"] > r["rank"]: cv["school_rank_int"] = r["rank"]

        if n.get("school_name") and isinstance(n["school_name"], str):
            sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
            edu["sch_nm_kwd"] = sch[-1]
        fea.append(huqie.qieqie(huqie.qie(n.get("school_name", ""))).split(" ")[-1])

        if n.get("discipline_name") and isinstance(n["discipline_name"], str):
            maj.append(n["discipline_name"])
            edu["major_kwd"] = n["discipline_name"]

        if not n.get("degree") and "985" in fea and not first_fea: n["degree"] = "1"

        if n.get("degree"):
            d = degrees.get_name(n["degree"])
            if d: edu["degree_kwd"] = d
            if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg
                              or re.search(r"(成人|自考|自学考试)", n.get("school_name", ""))):
                d = "专升本"
            if d: deg.append(d)

            # for first degree
            if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
                fdeg = [d]
                if n.get("school_name"): fsch = [n["school_name"]]
                if n.get("discipline_name"): fmaj = [n["discipline_name"]]
                first_fea = copy.deepcopy(fea)

        edu_nst.append(edu)

    # The tag appended above for world-known schools is "海外知名"; test for that
    # exact tag so the overseas tiers can actually be reached.
    world_known = "海外知名" in fea
    rank = cv["school_rank_int"]
    cv["sch_rank_kwd"] = []
    if rank <= 20 \
            or (world_known and rank <= 200):
        cv["sch_rank_kwd"].append("顶尖学校")
    elif rank <= 50 and rank > 20 \
            or (world_known and rank <= 500 and rank > 200):
        cv["sch_rank_kwd"].append("精英学校")
    elif rank > 50 and ("985" in fea or "211" in fea) \
            or (world_known and rank > 500):
        cv["sch_rank_kwd"].append("优质学校")
    else:
        cv["sch_rank_kwd"].append("一般学校")

    if edu_nst: cv["edu_nst"] = edu_nst
    if fea: cv["edu_fea_kwd"] = list(set(fea))
    if first_fea: cv["edu_first_fea_kwd"] = list(set(first_fea))
    if maj: cv["major_kwd"] = maj
    if fsch: cv["first_school_name_kwd"] = fsch
    if fdeg: cv["first_degree_kwd"] = fdeg
    if fmaj: cv["first_major_kwd"] = fmaj
    if st_dt: cv["edu_start_kwd"] = st_dt
    if ed_dt: cv["edu_end_kwd"] = ed_dt
    # An end date without a year (e.g. "至今") is stored as "", which int()
    # cannot parse; only numeric years take part in the maximum.
    end_years = [int(t) for t in ed_dt if t.isdigit()]
    if end_years: cv["edu_end_int"] = max(end_years)
    if deg:
        if "本科" in deg and "专科" in deg:
            deg.append("专升本")
            deg = [d for d in deg if d != '本科']
        cv["degree_kwd"] = deg
        cv["highest_degree_kwd"] = highest_degree(deg)
    if edu_end_dt:
        try:
            if re.match(r"[0-9]{9,}", edu_end_dt): edu_end_dt = turnTm2Dt(edu_end_dt)
            if edu_end_dt.strip("\n") == "至今": edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
            y, m, d = getYMD(edu_end_dt)
            cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
        except Exception as e:
            print("EXCEPTION: ", e, edu_end_dt, cv.get("work_exp_flt"))
    if sch:
        cv["school_name_kwd"] = sch
        if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
                or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
                or not cv.get("degree_kwd"):
            for c in sch:
                if schools.is_good(c):
                    if "tag_kwd" not in cv: cv["tag_kwd"] = []
                    cv["tag_kwd"].append("好学校")
                    cv["tag_kwd"].append("好学历")
                    break
        if (len(cv.get("degree_kwd", [])) >= 1 and \
            "本科" in cv["degree_kwd"] and \
            any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
                or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
                or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
            if "tag_kwd" not in cv: cv["tag_kwd"] = []
            if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")

    if cv.get("major_kwd"): cv["major_tks"] = huqie.qie(" ".join(maj))
    if cv.get("school_name_kwd"): cv["school_name_tks"] = huqie.qie(" ".join(sch))
    if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = huqie.qie(" ".join(fsch))
    if cv.get("first_major_kwd"): cv["first_major_tks"] = huqie.qie(" ".join(fmaj))

    return cv
|
||||
|
||||
|
||||
def forProj(cv):
    """Derive project features from ``cv["project_obj"]`` and store them in *cv*.

    Projects are visited newest first (by the string form of ``updated_at``).
    The name of the first project goes into ``project_name_tks``. Description,
    responsibility and achievement texts are tokenised into
    ``pro_desc_ltks`` (all of them) and ``project_desc_ltks`` (the first one).
    Entries that are not dicts are ignored.

    Returns the same *cv* dict.
    """
    if not cv.get("project_obj"): return cv

    pro_nms, desc = [], []
    # The old sort key tolerated non-dict entries but the loop body then
    # crashed on them; drop them up front instead.
    projects = [p for p in cv.get("project_obj", []) if isinstance(p, dict)]
    for n in sorted(projects, key=lambda x: str(x.get("updated_at", "")), reverse=True):
        if n.get("name"): pro_nms.append(n["name"])
        if n.get("describe"): desc.append(str(n["describe"]))
        if n.get("responsibilities"): desc.append(str(n["responsibilities"]))
        if n.get("achivement"): desc.append(str(n["achivement"]))

    if pro_nms:
        cv["project_name_tks"] = huqie.qie(pro_nms[0])
    if desc:
        cv["pro_desc_ltks"] = huqie.qie(rmHtmlTag(" ".join(desc)))
        cv["project_desc_ltks"] = huqie.qie(rmHtmlTag(desc[0]))

    return cv
|
||||
|
||||
|
||||
def json_loads(line):
    """Decode a JSON-like string with ``demjson``.

    Bare ``True``/``False`` values that follow a colon are wrapped in single
    quotes before decoding.
    """
    quoted = re.sub(r": *(True|False)", r": '\1'", line)
    return demjson.decode(quoted)
|
||||
|
||||
|
||||
def forWork(cv):
    """Derive work-history features from ``cv["work_obj"]`` and store them in *cv*.

    Without work entries ``cv["integerity_flt"]`` is multiplied by 0.7 and
    *cv* is returned. Otherwise the jobs are visited newest first (by
    ``start_time``) and the function collects position / company / industry
    / responsibility texts, company tags, job durations in days, company
    scales and the earliest start time, then writes the derived ``*_tks``,
    ``*_kwd``, ``*_int`` and ``*_flt`` fields.

    Returns the same *cv* dict.
    """
    if not cv.get("work_obj"):
        cv["integerity_flt"] *= 0.7
        return cv

    flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
            "industry_name", "subordinates_count"]
    duas = []
    scales = []
    fea = {c: [] for c in flds}
    latest_job_tm = ""
    goodcorp = False
    goodcorp_ = False
    work_st_tm = ""
    corp_tags = []
    for i, n in enumerate(
            sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if type(x) == type({}) else "",
                   reverse=True)):
        if type(n) == type(""):
            try:
                n = json_loads(n)
            except Exception:
                continue

        if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm): work_st_tm = n["start_time"]
        for c in flds:
            if not n.get(c) or str(n[c]) == '0':
                fea[c].append("")
                continue
            if c == "corporation_name":
                n[c] = corporations.corpNorm(n[c], False)
                if corporations.is_good(n[c]):
                    # i == 0 is the most recent job, anything else a past one.
                    if i == 0:
                        goodcorp = True
                    else:
                        goodcorp_ = True
                ct = corporations.corp_tag(n[c])
                if i == 0:
                    corp_tags.extend(ct)
                elif ct and ct[0] != "软外":
                    corp_tags.extend([f"{t}(曾)" for t in ct])

            fea[c].append(rmHtmlTag(str(n[c]).lower()))

        y, m, d = getYMD(n.get("start_time"))
        if not y or not m: continue
        st = "%s-%02d-%02d" % (y, int(m), int(d))
        # NOTE(review): jobs are visited newest first, so after the loop this
        # holds the start of the oldest parsable job - confirm that is the
        # value wanted for "latest_job_dt".
        latest_job_tm = st

        y, m, d = getYMD(n.get("end_time"))
        if (not y or not m) and i > 0: continue
        # NOTE(review): 2022 is a hard-coded cut-off year for falling back
        # to updated_at.
        if not y or not m or int(y) > 2022: y, m, d = getYMD(str(n.get("updated_at", "")))
        if not y or not m: continue
        ed = "%s-%02d-%02d" % (y, int(m), int(d))

        try:
            duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
        except Exception:
            print("kkkkkkkkkkkkkkkkkkkk", n.get("start_time"), n.get("end_time"))

        if n.get("scale"):
            r = re.search(r"^([0-9]+)", str(n["scale"]))
            if r: scales.append(int(r.group(1)))

    if goodcorp:
        if "tag_kwd" not in cv: cv["tag_kwd"] = []
        cv["tag_kwd"].append("好公司")
    if goodcorp_:
        if "tag_kwd" not in cv: cv["tag_kwd"] = []
        cv["tag_kwd"].append("好公司(曾)")

    if corp_tags:
        if "tag_kwd" not in cv: cv["tag_kwd"] = []
        cv["tag_kwd"].extend(corp_tags)
        cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]

    if latest_job_tm: cv["latest_job_dt"] = latest_job_tm
    if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]

    if fea["position_name"]:
        cv["position_name_tks"] = huqie.qie(fea["position_name"][0])
        cv["position_name_sm_tks"] = huqie.qieqie(cv["position_name_tks"])
        cv["pos_nm_tks"] = huqie.qie(" ".join(fea["position_name"][1:]))

    if fea["industry_name"]:
        cv["industry_name_tks"] = huqie.qie(fea["industry_name"][0])
        cv["industry_name_sm_tks"] = huqie.qieqie(cv["industry_name_tks"])
        cv["indu_nm_tks"] = huqie.qie(" ".join(fea["industry_name"][1:]))

    if fea["corporation_name"]:
        cv["corporation_name_kwd"] = fea["corporation_name"][0]
        cv["corp_nm_kwd"] = fea["corporation_name"]
        cv["corporation_name_tks"] = huqie.qie(fea["corporation_name"][0])
        cv["corporation_name_sm_tks"] = huqie.qieqie(cv["corporation_name_tks"])
        cv["corp_nm_tks"] = huqie.qie(" ".join(fea["corporation_name"][1:]))

    if fea["responsibilities"]:
        cv["responsibilities_ltks"] = huqie.qie(fea["responsibilities"][0])
        cv["resp_ltks"] = huqie.qie(" ".join(fea["responsibilities"][1:]))

    # Keep only purely numeric counts. The previous pattern "[^0-9]+$"
    # selected the NON-numeric entries, so the list was either emptied or
    # int() raised ValueError.
    if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
                                                               re.match(r"[0-9]+$", str(i))]
    if fea["subordinates_count"]: cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])

    if type(cv.get("corporation_id")) == type(1): cv["corporation_id"] = [str(cv["corporation_id"])]
    if not cv.get("corporation_id"): cv["corporation_id"] = []
    for i in cv.get("corporation_id", []):
        cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)

    if work_st_tm:
        try:
            if re.match(r"[0-9]{9,}", work_st_tm): work_st_tm = turnTm2Dt(work_st_tm)
            y, m, d = getYMD(work_st_tm)
            cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
        except Exception as e:
            print("EXCEPTION: ", e, work_st_tm, cv.get("work_exp_flt"))

    cv["job_num_int"] = 0
    if duas:
        cv["dua_flt"] = np.mean(duas)
        cv["cur_dua_int"] = duas[0]
        cv["job_num_int"] = len(duas)
    if scales: cv["scale_flt"] = np.max(scales)
    return cv
|
||||
|
||||
|
||||
def turnTm2Dt(b):
    """Turn an epoch timestamp into a ``YYYY-MM-DD HH:MM:SS`` local-time string.

    Falsy input gives ``None``. Otherwise the stripped string form of *b* is
    returned as is, unless it starts with ten or more digits: then its first
    ten digits are read as epoch seconds and formatted.
    """
    if not b:
        return None
    text = str(b).strip()
    if re.match(r"[0-9]{10,}", text) is None:
        return text
    epoch_seconds = int(text[:10])
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(epoch_seconds))
|
||||
|
||||
|
||||
def getYMD(b):
    """Extract ``(year, month, day)`` from a date string or epoch timestamp.

    The year is an ``int`` taken from the first four characters when they are
    digits, otherwise "". Month and day are digit strings captured by regex.
    A missing or out-of-range month or day becomes "1". Falsy input gives
    ``("", "", "01")``.
    """
    if not b:
        return ("", "", "01")
    text = turnTm2Dt(b)

    year = int(text[:4]) if re.match(r"[0-9]{4}", text) else ""

    month_hit = re.search(r"[0-9]{4}.?([0-9]{1,2})", text)
    month = month_hit.group(1) if month_hit else ""

    day_hit = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", text)
    day = day_hit.group(1) if day_hit else "01"

    if not day or not 1 <= int(day) <= 31:
        day = "1"
    if not month or not 1 <= int(month) <= 12:
        month = "1"
    return (year, month, day)
|
||||
|
||||
|
||||
def birth(cv):
    """Add birth-date features to *cv* from ``cv["birth"]``.

    Without a birth value ``cv["integerity_flt"]`` is multiplied by 0.9.
    Otherwise, when a year and month can be extracted, ``birth_dt``
    (YYYY-MM-DD), ``birthday_kwd`` (MMDD) and ``age_int`` (current year
    minus birth year) are set.

    Returns the same *cv* dict.
    """
    raw = cv.get("birth")
    if not raw:
        cv["integerity_flt"] *= 0.9
        return cv

    year, month, day = getYMD(cv["birth"])
    if year and month:
        month_no, day_no = int(month), int(day)
        cv["birth_dt"] = "%s-%02d-%02d" % (year, month_no, day_no)
        cv["birthday_kwd"] = "%02d%02d" % (month_no, day_no)
        cv["age_int"] = datetime.datetime.now().year - int(year)
    return cv
|
||||
|
||||
|
||||
def parse(cv):
    """Turn a flat resume dict into the suffixed feature dict that is returned.

    Steps, in order: blank out '\\N' values and drop empty ones; compute a
    completeness score (``integerity_flt``); normalise company type,
    political status and phone; decode the ``*_obj`` JSON fields; derive
    ``*_tks`` / ``*_kwd`` / ``*_int`` fields; build name and pinyin fields;
    run ``forEdu``, ``forProj``, ``forWork`` and ``birth``; estimate work
    experience; finally delete every key that lacks one of the recognised
    suffixes.

    *cv* is modified in place. ``cv["tob_resume_id"]`` must be present: it
    is read unconditionally at the end and copied to ``cv["id"]``.
    """
    # '\N' values are blanked (presumably a NULL marker from an export - verify).
    for k in cv.keys():
        if cv[k] == '\\N': cv[k] = ''
    # cv = cv.asDict()
    # Fields that get a "<name>_tks" tokenised copy.
    tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
               "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
               "position_name", "school_name", "self_remark", "title_name"]
    # Fields that additionally get a "<name>_sm_tks" copy.
    small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]
    # Fields that get a "<name>_kwd" keyword list.
    kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
               "expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
               "industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]
    # Fields copied to "<name>_int".
    num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
               "expect_salary_to", "salary_month"]

    # (field, tag when the value is '是', tag when the value is '否')
    is_fld = [
        ("is_fertility", "已育", "未育"),
        ("is_house", "有房", "没房"),
        ("is_management_experience", "有管理经验", "无管理经验"),
        ("is_marital", "已婚", "未婚"),
        ("is_oversea", "有海外经验", "无海外经验")
    ]

    # Drop None values and empty strings/lists.
    rmkeys = []
    for k in cv.keys():
        if cv[k] is None: rmkeys.append(k)
        if (type(cv[k]) == type([]) or type(cv[k]) == type("")) and len(cv[k]) == 0: rmkeys.append(k)
    for k in rmkeys: del cv[k]

    integerity = 0.
    flds_num = 0.

    def hasValues(flds):
        # Count how many of flds carry a non-empty, non-'0', non-'[]' value.
        nonlocal integerity, flds_num
        flds_num += len(flds)
        for f in flds:
            v = str(cv.get(f, ""))
            if len(v) > 0 and v != '0' and v != '[]': integerity += 1

    hasValues(tks_fld)
    hasValues(small_tks_fld)
    hasValues(kwd_fld)
    hasValues(num_fld)
    # Share of the listed fields that are filled in (a field named in
    # several lists counts once per list).
    cv["integerity_flt"] = integerity / flds_num

    if cv.get("corporation_type"):
        # The substitutions are applied in order; each one works on the
        # result of the previous one.
        for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
                     (r"[//.· <\((]+.*", ""),
                     (r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
                     (r".*(机关|事业).*", "机关"),
                     (r".*(非盈利|Non-profit).*", "非盈利"),
                     (r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
                     (r".*国有.*", "国企"),
                     (r"[ ()\(\)人/·0-9-]+", ""),
                     (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
            cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE)
        if len(cv["corporation_type"]) < 2: del cv["corporation_type"]

    if cv.get("political_status"):
        for p, r in [
            (r".*党员.*", "党员"),
            (r".*(无党派|公民).*", "群众"),
            (r".*团员.*", "团员")]:
            cv["political_status"] = re.sub(p, r, cv["political_status"])
        if not re.search(r"[党团群]", cv["political_status"]): del cv["political_status"]

    # Keep digits only and drop a leading "86" (optionally preceded by
    # zeros) in front of 11 digits.
    if cv.get("phone"): cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))

    # Iterate over a snapshot of the keys: the loop adds new keys to cv.
    keys = list(cv.keys())
    for k in keys:
        # deal with json objects
        if k.find("_obj") > 0:
            try:
                cv[k] = json_loads(cv[k])
                cv[k] = [a for _, a in cv[k].items()]
                nms = []
                for n in cv[k]:
                    if type(n) != type({}) or "name" not in n or not n.get("name"): continue
                    n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
                    if not n["name"]: continue
                    nms.append(n["name"])
                if nms:
                    # "<prefix>_obj" -> "<prefix>"
                    t = k[:-4]
                    cv[f"{t}_kwd"] = nms
                    cv[f"{t}_tks"] = huqie.qie(" ".join(nms))
            except Exception as e:
                print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
                cv[k] = []

        # tokenize fields
        if k in tks_fld:
            cv[f"{k}_tks"] = huqie.qie(cv[k])
            # NOTE(review): forWork builds its *_sm_tks with huqie.qieqie;
            # here huqie.qie is applied a second time - confirm which is
            # intended.
            if k in small_tks_fld: cv[f"{k}_sm_tks"] = huqie.qie(cv[f"{k}_tks"])

        # keyword fields
        if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
                                           for n in re.split(r"[\t,,;;. ]",
                                                             re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k])
                                                             ) if n]

        if k in num_fld and cv.get(k): cv[f"{k}_int"] = cv[k]

    # Overwrites the keyword list built for "email" in the loop above.
    cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
    # for name field
    if cv.get("name"):
        # Cut the name at the first newline, dash, parenthesis or plus sign.
        nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
        nm = re.sub(r"[ \t ]+", " ", nm)
        if re.match(r"[a-zA-Z ]+$", nm):
            # A purely Latin name is kept only when it has at least two words.
            if len(nm.split(" ")) > 1:
                cv["name"] = nm
            else:
                nm = ""
        elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
            nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
        else:
            nm = ""
        cv["name"] = nm.strip()
        name = cv["name"]

        # name pinyin and its prefixes
        cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' '))
        cv["name_py_pref0_tks"] = ""
        cv["name_py_pref_tks"] = ""
        for py in PY.get_pinyins(nm[:20], ''):
            for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i]
        for py in PY.get_pinyins(nm[:20], ' '):
            py = py.split(" ")
            for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i])

        cv["name_kwd"] = name
        cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
        cv["name_tks"] = (
            huqie.qie(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
        ) if name else ""
    else:
        # A resume without a name has its completeness score halved.
        cv["integerity_flt"] /= 2.

    if cv.get("phone"):
        # Keep the first 11-digit run that starts with 1 followed by 3-9.
        r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
        if not r:
            cv["phone"] = ""
        else:
            cv["phone"] = r.group(1)

    # deal with date fields
    if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
        cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
    else:
        y, m, d = getYMD(str(cv.get("updated_at", "")))
        if not y: y = "2012"
        if not m: m = "01"
        if not d: d = "01"
        cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
    # long text tokenize

    if cv.get("responsibilities"): cv["responsibilities_ltks"] = huqie.qie(rmHtmlTag(cv["responsibilities"]))

    # for yes or no field
    fea = []
    for f, y, n in is_fld:
        if f not in cv: continue
        if cv[f] == '是': fea.append(y)
        if cv[f] == '否': fea.append(n)

    if fea: cv["tag_kwd"] = fea

    cv = forEdu(cv)
    cv = forProj(cv)
    cv = forWork(cv)
    cv = birth(cv)

    # Combine each company tag with the school tier(s) and the highest
    # degree, joined by "+".
    cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        for j in cv.get("sch_rank_kwd", []): cv["corp_proj_sch_deg_kwd"][i] += "+" + j
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        if cv.get("highest_degree_kwd"): cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]

    try:
        # Fall back to work_start_time when no experience was derived above.
        if not cv.get("work_exp_flt") and cv.get("work_start_time"):
            if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
                cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
                # The value is divided by 1000 before use, i.e. it is treated
                # as milliseconds here.
                cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
            elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
                y, m, d = getYMD(str(cv["work_start_time"]))
                cv["work_start_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
                cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
    except Exception as e:
        print("【EXCEPTION】", e, "==>", cv.get("work_start_time"))
    # work_experience is divided by 12 (presumably months -> years; verify).
    if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12.

    # Keep only keys that end in one of the recognised suffixes.
    keys = list(cv.keys())
    for k in keys:
        if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k): del cv[k]
    # De-duplicate list-valued *_kwd / *_id fields, drop '中国' and '0', and
    # strip a trailing "市".
    for k in cv.keys():
        if not re.search("_(kwd|id)$", k) or type(cv[k]) != type([]): continue
        cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
    keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
    for k in keys:
        if cv[k] <= 0: del cv[k]

    cv["tob_resume_id"] = str(cv["tob_resume_id"])
    cv["id"] = cv["tob_resume_id"]
    # NOTE(review): looks like a leftover debug print.
    print("CCCCCCCCCCCCCCC")

    return dealWithInt64(cv)
|
||||
|
||||
|
||||
def dealWithInt64(d):
    """Recursively replace NumPy integer scalars with plain ``int``.

    Dicts are updated in place and returned; lists are rebuilt as new lists;
    a NumPy integer becomes ``int``; any other value is returned untouched.
    """
    if isinstance(d, dict):
        for key in d:
            d[key] = dealWithInt64(d[key])
        return d
    if isinstance(d, list):
        return [dealWithInt64(item) for item in d]
    if isinstance(d, np.integer):
        return int(d)
    return d
|
||||
|
@ -64,7 +64,11 @@ def load_model(model_dir, nm):
|
||||
if not os.path.exists(model_file_path):
|
||||
raise ValueError("not find model file path {}".format(
|
||||
model_file_path))
|
||||
sess = ort.InferenceSession(model_file_path)
|
||||
|
||||
if ort.get_device() == "GPU":
|
||||
sess = ort.InferenceSession(model_file_path, providers=['CUDAExecutionProvider'])
|
||||
else:
|
||||
sess = ort.InferenceSession(model_file_path, providers=['CPUExecutionProvider'])
|
||||
return sess, sess.get_inputs()[0]
|
||||
|
||||
|
||||
|
@ -12,7 +12,7 @@
|
||||
#
|
||||
import copy
|
||||
import re
|
||||
from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, \
|
||||
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
|
||||
hierarchical_merge, make_colon_as_title, naive_merge, random_choices
|
||||
from rag.nlp import huqie
|
||||
from deepdoc.parser import PdfParser, DocxParser
|
||||
@ -47,7 +47,7 @@ class Pdf(PdfParser):
|
||||
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Supported file formats are docx, pdf, txt.
|
||||
Since a book is long and not all the parts are useful, if it's a PDF,
|
||||
@ -94,7 +94,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
|
||||
|
||||
sections = [t for t, _ in sections]
|
||||
# is it English
|
||||
eng = is_english(random_choices(sections, k=218))
|
||||
eng = lang.lower() == "english"#is_english(random_choices(sections, k=218))
|
||||
|
||||
res = []
|
||||
# add tables
|
||||
|
@ -14,7 +14,7 @@ import copy
|
||||
import re
|
||||
from io import BytesIO
|
||||
from docx import Document
|
||||
from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
|
||||
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
|
||||
make_colon_as_title
|
||||
from rag.nlp import huqie
|
||||
from deepdoc.parser import PdfParser, DocxParser
|
||||
@ -68,7 +68,7 @@ class Pdf(PdfParser):
|
||||
return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes]
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Supported file formats are docx, pdf, txt.
|
||||
"""
|
||||
@ -106,7 +106,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
|
||||
else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
|
||||
|
||||
# is it English
|
||||
eng = is_english(sections)
|
||||
eng = lang.lower() == "english"#is_english(sections)
|
||||
# Remove 'Contents' part
|
||||
remove_contents_table(sections, eng)
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
import copy
|
||||
import re
|
||||
from deepdoc.parser import tokenize
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import huqie, tokenize
|
||||
from deepdoc.parser import PdfParser
|
||||
from rag.utils import num_tokens_from_string
|
||||
|
||||
@ -57,7 +56,7 @@ class Pdf(PdfParser):
|
||||
return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Only pdf is supported.
|
||||
"""
|
||||
@ -74,7 +73,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
|
||||
doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
|
||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
||||
# is it English
|
||||
eng = pdf_parser.is_english
|
||||
eng = lang.lower() == "english"#pdf_parser.is_english
|
||||
|
||||
res = []
|
||||
# add tables
|
||||
|
@ -13,8 +13,7 @@
|
||||
import copy
|
||||
import re
|
||||
from rag.app import laws
|
||||
from deepdoc.parser import is_english, tokenize, naive_merge
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import huqie, is_english, tokenize, naive_merge
|
||||
from deepdoc.parser import PdfParser
|
||||
from rag.settings import cron_logger
|
||||
|
||||
@ -38,7 +37,7 @@ class Pdf(PdfParser):
|
||||
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Supported file formats are docx, pdf, txt.
|
||||
This method apply the naive ways to chunk files.
|
||||
@ -80,7 +79,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
|
||||
|
||||
parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
|
||||
cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"])
|
||||
eng = is_english(cks)
|
||||
eng = lang.lower() == "english"#is_english(cks)
|
||||
res = []
|
||||
# wrap up to es documents
|
||||
for ck in cks:
|
||||
|
@ -15,8 +15,7 @@ import re
|
||||
from collections import Counter
|
||||
|
||||
from api.db import ParserType
|
||||
from deepdoc.parser import tokenize
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import huqie, tokenize
|
||||
from deepdoc.parser import PdfParser
|
||||
import numpy as np
|
||||
from rag.utils import num_tokens_from_string
|
||||
@ -140,7 +139,7 @@ class Pdf(PdfParser):
|
||||
}
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Only pdf is supported.
|
||||
The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
|
||||
@ -156,7 +155,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
|
||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
||||
doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
|
||||
# is it English
|
||||
eng = pdf_parser.is_english
|
||||
eng = lang.lower() == "english"#pdf_parser.is_english
|
||||
print("It's English.....", eng)
|
||||
|
||||
res = []
|
||||
|
56
rag/app/picture.py
Normal file
56
rag/app/picture.py
Normal file
@ -0,0 +1,56 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import io
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from api.db import LLMType
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from rag.nlp import tokenize
|
||||
from deepdoc.vision import OCR
|
||||
|
||||
ocr = OCR()
|
||||
|
||||
|
||||
def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
    """Turn one picture into a single chunk via OCR and, if needed, an image-to-text model.

    The image is OCR-ed first. If the recognized text is longer than 32
    characters (or, for English, more than 32 space-separated words) it is
    tokenized and returned directly. Otherwise the tenant's IMAGE2TEXT model
    is asked to describe the picture and its answer is appended to the OCR
    text before tokenizing.

    Args:
        filename: Document name, stored under "docnm_kwd".
        binary: Raw bytes of the image file.
        tenant_id: Tenant whose IMAGE2TEXT model is loaded.
        lang: Language name; "english" (case-insensitive) selects English
            tokenization, anything else the non-English path.
        callback: Optional progress reporter, called positionally as
            ``callback(prog, msg)`` and by keyword as
            ``callback(prog=..., msg=...)``. A negative ``prog`` reports an error.
        **kwargs: Accepted for signature compatibility with the other
            chunkers; not used here.

    Returns:
        A list holding one document dict, or an empty list when the model
        could not be loaded or the description step failed.
    """
    if callback is None:
        # The signature allows omitting the reporter, but every path below
        # calls it; fall back to a no-op instead of raising TypeError.
        def callback(prog=None, msg=""):
            """Discard progress updates."""

    try:
        cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
    except Exception as e:
        callback(prog=-1, msg=str(e))
        return []

    img = Image.open(io.BytesIO(binary))
    doc = {
        "docnm_kwd": filename,
        "image": img
    }
    bxs = ocr(np.array(img))
    # Each OCR item is unpacked as (box, t) and t[0] is used as the text.
    txt = "\n".join([t[0] for _, t in bxs if t[0]])
    eng = lang.lower() == "english"
    callback(0.4, "Finish OCR: (%s ...)" % txt[:12])

    # Enough text was recognized: index it as is and skip the vision model.
    if (eng and len(txt.split(" ")) > 32) or len(txt) > 32:
        tokenize(doc, txt, eng)
        callback(0.8, "OCR results is too long to use CV LLM.")
        return [doc]

    try:
        callback(0.4, "Use CV LLM to describe the picture.")
        ans = cv_mdl.describe(binary)
        callback(0.8, "CV LLM respoond: %s ..." % ans[:32])
        txt += "\n" + ans
        tokenize(doc, txt, eng)
        return [doc]
    except Exception as e:
        callback(prog=-1, msg=str(e))

    return []
|
@ -13,46 +13,14 @@
|
||||
import copy
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pptx import Presentation
|
||||
from deepdoc.parser import tokenize, is_english
|
||||
from rag.nlp import tokenize, is_english
|
||||
from rag.nlp import huqie
|
||||
from deepdoc.parser import PdfParser
|
||||
from deepdoc.parser import PdfParser, PptParser
|
||||
|
||||
|
||||
class Ppt(object):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def __extract(self, shape):
|
||||
if shape.shape_type == 19:
|
||||
tb = shape.table
|
||||
rows = []
|
||||
for i in range(1, len(tb.rows)):
|
||||
rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
|
||||
return "\n".join(rows)
|
||||
|
||||
if shape.has_text_frame:
|
||||
return shape.text_frame.text
|
||||
|
||||
if shape.shape_type == 6:
|
||||
texts = []
|
||||
for p in shape.shapes:
|
||||
t = self.__extract(p)
|
||||
if t: texts.append(t)
|
||||
return "\n".join(texts)
|
||||
|
||||
class Ppt(PptParser):
|
||||
def __call__(self, fnm, from_page, to_page, callback=None):
|
||||
ppt = Presentation(fnm) if isinstance(
|
||||
fnm, str) else Presentation(
|
||||
BytesIO(fnm))
|
||||
txts = []
|
||||
self.total_page = len(ppt.slides)
|
||||
for i, slide in enumerate(ppt.slides[from_page: to_page]):
|
||||
texts = []
|
||||
for shape in slide.shapes:
|
||||
txt = self.__extract(shape)
|
||||
if txt: texts.append(txt)
|
||||
txts.append("\n".join(texts))
|
||||
txts = super.__call__(fnm, from_page, to_page)
|
||||
|
||||
callback(0.5, "Text extraction finished.")
|
||||
import aspose.slides as slides
|
||||
|
@ -14,7 +14,7 @@ import re
|
||||
from io import BytesIO
|
||||
from nltk import word_tokenize
|
||||
from openpyxl import load_workbook
|
||||
from deepdoc.parser import is_english, random_choices
|
||||
from rag.nlp import is_english, random_choices
|
||||
from rag.nlp import huqie, stemmer
|
||||
from deepdoc.parser import ExcelParser
|
||||
|
||||
@ -81,7 +81,7 @@ def beAdoc(d, q, a, eng):
|
||||
return d
|
||||
|
||||
|
||||
def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Excel and csv(txt) format files are supported.
|
||||
If the file is in excel format, there should be 2 column question and answer without header.
|
||||
@ -113,7 +113,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
break
|
||||
txt += l
|
||||
lines = txt.split("\n")
|
||||
eng = is_english([rmPrefix(l) for l in lines[:100]])
|
||||
eng = lang.lower() == "english"#is_english([rmPrefix(l) for l in lines[:100]])
|
||||
fails = []
|
||||
for i, line in enumerate(lines):
|
||||
arr = [l for l in line.split("\t") if len(l) > 1]
|
||||
|
@ -20,8 +20,7 @@ from openpyxl import load_workbook
|
||||
from dateutil.parser import parse as datetime_parse
|
||||
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from deepdoc.parser import is_english, tokenize
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import huqie, is_english, tokenize
|
||||
from deepdoc.parser import ExcelParser
|
||||
|
||||
|
||||
@ -112,7 +111,7 @@ def column_data_type(arr):
|
||||
return arr, ty
|
||||
|
||||
|
||||
def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Excel and csv(txt) format files are supported.
|
||||
For csv or txt file, the delimiter between columns is TAB.
|
||||
@ -192,7 +191,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j])
|
||||
for i in range(len(clmns))]
|
||||
|
||||
eng = is_english(txts)
|
||||
eng = lang.lower() == "english"#is_english(txts)
|
||||
for ii, row in df.iterrows():
|
||||
d = {}
|
||||
row_txt = []
|
||||
|
@ -13,12 +13,18 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import io
|
||||
from abc import ABC
|
||||
|
||||
from PIL import Image
|
||||
from openai import OpenAI
|
||||
import os
|
||||
import base64
|
||||
from io import BytesIO
|
||||
|
||||
from api.utils import get_uuid
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
|
||||
|
||||
class Base(ABC):
|
||||
def __init__(self, key, model_name):
|
||||
@ -44,25 +50,26 @@ class Base(ABC):
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等。",
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{b64}"
|
||||
},
|
||||
},
|
||||
{
|
||||
"text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else \
|
||||
"Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
class GptV4(Base):
|
||||
def __init__(self, key, model_name="gpt-4-vision-preview"):
|
||||
def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese"):
|
||||
self.client = OpenAI(api_key=key)
|
||||
self.model_name = model_name
|
||||
self.lang = lang
|
||||
|
||||
def describe(self, image, max_tokens=300):
|
||||
b64 = self.image2base64(image)
|
||||
@ -76,18 +83,40 @@ class GptV4(Base):
|
||||
|
||||
|
||||
class QWenCV(Base):
|
||||
def __init__(self, key, model_name="qwen-vl-chat-v1"):
|
||||
def __init__(self, key, model_name="qwen-vl-chat-v1", lang="Chinese"):
|
||||
import dashscope
|
||||
dashscope.api_key = key
|
||||
self.model_name = model_name
|
||||
self.lang = lang
|
||||
|
||||
def prompt(self, binary):
    """Build the multimodal message list for an image description request.

    The image bytes are written to a uniquely named ``.jpg`` file under the
    project's "tmp" directory and referenced through a ``file://`` URL,
    followed by a text instruction in Chinese or English depending on
    ``self.lang``.

    Args:
        binary: Raw bytes of the image.

    Returns:
        A one-element list containing the user message dict.
    """
    tmp_dir = get_project_base_directory("tmp")
    # exist_ok avoids the check-then-create race when several workers start at once.
    os.makedirs(tmp_dir, exist_ok=True)
    path = os.path.join(tmp_dir, "%s.jpg" % get_uuid())

    img = Image.open(io.BytesIO(binary))
    # JPEG cannot store alpha or palette images; without this conversion
    # saving e.g. an RGBA PNG to a .jpg path raises OSError.
    if img.mode not in ("RGB", "L", "CMYK"):
        img = img.convert("RGB")
    img.save(path)
    # NOTE(review): the temporary file is never removed here; it has to
    # outlive this method because the returned message refers to it by path.

    return [
        {
            "role": "user",
            "content": [
                {
                    "image": f"file://{path}"
                },
                {
                    "text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else \
                    "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
                },
            ],
        }
    ]
|
||||
|
||||
def describe(self, image, max_tokens=300):
|
||||
from http import HTTPStatus
|
||||
from dashscope import MultiModalConversation
|
||||
response = MultiModalConversation.call(model=self.model_name,
|
||||
messages=self.prompt(self.image2base64(image)))
|
||||
messages=self.prompt(image))
|
||||
if response.status_code == HTTPStatus.OK:
|
||||
return response.output.choices[0]['message']['content'], response.usage.output_tokens
|
||||
return response.output.choices[0]['message']['content'][0]["text"], response.usage.output_tokens
|
||||
return response.message, 0
|
||||
|
||||
|
||||
@ -95,9 +124,10 @@ from zhipuai import ZhipuAI
|
||||
|
||||
|
||||
class Zhipu4V(Base):
|
||||
def __init__(self, key, model_name="glm-4v"):
|
||||
def __init__(self, key, model_name="glm-4v", lang="Chinese"):
|
||||
self.client = ZhipuAI(api_key=key)
|
||||
self.model_name = model_name
|
||||
self.lang = lang
|
||||
|
||||
def describe(self, image, max_tokens=1024):
|
||||
b64 = self.image2base64(image)
|
||||
|
@ -5,3 +5,219 @@ retrievaler = search.Dealer(ELASTICSEARCH)
|
||||
|
||||
from nltk.stem import PorterStemmer
|
||||
stemmer = PorterStemmer()
|
||||
|
||||
import re
|
||||
from nltk import word_tokenize
|
||||
from . import huqie
|
||||
from rag.utils import num_tokens_from_string
|
||||
import random
|
||||
|
||||
# Families of heading/bullet regexes, one inner list per numbering convention.
# bullets_category() returns the index of the family that matches the most
# sections; hierarchical_merge() uses a pattern's position inside its family
# as the heading level of the sections it matches.
BULLET_PATTERN = [[
    # Family 0: Chinese ordinal headings ("第N编/部分", "第N章", "第N节", "第N条"),
    # then parenthesized Chinese numerals.
    r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
    r"第[零一二三四五六七八九十百0-9]+章",
    r"第[零一二三四五六七八九十百0-9]+节",
    r"第[零一二三四五六七八九十百0-9]+条",
    r"[\((][零一二三四五六七八九十百]+[\))]",
], [
    # Family 1: "第N章"/"第N节" with Arabic digits, then dotted decimal numbering
    # of increasing depth.
    r"第[0-9]+章",
    r"第[0-9]+节",
    r"[0-9]{,3}[\. 、]",
    r"[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
    # Family 2: "第N章"/"第N节", Chinese numerals followed by a space or "、",
    # then parenthesized Chinese numerals and parenthesized digits.
    r"第[零一二三四五六七八九十百0-9]+章",
    r"第[零一二三四五六七八九十百0-9]+节",
    r"[零一二三四五六七八九十百]+[ 、]",
    r"[\((][零一二三四五六七八九十百]+[\))]",
    r"[\((][0-9]{,2}[\))]",
], [
    # Family 3: English "PART"/"Chapter"/"Section"/"Article" headings.
    r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
    r"Chapter (I+V?|VI*|XI|IX|X)",
    r"Section [0-9]+",
    r"Article [0-9]+"
]
]
|
||||
|
||||
def random_choices(arr, k):
    """Draw up to ``k`` random elements from ``arr``.

    The sample size is capped at ``len(arr)``. Elements are drawn with
    ``random.choices``, i.e. with replacement, so the result may contain
    repeats.

    Args:
        arr: Sequence to sample from.
        k: Requested number of elements.

    Returns:
        A list of sampled elements (empty when ``arr`` is empty).
    """
    sample_size = k if k < len(arr) else len(arr)
    return random.choices(arr, k=sample_size)
|
||||
|
||||
def bullets_category(sections):
    """Pick the BULLET_PATTERN family that best fits ``sections``.

    For every family, counts the sections whose beginning matches at least
    one of the family's patterns.

    Args:
        sections: Iterable of section strings.

    Returns:
        Index of the family with the highest count (the first one on a tie),
        or -1 when no family matches any section.
    """
    hits = [
        sum(1 for sec in sections if any(re.match(p, sec) for p in family))
        for family in BULLET_PATTERN
    ]

    best_index, best_hits = -1, 0
    for index, count in enumerate(hits):
        # Strictly greater: an earlier family keeps the win on equal counts.
        if count > best_hits:
            best_index, best_hits = index, count
    return best_index
|
||||
|
||||
|
||||
def is_english(texts):
    """Guess whether a collection of text snippets is English.

    A snippet counts as English when, after stripping surrounding
    whitespace, it starts with at least two ASCII letters.

    Args:
        texts: Sequence of strings.

    Returns:
        True when more than 80% of the snippets count as English, otherwise
        False. An empty input yields False.
    """
    # Guard the ratio below: an empty input used to raise ZeroDivisionError.
    if not texts:
        return False
    eng = sum(1 for t in texts if re.match(r"[a-zA-Z]{2,}", t.strip()))
    return eng / len(texts) > 0.8
|
||||
|
||||
|
||||
def tokenize(d, t, eng):
    """Store a text and its token fields in the document dict ``d``.

    Args:
        d: Document dict, updated in place.
        t: Text content; saved unchanged under "content_with_weight".
        eng: When true, hyphens between lowercase letters are removed and
            the text is word-tokenized and stemmed; otherwise the text goes
            through ``huqie``.
    """
    # NOTE(review): indentation is not visible in the reviewed source; the
    # "content_sm_ltks" assignment is assumed to belong to the non-English
    # branch only - confirm against the repository.
    d["content_with_weight"] = t
    if not eng:
        d["content_ltks"] = huqie.qie(t)
        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
        return

    dehyphenated = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
    stems = [stemmer.stem(w) for w in word_tokenize(dehyphenated)]
    d["content_ltks"] = " ".join(stems)
|
||||
|
||||
|
||||
def remove_contents_table(sections, eng=False):
|
||||
i = 0
|
||||
while i < len(sections):
|
||||
def get(i):
|
||||
nonlocal sections
|
||||
return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
|
||||
|
||||
if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
|
||||
re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
|
||||
i += 1
|
||||
continue
|
||||
sections.pop(i)
|
||||
if i >= len(sections): break
|
||||
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
|
||||
while not prefix:
|
||||
sections.pop(i)
|
||||
if i >= len(sections): break
|
||||
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
|
||||
sections.pop(i)
|
||||
if i >= len(sections) or not prefix: break
|
||||
for j in range(i, min(i + 128, len(sections))):
|
||||
if not re.match(prefix, get(j)):
|
||||
continue
|
||||
for _ in range(i, j): sections.pop(i)
|
||||
break
|
||||
|
||||
|
||||
def make_colon_as_title(sections):
    """Scan ``(text, layout)`` sections for lines ending in a colon and insert title entries.

    Args:
        sections: List of strings or of ``(text, layout)`` pairs. A list of
            pairs may be modified in place.

    Returns:
        ``[]`` for empty input, ``sections`` itself when it holds plain
        strings, otherwise None.
    """
    if not sections:
        return []
    if type(sections[0]) is str:
        return sections

    idx = 0
    while idx < len(sections):
        raw, _layout = sections[idx]
        idx += 1
        # Only the text before the first "@" is inspected.
        text = raw.split("@")[0].strip()
        if not text or text[-1] not in "::":
            continue
        # Split the reversed text so pieces[0] is the trailing clause.
        backwards = text[::-1]
        pieces = re.split(r"([。?!!?;;]| .)", backwards)
        # NOTE(review): pieces[1] is the captured separator (one or two
        # characters), so this length test looks like it always skips the
        # insert below - confirm the intended index.
        if len(pieces) < 2 or len(pieces[1]) < 32:
            continue
        sections.insert(idx - 1, (pieces[0][::-1], "title"))
        idx += 1
|
||||
|
||||
|
||||
def hierarchical_merge(bull, sections, depth):
    """Group sections into chunks that carry their enclosing headings.

    Every section is assigned to one level: the position of the first
    pattern of ``BULLET_PATTERN[bull]`` it matches, or one of two extra
    levels for non-matching sections (layout contains "title"/"head", and
    everything else). Chunks are then seeded from the lowest ``depth``
    levels and extended with the nearest preceding section of each higher
    level.

    Args:
        bull: Index of the bullet family in BULLET_PATTERN; a negative value
            returns ``[]``.
        sections: List of strings or of ``(text, layout)`` pairs.
        depth: How many levels, counted from the lowest one, seed chunks.

    Returns:
        A list of chunks, each a list of section texts ordered from the
        highest level collected down to the seeding section.
    """
    if not sections or bull < 0:
        return []
    if isinstance(sections[0], str):
        sections = [(s, "") for s in sections]
    # Keep sections whose text before "@" has more than one character and
    # is not purely numeric.
    sections = [(t, o) for t, o in sections
                if t and len(t.split("@")[0].strip()) > 1
                and not re.match(r"[0-9]+$", t.split("@")[0].strip())]
    bullets_size = len(BULLET_PATTERN[bull])
    # One bucket per pattern plus two for sections matching no pattern.
    levels = [[] for _ in range(bullets_size + 2)]

    def not_title(txt):
        # "第N条" lines always qualify; long or punctuated lines do not.
        if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
            return False
        if len(txt) >= 128:
            return True
        return re.search(r"[,;,。;!!]", txt)

    for i, (txt, layout) in enumerate(sections):
        for j, p in enumerate(BULLET_PATTERN[bull]):
            if re.match(p, txt.strip()) and not not_title(txt):
                levels[j].append(i)
                break
        else:
            # No bullet pattern applied: fall back to the layout label.
            if re.search(r"(title|head)", layout):
                levels[bullets_size].append(i)
            else:
                levels[bullets_size + 1].append(i)
    sections = [t for t, _ in sections]

    def binary_search(arr, target):
        # Position of the last element of sorted `arr` below `target`, or -1.
        # `target` is never expected to be in `arr`: each section index is
        # stored in exactly one level.
        if not arr:
            return -1
        if target > arr[-1]:
            return len(arr) - 1
        if target < arr[0]:
            return -1
        s, e = 0, len(arr)
        while e - s > 1:
            i = (e + s) // 2
            if target > arr[i]:
                s = i
                continue
            elif target < arr[i]:
                e = i
                continue
            else:
                assert False
        return s

    cks = []
    readed = [False] * len(sections)
    # Reverse so index 0 is the lowest level and the last is the top heading.
    levels = levels[::-1]
    for i, arr in enumerate(levels[:depth]):
        for j in arr:
            if readed[j]:
                continue
            readed[j] = True
            cks.append([j])
            if i + 1 == len(levels) - 1:
                continue
            for ii in range(i + 1, len(levels)):
                jj = binary_search(levels[ii], j)
                if jj < 0:
                    continue
                # NOTE(review): jj is a position inside levels[ii] while
                # cks[-1][-1] is a section index; kept as in the original -
                # confirm whether levels[ii][jj] was intended.
                if jj > cks[-1][-1]:
                    cks[-1].pop(-1)
                cks[-1].append(levels[ii][jj])
            for ii in cks[-1]:
                readed[ii] = True

    # Indices were collected bottom-up; emit the texts top-down.
    return [[sections[j] for j in ck[::-1]] for ck in cks]
|
||||
|
||||
|
||||
def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
    """Concatenate sections into chunks bounded by a token budget.

    Each section is cut after every delimiter character (the first
    character of a section is never tested). Pieces are appended to the
    current chunk; a new chunk is started only once the current chunk's
    token count already exceeds ``chunk_token_num``.

    Args:
        sections: List of strings or of ``(text, position_tag)`` pairs.
        chunk_token_num: Token count above which the current chunk is closed.
        delimiter: Characters that end a piece.

    Returns:
        List of chunk strings; ``[]`` for empty input.
    """
    if not sections:
        return []
    if type(sections[0]) is str:
        sections = [(text, "") for text in sections]

    chunks = [""]
    token_counts = [0]

    def add_chunk(piece, tag):
        n_tokens = num_tokens_from_string(piece)
        # Pieces under 8 tokens are added without their position tag.
        suffix = "" if n_tokens < 8 else tag
        if token_counts[-1] > chunk_token_num:
            chunks.append(piece + suffix)
            token_counts.append(n_tokens)
            return
        chunks[-1] += piece + suffix
        token_counts[-1] += n_tokens

    for sec, pos in sections:
        start, cursor = 0, 1
        while cursor < len(sec):
            if sec[cursor] not in delimiter:
                cursor += 1
                continue
            add_chunk(sec[start: cursor + 1], pos)
            start = cursor + 1
            cursor = start + 1
        # Flush whatever follows the last delimiter (possibly empty).
        if start < cursor:
            add_chunk(sec[start: cursor], pos)

    return chunks
|
||||
|
||||
|
@ -21,6 +21,7 @@ import hashlib
|
||||
import copy
|
||||
import re
|
||||
import sys
|
||||
import traceback
|
||||
from functools import partial
|
||||
from timeit import default_timer as timer
|
||||
|
||||
@ -36,7 +37,7 @@ from rag.nlp import search
|
||||
from io import BytesIO
|
||||
import pandas as pd
|
||||
|
||||
from rag.app import laws, paper, presentation, manual, qa, table, book, resume
|
||||
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture
|
||||
|
||||
from api.db import LLMType, ParserType
|
||||
from api.db.services.document_service import DocumentService
|
||||
@ -56,47 +57,31 @@ FACTORY = {
|
||||
ParserType.QA.value: qa,
|
||||
ParserType.TABLE.value: table,
|
||||
ParserType.RESUME.value: resume,
|
||||
ParserType.PICTURE.value: picture,
|
||||
}
|
||||
|
||||
|
||||
def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
|
||||
def set_progress(task_id, from_page=0, to_page=-1,
|
||||
prog=None, msg="Processing..."):
|
||||
if prog is not None and prog < 0:
|
||||
msg = "[ERROR]"+msg
|
||||
cancel = TaskService.do_cancel(task_id)
|
||||
if cancel:
|
||||
msg += " [Canceled]"
|
||||
prog = -1
|
||||
|
||||
if to_page > 0: msg = f"Page({from_page}~{to_page}): " + msg
|
||||
if to_page > 0:
|
||||
msg = f"Page({from_page}~{to_page}): " + msg
|
||||
d = {"progress_msg": msg}
|
||||
if prog is not None: d["progress"] = prog
|
||||
if prog is not None:
|
||||
d["progress"] = prog
|
||||
try:
|
||||
TaskService.update_progress(task_id, d)
|
||||
except Exception as e:
|
||||
cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))
|
||||
|
||||
if cancel:sys.exit()
|
||||
|
||||
|
||||
"""
|
||||
def chuck_doc(name, binary, tenant_id, cvmdl=None):
|
||||
suff = os.path.split(name)[-1].lower().split(".")[-1]
|
||||
if suff.find("pdf") >= 0:
|
||||
return PDF(binary)
|
||||
if suff.find("doc") >= 0:
|
||||
return DOC(binary)
|
||||
if re.match(r"(xlsx|xlsm|xltx|xltm)", suff):
|
||||
return EXC(binary)
|
||||
if suff.find("ppt") >= 0:
|
||||
return PPT(binary)
|
||||
if cvmdl and re.search(r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)$",
|
||||
name.lower()):
|
||||
txt = cvmdl.describe(binary)
|
||||
field = TextChunker.Fields()
|
||||
field.text_chunks = [(txt, binary)]
|
||||
field.table_chunks = []
|
||||
return field
|
||||
|
||||
return TextChunker()(binary)
|
||||
"""
|
||||
if cancel:
|
||||
sys.exit()
|
||||
|
||||
|
||||
def collect(comm, mod, tm):
|
||||
@ -109,29 +94,38 @@ def collect(comm, mod, tm):
|
||||
return tasks
|
||||
|
||||
|
||||
def build(row, cvmdl):
|
||||
def build(row):
|
||||
if row["size"] > DOC_MAXIMUM_SIZE:
|
||||
set_progress(row["id"], prog=-1, msg="File size exceeds( <= %dMb )" %
|
||||
(int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
|
||||
return []
|
||||
|
||||
callback = partial(set_progress, row["id"], row["from_page"], row["to_page"])
|
||||
callback = partial(
|
||||
set_progress,
|
||||
row["id"],
|
||||
row["from_page"],
|
||||
row["to_page"])
|
||||
chunker = FACTORY[row["parser_id"].lower()]
|
||||
try:
|
||||
cron_logger.info("Chunkking {}/{}".format(row["location"], row["name"]))
|
||||
cks = chunker.chunk(row["name"], binary = MINIO.get(row["kb_id"], row["location"]), from_page=row["from_page"], to_page=row["to_page"],
|
||||
callback = callback, kb_id=row["kb_id"], parser_config=row["parser_config"])
|
||||
cron_logger.info(
|
||||
"Chunkking {}/{}".format(row["location"], row["name"]))
|
||||
cks = chunker.chunk(row["name"], binary=MINIO.get(row["kb_id"], row["location"]), from_page=row["from_page"],
|
||||
to_page=row["to_page"], lang=row["language"], callback=callback,
|
||||
kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
|
||||
except Exception as e:
|
||||
if re.search("(No such file|not found)", str(e)):
|
||||
callback(-1, "Can not find file <%s>" % row["doc_name"])
|
||||
else:
|
||||
callback(-1, f"Internal server error: %s" % str(e).replace("'", ""))
|
||||
callback(-1, f"Internal server error: %s" %
|
||||
str(e).replace("'", ""))
|
||||
traceback.print_exc()
|
||||
|
||||
cron_logger.warn("Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
|
||||
cron_logger.warn(
|
||||
"Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
|
||||
|
||||
return
|
||||
|
||||
callback(msg="Finished slicing files. Start to embedding the content.")
|
||||
callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
|
||||
|
||||
docs = []
|
||||
doc = {
|
||||
@ -142,7 +136,8 @@ def build(row, cvmdl):
|
||||
d = copy.deepcopy(doc)
|
||||
d.update(ck)
|
||||
md5 = hashlib.md5()
|
||||
md5.update((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8"))
|
||||
md5.update((ck["content_with_weight"] +
|
||||
str(d["doc_id"])).encode("utf-8"))
|
||||
d["_id"] = md5.hexdigest()
|
||||
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
|
||||
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
|
||||
@ -173,7 +168,8 @@ def init_kb(row):
|
||||
|
||||
|
||||
def embedding(docs, mdl, parser_config={}):
|
||||
tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [d["content_with_weight"] for d in docs]
|
||||
tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
|
||||
d["content_with_weight"] for d in docs]
|
||||
tk_count = 0
|
||||
if len(tts) == len(cnts):
|
||||
tts, c = mdl.encode(tts)
|
||||
@ -182,7 +178,8 @@ def embedding(docs, mdl, parser_config={}):
|
||||
cnts, c = mdl.encode(cnts)
|
||||
tk_count += c
|
||||
title_w = float(parser_config.get("filename_embd_weight", 0.1))
|
||||
vects = (title_w * tts + (1-title_w) * cnts) if len(tts) == len(cnts) else cnts
|
||||
vects = (title_w * tts + (1 - title_w) *
|
||||
cnts) if len(tts) == len(cnts) else cnts
|
||||
|
||||
assert len(vects) == len(docs)
|
||||
for i, d in enumerate(docs):
|
||||
@ -192,7 +189,10 @@ def embedding(docs, mdl, parser_config={}):
|
||||
|
||||
|
||||
def main(comm, mod):
|
||||
tm_fnm = os.path.join(get_project_base_directory(), "rag/res", f"{comm}-{mod}.tm")
|
||||
tm_fnm = os.path.join(
|
||||
get_project_base_directory(),
|
||||
"rag/res",
|
||||
f"{comm}-{mod}.tm")
|
||||
tm = findMaxTm(tm_fnm)
|
||||
rows = collect(comm, mod, tm)
|
||||
if len(rows) == 0:
|
||||
@ -203,15 +203,13 @@ def main(comm, mod):
|
||||
callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
|
||||
try:
|
||||
embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING)
|
||||
cv_mdl = LLMBundle(r["tenant_id"], LLMType.IMAGE2TEXT)
|
||||
# TODO: sequence2text model
|
||||
except Exception as e:
|
||||
callback(prog=-1, msg=str(e))
|
||||
continue
|
||||
|
||||
st_tm = timer()
|
||||
cks = build(r, cv_mdl)
|
||||
if cks is None:continue
|
||||
cks = build(r)
|
||||
if cks is None:
|
||||
continue
|
||||
if not cks:
|
||||
tmf.write(str(r["update_time"]) + "\n")
|
||||
callback(1., "No chunk! Done!")
|
||||
@ -233,11 +231,15 @@ def main(comm, mod):
|
||||
cron_logger.error(str(es_r))
|
||||
else:
|
||||
if TaskService.do_cancel(r["id"]):
|
||||
ELASTICSEARCH.deleteByQuery(Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
|
||||
continue
|
||||
callback(1., "Done!")
|
||||
DocumentService.increment_chunk_num(r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
|
||||
cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks)))
|
||||
DocumentService.increment_chunk_num(
|
||||
r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
|
||||
cron_logger.info(
|
||||
"Chunk doc({}), token({}), chunks({})".format(
|
||||
r["id"], tk_count, len(cks)))
|
||||
|
||||
tmf.write(str(r["update_time"]) + "\n")
|
||||
tmf.close()
|
||||
|
Loading…
x
Reference in New Issue
Block a user