mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-07-27 18:01:59 +08:00
init README of deepdoc, add picture processer. (#71)
* init README of deepdoc, add picture processer. * add resume parsing
This commit is contained in:
parent
d32322c081
commit
7fd1eca582
2
.gitignore
vendored
2
.gitignore
vendored
@ -6,7 +6,7 @@ __pycache__/
|
|||||||
hudet/
|
hudet/
|
||||||
cv/
|
cv/
|
||||||
layout_app.py
|
layout_app.py
|
||||||
resume/
|
api/flask_session
|
||||||
|
|
||||||
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
|
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
|
||||||
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
|
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
|
||||||
|
@ -163,6 +163,7 @@ def completion():
|
|||||||
del req["conversation_id"]
|
del req["conversation_id"]
|
||||||
del req["messages"]
|
del req["messages"]
|
||||||
ans = chat(dia, msg, **req)
|
ans = chat(dia, msg, **req)
|
||||||
|
if not conv.reference: conv.reference = []
|
||||||
conv.reference.append(ans["reference"])
|
conv.reference.append(ans["reference"])
|
||||||
conv.message.append({"role": "assistant", "content": ans["answer"]})
|
conv.message.append({"role": "assistant", "content": ans["answer"]})
|
||||||
ConversationService.update_by_id(conv.id, conv.to_dict())
|
ConversationService.update_by_id(conv.id, conv.to_dict())
|
||||||
|
@ -32,7 +32,6 @@ def set_dialog():
|
|||||||
dialog_id = req.get("dialog_id")
|
dialog_id = req.get("dialog_id")
|
||||||
name = req.get("name", "New Dialog")
|
name = req.get("name", "New Dialog")
|
||||||
description = req.get("description", "A helpful Dialog")
|
description = req.get("description", "A helpful Dialog")
|
||||||
language = req.get("language", "Chinese")
|
|
||||||
top_n = req.get("top_n", 6)
|
top_n = req.get("top_n", 6)
|
||||||
similarity_threshold = req.get("similarity_threshold", 0.1)
|
similarity_threshold = req.get("similarity_threshold", 0.1)
|
||||||
vector_similarity_weight = req.get("vector_similarity_weight", 0.3)
|
vector_similarity_weight = req.get("vector_similarity_weight", 0.3)
|
||||||
@ -80,7 +79,6 @@ def set_dialog():
|
|||||||
"name": name,
|
"name": name,
|
||||||
"kb_ids": req["kb_ids"],
|
"kb_ids": req["kb_ids"],
|
||||||
"description": description,
|
"description": description,
|
||||||
"language": language,
|
|
||||||
"llm_id": llm_id,
|
"llm_id": llm_id,
|
||||||
"llm_setting": llm_setting,
|
"llm_setting": llm_setting,
|
||||||
"prompt_config": prompt_config,
|
"prompt_config": prompt_config,
|
||||||
|
@ -272,7 +272,9 @@ def get(doc_id):
|
|||||||
response = flask.make_response(MINIO.get(doc.kb_id, doc.location))
|
response = flask.make_response(MINIO.get(doc.kb_id, doc.location))
|
||||||
ext = re.search(r"\.([^.]+)$", doc.name)
|
ext = re.search(r"\.([^.]+)$", doc.name)
|
||||||
if ext:
|
if ext:
|
||||||
response.headers.set('Content-Type', 'application/%s'%ext.group(1))
|
if doc.type == FileType.VISUAL.value:
|
||||||
|
response.headers.set('Content-Type', 'image/%s'%ext.group(1))
|
||||||
|
else: response.headers.set('Content-Type', 'application/%s'%ext.group(1))
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return server_error_response(e)
|
return server_error_response(e)
|
||||||
|
@ -464,6 +464,7 @@ class Knowledgebase(DataBaseModel):
|
|||||||
avatar = TextField(null=True, help_text="avatar base64 string")
|
avatar = TextField(null=True, help_text="avatar base64 string")
|
||||||
tenant_id = CharField(max_length=32, null=False)
|
tenant_id = CharField(max_length=32, null=False)
|
||||||
name = CharField(max_length=128, null=False, help_text="KB name", index=True)
|
name = CharField(max_length=128, null=False, help_text="KB name", index=True)
|
||||||
|
language = CharField(max_length=32, null=True, default="Chinese", help_text="English|Chinese")
|
||||||
description = TextField(null=True, help_text="KB description")
|
description = TextField(null=True, help_text="KB description")
|
||||||
embd_id = CharField(max_length=128, null=False, help_text="default embedding model ID")
|
embd_id = CharField(max_length=128, null=False, help_text="default embedding model ID")
|
||||||
permission = CharField(max_length=16, null=False, help_text="me|team", default="me")
|
permission = CharField(max_length=16, null=False, help_text="me|team", default="me")
|
||||||
|
@ -57,7 +57,7 @@ class TenantLLMService(CommonService):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@DB.connection_context()
|
@DB.connection_context()
|
||||||
def model_instance(cls, tenant_id, llm_type, llm_name=None):
|
def model_instance(cls, tenant_id, llm_type, llm_name=None, lang="Chinese"):
|
||||||
e, tenant = TenantService.get_by_id(tenant_id)
|
e, tenant = TenantService.get_by_id(tenant_id)
|
||||||
if not e:
|
if not e:
|
||||||
raise LookupError("Tenant not found")
|
raise LookupError("Tenant not found")
|
||||||
@ -87,7 +87,7 @@ class TenantLLMService(CommonService):
|
|||||||
if model_config["llm_factory"] not in CvModel:
|
if model_config["llm_factory"] not in CvModel:
|
||||||
return
|
return
|
||||||
return CvModel[model_config["llm_factory"]](
|
return CvModel[model_config["llm_factory"]](
|
||||||
model_config["api_key"], model_config["llm_name"])
|
model_config["api_key"], model_config["llm_name"], lang)
|
||||||
|
|
||||||
if llm_type == LLMType.CHAT.value:
|
if llm_type == LLMType.CHAT.value:
|
||||||
if model_config["llm_factory"] not in ChatModel:
|
if model_config["llm_factory"] not in ChatModel:
|
||||||
@ -120,11 +120,11 @@ class TenantLLMService(CommonService):
|
|||||||
|
|
||||||
|
|
||||||
class LLMBundle(object):
|
class LLMBundle(object):
|
||||||
def __init__(self, tenant_id, llm_type, llm_name=None):
|
def __init__(self, tenant_id, llm_type, llm_name=None, lang="Chinese"):
|
||||||
self.tenant_id = tenant_id
|
self.tenant_id = tenant_id
|
||||||
self.llm_type = llm_type
|
self.llm_type = llm_type
|
||||||
self.llm_name = llm_name
|
self.llm_name = llm_name
|
||||||
self.mdl = TenantLLMService.model_instance(tenant_id, llm_type, llm_name)
|
self.mdl = TenantLLMService.model_instance(tenant_id, llm_type, llm_name, lang=lang)
|
||||||
assert self.mdl, "Can't find mole for {}/{}/{}".format(tenant_id, llm_type, llm_name)
|
assert self.mdl, "Can't find mole for {}/{}/{}".format(tenant_id, llm_type, llm_name)
|
||||||
|
|
||||||
def encode(self, texts: list, batch_size=32):
|
def encode(self, texts: list, batch_size=32):
|
||||||
|
@ -27,7 +27,24 @@ class TaskService(CommonService):
|
|||||||
@classmethod
|
@classmethod
|
||||||
@DB.connection_context()
|
@DB.connection_context()
|
||||||
def get_tasks(cls, tm, mod=0, comm=1, items_per_page=64):
|
def get_tasks(cls, tm, mod=0, comm=1, items_per_page=64):
|
||||||
fields = [cls.model.id, cls.model.doc_id, cls.model.from_page,cls.model.to_page, Document.kb_id, Document.parser_id, Document.parser_config, Document.name, Document.type, Document.location, Document.size, Knowledgebase.tenant_id, Tenant.embd_id, Tenant.img2txt_id, Tenant.asr_id, cls.model.update_time]
|
fields = [
|
||||||
|
cls.model.id,
|
||||||
|
cls.model.doc_id,
|
||||||
|
cls.model.from_page,
|
||||||
|
cls.model.to_page,
|
||||||
|
Document.kb_id,
|
||||||
|
Document.parser_id,
|
||||||
|
Document.parser_config,
|
||||||
|
Document.name,
|
||||||
|
Document.type,
|
||||||
|
Document.location,
|
||||||
|
Document.size,
|
||||||
|
Knowledgebase.tenant_id,
|
||||||
|
Knowledgebase.language,
|
||||||
|
Tenant.embd_id,
|
||||||
|
Tenant.img2txt_id,
|
||||||
|
Tenant.asr_id,
|
||||||
|
cls.model.update_time]
|
||||||
docs = cls.model.select(*fields) \
|
docs = cls.model.select(*fields) \
|
||||||
.join(Document, on=(cls.model.doc_id == Document.id)) \
|
.join(Document, on=(cls.model.doc_id == Document.id)) \
|
||||||
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
|
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
|
||||||
@ -42,7 +59,6 @@ class TaskService(CommonService):
|
|||||||
.paginate(1, items_per_page)
|
.paginate(1, items_per_page)
|
||||||
return list(docs.dicts())
|
return list(docs.dicts())
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@DB.connection_context()
|
@DB.connection_context()
|
||||||
def do_cancel(cls, id):
|
def do_cancel(cls, id):
|
||||||
@ -54,7 +70,6 @@ class TaskService(CommonService):
|
|||||||
pass
|
pass
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@DB.connection_context()
|
@DB.connection_context()
|
||||||
def update_progress(cls, id, info):
|
def update_progress(cls, id, info):
|
||||||
|
Binary file not shown.
@ -167,7 +167,11 @@ def thumbnail(filename, blob):
|
|||||||
return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")
|
return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")
|
||||||
|
|
||||||
if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
|
if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
|
||||||
return ("data:image/%s;base64,"%filename.split(".")[-1]) + base64.b64encode(Image.open(BytesIO(blob)).thumbnail((30, 30)).tobytes()).decode("utf-8")
|
image = Image.open(BytesIO(blob))
|
||||||
|
image.thumbnail((30, 30))
|
||||||
|
buffered = BytesIO()
|
||||||
|
image.save(buffered, format="png")
|
||||||
|
return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")
|
||||||
|
|
||||||
if re.match(r".*\.(ppt|pptx)$", filename):
|
if re.match(r".*\.(ppt|pptx)$", filename):
|
||||||
import aspose.slides as slides
|
import aspose.slides as slides
|
||||||
|
82
deepdoc/README.md
Normal file
82
deepdoc/README.md
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
English | [简体中文](./README_zh.md)
|
||||||
|
|
||||||
|
#*Deep*Doc
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
- [1. Introduction](#1)
|
||||||
|
- [2. Vision](#2)
|
||||||
|
- [3. Parser](#3)
|
||||||
|
|
||||||
|
<a name="1"></a>
|
||||||
|
## 1. Introduction
|
||||||
|
|
||||||
|
---
|
||||||
|
With a bunch of documents from various domains with various formats and along with diverse retrieval requirements,
|
||||||
|
an accurate analysis becomes a very challenge task. *Deep*Doc is born for that purpose.
|
||||||
|
There 2 parts in *Deep*Doc so far: vision and parser.
|
||||||
|
|
||||||
|
<a name="2"></a>
|
||||||
|
## 2. Vision
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
We use vision information to resolve problems as human being.
|
||||||
|
- OCR. Since a lot of documents presented as images or at least be able to transform to image,
|
||||||
|
OCR is a very essential and fundamental or even universal solution for text extraction.
|
||||||
|
|
||||||
|
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
||||||
|
<img src="https://lh6.googleusercontent.com/2xdiSjaGWkZ71YdORc71Ujf7jCHmO6G-6ONklzGiUYEh3QZpjPo6MQ9eqEFX20am_cdW4Ck0YRraXEetXWnM08kJd99yhik13Cy0_YKUAq2zVGR15LzkovRAmK9iT4o3hcJ8dTpspaJKUwt6R4gN7So" width="300"/>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
- Layout recognition. Documents from different domain may have various layouts,
|
||||||
|
like, newspaper, magazine, book and résumé are distinct in terms of layout.
|
||||||
|
Only when machine have an accurate layout analysis, it can decide if these text parts are successive or not,
|
||||||
|
or this part needs Table Structure Recognition(TSR) to process, or this part is a figure and described with this caption.
|
||||||
|
We have 10 basic layout components which covers most cases:
|
||||||
|
- Text
|
||||||
|
- Title
|
||||||
|
- Figure
|
||||||
|
- Figure caption
|
||||||
|
- Table
|
||||||
|
- Table caption
|
||||||
|
- Header
|
||||||
|
- Footer
|
||||||
|
- Reference
|
||||||
|
- Equation
|
||||||
|
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
||||||
|
<img src="https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/ppstructure/docs/layout/layout.png?raw=true" width="900"/>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
- Table Structure Recognition(TSR). Data table is a frequently used structure present data including numbers or text.
|
||||||
|
And the structure of a table might be very complex, like hierarchy headers, spanning cells and projected row headers.
|
||||||
|
Along with TSR, we also reassemble the content into sentences which could be well comprehended by LLM.
|
||||||
|
We have five labels for TSR task:
|
||||||
|
- Column
|
||||||
|
- Row
|
||||||
|
- Column header
|
||||||
|
- Projected row header
|
||||||
|
- Spanning cell
|
||||||
|
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
||||||
|
<img src="https://user-images.githubusercontent.com/10793386/139559159-cd23c972-8731-48ed-91df-f3f27e9f4d79.jpg" width="900"/>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<a name="3"></a>
|
||||||
|
## 3. Parser
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Four kinds of document formats as PDF, DOCX, EXCEL and PPT have their corresponding parser.
|
||||||
|
The most complex one is PDF parser since PDF's flexibility. The output of PDF parser includes:
|
||||||
|
- Text chunks with their own positions in PDF(page number and rectangular positions).
|
||||||
|
- Tables with cropped image from the PDF, and contents which has already translated into natural language sentences.
|
||||||
|
- Figures with caption and text in the figures.
|
||||||
|
|
||||||
|
###Résumé
|
||||||
|
|
||||||
|
---
|
||||||
|
The résumé is a very complicated kind of document. A résumé which is composed of unstructured text
|
||||||
|
with various layouts could be resolved into structured data composed of nearly a hundred of fields.
|
||||||
|
We haven't opened the parser yet, as we open the processing method after parsing procedure.
|
||||||
|
|
||||||
|
|
1
deepdoc/README_zh.md
Normal file
1
deepdoc/README_zh.md
Normal file
@ -0,0 +1 @@
|
|||||||
|
[English](./README.md) | 简体中文
|
@ -1,223 +1,8 @@
|
|||||||
import random
|
|
||||||
|
|
||||||
from .pdf_parser import HuParser as PdfParser
|
from .pdf_parser import HuParser as PdfParser
|
||||||
from .docx_parser import HuDocxParser as DocxParser
|
from .docx_parser import HuDocxParser as DocxParser
|
||||||
from .excel_parser import HuExcelParser as ExcelParser
|
from .excel_parser import HuExcelParser as ExcelParser
|
||||||
|
from .ppt_parser import HuPptParser as PptParser
|
||||||
import re
|
|
||||||
|
|
||||||
from nltk import word_tokenize
|
|
||||||
|
|
||||||
from rag.nlp import stemmer, huqie
|
|
||||||
from rag.utils import num_tokens_from_string
|
|
||||||
|
|
||||||
BULLET_PATTERN = [[
|
|
||||||
r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
|
|
||||||
r"第[零一二三四五六七八九十百0-9]+章",
|
|
||||||
r"第[零一二三四五六七八九十百0-9]+节",
|
|
||||||
r"第[零一二三四五六七八九十百0-9]+条",
|
|
||||||
r"[\((][零一二三四五六七八九十百]+[\))]",
|
|
||||||
], [
|
|
||||||
r"第[0-9]+章",
|
|
||||||
r"第[0-9]+节",
|
|
||||||
r"[0-9]{,3}[\. 、]",
|
|
||||||
r"[0-9]{,2}\.[0-9]{,2}",
|
|
||||||
r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
|
|
||||||
r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
|
|
||||||
], [
|
|
||||||
r"第[零一二三四五六七八九十百0-9]+章",
|
|
||||||
r"第[零一二三四五六七八九十百0-9]+节",
|
|
||||||
r"[零一二三四五六七八九十百]+[ 、]",
|
|
||||||
r"[\((][零一二三四五六七八九十百]+[\))]",
|
|
||||||
r"[\((][0-9]{,2}[\))]",
|
|
||||||
], [
|
|
||||||
r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
|
|
||||||
r"Chapter (I+V?|VI*|XI|IX|X)",
|
|
||||||
r"Section [0-9]+",
|
|
||||||
r"Article [0-9]+"
|
|
||||||
]
|
|
||||||
]
|
|
||||||
|
|
||||||
def random_choices(arr, k):
|
|
||||||
k = min(len(arr), k)
|
|
||||||
return random.choices(arr, k=k)
|
|
||||||
|
|
||||||
def bullets_category(sections):
|
|
||||||
global BULLET_PATTERN
|
|
||||||
hits = [0] * len(BULLET_PATTERN)
|
|
||||||
for i, pro in enumerate(BULLET_PATTERN):
|
|
||||||
for sec in sections:
|
|
||||||
for p in pro:
|
|
||||||
if re.match(p, sec):
|
|
||||||
hits[i] += 1
|
|
||||||
break
|
|
||||||
maxium = 0
|
|
||||||
res = -1
|
|
||||||
for i, h in enumerate(hits):
|
|
||||||
if h <= maxium: continue
|
|
||||||
res = i
|
|
||||||
maxium = h
|
|
||||||
return res
|
|
||||||
|
|
||||||
|
|
||||||
def is_english(texts):
|
|
||||||
eng = 0
|
|
||||||
for t in texts:
|
|
||||||
if re.match(r"[a-zA-Z]{2,}", t.strip()):
|
|
||||||
eng += 1
|
|
||||||
if eng / len(texts) > 0.8:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def tokenize(d, t, eng):
|
|
||||||
d["content_with_weight"] = t
|
|
||||||
if eng:
|
|
||||||
t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
|
|
||||||
d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
|
|
||||||
else:
|
|
||||||
d["content_ltks"] = huqie.qie(t)
|
|
||||||
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
|
|
||||||
|
|
||||||
|
|
||||||
def remove_contents_table(sections, eng=False):
|
|
||||||
i = 0
|
|
||||||
while i < len(sections):
|
|
||||||
def get(i):
|
|
||||||
nonlocal sections
|
|
||||||
return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
|
|
||||||
|
|
||||||
if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
|
|
||||||
re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
|
|
||||||
i += 1
|
|
||||||
continue
|
|
||||||
sections.pop(i)
|
|
||||||
if i >= len(sections): break
|
|
||||||
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
|
|
||||||
while not prefix:
|
|
||||||
sections.pop(i)
|
|
||||||
if i >= len(sections): break
|
|
||||||
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
|
|
||||||
sections.pop(i)
|
|
||||||
if i >= len(sections) or not prefix: break
|
|
||||||
for j in range(i, min(i + 128, len(sections))):
|
|
||||||
if not re.match(prefix, get(j)):
|
|
||||||
continue
|
|
||||||
for _ in range(i, j): sections.pop(i)
|
|
||||||
break
|
|
||||||
|
|
||||||
|
|
||||||
def make_colon_as_title(sections):
|
|
||||||
if not sections: return []
|
|
||||||
if type(sections[0]) == type(""): return sections
|
|
||||||
i = 0
|
|
||||||
while i < len(sections):
|
|
||||||
txt, layout = sections[i]
|
|
||||||
i += 1
|
|
||||||
txt = txt.split("@")[0].strip()
|
|
||||||
if not txt:
|
|
||||||
continue
|
|
||||||
if txt[-1] not in "::":
|
|
||||||
continue
|
|
||||||
txt = txt[::-1]
|
|
||||||
arr = re.split(r"([。?!!?;;]| .)", txt)
|
|
||||||
if len(arr) < 2 or len(arr[1]) < 32:
|
|
||||||
continue
|
|
||||||
sections.insert(i - 1, (arr[0][::-1], "title"))
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
|
|
||||||
def hierarchical_merge(bull, sections, depth):
|
|
||||||
if not sections or bull < 0: return []
|
|
||||||
if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
|
|
||||||
sections = [(t,o) for t, o in sections if t and len(t.split("@")[0].strip()) > 1 and not re.match(r"[0-9]+$", t.split("@")[0].strip())]
|
|
||||||
bullets_size = len(BULLET_PATTERN[bull])
|
|
||||||
levels = [[] for _ in range(bullets_size + 2)]
|
|
||||||
|
|
||||||
def not_title(txt):
|
|
||||||
if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt): return False
|
|
||||||
if len(txt) >= 128: return True
|
|
||||||
return re.search(r"[,;,。;!!]", txt)
|
|
||||||
|
|
||||||
for i, (txt, layout) in enumerate(sections):
|
|
||||||
for j, p in enumerate(BULLET_PATTERN[bull]):
|
|
||||||
if re.match(p, txt.strip()) and not not_title(txt):
|
|
||||||
levels[j].append(i)
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
if re.search(r"(title|head)", layout):
|
|
||||||
levels[bullets_size].append(i)
|
|
||||||
else:
|
|
||||||
levels[bullets_size + 1].append(i)
|
|
||||||
sections = [t for t, _ in sections]
|
|
||||||
for s in sections: print("--", s)
|
|
||||||
|
|
||||||
def binary_search(arr, target):
|
|
||||||
if not arr: return -1
|
|
||||||
if target > arr[-1]: return len(arr) - 1
|
|
||||||
if target < arr[0]: return -1
|
|
||||||
s, e = 0, len(arr)
|
|
||||||
while e - s > 1:
|
|
||||||
i = (e + s) // 2
|
|
||||||
if target > arr[i]:
|
|
||||||
s = i
|
|
||||||
continue
|
|
||||||
elif target < arr[i]:
|
|
||||||
e = i
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
assert False
|
|
||||||
return s
|
|
||||||
|
|
||||||
cks = []
|
|
||||||
readed = [False] * len(sections)
|
|
||||||
levels = levels[::-1]
|
|
||||||
for i, arr in enumerate(levels[:depth]):
|
|
||||||
for j in arr:
|
|
||||||
if readed[j]: continue
|
|
||||||
readed[j] = True
|
|
||||||
cks.append([j])
|
|
||||||
if i + 1 == len(levels) - 1: continue
|
|
||||||
for ii in range(i + 1, len(levels)):
|
|
||||||
jj = binary_search(levels[ii], j)
|
|
||||||
if jj < 0: continue
|
|
||||||
if jj > cks[-1][-1]: cks[-1].pop(-1)
|
|
||||||
cks[-1].append(levels[ii][jj])
|
|
||||||
for ii in cks[-1]: readed[ii] = True
|
|
||||||
for i in range(len(cks)):
|
|
||||||
cks[i] = [sections[j] for j in cks[i][::-1]]
|
|
||||||
print("--------------\n", "\n* ".join(cks[i]))
|
|
||||||
|
|
||||||
return cks
|
|
||||||
|
|
||||||
|
|
||||||
def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
|
||||||
if not sections: return []
|
|
||||||
if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
|
|
||||||
cks = [""]
|
|
||||||
tk_nums = [0]
|
|
||||||
def add_chunk(t, pos):
|
|
||||||
nonlocal cks, tk_nums, delimiter
|
|
||||||
tnum = num_tokens_from_string(t)
|
|
||||||
if tnum < 8: pos = ""
|
|
||||||
if tk_nums[-1] > chunk_token_num:
|
|
||||||
cks.append(t + pos)
|
|
||||||
tk_nums.append(tnum)
|
|
||||||
else:
|
|
||||||
cks[-1] += t + pos
|
|
||||||
tk_nums[-1] += tnum
|
|
||||||
|
|
||||||
for sec, pos in sections:
|
|
||||||
s, e = 0, 1
|
|
||||||
while e < len(sec):
|
|
||||||
if sec[e] in delimiter:
|
|
||||||
add_chunk(sec[s: e+1], pos)
|
|
||||||
s = e + 1
|
|
||||||
e = s + 1
|
|
||||||
else:
|
|
||||||
e += 1
|
|
||||||
if s < e: add_chunk(sec[s: e], pos)
|
|
||||||
|
|
||||||
return cks
|
|
||||||
|
|
||||||
|
|
||||||
|
52
deepdoc/parser/ppt_parser.py
Normal file
52
deepdoc/parser/ppt_parser.py
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
from io import BytesIO
|
||||||
|
from pptx import Presentation
|
||||||
|
|
||||||
|
|
||||||
|
class HuPptParser(object):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
def __extract(self, shape):
|
||||||
|
if shape.shape_type == 19:
|
||||||
|
tb = shape.table
|
||||||
|
rows = []
|
||||||
|
for i in range(1, len(tb.rows)):
|
||||||
|
rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
|
||||||
|
return "\n".join(rows)
|
||||||
|
|
||||||
|
if shape.has_text_frame:
|
||||||
|
return shape.text_frame.text
|
||||||
|
|
||||||
|
if shape.shape_type == 6:
|
||||||
|
texts = []
|
||||||
|
for p in shape.shapes:
|
||||||
|
t = self.__extract(p)
|
||||||
|
if t: texts.append(t)
|
||||||
|
return "\n".join(texts)
|
||||||
|
|
||||||
|
def __call__(self, fnm, from_page, to_page, callback=None):
|
||||||
|
ppt = Presentation(fnm) if isinstance(
|
||||||
|
fnm, str) else Presentation(
|
||||||
|
BytesIO(fnm))
|
||||||
|
txts = []
|
||||||
|
self.total_page = len(ppt.slides)
|
||||||
|
for i, slide in enumerate(ppt.slides[from_page: to_page]):
|
||||||
|
texts = []
|
||||||
|
for shape in slide.shapes:
|
||||||
|
txt = self.__extract(shape)
|
||||||
|
if txt: texts.append(txt)
|
||||||
|
txts.append("\n".join(texts))
|
||||||
|
|
||||||
|
return txts
|
52
deepdoc/parser/resume/__init__.py
Normal file
52
deepdoc/parser/resume/__init__.py
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
import datetime
|
||||||
|
|
||||||
|
|
||||||
|
def refactor(cv):
|
||||||
|
for n in ["raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"]:
|
||||||
|
if n in cv and cv[n] is not None: del cv[n]
|
||||||
|
cv["is_deleted"] = 0
|
||||||
|
if "basic" not in cv: cv["basic"] = {}
|
||||||
|
if cv["basic"].get("photo2"): del cv["basic"]["photo2"]
|
||||||
|
|
||||||
|
for n in ["education", "work", "certificate", "project", "language", "skill", "training"]:
|
||||||
|
if n not in cv or cv[n] is None: continue
|
||||||
|
if type(cv[n]) == type({}): cv[n] = [v for _, v in cv[n].items()]
|
||||||
|
if type(cv[n]) != type([]):
|
||||||
|
del cv[n]
|
||||||
|
continue
|
||||||
|
vv = []
|
||||||
|
for v in cv[n]:
|
||||||
|
if "external" in v and v["external"] is not None: del v["external"]
|
||||||
|
vv.append(v)
|
||||||
|
cv[n] = {str(i): vv[i] for i in range(len(vv))}
|
||||||
|
|
||||||
|
basics = [
|
||||||
|
("basic_salary_month", "salary_month"),
|
||||||
|
("expect_annual_salary_from", "expect_annual_salary"),
|
||||||
|
]
|
||||||
|
for n, t in basics:
|
||||||
|
if cv["basic"].get(n):
|
||||||
|
cv["basic"][t] = cv["basic"][n]
|
||||||
|
del cv["basic"][n]
|
||||||
|
|
||||||
|
work = sorted([v for _, v in cv.get("work", {}).items()], key=lambda x: x.get("start_time", ""))
|
||||||
|
edu = sorted([v for _, v in cv.get("education", {}).items()], key=lambda x: x.get("start_time", ""))
|
||||||
|
|
||||||
|
if work:
|
||||||
|
cv["basic"]["work_start_time"] = work[0].get("start_time", "")
|
||||||
|
cv["basic"]["management_experience"] = 'Y' if any(
|
||||||
|
[w.get("management_experience", '') == 'Y' for w in work]) else 'N'
|
||||||
|
cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0")
|
||||||
|
|
||||||
|
for n in ["annual_salary_from", "annual_salary_to", "industry_name", "position_name", "responsibilities",
|
||||||
|
"corporation_type", "scale", "corporation_name"]:
|
||||||
|
cv["basic"][n] = work[-1].get(n, "")
|
||||||
|
|
||||||
|
if edu:
|
||||||
|
for n in ["school_name", "discipline_name"]:
|
||||||
|
if n in edu[-1]: cv["basic"][n] = edu[-1][n]
|
||||||
|
|
||||||
|
cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
if "contact" not in cv: cv["contact"] = {}
|
||||||
|
if not cv["contact"].get("name"): cv["contact"]["name"] = cv["basic"].get("name", "")
|
||||||
|
return cv
|
0
deepdoc/parser/resume/entities/__init__.py
Normal file
0
deepdoc/parser/resume/entities/__init__.py
Normal file
80
deepdoc/parser/resume/entities/corporations.py
Normal file
80
deepdoc/parser/resume/entities/corporations.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
import re,json,os
|
||||||
|
import pandas as pd
|
||||||
|
from rag.nlp import huqie
|
||||||
|
from . import regions
|
||||||
|
current_file_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
|
||||||
|
GOODS["cid"] = GOODS["cid"].astype(str)
|
||||||
|
GOODS = GOODS.set_index(["cid"])
|
||||||
|
CORP_TKS = json.load(open(os.path.join(current_file_path, "res/corp.tks.freq.json"), "r"))
|
||||||
|
GOOD_CORP = json.load(open(os.path.join(current_file_path, "res/good_corp.json"), "r"))
|
||||||
|
CORP_TAG = json.load(open(os.path.join(current_file_path, "res/corp_tag.json"), "r"))
|
||||||
|
|
||||||
|
def baike(cid, default_v=0):
|
||||||
|
global GOODS
|
||||||
|
try:
|
||||||
|
return GOODS.loc[str(cid), "len"]
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
return default_v
|
||||||
|
|
||||||
|
|
||||||
|
def corpNorm(nm, add_region=True):
|
||||||
|
global CORP_TKS
|
||||||
|
if not nm or type(nm)!=type(""):return ""
|
||||||
|
nm = huqie.tradi2simp(huqie.strQ2B(nm)).lower()
|
||||||
|
nm = re.sub(r"&", "&", nm)
|
||||||
|
nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
|
||||||
|
nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE)
|
||||||
|
nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
|
||||||
|
if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
|
||||||
|
|
||||||
|
tks = huqie.qie(nm).split(" ")
|
||||||
|
reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
|
||||||
|
nm = ""
|
||||||
|
for t in tks:
|
||||||
|
if regions.isName(t) or t in CORP_TKS:continue
|
||||||
|
if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):nm += " "
|
||||||
|
nm += t
|
||||||
|
|
||||||
|
r = re.search(r"^([^a-z0-9 \(\)&]{2,})[a-z ]{4,}$", nm.strip())
|
||||||
|
if r:nm = r.group(1)
|
||||||
|
r = re.search(r"^([a-z ]{3,})[^a-z0-9 \(\)&]{2,}$", nm.strip())
|
||||||
|
if r:nm = r.group(1)
|
||||||
|
return nm.strip() + (("" if not reg else "(%s)"%reg[0]) if add_region else "")
|
||||||
|
|
||||||
|
|
||||||
|
def rmNoise(n):
|
||||||
|
n = re.sub(r"[\((][^()()]+[))]", "", n)
|
||||||
|
n = re.sub(r"[,. &()()]+", "", n)
|
||||||
|
return n
|
||||||
|
|
||||||
|
GOOD_CORP = set([corpNorm(rmNoise(c), False) for c in GOOD_CORP])
|
||||||
|
for c,v in CORP_TAG.items():
|
||||||
|
cc = corpNorm(rmNoise(c), False)
|
||||||
|
if not cc: print (c)
|
||||||
|
CORP_TAG = {corpNorm(rmNoise(c), False):v for c,v in CORP_TAG.items()}
|
||||||
|
|
||||||
|
def is_good(nm):
|
||||||
|
global GOOD_CORP
|
||||||
|
if nm.find("外派")>=0:return False
|
||||||
|
nm = rmNoise(nm)
|
||||||
|
nm = corpNorm(nm, False)
|
||||||
|
for n in GOOD_CORP:
|
||||||
|
if re.match(r"[0-9a-zA-Z]+$", n):
|
||||||
|
if n == nm: return True
|
||||||
|
elif nm.find(n)>=0:return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def corp_tag(nm):
|
||||||
|
global CORP_TAG
|
||||||
|
nm = rmNoise(nm)
|
||||||
|
nm = corpNorm(nm, False)
|
||||||
|
for n in CORP_TAG.keys():
|
||||||
|
if re.match(r"[0-9a-zA-Z., ]+$", n):
|
||||||
|
if n == nm: return CORP_TAG[n]
|
||||||
|
elif nm.find(n)>=0:
|
||||||
|
if len(n)<3 and len(nm)/len(n)>=2:continue
|
||||||
|
return CORP_TAG[n]
|
||||||
|
return []
|
||||||
|
|
24
deepdoc/parser/resume/entities/degrees.py
Normal file
24
deepdoc/parser/resume/entities/degrees.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
TBL = {"94":"EMBA",
|
||||||
|
"6":"MBA",
|
||||||
|
"95":"MPA",
|
||||||
|
"92":"专升本",
|
||||||
|
"4":"专科",
|
||||||
|
"90":"中专",
|
||||||
|
"91":"中技",
|
||||||
|
"86":"初中",
|
||||||
|
"3":"博士",
|
||||||
|
"10":"博士后",
|
||||||
|
"1":"本科",
|
||||||
|
"2":"硕士",
|
||||||
|
"87":"职高",
|
||||||
|
"89":"高中"
|
||||||
|
}
|
||||||
|
|
||||||
|
TBL_ = {v:k for k,v in TBL.items()}
|
||||||
|
|
||||||
|
def get_name(id):
|
||||||
|
return TBL.get(str(id), "")
|
||||||
|
|
||||||
|
def get_id(nm):
|
||||||
|
if not nm:return ""
|
||||||
|
return TBL_.get(nm.upper().strip(), "")
|
692
deepdoc/parser/resume/entities/industries.py
Normal file
692
deepdoc/parser/resume/entities/industries.py
Normal file
@ -0,0 +1,692 @@
|
|||||||
|
|
||||||
|
TBL = {"1":{"name":"IT/通信/电子","parent":"0"},
|
||||||
|
"2":{"name":"互联网","parent":"0"},
|
||||||
|
"3":{"name":"电子商务","parent":"2"},
|
||||||
|
"4":{"name":"互联网金融","parent":"2"},
|
||||||
|
"5":{"name":"网络游戏","parent":"2"},
|
||||||
|
"6":{"name":"社交网络平台","parent":"2"},
|
||||||
|
"7":{"name":"视频音乐","parent":"2"},
|
||||||
|
"9":{"name":"安全","parent":"2"},
|
||||||
|
"10":{"name":"云计算","parent":"2"},
|
||||||
|
"12":{"name":"工具类客户端应用","parent":"2"},
|
||||||
|
"13":{"name":"互联网广告","parent":"2"},
|
||||||
|
"14":{"name":"企业互联网服务","parent":"2"},
|
||||||
|
"16":{"name":"在线教育","parent":"2"},
|
||||||
|
"17":{"name":"在线医疗","parent":"2"},
|
||||||
|
"19":{"name":"B2B","parent":"3"},
|
||||||
|
"20":{"name":"B2C","parent":"3"},
|
||||||
|
"21":{"name":"C2C","parent":"3"},
|
||||||
|
"22":{"name":"生活信息本地化","parent":"3"},
|
||||||
|
"23":{"name":"在线旅游","parent":"2"},
|
||||||
|
"24":{"name":"第三方支付","parent":"4"},
|
||||||
|
"26":{"name":"客户端游戏","parent":"5"},
|
||||||
|
"27":{"name":"网页游戏","parent":"5"},
|
||||||
|
"28":{"name":"手机游戏","parent":"5"},
|
||||||
|
"29":{"name":"微博","parent":"6"},
|
||||||
|
"30":{"name":"社交网站","parent":"6"},
|
||||||
|
"31":{"name":"在线视频","parent":"7"},
|
||||||
|
"32":{"name":"在线音乐","parent":"7"},
|
||||||
|
"35":{"name":"企业安全","parent":"9"},
|
||||||
|
"36":{"name":"个人安全","parent":"9"},
|
||||||
|
"37":{"name":"企业级云服务","parent":"10"},
|
||||||
|
"38":{"name":"个人级云服务","parent":"10"},
|
||||||
|
"43":{"name":"输入法","parent":"12"},
|
||||||
|
"44":{"name":"浏览器","parent":"12"},
|
||||||
|
"45":{"name":"词典","parent":"12"},
|
||||||
|
"46":{"name":"播放器","parent":"12"},
|
||||||
|
"47":{"name":"下载器","parent":"12"},
|
||||||
|
"48":{"name":"IM","parent":"12"},
|
||||||
|
"49":{"name":"广告服务","parent":"13"},
|
||||||
|
"50":{"name":"第三方广告网络平台","parent":"13"},
|
||||||
|
"51":{"name":"媒体代理","parent":"13"},
|
||||||
|
"52":{"name":"创意代理","parent":"13"},
|
||||||
|
"53":{"name":"IT-综合","parent":"1"},
|
||||||
|
"71":{"name":"团购","parent":"3"},
|
||||||
|
"72":{"name":"地图","parent":"2"},
|
||||||
|
"73":{"name":"数据存储","parent":"2"},
|
||||||
|
"414":{"name":"计算机软件","parent":"1"},
|
||||||
|
"415":{"name":"计算机硬件","parent":"1"},
|
||||||
|
"416":{"name":"计算机服务(系统、数据服务、维修)","parent":"1"},
|
||||||
|
"417":{"name":"通信/电信/网络设备","parent":"1"},
|
||||||
|
"418":{"name":"通信/电信运营、增值服务","parent":"1"},
|
||||||
|
"419":{"name":"电子技术/半导体/集成电路","parent":"1"},
|
||||||
|
"472":{"name":"P2P网贷","parent":"4"},
|
||||||
|
"473":{"name":"互联网理财","parent":"4"},
|
||||||
|
"474":{"name":"婚恋","parent":"6"},
|
||||||
|
"476":{"name":"虚拟化","parent":"10"},
|
||||||
|
"477":{"name":"邮箱","parent":"12"},
|
||||||
|
"478":{"name":"商业智能","parent":"14"},
|
||||||
|
"479":{"name":"企业建站","parent":"14"},
|
||||||
|
"480":{"name":"安防","parent":"14"},
|
||||||
|
"481":{"name":"网络营销","parent":"2"},
|
||||||
|
"487":{"name":"智能终端","parent":"2"},
|
||||||
|
"488":{"name":"移动互联网","parent":"2"},
|
||||||
|
"489":{"name":"数字城市","parent":"2"},
|
||||||
|
"490":{"name":"大数据","parent":"2"},
|
||||||
|
"491":{"name":"互联网人力资源","parent":"2"},
|
||||||
|
"492":{"name":"舆情监控","parent":"2"},
|
||||||
|
"493":{"name":"移动营销","parent":"481"},
|
||||||
|
"494":{"name":"微博营销","parent":"481"},
|
||||||
|
"495":{"name":"精准营销","parent":"481"},
|
||||||
|
"496":{"name":"海外营销","parent":"481"},
|
||||||
|
"497":{"name":"微信营销","parent":"481"},
|
||||||
|
"498":{"name":"智能手机","parent":"487"},
|
||||||
|
"499":{"name":"可穿戴设备","parent":"487"},
|
||||||
|
"500":{"name":"智能电视","parent":"487"},
|
||||||
|
"501":{"name":"WAP","parent":"488"},
|
||||||
|
"502":{"name":"物联网","parent":"489"},
|
||||||
|
"503":{"name":"O2O","parent":"489"},
|
||||||
|
"504":{"name":"数字出版","parent":"489"},
|
||||||
|
"505":{"name":"搜索","parent":"2"},
|
||||||
|
"506":{"name":"垂直搜索","parent":"505"},
|
||||||
|
"507":{"name":"无线搜索","parent":"505"},
|
||||||
|
"508":{"name":"网页搜索","parent":"505"},
|
||||||
|
"509":{"name":"网址导航","parent":"2"},
|
||||||
|
"510":{"name":"门户","parent":"2"},
|
||||||
|
"511":{"name":"网络文学","parent":"2"},
|
||||||
|
"512":{"name":"自媒体","parent":"2"},
|
||||||
|
"513":{"name":"金融","parent":"0"},
|
||||||
|
"514":{"name":"建筑与房地产","parent":"0"},
|
||||||
|
"515":{"name":"专业服务","parent":"0"},
|
||||||
|
"516":{"name":"教育培训","parent":"0"},
|
||||||
|
"517":{"name":"文化传媒","parent":"0"},
|
||||||
|
"518":{"name":"消费品","parent":"0"},
|
||||||
|
"519":{"name":"工业","parent":"0"},
|
||||||
|
"520":{"name":"交通物流","parent":"0"},
|
||||||
|
"521":{"name":"贸易","parent":"0"},
|
||||||
|
"522":{"name":"医药","parent":"0"},
|
||||||
|
"523":{"name":"医疗器械","parent":"522"},
|
||||||
|
"524":{"name":"保健品","parent":"518"},
|
||||||
|
"525":{"name":"服务业","parent":"0"},
|
||||||
|
"526":{"name":"能源/矿产/环保","parent":"0"},
|
||||||
|
"527":{"name":"化工","parent":"0"},
|
||||||
|
"528":{"name":"政府","parent":"0"},
|
||||||
|
"529":{"name":"公共事业","parent":"0"},
|
||||||
|
"530":{"name":"非盈利机构","parent":"0"},
|
||||||
|
"531":{"name":"农业","parent":"1131"},
|
||||||
|
"532":{"name":"林业","parent":"1131"},
|
||||||
|
"533":{"name":"畜牧业","parent":"1131"},
|
||||||
|
"534":{"name":"渔业","parent":"1131"},
|
||||||
|
"535":{"name":"学术科研","parent":"0"},
|
||||||
|
"536":{"name":"零售","parent":"0"},
|
||||||
|
"537":{"name":"银行","parent":"513"},
|
||||||
|
"538":{"name":"保险","parent":"513"},
|
||||||
|
"539":{"name":"证券","parent":"513"},
|
||||||
|
"540":{"name":"基金","parent":"513"},
|
||||||
|
"541":{"name":"信托","parent":"513"},
|
||||||
|
"542":{"name":"担保","parent":"513"},
|
||||||
|
"543":{"name":"典当","parent":"513"},
|
||||||
|
"544":{"name":"拍卖","parent":"513"},
|
||||||
|
"545":{"name":"投资/融资","parent":"513"},
|
||||||
|
"546":{"name":"期货","parent":"513"},
|
||||||
|
"547":{"name":"房地产开发","parent":"514"},
|
||||||
|
"548":{"name":"工程施工","parent":"514"},
|
||||||
|
"549":{"name":"建筑设计","parent":"514"},
|
||||||
|
"550":{"name":"房地产代理","parent":"514"},
|
||||||
|
"551":{"name":"物业管理","parent":"514"},
|
||||||
|
"552":{"name":"室内设计","parent":"514"},
|
||||||
|
"553":{"name":"装修装潢","parent":"514"},
|
||||||
|
"554":{"name":"市政工程","parent":"514"},
|
||||||
|
"555":{"name":"工程造价","parent":"514"},
|
||||||
|
"556":{"name":"工程监理","parent":"514"},
|
||||||
|
"557":{"name":"环境工程","parent":"514"},
|
||||||
|
"558":{"name":"园林景观","parent":"514"},
|
||||||
|
"559":{"name":"法律","parent":"515"},
|
||||||
|
"560":{"name":"人力资源","parent":"515"},
|
||||||
|
"561":{"name":"会计","parent":"1125"},
|
||||||
|
"562":{"name":"审计","parent":"515"},
|
||||||
|
"563":{"name":"检测认证","parent":"515"},
|
||||||
|
"565":{"name":"翻译","parent":"515"},
|
||||||
|
"566":{"name":"中介","parent":"515"},
|
||||||
|
"567":{"name":"咨询","parent":"515"},
|
||||||
|
"568":{"name":"外包服务","parent":"515"},
|
||||||
|
"569":{"name":"家教","parent":"516"},
|
||||||
|
"570":{"name":"早教","parent":"516"},
|
||||||
|
"571":{"name":"职业技能培训","parent":"516"},
|
||||||
|
"572":{"name":"外语培训","parent":"516"},
|
||||||
|
"573":{"name":"设计培训","parent":"516"},
|
||||||
|
"574":{"name":"IT培训","parent":"516"},
|
||||||
|
"575":{"name":"文艺体育培训","parent":"516"},
|
||||||
|
"576":{"name":"学历教育","parent":"516"},
|
||||||
|
"577":{"name":"管理培训","parent":"516"},
|
||||||
|
"578":{"name":"民办基础教育","parent":"516"},
|
||||||
|
"579":{"name":"广告","parent":"517"},
|
||||||
|
"580":{"name":"媒体","parent":"517"},
|
||||||
|
"581":{"name":"会展","parent":"517"},
|
||||||
|
"582":{"name":"公关","parent":"517"},
|
||||||
|
"583":{"name":"影视","parent":"517"},
|
||||||
|
"584":{"name":"艺术","parent":"517"},
|
||||||
|
"585":{"name":"文化传播","parent":"517"},
|
||||||
|
"586":{"name":"娱乐","parent":"517"},
|
||||||
|
"587":{"name":"体育","parent":"517"},
|
||||||
|
"588":{"name":"出版","parent":"517"},
|
||||||
|
"589":{"name":"休闲","parent":"517"},
|
||||||
|
"590":{"name":"动漫","parent":"517"},
|
||||||
|
"591":{"name":"市场推广","parent":"517"},
|
||||||
|
"592":{"name":"市场研究","parent":"517"},
|
||||||
|
"593":{"name":"食品","parent":"1129"},
|
||||||
|
"594":{"name":"饮料","parent":"1129"},
|
||||||
|
"595":{"name":"烟草","parent":"1129"},
|
||||||
|
"596":{"name":"酒品","parent":"518"},
|
||||||
|
"597":{"name":"服饰","parent":"518"},
|
||||||
|
"598":{"name":"纺织","parent":"518"},
|
||||||
|
"599":{"name":"化妆品","parent":"1129"},
|
||||||
|
"600":{"name":"日用品","parent":"1129"},
|
||||||
|
"601":{"name":"家电","parent":"518"},
|
||||||
|
"602":{"name":"家具","parent":"518"},
|
||||||
|
"603":{"name":"办公用品","parent":"518"},
|
||||||
|
"604":{"name":"奢侈品","parent":"518"},
|
||||||
|
"605":{"name":"珠宝","parent":"518"},
|
||||||
|
"606":{"name":"数码产品","parent":"518"},
|
||||||
|
"607":{"name":"玩具","parent":"518"},
|
||||||
|
"608":{"name":"图书","parent":"518"},
|
||||||
|
"609":{"name":"音像","parent":"518"},
|
||||||
|
"610":{"name":"钟表","parent":"518"},
|
||||||
|
"611":{"name":"箱包","parent":"518"},
|
||||||
|
"612":{"name":"母婴","parent":"518"},
|
||||||
|
"613":{"name":"营养保健","parent":"518"},
|
||||||
|
"614":{"name":"户外用品","parent":"518"},
|
||||||
|
"615":{"name":"健身器材","parent":"518"},
|
||||||
|
"616":{"name":"乐器","parent":"518"},
|
||||||
|
"617":{"name":"汽车用品","parent":"518"},
|
||||||
|
"619":{"name":"厨具","parent":"518"},
|
||||||
|
"620":{"name":"机械制造","parent":"519"},
|
||||||
|
"621":{"name":"流体控制","parent":"519"},
|
||||||
|
"622":{"name":"自动化控制","parent":"519"},
|
||||||
|
"623":{"name":"仪器仪表","parent":"519"},
|
||||||
|
"624":{"name":"航空/航天","parent":"519"},
|
||||||
|
"625":{"name":"交通设施","parent":"519"},
|
||||||
|
"626":{"name":"工业电子","parent":"519"},
|
||||||
|
"627":{"name":"建材","parent":"519"},
|
||||||
|
"628":{"name":"五金材料","parent":"519"},
|
||||||
|
"629":{"name":"汽车","parent":"519"},
|
||||||
|
"630":{"name":"印刷","parent":"519"},
|
||||||
|
"631":{"name":"造纸","parent":"519"},
|
||||||
|
"632":{"name":"包装","parent":"519"},
|
||||||
|
"633":{"name":"原材料及加工","parent":"519"},
|
||||||
|
"634":{"name":"物流","parent":"520"},
|
||||||
|
"635":{"name":"仓储","parent":"520"},
|
||||||
|
"636":{"name":"客运","parent":"520"},
|
||||||
|
"637":{"name":"快递","parent":"520"},
|
||||||
|
"638":{"name":"化学药","parent":"522"},
|
||||||
|
"639":{"name":"中药","parent":"522"},
|
||||||
|
"640":{"name":"生物制药","parent":"522"},
|
||||||
|
"641":{"name":"兽药","parent":"522"},
|
||||||
|
"642":{"name":"农药","parent":"522"},
|
||||||
|
"643":{"name":"CRO","parent":"522"},
|
||||||
|
"644":{"name":"消毒","parent":"522"},
|
||||||
|
"645":{"name":"医药商业","parent":"522"},
|
||||||
|
"646":{"name":"医疗服务","parent":"522"},
|
||||||
|
"647":{"name":"医疗器械","parent":"523"},
|
||||||
|
"648":{"name":"制药设备","parent":"523"},
|
||||||
|
"649":{"name":"医用耗材","parent":"523"},
|
||||||
|
"650":{"name":"手术器械","parent":"523"},
|
||||||
|
"651":{"name":"保健器材","parent":"524"},
|
||||||
|
"652":{"name":"性保健品","parent":"524"},
|
||||||
|
"653":{"name":"医药保养","parent":"524"},
|
||||||
|
"654":{"name":"医用保健","parent":"524"},
|
||||||
|
"655":{"name":"酒店","parent":"525"},
|
||||||
|
"656":{"name":"餐饮","parent":"525"},
|
||||||
|
"657":{"name":"旅游","parent":"525"},
|
||||||
|
"658":{"name":"生活服务","parent":"525"},
|
||||||
|
"659":{"name":"保健服务","parent":"525"},
|
||||||
|
"660":{"name":"运动健身","parent":"525"},
|
||||||
|
"661":{"name":"家政服务","parent":"525"},
|
||||||
|
"662":{"name":"婚庆服务","parent":"525"},
|
||||||
|
"663":{"name":"租赁服务","parent":"525"},
|
||||||
|
"664":{"name":"维修服务","parent":"525"},
|
||||||
|
"665":{"name":"石油天然气","parent":"526"},
|
||||||
|
"666":{"name":"电力","parent":"526"},
|
||||||
|
"667":{"name":"新能源","parent":"526"},
|
||||||
|
"668":{"name":"水利","parent":"526"},
|
||||||
|
"669":{"name":"矿产","parent":"526"},
|
||||||
|
"670":{"name":"采掘业","parent":"526"},
|
||||||
|
"671":{"name":"冶炼","parent":"526"},
|
||||||
|
"672":{"name":"环保","parent":"526"},
|
||||||
|
"673":{"name":"无机化工原料","parent":"527"},
|
||||||
|
"674":{"name":"有机化工原料","parent":"527"},
|
||||||
|
"675":{"name":"精细化学品","parent":"527"},
|
||||||
|
"676":{"name":"化工设备","parent":"527"},
|
||||||
|
"677":{"name":"化工工程","parent":"527"},
|
||||||
|
"678":{"name":"资产管理","parent":"513"},
|
||||||
|
"679":{"name":"金融租赁","parent":"513"},
|
||||||
|
"680":{"name":"征信及信评机构","parent":"513"},
|
||||||
|
"681":{"name":"资产评估机构","parent":"513"},
|
||||||
|
"683":{"name":"金融监管机构","parent":"513"},
|
||||||
|
"684":{"name":"国际贸易","parent":"521"},
|
||||||
|
"685":{"name":"海关","parent":"521"},
|
||||||
|
"686":{"name":"购物中心","parent":"536"},
|
||||||
|
"687":{"name":"超市","parent":"536"},
|
||||||
|
"688":{"name":"便利店","parent":"536"},
|
||||||
|
"689":{"name":"专卖店","parent":"536"},
|
||||||
|
"690":{"name":"专业店","parent":"536"},
|
||||||
|
"691":{"name":"百货店","parent":"536"},
|
||||||
|
"692":{"name":"杂货店","parent":"536"},
|
||||||
|
"693":{"name":"个人银行","parent":"537"},
|
||||||
|
"695":{"name":"私人银行","parent":"537"},
|
||||||
|
"696":{"name":"公司银行","parent":"537"},
|
||||||
|
"697":{"name":"投资银行","parent":"537"},
|
||||||
|
"698":{"name":"政策性银行","parent":"537"},
|
||||||
|
"699":{"name":"中央银行","parent":"537"},
|
||||||
|
"700":{"name":"人寿险","parent":"538"},
|
||||||
|
"701":{"name":"财产险","parent":"538"},
|
||||||
|
"702":{"name":"再保险","parent":"538"},
|
||||||
|
"703":{"name":"养老险","parent":"538"},
|
||||||
|
"704":{"name":"保险代理公司","parent":"538"},
|
||||||
|
"705":{"name":"公募基金","parent":"540"},
|
||||||
|
"707":{"name":"私募基金","parent":"540"},
|
||||||
|
"708":{"name":"第三方理财","parent":"679"},
|
||||||
|
"709":{"name":"资产管理公司","parent":"679"},
|
||||||
|
"711":{"name":"房产中介","parent":"566"},
|
||||||
|
"712":{"name":"职业中介","parent":"566"},
|
||||||
|
"713":{"name":"婚姻中介","parent":"566"},
|
||||||
|
"714":{"name":"战略咨询","parent":"567"},
|
||||||
|
"715":{"name":"投资咨询","parent":"567"},
|
||||||
|
"716":{"name":"心理咨询","parent":"567"},
|
||||||
|
"717":{"name":"留学移民咨询","parent":"567"},
|
||||||
|
"718":{"name":"工商注册代理","parent":"568"},
|
||||||
|
"719":{"name":"商标专利代理","parent":"568"},
|
||||||
|
"720":{"name":"财务代理","parent":"568"},
|
||||||
|
"721":{"name":"工程机械","parent":"620"},
|
||||||
|
"722":{"name":"农业机械","parent":"620"},
|
||||||
|
"723":{"name":"海工设备","parent":"620"},
|
||||||
|
"724":{"name":"包装机械","parent":"620"},
|
||||||
|
"725":{"name":"印刷机械","parent":"620"},
|
||||||
|
"726":{"name":"数控机床","parent":"620"},
|
||||||
|
"727":{"name":"矿山机械","parent":"620"},
|
||||||
|
"728":{"name":"水泵","parent":"621"},
|
||||||
|
"729":{"name":"管道","parent":"621"},
|
||||||
|
"730":{"name":"阀门","parent":"621"},
|
||||||
|
"732":{"name":"压缩机","parent":"621"},
|
||||||
|
"733":{"name":"集散控制系统","parent":"622"},
|
||||||
|
"734":{"name":"远程控制","parent":"622"},
|
||||||
|
"735":{"name":"液压系统","parent":"622"},
|
||||||
|
"736":{"name":"楼宇智能化","parent":"622"},
|
||||||
|
"737":{"name":"飞机制造","parent":"624"},
|
||||||
|
"738":{"name":"航空公司","parent":"624"},
|
||||||
|
"739":{"name":"发动机","parent":"624"},
|
||||||
|
"740":{"name":"复合材料","parent":"624"},
|
||||||
|
"741":{"name":"高铁","parent":"625"},
|
||||||
|
"742":{"name":"地铁","parent":"625"},
|
||||||
|
"743":{"name":"信号传输","parent":"625"},
|
||||||
|
"745":{"name":"结构材料","parent":"627"},
|
||||||
|
"746":{"name":"装饰材料","parent":"627"},
|
||||||
|
"747":{"name":"专用材料","parent":"627"},
|
||||||
|
"749":{"name":"经销商集团","parent":"629"},
|
||||||
|
"750":{"name":"整车制造","parent":"629"},
|
||||||
|
"751":{"name":"汽车零配件","parent":"629"},
|
||||||
|
"752":{"name":"外型设计","parent":"629"},
|
||||||
|
"753":{"name":"平版印刷","parent":"630"},
|
||||||
|
"754":{"name":"凸版印刷","parent":"630"},
|
||||||
|
"755":{"name":"凹版印刷","parent":"630"},
|
||||||
|
"756":{"name":"孔版印刷","parent":"630"},
|
||||||
|
"757":{"name":"印刷用纸","parent":"631"},
|
||||||
|
"758":{"name":"书写、制图及复制用纸","parent":"631"},
|
||||||
|
"759":{"name":"包装用纸","parent":"631"},
|
||||||
|
"760":{"name":"生活、卫生及装饰用纸","parent":"631"},
|
||||||
|
"761":{"name":"技术用纸","parent":"631"},
|
||||||
|
"762":{"name":"加工纸原纸","parent":"631"},
|
||||||
|
"763":{"name":"食品包装","parent":"632"},
|
||||||
|
"764":{"name":"医药包装","parent":"632"},
|
||||||
|
"765":{"name":"日化包装","parent":"632"},
|
||||||
|
"766":{"name":"物流包装","parent":"632"},
|
||||||
|
"767":{"name":"礼品包装","parent":"632"},
|
||||||
|
"768":{"name":"电子五金包装","parent":"632"},
|
||||||
|
"769":{"name":"汽车服务","parent":"525"},
|
||||||
|
"770":{"name":"汽车保养","parent":"769"},
|
||||||
|
"771":{"name":"租车","parent":"769"},
|
||||||
|
"773":{"name":"出租车","parent":"769"},
|
||||||
|
"774":{"name":"代驾","parent":"769"},
|
||||||
|
"775":{"name":"发电","parent":"666"},
|
||||||
|
"777":{"name":"输配电","parent":"666"},
|
||||||
|
"779":{"name":"风电","parent":"667"},
|
||||||
|
"780":{"name":"光伏/太阳能","parent":"667"},
|
||||||
|
"781":{"name":"生物质发电","parent":"667"},
|
||||||
|
"782":{"name":"煤化工","parent":"667"},
|
||||||
|
"783":{"name":"垃圾发电","parent":"667"},
|
||||||
|
"784":{"name":"核电","parent":"667"},
|
||||||
|
"785":{"name":"能源矿产","parent":"669"},
|
||||||
|
"786":{"name":"金属矿产","parent":"669"},
|
||||||
|
"787":{"name":"非金属矿产","parent":"669"},
|
||||||
|
"788":{"name":"水气矿产","parent":"669"},
|
||||||
|
"789":{"name":"锅炉","parent":"775"},
|
||||||
|
"790":{"name":"发电机","parent":"775"},
|
||||||
|
"791":{"name":"汽轮机","parent":"775"},
|
||||||
|
"792":{"name":"燃机","parent":"775"},
|
||||||
|
"793":{"name":"冷却","parent":"775"},
|
||||||
|
"794":{"name":"电力设计院","parent":"775"},
|
||||||
|
"795":{"name":"高压输配电","parent":"777"},
|
||||||
|
"796":{"name":"中压输配电","parent":"777"},
|
||||||
|
"797":{"name":"低压输配电","parent":"777"},
|
||||||
|
"798":{"name":"继电保护","parent":"777"},
|
||||||
|
"799":{"name":"智能电网","parent":"777"},
|
||||||
|
"800":{"name":"小学","parent":"516"},
|
||||||
|
"801":{"name":"电动车","parent":"519"},
|
||||||
|
"802":{"name":"皮具箱包","parent":"518"},
|
||||||
|
"803":{"name":"医药制造","parent":"522"},
|
||||||
|
"804":{"name":"电器销售","parent":"536"},
|
||||||
|
"805":{"name":"塑料制品","parent":"527"},
|
||||||
|
"806":{"name":"公益基金会","parent":"530"},
|
||||||
|
"807":{"name":"美发服务","parent":"525"},
|
||||||
|
"808":{"name":"农业养殖","parent":"531"},
|
||||||
|
"809":{"name":"金融服务","parent":"513"},
|
||||||
|
"810":{"name":"商业地产综合体","parent":"514"},
|
||||||
|
"811":{"name":"美容服务","parent":"525"},
|
||||||
|
"812":{"name":"灯饰","parent":"518"},
|
||||||
|
"813":{"name":"油墨颜料产品","parent":"527"},
|
||||||
|
"814":{"name":"眼镜制造","parent":"518"},
|
||||||
|
"815":{"name":"农业生物技术","parent":"531"},
|
||||||
|
"816":{"name":"体育用品","parent":"518"},
|
||||||
|
"817":{"name":"保健用品","parent":"524"},
|
||||||
|
"818":{"name":"化学化工产品","parent":"527"},
|
||||||
|
"819":{"name":"饲料","parent":"531"},
|
||||||
|
"821":{"name":"保安服务","parent":"525"},
|
||||||
|
"822":{"name":"干细胞技术","parent":"522"},
|
||||||
|
"824":{"name":"农药化肥","parent":"527"},
|
||||||
|
"825":{"name":"卫生洁具","parent":"518"},
|
||||||
|
"826":{"name":"体育器材、场馆","parent":"518"},
|
||||||
|
"827":{"name":"饲料加工","parent":"531"},
|
||||||
|
"828":{"name":"测绘服务","parent":"529"},
|
||||||
|
"830":{"name":"金属船舶制造","parent":"519"},
|
||||||
|
"831":{"name":"基因工程","parent":"522"},
|
||||||
|
"832":{"name":"花卉服务","parent":"536"},
|
||||||
|
"833":{"name":"农业种植","parent":"531"},
|
||||||
|
"834":{"name":"皮革制品","parent":"518"},
|
||||||
|
"835":{"name":"地理信息加工服务","parent":"529"},
|
||||||
|
"836":{"name":"机器人","parent":"519"},
|
||||||
|
"837":{"name":"礼品","parent":"518"},
|
||||||
|
"838":{"name":"理发及美容服务","parent":"525"},
|
||||||
|
"839":{"name":"其他清洁服务","parent":"525"},
|
||||||
|
"840":{"name":"硅胶材料","parent":"527"},
|
||||||
|
"841":{"name":"茶叶销售","parent":"518"},
|
||||||
|
"842":{"name":"彩票活动","parent":"529"},
|
||||||
|
"843":{"name":"化妆培训","parent":"516"},
|
||||||
|
"844":{"name":"鞋业","parent":"518"},
|
||||||
|
"845":{"name":"酒店用品","parent":"518"},
|
||||||
|
"846":{"name":"复合材料","parent":"527"},
|
||||||
|
"847":{"name":"房地产工程建设","parent":"548"},
|
||||||
|
"848":{"name":"知识产权服务","parent":"559"},
|
||||||
|
"849":{"name":"新型建材","parent":"627"},
|
||||||
|
"850":{"name":"企业投资咨询","parent":"567"},
|
||||||
|
"851":{"name":"含乳饮料和植物蛋白饮料制造","parent":"594"},
|
||||||
|
"852":{"name":"汽车检测设备","parent":"629"},
|
||||||
|
"853":{"name":"手机通讯器材","parent":"417"},
|
||||||
|
"854":{"name":"环保材料","parent":"672"},
|
||||||
|
"855":{"name":"交通设施","parent":"554"},
|
||||||
|
"856":{"name":"电子器件","parent":"419"},
|
||||||
|
"857":{"name":"啤酒","parent":"594"},
|
||||||
|
"858":{"name":"生态旅游","parent":"657"},
|
||||||
|
"859":{"name":"自动化设备","parent":"626"},
|
||||||
|
"860":{"name":"软件开发","parent":"414"},
|
||||||
|
"861":{"name":"葡萄酒销售","parent":"594"},
|
||||||
|
"862":{"name":"钢材","parent":"633"},
|
||||||
|
"863":{"name":"餐饮培训","parent":"656"},
|
||||||
|
"864":{"name":"速冻食品","parent":"593"},
|
||||||
|
"865":{"name":"空气环保","parent":"672"},
|
||||||
|
"866":{"name":"互联网房地产经纪服务","parent":"550"},
|
||||||
|
"867":{"name":"食品添加剂","parent":"593"},
|
||||||
|
"868":{"name":"演艺传播","parent":"585"},
|
||||||
|
"869":{"name":"信用卡","parent":"537"},
|
||||||
|
"870":{"name":"报纸期刊广告","parent":"579"},
|
||||||
|
"871":{"name":"摄影","parent":"525"},
|
||||||
|
"872":{"name":"手机软件","parent":"414"},
|
||||||
|
"873":{"name":"地坪建材","parent":"627"},
|
||||||
|
"874":{"name":"企业管理咨询","parent":"567"},
|
||||||
|
"875":{"name":"幼儿教育","parent":"570"},
|
||||||
|
"876":{"name":"系统集成","parent":"416"},
|
||||||
|
"877":{"name":"皮革服饰","parent":"597"},
|
||||||
|
"878":{"name":"保健食品","parent":"593"},
|
||||||
|
"879":{"name":"叉车","parent":"620"},
|
||||||
|
"880":{"name":"厨卫电器","parent":"601"},
|
||||||
|
"882":{"name":"地暖设备","parent":"627"},
|
||||||
|
"883":{"name":"钢结构制造","parent":"548"},
|
||||||
|
"884":{"name":"投影机","parent":"606"},
|
||||||
|
"885":{"name":"啤酒销售","parent":"594"},
|
||||||
|
"886":{"name":"度假村旅游","parent":"657"},
|
||||||
|
"887":{"name":"电力元件设备","parent":"626"},
|
||||||
|
"888":{"name":"管理软件","parent":"414"},
|
||||||
|
"889":{"name":"轴承","parent":"628"},
|
||||||
|
"890":{"name":"餐饮设备","parent":"656"},
|
||||||
|
"891":{"name":"肉制品及副产品加工","parent":"593"},
|
||||||
|
"892":{"name":"艺术收藏品投资交易","parent":"584"},
|
||||||
|
"893":{"name":"净水器","parent":"601"},
|
||||||
|
"894":{"name":"进口食品","parent":"593"},
|
||||||
|
"895":{"name":"娱乐文化传播","parent":"585"},
|
||||||
|
"896":{"name":"文化传播","parent":"585"},
|
||||||
|
"897":{"name":"商旅传媒","parent":"580"},
|
||||||
|
"898":{"name":"广告设计制作","parent":"579"},
|
||||||
|
"899":{"name":"金属丝绳及其制品制造","parent":"627"},
|
||||||
|
"900":{"name":"建筑涂料","parent":"627"},
|
||||||
|
"901":{"name":"抵押贷款","parent":"543"},
|
||||||
|
"902":{"name":"早教","parent":"570"},
|
||||||
|
"903":{"name":"电影放映","parent":"583"},
|
||||||
|
"904":{"name":"内衣服饰","parent":"597"},
|
||||||
|
"905":{"name":"无线网络通信","parent":"418"},
|
||||||
|
"906":{"name":"记忆卡","parent":"415"},
|
||||||
|
"907":{"name":"女装服饰","parent":"597"},
|
||||||
|
"908":{"name":"建筑机械","parent":"620"},
|
||||||
|
"909":{"name":"制冷电器","parent":"601"},
|
||||||
|
"910":{"name":"通信设备","parent":"417"},
|
||||||
|
"911":{"name":"空调设备","parent":"601"},
|
||||||
|
"912":{"name":"建筑装饰","parent":"553"},
|
||||||
|
"913":{"name":"办公设备","parent":"603"},
|
||||||
|
"916":{"name":"数据处理软件","parent":"414"},
|
||||||
|
"917":{"name":"葡萄酒贸易","parent":"594"},
|
||||||
|
"918":{"name":"通讯器材","parent":"417"},
|
||||||
|
"919":{"name":"铜业","parent":"633"},
|
||||||
|
"920":{"name":"食堂","parent":"656"},
|
||||||
|
"921":{"name":"糖果零食","parent":"593"},
|
||||||
|
"922":{"name":"文化艺术传播","parent":"584"},
|
||||||
|
"923":{"name":"太阳能电器","parent":"601"},
|
||||||
|
"924":{"name":"药品零售","parent":"645"},
|
||||||
|
"925":{"name":"果蔬食品","parent":"593"},
|
||||||
|
"926":{"name":"文化活动策划","parent":"585"},
|
||||||
|
"928":{"name":"汽车广告","parent":"657"},
|
||||||
|
"929":{"name":"条码设备","parent":"630"},
|
||||||
|
"930":{"name":"建筑石材","parent":"627"},
|
||||||
|
"931":{"name":"贵金属","parent":"545"},
|
||||||
|
"932":{"name":"体育","parent":"660"},
|
||||||
|
"933":{"name":"金融信息服务","parent":"414"},
|
||||||
|
"934":{"name":"玻璃建材","parent":"627"},
|
||||||
|
"935":{"name":"家教","parent":"569"},
|
||||||
|
"936":{"name":"歌舞厅娱乐活动","parent":"586"},
|
||||||
|
"937":{"name":"计算机服务器","parent":"415"},
|
||||||
|
"938":{"name":"管道","parent":"627"},
|
||||||
|
"939":{"name":"婴幼儿服饰","parent":"597"},
|
||||||
|
"940":{"name":"热水器","parent":"601"},
|
||||||
|
"941":{"name":"计算机及零部件制造","parent":"415"},
|
||||||
|
"942":{"name":"钢铁贸易","parent":"633"},
|
||||||
|
"944":{"name":"包装材料","parent":"632"},
|
||||||
|
"945":{"name":"计算机办公设备","parent":"603"},
|
||||||
|
"946":{"name":"白酒","parent":"594"},
|
||||||
|
"948":{"name":"发动机","parent":"620"},
|
||||||
|
"949":{"name":"快餐服务","parent":"656"},
|
||||||
|
"950":{"name":"酒类销售","parent":"594"},
|
||||||
|
"951":{"name":"电子产品、机电设备","parent":"626"},
|
||||||
|
"952":{"name":"激光设备","parent":"626"},
|
||||||
|
"953":{"name":"餐饮策划","parent":"656"},
|
||||||
|
"954":{"name":"饮料、食品","parent":"594"},
|
||||||
|
"955":{"name":"文化娱乐经纪","parent":"585"},
|
||||||
|
"956":{"name":"天然气","parent":"665"},
|
||||||
|
"957":{"name":"农副食品","parent":"593"},
|
||||||
|
"958":{"name":"艺术表演","parent":"585"},
|
||||||
|
"959":{"name":"石膏、水泥制品及类似制品制造","parent":"627"},
|
||||||
|
"960":{"name":"橱柜","parent":"602"},
|
||||||
|
"961":{"name":"管理培训","parent":"577"},
|
||||||
|
"962":{"name":"男装服饰","parent":"597"},
|
||||||
|
"963":{"name":"化肥制造","parent":"675"},
|
||||||
|
"964":{"name":"童装服饰","parent":"597"},
|
||||||
|
"965":{"name":"电源电池","parent":"626"},
|
||||||
|
"966":{"name":"家电维修","parent":"664"},
|
||||||
|
"967":{"name":"光电子器件","parent":"419"},
|
||||||
|
"968":{"name":"旅行社服务","parent":"657"},
|
||||||
|
"969":{"name":"电线、电缆制造","parent":"626"},
|
||||||
|
"970":{"name":"软件开发、信息系统集成","parent":"419"},
|
||||||
|
"971":{"name":"白酒制造","parent":"594"},
|
||||||
|
"973":{"name":"甜品服务","parent":"656"},
|
||||||
|
"974":{"name":"糕点、面包制造","parent":"593"},
|
||||||
|
"975":{"name":"木工机械","parent":"620"},
|
||||||
|
"976":{"name":"酒吧服务","parent":"656"},
|
||||||
|
"977":{"name":"火腿肠","parent":"593"},
|
||||||
|
"978":{"name":"广告策划推广","parent":"579"},
|
||||||
|
"979":{"name":"新能源产品和生产装备制造","parent":"667"},
|
||||||
|
"980":{"name":"调味品","parent":"593"},
|
||||||
|
"981":{"name":"礼仪表演","parent":"585"},
|
||||||
|
"982":{"name":"劳务派遣","parent":"560"},
|
||||||
|
"983":{"name":"建材零售","parent":"627"},
|
||||||
|
"984":{"name":"商品交易中心","parent":"545"},
|
||||||
|
"985":{"name":"体育推广","parent":"585"},
|
||||||
|
"986":{"name":"茶饮料及其他饮料制造","parent":"594"},
|
||||||
|
"987":{"name":"金属建材","parent":"627"},
|
||||||
|
"988":{"name":"职业技能培训","parent":"571"},
|
||||||
|
"989":{"name":"网吧活动","parent":"586"},
|
||||||
|
"990":{"name":"洗衣服务","parent":"658"},
|
||||||
|
"991":{"name":"管道工程","parent":"554"},
|
||||||
|
"992":{"name":"通信工程","parent":"417"},
|
||||||
|
"993":{"name":"电子元器件","parent":"626"},
|
||||||
|
"994":{"name":"电子设备","parent":"419"},
|
||||||
|
"995":{"name":"茶馆服务","parent":"656"},
|
||||||
|
"996":{"name":"旅游开发","parent":"657"},
|
||||||
|
"997":{"name":"视频通讯","parent":"417"},
|
||||||
|
"998":{"name":"白酒销售","parent":"594"},
|
||||||
|
"1000":{"name":"咖啡馆服务","parent":"656"},
|
||||||
|
"1001":{"name":"食品零售","parent":"593"},
|
||||||
|
"1002":{"name":"健康疗养旅游","parent":"655"},
|
||||||
|
"1003":{"name":"粮油食品","parent":"593"},
|
||||||
|
"1004":{"name":"儿童教育影视","parent":"583"},
|
||||||
|
"1005":{"name":"新能源发电","parent":"667"},
|
||||||
|
"1006":{"name":"旅游策划","parent":"657"},
|
||||||
|
"1007":{"name":"绘画","parent":"575"},
|
||||||
|
"1008":{"name":"方便面及其他方便食品","parent":"593"},
|
||||||
|
"1009":{"name":"房地产经纪","parent":"550"},
|
||||||
|
"1010":{"name":"母婴家政","parent":"661"},
|
||||||
|
"1011":{"name":"居家养老健康服务","parent":"661"},
|
||||||
|
"1012":{"name":"文化艺术投资","parent":"545"},
|
||||||
|
"1013":{"name":"运动健身","parent":"660"},
|
||||||
|
"1014":{"name":"瓶(罐)装饮用水制造","parent":"594"},
|
||||||
|
"1015":{"name":"金属门窗","parent":"627"},
|
||||||
|
"1016":{"name":"机动车检测","parent":"563"},
|
||||||
|
"1017":{"name":"货物运输","parent":"634"},
|
||||||
|
"1018":{"name":"服饰专卖","parent":"690"},
|
||||||
|
"1019":{"name":"酒店服装","parent":"597"},
|
||||||
|
"1020":{"name":"通讯软件","parent":"417"},
|
||||||
|
"1021":{"name":"消防工程","parent":"554"},
|
||||||
|
"1022":{"name":"嵌入式电子系统","parent":"419"},
|
||||||
|
"1023":{"name":"航空票务","parent":"636"},
|
||||||
|
"1024":{"name":"电气设备","parent":"626"},
|
||||||
|
"1025":{"name":"酒业贸易","parent":"594"},
|
||||||
|
"1027":{"name":"其他饮料及冷饮服务","parent":"656"},
|
||||||
|
"1028":{"name":"乳制品","parent":"593"},
|
||||||
|
"1029":{"name":"新闻期刊出版","parent":"588"},
|
||||||
|
"1030":{"name":"水污染治理","parent":"672"},
|
||||||
|
"1031":{"name":"谷物食品","parent":"593"},
|
||||||
|
"1032":{"name":"数字动漫设计制造服务","parent":"590"},
|
||||||
|
"1033":{"name":"医院","parent":"646"},
|
||||||
|
"1034":{"name":"旅游广告","parent":"657"},
|
||||||
|
"1035":{"name":"办公家具","parent":"602"},
|
||||||
|
"1036":{"name":"房地产营销策划","parent":"550"},
|
||||||
|
"1037":{"name":"保洁家政","parent":"661"},
|
||||||
|
"1038":{"name":"水泥制造","parent":"627"},
|
||||||
|
"1039":{"name":"市场研究咨询","parent":"567"},
|
||||||
|
"1040":{"name":"驾校","parent":"571"},
|
||||||
|
"1041":{"name":"正餐服务","parent":"656"},
|
||||||
|
"1043":{"name":"机动车燃油","parent":"665"},
|
||||||
|
"1044":{"name":"食品","parent":"593"},
|
||||||
|
"1045":{"name":"新能源汽车","parent":"629"},
|
||||||
|
"1046":{"name":"手机无线网络推广","parent":"417"},
|
||||||
|
"1047":{"name":"环保设备","parent":"672"},
|
||||||
|
"1048":{"name":"通讯工程","parent":"418"},
|
||||||
|
"1049":{"name":"半导体集成电路","parent":"419"},
|
||||||
|
"1050":{"name":"航空服务","parent":"636"},
|
||||||
|
"1051":{"name":"电机设备","parent":"626"},
|
||||||
|
"1052":{"name":"档案软件","parent":"414"},
|
||||||
|
"1053":{"name":"冷链物流服务","parent":"634"},
|
||||||
|
"1054":{"name":"小吃服务","parent":"656"},
|
||||||
|
"1055":{"name":"水产品加工","parent":"593"},
|
||||||
|
"1056":{"name":"图书出版","parent":"588"},
|
||||||
|
"1057":{"name":"固体废物治理","parent":"672"},
|
||||||
|
"1059":{"name":"坚果食品","parent":"593"},
|
||||||
|
"1060":{"name":"广告传媒","parent":"579"},
|
||||||
|
"1061":{"name":"电梯","parent":"622"},
|
||||||
|
"1062":{"name":"社区医疗与卫生院","parent":"646"},
|
||||||
|
"1063":{"name":"广告、印刷包装","parent":"630"},
|
||||||
|
"1064":{"name":"婚纱礼服","parent":"662"},
|
||||||
|
"1065":{"name":"地毯","parent":"602"},
|
||||||
|
"1066":{"name":"互联网物业","parent":"551"},
|
||||||
|
"1067":{"name":"跨境电商","parent":"3"},
|
||||||
|
"1068":{"name":"信息安全、系统集成","parent":"9"},
|
||||||
|
"1069":{"name":"专用汽车制造","parent":"750"},
|
||||||
|
"1070":{"name":"商品贸易","parent":"3"},
|
||||||
|
"1071":{"name":"墙壁装饰材料","parent":"746"},
|
||||||
|
"1072":{"name":"窗帘装饰材料","parent":"746"},
|
||||||
|
"1073":{"name":"电子商务、本地生活服务","parent":"3"},
|
||||||
|
"1075":{"name":"白酒电子商务","parent":"3"},
|
||||||
|
"1076":{"name":"商品贸易、电子商务","parent":"3"},
|
||||||
|
"1077":{"name":"木质装饰材料","parent":"746"},
|
||||||
|
"1078":{"name":"电子商务、汽车电商交易平台","parent":"3"},
|
||||||
|
"1079":{"name":"汽车轮胎","parent":"751"},
|
||||||
|
"1080":{"name":"气体压缩机械制造","parent":"732"},
|
||||||
|
"1081":{"name":"家装家具电子商务","parent":"3"},
|
||||||
|
"1082":{"name":"化妆品电子商务","parent":"3"},
|
||||||
|
"1083":{"name":"汽车销售","parent":"749"},
|
||||||
|
"1084":{"name":"新闻资讯网站","parent":"510"},
|
||||||
|
"1085":{"name":"母婴电商","parent":"3"},
|
||||||
|
"1086":{"name":"电商商务、收藏品交易","parent":"3"},
|
||||||
|
"1088":{"name":"电子商务、数码产品","parent":"3"},
|
||||||
|
"1089":{"name":"二手车交易","parent":"749"},
|
||||||
|
"1090":{"name":"游戏制作服务","parent":"5"},
|
||||||
|
"1091":{"name":"母婴服务","parent":"510"},
|
||||||
|
"1092":{"name":"家具电子商务","parent":"3"},
|
||||||
|
"1093":{"name":"汽车配件电子商务","parent":"3"},
|
||||||
|
"1094":{"name":"输配电设备","parent":"777"},
|
||||||
|
"1095":{"name":"矿山设备","parent":"727"},
|
||||||
|
"1096":{"name":"机床机械","parent":"726"},
|
||||||
|
"1097":{"name":"农产品电商","parent":"3"},
|
||||||
|
"1098":{"name":"陶瓷装饰材料","parent":"746"},
|
||||||
|
"1099":{"name":"车载联网设备","parent":"487"},
|
||||||
|
"1100":{"name":"汽车销售电子商务","parent":"3"},
|
||||||
|
"1101":{"name":"石油设备","parent":"730"},
|
||||||
|
"1102":{"name":"智能家居","parent":"487"},
|
||||||
|
"1103":{"name":"散热器","parent":"751"},
|
||||||
|
"1104":{"name":"电力工程","parent":"775"},
|
||||||
|
"1105":{"name":"生鲜电商","parent":"3"},
|
||||||
|
"1106":{"name":"互联网数据服务","parent":"490"},
|
||||||
|
"1107":{"name":"房车、商务车销售","parent":"749"},
|
||||||
|
"1108":{"name":"茶叶电子商务","parent":"3"},
|
||||||
|
"1109":{"name":"酒类电子商务","parent":"3"},
|
||||||
|
"1110":{"name":"阀门","parent":"730"},
|
||||||
|
"1111":{"name":"食品电商","parent":"3"},
|
||||||
|
"1112":{"name":"儿童摄影","parent":"871"},
|
||||||
|
"1113":{"name":"广告摄影","parent":"871"},
|
||||||
|
"1114":{"name":"婚纱摄影","parent":"871"},
|
||||||
|
"1115":{"name":"模具制造","parent":"620"},
|
||||||
|
"1116":{"name":"汽车模具","parent":"629"},
|
||||||
|
"1117":{"name":"认证咨询","parent":"567"},
|
||||||
|
"1118":{"name":"数字视觉制作服务","parent":"590"},
|
||||||
|
"1119":{"name":"牙科及医疗器械","parent":"646"},
|
||||||
|
"1120":{"name":"猎头招聘","parent":"560"},
|
||||||
|
"1121":{"name":"家居","parent":"518"},
|
||||||
|
"1122":{"name":"收藏品","parent":"518"},
|
||||||
|
"1123":{"name":"首饰","parent":"518"},
|
||||||
|
"1124":{"name":"工艺品","parent":"518"},
|
||||||
|
"1125":{"name":"财务","parent":"515"},
|
||||||
|
"1126":{"name":"税务","parent":"515"},
|
||||||
|
"1127":{"name":"分类信息","parent":"2"},
|
||||||
|
"1128":{"name":"宠物","parent":"0"},
|
||||||
|
"1129":{"name":"快消品","parent":"518"},
|
||||||
|
"1130":{"name":"人工智能","parent":"2"},
|
||||||
|
"1131":{"name":"农/林/牧/渔","parent":"0"}
|
||||||
|
}


def get_names(id):
    id = str(id)
    nms = []
    d = TBL.get(id)
    if not d: return []
    nms.append(d["name"])
    p = get_names(d["parent"])
    if p: nms.extend(p)
    return nms


if __name__ == "__main__":
    print(get_names("1119"))
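For readers skimming this diff: get_names above resolves an industry id to the chain of category names from the leaf up to the root by following the parent ids stored in TBL. A minimal sketch of the same walk on a toy table (the toy table, its ids, and the toy_get_names name below are illustrative stand-ins, not part of the commit):

# Hypothetical toy table mirroring the {"name": ..., "parent": ...} layout of TBL above.
TOY_TBL = {
    "1": {"name": "manufacturing", "parent": "0"},            # root-level category
    "2": {"name": "machinery", "parent": "1"},                # child of "1"
    "3": {"name": "construction machinery", "parent": "2"},   # leaf
}

def toy_get_names(id, tbl=TOY_TBL):
    # Walk upward via the "parent" pointer, collecting names leaf-first.
    d = tbl.get(str(id))
    if not d:
        return []
    return [d["name"]] + toy_get_names(d["parent"], tbl)

print(toy_get_names("3"))  # ['construction machinery', 'machinery', 'manufacturing']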
762
deepdoc/parser/resume/entities/regions.py
Normal file
@ -0,0 +1,762 @@
|
|||||||
|
TBL = {
|
||||||
|
"2":{"name":"北京","parent":"1"},
|
||||||
|
"3":{"name":"天津","parent":"1"},
|
||||||
|
"4":{"name":"河北","parent":"1"},
|
||||||
|
"5":{"name":"山西","parent":"1"},
|
||||||
|
"6":{"name":"内蒙古","parent":"1"},
|
||||||
|
"7":{"name":"辽宁","parent":"1"},
|
||||||
|
"8":{"name":"吉林","parent":"1"},
|
||||||
|
"9":{"name":"黑龙江","parent":"1"},
|
||||||
|
"10":{"name":"上海","parent":"1"},
|
||||||
|
"11":{"name":"江苏","parent":"1"},
|
||||||
|
"12":{"name":"浙江","parent":"1"},
|
||||||
|
"13":{"name":"安徽","parent":"1"},
|
||||||
|
"14":{"name":"福建","parent":"1"},
|
||||||
|
"15":{"name":"江西","parent":"1"},
|
||||||
|
"16":{"name":"山东","parent":"1"},
|
||||||
|
"17":{"name":"河南","parent":"1"},
|
||||||
|
"18":{"name":"湖北","parent":"1"},
|
||||||
|
"19":{"name":"湖南","parent":"1"},
|
||||||
|
"20":{"name":"广东","parent":"1"},
|
||||||
|
"21":{"name":"广西","parent":"1"},
|
||||||
|
"22":{"name":"海南","parent":"1"},
|
||||||
|
"23":{"name":"重庆","parent":"1"},
|
||||||
|
"24":{"name":"四川","parent":"1"},
|
||||||
|
"25":{"name":"贵州","parent":"1"},
|
||||||
|
"26":{"name":"云南","parent":"1"},
|
||||||
|
"27":{"name":"西藏","parent":"1"},
|
||||||
|
"28":{"name":"陕西","parent":"1"},
|
||||||
|
"29":{"name":"甘肃","parent":"1"},
|
||||||
|
"30":{"name":"青海","parent":"1"},
|
||||||
|
"31":{"name":"宁夏","parent":"1"},
|
||||||
|
"32":{"name":"新疆","parent":"1"},
|
||||||
|
"33":{"name":"北京市","parent":"2"},
|
||||||
|
"34":{"name":"天津市","parent":"3"},
|
||||||
|
"35":{"name":"石家庄市","parent":"4"},
|
||||||
|
"36":{"name":"唐山市","parent":"4"},
|
||||||
|
"37":{"name":"秦皇岛市","parent":"4"},
|
||||||
|
"38":{"name":"邯郸市","parent":"4"},
|
||||||
|
"39":{"name":"邢台市","parent":"4"},
|
||||||
|
"40":{"name":"保定市","parent":"4"},
|
||||||
|
"41":{"name":"张家口市","parent":"4"},
|
||||||
|
"42":{"name":"承德市","parent":"4"},
|
||||||
|
"43":{"name":"沧州市","parent":"4"},
|
||||||
|
"44":{"name":"廊坊市","parent":"4"},
|
||||||
|
"45":{"name":"衡水市","parent":"4"},
|
||||||
|
"46":{"name":"太原市","parent":"5"},
|
||||||
|
"47":{"name":"大同市","parent":"5"},
|
||||||
|
"48":{"name":"阳泉市","parent":"5"},
|
||||||
|
"49":{"name":"长治市","parent":"5"},
|
||||||
|
"50":{"name":"晋城市","parent":"5"},
|
||||||
|
"51":{"name":"朔州市","parent":"5"},
|
||||||
|
"52":{"name":"晋中市","parent":"5"},
|
||||||
|
"53":{"name":"运城市","parent":"5"},
|
||||||
|
"54":{"name":"忻州市","parent":"5"},
|
||||||
|
"55":{"name":"临汾市","parent":"5"},
|
||||||
|
"56":{"name":"吕梁市","parent":"5"},
|
||||||
|
"57":{"name":"呼和浩特市","parent":"6"},
|
||||||
|
"58":{"name":"包头市","parent":"6"},
|
||||||
|
"59":{"name":"乌海市","parent":"6"},
|
||||||
|
"60":{"name":"赤峰市","parent":"6"},
|
||||||
|
"61":{"name":"通辽市","parent":"6"},
|
||||||
|
"62":{"name":"鄂尔多斯市","parent":"6"},
|
||||||
|
"63":{"name":"呼伦贝尔市","parent":"6"},
|
||||||
|
"64":{"name":"巴彦淖尔市","parent":"6"},
|
||||||
|
"65":{"name":"乌兰察布市","parent":"6"},
|
||||||
|
"66":{"name":"兴安盟","parent":"6"},
|
||||||
|
"67":{"name":"锡林郭勒盟","parent":"6"},
|
||||||
|
"68":{"name":"阿拉善盟","parent":"6"},
|
||||||
|
"69":{"name":"沈阳市","parent":"7"},
|
||||||
|
"70":{"name":"大连市","parent":"7"},
|
||||||
|
"71":{"name":"鞍山市","parent":"7"},
|
||||||
|
"72":{"name":"抚顺市","parent":"7"},
|
||||||
|
"73":{"name":"本溪市","parent":"7"},
|
||||||
|
"74":{"name":"丹东市","parent":"7"},
|
||||||
|
"75":{"name":"锦州市","parent":"7"},
|
||||||
|
"76":{"name":"营口市","parent":"7"},
|
||||||
|
"77":{"name":"阜新市","parent":"7"},
|
||||||
|
"78":{"name":"辽阳市","parent":"7"},
|
||||||
|
"79":{"name":"盘锦市","parent":"7"},
|
||||||
|
"80":{"name":"铁岭市","parent":"7"},
|
||||||
|
"81":{"name":"朝阳市","parent":"7"},
|
||||||
|
"82":{"name":"葫芦岛市","parent":"7"},
|
||||||
|
"83":{"name":"长春市","parent":"8"},
|
||||||
|
"84":{"name":"吉林市","parent":"8"},
|
||||||
|
"85":{"name":"四平市","parent":"8"},
|
||||||
|
"86":{"name":"辽源市","parent":"8"},
|
||||||
|
"87":{"name":"通化市","parent":"8"},
|
||||||
|
"88":{"name":"白山市","parent":"8"},
|
||||||
|
"89":{"name":"松原市","parent":"8"},
|
||||||
|
"90":{"name":"白城市","parent":"8"},
|
||||||
|
"91":{"name":"延边朝鲜族自治州","parent":"8"},
|
||||||
|
"92":{"name":"哈尔滨市","parent":"9"},
|
||||||
|
"93":{"name":"齐齐哈尔市","parent":"9"},
|
||||||
|
"94":{"name":"鸡西市","parent":"9"},
|
||||||
|
"95":{"name":"鹤岗市","parent":"9"},
|
||||||
|
"96":{"name":"双鸭山市","parent":"9"},
|
||||||
|
"97":{"name":"大庆市","parent":"9"},
|
||||||
|
"98":{"name":"伊春市","parent":"9"},
|
||||||
|
"99":{"name":"佳木斯市","parent":"9"},
|
||||||
|
"100":{"name":"七台河市","parent":"9"},
|
||||||
|
"101":{"name":"牡丹江市","parent":"9"},
|
||||||
|
"102":{"name":"黑河市","parent":"9"},
|
||||||
|
"103":{"name":"绥化市","parent":"9"},
|
||||||
|
"104":{"name":"大兴安岭地区","parent":"9"},
|
||||||
|
"105":{"name":"上海市","parent":"10"},
|
||||||
|
"106":{"name":"南京市","parent":"11"},
|
||||||
|
"107":{"name":"无锡市","parent":"11"},
|
||||||
|
"108":{"name":"徐州市","parent":"11"},
|
||||||
|
"109":{"name":"常州市","parent":"11"},
|
||||||
|
"110":{"name":"苏州市","parent":"11"},
|
||||||
|
"111":{"name":"南通市","parent":"11"},
|
||||||
|
"112":{"name":"连云港市","parent":"11"},
|
||||||
|
"113":{"name":"淮安市","parent":"11"},
|
||||||
|
"114":{"name":"盐城市","parent":"11"},
|
||||||
|
"115":{"name":"扬州市","parent":"11"},
|
||||||
|
"116":{"name":"镇江市","parent":"11"},
|
||||||
|
"117":{"name":"泰州市","parent":"11"},
|
||||||
|
"118":{"name":"宿迁市","parent":"11"},
|
||||||
|
"119":{"name":"杭州市","parent":"12"},
|
||||||
|
"120":{"name":"宁波市","parent":"12"},
|
||||||
|
"121":{"name":"温州市","parent":"12"},
|
||||||
|
"122":{"name":"嘉兴市","parent":"12"},
|
||||||
|
"123":{"name":"湖州市","parent":"12"},
|
||||||
|
"124":{"name":"绍兴市","parent":"12"},
|
||||||
|
"125":{"name":"金华市","parent":"12"},
|
||||||
|
"126":{"name":"衢州市","parent":"12"},
|
||||||
|
"127":{"name":"舟山市","parent":"12"},
|
||||||
|
"128":{"name":"台州市","parent":"12"},
|
||||||
|
"129":{"name":"丽水市","parent":"12"},
|
||||||
|
"130":{"name":"合肥市","parent":"13"},
|
||||||
|
"131":{"name":"芜湖市","parent":"13"},
|
||||||
|
"132":{"name":"蚌埠市","parent":"13"},
|
||||||
|
"133":{"name":"淮南市","parent":"13"},
|
||||||
|
"134":{"name":"马鞍山市","parent":"13"},
|
||||||
|
"135":{"name":"淮北市","parent":"13"},
|
||||||
|
"136":{"name":"铜陵市","parent":"13"},
|
||||||
|
"137":{"name":"安庆市","parent":"13"},
|
||||||
|
"138":{"name":"黄山市","parent":"13"},
|
||||||
|
"139":{"name":"滁州市","parent":"13"},
|
||||||
|
"140":{"name":"阜阳市","parent":"13"},
|
||||||
|
"141":{"name":"宿州市","parent":"13"},
|
||||||
|
"143":{"name":"六安市","parent":"13"},
|
||||||
|
"144":{"name":"亳州市","parent":"13"},
|
||||||
|
"145":{"name":"池州市","parent":"13"},
|
||||||
|
"146":{"name":"宣城市","parent":"13"},
|
||||||
|
"147":{"name":"福州市","parent":"14"},
|
||||||
|
"148":{"name":"厦门市","parent":"14"},
|
||||||
|
"149":{"name":"莆田市","parent":"14"},
|
||||||
|
"150":{"name":"三明市","parent":"14"},
|
||||||
|
"151":{"name":"泉州市","parent":"14"},
|
||||||
|
"152":{"name":"漳州市","parent":"14"},
|
||||||
|
"153":{"name":"南平市","parent":"14"},
|
||||||
|
"154":{"name":"龙岩市","parent":"14"},
|
||||||
|
"155":{"name":"宁德市","parent":"14"},
|
||||||
|
"156":{"name":"南昌市","parent":"15"},
|
||||||
|
"157":{"name":"景德镇市","parent":"15"},
|
||||||
|
"158":{"name":"萍乡市","parent":"15"},
|
||||||
|
"159":{"name":"九江市","parent":"15"},
|
||||||
|
"160":{"name":"新余市","parent":"15"},
|
||||||
|
"161":{"name":"鹰潭市","parent":"15"},
|
||||||
|
"162":{"name":"赣州市","parent":"15"},
|
||||||
|
"163":{"name":"吉安市","parent":"15"},
|
||||||
|
"164":{"name":"宜春市","parent":"15"},
|
||||||
|
"165":{"name":"抚州市","parent":"15"},
|
||||||
|
"166":{"name":"上饶市","parent":"15"},
|
||||||
|
"167":{"name":"济南市","parent":"16"},
|
||||||
|
"168":{"name":"青岛市","parent":"16"},
|
||||||
|
"169":{"name":"淄博市","parent":"16"},
|
||||||
|
"170":{"name":"枣庄市","parent":"16"},
|
||||||
|
"171":{"name":"东营市","parent":"16"},
|
||||||
|
"172":{"name":"烟台市","parent":"16"},
|
||||||
|
"173":{"name":"潍坊市","parent":"16"},
|
||||||
|
"174":{"name":"济宁市","parent":"16"},
|
||||||
|
"175":{"name":"泰安市","parent":"16"},
|
||||||
|
"176":{"name":"威海市","parent":"16"},
|
||||||
|
"177":{"name":"日照市","parent":"16"},
|
||||||
|
"179":{"name":"临沂市","parent":"16"},
|
||||||
|
"180":{"name":"德州市","parent":"16"},
|
||||||
|
"181":{"name":"聊城市","parent":"16"},
|
||||||
|
"182":{"name":"滨州市","parent":"16"},
|
||||||
|
"183":{"name":"菏泽市","parent":"16"},
|
||||||
|
"184":{"name":"郑州市","parent":"17"},
|
||||||
|
"185":{"name":"开封市","parent":"17"},
|
||||||
|
"186":{"name":"洛阳市","parent":"17"},
|
||||||
|
"187":{"name":"平顶山市","parent":"17"},
|
||||||
|
"188":{"name":"安阳市","parent":"17"},
|
||||||
|
"189":{"name":"鹤壁市","parent":"17"},
|
||||||
|
"190":{"name":"新乡市","parent":"17"},
|
||||||
|
"191":{"name":"焦作市","parent":"17"},
|
||||||
|
"192":{"name":"濮阳市","parent":"17"},
|
||||||
|
"193":{"name":"许昌市","parent":"17"},
|
||||||
|
"194":{"name":"漯河市","parent":"17"},
|
||||||
|
"195":{"name":"三门峡市","parent":"17"},
|
||||||
|
"196":{"name":"南阳市","parent":"17"},
|
||||||
|
"197":{"name":"商丘市","parent":"17"},
|
||||||
|
"198":{"name":"信阳市","parent":"17"},
|
||||||
|
"199":{"name":"周口市","parent":"17"},
|
||||||
|
"200":{"name":"驻马店市","parent":"17"},
|
||||||
|
"201":{"name":"武汉市","parent":"18"},
|
||||||
|
"202":{"name":"黄石市","parent":"18"},
|
||||||
|
"203":{"name":"十堰市","parent":"18"},
|
||||||
|
"204":{"name":"宜昌市","parent":"18"},
|
||||||
|
"205":{"name":"襄阳市","parent":"18"},
|
||||||
|
"206":{"name":"鄂州市","parent":"18"},
|
||||||
|
"207":{"name":"荆门市","parent":"18"},
|
||||||
|
"208":{"name":"孝感市","parent":"18"},
|
||||||
|
"209":{"name":"荆州市","parent":"18"},
|
||||||
|
"210":{"name":"黄冈市","parent":"18"},
|
||||||
|
"211":{"name":"咸宁市","parent":"18"},
|
||||||
|
"212":{"name":"随州市","parent":"18"},
|
||||||
|
"213":{"name":"恩施土家族苗族自治州","parent":"18"},
|
||||||
|
"215":{"name":"长沙市","parent":"19"},
|
||||||
|
"216":{"name":"株洲市","parent":"19"},
|
||||||
|
"217":{"name":"湘潭市","parent":"19"},
|
||||||
|
"218":{"name":"衡阳市","parent":"19"},
|
||||||
|
"219":{"name":"邵阳市","parent":"19"},
|
||||||
|
"220":{"name":"岳阳市","parent":"19"},
|
||||||
|
"221":{"name":"常德市","parent":"19"},
|
||||||
|
"222":{"name":"张家界市","parent":"19"},
|
||||||
|
"223":{"name":"益阳市","parent":"19"},
|
||||||
|
"224":{"name":"郴州市","parent":"19"},
|
||||||
|
"225":{"name":"永州市","parent":"19"},
|
||||||
|
"226":{"name":"怀化市","parent":"19"},
|
||||||
|
"227":{"name":"娄底市","parent":"19"},
|
||||||
|
"228":{"name":"湘西土家族苗族自治州","parent":"19"},
|
||||||
|
"229":{"name":"广州市","parent":"20"},
|
||||||
|
"230":{"name":"韶关市","parent":"20"},
|
||||||
|
"231":{"name":"深圳市","parent":"20"},
|
||||||
|
"232":{"name":"珠海市","parent":"20"},
|
||||||
|
"233":{"name":"汕头市","parent":"20"},
|
||||||
|
"234":{"name":"佛山市","parent":"20"},
|
||||||
|
"235":{"name":"江门市","parent":"20"},
|
||||||
|
"236":{"name":"湛江市","parent":"20"},
|
||||||
|
"237":{"name":"茂名市","parent":"20"},
|
||||||
|
"238":{"name":"肇庆市","parent":"20"},
|
||||||
|
"239":{"name":"惠州市","parent":"20"},
|
||||||
|
"240":{"name":"梅州市","parent":"20"},
|
||||||
|
"241":{"name":"汕尾市","parent":"20"},
|
||||||
|
"242":{"name":"河源市","parent":"20"},
|
||||||
|
"243":{"name":"阳江市","parent":"20"},
|
||||||
|
"244":{"name":"清远市","parent":"20"},
|
||||||
|
"245":{"name":"东莞市","parent":"20"},
|
||||||
|
"246":{"name":"中山市","parent":"20"},
|
||||||
|
"247":{"name":"潮州市","parent":"20"},
|
||||||
|
"248":{"name":"揭阳市","parent":"20"},
|
||||||
|
"249":{"name":"云浮市","parent":"20"},
|
||||||
|
"250":{"name":"南宁市","parent":"21"},
|
||||||
|
"251":{"name":"柳州市","parent":"21"},
|
||||||
|
"252":{"name":"桂林市","parent":"21"},
|
||||||
|
"253":{"name":"梧州市","parent":"21"},
|
||||||
|
"254":{"name":"北海市","parent":"21"},
|
||||||
|
"255":{"name":"防城港市","parent":"21"},
|
||||||
|
"256":{"name":"钦州市","parent":"21"},
|
||||||
|
"257":{"name":"贵港市","parent":"21"},
|
||||||
|
"258":{"name":"玉林市","parent":"21"},
|
||||||
|
"259":{"name":"百色市","parent":"21"},
|
||||||
|
"260":{"name":"贺州市","parent":"21"},
|
||||||
|
"261":{"name":"河池市","parent":"21"},
|
||||||
|
"262":{"name":"来宾市","parent":"21"},
|
||||||
|
"263":{"name":"崇左市","parent":"21"},
|
||||||
|
"264":{"name":"海口市","parent":"22"},
|
||||||
|
"265":{"name":"三亚市","parent":"22"},
|
||||||
|
"267":{"name":"重庆市","parent":"23"},
|
||||||
|
"268":{"name":"成都市","parent":"24"},
|
||||||
|
"269":{"name":"自贡市","parent":"24"},
|
||||||
|
"270":{"name":"攀枝花市","parent":"24"},
|
||||||
|
"271":{"name":"泸州市","parent":"24"},
|
||||||
|
"272":{"name":"德阳市","parent":"24"},
|
||||||
|
"273":{"name":"绵阳市","parent":"24"},
|
||||||
|
"274":{"name":"广元市","parent":"24"},
|
||||||
|
"275":{"name":"遂宁市","parent":"24"},
|
||||||
|
"276":{"name":"内江市","parent":"24"},
|
||||||
|
"277":{"name":"乐山市","parent":"24"},
|
||||||
|
"278":{"name":"南充市","parent":"24"},
|
||||||
|
"279":{"name":"眉山市","parent":"24"},
|
||||||
|
"280":{"name":"宜宾市","parent":"24"},
|
||||||
|
"281":{"name":"广安市","parent":"24"},
|
||||||
|
"282":{"name":"达州市","parent":"24"},
|
||||||
|
"283":{"name":"雅安市","parent":"24"},
|
||||||
|
"284":{"name":"巴中市","parent":"24"},
|
||||||
|
"285":{"name":"资阳市","parent":"24"},
|
||||||
|
"286":{"name":"阿坝藏族羌族自治州","parent":"24"},
|
||||||
|
"287":{"name":"甘孜藏族自治州","parent":"24"},
|
||||||
|
"288":{"name":"凉山彝族自治州","parent":"24"},
|
||||||
|
"289":{"name":"贵阳市","parent":"25"},
|
||||||
|
"290":{"name":"六盘水市","parent":"25"},
|
||||||
|
"291":{"name":"遵义市","parent":"25"},
|
||||||
|
"292":{"name":"安顺市","parent":"25"},
|
||||||
|
"293":{"name":"铜仁市","parent":"25"},
|
||||||
|
"294":{"name":"黔西南布依族苗族自治州","parent":"25"},
|
||||||
|
"295":{"name":"毕节市","parent":"25"},
|
||||||
|
"296":{"name":"黔东南苗族侗族自治州","parent":"25"},
|
||||||
|
"297":{"name":"黔南布依族苗族自治州","parent":"25"},
|
||||||
|
"298":{"name":"昆明市","parent":"26"},
|
||||||
|
"299":{"name":"曲靖市","parent":"26"},
|
||||||
|
"300":{"name":"玉溪市","parent":"26"},
|
||||||
|
"301":{"name":"保山市","parent":"26"},
|
||||||
|
"302":{"name":"昭通市","parent":"26"},
|
||||||
|
"303":{"name":"丽江市","parent":"26"},
|
||||||
|
"304":{"name":"普洱市","parent":"26"},
|
||||||
|
"305":{"name":"临沧市","parent":"26"},
|
||||||
|
"306":{"name":"楚雄彝族自治州","parent":"26"},
|
||||||
|
"307":{"name":"红河哈尼族彝族自治州","parent":"26"},
|
||||||
|
"308":{"name":"文山壮族苗族自治州","parent":"26"},
|
||||||
|
"309":{"name":"西双版纳傣族自治州","parent":"26"},
|
||||||
|
"310":{"name":"大理白族自治州","parent":"26"},
|
||||||
|
"311":{"name":"德宏傣族景颇族自治州","parent":"26"},
|
||||||
|
"312":{"name":"怒江傈僳族自治州","parent":"26"},
|
||||||
|
"313":{"name":"迪庆藏族自治州","parent":"26"},
|
||||||
|
"314":{"name":"拉萨市","parent":"27"},
|
||||||
|
"315":{"name":"昌都市","parent":"27"},
|
||||||
|
"316":{"name":"山南市","parent":"27"},
|
||||||
|
"317":{"name":"日喀则市","parent":"27"},
|
||||||
|
"318":{"name":"那曲市","parent":"27"},
|
||||||
|
"319":{"name":"阿里地区","parent":"27"},
|
||||||
|
"320":{"name":"林芝市","parent":"27"},
|
||||||
|
"321":{"name":"西安市","parent":"28"},
|
||||||
|
"322":{"name":"铜川市","parent":"28"},
|
||||||
|
"323":{"name":"宝鸡市","parent":"28"},
|
||||||
|
"324":{"name":"咸阳市","parent":"28"},
|
||||||
|
"325":{"name":"渭南市","parent":"28"},
|
||||||
|
"326":{"name":"延安市","parent":"28"},
|
||||||
|
"327":{"name":"汉中市","parent":"28"},
|
||||||
|
"328":{"name":"榆林市","parent":"28"},
|
||||||
|
"329":{"name":"安康市","parent":"28"},
|
||||||
|
"330":{"name":"商洛市","parent":"28"},
|
||||||
|
"331":{"name":"兰州市","parent":"29"},
|
||||||
|
"332":{"name":"嘉峪关市","parent":"29"},
|
||||||
|
"333":{"name":"金昌市","parent":"29"},
|
||||||
|
"334":{"name":"白银市","parent":"29"},
|
||||||
|
"335":{"name":"天水市","parent":"29"},
|
||||||
|
"336":{"name":"武威市","parent":"29"},
|
||||||
|
"337":{"name":"张掖市","parent":"29"},
|
||||||
|
"338":{"name":"平凉市","parent":"29"},
|
||||||
|
"339":{"name":"酒泉市","parent":"29"},
|
||||||
|
"340":{"name":"庆阳市","parent":"29"},
|
||||||
|
"341":{"name":"定西市","parent":"29"},
|
||||||
|
"342":{"name":"陇南市","parent":"29"},
|
||||||
|
"343":{"name":"临夏回族自治州","parent":"29"},
|
||||||
|
"344":{"name":"甘南藏族自治州","parent":"29"},
|
||||||
|
"345":{"name":"西宁市","parent":"30"},
|
||||||
|
"346":{"name":"海东市","parent":"30"},
|
||||||
|
"347":{"name":"海北藏族自治州","parent":"30"},
|
||||||
|
"348":{"name":"黄南藏族自治州","parent":"30"},
|
||||||
|
"349":{"name":"海南藏族自治州","parent":"30"},
|
||||||
|
"350":{"name":"果洛藏族自治州","parent":"30"},
|
||||||
|
"351":{"name":"玉树藏族自治州","parent":"30"},
|
||||||
|
"352":{"name":"海西蒙古族藏族自治州","parent":"30"},
|
||||||
|
"353":{"name":"银川市","parent":"31"},
|
||||||
|
"354":{"name":"石嘴山市","parent":"31"},
|
||||||
|
"355":{"name":"吴忠市","parent":"31"},
|
||||||
|
"356":{"name":"固原市","parent":"31"},
|
||||||
|
"357":{"name":"中卫市","parent":"31"},
|
||||||
|
"358":{"name":"乌鲁木齐市","parent":"32"},
|
||||||
|
"359":{"name":"克拉玛依市","parent":"32"},
|
||||||
|
"360":{"name":"吐鲁番市","parent":"32"},
|
||||||
|
"361":{"name":"哈密市","parent":"32"},
|
||||||
|
"362":{"name":"昌吉回族自治州","parent":"32"},
|
||||||
|
"363":{"name":"博尔塔拉蒙古自治州","parent":"32"},
|
||||||
|
"364":{"name":"巴音郭楞蒙古自治州","parent":"32"},
|
||||||
|
"365":{"name":"阿克苏地区","parent":"32"},
|
||||||
|
"366":{"name":"克孜勒苏柯尔克孜自治州","parent":"32"},
|
||||||
|
"367":{"name":"喀什地区","parent":"32"},
|
||||||
|
"368":{"name":"和田地区","parent":"32"},
|
||||||
|
"369":{"name":"伊犁哈萨克自治州","parent":"32"},
|
||||||
|
"370":{"name":"塔城地区","parent":"32"},
|
||||||
|
"371":{"name":"阿勒泰地区","parent":"32"},
|
||||||
|
"372":{"name":"新疆省直辖行政单位","parent":"32"},
|
||||||
|
"373":{"name":"可克达拉市","parent":"32"},
|
||||||
|
"374":{"name":"昆玉市","parent":"32"},
|
||||||
|
"375":{"name":"胡杨河市","parent":"32"},
|
||||||
|
"376":{"name":"双河市","parent":"32"},
|
||||||
|
"3560":{"name":"北票市","parent":"7"},
|
||||||
|
"3615":{"name":"高州市","parent":"20"},
|
||||||
|
"3651":{"name":"济源市","parent":"17"},
|
||||||
|
"3662":{"name":"胶南市","parent":"16"},
|
||||||
|
"3683":{"name":"老河口市","parent":"18"},
|
||||||
|
"3758":{"name":"沙河市","parent":"4"},
|
||||||
|
"3822":{"name":"宜城市","parent":"18"},
|
||||||
|
"3842":{"name":"枣阳市","parent":"18"},
|
||||||
|
"3850":{"name":"肇东市","parent":"9"},
|
||||||
|
"3905":{"name":"澳门","parent":"1"},
|
||||||
|
"3906":{"name":"澳门","parent":"3905"},
|
||||||
|
"3907":{"name":"香港","parent":"1"},
|
||||||
|
"3908":{"name":"香港","parent":"3907"},
|
||||||
|
"3947":{"name":"仙桃市","parent":"18"},
|
||||||
|
"3954":{"name":"台湾","parent":"1"},
|
||||||
|
"3955":{"name":"台湾","parent":"3954"},
|
||||||
|
"3956":{"name":"海外","parent":"1"},
|
||||||
|
"3957":{"name":"海外","parent":"3956"},
|
||||||
|
"3958":{"name":"美国","parent":"3956"},
|
||||||
|
"3959":{"name":"加拿大","parent":"3956"},
|
||||||
|
"3961":{"name":"日本","parent":"3956"},
|
||||||
|
"3962":{"name":"韩国","parent":"3956"},
|
||||||
|
"3963":{"name":"德国","parent":"3956"},
|
||||||
|
"3964":{"name":"英国","parent":"3956"},
|
||||||
|
"3965":{"name":"意大利","parent":"3956"},
|
||||||
|
"3966":{"name":"西班牙","parent":"3956"},
|
||||||
|
"3967":{"name":"法国","parent":"3956"},
|
||||||
|
"3968":{"name":"澳大利亚","parent":"3956"},
|
||||||
|
"3969":{"name":"东城区","parent":"2"},
|
||||||
|
"3970":{"name":"西城区","parent":"2"},
|
||||||
|
"3971":{"name":"崇文区","parent":"2"},
|
||||||
|
"3972":{"name":"宣武区","parent":"2"},
|
||||||
|
"3973":{"name":"朝阳区","parent":"2"},
|
||||||
|
"3974":{"name":"海淀区","parent":"2"},
|
||||||
|
"3975":{"name":"丰台区","parent":"2"},
|
||||||
|
"3976":{"name":"石景山区","parent":"2"},
|
||||||
|
"3977":{"name":"门头沟区","parent":"2"},
|
||||||
|
"3978":{"name":"房山区","parent":"2"},
|
||||||
|
"3979":{"name":"通州区","parent":"2"},
|
||||||
|
"3980":{"name":"顺义区","parent":"2"},
|
||||||
|
"3981":{"name":"昌平区","parent":"2"},
|
||||||
|
"3982":{"name":"大兴区","parent":"2"},
|
||||||
|
"3983":{"name":"平谷区","parent":"2"},
|
||||||
|
"3984":{"name":"怀柔区","parent":"2"},
|
||||||
|
"3985":{"name":"密云区","parent":"2"},
|
||||||
|
"3986":{"name":"延庆区","parent":"2"},
|
||||||
|
"3987":{"name":"黄浦区","parent":"10"},
|
||||||
|
"3988":{"name":"徐汇区","parent":"10"},
|
||||||
|
"3989":{"name":"长宁区","parent":"10"},
|
||||||
|
"3990":{"name":"静安区","parent":"10"},
|
||||||
|
"3991":{"name":"普陀区","parent":"10"},
|
||||||
|
"3992":{"name":"闸北区","parent":"10"},
|
||||||
|
"3993":{"name":"虹口区","parent":"10"},
|
||||||
|
"3994":{"name":"杨浦区","parent":"10"},
|
||||||
|
"3995":{"name":"宝山区","parent":"10"},
|
||||||
|
"3996":{"name":"闵行区","parent":"10"},
|
||||||
|
"3997":{"name":"嘉定区","parent":"10"},
|
||||||
|
"3998":{"name":"浦东新区","parent":"10"},
|
||||||
|
"3999":{"name":"松江区","parent":"10"},
|
||||||
|
"4000":{"name":"金山区","parent":"10"},
|
||||||
|
"4001":{"name":"青浦区","parent":"10"},
|
||||||
|
"4002":{"name":"奉贤区","parent":"10"},
|
||||||
|
"4003":{"name":"崇明区","parent":"10"},
|
||||||
|
"4004":{"name":"和平区","parent":"3"},
|
||||||
|
"4005":{"name":"河东区","parent":"3"},
|
||||||
|
"4006":{"name":"河西区","parent":"3"},
|
||||||
|
"4007":{"name":"南开区","parent":"3"},
|
||||||
|
"4008":{"name":"红桥区","parent":"3"},
|
||||||
|
"4009":{"name":"河北区","parent":"3"},
|
||||||
|
"4010":{"name":"滨海新区","parent":"3"},
|
||||||
|
"4011":{"name":"东丽区","parent":"3"},
|
||||||
|
"4012":{"name":"西青区","parent":"3"},
|
||||||
|
"4013":{"name":"北辰区","parent":"3"},
|
||||||
|
"4014":{"name":"津南区","parent":"3"},
|
||||||
|
"4015":{"name":"武清区","parent":"3"},
|
||||||
|
"4016":{"name":"宝坻区","parent":"3"},
|
||||||
|
"4017":{"name":"静海区","parent":"3"},
|
||||||
|
"4018":{"name":"宁河区","parent":"3"},
|
||||||
|
"4019":{"name":"蓟州区","parent":"3"},
|
||||||
|
"4020":{"name":"渝中区","parent":"23"},
|
||||||
|
"4021":{"name":"江北区","parent":"23"},
|
||||||
|
"4022":{"name":"南岸区","parent":"23"},
|
||||||
|
"4023":{"name":"沙坪坝区","parent":"23"},
|
||||||
|
"4024":{"name":"九龙坡区","parent":"23"},
|
||||||
|
"4025":{"name":"大渡口区","parent":"23"},
|
||||||
|
"4026":{"name":"渝北区","parent":"23"},
|
||||||
|
"4027":{"name":"巴南区","parent":"23"},
|
||||||
|
"4028":{"name":"北碚区","parent":"23"},
|
||||||
|
"4029":{"name":"万州区","parent":"23"},
|
||||||
|
"4030":{"name":"黔江区","parent":"23"},
|
||||||
|
"4031":{"name":"永川区","parent":"23"},
|
||||||
|
"4032":{"name":"涪陵区","parent":"23"},
|
||||||
|
"4033":{"name":"江津区","parent":"23"},
|
||||||
|
"4034":{"name":"合川区","parent":"23"},
|
||||||
|
"4035":{"name":"双桥区","parent":"23"},
|
||||||
|
"4036":{"name":"万盛区","parent":"23"},
|
||||||
|
"4037":{"name":"荣昌区","parent":"23"},
|
||||||
|
"4038":{"name":"大足区","parent":"23"},
|
||||||
|
"4039":{"name":"璧山区","parent":"23"},
|
||||||
|
"4040":{"name":"铜梁区","parent":"23"},
|
||||||
|
"4041":{"name":"潼南区","parent":"23"},
|
||||||
|
"4042":{"name":"綦江区","parent":"23"},
|
||||||
|
"4043":{"name":"忠县","parent":"23"},
|
||||||
|
"4044":{"name":"开州区","parent":"23"},
|
||||||
|
"4045":{"name":"云阳县","parent":"23"},
|
||||||
|
"4046":{"name":"梁平区","parent":"23"},
|
||||||
|
"4047":{"name":"垫江县","parent":"23"},
|
||||||
|
"4048":{"name":"丰都县","parent":"23"},
|
||||||
|
"4049":{"name":"奉节县","parent":"23"},
|
||||||
|
"4050":{"name":"巫山县","parent":"23"},
|
||||||
|
"4051":{"name":"巫溪县","parent":"23"},
|
||||||
|
"4052":{"name":"城口县","parent":"23"},
|
||||||
|
"4053":{"name":"武隆区","parent":"23"},
|
||||||
|
"4054":{"name":"石柱土家族自治县","parent":"23"},
|
||||||
|
"4055":{"name":"秀山土家族苗族自治县","parent":"23"},
|
||||||
|
"4056":{"name":"酉阳土家族苗族自治县","parent":"23"},
|
||||||
|
"4057":{"name":"彭水苗族土家族自治县","parent":"23"},
|
||||||
|
"4058":{"name":"潜江市","parent":"18"},
|
||||||
|
"4059":{"name":"三沙市","parent":"22"},
|
||||||
|
"4060":{"name":"石河子市","parent":"32"},
|
||||||
|
"4061":{"name":"阿拉尔市","parent":"32"},
|
||||||
|
"4062":{"name":"图木舒克市","parent":"32"},
|
||||||
|
"4063":{"name":"五家渠市","parent":"32"},
|
||||||
|
"4064":{"name":"北屯市","parent":"32"},
|
||||||
|
"4065":{"name":"铁门关市","parent":"32"},
|
||||||
|
"4066":{"name":"儋州市","parent":"22"},
|
||||||
|
"4067":{"name":"五指山市","parent":"22"},
|
||||||
|
"4068":{"name":"文昌市","parent":"22"},
|
||||||
|
"4069":{"name":"琼海市","parent":"22"},
|
||||||
|
"4070":{"name":"万宁市","parent":"22"},
|
||||||
|
"4072":{"name":"定安县","parent":"22"},
|
||||||
|
"4073":{"name":"屯昌县","parent":"22"},
|
||||||
|
"4074":{"name":"澄迈县","parent":"22"},
|
||||||
|
"4075":{"name":"临高县","parent":"22"},
|
||||||
|
"4076":{"name":"琼中黎族苗族自治县","parent":"22"},
|
||||||
|
"4077":{"name":"保亭黎族苗族自治县","parent":"22"},
|
||||||
|
"4078":{"name":"白沙黎族自治县","parent":"22"},
|
||||||
|
"4079":{"name":"昌江黎族自治县","parent":"22"},
|
||||||
|
"4080":{"name":"乐东黎族自治县","parent":"22"},
|
||||||
|
"4081":{"name":"陵水黎族自治县","parent":"22"},
|
||||||
|
"4082":{"name":"马来西亚","parent":"3956"},
|
||||||
|
"6047":{"name":"长寿区","parent":"23"},
|
||||||
|
"6857":{"name":"阿富汗","parent":"3956"},
|
||||||
|
"6858":{"name":"阿尔巴尼亚","parent":"3956"},
|
||||||
|
"6859":{"name":"阿尔及利亚","parent":"3956"},
|
||||||
|
"6860":{"name":"美属萨摩亚","parent":"3956"},
|
||||||
|
"6861":{"name":"安道尔","parent":"3956"},
|
||||||
|
"6862":{"name":"安哥拉","parent":"3956"},
|
||||||
|
"6863":{"name":"安圭拉","parent":"3956"},
|
||||||
|
"6864":{"name":"南极洲","parent":"3956"},
|
||||||
|
"6865":{"name":"安提瓜和巴布达","parent":"3956"},
|
||||||
|
"6866":{"name":"阿根廷","parent":"3956"},
|
||||||
|
"6867":{"name":"亚美尼亚","parent":"3956"},
|
||||||
|
"6869":{"name":"奥地利","parent":"3956"},
|
||||||
|
"6870":{"name":"阿塞拜疆","parent":"3956"},
|
||||||
|
"6871":{"name":"巴哈马","parent":"3956"},
|
||||||
|
"6872":{"name":"巴林","parent":"3956"},
|
||||||
|
"6873":{"name":"孟加拉国","parent":"3956"},
|
||||||
|
"6874":{"name":"巴巴多斯","parent":"3956"},
|
||||||
|
"6875":{"name":"白俄罗斯","parent":"3956"},
|
||||||
|
"6876":{"name":"比利时","parent":"3956"},
|
||||||
|
"6877":{"name":"伯利兹","parent":"3956"},
|
||||||
|
"6878":{"name":"贝宁","parent":"3956"},
|
||||||
|
"6879":{"name":"百慕大","parent":"3956"},
|
||||||
|
"6880":{"name":"不丹","parent":"3956"},
|
||||||
|
"6881":{"name":"玻利维亚","parent":"3956"},
|
||||||
|
"6882":{"name":"波黑","parent":"3956"},
|
||||||
|
"6883":{"name":"博茨瓦纳","parent":"3956"},
|
||||||
|
"6884":{"name":"布维岛","parent":"3956"},
|
||||||
|
"6885":{"name":"巴西","parent":"3956"},
|
||||||
|
"6886":{"name":"英属印度洋领土","parent":"3956"},
|
||||||
|
"6887":{"name":"文莱","parent":"3956"},
|
||||||
|
"6888":{"name":"保加利亚","parent":"3956"},
|
||||||
|
"6889":{"name":"布基纳法索","parent":"3956"},
|
||||||
|
"6890":{"name":"布隆迪","parent":"3956"},
|
||||||
|
"6891":{"name":"柬埔寨","parent":"3956"},
|
||||||
|
"6892":{"name":"喀麦隆","parent":"3956"},
|
||||||
|
"6893":{"name":"佛得角","parent":"3956"},
|
||||||
|
"6894":{"name":"开曼群岛","parent":"3956"},
|
||||||
|
"6895":{"name":"中非","parent":"3956"},
|
||||||
|
"6896":{"name":"乍得","parent":"3956"},
|
||||||
|
"6897":{"name":"智利","parent":"3956"},
|
||||||
|
"6898":{"name":"圣诞岛","parent":"3956"},
|
||||||
|
"6899":{"name":"科科斯(基林)群岛","parent":"3956"},
|
||||||
|
"6900":{"name":"哥伦比亚","parent":"3956"},
|
||||||
|
"6901":{"name":"科摩罗","parent":"3956"},
|
||||||
|
"6902":{"name":"刚果(布)","parent":"3956"},
|
||||||
|
"6903":{"name":"刚果(金)","parent":"3956"},
|
||||||
|
"6904":{"name":"库克群岛","parent":"3956"},
|
||||||
|
"6905":{"name":"哥斯达黎加","parent":"3956"},
|
||||||
|
"6906":{"name":"科特迪瓦","parent":"3956"},
|
||||||
|
"6907":{"name":"克罗地亚","parent":"3956"},
|
||||||
|
"6908":{"name":"古巴","parent":"3956"},
|
||||||
|
"6909":{"name":"塞浦路斯","parent":"3956"},
|
||||||
|
"6910":{"name":"捷克","parent":"3956"},
|
||||||
|
"6911":{"name":"丹麦","parent":"3956"},
|
||||||
|
"6912":{"name":"吉布提","parent":"3956"},
|
||||||
|
"6913":{"name":"多米尼克","parent":"3956"},
|
||||||
|
"6914":{"name":"多米尼加共和国","parent":"3956"},
|
||||||
|
"6915":{"name":"东帝汶","parent":"3956"},
|
||||||
|
"6916":{"name":"厄瓜多尔","parent":"3956"},
|
||||||
|
"6917":{"name":"埃及","parent":"3956"},
|
||||||
|
"6918":{"name":"萨尔瓦多","parent":"3956"},
|
||||||
|
"6919":{"name":"赤道几内亚","parent":"3956"},
|
||||||
|
"6920":{"name":"厄立特里亚","parent":"3956"},
|
||||||
|
"6921":{"name":"爱沙尼亚","parent":"3956"},
|
||||||
|
"6922":{"name":"埃塞俄比亚","parent":"3956"},
|
||||||
|
"6923":{"name":"福克兰群岛(马尔维纳斯)","parent":"3956"},
|
||||||
|
"6924":{"name":"法罗群岛","parent":"3956"},
|
||||||
|
"6925":{"name":"斐济","parent":"3956"},
|
||||||
|
"6926":{"name":"芬兰","parent":"3956"},
|
||||||
|
"6927":{"name":"法属圭亚那","parent":"3956"},
|
||||||
|
"6928":{"name":"法属波利尼西亚","parent":"3956"},
|
||||||
|
"6929":{"name":"法属南部领土","parent":"3956"},
|
||||||
|
"6930":{"name":"加蓬","parent":"3956"},
|
||||||
|
"6931":{"name":"冈比亚","parent":"3956"},
|
||||||
|
"6932":{"name":"格鲁吉亚","parent":"3956"},
|
||||||
|
"6933":{"name":"加纳","parent":"3956"},
|
||||||
|
"6934":{"name":"直布罗陀","parent":"3956"},
|
||||||
|
"6935":{"name":"希腊","parent":"3956"},
|
||||||
|
"6936":{"name":"格陵兰","parent":"3956"},
|
||||||
|
"6937":{"name":"格林纳达","parent":"3956"},
|
||||||
|
"6938":{"name":"瓜德罗普","parent":"3956"},
|
||||||
|
"6939":{"name":"关岛","parent":"3956"},
|
||||||
|
"6940":{"name":"危地马拉","parent":"3956"},
|
||||||
|
"6941":{"name":"几内亚","parent":"3956"},
|
||||||
|
"6942":{"name":"几内亚比绍","parent":"3956"},
|
||||||
|
"6943":{"name":"圭亚那","parent":"3956"},
|
||||||
|
"6944":{"name":"海地","parent":"3956"},
|
||||||
|
"6945":{"name":"赫德岛和麦克唐纳岛","parent":"3956"},
|
||||||
|
"6946":{"name":"洪都拉斯","parent":"3956"},
|
||||||
|
"6947":{"name":"匈牙利","parent":"3956"},
|
||||||
|
"6948":{"name":"冰岛","parent":"3956"},
|
||||||
|
"6949":{"name":"印度","parent":"3956"},
|
||||||
|
"6950":{"name":"印度尼西亚","parent":"3956"},
|
||||||
|
"6951":{"name":"伊朗","parent":"3956"},
|
||||||
|
"6952":{"name":"伊拉克","parent":"3956"},
|
||||||
|
"6953":{"name":"爱尔兰","parent":"3956"},
|
||||||
|
"6954":{"name":"以色列","parent":"3956"},
|
||||||
|
"6955":{"name":"牙买加","parent":"3956"},
|
||||||
|
"6956":{"name":"约旦","parent":"3956"},
|
||||||
|
"6957":{"name":"哈萨克斯坦","parent":"3956"},
|
||||||
|
"6958":{"name":"肯尼亚","parent":"3956"},
|
||||||
|
"6959":{"name":"基里巴斯","parent":"3956"},
|
||||||
|
"6960":{"name":"朝鲜","parent":"3956"},
|
||||||
|
"6961":{"name":"科威特","parent":"3956"},
|
||||||
|
"6962":{"name":"吉尔吉斯斯坦","parent":"3956"},
|
||||||
|
"6963":{"name":"老挝","parent":"3956"},
|
||||||
|
"6964":{"name":"拉脱维亚","parent":"3956"},
|
||||||
|
"6965":{"name":"黎巴嫩","parent":"3956"},
|
||||||
|
"6966":{"name":"莱索托","parent":"3956"},
|
||||||
|
"6967":{"name":"利比里亚","parent":"3956"},
|
||||||
|
"6968":{"name":"利比亚","parent":"3956"},
|
||||||
|
"6969":{"name":"列支敦士登","parent":"3956"},
|
||||||
|
"6970":{"name":"立陶宛","parent":"3956"},
|
||||||
|
"6971":{"name":"卢森堡","parent":"3956"},
|
||||||
|
"6972":{"name":"前南马其顿","parent":"3956"},
|
||||||
|
"6973":{"name":"马达加斯加","parent":"3956"},
|
||||||
|
"6974":{"name":"马拉维","parent":"3956"},
|
||||||
|
"6975":{"name":"马尔代夫","parent":"3956"},
|
||||||
|
"6976":{"name":"马里","parent":"3956"},
|
||||||
|
"6977":{"name":"马耳他","parent":"3956"},
|
||||||
|
"6978":{"name":"马绍尔群岛","parent":"3956"},
|
||||||
|
"6979":{"name":"马提尼克","parent":"3956"},
|
||||||
|
"6980":{"name":"毛里塔尼亚","parent":"3956"},
|
||||||
|
"6981":{"name":"毛里求斯","parent":"3956"},
|
||||||
|
"6982":{"name":"马约特","parent":"3956"},
|
||||||
|
"6983":{"name":"墨西哥","parent":"3956"},
|
||||||
|
"6984":{"name":"密克罗尼西亚联邦","parent":"3956"},
|
||||||
|
"6985":{"name":"摩尔多瓦","parent":"3956"},
|
||||||
|
"6986":{"name":"摩纳哥","parent":"3956"},
|
||||||
|
"6987":{"name":"蒙古","parent":"3956"},
|
||||||
|
"6988":{"name":"蒙特塞拉特","parent":"3956"},
|
||||||
|
"6989":{"name":"摩洛哥","parent":"3956"},
|
||||||
|
"6990":{"name":"莫桑比克","parent":"3956"},
|
||||||
|
"6991":{"name":"缅甸","parent":"3956"},
|
||||||
|
"6992":{"name":"纳米比亚","parent":"3956"},
|
||||||
|
"6993":{"name":"瑙鲁","parent":"3956"},
|
||||||
|
"6994":{"name":"尼泊尔","parent":"3956"},
|
||||||
|
"6995":{"name":"荷兰","parent":"3956"},
|
||||||
|
"6996":{"name":"荷属安的列斯","parent":"3956"},
|
||||||
|
"6997":{"name":"新喀里多尼亚","parent":"3956"},
|
||||||
|
"6998":{"name":"新西兰","parent":"3956"},
|
||||||
|
"6999":{"name":"尼加拉瓜","parent":"3956"},
|
||||||
|
"7000":{"name":"尼日尔","parent":"3956"},
|
||||||
|
"7001":{"name":"尼日利亚","parent":"3956"},
|
||||||
|
"7002":{"name":"纽埃","parent":"3956"},
|
||||||
|
"7003":{"name":"诺福克岛","parent":"3956"},
|
||||||
|
"7004":{"name":"北马里亚纳","parent":"3956"},
|
||||||
|
"7005":{"name":"挪威","parent":"3956"},
|
||||||
|
"7006":{"name":"阿曼","parent":"3956"},
|
||||||
|
"7007":{"name":"巴基斯坦","parent":"3956"},
|
||||||
|
"7008":{"name":"帕劳","parent":"3956"},
|
||||||
|
"7009":{"name":"巴勒斯坦","parent":"3956"},
|
||||||
|
"7010":{"name":"巴拿马","parent":"3956"},
|
||||||
|
"7011":{"name":"巴布亚新几内亚","parent":"3956"},
|
||||||
|
"7012":{"name":"巴拉圭","parent":"3956"},
|
||||||
|
"7013":{"name":"秘鲁","parent":"3956"},
|
||||||
|
"7014":{"name":"菲律宾","parent":"3956"},
|
||||||
|
"7015":{"name":"皮特凯恩群岛","parent":"3956"},
|
||||||
|
"7016":{"name":"波兰","parent":"3956"},
|
||||||
|
"7017":{"name":"葡萄牙","parent":"3956"},
|
||||||
|
"7018":{"name":"波多黎各","parent":"3956"},
|
||||||
|
"7019":{"name":"卡塔尔","parent":"3956"},
|
||||||
|
"7020":{"name":"留尼汪","parent":"3956"},
|
||||||
|
"7021":{"name":"罗马尼亚","parent":"3956"},
|
||||||
|
"7022":{"name":"俄罗斯联邦","parent":"3956"},
|
||||||
|
"7023":{"name":"卢旺达","parent":"3956"},
|
||||||
|
"7024":{"name":"圣赫勒拿","parent":"3956"},
|
||||||
|
"7025":{"name":"圣基茨和尼维斯","parent":"3956"},
|
||||||
|
"7026":{"name":"圣卢西亚","parent":"3956"},
|
||||||
|
"7027":{"name":"圣皮埃尔和密克隆","parent":"3956"},
|
||||||
|
"7028":{"name":"圣文森特和格林纳丁斯","parent":"3956"},
|
||||||
|
"7029":{"name":"萨摩亚","parent":"3956"},
|
||||||
|
"7030":{"name":"圣马力诺","parent":"3956"},
|
||||||
|
"7031":{"name":"圣多美和普林西比","parent":"3956"},
|
||||||
|
"7032":{"name":"沙特阿拉伯","parent":"3956"},
|
||||||
|
"7033":{"name":"塞内加尔","parent":"3956"},
|
||||||
|
"7034":{"name":"塞舌尔","parent":"3956"},
|
||||||
|
"7035":{"name":"塞拉利昂","parent":"3956"},
|
||||||
|
"7036":{"name":"新加坡","parent":"3956"},
|
||||||
|
"7037":{"name":"斯洛伐克","parent":"3956"},
|
||||||
|
"7038":{"name":"斯洛文尼亚","parent":"3956"},
|
||||||
|
"7039":{"name":"所罗门群岛","parent":"3956"},
|
||||||
|
"7040":{"name":"索马里","parent":"3956"},
|
||||||
|
"7041":{"name":"南非","parent":"3956"},
|
||||||
|
"7042":{"name":"南乔治亚岛和南桑德韦奇岛","parent":"3956"},
|
||||||
|
"7043":{"name":"斯里兰卡","parent":"3956"},
|
||||||
|
"7044":{"name":"苏丹","parent":"3956"},
|
||||||
|
"7045":{"name":"苏里南","parent":"3956"},
|
||||||
|
"7046":{"name":"斯瓦尔巴群岛","parent":"3956"},
|
||||||
|
"7047":{"name":"斯威士兰","parent":"3956"},
|
||||||
|
"7048":{"name":"瑞典","parent":"3956"},
|
||||||
|
"7049":{"name":"瑞士","parent":"3956"},
|
||||||
|
"7050":{"name":"叙利亚","parent":"3956"},
|
||||||
|
"7051":{"name":"塔吉克斯坦","parent":"3956"},
|
||||||
|
"7052":{"name":"坦桑尼亚","parent":"3956"},
|
||||||
|
"7053":{"name":"泰国","parent":"3956"},
|
||||||
|
"7054":{"name":"多哥","parent":"3956"},
|
||||||
|
"7055":{"name":"托克劳","parent":"3956"},
|
||||||
|
"7056":{"name":"汤加","parent":"3956"},
|
||||||
|
"7057":{"name":"特立尼达和多巴哥","parent":"3956"},
|
||||||
|
"7058":{"name":"突尼斯","parent":"3956"},
|
||||||
|
"7059":{"name":"土耳其","parent":"3956"},
|
||||||
|
"7060":{"name":"土库曼斯坦","parent":"3956"},
|
||||||
|
"7061":{"name":"特克斯科斯群岛","parent":"3956"},
|
||||||
|
"7062":{"name":"图瓦卢","parent":"3956"},
|
||||||
|
"7063":{"name":"乌干达","parent":"3956"},
|
||||||
|
"7064":{"name":"乌克兰","parent":"3956"},
|
||||||
|
"7065":{"name":"阿联酋","parent":"3956"},
|
||||||
|
"7066":{"name":"美国本土外小岛屿","parent":"3956"},
|
||||||
|
"7067":{"name":"乌拉圭","parent":"3956"},
|
||||||
|
"7068":{"name":"乌兹别克斯坦","parent":"3956"},
|
||||||
|
"7069":{"name":"瓦努阿图","parent":"3956"},
|
||||||
|
"7070":{"name":"梵蒂冈","parent":"3956"},
|
||||||
|
"7071":{"name":"委内瑞拉","parent":"3956"},
|
||||||
|
"7072":{"name":"越南","parent":"3956"},
|
||||||
|
"7073":{"name":"英属维尔京群岛","parent":"3956"},
|
||||||
|
"7074":{"name":"美属维尔京群岛","parent":"3956"},
|
||||||
|
"7075":{"name":"瓦利斯和富图纳","parent":"3956"},
|
||||||
|
"7076":{"name":"西撒哈拉","parent":"3956"},
|
||||||
|
"7077":{"name":"也门","parent":"3956"},
|
||||||
|
"7078":{"name":"南斯拉夫","parent":"3956"},
|
||||||
|
"7079":{"name":"赞比亚","parent":"3956"},
|
||||||
|
"7080":{"name":"津巴布韦","parent":"3956"},
|
||||||
|
"7081":{"name":"塞尔维亚","parent":"3956"},
|
||||||
|
"7082":{"name":"雄安新区","parent":"4"},
|
||||||
|
"7084":{"name":"天门市","parent":"18"}
|
||||||
|
}


import re

NM_SET = set([v["name"] for _, v in TBL.items()])


def get_names(id):
    if not id or str(id).lower() == "none": return []
    id = str(id)
    if not re.match("[0-9]+$", id.strip()): return [id]
    nms = []
    d = TBL.get(id)
    if not d: return []
    nms.append(d["name"])
    p = get_names(d["parent"])
    if p: nms.extend(p)
    return nms


def isName(nm):
    if nm in NM_SET: return True
    if nm + "市" in NM_SET: return True
    if re.sub(r"(省|(回族|壮族|维吾尔)*自治区)$", "", nm) in NM_SET: return True
    return False
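A quick sketch of how isName behaves against the NM_SET built above, using names that appear in this table. This is behaviour inferred from the code in this diff, not a test added by the commit, and the import path assumes the package layout implied by the file path:

from deepdoc.parser.resume.entities import regions  # path as added in this commit

print(regions.isName("北京"))          # True: "北京" is in NM_SET directly
print(regions.isName("杭州"))          # True: "杭州" + "市" -> "杭州市", which is in NM_SET
print(regions.isName("浙江省"))        # True: stripping the "省" suffix yields "浙江"
print(regions.isName("不存在的地名"))  # False: no direct, "市"-suffixed, or suffix-stripped match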
65
deepdoc/parser/resume/entities/res/corp.tks.freq.json
Normal file
@ -0,0 +1,65 @@
[
"科技",
"集团",
"网络科技",
"技术",
"信息",
"分公司",
"信息技术",
"发展",
"科技股份",
"网络",
"贸易",
"商贸",
"工程",
"企业",
"集团股份",
"商务",
"工业",
"控股集团",
"国际贸易",
"软件技术",
"数码科技",
"软件开发",
"有限",
"经营",
"科技开发",
"股份公司",
"电子技术",
"实业集团",
"责任",
"无限",
"工程技术",
"上市公司",
"技术开发",
"软件系统",
"总公司",
"网络服务",
"ltd.",
"technology",
"company",
"服务公司",
"计算机技术",
"计算机软件",
"电子信息",
"corporation",
"计算机服务",
"计算机系统",
"有限公司",
"事业部",
"公司",
"股份",
"有限责任",
"软件",
"控股",
"高科技",
"房地产",
"事业群",
"部门",
"电子商务",
"人力资源顾问",
"人力资源",
"株式会社",
"网络营销"
]
31480
deepdoc/parser/resume/entities/res/corp_baike_len.csv
Normal file
File diff suppressed because it is too large
14939
deepdoc/parser/resume/entities/res/corp_tag.json
Normal file
File diff suppressed because it is too large
911
deepdoc/parser/resume/entities/res/good_corp.json
Normal file
@ -0,0 +1,911 @@
|
|||||||
|
[
|
||||||
|
"google assistant investments",
|
||||||
|
"amazon",
|
||||||
|
"dingtalk china information",
|
||||||
|
"zhejiang alibaba communication",
|
||||||
|
"yunos",
|
||||||
|
"腾讯云",
|
||||||
|
"新浪新闻",
|
||||||
|
"网邻通",
|
||||||
|
"蚂蚁集团",
|
||||||
|
"大疆",
|
||||||
|
"恒生股份",
|
||||||
|
"sf express",
|
||||||
|
"智者天下",
|
||||||
|
"shanghai hema network",
|
||||||
|
"papayamobile",
|
||||||
|
"lexinfintech",
|
||||||
|
"industrial consumer finance",
|
||||||
|
"360搜索",
|
||||||
|
"世纪光速",
|
||||||
|
"迅雷区块链",
|
||||||
|
"赛盒科技",
|
||||||
|
"齐力电子商务",
|
||||||
|
"平安养老险",
|
||||||
|
"平安证券",
|
||||||
|
"平安好贷",
|
||||||
|
"五八新服",
|
||||||
|
"呯嘭智能",
|
||||||
|
"阿里妈妈",
|
||||||
|
"mdt",
|
||||||
|
"tencent",
|
||||||
|
"weibo",
|
||||||
|
"浪潮软件",
|
||||||
|
"阿里巴巴广告",
|
||||||
|
"mashang consumer finance",
|
||||||
|
"维沃",
|
||||||
|
"hqg , limited",
|
||||||
|
"moodys",
|
||||||
|
"搜狐支付",
|
||||||
|
"百度秀",
|
||||||
|
"新浪服务",
|
||||||
|
"零售通",
|
||||||
|
"同城艺龙",
|
||||||
|
"虾米音乐",
|
||||||
|
"贝壳集团",
|
||||||
|
"小米有品",
|
||||||
|
"滴滴自动驾驶",
|
||||||
|
"图记",
|
||||||
|
"阿里影业",
|
||||||
|
"卓联软件",
|
||||||
|
"zhejiang tmall",
|
||||||
|
"谷歌中国",
|
||||||
|
"hithink flush",
|
||||||
|
"时装科技",
|
||||||
|
"程会玩国际旅行社",
|
||||||
|
"amazon china holding limited",
|
||||||
|
"中信消金",
|
||||||
|
"当当比特物流",
|
||||||
|
"新浪新媒体咨询",
|
||||||
|
"tongcheng network",
|
||||||
|
"金山在线",
|
||||||
|
"shopping cart",
|
||||||
|
"犀互动",
|
||||||
|
"五八",
|
||||||
|
"bilibili",
|
||||||
|
"阿里星球",
|
||||||
|
"滴滴金科服务",
|
||||||
|
"美团",
|
||||||
|
"哈啰出行",
|
||||||
|
"face",
|
||||||
|
"平安健康",
|
||||||
|
"招商银行",
|
||||||
|
"连亚",
|
||||||
|
"盒马网络",
|
||||||
|
"b站",
|
||||||
|
"华为机器",
|
||||||
|
"shanghai mdt infotech",
|
||||||
|
"ping an healthkonnect",
|
||||||
|
"beijing home link real estate broker",
|
||||||
|
"花海仓",
|
||||||
|
"beijing jingdong shangke information",
|
||||||
|
"微影智能",
|
||||||
|
"酷狗游戏",
|
||||||
|
"health.pingan.com",
|
||||||
|
"众安",
|
||||||
|
"陌陌",
|
||||||
|
"海康威视数字",
|
||||||
|
"同程网",
|
||||||
|
"艾丁金融",
|
||||||
|
"知乎",
|
||||||
|
" lu",
|
||||||
|
"国际商业机器公司",
|
||||||
|
"捷信消费金融",
|
||||||
|
"恒生利融",
|
||||||
|
"china merchants bank",
|
||||||
|
"企鹅电竞",
|
||||||
|
"捷信信驰",
|
||||||
|
"360智能家居",
|
||||||
|
"小桔车服",
|
||||||
|
"homecredit",
|
||||||
|
"皮皮虾",
|
||||||
|
"畅游",
|
||||||
|
"聚爱聊",
|
||||||
|
"suning.com",
|
||||||
|
"途牛旅游网",
|
||||||
|
"花呗",
|
||||||
|
"盈店通",
|
||||||
|
"sina",
|
||||||
|
"阿里巴巴音乐",
|
||||||
|
"华为技术有限公司",
|
||||||
|
"国付宝",
|
||||||
|
"shanghai lianshang network",
|
||||||
|
"oppo",
|
||||||
|
"华为投资控股",
|
||||||
|
"beijing sohu new media information",
|
||||||
|
"times square",
|
||||||
|
"菜鸟物流",
|
||||||
|
"lingxing",
|
||||||
|
"jd digits",
|
||||||
|
"同程旅游",
|
||||||
|
"分期乐",
|
||||||
|
"火锅视频",
|
||||||
|
"天天快报",
|
||||||
|
"猎豹移动",
|
||||||
|
"五八人力资源",
|
||||||
|
"宝宝树",
|
||||||
|
"顺丰科技",
|
||||||
|
"上海西翠",
|
||||||
|
"诗程文化传播",
|
||||||
|
"dewu",
|
||||||
|
"领星网络",
|
||||||
|
"aliexpress",
|
||||||
|
"贝塔通科技",
|
||||||
|
"链家",
|
||||||
|
"花小猪",
|
||||||
|
"趣输入",
|
||||||
|
"搜狐新媒体",
|
||||||
|
"一淘",
|
||||||
|
"56",
|
||||||
|
"qq阅读",
|
||||||
|
"青桔单车",
|
||||||
|
"iflytek",
|
||||||
|
"每日优鲜电子商务",
|
||||||
|
"腾讯觅影",
|
||||||
|
"微医",
|
||||||
|
"松果网",
|
||||||
|
"paypal",
|
||||||
|
"递瑞供应链管理",
|
||||||
|
"领星",
|
||||||
|
"qunar",
|
||||||
|
"三快",
|
||||||
|
"lu.com",
|
||||||
|
"携程旅行网",
|
||||||
|
"新潮传媒",
|
||||||
|
"链家经纪",
|
||||||
|
"景域文化",
|
||||||
|
"阿里健康",
|
||||||
|
"pingpeng",
|
||||||
|
"聚划算",
|
||||||
|
"零机科技",
|
||||||
|
"街兔电单车",
|
||||||
|
"快乐购",
|
||||||
|
"华为数字能源",
|
||||||
|
"搜狐",
|
||||||
|
"陆家嘴国际金融资产交易市场",
|
||||||
|
"nanjing tuniu",
|
||||||
|
"亚马逊",
|
||||||
|
"苏宁易购",
|
||||||
|
"携程旅游",
|
||||||
|
"苏宁金服",
|
||||||
|
"babytree",
|
||||||
|
"悟空问答",
|
||||||
|
"同花顺",
|
||||||
|
"eastmoney",
|
||||||
|
"浪潮信息",
|
||||||
|
"滴滴智慧交通",
|
||||||
|
"beijing ruixun lingtong",
|
||||||
|
"平安综合金融服务",
|
||||||
|
"爱奇艺",
|
||||||
|
"小米集团",
|
||||||
|
"华为云",
|
||||||
|
"微店",
|
||||||
|
"恒生集团",
|
||||||
|
"网易有道",
|
||||||
|
"boccfc",
|
||||||
|
"世纪思速科技",
|
||||||
|
"海康消防",
|
||||||
|
"beijing xiaomi",
|
||||||
|
"众安科技",
|
||||||
|
"五八同城",
|
||||||
|
"霆程汽车租赁",
|
||||||
|
"云卖分销",
|
||||||
|
"乐信集团",
|
||||||
|
"蚂蚁",
|
||||||
|
"舶乐蜜电子商务",
|
||||||
|
"支付宝中国",
|
||||||
|
"砖块消消消",
|
||||||
|
"vivo",
|
||||||
|
"阿里互娱",
|
||||||
|
"中国平安",
|
||||||
|
"lingxihudong",
|
||||||
|
"百度网盘",
|
||||||
|
"1号店",
|
||||||
|
"字节跳动",
|
||||||
|
"京东科技",
|
||||||
|
"驴妈妈兴旅国际旅行社",
|
||||||
|
"hangzhou alibaba music",
|
||||||
|
"xunlei",
|
||||||
|
"灵犀互动娱乐",
|
||||||
|
"快手",
|
||||||
|
"youtube",
|
||||||
|
"连尚慧眼",
|
||||||
|
"腾讯体育",
|
||||||
|
"爱商在线",
|
||||||
|
"酷我音乐",
|
||||||
|
"金融壹账通",
|
||||||
|
"搜狗服务",
|
||||||
|
"banma information",
|
||||||
|
"a站",
|
||||||
|
"罗汉堂",
|
||||||
|
"薇仕网络",
|
||||||
|
"搜狐新闻",
|
||||||
|
"贝宝",
|
||||||
|
"薇仕",
|
||||||
|
"口袋时尚科技",
|
||||||
|
"穆迪咨询",
|
||||||
|
"新狐投资管理",
|
||||||
|
"hikvision",
|
||||||
|
"alimama china holding limited",
|
||||||
|
"超聚变数字",
|
||||||
|
"腾讯视频",
|
||||||
|
"恒生电子",
|
||||||
|
"百度游戏",
|
||||||
|
"绿洲",
|
||||||
|
"木瓜移动",
|
||||||
|
"红袖添香",
|
||||||
|
"店匠科技",
|
||||||
|
"易贝",
|
||||||
|
"一淘网",
|
||||||
|
"博览群书",
|
||||||
|
"唯品会",
|
||||||
|
"lazglobal",
|
||||||
|
"amap",
|
||||||
|
"芒果网",
|
||||||
|
"口碑",
|
||||||
|
"海康慧影",
|
||||||
|
"腾讯音乐娱乐",
|
||||||
|
"网易严选",
|
||||||
|
"微信",
|
||||||
|
"shenzhen lexin holding",
|
||||||
|
"hangzhou pingpeng intelligent",
|
||||||
|
"连尚网络",
|
||||||
|
"海思",
|
||||||
|
"isunor",
|
||||||
|
"蝉翼",
|
||||||
|
"阿里游戏",
|
||||||
|
"广州优视",
|
||||||
|
"优视",
|
||||||
|
"腾讯征信",
|
||||||
|
"识装",
|
||||||
|
"finserve.pingan.com",
|
||||||
|
"papaya",
|
||||||
|
"阅文",
|
||||||
|
"平安健康保险",
|
||||||
|
"考拉海购",
|
||||||
|
"网易印象",
|
||||||
|
"wifi万能钥匙",
|
||||||
|
"新浪互联服务",
|
||||||
|
"亚马逊云科技",
|
||||||
|
"迅雷看看",
|
||||||
|
"华为朗新科技",
|
||||||
|
"adyen hong kong limited",
|
||||||
|
"谷歌",
|
||||||
|
"得物",
|
||||||
|
"网心",
|
||||||
|
"cainiao network",
|
||||||
|
"沐瞳",
|
||||||
|
"linkedln",
|
||||||
|
"hundsun",
|
||||||
|
"阿里旅行",
|
||||||
|
"珍爱网",
|
||||||
|
"阿里巴巴通信",
|
||||||
|
"金山奇剑",
|
||||||
|
"tongtool",
|
||||||
|
"华为安捷信电气",
|
||||||
|
"快乐时代",
|
||||||
|
"平安寿险",
|
||||||
|
"微博",
|
||||||
|
"微跳蚤",
|
||||||
|
"oppo移动通信",
|
||||||
|
"毒",
|
||||||
|
"alimama",
|
||||||
|
"shoplazza",
|
||||||
|
"shenzhen dianjiang science and",
|
||||||
|
"众鸣世科",
|
||||||
|
"平安金融",
|
||||||
|
"狐友",
|
||||||
|
"维沃移动通信",
|
||||||
|
"tobosoft",
|
||||||
|
"齐力电商",
|
||||||
|
"ali",
|
||||||
|
"诚信通",
|
||||||
|
"行吟",
|
||||||
|
"跳舞的线",
|
||||||
|
"橙心优选",
|
||||||
|
"众安健康",
|
||||||
|
"亚马逊中国投资",
|
||||||
|
"德絮投资管理中心合伙",
|
||||||
|
"招联消费金融",
|
||||||
|
"百度文学",
|
||||||
|
"芝麻信用",
|
||||||
|
"阿里零售通",
|
||||||
|
"时装",
|
||||||
|
"花样直播",
|
||||||
|
"sogou",
|
||||||
|
"uc",
|
||||||
|
"海思半导体",
|
||||||
|
"zhongan online p&c insurance",
|
||||||
|
"新浪数字",
|
||||||
|
"驴妈妈旅游网",
|
||||||
|
"华为数字能源技术",
|
||||||
|
"京东数科",
|
||||||
|
"oracle",
|
||||||
|
"xiaomi",
|
||||||
|
"nyse",
|
||||||
|
"阳光消费金融",
|
||||||
|
"天天动听",
|
||||||
|
"大众点评",
|
||||||
|
"上海瑞家",
|
||||||
|
"trustpass",
|
||||||
|
"hundsun technologies",
|
||||||
|
"美团小贷",
|
||||||
|
"ebay",
|
||||||
|
"通途",
|
||||||
|
"tcl",
|
||||||
|
"鸿蒙",
|
||||||
|
"酷狗计算机",
|
||||||
|
"品诺保险",
|
||||||
|
"capitalg",
|
||||||
|
"康盛创想",
|
||||||
|
"58同城",
|
||||||
|
"闲鱼",
|
||||||
|
"微软",
|
||||||
|
"吉易付科技",
|
||||||
|
"理财通",
|
||||||
|
"ctrip",
|
||||||
|
"yy",
|
||||||
|
"华为数字",
|
||||||
|
"kingsoft",
|
||||||
|
"孙宁金融",
|
||||||
|
"房江湖经纪",
|
||||||
|
"youku",
|
||||||
|
"ant financial services group",
|
||||||
|
"盒马",
|
||||||
|
"sensetime",
|
||||||
|
"伊千网络",
|
||||||
|
"小豹ai翻译棒",
|
||||||
|
"shopify",
|
||||||
|
"前海微众银行",
|
||||||
|
"qd",
|
||||||
|
"gmail",
|
||||||
|
"pingpong",
|
||||||
|
"alibaba group holding limited",
|
||||||
|
"捷信时空电子商务",
|
||||||
|
"orientsec",
|
||||||
|
"乔戈里管理咨询",
|
||||||
|
"ant",
|
||||||
|
"锐讯灵通",
|
||||||
|
"兴业消费金融",
|
||||||
|
"京东叁佰陆拾度电子商务",
|
||||||
|
"新浪",
|
||||||
|
"优酷土豆",
|
||||||
|
"海康机器人",
|
||||||
|
"美团单车",
|
||||||
|
"海康存储",
|
||||||
|
"领英",
|
||||||
|
"阿里全球速卖通",
|
||||||
|
"美菜网",
|
||||||
|
"京邦达",
|
||||||
|
"安居客",
|
||||||
|
"阿里体育",
|
||||||
|
"相互宝",
|
||||||
|
"cloudwalk",
|
||||||
|
"百度智能云",
|
||||||
|
"贝壳",
|
||||||
|
"酷狗",
|
||||||
|
"sunshine consumer finance",
|
||||||
|
"掌宜",
|
||||||
|
"奇酷网",
|
||||||
|
"核新同花顺",
|
||||||
|
"阿里巴巴影业",
|
||||||
|
"节创",
|
||||||
|
"学而思网校",
|
||||||
|
"速途",
|
||||||
|
"途牛",
|
||||||
|
"阿里云计算",
|
||||||
|
"beijing sensetime",
|
||||||
|
"alibaba cloud",
|
||||||
|
"西瓜视频",
|
||||||
|
"美团优选",
|
||||||
|
"orient securities limited",
|
||||||
|
"华为朗新",
|
||||||
|
"店匠",
|
||||||
|
"shanghai weishi network",
|
||||||
|
"友盟",
|
||||||
|
"飞猪旅行",
|
||||||
|
"滴滴出行",
|
||||||
|
"alipay",
|
||||||
|
"mogu",
|
||||||
|
"dangdang",
|
||||||
|
"大麦网",
|
||||||
|
"汉军智能系统",
|
||||||
|
"百度地图",
|
||||||
|
"货车帮",
|
||||||
|
"狐狸金服",
|
||||||
|
"众安在线保险经纪",
|
||||||
|
"华为通信",
|
||||||
|
"新浪支付",
|
||||||
|
"zhihu",
|
||||||
|
"alibaba cloud computing",
|
||||||
|
"沙发视频",
|
||||||
|
"金山软件",
|
||||||
|
"ping an good doctor",
|
||||||
|
"携程",
|
||||||
|
"脉脉",
|
||||||
|
"youku information beijing",
|
||||||
|
"zhongan",
|
||||||
|
"艾丁软件",
|
||||||
|
"乒乓智能",
|
||||||
|
"蘑菇街",
|
||||||
|
"taobao",
|
||||||
|
"华为技术服务",
|
||||||
|
"仕承文化传播",
|
||||||
|
"安捷信",
|
||||||
|
"狐狸互联网小额贷款",
|
||||||
|
"节点迅捷",
|
||||||
|
"中国银行",
|
||||||
|
"搜镇",
|
||||||
|
"众安在线",
|
||||||
|
"dingtalk",
|
||||||
|
"云从科技",
|
||||||
|
"beijing jingbangda trade",
|
||||||
|
"moody s",
|
||||||
|
"滚动的天空",
|
||||||
|
"yl.pingan.com",
|
||||||
|
"奇虎",
|
||||||
|
"alihealth",
|
||||||
|
"芒果tv",
|
||||||
|
"lufax",
|
||||||
|
"美团打车",
|
||||||
|
"小桔",
|
||||||
|
"贝壳找房网",
|
||||||
|
"小米科技",
|
||||||
|
"vips",
|
||||||
|
"kindle",
|
||||||
|
"亚马逊服务",
|
||||||
|
"citic consumer finance",
|
||||||
|
"微众",
|
||||||
|
"搜狗智慧互联网医院",
|
||||||
|
"盒马鲜生",
|
||||||
|
"life.pinan.com",
|
||||||
|
"ph.com.cn",
|
||||||
|
"银联",
|
||||||
|
"cmbchina",
|
||||||
|
"平安金融科技咨询",
|
||||||
|
"微保",
|
||||||
|
"甲骨文中国",
|
||||||
|
"飞书",
|
||||||
|
"koubei shanghai information",
|
||||||
|
"企鹅辅导",
|
||||||
|
"斑马",
|
||||||
|
"平安租赁",
|
||||||
|
"云从",
|
||||||
|
"马上消费",
|
||||||
|
"hangzhou ali baba advertising",
|
||||||
|
"金山",
|
||||||
|
"赛盒",
|
||||||
|
"科大讯飞",
|
||||||
|
"金星创业投资",
|
||||||
|
"平安国际融资租赁",
|
||||||
|
"360你财富",
|
||||||
|
"西山居",
|
||||||
|
"shenzhen qianhai fourth paradigm data",
|
||||||
|
"海思光电子",
|
||||||
|
"猎户星空",
|
||||||
|
"网易公司",
|
||||||
|
"浪潮",
|
||||||
|
"粒粒橙传媒",
|
||||||
|
"招联金融",
|
||||||
|
"100. me",
|
||||||
|
"捷信信驰咨询",
|
||||||
|
"唯品仓",
|
||||||
|
"orient",
|
||||||
|
"趣拿",
|
||||||
|
"摩拜单车",
|
||||||
|
"天猫精灵",
|
||||||
|
"菜鸟",
|
||||||
|
"豹小贩",
|
||||||
|
"去哪儿",
|
||||||
|
"米家",
|
||||||
|
"哈啰单车",
|
||||||
|
"搜狐体育",
|
||||||
|
"shopify payments usa",
|
||||||
|
"高德软件",
|
||||||
|
"讯联智付",
|
||||||
|
"乐信",
|
||||||
|
"唯你搭",
|
||||||
|
"第四范式",
|
||||||
|
"菜鸟网络",
|
||||||
|
"同程",
|
||||||
|
"yy语音",
|
||||||
|
"浪潮云",
|
||||||
|
"东财",
|
||||||
|
"淘宝",
|
||||||
|
"寻梦",
|
||||||
|
"citic securities limited",
|
||||||
|
"青橙之旅",
|
||||||
|
"阿里巴巴",
|
||||||
|
"番茄小说",
|
||||||
|
"上海亿贝",
|
||||||
|
"inspur",
|
||||||
|
"babytree inc",
|
||||||
|
"海康智慧产业股权投资基金合伙合伙",
|
||||||
|
"adyen",
|
||||||
|
"艺龙",
|
||||||
|
"蚂蚁金服",
|
||||||
|
"平安金服",
|
||||||
|
"百度百科",
|
||||||
|
"unionpay",
|
||||||
|
"当当",
|
||||||
|
"阅文集团",
|
||||||
|
"东方财富",
|
||||||
|
"东方证券",
|
||||||
|
"哈罗单车",
|
||||||
|
"优酷",
|
||||||
|
"海康",
|
||||||
|
"alipay china network",
|
||||||
|
"网商银行",
|
||||||
|
"钧正",
|
||||||
|
"property.pingan.com",
|
||||||
|
"豹咖啡",
|
||||||
|
"网易",
|
||||||
|
"我爱cba",
|
||||||
|
"theduapp",
|
||||||
|
"360",
|
||||||
|
"金山数字娱乐",
|
||||||
|
"新浪阅读",
|
||||||
|
"alibabagames",
|
||||||
|
"顺丰",
|
||||||
|
"支点商贸",
|
||||||
|
"同程旅行",
|
||||||
|
"citic securities",
|
||||||
|
"ele.com",
|
||||||
|
"tal",
|
||||||
|
"fresh hema",
|
||||||
|
"运满满",
|
||||||
|
"贝壳网",
|
||||||
|
"酷狗音乐",
|
||||||
|
"鲜城",
|
||||||
|
"360健康",
|
||||||
|
"浪潮世科",
|
||||||
|
"迅雷网络",
|
||||||
|
"哔哩哔哩",
|
||||||
|
"华为电动",
|
||||||
|
"淘友天下",
|
||||||
|
"华多网络",
|
||||||
|
"xunlei networking technologies",
|
||||||
|
"云杉",
|
||||||
|
"当当网电子商务",
|
||||||
|
"津虹网络",
|
||||||
|
"wedoc cloud hangzhou holdings",
|
||||||
|
"alisports shanghai",
|
||||||
|
"旷视金智",
|
||||||
|
"钉钉中国",
|
||||||
|
"微影",
|
||||||
|
"金山快快",
|
||||||
|
"亿贝",
|
||||||
|
"wedoc",
|
||||||
|
"autonavi",
|
||||||
|
"哈啰助力车",
|
||||||
|
"google cloud",
|
||||||
|
"新浪乐居",
|
||||||
|
"京东股票",
|
||||||
|
"搜狗智慧远程医疗中心",
|
||||||
|
"中银消金",
|
||||||
|
"merchants union consumer finance",
|
||||||
|
"王者荣耀",
|
||||||
|
"百度手机",
|
||||||
|
"美团民宿",
|
||||||
|
"kaola",
|
||||||
|
"小屋",
|
||||||
|
"金山网络",
|
||||||
|
"来往",
|
||||||
|
"顺丰速运",
|
||||||
|
"腾讯课堂",
|
||||||
|
"百度在线网络",
|
||||||
|
"美团买菜",
|
||||||
|
"威视汽车",
|
||||||
|
"uc mobile",
|
||||||
|
"来赞达",
|
||||||
|
"平安健康医疗",
|
||||||
|
"豹小秘",
|
||||||
|
"尚网",
|
||||||
|
"哈勃投资",
|
||||||
|
" ping an insurance group of china ,",
|
||||||
|
"小米",
|
||||||
|
"360好药",
|
||||||
|
"qq音乐",
|
||||||
|
"lingxigames",
|
||||||
|
"faceu激萌",
|
||||||
|
"搜狗",
|
||||||
|
"sohu",
|
||||||
|
"满帮",
|
||||||
|
"vipshop",
|
||||||
|
"wishpost",
|
||||||
|
"金山世游",
|
||||||
|
"shanghai yibaimi network",
|
||||||
|
"1688",
|
||||||
|
"海康汽车",
|
||||||
|
"顺丰控股",
|
||||||
|
"华为",
|
||||||
|
"妙镜vr",
|
||||||
|
"paybkj.com",
|
||||||
|
"hellobike",
|
||||||
|
"豹来电",
|
||||||
|
"京东",
|
||||||
|
"驴妈妈",
|
||||||
|
"momo",
|
||||||
|
"平安健康险",
|
||||||
|
"哈勃科技",
|
||||||
|
"美菜",
|
||||||
|
"众安在线财产保险",
|
||||||
|
"海康威视",
|
||||||
|
"east money information",
|
||||||
|
"阿里云",
|
||||||
|
"蝉游记",
|
||||||
|
"余额宝",
|
||||||
|
"屋客",
|
||||||
|
"滴滴",
|
||||||
|
"shopify international limited",
|
||||||
|
"百度",
|
||||||
|
"阿里健康中国",
|
||||||
|
"阿里通信",
|
||||||
|
"微梦创科",
|
||||||
|
"微医云",
|
||||||
|
"轻颜相机",
|
||||||
|
"搜易居",
|
||||||
|
"趣店集团",
|
||||||
|
"美团云",
|
||||||
|
"ant group",
|
||||||
|
"金山云",
|
||||||
|
"beijing express hand",
|
||||||
|
"觅觅",
|
||||||
|
"支付宝",
|
||||||
|
"滴滴承信科技咨询服务",
|
||||||
|
"拼多多",
|
||||||
|
"众安运动",
|
||||||
|
"乞力电商",
|
||||||
|
"youcash",
|
||||||
|
"唯品金融",
|
||||||
|
"陆金所",
|
||||||
|
"本地生活",
|
||||||
|
"sz dji",
|
||||||
|
"海康智能",
|
||||||
|
"魔方网聘",
|
||||||
|
"青藤大学",
|
||||||
|
"international business machines",
|
||||||
|
"学而思",
|
||||||
|
"beijing zhongming century science and",
|
||||||
|
"猎豹清理大师",
|
||||||
|
"asinking",
|
||||||
|
"高德",
|
||||||
|
"苏宁",
|
||||||
|
"优酷网",
|
||||||
|
"艾丁",
|
||||||
|
"中银消费金融",
|
||||||
|
"京东健康",
|
||||||
|
"五八教育",
|
||||||
|
"pingpongx",
|
||||||
|
"搜狐时尚",
|
||||||
|
"阿里广告",
|
||||||
|
"平安财险",
|
||||||
|
"中邮消金",
|
||||||
|
"etao",
|
||||||
|
"怕怕",
|
||||||
|
"nyse:cmcm",
|
||||||
|
"华为培训中心",
|
||||||
|
"高德地图",
|
||||||
|
"云狐天下征信",
|
||||||
|
"大疆创新",
|
||||||
|
"连尚",
|
||||||
|
"壹佰米",
|
||||||
|
"康健公司",
|
||||||
|
"iqiyi.com",
|
||||||
|
"360安全云盘",
|
||||||
|
"馒头直播",
|
||||||
|
"淘友网",
|
||||||
|
"东方赢家",
|
||||||
|
"bank of china",
|
||||||
|
"微众银行",
|
||||||
|
"阿里巴巴国际站",
|
||||||
|
"虾米",
|
||||||
|
"去哪儿网",
|
||||||
|
"ctrip travel network shanghai",
|
||||||
|
"潇湘书院",
|
||||||
|
"腾讯",
|
||||||
|
"快乐阳光互动娱乐传媒",
|
||||||
|
"迅雷",
|
||||||
|
"weidian",
|
||||||
|
"滴滴货运",
|
||||||
|
"ping an puhui enterprise management",
|
||||||
|
"新浪仓石基金销售",
|
||||||
|
"搜狐焦点",
|
||||||
|
"alibaba pictures",
|
||||||
|
"wps",
|
||||||
|
"平安",
|
||||||
|
"lazmall",
|
||||||
|
"百度开放平台",
|
||||||
|
"兴业消金",
|
||||||
|
" 珍爱网",
|
||||||
|
"京东云",
|
||||||
|
"小红书",
|
||||||
|
"1688. com",
|
||||||
|
"如视智数",
|
||||||
|
"missfresh",
|
||||||
|
"pazl.pingan.cn",
|
||||||
|
"平安集团",
|
||||||
|
"kugou",
|
||||||
|
"懂车帝",
|
||||||
|
"斑马智行",
|
||||||
|
"浪潮集团",
|
||||||
|
"netease hangzhou network",
|
||||||
|
"pagd.net",
|
||||||
|
"探探",
|
||||||
|
"chinaliterature",
|
||||||
|
"amazon亚马逊",
|
||||||
|
"alphabet",
|
||||||
|
"当当文创手工艺品电子商务",
|
||||||
|
"五八邦",
|
||||||
|
"shenzhen zhenai network information",
|
||||||
|
"lingshoutong",
|
||||||
|
"字节",
|
||||||
|
"lvmama",
|
||||||
|
"金山办公",
|
||||||
|
"众安保险",
|
||||||
|
"时装信息",
|
||||||
|
"优视科技",
|
||||||
|
"guangzhou kugou",
|
||||||
|
"ibm",
|
||||||
|
"滴滴打车",
|
||||||
|
"beijing sogou information service",
|
||||||
|
"megvii",
|
||||||
|
"健谈哥",
|
||||||
|
"cloudwalk group",
|
||||||
|
"蜂联科技",
|
||||||
|
"冬云",
|
||||||
|
"京东尚科",
|
||||||
|
"钢琴块2",
|
||||||
|
"京东世纪",
|
||||||
|
"商汤",
|
||||||
|
"众鸣世纪",
|
||||||
|
"腾讯音乐",
|
||||||
|
"迅雷网文化",
|
||||||
|
"华为云计算技术",
|
||||||
|
"live.me",
|
||||||
|
"全球速卖通",
|
||||||
|
"快的打车",
|
||||||
|
"hello group inc",
|
||||||
|
"美丽说",
|
||||||
|
"suning",
|
||||||
|
"opengauss",
|
||||||
|
"lazada",
|
||||||
|
"tmall",
|
||||||
|
"acfun",
|
||||||
|
"当当网",
|
||||||
|
"中银",
|
||||||
|
"旷视科技",
|
||||||
|
"百度钱包",
|
||||||
|
"淘宝网",
|
||||||
|
"新浪微博",
|
||||||
|
"迅雷集团",
|
||||||
|
"中信消费金融",
|
||||||
|
"学而思教育",
|
||||||
|
"平安普惠",
|
||||||
|
"悟空跨境",
|
||||||
|
"irobotbox",
|
||||||
|
"平安产险",
|
||||||
|
"inspur group",
|
||||||
|
"世纪卓越快递服务",
|
||||||
|
"奇虎360",
|
||||||
|
"webank",
|
||||||
|
"偶藻",
|
||||||
|
"唯品支付",
|
||||||
|
"腾讯云计算",
|
||||||
|
"众安服务",
|
||||||
|
"亿之唐",
|
||||||
|
"beijing 58 information ttechnology",
|
||||||
|
"平安好医生",
|
||||||
|
"迅雷之锤",
|
||||||
|
"旅行小账本",
|
||||||
|
"芒果游戏",
|
||||||
|
"新浪传媒",
|
||||||
|
"旷镜博煊",
|
||||||
|
"全民k歌",
|
||||||
|
"滴滴支付",
|
||||||
|
"北京网心科技",
|
||||||
|
"挂号网",
|
||||||
|
"萤石",
|
||||||
|
"chinavision media group limited",
|
||||||
|
"猎豹安全大师",
|
||||||
|
"cmcm",
|
||||||
|
"趣店",
|
||||||
|
"蚂蚁财富",
|
||||||
|
"商汤科技",
|
||||||
|
"甲骨文",
|
||||||
|
"百度云",
|
||||||
|
"百度apollo",
|
||||||
|
"19 pay",
|
||||||
|
"stock.pingan.com",
|
||||||
|
"tiktok",
|
||||||
|
"alibaba pictures group limited",
|
||||||
|
"ele",
|
||||||
|
"考拉",
|
||||||
|
"天猫",
|
||||||
|
"腾讯优图",
|
||||||
|
"起点中文网",
|
||||||
|
"百度视频",
|
||||||
|
"shanghai bili bili",
|
||||||
|
"京东物流",
|
||||||
|
"ebay marketplaces gmbh",
|
||||||
|
"alibaba sport",
|
||||||
|
"wish",
|
||||||
|
"阿里巴巴中国",
|
||||||
|
"中国银联",
|
||||||
|
"alibaba china network",
|
||||||
|
"china ping an property insurance",
|
||||||
|
"百度糯米网",
|
||||||
|
"微软中国",
|
||||||
|
"一九付",
|
||||||
|
"4 paradigm",
|
||||||
|
"叮咚买菜",
|
||||||
|
"umeng",
|
||||||
|
"众鸣科技",
|
||||||
|
"平安财富通",
|
||||||
|
"google",
|
||||||
|
"巨量引擎",
|
||||||
|
"百度贴吧",
|
||||||
|
"beijing jingdong century information",
|
||||||
|
"讯飞",
|
||||||
|
"beijing yunshan information",
|
||||||
|
"满运软件",
|
||||||
|
"中邮消费金融",
|
||||||
|
"饿了么",
|
||||||
|
"alios",
|
||||||
|
"腾讯ai实验室",
|
||||||
|
"第四范式智能",
|
||||||
|
"瀚星创业投资",
|
||||||
|
"gradient ventures",
|
||||||
|
"microsoft",
|
||||||
|
"哈啰共享汽车",
|
||||||
|
"乞力电子商务",
|
||||||
|
"mscf",
|
||||||
|
"网易影业文化",
|
||||||
|
"铁友旅游咨询",
|
||||||
|
"kilimall",
|
||||||
|
"云企互联投资",
|
||||||
|
"ping an financial consulting",
|
||||||
|
"beijng jingdong century commerce",
|
||||||
|
"高德威智能交通系统",
|
||||||
|
"中友信息",
|
||||||
|
"平安医疗健康管理",
|
||||||
|
"eciticcfc",
|
||||||
|
"中信证券",
|
||||||
|
"fliggy",
|
||||||
|
"电子湾",
|
||||||
|
"旷云金智",
|
||||||
|
"微粒贷",
|
||||||
|
"rsi",
|
||||||
|
"滴滴云计算",
|
||||||
|
"google ventures",
|
||||||
|
"箐程",
|
||||||
|
"每日优鲜",
|
||||||
|
"音兔",
|
||||||
|
"拉扎斯",
|
||||||
|
"今日头条",
|
||||||
|
"乐信控股",
|
||||||
|
"猎豹浏览器",
|
||||||
|
"细微咨询",
|
||||||
|
"好未来",
|
||||||
|
"我乐",
|
||||||
|
"绘声绘色",
|
||||||
|
"抖音",
|
||||||
|
"搜狐新时代",
|
||||||
|
"飞猪",
|
||||||
|
"鹅厂",
|
||||||
|
"贝壳找房",
|
||||||
|
"tuniu",
|
||||||
|
"红马传媒文化",
|
||||||
|
"钉钉",
|
||||||
|
"马上消费金融",
|
||||||
|
"360手机",
|
||||||
|
"平安医保",
|
||||||
|
"快途",
|
||||||
|
"alibaba",
|
||||||
|
"小哈换电",
|
||||||
|
"大麦",
|
||||||
|
"恒睿人工智能研究院",
|
||||||
|
"谷歌资本",
|
||||||
|
"猎豹",
|
||||||
|
"穆迪信息"
|
||||||
|
]
|
595
deepdoc/parser/resume/entities/res/good_sch.json
Normal file
@ -0,0 +1,595 @@
|
|||||||
|
[
|
||||||
|
"中国科技大学",
|
||||||
|
"国防科学技术大学",
|
||||||
|
"清华大学",
|
||||||
|
"清华",
|
||||||
|
"tsinghua university",
|
||||||
|
"thu",
|
||||||
|
"北京大学",
|
||||||
|
"北大",
|
||||||
|
"beijing university",
|
||||||
|
"pku",
|
||||||
|
"中国科学技术大学",
|
||||||
|
"中国科大",
|
||||||
|
"中科大",
|
||||||
|
"china science & technology university",
|
||||||
|
"ustc",
|
||||||
|
"复旦大学",
|
||||||
|
"复旦",
|
||||||
|
"fudan university",
|
||||||
|
"fdu",
|
||||||
|
"中国人民大学",
|
||||||
|
"人大",
|
||||||
|
"人民大学",
|
||||||
|
"renmin university of china",
|
||||||
|
"ruc",
|
||||||
|
"上海交通大学",
|
||||||
|
"上海交大",
|
||||||
|
"shanghai jiao tong university",
|
||||||
|
"sjtu",
|
||||||
|
"南京大学",
|
||||||
|
"南大",
|
||||||
|
"nanjing university",
|
||||||
|
"nju",
|
||||||
|
"同济大学",
|
||||||
|
"同济",
|
||||||
|
"tongji university",
|
||||||
|
"tongji",
|
||||||
|
"浙江大学",
|
||||||
|
"浙大",
|
||||||
|
"zhejiang university",
|
||||||
|
"zju",
|
||||||
|
"南开大学",
|
||||||
|
"南开",
|
||||||
|
"nankai university",
|
||||||
|
"nku",
|
||||||
|
"北京航空航天大学",
|
||||||
|
"北航",
|
||||||
|
"beihang university",
|
||||||
|
"buaa",
|
||||||
|
"北京师范大学",
|
||||||
|
"北师",
|
||||||
|
"北师大",
|
||||||
|
"beijing normal university",
|
||||||
|
"bnu",
|
||||||
|
"武汉大学",
|
||||||
|
"武大",
|
||||||
|
"wuhan university",
|
||||||
|
"whu",
|
||||||
|
"西安交通大学",
|
||||||
|
"西安交大",
|
||||||
|
"xi’an jiaotong university",
|
||||||
|
"xjtu",
|
||||||
|
"天津大学",
|
||||||
|
"天大",
|
||||||
|
"university of tianjin",
|
||||||
|
"tju",
|
||||||
|
"华中科技大学",
|
||||||
|
"华中大",
|
||||||
|
"central china university science and technology",
|
||||||
|
"hust",
|
||||||
|
"北京理工大学",
|
||||||
|
"北理",
|
||||||
|
"beijing institute of technology",
|
||||||
|
"bit",
|
||||||
|
"东南大学",
|
||||||
|
"东大",
|
||||||
|
"southeast china university",
|
||||||
|
"seu",
|
||||||
|
"中山大学",
|
||||||
|
"中大",
|
||||||
|
"zhongshan university",
|
||||||
|
"sysu",
|
||||||
|
"华东师范大学",
|
||||||
|
"华师大",
|
||||||
|
"east china normal university",
|
||||||
|
"ecnu",
|
||||||
|
"哈尔滨工业大学",
|
||||||
|
"哈工大",
|
||||||
|
"harbin institute of technology",
|
||||||
|
"hit",
|
||||||
|
"厦门大学",
|
||||||
|
"厦大",
|
||||||
|
"xiamen university",
|
||||||
|
"xmu",
|
||||||
|
"西北工业大学",
|
||||||
|
"西工大",
|
||||||
|
"西北工大",
|
||||||
|
"northwestern polytechnical university",
|
||||||
|
"npu",
|
||||||
|
"中南大学",
|
||||||
|
"中南",
|
||||||
|
"middle and southern university",
|
||||||
|
"csu",
|
||||||
|
"大连理工大学",
|
||||||
|
"大工",
|
||||||
|
"institute of technology of dalian",
|
||||||
|
"dut",
|
||||||
|
"四川大学",
|
||||||
|
"川大",
|
||||||
|
"sichuan university",
|
||||||
|
"scu",
|
||||||
|
"电子科技大学",
|
||||||
|
"电子科大",
|
||||||
|
"university of electronic science and technology of china",
|
||||||
|
"uestc",
|
||||||
|
"华南理工大学",
|
||||||
|
"华南理工",
|
||||||
|
"institutes of technology of south china",
|
||||||
|
"scut",
|
||||||
|
"吉林大学",
|
||||||
|
"吉大",
|
||||||
|
"jilin university",
|
||||||
|
"jlu",
|
||||||
|
"湖南大学",
|
||||||
|
"湖大",
|
||||||
|
"hunan university",
|
||||||
|
"hnu",
|
||||||
|
"重庆大学",
|
||||||
|
"重大",
|
||||||
|
"university of chongqing",
|
||||||
|
"cqu",
|
||||||
|
"山东大学",
|
||||||
|
"山大",
|
||||||
|
"shandong university",
|
||||||
|
"sdu",
|
||||||
|
"中国农业大学",
|
||||||
|
"中国农大",
|
||||||
|
"china agricultural university",
|
||||||
|
"cau",
|
||||||
|
"中国海洋大学",
|
||||||
|
"中国海大",
|
||||||
|
"chinese marine university",
|
||||||
|
"ouc",
|
||||||
|
"中央民族大学",
|
||||||
|
"中央民大",
|
||||||
|
"central university for nationalities",
|
||||||
|
"muc",
|
||||||
|
"东北大学",
|
||||||
|
"东北工学院",
|
||||||
|
"northeastern university",
|
||||||
|
"neu 或 nu",
|
||||||
|
"兰州大学",
|
||||||
|
"兰大",
|
||||||
|
"lanzhou university",
|
||||||
|
"lzu",
|
||||||
|
"西北农林科技大学",
|
||||||
|
"西农","西北农大",
|
||||||
|
"northwest a&f university",
|
||||||
|
"nwafu",
|
||||||
|
"中国人民解放军国防科技大学",
|
||||||
|
"国防科技大学","国防科大",
|
||||||
|
"national university of defense technology",
|
||||||
|
"nudt",
|
||||||
|
"郑州大学",
|
||||||
|
"郑大",
|
||||||
|
"zhengzhou university",
|
||||||
|
"zzu",
|
||||||
|
"云南大学",
|
||||||
|
"云大",
|
||||||
|
"yunnan university",
|
||||||
|
"ynu",
|
||||||
|
"新疆大学",
|
||||||
|
"新大",
|
||||||
|
"xinjiang university",
|
||||||
|
"xju",
|
||||||
|
"北京交通大学",
|
||||||
|
"北京交大",
|
||||||
|
"beijing jiaotong university",
|
||||||
|
"bjtu",
|
||||||
|
"北京工业大学",
|
||||||
|
"北工大",
|
||||||
|
"beijing university of technology",
|
||||||
|
"bjut",
|
||||||
|
"北京科技大学",
|
||||||
|
"北科大","北京科大",
|
||||||
|
"university of science and technology beijing",
|
||||||
|
"ustb",
|
||||||
|
"北京化工大学",
|
||||||
|
"北化",
|
||||||
|
"beijing university of chemical technology",
|
||||||
|
"buct",
|
||||||
|
"北京邮电大学",
|
||||||
|
"北邮",
|
||||||
|
"beijing university of posts and telecommunications",
|
||||||
|
"beijing university of post and telecommunications",
|
||||||
|
"beijing university of post and telecommunication",
|
||||||
|
"beijing university of posts and telecommunication",
|
||||||
|
"bupt",
|
||||||
|
"北京林业大学",
|
||||||
|
"北林",
|
||||||
|
"beijing forestry university",
|
||||||
|
"bfu",
|
||||||
|
"北京协和医学院",
|
||||||
|
"协和医学院",
|
||||||
|
"peking union medical college",
|
||||||
|
"pumc",
|
||||||
|
"北京中医药大学",
|
||||||
|
"北中医",
|
||||||
|
"beijing university of chinese medicine",
|
||||||
|
"bucm",
|
||||||
|
"首都师范大学",
|
||||||
|
"首师大",
|
||||||
|
"capital normal university",
|
||||||
|
"cnu",
|
||||||
|
"北京外国语大学",
|
||||||
|
"北外",
|
||||||
|
"beijing foreign studies university",
|
||||||
|
"bfsu",
|
||||||
|
"中国传媒大学",
|
||||||
|
"中媒",
|
||||||
|
"中传",
|
||||||
|
"北京广播学院",
|
||||||
|
"communication university of china",
|
||||||
|
"cuc",
|
||||||
|
"中央财经大学",
|
||||||
|
"中央财大",
|
||||||
|
"中财大",
|
||||||
|
"the central university of finance and economics",
|
||||||
|
"cufe",
|
||||||
|
"对外经济贸易大学",
|
||||||
|
"对外经贸大学",
|
||||||
|
"贸大",
|
||||||
|
"university of international business and economics",
|
||||||
|
"uibe",
|
||||||
|
"外交学院",
|
||||||
|
"外院",
|
||||||
|
"china foreign affairs university",
|
||||||
|
"cfau",
|
||||||
|
"中国人民公安大学",
|
||||||
|
"公安大学",
|
||||||
|
"people's public security university of china",
|
||||||
|
"ppsuc",
|
||||||
|
"北京体育大学",
|
||||||
|
"北体大",
|
||||||
|
"beijing sport university",
|
||||||
|
"bsu",
|
||||||
|
"中央音乐学院",
|
||||||
|
"央音",
|
||||||
|
"中央院",
|
||||||
|
"central conservatory of music",
|
||||||
|
"ccom",
|
||||||
|
"中国音乐学院",
|
||||||
|
"国音",
|
||||||
|
"中国院",
|
||||||
|
"china conservatory of music",
|
||||||
|
"ccmusic",
|
||||||
|
"中央美术学院",
|
||||||
|
"央美",
|
||||||
|
"central academy of fine art",
|
||||||
|
"cafa",
|
||||||
|
"中央戏剧学院",
|
||||||
|
"中戏",
|
||||||
|
"the central academy of drama",
|
||||||
|
"tcad",
|
||||||
|
"中国政法大学",
|
||||||
|
"法大",
|
||||||
|
"china university of political science and law",
|
||||||
|
"zuc",
|
||||||
|
"cupl",
|
||||||
|
"中国科学院大学",
|
||||||
|
"国科大",
|
||||||
|
"科院大",
|
||||||
|
"university of chinese academy of sciences",
|
||||||
|
"ucas",
|
||||||
|
"福州大学",
|
||||||
|
"福大",
|
||||||
|
"university of fuzhou",
|
||||||
|
"fzu",
|
||||||
|
"暨南大学",
|
||||||
|
"暨大",
|
||||||
|
"ji'nan university",
|
||||||
|
"jnu",
|
||||||
|
"广州中医药大学",
|
||||||
|
"广中医",
|
||||||
|
"traditional chinese medicine university of guangzhou",
|
||||||
|
"gucm",
|
||||||
|
"华南师范大学",
|
||||||
|
"华南师大",
|
||||||
|
"south china normal university",
|
||||||
|
"scnu",
|
||||||
|
"广西大学",
|
||||||
|
"西大",
|
||||||
|
"guangxi university",
|
||||||
|
"gxu",
|
||||||
|
"贵州大学",
|
||||||
|
"贵大",
|
||||||
|
"guizhou university",
|
||||||
|
"gzu",
|
||||||
|
"海南大学",
|
||||||
|
"海大",
|
||||||
|
"university of hainan",
|
||||||
|
"hainu",
|
||||||
|
"河南大学",
|
||||||
|
"河大",
|
||||||
|
"he'nan university",
|
||||||
|
"henu",
|
||||||
|
"哈尔滨工程大学",
|
||||||
|
"哈工程",
|
||||||
|
"harbin engineering university",
|
||||||
|
"heu",
|
||||||
|
"东北农业大学",
|
||||||
|
"东北农大",
|
||||||
|
"northeast agricultural university",
|
||||||
|
"neau",
|
||||||
|
"东北林业大学",
|
||||||
|
"东北林大",
|
||||||
|
"northeast forestry university",
|
||||||
|
"nefu",
|
||||||
|
"中国地质大学",
|
||||||
|
"地大",
|
||||||
|
"china university of geosciences",
|
||||||
|
"cug",
|
||||||
|
"武汉理工大学",
|
||||||
|
"武汉理工",
|
||||||
|
"wuhan university of technology",
|
||||||
|
"wut",
|
||||||
|
"华中农业大学",
|
||||||
|
"华中农大",
|
||||||
|
"华农",
|
||||||
|
"central china agricultural university",
|
||||||
|
"hzau",
|
||||||
|
"华中师范大学",
|
||||||
|
"华中师大",
|
||||||
|
"华大",
|
||||||
|
"central china normal university",
|
||||||
|
"ccnu",
|
||||||
|
"中南财经政法大学",
|
||||||
|
"中南大",
|
||||||
|
"zhongnan university of economics & law",
|
||||||
|
"zuel",
|
||||||
|
"湖南师范大学",
|
||||||
|
"湖南师大",
|
||||||
|
"hunan normal university",
|
||||||
|
"hunnu",
|
||||||
|
"延边大学",
|
||||||
|
"延大",
|
||||||
|
"yanbian university",
|
||||||
|
"ybu",
|
||||||
|
"东北师范大学",
|
||||||
|
"东北师大",
|
||||||
|
"northeast normal university",
|
||||||
|
"nenu",
|
||||||
|
"苏州大学",
|
||||||
|
"苏大",
|
||||||
|
"soochow university",
|
||||||
|
"suda",
|
||||||
|
"南京航空航天大学",
|
||||||
|
"南航",
|
||||||
|
"nanjing aero-space university",
|
||||||
|
"nuaa",
|
||||||
|
"南京理工大学",
|
||||||
|
"南理工",
|
||||||
|
"institutes of technology of nanjing",
|
||||||
|
"njust",
|
||||||
|
"中国矿业大学",
|
||||||
|
"中国矿大",
|
||||||
|
"china mining university",
|
||||||
|
"cumt",
|
||||||
|
"南京邮电大学",
|
||||||
|
"南邮",
|
||||||
|
"nanjing university of posts and telecommunications",
|
||||||
|
"njupt",
|
||||||
|
"河海大学",
|
||||||
|
"河海",
|
||||||
|
"river sea university",
|
||||||
|
"hhu",
|
||||||
|
"江南大学",
|
||||||
|
"江南大",
|
||||||
|
"jiangnan university",
|
||||||
|
"jiangnan",
|
||||||
|
"南京林业大学",
|
||||||
|
"南林",
|
||||||
|
"nanjing forestry university",
|
||||||
|
"njfu",
|
||||||
|
"南京信息工程大学",
|
||||||
|
"南信大",
|
||||||
|
"nanjing university of information science and technology",
|
||||||
|
"nuist",
|
||||||
|
"南京农业大学",
|
||||||
|
"南农",
|
||||||
|
"南农大",
|
||||||
|
"南京农大",
|
||||||
|
"agricultural university of nanjing",
|
||||||
|
"njau",
|
||||||
|
"nau",
|
||||||
|
"南京中医药大学",
|
||||||
|
"南中医",
|
||||||
|
"nanjing university of chinese medicine",
|
||||||
|
"njucm",
|
||||||
|
"中国药科大学",
|
||||||
|
"中国药大",
|
||||||
|
"china medicine university",
|
||||||
|
"cpu",
|
||||||
|
"南京师范大学",
|
||||||
|
"南京师大",
|
||||||
|
"南师大",
|
||||||
|
"南师",
|
||||||
|
"nanjing normal university",
|
||||||
|
"nnu",
|
||||||
|
"南昌大学",
|
||||||
|
"昌大",
|
||||||
|
"university of nanchang","nanchang university",
|
||||||
|
"ncu",
|
||||||
|
"辽宁大学",
|
||||||
|
"辽大",
|
||||||
|
"liaoning university",
|
||||||
|
"lnu",
|
||||||
|
"大连海事大学",
|
||||||
|
"大连海大",
|
||||||
|
"海大",
|
||||||
|
"maritime affairs university of dalian",
|
||||||
|
"dmu",
|
||||||
|
"内蒙古大学",
|
||||||
|
"内大",
|
||||||
|
"university of the inner mongol","inner mongolia university",
|
||||||
|
"imu",
|
||||||
|
"宁夏大学",
|
||||||
|
"宁大",
|
||||||
|
"ningxia university",
|
||||||
|
"nxu",
|
||||||
|
"青海大学",
|
||||||
|
"清大",
|
||||||
|
"qinghai university",
|
||||||
|
"qhu",
|
||||||
|
"中国石油大学",
|
||||||
|
"中石大",
|
||||||
|
"china university of petroleum beijing",
|
||||||
|
"upc",
|
||||||
|
"太原理工大学",
|
||||||
|
"太原理工",
|
||||||
|
"institutes of technology of taiyuan","taiyuan university of technology",
|
||||||
|
"tyut",
|
||||||
|
"西北大学",
|
||||||
|
"西大",
|
||||||
|
"northwest university",
|
||||||
|
"nwu",
|
||||||
|
"西安电子科技大学",
|
||||||
|
"西电",
|
||||||
|
"xidian university",
|
||||||
|
"xdu",
|
||||||
|
"长安大学",
|
||||||
|
"长大",
|
||||||
|
"chang`an university",
|
||||||
|
"chu",
|
||||||
|
"陕西师范大学",
|
||||||
|
"陕西师大",
|
||||||
|
"陕师大",
|
||||||
|
"shaanxi normal university",
|
||||||
|
"snnu",
|
||||||
|
"第四军医大学",
|
||||||
|
"空军军医大学","四医大",
|
||||||
|
"air force medical university",
|
||||||
|
"fmmu",
|
||||||
|
"华东理工大学",
|
||||||
|
"华理",
|
||||||
|
"east china university of science",
|
||||||
|
"ecust",
|
||||||
|
"东华大学",
|
||||||
|
"东华",
|
||||||
|
"donghua university",
|
||||||
|
"dhu",
|
||||||
|
"上海海洋大学",
|
||||||
|
"上海海大",
|
||||||
|
"shanghai ocean university",
|
||||||
|
"shou",
|
||||||
|
"上海中医药大学",
|
||||||
|
"上中医",
|
||||||
|
"shanghai university of traditional chinese medicine",
|
||||||
|
"shutcm",
|
||||||
|
"上海外国语大学",
|
||||||
|
"上外",
|
||||||
|
"shanghai international studies university",
|
||||||
|
"sisu",
|
||||||
|
"上海财经大学",
|
||||||
|
"上海财大",
|
||||||
|
"上财",
|
||||||
|
"shanghai university of finance",
|
||||||
|
"sufe",
|
||||||
|
"上海体育学院",
|
||||||
|
"shanghai university of sport",
|
||||||
|
"上海音乐学院",
|
||||||
|
"上音",
|
||||||
|
"shanghai conservatory of music",
|
||||||
|
"shcm",
|
||||||
|
"上海大学",
|
||||||
|
"上大",
|
||||||
|
"shanghai university",
|
||||||
|
"第二军医大学",
|
||||||
|
"海军军医大学",
|
||||||
|
"naval medical university",
|
||||||
|
"西南交通大学",
|
||||||
|
"西南交大",
|
||||||
|
"southwest jiaotong university",
|
||||||
|
"swjtu",
|
||||||
|
"西南石油大学",
|
||||||
|
"西南石大",
|
||||||
|
"southwest petroleum university",
|
||||||
|
"swpu",
|
||||||
|
"成都理工大学",
|
||||||
|
"成都理工",
|
||||||
|
"chengdu university of technology",
|
||||||
|
"cdut ",
|
||||||
|
"四川农业大学",
|
||||||
|
"川农",
|
||||||
|
"川农大",
|
||||||
|
"sichuan agricultural university",
|
||||||
|
"sicau",
|
||||||
|
"成都中医药大学",
|
||||||
|
"成中医",
|
||||||
|
"chengdu university of tcm",
|
||||||
|
"cdutcm",
|
||||||
|
"西南财经大学",
|
||||||
|
"西南财大",
|
||||||
|
"西财",
|
||||||
|
"southwestern university of finance and economics",
|
||||||
|
"swufe",
|
||||||
|
"天津工业大学",
|
||||||
|
"天工大",
|
||||||
|
"tianjin university of technology",
|
||||||
|
"tgu",
|
||||||
|
"天津医科大学",
|
||||||
|
"天津医大",
|
||||||
|
"medical university of tianjin",
|
||||||
|
"tmu",
|
||||||
|
"天津中医药大学",
|
||||||
|
"天中",
|
||||||
|
"tianjin university of traditional chinese medicine",
|
||||||
|
"tutcm",
|
||||||
|
"华北电力大学",
|
||||||
|
"华电",
|
||||||
|
"north china electric power university",
|
||||||
|
"ncepu",
|
||||||
|
"河北工业大学",
|
||||||
|
"河工大",
|
||||||
|
"hebei university of technology",
|
||||||
|
"hebut",
|
||||||
|
"西藏大学",
|
||||||
|
"藏大",
|
||||||
|
"tibet university",
|
||||||
|
"tu",
|
||||||
|
"石河子大学",
|
||||||
|
"石大",
|
||||||
|
"shihezi university",
|
||||||
|
"中国美术学院",
|
||||||
|
"中国美院",
|
||||||
|
"国美",
|
||||||
|
"china academy of art",
|
||||||
|
"caa",
|
||||||
|
"宁波大学",
|
||||||
|
"宁大",
|
||||||
|
"ningbo university",
|
||||||
|
"nbu",
|
||||||
|
"西南大学",
|
||||||
|
"西大",
|
||||||
|
"southwest university",
|
||||||
|
"swu",
|
||||||
|
"安徽大学",
|
||||||
|
"安大",
|
||||||
|
"university of anhui",
|
||||||
|
"ahu",
|
||||||
|
"合肥工业大学",
|
||||||
|
"合肥工大",
|
||||||
|
"合工大",
|
||||||
|
"hefei university of technology",
|
||||||
|
"hfut",
|
||||||
|
"中国地质大学",
|
||||||
|
"地大",
|
||||||
|
"china university of geosciences",
|
||||||
|
"cug",
|
||||||
|
"中国地质大学",
|
||||||
|
"地大",
|
||||||
|
"北京地大",
|
||||||
|
"cugb",
|
||||||
|
"中国矿业大学",
|
||||||
|
"中国矿大",
|
||||||
|
"china university of mining & technology",
|
||||||
|
"cumtb",
|
||||||
|
"中国石油大学",
|
||||||
|
"中石大",
|
||||||
|
"石大",
|
||||||
|
"china university of petroleum",
|
||||||
|
"cup",
|
||||||
|
"中国石油大学",
|
||||||
|
"中石大",
|
||||||
|
"cup"]
|
1627
deepdoc/parser/resume/entities/res/school.rank.csv
Normal file
File diff suppressed because it is too large
Load Diff
5713
deepdoc/parser/resume/entities/res/schools.csv
Normal file
File diff suppressed because it is too large
Load Diff
62
deepdoc/parser/resume/entities/schools.py
Normal file
@ -0,0 +1,62 @@
# -*- coding: UTF-8 -*-
import os, json, re, copy
import pandas as pd

current_file_path = os.path.dirname(os.path.abspath(__file__))
# School table: Chinese name, English name, aliases and flags (985/211/abroad...).
TBL = pd.read_csv(os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0).fillna("")
TBL["name_en"] = TBL["name_en"].map(lambda x: x.lower().strip())
# Whitelist of well-known schools, normalized by dropping punctuation and spaces.
GOOD_SCH = json.load(open(os.path.join(current_file_path, "res/good_sch.json"), "r"))
GOOD_SCH = set([re.sub(r"[,. &()()]+", "", c) for c in GOOD_SCH])


def loadRank(fnm):
    # Attach a rank column to TBL from a "name,rank" CSV; unmatched schools keep 1000000.
    global TBL
    TBL["rank"] = 1000000
    with open(fnm, "r", encoding='UTF-8') as f:
        while True:
            l = f.readline()
            if not l:
                break
            l = l.strip("\n").split(",")
            try:
                nm, rk = l[0].strip(), int(l[1])
                # assert len(TBL[((TBL.name_cn == nm) | (TBL.name_en == nm))]), f"<{nm}>"
                TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk
            except Exception as e:
                pass


loadRank(os.path.join(current_file_path, "res/school.rank.csv"))


def split(txt):
    # Keep consecutive Latin words together as a single token.
    tks = []
    for t in re.sub(r"[ \t]+", " ", txt).split(" "):
        if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
                re.match(r"[a-zA-Z]", t) and tks:
            tks[-1] = tks[-1] + " " + t
        else:
            tks.append(t)
    return tks


def select(nm):
    # Normalize a raw school string and return the matched row of TBL as a dict.
    global TBL
    if not nm:
        return
    if isinstance(nm, list):
        nm = str(nm[0])
    nm = split(nm)[0]
    nm = str(nm).lower().strip()
    nm = re.sub(r"[((][^()()]+[))]", "", nm.lower())
    nm = re.sub(r"(^the |[,.&()();;·]+|^(英国|美国|瑞士))", "", nm)
    nm = re.sub(r"大学.*学院", "大学", nm)
    tbl = copy.deepcopy(TBL)
    tbl["hit_alias"] = tbl["alias"].map(lambda x: nm in set(x.split("+")))
    res = tbl[((tbl.name_cn == nm) | (tbl.name_en == nm) | (tbl.hit_alias == True))]
    if res.empty:
        return

    return json.loads(res.to_json(orient="records"))[0]


def is_good(nm):
    # True if the normalized name is in the good_sch.json whitelist.
    global GOOD_SCH
    nm = re.sub(r"[((][^()()]+[))]", "", nm.lower())
    nm = re.sub(r"[''`‘’“”,. &()();;]+", "", nm)
    return nm in GOOD_SCH
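A minimal usage sketch for the helpers above (illustrative only; it assumes the
schools.csv columns name_cn, name_en, alias and rank referenced by the code exist):

    from deepdoc.parser.resume.entities import schools

    # select() normalizes the raw school string (drops bracketed campuses, leading
    # country prefixes, and collapses "大学...学院" to "大学") and returns the
    # matched row as a dict, or None when nothing matches.
    row = schools.select("清华大学(深圳研究生院)")
    if row:
        print(row.get("rank"))

    # is_good() checks the normalized name against the good_sch.json whitelist.
    print(schools.is_good("beijing university"))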
174
deepdoc/parser/resume/step_one.py
Normal file
@ -0,0 +1,174 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import json
|
||||||
|
from deepdoc.parser.resume.entities import degrees, regions, industries
|
||||||
|
|
||||||
|
FIELDS = [
|
||||||
|
"address STRING",
|
||||||
|
"annual_salary int",
|
||||||
|
"annual_salary_from int",
|
||||||
|
"annual_salary_to int",
|
||||||
|
"birth STRING",
|
||||||
|
"card STRING",
|
||||||
|
"certificate_obj string",
|
||||||
|
"city STRING",
|
||||||
|
"corporation_id int",
|
||||||
|
"corporation_name STRING",
|
||||||
|
"corporation_type STRING",
|
||||||
|
"degree STRING",
|
||||||
|
"discipline_name STRING",
|
||||||
|
"education_obj string",
|
||||||
|
"email STRING",
|
||||||
|
"expect_annual_salary int",
|
||||||
|
"expect_city_names string",
|
||||||
|
"expect_industry_name STRING",
|
||||||
|
"expect_position_name STRING",
|
||||||
|
"expect_salary_from int",
|
||||||
|
"expect_salary_to int",
|
||||||
|
"expect_type STRING",
|
||||||
|
"gender STRING",
|
||||||
|
"industry_name STRING",
|
||||||
|
"industry_names STRING",
|
||||||
|
"is_deleted STRING",
|
||||||
|
"is_fertility STRING",
|
||||||
|
"is_house STRING",
|
||||||
|
"is_management_experience STRING",
|
||||||
|
"is_marital STRING",
|
||||||
|
"is_oversea STRING",
|
||||||
|
"language_obj string",
|
||||||
|
"name STRING",
|
||||||
|
"nation STRING",
|
||||||
|
"phone STRING",
|
||||||
|
"political_status STRING",
|
||||||
|
"position_name STRING",
|
||||||
|
"project_obj string",
|
||||||
|
"responsibilities string",
|
||||||
|
"salary_month int",
|
||||||
|
"scale STRING",
|
||||||
|
"school_name STRING",
|
||||||
|
"self_remark string",
|
||||||
|
"skill_obj string",
|
||||||
|
"title_name STRING",
|
||||||
|
"tob_resume_id STRING",
|
||||||
|
"updated_at Timestamp",
|
||||||
|
"wechat STRING",
|
||||||
|
"work_obj string",
|
||||||
|
"work_experience int",
|
||||||
|
"work_start_time BIGINT"
|
||||||
|
]
|
||||||
|
|
||||||
|
def refactor(df):
|
||||||
|
def deal_obj(obj, k, kk):
|
||||||
|
if not isinstance(obj, type({})):
|
||||||
|
return ""
|
||||||
|
obj = obj.get(k, {})
|
||||||
|
if not isinstance(obj, type({})):
|
||||||
|
return ""
|
||||||
|
return obj.get(kk, "")
|
||||||
|
|
||||||
|
def loadjson(line):
|
||||||
|
try:
|
||||||
|
return json.loads(line)
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
return {}
|
||||||
|
|
||||||
|
df["obj"] = df["resume_content"].map(lambda x: loadjson(x))
|
||||||
|
df.fillna("", inplace=True)
|
||||||
|
|
||||||
|
clms = ["tob_resume_id", "updated_at"]
|
||||||
|
|
||||||
|
def extract(nms, cc=None):
|
||||||
|
nonlocal clms
|
||||||
|
clms.extend(nms)
|
||||||
|
for c in nms:
|
||||||
|
if cc:
|
||||||
|
df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
|
||||||
|
else:
    df[c] = df["obj"].map(
        lambda x: json.dumps(x.get(c, {}), ensure_ascii=False)
        if isinstance(x, type({})) and (isinstance(x.get(c), type({})) or not x.get(c))
        else str(x).replace("None", ""))
|
||||||
|
|
||||||
|
extract(["education", "work", "certificate", "project", "language",
|
||||||
|
"skill"])
|
||||||
|
extract(["wechat", "phone", "is_deleted",
|
||||||
|
"name", "tel", "email"], "contact")
|
||||||
|
extract(["nation", "expect_industry_name", "salary_month",
|
||||||
|
"industry_ids", "is_house", "birth", "annual_salary_from",
|
||||||
|
"annual_salary_to", "card",
|
||||||
|
"expect_salary_to", "expect_salary_from",
|
||||||
|
"expect_position_name", "gender", "city",
|
||||||
|
"is_fertility", "expect_city_names",
|
||||||
|
"political_status", "title_name", "expect_annual_salary",
|
||||||
|
"industry_name", "address", "position_name", "school_name",
|
||||||
|
"corporation_id",
|
||||||
|
"is_oversea", "responsibilities",
|
||||||
|
"work_start_time", "degree", "management_experience",
|
||||||
|
"expect_type", "corporation_type", "scale", "corporation_name",
|
||||||
|
"self_remark", "annual_salary", "work_experience",
|
||||||
|
"discipline_name", "marital", "updated_at"], "basic")
|
||||||
|
|
||||||
|
df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
|
||||||
|
df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
|
||||||
|
df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in
|
||||||
|
str(x).split(",")]))
|
||||||
|
clms.append("industry_names")
|
||||||
|
|
||||||
|
def arr2str(a):
|
||||||
|
if not a:
|
||||||
|
return ""
|
||||||
|
if isinstance(a, list):
|
||||||
|
a = " ".join([str(i) for i in a])
|
||||||
|
return str(a).replace(",", " ")
|
||||||
|
|
||||||
|
df["expect_industry_name"] = df["expect_industry_name"].map(
|
||||||
|
lambda x: arr2str(x))
|
||||||
|
df["gender"] = df["gender"].map(
|
||||||
|
lambda x: "男" if x == 'M' else (
|
||||||
|
"女" if x == 'F' else ""))
|
||||||
|
for c in ["is_fertility", "is_oversea", "is_house",
|
||||||
|
"management_experience", "marital"]:
|
||||||
|
df[c] = df[c].map(
|
||||||
|
lambda x: '是' if x == 'Y' else (
|
||||||
|
'否' if x == 'N' else ""))
|
||||||
|
df["is_management_experience"] = df["management_experience"]
|
||||||
|
df["is_marital"] = df["marital"]
|
||||||
|
clms.extend(["is_management_experience", "is_marital"])
|
||||||
|
|
||||||
|
df.fillna("", inplace=True)
|
||||||
|
for i in range(len(df)):
|
||||||
|
if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip():
|
||||||
|
df.loc[i, "phone"] = df.loc[i, "tel"].strip()
|
||||||
|
|
||||||
|
for n in ["industry_ids", "management_experience", "marital", "tel"]:
|
||||||
|
for i in range(len(clms)):
|
||||||
|
if clms[i] == n:
|
||||||
|
del clms[i]
|
||||||
|
break
|
||||||
|
|
||||||
|
clms = list(set(clms))
|
||||||
|
|
||||||
|
df = df.reindex(sorted(clms), axis=1)
|
||||||
|
#print(json.dumps(list(df.columns.values)), "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL")
|
||||||
|
for c in clms:
    df[c] = df[c].map(lambda s: str(s).replace("\t", " ").replace("\n", "\\n").replace("\r", "\\n"))
|
||||||
|
# print(df.values.tolist())
|
||||||
|
return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
|
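The return value of refactor() pairs each entry of FIELDS (the column name before
the type hint) with the single flattened row. A tiny self-contained illustration of
that last line (values are made up):

    fields = ["address STRING", "annual_salary int", "name STRING"]
    values = ["北京市朝阳区", 300000, "张三"]
    print(dict(zip([n.split(" ")[0] for n in fields], values)))
    # {'address': '北京市朝阳区', 'annual_salary': 300000, 'name': '张三'}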
580
deepdoc/parser/resume/step_two.py
Normal file
@ -0,0 +1,580 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re, copy, time, datetime, demjson, \
|
||||||
|
traceback, signal
|
||||||
|
import numpy as np
|
||||||
|
from deepdoc.parser.resume.entities import degrees, schools, corporations
|
||||||
|
from rag.nlp import huqie, surname
|
||||||
|
from xpinyin import Pinyin
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
|
||||||
|
class TimeoutException(Exception): pass
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def time_limit(seconds):
|
||||||
|
def signal_handler(signum, frame):
|
||||||
|
raise TimeoutException("Timed out!")
|
||||||
|
|
||||||
|
signal.signal(signal.SIGALRM, signal_handler)
|
||||||
|
signal.alarm(seconds)
|
||||||
|
try:
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
signal.alarm(0)
|
||||||
|
|
||||||
|
|
||||||
|
ENV = None
|
||||||
|
PY = Pinyin()
|
||||||
|
|
||||||
|
|
||||||
|
def rmHtmlTag(line):
|
||||||
|
return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, 100000, re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def highest_degree(dg):
|
||||||
|
if not dg: return ""
|
||||||
|
if type(dg) == type(""): dg = [dg]
|
||||||
|
m = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
|
||||||
|
return sorted([(d, m.get(d, -1)) for d in dg], key=lambda x: x[1] * -1)[0][0]
|
||||||
|
|
||||||
|
|
||||||
|
def forEdu(cv):
|
||||||
|
if not cv.get("education_obj"):
|
||||||
|
cv["integerity_flt"] *= 0.8
|
||||||
|
return cv
|
||||||
|
|
||||||
|
first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
|
||||||
|
edu_nst = []
|
||||||
|
edu_end_dt = ""
|
||||||
|
cv["school_rank_int"] = 1000000
|
||||||
|
for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))):
|
||||||
|
e = {}
|
||||||
|
if n.get("end_time"):
|
||||||
|
if n["end_time"] > edu_end_dt: edu_end_dt = n["end_time"]
|
||||||
|
try:
|
||||||
|
dt = n["end_time"]
|
||||||
|
if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
|
||||||
|
y, m, d = getYMD(dt)
|
||||||
|
ed_dt.append(str(y))
|
||||||
|
e["end_dt_kwd"] = str(y)
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
if n.get("start_time"):
|
||||||
|
try:
|
||||||
|
dt = n["start_time"]
|
||||||
|
if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
|
||||||
|
y, m, d = getYMD(dt)
|
||||||
|
st_dt.append(str(y))
|
||||||
|
e["start_dt_kwd"] = str(y)
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
|
||||||
|
r = schools.select(n.get("school_name", ""))
|
||||||
|
if r:
|
||||||
|
if str(r.get("type", "")) == "1": fea.append("211")
|
||||||
|
if str(r.get("type", "")) == "2": fea.append("211")
|
||||||
|
if str(r.get("is_abroad", "")) == "1": fea.append("留学")
|
||||||
|
if str(r.get("is_double_first", "")) == "1": fea.append("双一流")
|
||||||
|
if str(r.get("is_985", "")) == "1": fea.append("985")
|
||||||
|
if str(r.get("is_world_known", "")) == "1": fea.append("海外知名")
|
||||||
|
if r.get("rank") and cv["school_rank_int"] > r["rank"]: cv["school_rank_int"] = r["rank"]
|
||||||
|
|
||||||
|
if n.get("school_name") and isinstance(n["school_name"], str):
|
||||||
|
sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
|
||||||
|
e["sch_nm_kwd"] = sch[-1]
|
||||||
|
fea.append(huqie.qieqie(huqie.qie(n.get("school_name", ""))).split(" ")[-1])
|
||||||
|
|
||||||
|
if n.get("discipline_name") and isinstance(n["discipline_name"], str):
|
||||||
|
maj.append(n["discipline_name"])
|
||||||
|
e["major_kwd"] = n["discipline_name"]
|
||||||
|
|
||||||
|
if not n.get("degree") and "985" in fea and not first_fea: n["degree"] = "1"
|
||||||
|
|
||||||
|
if n.get("degree"):
|
||||||
|
d = degrees.get_name(n["degree"])
|
||||||
|
if d: e["degree_kwd"] = d
|
||||||
|
if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)",
|
||||||
|
n.get(
|
||||||
|
"school_name",
|
||||||
|
""))): d = "专升本"
|
||||||
|
if d: deg.append(d)
|
||||||
|
|
||||||
|
# for first degree
|
||||||
|
if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
|
||||||
|
fdeg = [d]
|
||||||
|
if n.get("school_name"): fsch = [n["school_name"]]
|
||||||
|
if n.get("discipline_name"): fmaj = [n["discipline_name"]]
|
||||||
|
first_fea = copy.deepcopy(fea)
|
||||||
|
|
||||||
|
edu_nst.append(e)
|
||||||
|
|
||||||
|
cv["sch_rank_kwd"] = []
|
||||||
|
if cv["school_rank_int"] <= 20 \
|
||||||
|
or ("海外名校" in fea and cv["school_rank_int"] <= 200):
|
||||||
|
cv["sch_rank_kwd"].append("顶尖学校")
|
||||||
|
elif cv["school_rank_int"] <= 50 and cv["school_rank_int"] > 20 \
|
||||||
|
or ("海外名校" in fea and cv["school_rank_int"] <= 500 and \
|
||||||
|
cv["school_rank_int"] > 200):
|
||||||
|
cv["sch_rank_kwd"].append("精英学校")
|
||||||
|
elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) \
|
||||||
|
or ("海外名校" in fea and cv["school_rank_int"] > 500):
|
||||||
|
cv["sch_rank_kwd"].append("优质学校")
|
||||||
|
else:
|
||||||
|
cv["sch_rank_kwd"].append("一般学校")
|
||||||
|
|
||||||
|
if edu_nst: cv["edu_nst"] = edu_nst
|
||||||
|
if fea: cv["edu_fea_kwd"] = list(set(fea))
|
||||||
|
if first_fea: cv["edu_first_fea_kwd"] = list(set(first_fea))
|
||||||
|
if maj: cv["major_kwd"] = maj
|
||||||
|
if fsch: cv["first_school_name_kwd"] = fsch
|
||||||
|
if fdeg: cv["first_degree_kwd"] = fdeg
|
||||||
|
if fmaj: cv["first_major_kwd"] = fmaj
|
||||||
|
if st_dt: cv["edu_start_kwd"] = st_dt
|
||||||
|
if ed_dt: cv["edu_end_kwd"] = ed_dt
|
||||||
|
if ed_dt: cv["edu_end_int"] = max([int(t) for t in ed_dt])
|
||||||
|
if deg:
|
||||||
|
if "本科" in deg and "专科" in deg:
|
||||||
|
deg.append("专升本")
|
||||||
|
deg = [d for d in deg if d != '本科']
|
||||||
|
cv["degree_kwd"] = deg
|
||||||
|
cv["highest_degree_kwd"] = highest_degree(deg)
|
||||||
|
if edu_end_dt:
|
||||||
|
try:
|
||||||
|
if re.match(r"[0-9]{9,}", edu_end_dt): edu_end_dt = turnTm2Dt(edu_end_dt)
|
||||||
|
if edu_end_dt.strip("\n") == "至今": edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
|
||||||
|
y, m, d = getYMD(edu_end_dt)
|
||||||
|
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
|
||||||
|
except Exception as e:
|
||||||
|
print("EXCEPTION: ", e, edu_end_dt, cv.get("work_exp_flt"))
|
||||||
|
if sch:
|
||||||
|
cv["school_name_kwd"] = sch
|
||||||
|
if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
|
||||||
|
or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
|
||||||
|
or not cv.get("degree_kwd"):
|
||||||
|
for c in sch:
|
||||||
|
if schools.is_good(c):
|
||||||
|
if "tag_kwd" not in cv: cv["tag_kwd"] = []
|
||||||
|
cv["tag_kwd"].append("好学校")
|
||||||
|
cv["tag_kwd"].append("好学历")
|
||||||
|
break
|
||||||
|
if (len(cv.get("degree_kwd", [])) >= 1 and \
|
||||||
|
"本科" in cv["degree_kwd"] and \
|
||||||
|
any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
|
||||||
|
or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
|
||||||
|
or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
|
||||||
|
if "tag_kwd" not in cv: cv["tag_kwd"] = []
|
||||||
|
if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
|
||||||
|
|
||||||
|
if cv.get("major_kwd"): cv["major_tks"] = huqie.qie(" ".join(maj))
|
||||||
|
if cv.get("school_name_kwd"): cv["school_name_tks"] = huqie.qie(" ".join(sch))
|
||||||
|
if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = huqie.qie(" ".join(fsch))
|
||||||
|
if cv.get("first_major_kwd"): cv["first_major_tks"] = huqie.qie(" ".join(fmaj))
|
||||||
|
|
||||||
|
return cv
|
||||||
|
|
||||||
|
|
||||||
|
def forProj(cv):
|
||||||
|
if not cv.get("project_obj"): return cv
|
||||||
|
|
||||||
|
pro_nms, desc = [], []
|
||||||
|
for i, n in enumerate(
|
||||||
|
sorted(cv.get("project_obj", []), key=lambda x: str(x.get("updated_at", "")) if type(x) == type({}) else "",
|
||||||
|
reverse=True)):
|
||||||
|
if n.get("name"): pro_nms.append(n["name"])
|
||||||
|
if n.get("describe"): desc.append(str(n["describe"]))
|
||||||
|
if n.get("responsibilities"): desc.append(str(n["responsibilities"]))
|
||||||
|
if n.get("achivement"): desc.append(str(n["achivement"]))
|
||||||
|
|
||||||
|
if pro_nms:
|
||||||
|
# cv["pro_nms_tks"] = huqie.qie(" ".join(pro_nms))
|
||||||
|
cv["project_name_tks"] = huqie.qie(pro_nms[0])
|
||||||
|
if desc:
|
||||||
|
cv["pro_desc_ltks"] = huqie.qie(rmHtmlTag(" ".join(desc)))
|
||||||
|
cv["project_desc_ltks"] = huqie.qie(rmHtmlTag(desc[0]))
|
||||||
|
|
||||||
|
return cv
|
||||||
|
|
||||||
|
|
||||||
|
def json_loads(line):
|
||||||
|
return demjson.decode(re.sub(r": *(True|False)", r": '\1'", line))
|
||||||
|
|
||||||
|
|
||||||
|
def forWork(cv):
|
||||||
|
if not cv.get("work_obj"):
|
||||||
|
cv["integerity_flt"] *= 0.7
|
||||||
|
return cv
|
||||||
|
|
||||||
|
flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
|
||||||
|
"industry_name", "subordinates_count"]
|
||||||
|
duas = []
|
||||||
|
scales = []
|
||||||
|
fea = {c: [] for c in flds}
|
||||||
|
latest_job_tm = ""
|
||||||
|
goodcorp = False
|
||||||
|
goodcorp_ = False
|
||||||
|
work_st_tm = ""
|
||||||
|
corp_tags = []
|
||||||
|
for i, n in enumerate(
|
||||||
|
sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if type(x) == type({}) else "",
|
||||||
|
reverse=True)):
|
||||||
|
if type(n) == type(""):
|
||||||
|
try:
|
||||||
|
n = json_loads(n)
|
||||||
|
except Exception as e:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm): work_st_tm = n["start_time"]
|
||||||
|
for c in flds:
|
||||||
|
if not n.get(c) or str(n[c]) == '0':
|
||||||
|
fea[c].append("")
|
||||||
|
continue
|
||||||
|
if c == "corporation_name":
|
||||||
|
n[c] = corporations.corpNorm(n[c], False)
|
||||||
|
if corporations.is_good(n[c]):
|
||||||
|
if i == 0:
|
||||||
|
goodcorp = True
|
||||||
|
else:
|
||||||
|
goodcorp_ = True
|
||||||
|
ct = corporations.corp_tag(n[c])
|
||||||
|
if i == 0:
|
||||||
|
corp_tags.extend(ct)
|
||||||
|
elif ct and ct[0] != "软外":
|
||||||
|
corp_tags.extend([f"{t}(曾)" for t in ct])
|
||||||
|
|
||||||
|
fea[c].append(rmHtmlTag(str(n[c]).lower()))
|
||||||
|
|
||||||
|
y, m, d = getYMD(n.get("start_time"))
|
||||||
|
if not y or not m: continue
|
||||||
|
st = "%s-%02d-%02d" % (y, int(m), int(d))
|
||||||
|
latest_job_tm = st
|
||||||
|
|
||||||
|
y, m, d = getYMD(n.get("end_time"))
|
||||||
|
if (not y or not m) and i > 0: continue
|
||||||
|
if not y or not m or int(y) > 2022: y, m, d = getYMD(str(n.get("updated_at", "")))
|
||||||
|
if not y or not m: continue
|
||||||
|
ed = "%s-%02d-%02d" % (y, int(m), int(d))
|
||||||
|
|
||||||
|
try:
|
||||||
|
duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
|
||||||
|
except Exception as e:
|
||||||
|
print("kkkkkkkkkkkkkkkkkkkk", n.get("start_time"), n.get("end_time"))
|
||||||
|
|
||||||
|
if n.get("scale"):
|
||||||
|
r = re.search(r"^([0-9]+)", str(n["scale"]))
|
||||||
|
if r: scales.append(int(r.group(1)))
|
||||||
|
|
||||||
|
if goodcorp:
|
||||||
|
if "tag_kwd" not in cv: cv["tag_kwd"] = []
|
||||||
|
cv["tag_kwd"].append("好公司")
|
||||||
|
if goodcorp_:
|
||||||
|
if "tag_kwd" not in cv: cv["tag_kwd"] = []
|
||||||
|
cv["tag_kwd"].append("好公司(曾)")
|
||||||
|
|
||||||
|
if corp_tags:
|
||||||
|
if "tag_kwd" not in cv: cv["tag_kwd"] = []
|
||||||
|
cv["tag_kwd"].extend(corp_tags)
|
||||||
|
cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]
|
||||||
|
|
||||||
|
if latest_job_tm: cv["latest_job_dt"] = latest_job_tm
|
||||||
|
if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
|
||||||
|
|
||||||
|
if fea["position_name"]:
|
||||||
|
cv["position_name_tks"] = huqie.qie(fea["position_name"][0])
|
||||||
|
cv["position_name_sm_tks"] = huqie.qieqie(cv["position_name_tks"])
|
||||||
|
cv["pos_nm_tks"] = huqie.qie(" ".join(fea["position_name"][1:]))
|
||||||
|
|
||||||
|
if fea["industry_name"]:
|
||||||
|
cv["industry_name_tks"] = huqie.qie(fea["industry_name"][0])
|
||||||
|
cv["industry_name_sm_tks"] = huqie.qieqie(cv["industry_name_tks"])
|
||||||
|
cv["indu_nm_tks"] = huqie.qie(" ".join(fea["industry_name"][1:]))
|
||||||
|
|
||||||
|
if fea["corporation_name"]:
|
||||||
|
cv["corporation_name_kwd"] = fea["corporation_name"][0]
|
||||||
|
cv["corp_nm_kwd"] = fea["corporation_name"]
|
||||||
|
cv["corporation_name_tks"] = huqie.qie(fea["corporation_name"][0])
|
||||||
|
cv["corporation_name_sm_tks"] = huqie.qieqie(cv["corporation_name_tks"])
|
||||||
|
cv["corp_nm_tks"] = huqie.qie(" ".join(fea["corporation_name"][1:]))
|
||||||
|
|
||||||
|
if fea["responsibilities"]:
|
||||||
|
cv["responsibilities_ltks"] = huqie.qie(fea["responsibilities"][0])
|
||||||
|
cv["resp_ltks"] = huqie.qie(" ".join(fea["responsibilities"][1:]))
|
||||||
|
|
||||||
|
if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
|
||||||
|
re.match(r"[^0-9]+$", str(i))]
|
||||||
|
if fea["subordinates_count"]: cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])
|
||||||
|
|
||||||
|
if type(cv.get("corporation_id")) == type(1): cv["corporation_id"] = [str(cv["corporation_id"])]
|
||||||
|
if not cv.get("corporation_id"): cv["corporation_id"] = []
|
||||||
|
for i in cv.get("corporation_id", []):
|
||||||
|
cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)
|
||||||
|
|
||||||
|
if work_st_tm:
|
||||||
|
try:
|
||||||
|
if re.match(r"[0-9]{9,}", work_st_tm): work_st_tm = turnTm2Dt(work_st_tm)
|
||||||
|
y, m, d = getYMD(work_st_tm)
|
||||||
|
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
|
||||||
|
except Exception as e:
|
||||||
|
print("EXCEPTION: ", e, work_st_tm, cv.get("work_exp_flt"))
|
||||||
|
|
||||||
|
cv["job_num_int"] = 0
|
||||||
|
if duas:
|
||||||
|
cv["dua_flt"] = np.mean(duas)
|
||||||
|
cv["cur_dua_int"] = duas[0]
|
||||||
|
cv["job_num_int"] = len(duas)
|
||||||
|
if scales: cv["scale_flt"] = np.max(scales)
|
||||||
|
return cv
|
||||||
|
|
||||||
|
|
||||||
|
def turnTm2Dt(b):
|
||||||
|
if not b: return
|
||||||
|
b = str(b).strip()
|
||||||
|
if re.match(r"[0-9]{10,}", b): b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10])))
|
||||||
|
return b
|
||||||
|
|
||||||
|
|
||||||
|
def getYMD(b):
|
||||||
|
y, m, d = "", "", "01"
|
||||||
|
if not b: return (y, m, d)
|
||||||
|
b = turnTm2Dt(b)
|
||||||
|
if re.match(r"[0-9]{4}", b): y = int(b[:4])
|
||||||
|
r = re.search(r"[0-9]{4}.?([0-9]{1,2})", b)
|
||||||
|
if r: m = r.group(1)
|
||||||
|
r = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", b)
|
||||||
|
if r: d = r.group(1)
|
||||||
|
if not d or int(d) == 0 or int(d) > 31: d = "1"
|
||||||
|
if not m or int(m) > 12 or int(m) < 1: m = "1"
|
||||||
|
return (y, m, d)
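# Illustration (not part of the original file): turnTm2Dt() converts 10+ digit epoch
# strings into a local "YYYY-MM-DD HH:MM:SS" string, and getYMD() then tolerates the
# loose date shapes found in resumes, e.g.
#   getYMD("2015.08.03") -> (2015, "08", "03")
#   getYMD("")           -> ("", "", "01")
# with missing or out-of-range month/day values reset to "1".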
|
||||||
|
|
||||||
|
|
||||||
|
def birth(cv):
|
||||||
|
if not cv.get("birth"):
|
||||||
|
cv["integerity_flt"] *= 0.9
|
||||||
|
return cv
|
||||||
|
y, m, d = getYMD(cv["birth"])
|
||||||
|
if not m or not y: return cv
|
||||||
|
b = "%s-%02d-%02d" % (y, int(m), int(d))
|
||||||
|
cv["birth_dt"] = b
|
||||||
|
cv["birthday_kwd"] = "%02d%02d" % (int(m), int(d))
|
||||||
|
|
||||||
|
cv["age_int"] = datetime.datetime.now().year - int(y)
|
||||||
|
return cv
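# Illustration (not part of the original file): for a cv with birth "1995-06-18" the
# block above yields birth_dt "1995-06-18", birthday_kwd "0618" and age_int equal to
# the current year minus 1995; a resume without a birth date only has its
# integerity_flt multiplied by 0.9.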
|
||||||
|
|
||||||
|
|
||||||
|
def parse(cv):
|
||||||
|
for k in cv.keys():
|
||||||
|
if cv[k] == '\\N': cv[k] = ''
|
||||||
|
# cv = cv.asDict()
|
||||||
|
tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
|
||||||
|
"expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
|
||||||
|
"position_name", "school_name", "self_remark", "title_name"]
|
||||||
|
small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]
|
||||||
|
kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
|
||||||
|
"expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
|
||||||
|
"industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]
|
||||||
|
num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
|
||||||
|
"expect_salary_to", "salary_month"]
|
||||||
|
|
||||||
|
is_fld = [
|
||||||
|
("is_fertility", "已育", "未育"),
|
||||||
|
("is_house", "有房", "没房"),
|
||||||
|
("is_management_experience", "有管理经验", "无管理经验"),
|
||||||
|
("is_marital", "已婚", "未婚"),
|
||||||
|
("is_oversea", "有海外经验", "无海外经验")
|
||||||
|
]
|
||||||
|
|
||||||
|
rmkeys = []
|
||||||
|
for k in cv.keys():
|
||||||
|
if cv[k] is None: rmkeys.append(k)
|
||||||
|
if (type(cv[k]) == type([]) or type(cv[k]) == type("")) and len(cv[k]) == 0: rmkeys.append(k)
|
||||||
|
for k in rmkeys: del cv[k]
|
||||||
|
|
||||||
|
integerity = 0.
|
||||||
|
flds_num = 0.
|
||||||
|
|
||||||
|
def hasValues(flds):
|
||||||
|
nonlocal integerity, flds_num
|
||||||
|
flds_num += len(flds)
|
||||||
|
for f in flds:
|
||||||
|
v = str(cv.get(f, ""))
|
||||||
|
if len(v) > 0 and v != '0' and v != '[]': integerity += 1
|
||||||
|
|
||||||
|
hasValues(tks_fld)
|
||||||
|
hasValues(small_tks_fld)
|
||||||
|
hasValues(kwd_fld)
|
||||||
|
hasValues(num_fld)
|
||||||
|
cv["integerity_flt"] = integerity / flds_num
|
||||||
|
|
||||||
|
if cv.get("corporation_type"):
|
||||||
|
for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
|
||||||
|
(r"[//.· <\((]+.*", ""),
|
||||||
|
(r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
|
||||||
|
(r".*(机关|事业).*", "机关"),
|
||||||
|
(r".*(非盈利|Non-profit).*", "非盈利"),
|
||||||
|
(r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
|
||||||
|
(r".*国有.*", "国企"),
|
||||||
|
(r"[ ()\(\)人/·0-9-]+", ""),
|
||||||
|
(r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
|
||||||
|
cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE)
|
||||||
|
if len(cv["corporation_type"]) < 2: del cv["corporation_type"]
|
||||||
|
|
||||||
|
if cv.get("political_status"):
|
||||||
|
for p, r in [
|
||||||
|
(r".*党员.*", "党员"),
|
||||||
|
(r".*(无党派|公民).*", "群众"),
|
||||||
|
(r".*团员.*", "团员")]:
|
||||||
|
cv["political_status"] = re.sub(p, r, cv["political_status"])
|
||||||
|
if not re.search(r"[党团群]", cv["political_status"]): del cv["political_status"]
|
||||||
|
|
||||||
|
if cv.get("phone"): cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))
|
||||||
|
|
||||||
|
keys = list(cv.keys())
|
||||||
|
for k in keys:
|
||||||
|
# deal with json objects
|
||||||
|
if k.find("_obj") > 0:
|
||||||
|
try:
|
||||||
|
cv[k] = json_loads(cv[k])
|
||||||
|
cv[k] = [a for _, a in cv[k].items()]
|
||||||
|
nms = []
|
||||||
|
for n in cv[k]:
|
||||||
|
if type(n) != type({}) or "name" not in n or not n.get("name"): continue
|
||||||
|
n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
|
||||||
|
if not n["name"]: continue
|
||||||
|
nms.append(n["name"])
|
||||||
|
if nms:
|
||||||
|
t = k[:-4]
|
||||||
|
cv[f"{t}_kwd"] = nms
|
||||||
|
cv[f"{t}_tks"] = huqie.qie(" ".join(nms))
|
||||||
|
except Exception as e:
|
||||||
|
print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
|
||||||
|
cv[k] = []
|
||||||
|
|
||||||
|
# tokenize fields
|
||||||
|
if k in tks_fld:
|
||||||
|
cv[f"{k}_tks"] = huqie.qie(cv[k])
|
||||||
|
if k in small_tks_fld: cv[f"{k}_sm_tks"] = huqie.qie(cv[f"{k}_tks"])
|
||||||
|
|
||||||
|
# keyword fields
|
||||||
|
if k in kwd_fld:
    cv[f"{k}_kwd"] = [n.lower()
                      for n in re.split(r"[\t,,;;. ]",
                                        re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k]))
                      if n]
|
||||||
|
|
||||||
|
if k in num_fld and cv.get(k): cv[f"{k}_int"] = cv[k]
|
||||||
|
|
||||||
|
cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
|
||||||
|
# for name field
|
||||||
|
if cv.get("name"):
|
||||||
|
nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
|
||||||
|
nm = re.sub(r"[ \t ]+", " ", nm)
|
||||||
|
if re.match(r"[a-zA-Z ]+$", nm):
|
||||||
|
if len(nm.split(" ")) > 1:
|
||||||
|
cv["name"] = nm
|
||||||
|
else:
|
||||||
|
nm = ""
|
||||||
|
elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
|
||||||
|
nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
|
||||||
|
else:
|
||||||
|
nm = ""
|
||||||
|
cv["name"] = nm.strip()
|
||||||
|
name = cv["name"]
|
||||||
|
|
||||||
|
# name pingyin and its prefix
|
||||||
|
cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' '))
|
||||||
|
cv["name_py_pref0_tks"] = ""
|
||||||
|
cv["name_py_pref_tks"] = ""
|
||||||
|
for py in PY.get_pinyins(nm[:20], ''):
|
||||||
|
for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i]
|
||||||
|
for py in PY.get_pinyins(nm[:20], ' '):
|
||||||
|
py = py.split(" ")
|
||||||
|
for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i])
|
||||||
|
|
||||||
|
cv["name_kwd"] = name
|
||||||
|
cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
|
||||||
|
cv["name_tks"] = (
|
||||||
|
huqie.qie(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
|
||||||
|
) if name else ""
|
||||||
|
else:
|
||||||
|
cv["integerity_flt"] /= 2.
|
||||||
|
|
||||||
|
if cv.get("phone"):
|
||||||
|
r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
|
||||||
|
if not r:
|
||||||
|
cv["phone"] = ""
|
||||||
|
else:
|
||||||
|
cv["phone"] = r.group(1)
|
||||||
|
|
||||||
|
# deal with date fields
|
||||||
|
if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
|
||||||
|
cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
|
||||||
|
else:
|
||||||
|
y, m, d = getYMD(str(cv.get("updated_at", "")))
|
||||||
|
if not y: y = "2012"
|
||||||
|
if not m: m = "01"
|
||||||
|
if not d: d = "01"
|
||||||
|
cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
|
||||||
|
# long text tokenize
|
||||||
|
|
||||||
|
if cv.get("responsibilities"): cv["responsibilities_ltks"] = huqie.qie(rmHtmlTag(cv["responsibilities"]))
|
||||||
|
|
||||||
|
# for yes or no field
|
||||||
|
fea = []
|
||||||
|
for f, y, n in is_fld:
|
||||||
|
if f not in cv: continue
|
||||||
|
if cv[f] == '是': fea.append(y)
|
||||||
|
if cv[f] == '否': fea.append(n)
|
||||||
|
|
||||||
|
if fea: cv["tag_kwd"] = fea
|
||||||
|
|
||||||
|
cv = forEdu(cv)
|
||||||
|
cv = forProj(cv)
|
||||||
|
cv = forWork(cv)
|
||||||
|
cv = birth(cv)
|
||||||
|
|
||||||
|
cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
|
||||||
|
for i in range(len(cv["corp_proj_sch_deg_kwd"])):
|
||||||
|
for j in cv.get("sch_rank_kwd", []): cv["corp_proj_sch_deg_kwd"][i] += "+" + j
|
||||||
|
for i in range(len(cv["corp_proj_sch_deg_kwd"])):
|
||||||
|
if cv.get("highest_degree_kwd"): cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]
|
||||||
|
|
||||||
|
try:
|
||||||
|
if not cv.get("work_exp_flt") and cv.get("work_start_time"):
|
||||||
|
if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
|
||||||
|
cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
|
||||||
|
cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
|
||||||
|
elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
|
||||||
|
y, m, d = getYMD(str(cv["work_start_time"]))
|
||||||
|
cv["work_start_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
|
||||||
|
cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
|
||||||
|
except Exception as e:
|
||||||
|
print("【EXCEPTION】", e, "==>", cv.get("work_start_time"))
|
||||||
|
if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12.
|
||||||
|
|
||||||
|
keys = list(cv.keys())
|
||||||
|
for k in keys:
|
||||||
|
if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k): del cv[k]
|
||||||
|
for k in cv.keys():
|
||||||
|
if not re.search("_(kwd|id)$", k) or type(cv[k]) != type([]): continue
|
||||||
|
cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
|
||||||
|
keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
|
||||||
|
for k in keys:
|
||||||
|
if cv[k] <= 0: del cv[k]
|
||||||
|
|
||||||
|
cv["tob_resume_id"] = str(cv["tob_resume_id"])
|
||||||
|
cv["id"] = cv["tob_resume_id"]
|
||||||
|
print("CCCCCCCCCCCCCCC")
|
||||||
|
|
||||||
|
return dealWithInt64(cv)
|
||||||
|
|
||||||
|
|
||||||
|
def dealWithInt64(d):
|
||||||
|
if isinstance(d, dict):
|
||||||
|
for n, v in d.items():
|
||||||
|
d[n] = dealWithInt64(v)
|
||||||
|
|
||||||
|
if isinstance(d, list):
|
||||||
|
d = [dealWithInt64(t) for t in d]
|
||||||
|
|
||||||
|
if isinstance(d, np.integer): d = int(d)
|
||||||
|
return d
|
||||||
|
|
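Reviewer note, not part of the diff: dealWithInt64 above recursively walks dicts and lists and casts numpy integer scalars to plain Python ints, so the assembled resume document can be serialized (numpy int64 values make json.dumps fail). A minimal sketch, calling the helper defined just above:

import numpy as np
# hypothetical resume fields carrying numpy scalars from upstream processing
cv = {"degree_int": np.int64(5), "tag_kwd": [np.int32(1), "python"], "name_kwd": "张三"}
cv = dealWithInt64(cv)
# cv["degree_int"] is now a plain int and the whole dict is JSON-serializable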
@ -64,7 +64,11 @@ def load_model(model_dir, nm):
if not os.path.exists(model_file_path):
raise ValueError("not find model file path {}".format(
model_file_path))
sess = ort.InferenceSession(model_file_path)
if ort.get_device() == "GPU":
sess = ort.InferenceSession(model_file_path, providers=['CUDAExecutionProvider'])
else:
sess = ort.InferenceSession(model_file_path, providers=['CPUExecutionProvider'])
return sess, sess.get_inputs()[0]
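Reviewer note, not part of the diff: the change above picks the CUDA or CPU execution provider based on ort.get_device(), which reports "GPU" only when the GPU build of onnxruntime is installed. A minimal sketch of the same idea, with a slightly more defensive provider check (the model path is hypothetical):

import onnxruntime as ort

model_file_path = "/path/to/model.onnx"  # hypothetical path for illustration
providers = (['CUDAExecutionProvider']
             if 'CUDAExecutionProvider' in ort.get_available_providers()
             else ['CPUExecutionProvider'])
sess = ort.InferenceSession(model_file_path, providers=providers)
print(sess.get_inputs()[0].name)  # the first graph input, as returned by load_model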
@ -12,7 +12,7 @@
#
import copy
import re
from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, \
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices
from rag.nlp import huqie
from deepdoc.parser import PdfParser, DocxParser
@ -47,7 +47,7 @@ class Pdf(PdfParser):
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls


def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Since a book is long and not all the parts are useful, if it's a PDF,
@ -94,7 +94,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
sections = [t for t, _ in sections]
# is it English
eng = is_english(random_choices(sections, k=218))
eng = lang.lower() == "english"#is_english(random_choices(sections, k=218))

res = []
# add tables
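Reviewer note, not part of the diff: each chunker in rag/app now takes an explicit lang argument and derives the English flag from it instead of sampling the text. A rough usage sketch (file name and callback are made up for illustration):

from rag.app import book

def progress(prog=None, msg=""):
    # hypothetical stand-in for the task executor's set_progress partial
    print(prog, msg)

chunks = book.chunk("some_book.pdf", callback=progress, lang="English")
# with lang="English", eng is True and the English tokenization branch is used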
@ -14,7 +14,7 @@ import copy
import re
from io import BytesIO
from docx import Document
from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
make_colon_as_title
from rag.nlp import huqie
from deepdoc.parser import PdfParser, DocxParser
@ -68,7 +68,7 @@ class Pdf(PdfParser):
return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes]


def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
"""
@ -106,7 +106,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

# is it English
eng = is_english(sections)
eng = lang.lower() == "english"#is_english(sections)
# Remove 'Contents' part
remove_contents_table(sections, eng)
@ -1,7 +1,6 @@
import copy
import re
from deepdoc.parser import tokenize
from rag.nlp import huqie, tokenize
from rag.nlp import huqie
from deepdoc.parser import PdfParser
from rag.utils import num_tokens_from_string
@ -57,7 +56,7 @@ class Pdf(PdfParser):
return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls


def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Only pdf is supported.
"""
@ -74,7 +73,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
# is it English
eng = pdf_parser.is_english
eng = lang.lower() == "english"#pdf_parser.is_english

res = []
# add tables
@ -13,8 +13,7 @@
import copy
import re
from rag.app import laws
from deepdoc.parser import is_english, tokenize, naive_merge
from rag.nlp import huqie, is_english, tokenize, naive_merge
from rag.nlp import huqie
from deepdoc.parser import PdfParser
from rag.settings import cron_logger
@ -38,7 +37,7 @@ class Pdf(PdfParser):
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]


def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
This method apply the naive ways to chunk files.
@ -80,7 +79,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k

parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"])
eng = is_english(cks)
eng = lang.lower() == "english"#is_english(cks)
res = []
# wrap up to es documents
for ck in cks:
@ -15,8 +15,7 @@ import re
from collections import Counter

from api.db import ParserType
from deepdoc.parser import tokenize
from rag.nlp import huqie, tokenize
from rag.nlp import huqie
from deepdoc.parser import PdfParser
import numpy as np
from rag.utils import num_tokens_from_string
@ -140,7 +139,7 @@ class Pdf(PdfParser):
}


def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Only pdf is supported.
The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
@ -156,7 +155,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
# is it English
eng = pdf_parser.is_english
eng = lang.lower() == "english"#pdf_parser.is_english
print("It's English.....", eng)

res = []
56 rag/app/picture.py Normal file
@ -0,0 +1,56 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import io

import numpy as np
from PIL import Image

from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from rag.nlp import tokenize
from deepdoc.vision import OCR

ocr = OCR()


def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
try:
cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
except Exception as e:
callback(prog=-1, msg=str(e))
return []
img = Image.open(io.BytesIO(binary))
doc = {
"docnm_kwd": filename,
"image": img
}
bxs = ocr(np.array(img))
txt = "\n".join([t[0] for _, t in bxs if t[0]])
eng = lang.lower() == "english"
callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
if (eng and len(txt.split(" ")) > 32) or len(txt) > 32:
tokenize(doc, txt, eng)
callback(0.8, "OCR results is too long to use CV LLM.")
return [doc]

try:
callback(0.4, "Use CV LLM to describe the picture.")
ans = cv_mdl.describe(binary)
callback(0.8, "CV LLM respoond: %s ..." % ans[:32])
txt += "\n" + ans
tokenize(doc, txt, eng)
return [doc]
except Exception as e:
callback(prog=-1, msg=str(e))

return []
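Reviewer note, not part of the diff: a rough sketch of driving the new picture chunker directly. The tenant id must belong to a tenant with an IMAGE2TEXT model configured; the image file and callback below are made up for illustration:

from rag.app import picture

def progress(prog=None, msg=""):
    print(prog, msg)  # hypothetical stand-in for the task executor's set_progress partial

with open("receipt.png", "rb") as f:  # hypothetical image file
    binary = f.read()

docs = picture.chunk("receipt.png", binary, tenant_id="<tenant-uuid>",
                     lang="English", callback=progress)
# each returned doc keeps the PIL image plus content_ltks / content_sm_ltks from tokenize()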
@ -13,46 +13,14 @@
import copy
import re
from io import BytesIO
from pptx import Presentation
from rag.nlp import tokenize, is_english
from deepdoc.parser import tokenize, is_english
from rag.nlp import huqie
from deepdoc.parser import PdfParser
from deepdoc.parser import PdfParser, PptParser


class Ppt(object):
class Ppt(PptParser):
def __init__(self):
super().__init__()

def __extract(self, shape):
if shape.shape_type == 19:
tb = shape.table
rows = []
for i in range(1, len(tb.rows)):
rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
return "\n".join(rows)

if shape.has_text_frame:
return shape.text_frame.text

if shape.shape_type == 6:
texts = []
for p in shape.shapes:
t = self.__extract(p)
if t: texts.append(t)
return "\n".join(texts)

def __call__(self, fnm, from_page, to_page, callback=None):
ppt = Presentation(fnm) if isinstance(
txts = super.__call__(fnm, from_page, to_page)
fnm, str) else Presentation(
BytesIO(fnm))
txts = []
self.total_page = len(ppt.slides)
for i, slide in enumerate(ppt.slides[from_page: to_page]):
texts = []
for shape in slide.shapes:
txt = self.__extract(shape)
if txt: texts.append(txt)
txts.append("\n".join(texts))

callback(0.5, "Text extraction finished.")
import aspose.slides as slides
@ -14,7 +14,7 @@ import re
from io import BytesIO
from nltk import word_tokenize
from openpyxl import load_workbook
from deepdoc.parser import is_english, random_choices
from rag.nlp import is_english, random_choices
from rag.nlp import huqie, stemmer
from deepdoc.parser import ExcelParser

@ -81,7 +81,7 @@ def beAdoc(d, q, a, eng):
return d


def chunk(filename, binary=None, callback=None, **kwargs):
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
If the file is in excel format, there should be 2 column question and answer without header.
@ -113,7 +113,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
break
txt += l
lines = txt.split("\n")
eng = is_english([rmPrefix(l) for l in lines[:100]])
eng = lang.lower() == "english"#is_english([rmPrefix(l) for l in lines[:100]])
fails = []
for i, line in enumerate(lines):
arr = [l for l in line.split("\t") if len(l) > 1]
@ -20,8 +20,7 @@ from openpyxl import load_workbook
from dateutil.parser import parse as datetime_parse

from api.db.services.knowledgebase_service import KnowledgebaseService
from deepdoc.parser import is_english, tokenize
from rag.nlp import huqie, is_english, tokenize
from rag.nlp import huqie
from deepdoc.parser import ExcelParser


@ -112,7 +111,7 @@ def column_data_type(arr):
return arr, ty


def chunk(filename, binary=None, callback=None, **kwargs):
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
For csv or txt file, the delimiter between columns is TAB.
@ -192,7 +191,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j])
for i in range(len(clmns))]

eng = is_english(txts)
eng = lang.lower() == "english"#is_english(txts)
for ii, row in df.iterrows():
d = {}
row_txt = []
@ -13,12 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import io
from abc import ABC

from PIL import Image
from openai import OpenAI
import os
import base64
from io import BytesIO

from api.utils import get_uuid
from api.utils.file_utils import get_project_base_directory


class Base(ABC):
def __init__(self, key, model_name):
@ -44,25 +50,26 @@ class Base(ABC):
{
"role": "user",
"content": [
{
"type": "text",
"text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等。",
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{b64}"
},
},
{
"text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else \
"Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
},
],
}
]


class GptV4(Base):
def __init__(self, key, model_name="gpt-4-vision-preview"):
def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese"):
self.client = OpenAI(api_key=key)
self.model_name = model_name
self.lang = lang

def describe(self, image, max_tokens=300):
b64 = self.image2base64(image)
@ -76,18 +83,40 @@ class GptV4(Base):


class QWenCV(Base):
def __init__(self, key, model_name="qwen-vl-chat-v1"):
def __init__(self, key, model_name="qwen-vl-chat-v1", lang="Chinese"):
import dashscope
dashscope.api_key = key
self.model_name = model_name
self.lang = lang

def prompt(self, binary):
# stupid as hell
tmp_dir = get_project_base_directory("tmp")
if not os.path.exists(tmp_dir): os.mkdir(tmp_dir)
path = os.path.join(tmp_dir, "%s.jpg"%get_uuid())
Image.open(io.BytesIO(binary)).save(path)
return [
{
"role": "user",
"content": [
{
"image": f"file://{path}"
},
{
"text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else \
"Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
},
],
}
]

def describe(self, image, max_tokens=300):
from http import HTTPStatus
from dashscope import MultiModalConversation
response = MultiModalConversation.call(model=self.model_name,
messages=self.prompt(self.image2base64(image)))
messages=self.prompt(image))
if response.status_code == HTTPStatus.OK:
return response.output.choices[0]['message']['content'], response.usage.output_tokens
return response.output.choices[0]['message']['content'][0]["text"], response.usage.output_tokens
return response.message, 0


@ -95,9 +124,10 @@ from zhipuai import ZhipuAI


class Zhipu4V(Base):
def __init__(self, key, model_name="glm-4v"):
def __init__(self, key, model_name="glm-4v", lang="Chinese"):
self.client = ZhipuAI(api_key=key)
self.model_name = model_name
self.lang = lang

def describe(self, image, max_tokens=1024):
b64 = self.image2base64(image)
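Reviewer note, not part of the diff: each vision wrapper now stores lang and switches between the Chinese and the English description prompt accordingly. A rough usage sketch, assuming the module lives in this repo's rag/llm package and that describe() returns a (text, token_count) pair as the QWenCV variant above does; the credentials and image path are placeholders:

from rag.llm.cv_model import Zhipu4V  # assumed module path

mdl = Zhipu4V(key="<api-key>", model_name="glm-4v", lang="English")  # placeholder credentials
with open("chart.png", "rb") as f:  # hypothetical image file
    text, used_tokens = mdl.describe(f.read(), max_tokens=1024)
print(text)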
@ -5,3 +5,219 @@ retrievaler = search.Dealer(ELASTICSEARCH)

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

import re
from nltk import word_tokenize
from . import huqie
from rag.utils import num_tokens_from_string
import random

BULLET_PATTERN = [[
r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
r"第[零一二三四五六七八九十百0-9]+章",
r"第[零一二三四五六七八九十百0-9]+节",
r"第[零一二三四五六七八九十百0-9]+条",
r"[\((][零一二三四五六七八九十百]+[\))]",
], [
r"第[0-9]+章",
r"第[0-9]+节",
r"[0-9]{,3}[\. 、]",
r"[0-9]{,2}\.[0-9]{,2}",
r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
r"第[零一二三四五六七八九十百0-9]+章",
r"第[零一二三四五六七八九十百0-9]+节",
r"[零一二三四五六七八九十百]+[ 、]",
r"[\((][零一二三四五六七八九十百]+[\))]",
r"[\((][0-9]{,2}[\))]",
], [
r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
r"Chapter (I+V?|VI*|XI|IX|X)",
r"Section [0-9]+",
r"Article [0-9]+"
]
]

def random_choices(arr, k):
k = min(len(arr), k)
return random.choices(arr, k=k)

def bullets_category(sections):
global BULLET_PATTERN
hits = [0] * len(BULLET_PATTERN)
for i, pro in enumerate(BULLET_PATTERN):
for sec in sections:
for p in pro:
if re.match(p, sec):
hits[i] += 1
break
maxium = 0
res = -1
for i, h in enumerate(hits):
if h <= maxium: continue
res = i
maxium = h
return res

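Reviewer note, not part of the diff: bullets_category scores each group in BULLET_PATTERN by how many section headings it matches and returns the index of the best-scoring group, or -1 when nothing matches. A quick sketch:

from rag.nlp import bullets_category

sections = ["Chapter I Introduction", "Section 1 Scope", "Article 3 Definitions", "plain body text"]
print(bullets_category(sections))  # expected to pick the English pattern group (index 3)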
def is_english(texts):
eng = 0
for t in texts:
if re.match(r"[a-zA-Z]{2,}", t.strip()):
eng += 1
if eng / len(texts) > 0.8:
return True
return False


def tokenize(d, t, eng):
d["content_with_weight"] = t
if eng:
t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
else:
d["content_ltks"] = huqie.qie(t)
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])

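Reviewer note, not part of the diff: tokenize fills three fields on a chunk dict; in the English branch it de-hyphenates, word-tokenizes with nltk and Porter-stems (nltk's punkt data must be available). A rough sketch:

from rag.nlp import tokenize

d = {}
tokenize(d, "state-of-the-art retrieval pipelines", eng=True)
# d["content_with_weight"] keeps the raw text,
# d["content_ltks"] holds the stemmed tokens, roughly "stateoftheart retriev pipelin",
# d["content_sm_ltks"] is the fine-grained re-tokenization from huqie.qieqie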
def remove_contents_table(sections, eng=False):
i = 0
while i < len(sections):
def get(i):
nonlocal sections
return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()

if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
i += 1
continue
sections.pop(i)
if i >= len(sections): break
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
while not prefix:
sections.pop(i)
if i >= len(sections): break
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
sections.pop(i)
if i >= len(sections) or not prefix: break
for j in range(i, min(i + 128, len(sections))):
if not re.match(prefix, get(j)):
continue
for _ in range(i, j): sections.pop(i)
break


def make_colon_as_title(sections):
if not sections: return []
if type(sections[0]) == type(""): return sections
i = 0
while i < len(sections):
txt, layout = sections[i]
i += 1
txt = txt.split("@")[0].strip()
if not txt:
continue
if txt[-1] not in "::":
continue
txt = txt[::-1]
arr = re.split(r"([。?!!?;;]| .)", txt)
if len(arr) < 2 or len(arr[1]) < 32:
continue
sections.insert(i - 1, (arr[0][::-1], "title"))
i += 1


def hierarchical_merge(bull, sections, depth):
if not sections or bull < 0: return []
if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
sections = [(t,o) for t, o in sections if t and len(t.split("@")[0].strip()) > 1 and not re.match(r"[0-9]+$", t.split("@")[0].strip())]
bullets_size = len(BULLET_PATTERN[bull])
levels = [[] for _ in range(bullets_size + 2)]

def not_title(txt):
if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt): return False
if len(txt) >= 128: return True
return re.search(r"[,;,。;!!]", txt)

for i, (txt, layout) in enumerate(sections):
for j, p in enumerate(BULLET_PATTERN[bull]):
if re.match(p, txt.strip()) and not not_title(txt):
levels[j].append(i)
break
else:
if re.search(r"(title|head)", layout):
levels[bullets_size].append(i)
else:
levels[bullets_size + 1].append(i)
sections = [t for t, _ in sections]
for s in sections: print("--", s)

def binary_search(arr, target):
if not arr: return -1
if target > arr[-1]: return len(arr) - 1
if target < arr[0]: return -1
s, e = 0, len(arr)
while e - s > 1:
i = (e + s) // 2
if target > arr[i]:
s = i
continue
elif target < arr[i]:
e = i
continue
else:
assert False
return s

cks = []
readed = [False] * len(sections)
levels = levels[::-1]
for i, arr in enumerate(levels[:depth]):
for j in arr:
if readed[j]: continue
readed[j] = True
cks.append([j])
if i + 1 == len(levels) - 1: continue
for ii in range(i + 1, len(levels)):
jj = binary_search(levels[ii], j)
if jj < 0: continue
if jj > cks[-1][-1]: cks[-1].pop(-1)
cks[-1].append(levels[ii][jj])
for ii in cks[-1]: readed[ii] = True
for i in range(len(cks)):
cks[i] = [sections[j] for j in cks[i][::-1]]
print("--------------\n", "\n* ".join(cks[i]))

return cks


def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
if not sections: return []
if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
cks = [""]
tk_nums = [0]
def add_chunk(t, pos):
nonlocal cks, tk_nums, delimiter
tnum = num_tokens_from_string(t)
if tnum < 8: pos = ""
if tk_nums[-1] > chunk_token_num:
cks.append(t + pos)
tk_nums.append(tnum)
else:
cks[-1] += t + pos
tk_nums[-1] += tnum

for sec, pos in sections:
s, e = 0, 1
while e < len(sec):
if sec[e] in delimiter:
add_chunk(sec[s: e+1], pos)
s = e + 1
e = s + 1
else:
e += 1
if s < e: add_chunk(sec[s: e], pos)

return cks
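Reviewer note, not part of the diff: naive_merge scans each section character by character, cuts at the delimiter characters, and keeps appending pieces to the current chunk; once the running token count of a chunk passes chunk_token_num, the next piece opens a new chunk. A rough sketch:

from rag.nlp import naive_merge

sections = ["第一句。第二句。第三句。", "Short tail."]
chunks = naive_merge(sections, chunk_token_num=8, delimiter="\n。;!?")
print(chunks)
# each "。"-terminated piece is appended to the current chunk; as soon as that chunk's
# token count exceeds chunk_token_num, the following piece starts a new chunk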
@ -21,6 +21,7 @@ import hashlib
import copy
import re
import sys
import traceback
from functools import partial
from timeit import default_timer as timer

@ -36,7 +37,7 @@ from rag.nlp import search
from io import BytesIO
import pandas as pd

from rag.app import laws, paper, presentation, manual, qa, table, book, resume
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture

from api.db import LLMType, ParserType
from api.db.services.document_service import DocumentService
@ -56,47 +57,31 @@ FACTORY = {
ParserType.QA.value: qa,
ParserType.TABLE.value: table,
ParserType.RESUME.value: resume,
ParserType.PICTURE.value: picture,
}

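Reviewer note, not part of the diff: FACTORY maps a task's parser_id onto the rag.app module that chunks it, so registering the new picture parser is a one-line addition. A rough sketch of the dispatch the executor performs below (the row dict is abbreviated and hypothetical):

row = {"parser_id": "picture", "name": "photo.png", "language": "English"}  # abbreviated task row
chunker = FACTORY[row["parser_id"].lower()]  # -> rag.app.picture
# the executor then calls chunker.chunk(row["name"], binary=..., lang=row["language"], ...)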
def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
def set_progress(task_id, from_page=0, to_page=-1,
prog=None, msg="Processing..."):
if prog is not None and prog < 0:
msg = "[ERROR]"+msg
cancel = TaskService.do_cancel(task_id)
if cancel:
msg += " [Canceled]"
prog = -1

if to_page > 0: msg = f"Page({from_page}~{to_page}): " + msg
if to_page > 0:
msg = f"Page({from_page}~{to_page}): " + msg
d = {"progress_msg": msg}
if prog is not None: d["progress"] = prog
if prog is not None:
d["progress"] = prog
try:
TaskService.update_progress(task_id, d)
except Exception as e:
cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))

if cancel:sys.exit()
if cancel:
sys.exit()

"""
def chuck_doc(name, binary, tenant_id, cvmdl=None):
suff = os.path.split(name)[-1].lower().split(".")[-1]
if suff.find("pdf") >= 0:
return PDF(binary)
if suff.find("doc") >= 0:
return DOC(binary)
if re.match(r"(xlsx|xlsm|xltx|xltm)", suff):
return EXC(binary)
if suff.find("ppt") >= 0:
return PPT(binary)
if cvmdl and re.search(r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)$",
name.lower()):
txt = cvmdl.describe(binary)
field = TextChunker.Fields()
field.text_chunks = [(txt, binary)]
field.table_chunks = []
return field

return TextChunker()(binary)
"""


def collect(comm, mod, tm):
@ -109,29 +94,38 @@ def collect(comm, mod, tm):
return tasks


def build(row, cvmdl):
def build(row):
if row["size"] > DOC_MAXIMUM_SIZE:
set_progress(row["id"], prog=-1, msg="File size exceeds( <= %dMb )" %
(int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
return []

callback = partial(set_progress, row["id"], row["from_page"], row["to_page"])
callback = partial(
set_progress,
row["id"],
row["from_page"],
row["to_page"])
chunker = FACTORY[row["parser_id"].lower()]
try:
cron_logger.info("Chunkking {}/{}".format(row["location"], row["name"]))
cks = chunker.chunk(row["name"], binary = MINIO.get(row["kb_id"], row["location"]), from_page=row["from_page"], to_page=row["to_page"],
callback = callback, kb_id=row["kb_id"], parser_config=row["parser_config"])
cron_logger.info(
"Chunkking {}/{}".format(row["location"], row["name"]))
cks = chunker.chunk(row["name"], binary=MINIO.get(row["kb_id"], row["location"]), from_page=row["from_page"],
to_page=row["to_page"], lang=row["language"], callback=callback,
kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
except Exception as e:
if re.search("(No such file|not found)", str(e)):
callback(-1, "Can not find file <%s>" % row["doc_name"])
else:
callback(-1, f"Internal server error: %s" % str(e).replace("'", ""))
callback(-1, f"Internal server error: %s" %
str(e).replace("'", ""))
traceback.print_exc()

cron_logger.warn("Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
cron_logger.warn(
"Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))

return

callback(msg="Finished slicing files. Start to embedding the content.")
callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))

docs = []
doc = {
@ -142,7 +136,8 @@ def build(row, cvmdl):
d = copy.deepcopy(doc)
d.update(ck)
md5 = hashlib.md5()
md5.update((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8"))
md5.update((ck["content_with_weight"] +
str(d["doc_id"])).encode("utf-8"))
d["_id"] = md5.hexdigest()
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
@ -173,7 +168,8 @@ def init_kb(row):


def embedding(docs, mdl, parser_config={}):
tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [d["content_with_weight"] for d in docs]
tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
d["content_with_weight"] for d in docs]
tk_count = 0
if len(tts) == len(cnts):
tts, c = mdl.encode(tts)
@ -182,7 +178,8 @@ def embedding(docs, mdl, parser_config={}):
cnts, c = mdl.encode(cnts)
tk_count += c
title_w = float(parser_config.get("filename_embd_weight", 0.1))
vects = (title_w * tts + (1-title_w) * cnts) if len(tts) == len(cnts) else cnts
vects = (title_w * tts + (1 - title_w) *
cnts) if len(tts) == len(cnts) else cnts

assert len(vects) == len(docs)
for i, d in enumerate(docs):
@ -192,7 +189,10 @@ def embedding(docs, mdl, parser_config={}):


def main(comm, mod):
tm_fnm = os.path.join(get_project_base_directory(), "rag/res", f"{comm}-{mod}.tm")
tm_fnm = os.path.join(
get_project_base_directory(),
"rag/res",
f"{comm}-{mod}.tm")
tm = findMaxTm(tm_fnm)
rows = collect(comm, mod, tm)
if len(rows) == 0:
@ -203,15 +203,13 @@ def main(comm, mod):
callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
try:
embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING)
cv_mdl = LLMBundle(r["tenant_id"], LLMType.IMAGE2TEXT)
# TODO: sequence2text model
except Exception as e:
callback(prog=-1, msg=str(e))
continue

st_tm = timer()
cks = build(r, cv_mdl)
if cks is None:continue
cks = build(r)
if cks is None:
continue
if not cks:
tmf.write(str(r["update_time"]) + "\n")
callback(1., "No chunk! Done!")
@ -233,11 +231,15 @@ def main(comm, mod):
cron_logger.error(str(es_r))
else:
if TaskService.do_cancel(r["id"]):
ELASTICSEARCH.deleteByQuery(Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
continue
callback(1., "Done!")
DocumentService.increment_chunk_num(r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
DocumentService.increment_chunk_num(
r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks)))
cron_logger.info(
"Chunk doc({}), token({}), chunks({})".format(
r["id"], tk_count, len(cks)))

tmf.write(str(r["update_time"]) + "\n")
tmf.close()