mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-04-20 13:10:05 +08:00
Minor: improve doc and rm unused file (#5634)
### What problem does this PR solve?

The `ocr.res` file is already included in the model directory `rag/res/deepdoc`, but it doesn't seem to be utilized here.

### Type of change

- [x] Documentation Update
This commit is contained in:
parent
b0c21b00d9
commit
ca04ae9540
@ -113,4 +113,4 @@ PDF、DOCX、EXCEL和PPT四种文档格式都有相应的解析器。最复杂
|
|||||||
|
|
||||||
### 简历
|
### 简历
|
||||||
|
|
||||||
简历是一种非常复杂的文件。一份由各种布局的非结构化文本组成的简历可以分解为由近百个字段组成的结构化数据。我们还没有打开解析器,因为我们在解析过程之后打开了处理方法。
|
简历是一种非常复杂的文档。由各种格式的非结构化文本构成的简历可以被解析为包含近百个字段的结构化数据。我们还没有启用解析器,因为在解析过程之后才会启动处理方法。
|
||||||
|
@ -42,6 +42,17 @@ if LOCK_KEY_pdfplumber not in sys.modules:
|
|||||||
|
|
||||||
class RAGFlowPdfParser:
|
class RAGFlowPdfParser:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
"""
|
||||||
|
If you have trouble downloading HuggingFace models, -_^ this might help!!
|
||||||
|
|
||||||
|
For Linux:
|
||||||
|
export HF_ENDPOINT=https://hf-mirror.com
|
||||||
|
|
||||||
|
For Windows:
|
||||||
|
Good luck
|
||||||
|
^_-
|
||||||
|
|
||||||
|
"""
|
||||||
self.ocr = OCR()
|
self.ocr = OCR()
|
||||||
if hasattr(self, "model_speciess"):
|
if hasattr(self, "model_speciess"):
|
||||||
self.layouter = LayoutRecognizer("layout." + self.model_speciess)
|
self.layouter = LayoutRecognizer("layout." + self.model_speciess)
|
||||||
@ -72,17 +83,6 @@ class RAGFlowPdfParser:
|
|||||||
model_dir, "updown_concat_xgb.model"))
|
model_dir, "updown_concat_xgb.model"))
|
||||||
|
|
||||||
self.page_from = 0
|
self.page_from = 0
|
||||||
"""
|
|
||||||
If you have trouble downloading HuggingFace models, -_^ this might help!!
|
|
||||||
|
|
||||||
For Linux:
|
|
||||||
export HF_ENDPOINT=https://hf-mirror.com
|
|
||||||
|
|
||||||
For Windows:
|
|
||||||
Good luck
|
|
||||||
^_-
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __char_width(self, c):
|
def __char_width(self, c):
|
||||||
return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)
|
return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)
|
||||||
|
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user