mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-07-10 05:41:46 +08:00
23 lines
566 B
Python
23 lines
566 B
Python
from pathlib import Path
|
|
from typing import Dict
|
|
|
|
from bs4 import BeautifulSoup
|
|
from llama_index.readers.file.base_parser import BaseParser
|
|
|
|
|
|
class HTMLParser(BaseParser):
    """Parser that turns an HTML file into its plain-text content."""

    def _init_parser(self) -> Dict:
        """Return the parser configuration; this parser needs none."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Return the text of the HTML document stored at *file*.

        The file is opened in binary mode and handed to BeautifulSoup using
        the ``html.parser`` backend. The extracted text has leading and
        trailing whitespace removed; an empty extraction yields ``''``.

        Args:
            file: Path of the HTML file to read.
            errors: Accepted for interface compatibility; not used by this
                method.

        Returns:
            The stripped text content of the document.
        """
        with open(file, "rb") as stream:
            document = BeautifulSoup(stream, 'html.parser')
            extracted = document.get_text()

        # Guard clause mirrors the original "strip if truthy, else ''" logic.
        if not extracted:
            return ''
        return extracted.strip()
|