| | """Epub parser. |
| | |
| | Contains parsers for epub files. |
| | """ |
| |
|
| | from pathlib import Path |
| | from typing import Dict |
| |
|
| | from application.parser.file.base_parser import BaseParser |
| |
|
| |
|
class EpubParser(BaseParser):
    """Parser that extracts plain text from Epub files.

    The third-party packages ``EbookLib`` and ``html2text`` are imported
    lazily inside :meth:`parse_file`, so they are only required when an Epub
    file is actually parsed.
    """

    def _init_parser(self) -> Dict:
        """Return the parser configuration.

        Returns:
            An empty dict; this parser keeps no configuration.
        """
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Extract the text of every document item in an Epub file.

        Args:
            file: Path to the Epub file to read.
            errors: Error-handling scheme handed to ``bytes.decode`` when
                each document's content is decoded as UTF-8 (for example
                ``"ignore"``, ``"replace"`` or ``"strict"``).

        Returns:
            The ``html2text`` rendering of each document item, in the order
            ``book.get_items()`` yields them, joined with newlines. An empty
            string if the book has no document items.

        Raises:
            ValueError: If ``EbookLib`` or ``html2text`` is not installed.
        """
        try:
            import ebooklib
            from ebooklib import epub
        except ImportError as exc:
            # Keep ValueError (callers may catch it) but preserve the cause.
            raise ValueError("`EbookLib` is required to read Epub files.") from exc
        try:
            import html2text
        except ImportError as exc:
            raise ValueError("`html2text` is required to parse Epub files.") from exc

        # NOTE(review): `ignore_ncx` presumably tells EbookLib to skip the NCX
        # table of contents -- confirm against the EbookLib documentation.
        book = epub.read_epub(file, options={"ignore_ncx": True})

        # Only ITEM_DOCUMENT items are converted; every other item type is
        # skipped. `errors` is honoured here so that a stray invalid byte
        # follows the caller's policy instead of always raising.
        text_list = [
            html2text.html2text(item.get_content().decode("utf-8", errors=errors))
            for item in book.get_items()
            if item.get_type() == ebooklib.ITEM_DOCUMENT
        ]
        return "\n".join(text_list)
| |
|