Awell00 committed
Commit feb2065 · verified · 1 Parent(s): 2f407ca

feat!: add all files

Files changed (4)
  1. EpubSplit.zip +3 -0
  2. app.py +248 -0
  3. epubsplit.py +1367 -0
  4. requirements.txt +14 -0
EpubSplit.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7e0dcc5fe502500fde5108298f7074c9979a27ac4452734b15e80c046b65f75e
+ size 484014
app.py ADDED
@@ -0,0 +1,248 @@
+ import gradio as gr
+ import warnings
+ import requests
+ from bs4 import BeautifulSoup
+ import subprocess
+ import io
+ import ebooklib
+ from ebooklib import epub
+ from huggingface_hub import InferenceClient
+ from epubsplit import SplitEpub
+ import re
+ import os
+
+ def install_calibre():
+     try:
+         subprocess.run(["apt-get", "update"], check=True)
+         subprocess.run(["apt-get", "install", "-y", "calibre"], check=True)
+         # Register the EpubSplit plugin with calibre.
+         subprocess.run(["calibre-customize", "-a", "EpubSplit.zip"], check=True)
+         print("Calibre installed successfully.")
+     except subprocess.CalledProcessError as e:
+         print(f"Error installing calibre: {e}")
+
+ install_calibre()
+
+ # Suppress specific warnings
+ warnings.filterwarnings("ignore", message="In the future version we will turn default option ignore_ncx to True.")
+ warnings.filterwarnings("ignore", message="This search incorrectly ignores the root element, and will be fixed in a future version.")
+
+ # Constants
+ EPUB_PATH = 'book.epub'
+ OUTPUT_EPUB_PATH = 'output.epub'
+ OUTPUT_PDF_PATH = 'output.pdf'
+ LIBRARY_URL = os.getenv("LIBRARY_URL")
+ COOKIE_CONFIG = {
+     'remix_userkey': os.getenv("LIBRARY_KEY"),
+     'remix_userid': '14009766',
+     'selectedSiteMode': 'books',
+     'domainsNotWorking': os.getenv("NOT_WORKING")
+ }
+
+ def fetch_library_search_url():
+     try:
+         response = requests.get(LIBRARY_URL)
+         soup = BeautifulSoup(response.content, 'html5lib')
+         library_div = soup.find('div', attrs={'class': 'plainlist'})
+         if library_div:
+             links = library_div.find_all('a', class_='external text')
+             return next((link.get('href') for link in links if link.get('href').startswith('https')), "")
+     except Exception as e:
+         print(f"Error fetching library URL: {e}")
+     return ""
+
+ SEARCH_URL = fetch_library_search_url()
+
+ def fetch_book_details(isbn):
+     if not SEARCH_URL:
+         print("Search URL not available.")
+         return
+
+     search_endpoint = f"{SEARCH_URL}/s/{isbn}"
+     try:
+         response = requests.get(search_endpoint)
+         soup = BeautifulSoup(response.content, 'html5lib')
+         bookcards = soup.find_all('z-bookcard')
+
+         book_url = next((SEARCH_URL + card.get('href') for card in bookcards if card.get('href')), None)
+         if not book_url:
+             print("No book URL found.")
+             return
+
+         download_book(book_url)
+     except Exception as e:
+         print(f"Error fetching book details: {e}")
+
+ def download_book(book_url):
+     try:
+         response = requests.get(book_url, cookies=COOKIE_CONFIG)
+         soup = BeautifulSoup(response.content, 'html5lib')
+         download_link = soup.find('a', class_='addDownloadedBook')
+
+         if download_link and download_link.has_attr('href'):
+             download_url = SEARCH_URL + download_link['href']
+             download_and_convert_epub(download_url)
+         else:
+             print("Download link not found or invalid.")
+     except Exception as e:
+         print(f"Error downloading book: {e}")
+
+ def download_and_convert_epub(download_url):
+     try:
+         response = requests.get(download_url, cookies=COOKIE_CONFIG)
+         if response.status_code == 200:
+             with open(EPUB_PATH, 'wb') as epub_file:
+                 epub_file.write(response.content)
+             print("EPUB downloaded successfully.")
+         else:
+             print(f"Failed to download EPUB. Status code: {response.status_code}")
+     except Exception as e:
+         print(f"Error downloading EPUB: {e}")
+
+ def extract_chapter_text(input_epub_path, chapter_indices):
+     print(f"Extracting chapter text for indices: {chapter_indices}")
+     try:
+         with open(input_epub_path, 'rb') as epub_file:
+             split_epub = SplitEpub(epub_file)
+             output_io = io.BytesIO()
+             split_epub.write_split_epub(output_io, chapter_indices)
+             with open(OUTPUT_EPUB_PATH, 'wb') as output_file:
+                 output_file.write(output_io.getvalue())
+
+         return read_text_from_epub(OUTPUT_EPUB_PATH)
+     except Exception as e:
+         print(f"Error extracting chapter text: {e}")
+         return ""
+
+ def read_text_from_epub(output_epub_path):
+     try:
+         book = epub.read_epub(output_epub_path)
+         text_content = []
+         for item in book.get_items():
+             if item.get_type() == ebooklib.ITEM_DOCUMENT:
+                 soup = BeautifulSoup(item.get_body_content(), 'html.parser')
+                 paragraphs = soup.find_all('p')
+                 text_content.extend(para.get_text() for para in paragraphs)
+         return '\n'.join(text_content)
+     except Exception as e:
+         print(f"Error reading text from EPUB: {e}")
+         return ""
+
+ def generate_table_of_contents():
+     try:
+         result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
+         pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
+         print(result)
+         return {int(line_number): title for line_number, title in pattern.findall(result.stdout)}
+     except Exception as e:
+         print(f"Error generating table of contents: {e}")
+         return {}
+
+ def summarize_chapter(chapter_index):
+     if chapter_index < 0:
+         return "Invalid chapter selection."
+
+     result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
+     pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
+     chapter = [int(line_number) for line_number, title in pattern.findall(result.stdout)]
+
+     if chapter_index not in chapter:
+         return "No content found for the selected chapter."
+
+     # The section runs from this split line up to the next split line with a
+     # ToC entry, or just this line when it is the last one.
+     next_idx = chapter.index(chapter_index) + 1
+     end = chapter[next_idx] if next_idx < len(chapter) else chapter_index + 1
+
+     chapter_text = ""
+     for i in range(chapter_index, end):
+         chapter_to_summarize = extract_chapter_text(EPUB_PATH, [i])
+         if chapter_to_summarize and len(chapter_to_summarize) > 50:
+             # generate_summary() is a generator that yields the cumulative
+             # summary text; keep the last yielded value as the full summary.
+             summary = ""
+             for summary in generate_summary(chapter_to_summarize):
+                 pass
+             chapter_text += summary + "\n\n"
+
+     if not chapter_text:
+         chapter_to_summarize = extract_chapter_text(EPUB_PATH, [chapter_index+1])
+         if chapter_to_summarize and len(chapter_to_summarize) > 50:
+             summary = ""
+             for summary in generate_summary(chapter_to_summarize):
+                 pass
+             chapter_text += summary + "\n\n"
+     return chapter_text if chapter_text else "No content found for the selected chapter."
+
+ def generate_summary(text):
+     try:
+         client = InferenceClient(api_key=TOKEN)
+
+         user_prompt = (
+             "Provide a clear and concise summary of the chapter, emphasizing key events, themes, and character developments. "
+             "Do not include introductory or concluding remarks, just focus on the main points."
+             f"\n\nChapter Text:\n{text}"
+         )
+
+         system_message = {
+             "role": "system",
+             "content": (
+                 "You are an expert at summarizing book chapters. Your task is to condense the chapter into a focused, "
+                 "informative summary that highlights the most important events, themes, and character developments. "
+                 "Avoid filler and ensure the summary is succinct yet comprehensive."
+             )
+         }
+
+         messages = [
+             system_message,
+             {"role": "user", "content": user_prompt}
+         ]
+
+         stream = client.chat.completions.create(
+             model=MODEL,
+             messages=messages,
+             temperature=0.5,
+             max_tokens=2048,
+             top_p=0.7,
+             stream=True
+         )
+
+         out = ""
+
+         for chunk in stream:
+             if chunk.choices and len(chunk.choices) > 0:
+                 new_content = chunk.choices[0].delta.content
+                 if new_content:  # the final chunk may carry an empty delta
+                     out += new_content
+                     yield out
+     except Exception as e:
+         print(f"Error generating summary: {e}")
+         # yield (not return) so callers iterating the generator see the error.
+         yield "Error generating summary."
+
+ # Model Initialization
+ MODEL = "meta-llama/Llama-3.3-70B-Instruct"
+ TOKEN = os.getenv("TOKEN")
+
+ # Gradio App
+ with gr.Blocks() as app:
+     isbn_input = gr.Textbox(label="Enter ISBN")
+     chapter_dropdown = gr.Dropdown(label="Select Chapter", choices=[])
+     summary_output = gr.Textbox(label="Summary", lines=10, interactive=False)
+
+     def update_chapter_dropdown(isbn):
+         fetch_book_details(isbn)
+         chapters = generate_table_of_contents()
+         print(chapters)
+         return gr.update(choices=[(title.strip('\''), line_number) for line_number, title in chapters.items()])
+
+     def stream_summarize_chapter(chapter_index):
+         if chapter_index < 0:
+             yield "Invalid chapter selection."
+             return
+
+         result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
+         pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
+         chapter = [int(line_number) for line_number, title in pattern.findall(result.stdout)]
+
+         if not chapter or chapter_index not in chapter:
+             yield "No content found for the selected chapter."
+             return
+
+         # Same section bounds as summarize_chapter(): up to the next split
+         # line with a ToC entry, or just this line when it is the last one.
+         next_idx = chapter.index(chapter_index) + 1
+         end = chapter[next_idx] if next_idx < len(chapter) else chapter_index + 1
+
+         for i in range(chapter_index, end):
+             chapter_to_summarize = extract_chapter_text(EPUB_PATH, [i])
+             if chapter_to_summarize and len(chapter_to_summarize) > 50:
+                 for text_chunk in generate_summary(chapter_to_summarize):
+                     yield text_chunk
+             else:
+                 yield "No significant content found for this chapter."
+
+     isbn_input.change(update_chapter_dropdown, inputs=[isbn_input], outputs=[chapter_dropdown])
+     chapter_dropdown.change(
+         stream_summarize_chapter, inputs=[chapter_dropdown], outputs=[summary_output]
+     )
+
+ app.launch()
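
Note: generate_table_of_contents() and the summarizers above parse the plain-text listing that the EpubSplit plugin prints (see main() in epubsplit.py below, which emits "Line Number:" and "toc:" lines). A minimal sketch of that parse against illustrative output; the section titles and ids here are made up, not from a real book:

    import re

    # Shaped like the listing from `calibre-debug --run-plugin EpubSplit book.epub`.
    sample_stdout = """
    Line Number: 0
    \ttoc: ['Title Page']
    \tid: titlepage
    \thref: titlepage.xhtml

    Line Number: 1
    \ttoc: ['Chapter 1']
    \tid: ch01
    \thref: chapter01.xhtml
    """

    pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
    chapters = {int(num): title for num, title in pattern.findall(sample_stdout)}
    # The captured titles keep their quotes ("'Title Page'"), which is why
    # update_chapter_dropdown() calls title.strip("'").
    print(chapters)
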
epubsplit.py ADDED
@@ -0,0 +1,1367 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ __license__ = 'GPL v3'
+ __copyright__ = '2021, Jim Miller'
+ __docformat__ = 'restructuredtext en'
+
+ import sys, re, os, traceback, copy
+ from posixpath import normpath
+ import logging
+ logger = logging.getLogger(__name__)
+
+ from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
+
+ from xml.dom.minidom import parse, parseString, getDOMImplementation, Element
+ from time import time
+
+ import six
+ from six.moves.urllib.parse import unquote
+ from six import string_types, text_type as unicode
+ from six import unichr
+
+ from bs4 import BeautifulSoup
+
+ ## font decoding code lifted from
+ ## calibre/src/calibre/ebooks/conversion/plugins/epub_input.py
+ ## copyright '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+ ## don't bug Kovid about this use of it.
+
+ ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
+ IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'
+ from itertools import cycle
+
+ class FontDecrypter:
+     def __init__(self, epub, content_dom):
+         self.epub = epub
+         self.content_dom = content_dom
+         self.encryption = {}
+         self.old_uuid = None
+
+     def get_file(self,href):
+         return self.epub.read(href)
+
+     def get_encrypted_fontfiles(self):
+         if not self.encryption:
+             ## Find the encryption.xml file, if any.
+             try:
+                 # <encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
+                 #             xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
+                 #             xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
+                 #   <enc:EncryptedData>
+                 #     <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
+                 #     <enc:CipherData>
+                 #       <enc:CipherReference URI="fonts/00017.ttf"/>
+                 #     </enc:CipherData>
+                 #   </enc:EncryptedData>
+                 # </encryption>
+                 encryption = self.epub.read("META-INF/encryption.xml")
+                 encryptiondom = parseString(encryption)
+                 # print(encryptiondom.toprettyxml(indent='   '))
+                 for encdata in encryptiondom.getElementsByTagName('enc:EncryptedData'):
+                     # print(encdata.toprettyxml(indent='   '))
+                     algorithm = encdata.getElementsByTagName('enc:EncryptionMethod')[0].getAttribute('Algorithm')
+                     if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
+                         print("Unknown font encryption: %s"%algorithm)
+                     else:
+                         # print(algorithm)
+                         for encref in encdata.getElementsByTagName('enc:CipherReference'):
+                             # print(encref.getAttribute('URI'))
+                             self.encryption[encref.getAttribute('URI')]=algorithm
+             except KeyError as ke:
+                 self.encryption = {}
+         return self.encryption
+
+     def get_old_uuid(self):
+         if not self.old_uuid:
+             contentdom = self.content_dom
+             uidkey = contentdom.getElementsByTagName("package")[0].getAttribute("unique-identifier")
+             for dcid in contentdom.getElementsByTagName("dc:identifier"):
+                 if dcid.getAttribute("id") == uidkey and dcid.getAttribute("opf:scheme") == "uuid":
+                     self.old_uuid = dcid.firstChild.data
+         return self.old_uuid
+
+     def get_idpf_key(self):
+         # idpf key:urn:uuid:221c69fe-29f3-4cb4-bb3f-58c430261cc6
+         # idpf key:b'\xfb\xa9\x03N}\xae~\x12 \xaa\xe0\xc11\xe2\xe7\x1b\xf6\xa5\xcas'
+         idpf_key = self.get_old_uuid()
+         import uuid, hashlib
+         idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
+         idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
+         return idpf_key
+
+     def get_adobe_key(self):
+         # adobe key:221c69fe-29f3-4cb4-bb3f-58c430261cc6
+         # adobe key:b'"\x1ci\xfe)\xf3L\xb4\xbb?X\xc40&\x1c\xc6'
+         adobe_key = self.get_old_uuid()
+         import uuid
+         adobe_key = adobe_key.rpartition(':')[-1] # skip urn:uuid:
+         adobe_key = uuid.UUID(adobe_key).bytes
+         return adobe_key
+
+     def get_decrypted_font_data(self, uri):
+         # print(self.get_old_uuid())
+         # print("idpf : %s"%self.get_idpf_key())
+         # print("adobe: %s"%self.get_adobe_key())
+         # print("uri:%s"%uri)
+         font_data = self.get_file(uri)
+         if uri in self.get_encrypted_fontfiles():
+             key = self.get_adobe_key() if self.get_encrypted_fontfiles()[uri] == ADOBE_OBFUSCATION else self.get_idpf_key()
+             font_data = self.decrypt_font_data(key, font_data, self.get_encrypted_fontfiles()[uri])
+         return font_data
+
+     def decrypt_font_data(self, key, data, algorithm):
+         is_adobe = algorithm == ADOBE_OBFUSCATION
+         crypt_len = 1024 if is_adobe else 1040
+         crypt = bytearray(data[:crypt_len])
+         key = cycle(iter(bytearray(key)))
+         decrypt = bytes(bytearray(x^next(key) for x in crypt))
+         return decrypt + data[crypt_len:]
+
+ def _unirepl(match):
+     "Return the unicode string for a decimal number"
+     if match.group(1).startswith('x'):
+         radix=16
+         s = match.group(1)[1:]
+     else:
+         radix=10
+         s = match.group(1)
+     try:
+         value = int(s, radix)
+         retval = "%s%s"%(unichr(value),match.group(2))
+     except:
+         # This way, at least if there are more entities out there
+         # that fail, it doesn't blow the entire download.
+         print("Numeric entity translation failed, skipping: &#x%s%s"%(match.group(1),match.group(2)))
+         retval = ""
+     return retval
+
+ def _replaceNumberEntities(data):
+     # The same brokenish entity parsing in SGMLParser that inserts ';'
+     # after non-entities will also insert ';' incorrectly after number
+     # entities, including part of the next word if it's a-z.
+     # "Don't&#8212ever&#8212do&#8212that&#8212again," becomes
+     # "Don't&#8212e;ver&#8212d;o&#8212;that&#8212a;gain,"
+     # Also need to allow for 5 digit decimal entities &#27861;
+     # Last expression didn't allow for 2 digit hex correctly: &#xE9;
+     p = re.compile(r'&#(x[0-9a-fA-F]{,4}|[0-9]{,5})([0-9a-fA-F]*?);')
+     return p.sub(_unirepl, data)
+
+ def _replaceNotEntities(data):
+     # not just \w or \S. regexp from c:\Python25\lib\sgmllib.py
+     # (or equiv), SGMLParser, entityref
+     p = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
+     return p.sub(r'&\1', data)
+
+ def stripHTML(soup):
+     return removeAllEntities(re.sub(r'<[^>]+>','',"%s" % soup)).strip()
+
+ def conditionalRemoveEntities(value):
+     if isinstance(value,string_types) :
+         return removeEntities(value).strip()
+     else:
+         return value
+
+ def removeAllEntities(text):
+     # Remove &lt; &gt; and &amp;
+     return removeEntities(text).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
+
+ def removeEntities(text):
+
+     if text is None:
+         return ""
+     if not (isinstance(text,string_types)):
+         return str(text)
+
+     try:
+         t = unicode(text) #.decode('utf-8')
+     except UnicodeEncodeError as e:
+         try:
+             t = text.encode ('ascii', 'xmlcharrefreplace')
+         except UnicodeEncodeError as e:
+             t = text
+     text = t
+     # replace numeric versions of [&<>] with named versions,
+     # then replace named versions with actual characters.
+     text = re.sub(r'&#0*38;','&amp;',text)
+     text = re.sub(r'&#0*60;','&lt;',text)
+     text = re.sub(r'&#0*62;','&gt;',text)
+
+     # replace remaining &#000; entities with unicode value, such as &#039; -> '
+     text = _replaceNumberEntities(text)
+
+     # replace several named entities with character, such as &mdash; -> -
+     # see the entities dict below for the list.
+     # reverse sort will put entities with ; before the same one without, when valid.
+     for e in reversed(sorted(entities.keys())):
+         v = entities[e]
+         try:
+             text = text.replace(e, v)
+         except UnicodeDecodeError as ex:
+             # for the pound symbol
+             text = text.replace(e, v.decode('utf-8'))
+
+     # SGMLParser, and in turn, BeautifulStoneSoup doesn't parse
+     # entities terribly well and inserts (;) after something that
+     # it thinks might be an entity. AT&T becomes AT&T; All of my
+     # attempts to fix this by changing the input to
+     # BeautifulStoneSoup break something else instead. But at
+     # this point, there should be *no* real entities left, so
+     # finding these not-entities and removing them here should be safe.
+     text = _replaceNotEntities(text)
+
+     # &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back.
+     return text.replace('&', '&amp;').replace('&amp;lt', '&lt;').replace('&amp;gt', '&gt;')
+
+ # entity list from http://code.google.com/p/doctype/wiki/CharacterEntitiesConsistent
+ entities = { '&aacute;' : 'á',
+              '&Aacute;' : 'Á',
+              '&Aacute' : 'Á',
+              '&aacute' : 'á',
+              '&acirc;' : 'â',
+              '&Acirc;' : 'Â',
+              '&Acirc' : 'Â',
+              '&acirc' : 'â',
+              '&acute;' : '´',
+              '&acute' : '´',
+              '&AElig;' : 'Æ',
+              '&aelig;' : 'æ',
+              '&AElig' : 'Æ',
+              '&aelig' : 'æ',
+              '&agrave;' : 'à',
+              '&Agrave;' : 'À',
+              '&Agrave' : 'À',
+              '&agrave' : 'à',
+              '&alefsym;' : 'ℵ',
+              '&alpha;' : 'α',
+              '&Alpha;' : 'Α',
+              '&amp;' : '&',
+              '&AMP;' : '&',
+              '&AMP' : '&',
+              '&amp' : '&',
+              '&and;' : '∧',
+              '&ang;' : '∠',
+              '&aring;' : 'å',
+              '&Aring;' : 'Å',
+              '&Aring' : 'Å',
+              '&aring' : 'å',
+              '&asymp;' : '≈',
+              '&atilde;' : 'ã',
+              '&Atilde;' : 'Ã',
+              '&Atilde' : 'Ã',
+              '&atilde' : 'ã',
+              '&auml;' : 'ä',
+              '&Auml;' : 'Ä',
+              '&Auml' : 'Ä',
+              '&auml' : 'ä',
+              '&bdquo;' : '„',
+              '&beta;' : 'β',
+              '&Beta;' : 'Β',
+              '&brvbar;' : '¦',
+              '&brvbar' : '¦',
+              '&bull;' : '•',
+              '&cap;' : '∩',
+              '&ccedil;' : 'ç',
+              '&Ccedil;' : 'Ç',
+              '&Ccedil' : 'Ç',
+              '&ccedil' : 'ç',
+              '&cedil;' : '¸',
+              '&cedil' : '¸',
+              '&cent;' : '¢',
+              '&cent' : '¢',
+              '&chi;' : 'χ',
+              '&Chi;' : 'Χ',
+              '&circ;' : 'ˆ',
+              '&clubs;' : '♣',
+              '&cong;' : '≅',
+              '&copy;' : '©',
+              '&COPY;' : '©',
+              '&COPY' : '©',
+              '&copy' : '©',
+              '&crarr;' : '↵',
+              '&cup;' : '∪',
+              '&curren;' : '¤',
+              '&curren' : '¤',
+              '&dagger;' : '†',
+              '&Dagger;' : '‡',
+              '&darr;' : '↓',
+              '&dArr;' : '⇓',
+              '&deg;' : '°',
+              '&deg' : '°',
+              '&delta;' : 'δ',
+              '&Delta;' : 'Δ',
+              '&diams;' : '♦',
+              '&divide;' : '÷',
+              '&divide' : '÷',
+              '&eacute;' : 'é',
+              '&Eacute;' : 'É',
+              '&Eacute' : 'É',
+              '&eacute' : 'é',
+              '&ecirc;' : 'ê',
+              '&Ecirc;' : 'Ê',
+              '&Ecirc' : 'Ê',
+              '&ecirc' : 'ê',
+              '&egrave;' : 'è',
+              '&Egrave;' : 'È',
+              '&Egrave' : 'È',
+              '&egrave' : 'è',
+              '&empty;' : '∅',
+              '&emsp;' : '\u2003', # em space
+              '&ensp;' : '\u2002', # en space
+              '&epsilon;' : 'ε',
+              '&Epsilon;' : 'Ε',
+              '&equiv;' : '≡',
+              '&eta;' : 'η',
+              '&Eta;' : 'Η',
+              '&eth;' : 'ð',
+              '&ETH;' : 'Ð',
+              '&ETH' : 'Ð',
+              '&eth' : 'ð',
+              '&euml;' : 'ë',
+              '&Euml;' : 'Ë',
+              '&Euml' : 'Ë',
+              '&euml' : 'ë',
+              '&euro;' : '€',
+              '&exist;' : '∃',
+              '&fnof;' : 'ƒ',
+              '&forall;' : '∀',
+              '&frac12;' : '½',
+              '&frac12' : '½',
+              '&frac14;' : '¼',
+              '&frac14' : '¼',
+              '&frac34;' : '¾',
+              '&frac34' : '¾',
+              '&frasl;' : '⁄',
+              '&gamma;' : 'γ',
+              '&Gamma;' : 'Γ',
+              '&ge;' : '≥',
+              #'&gt;' : '>',
+              #'&GT;' : '>',
+              #'&GT' : '>',
+              #'&gt' : '>',
+              '&harr;' : '↔',
+              '&hArr;' : '⇔',
+              '&hearts;' : '♥',
+              '&hellip;' : '…',
+              '&iacute;' : 'í',
+              '&Iacute;' : 'Í',
+              '&Iacute' : 'Í',
+              '&iacute' : 'í',
+              '&icirc;' : 'î',
+              '&Icirc;' : 'Î',
+              '&Icirc' : 'Î',
+              '&icirc' : 'î',
+              '&iexcl;' : '¡',
+              '&iexcl' : '¡',
+              '&igrave;' : 'ì',
+              '&Igrave;' : 'Ì',
+              '&Igrave' : 'Ì',
+              '&igrave' : 'ì',
+              '&image;' : 'ℑ',
+              '&infin;' : '∞',
+              '&int;' : '∫',
+              '&iota;' : 'ι',
+              '&Iota;' : 'Ι',
+              '&iquest;' : '¿',
+              '&iquest' : '¿',
+              '&isin;' : '∈',
+              '&iuml;' : 'ï',
+              '&Iuml;' : 'Ï',
+              '&Iuml' : 'Ï',
+              '&iuml' : 'ï',
+              '&kappa;' : 'κ',
+              '&Kappa;' : 'Κ',
+              '&lambda;' : 'λ',
+              '&Lambda;' : 'Λ',
+              '&laquo;' : '«',
+              '&laquo' : '«',
+              '&larr;' : '←',
+              '&lArr;' : '⇐',
+              '&lceil;' : '⌈',
+              '&ldquo;' : '“',
+              '&le;' : '≤',
+              '&lfloor;' : '⌊',
+              '&lowast;' : '∗',
+              '&loz;' : '◊',
+              '&lrm;' : '\u200e', # left-to-right mark
+              '&lsaquo;' : '‹',
+              '&lsquo;' : '‘',
+              #'&lt;' : '<',
+              #'&LT;' : '<',
+              #'&LT' : '<',
+              #'&lt' : '<',
+              '&macr;' : '¯',
+              '&macr' : '¯',
+              '&mdash;' : '—',
+              '&micro;' : 'µ',
+              '&micro' : 'µ',
+              '&middot;' : '·',
+              '&middot' : '·',
+              '&minus;' : '−',
+              '&mu;' : 'μ',
+              '&Mu;' : 'Μ',
+              '&nabla;' : '∇',
+              '&nbsp;' : '\xa0', # non-breaking space
+              '&nbsp' : '\xa0',
+              '&ndash;' : '–',
+              '&ne;' : '≠',
+              '&ni;' : '∋',
+              '&not;' : '¬',
+              '&not' : '¬',
+              '&notin;' : '∉',
+              '&nsub;' : '⊄',
+              '&ntilde;' : 'ñ',
+              '&Ntilde;' : 'Ñ',
+              '&Ntilde' : 'Ñ',
+              '&ntilde' : 'ñ',
+              '&nu;' : 'ν',
+              '&Nu;' : 'Ν',
+              '&oacute;' : 'ó',
+              '&Oacute;' : 'Ó',
+              '&Oacute' : 'Ó',
+              '&oacute' : 'ó',
+              '&ocirc;' : 'ô',
+              '&Ocirc;' : 'Ô',
+              '&Ocirc' : 'Ô',
+              '&ocirc' : 'ô',
+              '&OElig;' : 'Œ',
+              '&oelig;' : 'œ',
+              '&ograve;' : 'ò',
+              '&Ograve;' : 'Ò',
+              '&Ograve' : 'Ò',
+              '&ograve' : 'ò',
+              '&oline;' : '‾',
+              '&omega;' : 'ω',
+              '&Omega;' : 'Ω',
+              '&omicron;' : 'ο',
+              '&Omicron;' : 'Ο',
+              '&oplus;' : '⊕',
+              '&or;' : '∨',
+              '&ordf;' : 'ª',
+              '&ordf' : 'ª',
+              '&ordm;' : 'º',
+              '&ordm' : 'º',
+              '&oslash;' : 'ø',
+              '&Oslash;' : 'Ø',
+              '&Oslash' : 'Ø',
+              '&oslash' : 'ø',
+              '&otilde;' : 'õ',
+              '&Otilde;' : 'Õ',
+              '&Otilde' : 'Õ',
+              '&otilde' : 'õ',
+              '&otimes;' : '⊗',
+              '&ouml;' : 'ö',
+              '&Ouml;' : 'Ö',
+              '&Ouml' : 'Ö',
+              '&ouml' : 'ö',
+              '&para;' : '¶',
+              '&para' : '¶',
+              '&part;' : '∂',
+              '&permil;' : '‰',
+              '&perp;' : '⊥',
+              '&phi;' : 'φ',
+              '&Phi;' : 'Φ',
+              '&pi;' : 'π',
+              '&Pi;' : 'Π',
+              '&piv;' : 'ϖ',
+              '&plusmn;' : '±',
+              '&plusmn' : '±',
+              '&pound;' : '£',
+              '&pound' : '£',
+              '&prime;' : '′',
+              '&Prime;' : '″',
+              '&prod;' : '∏',
+              '&prop;' : '∝',
+              '&psi;' : 'ψ',
+              '&Psi;' : 'Ψ',
+              '&quot;' : '"',
+              '&QUOT;' : '"',
+              '&QUOT' : '"',
+              '&quot' : '"',
+              '&radic;' : '√',
+              '&raquo;' : '»',
+              '&raquo' : '»',
+              '&rarr;' : '→',
+              '&rArr;' : '⇒',
+              '&rceil;' : '⌉',
+              '&rdquo;' : '”',
+              '&real;' : 'ℜ',
+              '&reg;' : '®',
+              '&REG;' : '®',
+              '&REG' : '®',
+              '&reg' : '®',
+              '&rfloor;' : '⌋',
+              '&rho;' : 'ρ',
+              '&Rho;' : 'Ρ',
+              '&rlm;' : '\u200f', # right-to-left mark
+              '&rsaquo;' : '›',
+              '&rsquo;' : '’',
+              '&sbquo;' : '‚',
+              '&scaron;' : 'š',
+              '&Scaron;' : 'Š',
+              '&sdot;' : '⋅',
+              '&sect;' : '§',
+              '&sect' : '§',
+              '&shy;' : '\xad', # strange optional hyphenation control character, not just a dash
+              '&shy' : '\xad',
+              '&sigma;' : 'σ',
+              '&Sigma;' : 'Σ',
+              '&sigmaf;' : 'ς',
+              '&sim;' : '∼',
+              '&spades;' : '♠',
+              '&sub;' : '⊂',
+              '&sube;' : '⊆',
+              '&sum;' : '∑',
+              '&sup1;' : '¹',
+              '&sup1' : '¹',
+              '&sup2;' : '²',
+              '&sup2' : '²',
+              '&sup3;' : '³',
+              '&sup3' : '³',
+              '&sup;' : '⊃',
+              '&supe;' : '⊇',
+              '&szlig;' : 'ß',
+              '&szlig' : 'ß',
+              '&tau;' : 'τ',
+              '&Tau;' : 'Τ',
+              '&there4;' : '∴',
+              '&theta;' : 'θ',
+              '&Theta;' : 'Θ',
+              '&thetasym;' : 'ϑ',
+              '&thinsp;' : '\u2009', # thin space
+              '&thorn;' : 'þ',
+              '&THORN;' : 'Þ',
+              '&THORN' : 'Þ',
+              '&thorn' : 'þ',
+              '&tilde;' : '˜',
+              '&times;' : '×',
+              '&times' : '×',
+              '&trade;' : '™',
+              '&uacute;' : 'ú',
+              '&Uacute;' : 'Ú',
+              '&Uacute' : 'Ú',
+              '&uacute' : 'ú',
+              '&uarr;' : '↑',
+              '&uArr;' : '⇑',
+              '&ucirc;' : 'û',
+              '&Ucirc;' : 'Û',
+              '&Ucirc' : 'Û',
+              '&ucirc' : 'û',
+              '&ugrave;' : 'ù',
+              '&Ugrave;' : 'Ù',
+              '&Ugrave' : 'Ù',
+              '&ugrave' : 'ù',
+              '&uml;' : '¨',
+              '&uml' : '¨',
+              '&upsih;' : 'ϒ',
+              '&upsilon;' : 'υ',
+              '&Upsilon;' : 'Υ',
+              '&uuml;' : 'ü',
+              '&Uuml;' : 'Ü',
+              '&Uuml' : 'Ü',
+              '&uuml' : 'ü',
+              '&weierp;' : '℘',
+              '&xi;' : 'ξ',
+              '&Xi;' : 'Ξ',
+              '&yacute;' : 'ý',
+              '&Yacute;' : 'Ý',
+              '&Yacute' : 'Ý',
+              '&yacute' : 'ý',
+              '&yen;' : '¥',
+              '&yen' : '¥',
+              '&yuml;' : 'ÿ',
+              '&Yuml;' : 'Ÿ',
+              '&yuml' : 'ÿ',
+              '&zeta;' : 'ζ',
+              '&Zeta;' : 'Ζ',
+              '&zwj;' : '\u200d', # strange spacing control character, not just a space
+              '&zwnj;' : '\u200c', # strange spacing control character, not just a space
+              }
+
+ class SplitEpub:
+
+     def __init__(self, inputio):
+         self.epub = ZipFile(inputio, 'r')
+         self.content_dom = None
+         self.content_relpath = None
+         self.manifest_items = None
+         self.guide_items = None
+         self.toc_dom = None
+         self.toc_relpath = None
+         self.toc_map = None
+         self.split_lines = None
+         self.origauthors = []
+         self.origtitle = None
+
+     def get_file(self,href):
+         return self.epub.read(href)
+
+     def get_content_dom(self):
+         if not self.content_dom:
+             ## Find the .opf file.
+             container = self.epub.read("META-INF/container.xml")
+             containerdom = parseString(container)
+             rootfilenodelist = containerdom.getElementsByTagName("rootfile")
+             rootfilename = rootfilenodelist[0].getAttribute("full-path")
+
+             self.content_dom = parseString(self.epub.read(rootfilename))
+             self.content_relpath = get_path_part(rootfilename)
+         return self.content_dom
+
+     def get_content_relpath(self):
+         ## Save the path to the .opf file--hrefs inside it are relative to it.
+         if not self.content_relpath:
+             self.get_content_dom() # sets self.content_relpath also.
+         return self.content_relpath
+
+     def get_toc_relpath(self):
+         ## Save the path to the toc.ncx file--hrefs inside it are relative to it.
+         if not self.toc_relpath:
+             self.get_manifest_items() # sets self.toc_relpath also.
+         return self.toc_relpath
+
+     def get_manifest_items(self):
+         if not self.manifest_items:
+             self.manifest_items = {}
+
+             for item in self.get_content_dom().getElementsByTagName("item"):
+                 fullhref=normpath(unquote(self.get_content_relpath()+item.getAttribute("href")))
+                 #print("---- item fullhref:%s"%(fullhref))
+                 self.manifest_items["h:"+fullhref]=(item.getAttribute("id"),item.getAttribute("media-type"))
+                 self.manifest_items["i:"+item.getAttribute("id")]=(fullhref,item.getAttribute("media-type"))
+
+                 if( item.getAttribute("media-type") == "application/x-dtbncx+xml" ):
+                     # TOC file is the only one with this type--as far as I know.
+                     self.toc_relpath = get_path_part(fullhref)
+                     self.toc_dom = parseString(self.epub.read(fullhref))
+
+         return self.manifest_items
+
+     def get_guide_items(self):
+         if not self.guide_items:
+             self.guide_items = {}
+
+             for item in self.get_content_dom().getElementsByTagName("reference"):
+                 fullhref=normpath(unquote(self.get_content_relpath()+item.getAttribute("href")))
+                 self.guide_items[fullhref]=(item.getAttribute("type"),item.getAttribute("title"))
+                 #print("---- reference href:%s value:%s"%(fullhref,self.guide_items[fullhref],))
+                 #self.guide_items[item.getAttribute("type")]=(fullhref,item.getAttribute("media-type"))
+
+         return self.guide_items
+
+     def get_toc_dom(self):
+         if not self.toc_dom:
+             self.get_manifest_items() # also sets self.toc_dom
+         return self.toc_dom
+
+     # dict() of href->[(text,anchor),...],...
+     # eg: "file0001.html"->[("Introduction","anchor01"),("Chapter 1","anchor02")],...
+     def get_toc_map(self):
+         if not self.toc_map:
+             self.toc_map = {}
+             # update all navpoint ids with bookid for uniqueness.
+             for navpoint in self.get_toc_dom().getElementsByTagName("navPoint"):
+                 src = normpath(unquote(self.get_toc_relpath()+navpoint.getElementsByTagName("content")[0].getAttribute("src")))
+                 if '#' in src:
+                     (href,anchor)=src.split("#")
+                 else:
+                     (href,anchor)=(src,None)
+
+                 # The first of these in each navPoint should be the appropriate one.
+                 # (There may be others due to nesting.)
+                 try:
+                     text = unicode(navpoint.getElementsByTagName("text")[0].firstChild.data)
+                 except:
+                     #print("No chapter title found in TOC for (%s)"%src)
+                     text = ""
+
+                 if href not in self.toc_map:
+                     self.toc_map[href] = []
+                 if anchor == None:
+                     # Put file links ahead of anchor links. Otherwise
+                     # a non-linear anchor link may take precedence,
+                     # which will confuse EpubSplit. This will cause
+                     # split lines to possibly be out of order from
+                     # TOC, but the alternative is worse. Should be a
+                     # rare corner case.
+                     ## Keep order of non-anchor entries to the same file.
+                     idx=0
+                     while idx < len(self.toc_map[href]) and self.toc_map[href][idx][1] is None: # [1] is anchor
+                         # print(idx)
+                         # print(self.toc_map[href][idx])
+                         idx = idx+1
+                     self.toc_map[href].insert(idx,(text,anchor))
+                 else:
+                     self.toc_map[href].append((text,anchor))
+             # print(self.toc_map)
+         return self.toc_map
+
+     # list of dicts with href, anchor & toc text.
+     # 'split lines' are all the points that the epub can be split on.
+     # Offer a split at each spine file and each ToC point.
+     def get_split_lines(self):
+
+         metadom = self.get_content_dom()
+         ## Save indiv book title
+         try:
+             self.origtitle = metadom.getElementsByTagName("dc:title")[0].firstChild.data
+         except:
+             self.origtitle = "(Title Missing)"
+
+         ## Save authors.
+         for creator in metadom.getElementsByTagName("dc:creator"):
+             try:
+                 if( creator.getAttribute("opf:role") == "aut" or not creator.hasAttribute("opf:role") and creator.firstChild != None):
+                     if creator.firstChild.data not in self.origauthors:
+                         self.origauthors.append(creator.firstChild.data)
+             except:
+                 pass
+         if len(self.origauthors) == 0:
+             self.origauthors.append("(Authors Missing)")
+
+         self.split_lines = [] # list of dicts with href, anchor and toc
+         # spin on spine files.
+         count=0
+         for itemref in metadom.getElementsByTagName("itemref"):
+             idref = itemref.getAttribute("idref")
+             (href,type) = self.get_manifest_items()["i:"+idref]
+             current = {}
+             self.split_lines.append(current)
+             current['href']=href
+             current['anchor']=None
+             current['toc'] = []
+             if href in self.get_guide_items():
+                 current['guide'] = self.get_guide_items()[href]
+             current['id'] = idref
+             current['type'] = type
+             current['num'] = count
+             t=self.epub.read(href).decode('utf-8')
+             if len(t) > 1500 : t = t[:1500] + "..."
+             current['sample']=t
+             count += 1
+             #print("spine:%s->%s"%(idref,href))
+
+             # if href is in the toc.
+             if href in self.get_toc_map():
+                 # For each toc entry, check to see if there's an anchor; if so,
+                 # make a new split line.
+                 for tocitem in self.get_toc_map()[href]:
+                     (text,anchor) = tocitem
+                     # XXX for outputting to screen in CLI--hopefully won't need in plugin?
+                     try:
+                         text = "%s"%text
+                     except:
+                         text = "(error text)"
+
+                     if anchor:
+                         #print("breakpoint: %d"%count)
+                         current = {}
+                         self.split_lines.append(current)
+                         current['href']=href
+                         current['anchor']=anchor
+                         current['toc']=[]
+                         current['id'] = idref
+                         current['type'] = type
+                         current['num'] = count
+                         # anchor, need to split first, then reduce to 1500.
+                         t=splitHtml(self.epub.read(href).decode('utf-8'),anchor,before=False)
+                         if len(t) > 1500 : t = t[:1500] + "..."
+                         current['sample']=t
+                         count += 1
+                     # There can be more than one toc to the same split line.
+                     # This won't find multiple toc to the same anchor yet.
+                     current['toc'].append(text)
+                     #print("\ttoc:'%s' %s#%s"%(text,href,anchor))
+         return self.split_lines
+
+     # pass in list of line numbers(?)
+     def get_split_files(self,linenums):
+
+         self.filecache = FileCache(self.get_manifest_items())
+
+         # set include flag in split_lines.
+         if not self.split_lines:
+             self.get_split_lines()
+         lines = self.split_lines
+
+         lines_set = set([int(k) for k in linenums])
+         for j in range(len(lines)):
+             lines[j]['include'] = j in lines_set
+
+         # loop through finding 'chunks' -- contiguous pieces in the
+         # same file. Each included file is at least one chunk, but if
+         # parts are left out, one original file can end up being more
+         # than one chunk.
+         outchunks = [] # list of tuples=(filename,start,end) 'end' is not inclusive.
+         inchunk = False
+         currentfile = None
+         start = None
+         for line in lines:
+             if line['include']:
+                 if not inchunk: # start new chunk
+                     inchunk = True
+                     currentfile = line['href']
+                     start = line
+                 else: # inchunk
+                     # different file, new chunk.
+                     if currentfile != line['href']:
+                         outchunks.append((currentfile,start,line))
+                         inchunk=True
+                         currentfile=line['href']
+                         start=line
+             else: # not include
+                 if inchunk: # save previous chunk.
+                     outchunks.append((currentfile,start,line))
+                     inchunk=False
+
+         # final chunk for when last in list is include.
+         if inchunk:
+             outchunks.append((currentfile,start,None))
+
+         outfiles=[] # tuples, (filename,type,data) -- filename changed to unique
+         for (href,start,end) in outchunks:
+             filedata = self.epub.read(href).decode('utf-8')
+
+             # discard before start if anchor.
+             if start['anchor'] != None:
+                 filedata = splitHtml(filedata,start['anchor'],before=False)
+
+             # discard from end anchor on (inclusive), but only if same file. If
+             # different file, keep rest of file. If no 'end', then it was the
+             # last chunk and went to the end of the last file.
+             if end != None and end['anchor'] != None and end['href']==href:
+                 filedata = splitHtml(filedata,end['anchor'],before=True)
+
+             filename = self.filecache.add_content_file(href,filedata)
+             outfiles.append([filename,start['id'],start['type'],filedata])
+
+         # print("self.oldnew:%s"%self.filecache.oldnew)
+         # print("self.newold:%s"%self.filecache.newold)
+         # print("\nanchors:%s\n"%self.filecache.anchors)
+         # print("\nlinkedfiles:%s\n"%self.filecache.linkedfiles)
+         # print("relpath:%s"%get_path_part())
+
+         # Spin through to replace internal URLs
+         for fl in outfiles:
+             #print("file:%s"%fl[0])
+             soup = BeautifulSoup(fl[3],'html5lib')
+             changed = False
+             for a in soup.findAll('a'):
+                 if a.has_attr('href'):
+                     path = normpath(unquote("%s%s"%(get_path_part(fl[0]),a['href'])))
+                     #print("full a['href']:%s"%path)
+                     if path in self.filecache.anchors and self.filecache.anchors[path] != path:
+                         a['href'] = self.filecache.anchors[path][len(get_path_part(fl[0])):]
+                         #print("replacement path:%s"%a['href'])
+                         changed = True
+             if changed:
+                 fl[3] = unicode(soup)
+
+         return outfiles
+
+     def write_split_epub(self,
+                          outputio,
+                          linenums,
+                          changedtocs={},
+                          authoropts=[],
+                          titleopt=None,
+                          descopt=None,
+                          tags=[],
+                          languages=['en'],
+                          coverjpgpath=None):
+
+         files = self.get_split_files(linenums)
+
+         ## Write mimetype file, must be first and uncompressed.
+         ## Older versions of python(2.4/5) don't allow you to specify
+         ## compression by individual file.
+         ## Overwrite if existing output file.
+         outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
+         outputepub.debug = 3
+         outputepub.writestr("mimetype", "application/epub+zip")
+         outputepub.close()
+
+         ## Re-open file for content.
+         outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
+         outputepub.debug = 3
+
+         ## Create META-INF/container.xml file. The only thing it does is
+         ## point to content.opf
+         containerdom = getDOMImplementation().createDocument(None, "container", None)
+         containertop = containerdom.documentElement
+         containertop.setAttribute("version","1.0")
+         containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
+         rootfiles = containerdom.createElement("rootfiles")
+         containertop.appendChild(rootfiles)
+         rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
+                                                               "media-type":"application/oebps-package+xml"}))
+         outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent='   ',encoding='utf-8'))
+
+         ## create content.opf file.
+         uniqueid="epubsplit-uid-%d" % time() # real sophisticated uid scheme.
+         contentdom = getDOMImplementation().createDocument(None, "package", None)
+         package = contentdom.documentElement
+
+         package.setAttribute("version","2.0")
+         package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
+         package.setAttribute("unique-identifier","epubsplit-id")
+         metadata=newTag(contentdom,"metadata",
+                         attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
+                                "xmlns:opf":"http://www.idpf.org/2007/opf"})
+         metadata.appendChild(newTag(contentdom,"dc:identifier",text=uniqueid,attrs={"id":"epubsplit-id"}))
+         if( titleopt is None ):
+             titleopt = self.origtitle+" Split"
+         metadata.appendChild(newTag(contentdom,"dc:title",text=titleopt))
+
+         if( authoropts and len(authoropts) > 0 ):
+             useauthors=authoropts
+         else:
+             useauthors=self.origauthors
+
+         usedauthors=dict()
+         for author in useauthors:
+             if( author not in usedauthors ):
+                 usedauthors[author]=author
+                 metadata.appendChild(newTag(contentdom,"dc:creator",
+                                             attrs={"opf:role":"aut"},
+                                             text=author))
+
+         metadata.appendChild(newTag(contentdom,"dc:contributor",text="epubsplit",attrs={"opf:role":"bkp"}))
+         metadata.appendChild(newTag(contentdom,"dc:rights",text="Copyrights as per source stories"))
+
+         if languages:
+             for l in languages:
+                 metadata.appendChild(newTag(contentdom,"dc:language",text=l))
+         else:
+             metadata.appendChild(newTag(contentdom,"dc:language",text="en"))
+
+         if not descopt:
+             # created now, but not filled in until TOC generation to save loops.
+             description = newTag(contentdom,"dc:description",text="Split from %s by %s."%(self.origtitle,", ".join(self.origauthors)))
+         else:
+             description = newTag(contentdom,"dc:description",text=descopt)
+         metadata.appendChild(description)
+
+         for tag in tags:
+             metadata.appendChild(newTag(contentdom,"dc:subject",text=tag))
+
+         package.appendChild(metadata)
+
+         manifest = contentdom.createElement("manifest")
+         package.appendChild(manifest)
+         spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
+         package.appendChild(spine)
+
+         manifest.appendChild(newTag(contentdom,"item",
+                                     attrs={'id':'ncx',
+                                            'href':'toc.ncx',
+                                            'media-type':'application/x-dtbncx+xml'}))
+
+         if coverjpgpath:
+             # <meta name="cover" content="cover.jpg"/>
+             metadata.appendChild(newTag(contentdom,"meta",{"name":"cover",
+                                                            "content":"coverimageid"}))
+             # cover stuff for later:
+             # at end of <package>:
+             # <guide>
+             #   <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
+             # </guide>
+             guide = newTag(contentdom,"guide")
+             guide.appendChild(newTag(contentdom,"reference",attrs={"type":"cover",
+                                                                    "title":"Cover",
+                                                                    "href":"cover.xhtml"}))
+             package.appendChild(guide)
+
+             manifest.appendChild(newTag(contentdom,"item",
+                                         attrs={'id':"coverimageid",
+                                                'href':"cover.jpg",
+                                                'media-type':"image/jpeg"}))
+
+             # Note that the id of the cover xhtml *must* be 'cover'
+             # for it to work on Nook.
+             manifest.appendChild(newTag(contentdom,"item",
+                                         attrs={'id':"cover",
+                                                'href':"cover.xhtml",
+                                                'media-type':"application/xhtml+xml"}))
+
+             spine.appendChild(newTag(contentdom,"itemref",
+                                      attrs={"idref":"cover",
+                                             "linear":"yes"}))
+
+         contentcount=0
+         for (filename,id,type,filedata) in files:
+             #filename = self.filecache.addHtml(href,filedata)
+             #print("writing :%s"%filename)
+             # add to manifest and spine
+
+             if coverjpgpath and filename == "cover.xhtml":
+                 continue # don't dup cover.
+
+             outputepub.writestr(filename,filedata.encode('utf-8'))
+             id = "a%d"%contentcount
+             contentcount += 1
+             manifest.appendChild(newTag(contentdom,"item",
+                                         attrs={'id':id,
+                                                'href':filename,
+                                                'media-type':type}))
+             spine.appendChild(newTag(contentdom,"itemref",
+                                      attrs={"idref":id,
+                                             "linear":"yes"}))
+
+         fontdecrypter = FontDecrypter(self.epub,self.get_content_dom())
+         linked=''
+         for (linked,type) in self.filecache.linkedfiles:
+             # print("linked files:(%s,%s)"%(linked,type))
+             # add to manifest
+             if coverjpgpath and linked == "cover.jpg":
+                 continue # don't dup cover.
+
+             try:
+                 linkeddata = self.get_file(linked)
+                 if linked in fontdecrypter.get_encrypted_fontfiles():
+                     print("Decrypting font file: %s"%linked)
+                     linkeddata = fontdecrypter.get_decrypted_font_data(linked)
+                 outputepub.writestr(linked,linkeddata)
+             except Exception as e:
+                 print("Skipping linked file (%s)\nException: %s"%(linked,e))
+
+             id = "a%d"%contentcount
+             contentcount += 1
+             manifest.appendChild(newTag(contentdom,"item",
+                                         attrs={'id':id,
+                                                'href':linked,
+                                                'media-type':type}))
+
+         contentxml = contentdom.toprettyxml(indent='   ') # ,encoding='utf-8'
+         # tweak for brain damaged Nook STR. Nook insists on name before content.
+         contentxml = contentxml.replace('<meta content="coverimageid" name="cover"/>',
+                                         '<meta name="cover" content="coverimageid"/>')
+         outputepub.writestr("content.opf",contentxml)
+
+         ## create toc.ncx file
+         tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
+         ncx = tocncxdom.documentElement
+         ncx.setAttribute("version","2005-1")
+         ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/")
+         head = tocncxdom.createElement("head")
+         ncx.appendChild(head)
+         head.appendChild(newTag(tocncxdom,"meta",
+                                 attrs={"name":"dtb:uid", "content":uniqueid}))
+         depthnode = newTag(tocncxdom,"meta",
+                            attrs={"name":"dtb:depth", "content":"1"})
+         head.appendChild(depthnode)
+         head.appendChild(newTag(tocncxdom,"meta",
+                                 attrs={"name":"dtb:totalPageCount", "content":"0"}))
+         head.appendChild(newTag(tocncxdom,"meta",
+                                 attrs={"name":"dtb:maxPageNumber", "content":"0"}))
+
+         docTitle = tocncxdom.createElement("docTitle")
+         docTitle.appendChild(newTag(tocncxdom,"text",text=stripHTML(titleopt)))
+         ncx.appendChild(docTitle)
+
+         tocnavMap = tocncxdom.createElement("navMap")
+         ncx.appendChild(tocnavMap)
+
+         # come back to lines again for TOC because files only has files (gasp-shock!)
+         count=1
+         for line in self.split_lines:
+             if line['include']:
+                 # if changed, use only changed values.
+                 if line['num'] in changedtocs:
+                     line['toc'] = changedtocs[line['num']]
+                 # can have more than one toc entry.
+                 for title in line['toc']:
+                     newnav = newTag(tocncxdom,"navPoint",
+                                     {"id":"a%03d"%count,"playOrder":"%d" % count})
+                     count += 1
+                     tocnavMap.appendChild(newnav)
+                     navlabel = newTag(tocncxdom,"navLabel")
+                     newnav.appendChild(navlabel)
+                     # For purposes of TOC titling & desc, use first book author
+                     navlabel.appendChild(newTag(tocncxdom,"text",text=stripHTML(title)))
+                     # Find the first 'spine' item's content for the title navpoint.
+                     # Many epubs have the first chapter as first navpoint, so we can't just
+                     # copy that anymore.
+                     if line['anchor'] and line['href']+"#"+line['anchor'] in self.filecache.anchors:
+                         src = self.filecache.anchors[line['href']+"#"+line['anchor']]
+                         #print("toc from anchors(%s#%s)(%s)"%(line['href'],line['anchor'],src))
+                     else:
+                         #print("toc from href(%s)"%line['href'])
+                         src = line['href']
+                     newnav.appendChild(newTag(tocncxdom,"content",
+                                               {"src":src}))
+
+         outputepub.writestr("toc.ncx",tocncxdom.toprettyxml(indent=' ',encoding='utf-8'))
+
+         if coverjpgpath:
+             # write, not writestr. Pulling from file.
+             outputepub.write(coverjpgpath,"cover.jpg")
+
+             outputepub.writestr("cover.xhtml",'''
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head><title>Cover</title><style type="text/css" title="override_css">
+ @page {padding: 0pt; margin:0pt}
+ body { text-align: center; padding:0pt; margin: 0pt; }
+ div { margin: 0pt; padding: 0pt; }
+ </style></head><body><div>
+ <img src="cover.jpg" alt="cover"/>
+ </div></body></html>
+ ''')
+
+         # declares all the files created by Windows. otherwise, when
+         # it runs in appengine, windows unzips the files as 000 perms.
+         for zf in outputepub.filelist:
+             zf.create_system = 0
+         outputepub.close()
+
+ class FileCache:
+
+     def __init__(self,manifest_items={}):
+         self.manifest_items = manifest_items
+         self.oldnew = {}
+         self.newold = {}
+         self.anchors = {}
+         self.linkedfiles = set()
+
+         ## always include font files for embedded fonts
+         for key, value in six.iteritems(self.manifest_items):
+             # print("manifest:%s %s"%(key,value))
+             if key.startswith('i:') and value[1] in ('application/vnd.ms-opentype',
+                                                      'application/x-font-ttf',
+                                                      'application/x-font-truetype',
+                                                      'application/font-sfnt'):
+                 self.add_linked_file(value[0])
+
+     def add_linked_file(self, href):
+         href = normpath(unquote(href)) # fix %20 & /../
+         if ("h:"+href) in self.manifest_items:
+             type = self.manifest_items["h:"+href][1]
+         else:
+             type = 'unknown'
+         self.linkedfiles.add((href,type))
+
+     def add_content_file(self, href, filedata):
+
+         changedname = False
+         if href not in self.oldnew:
+             self.oldnew[href]=[]
+             newfile = href
+         else:
+             changedname = True
+             newfile = "%s%d-%s"%(get_path_part(href),
+                                  len(self.oldnew[href]),
+                                  get_file_part(href))
+
+         self.oldnew[href].append(newfile)
+         self.newold[newfile]=href
+         #print("newfile:%s"%newfile)
+
+         soup = BeautifulSoup(filedata,'html5lib')
+         #print("soup head:%s"%soup.find('head'))
+
+         # same name? Don't need to worry about changing links to anchors
+         for a in soup.findAll(): # not just 'a', any tag.
+             #print("a:%s"%a)
+             if a.has_attr('id'):
+                 self.anchors[href+'#'+a['id']]=newfile+'#'+a['id']
+
+         # <image> from baen epub.
+         # <image width="462" height="616" xlink:href="cover.jpeg"/>
+         for img in soup.findAll('img') + soup.findAll('image'):
+             src = None
+             if img.has_attr('src'):
+                 src=img['src']
+             if img.has_attr('xlink:href'):
+                 src=img['xlink:href']
+             if src:
+                 self.add_linked_file(get_path_part(href)+src)
+             else:
+                 logger.info("img tag without src in file:(%s) tag:(%s)"%(href,img))
+
+         # link href="0.css" type="text/css"
+         for style in soup.findAll('link',{'type':'text/css'}):
+             #print("link:%s"%style)
+             if style.has_attr('href'):
+                 self.add_linked_file(get_path_part(href)+style['href'])
+
+         return newfile
+
+ def splitHtml(data,tagid,before=False):
+     soup = BeautifulSoup(data,'lxml')
+     #print("splitHtml.soup head:%s"%soup.find('head'))
+
+     splitpoint = soup.find(id=tagid)
+
+     #print("splitpoint:%s"%splitpoint)
+
+     if splitpoint == None:
+         return data
+
+     if before:
+         # remove all next siblings.
+         for n in splitpoint.findNextSiblings():
+             n.extract()
+
+         parent = splitpoint.parent
+         while parent and parent.name != 'body':
+             for n in parent.findNextSiblings():
+                 n.extract()
+             parent = parent.parent
+
+         splitpoint.extract()
+     else:
+         # remove all previous siblings.
+         for n in splitpoint.findPreviousSiblings():
+             n.extract()
+
+         parent = splitpoint.parent
+         while parent and parent.name != 'body':
+             for n in parent.findPreviousSiblings():
+                 n.extract()
+             parent = parent.parent
+
+     return re.sub(r'( *\r?\n)+','\r\n',unicode(soup))
+
+ def get_path_part(n):
+     relpath = os.path.dirname(n)
+     if( len(relpath) > 0 ):
+         relpath=relpath+"/"
+     return relpath
+
+ def get_file_part(n):
+     return os.path.basename(n)
+
+ ## Utility method for creating new tags.
+ def newTag(dom,name,attrs=None,text=None):
+     tag = dom.createElement(name)
+     if( attrs is not None ):
+         for attr in attrs.keys():
+             tag.setAttribute(attr,attrs[attr])
+     if( text is not None ):
+         tag.appendChild(dom.createTextNode(text))
+     return tag
+
+ def main(argv,usage=None):
+
+     from optparse import OptionParser
+
+     if not usage:
+         # read in args, anything starting with -- will be treated as --<variable>=<value>
+         usage = 'usage: python %prog'
+
+     parser = OptionParser(usage+''' [options] <input epub> [line numbers...]
+
+ Giving an epub without line numbers will return a list of line numbers: the
+ possible split points in the input file. Calling with line numbers will
+ generate an epub with each of the "lines" given included.''')
+
+     parser.add_option("-o", "--output", dest="outputopt", default="split.epub",
+                       help="Set OUTPUT file, Default: split.epub", metavar="OUTPUT")
+     parser.add_option("--output-dir", dest="outputdiropt",
+                       help="Set OUTPUT directory, Default: present working directory")
+     parser.add_option('--split-by-section',
+                       action='store_true', dest='split_by_section',
+                       help='Create a new epub from each of the listed line sections instead of one containing all. Splits all sections if no line numbers are given. Each split will be named <number>-<output name> and placed in the output-dir. Sections without a Table of Contents entry will be included with the preceding section(s).', )
+     parser.add_option("-t", "--title", dest="titleopt", default=None,
+                       help="Use TITLE as the metadata title. Default: '<original epub title> Split' or ToC entry with --split-by-section", metavar="TITLE")
+     parser.add_option("-d", "--description", dest="descopt", default=None,
+                       help="Use DESC as the metadata description. Default: 'Split from <epub title> by <author>'.", metavar="DESC")
+     parser.add_option("-a", "--author",
+                       action="append", dest="authoropts", default=[],
+                       help="Use AUTHOR as a metadata author, multiple authors may be given, Default: <All authors from original epub>", metavar="AUTHOR")
+     parser.add_option("-g", "--tag",
+                       action="append", dest="tagopts", default=[],
+                       help="Include TAG as dc:subject tag, multiple tags may be given, Default: None", metavar="TAG")
+     parser.add_option("-l", "--language",
+                       action="append", dest="languageopts", default=[],
+                       help="Include LANG as dc:language tag, multiple languages may be given, Default: en", metavar="LANG")
+     parser.add_option("-c", "--cover", dest="coveropt", default=None,
+                       help="Path to a jpg to use as cover image.", metavar="COVER")
+
+     (options, args) = parser.parse_args(argv)
+
+     ## Add .epub if not already there.
+     if not options.outputopt.lower().endswith(".epub"):
+         options.outputopt=options.outputopt+".epub"
+
+     if not options.languageopts:
+         options.languageopts = ['en']
+
+     if not args:
+         parser.print_help()
+         return
+
+     epubO = SplitEpub(args[0])
+
+     lines = epubO.get_split_lines()
+
+     if options.split_by_section:
+         if len(args) > 1:
+             section_lines = args[1:]
+         else:
+             section_lines = range(len(lines))
+
+         splitslist = []
+         sectionlist = []
+         title=None
+         for lineno in section_lines:
+             toclist = lines[int(lineno)]['toc']
+             if sectionlist and not toclist:
+                 # no ToC entry: include with the preceding section, which is
+                 # already in splitslist and grows in place by reference.
+                 sectionlist.append(lineno)
+             else:
+                 ## take title from (first) ToC if available, else titleopt (_ Split internally if None)
+                 title = (toclist[0] if toclist else options.titleopt)
+                 print("title: %s"%title)
+                 sectionlist=[lineno]
+                 splitslist.append((sectionlist,title))
+         # print(splitslist)
+
+         filecount = 1
+         for sectionlist, title in splitslist:
+             outputfile = "%0.4d-%s"%(filecount,options.outputopt)
+             if options.outputdiropt:
+                 outputfile = os.path.join(options.outputdiropt,outputfile)
+             print("output file: "+outputfile)
+             epubO.write_split_epub(outputfile,
+                                    sectionlist,
+                                    authoropts=options.authoropts,
+                                    titleopt=title,
+                                    descopt=options.descopt,
+                                    tags=options.tagopts,
+                                    languages=options.languageopts,
+                                    coverjpgpath=options.coveropt)
+             filecount+=1
+         return
+     elif len(args) == 1:
+         count = 0
+         showlist=['toc','guide','anchor','id','href']
+         for line in lines:
+             print("\nLine Number: %d"%count)
+             for s in showlist:
+                 if s in line and line[s]:
+                     print("\t%s: %s"%(s,line[s]))
+             count += 1
+         return
+
+     if len(args) > 1:
+         outputfile = options.outputopt
+         if options.outputdiropt:
+             outputfile = os.path.join(options.outputdiropt,outputfile)
+         print("output file: "+outputfile)
+         epubO.write_split_epub(outputfile,
+                                args[1:],
+                                authoropts=options.authoropts,
+                                titleopt=options.titleopt,
+                                descopt=options.descopt,
+                                tags=options.tagopts,
+                                languages=options.languageopts,
+                                coverjpgpath=options.coveropt)
+
+     return
+
+ if __name__ == "__main__":
+     main(sys.argv[1:])
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ gradio
+ requests
+ beautifulsoup4
+ ebooklib
+ huggingface_hub
+ datasets
+ hf-transfer
+ protobuf
+ click
+ pydantic
+ torch
+ uvicorn
+ html5lib
+ lxml
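
For reference, the split machinery can also be exercised without the Gradio app or the calibre plugin. A minimal sketch of driving SplitEpub from epubsplit.py directly; the epub file name is a placeholder, and [0, 1] assumes the book has at least two split lines:

    import io
    from epubsplit import SplitEpub

    with open("book.epub", "rb") as f:  # placeholder input epub
        splitter = SplitEpub(f)
        # List every possible split point (one per spine file or ToC anchor).
        for line in splitter.get_split_lines():
            print(line['num'], line['toc'], line['href'])
        # Write a new epub containing only the first two split lines.
        out = io.BytesIO()
        splitter.write_split_epub(out, [0, 1])
        with open("output.epub", "wb") as o:
            o.write(out.getvalue())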