#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Split an EPUB into a new EPUB containing only selected 'split lines'.

A split line is either a spine file or a ToC anchor inside a spine file.
Run with only an input epub to list the available split lines; run with
line numbers to write a new epub made of those lines.
"""

__license__ = 'GPL v3'
__copyright__ = '2021, Jim Miller'
__docformat__ = 'restructuredtext en'

import copy
import hashlib
import logging
import os
import re
import sys
import traceback
import uuid
from itertools import cycle
from posixpath import normpath
from time import time
from xml.dom.minidom import parse, parseString, getDOMImplementation, Element
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED

import six
from six import string_types, text_type as unicode
from six import unichr
from six.moves import html_entities
from six.moves.urllib.parse import unquote

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

## font decoding code lifted from
## calibre/src/calibre/ebooks/conversion/plugins/epub_input.py
## copyright '2009, Kovid Goyal'
## don't bug Kovid about this use of it.

ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'

# Media types of manifest items that are always carried into the split epub.
FONT_MEDIA_TYPES = ('application/vnd.ms-opentype',
                    'application/x-font-ttf',
                    'application/x-font-truetype',
                    'application/font-sfnt')

# Longest text sample stored for each split line.
SAMPLE_LENGTH = 1500

# NOTE(review): the original cover page markup was lost when this file was
# extracted (only the title text "Cover" survived).  This is a minimal
# replacement that shows cover.jpg -- confirm against the upstream template.
COVER_XHTML = '''<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head><title>Cover</title></head>
<body><div><img src="cover.jpg" alt="cover"/></div></body>
</html>
'''


class FontDecrypter:
    """De-obfuscate embedded fonts listed in META-INF/encryption.xml.

    epub is the opened input ZipFile; content_dom is the parsed .opf
    document, used to find the uuid the obfuscation keys derive from.
    """

    def __init__(self, epub, content_dom):
        self.epub = epub
        self.content_dom = content_dom
        self.encryption = {}
        # Separate flag: an epub without obfuscated fonts leaves
        # self.encryption empty, which must not trigger a re-parse per call.
        self._encryption_loaded = False
        self.old_uuid = None

    def get_file(self, href):
        """Return the raw bytes of zip member href."""
        return self.epub.read(href)

    def get_encrypted_fontfiles(self):
        """Return {uri: algorithm} for every obfuscated file in the epub.

        Only the Adobe and IDPF obfuscation algorithms are recorded; any
        other algorithm is reported and ignored.  An epub with no
        encryption.xml yields an empty dict.
        """
        if not self._encryption_loaded:
            self._encryption_loaded = True
            try:
                # encryption.xml holds enc:EncryptedData elements, each with
                # an enc:EncryptionMethod (Algorithm attribute) and one or
                # more enc:CipherReference elements (URI attribute).
                encryptiondom = parseString(
                    self.epub.read("META-INF/encryption.xml"))
                for encdata in encryptiondom.getElementsByTagName('enc:EncryptedData'):
                    algorithm = encdata.getElementsByTagName(
                        'enc:EncryptionMethod')[0].getAttribute('Algorithm')
                    if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
                        print("Unknown font encryption: %s" % algorithm)
                        continue
                    for encref in encdata.getElementsByTagName('enc:CipherReference'):
                        self.encryption[encref.getAttribute('URI')] = algorithm
            except KeyError:
                # ZipFile.read raises KeyError when the member is absent.
                self.encryption = {}
        return self.encryption

    def get_old_uuid(self):
        """Return the text of the uuid-scheme unique identifier, or None."""
        if not self.old_uuid:
            contentdom = self.content_dom
            uidkey = contentdom.getElementsByTagName(
                "package")[0].getAttribute("unique-identifier")
            for dcid in contentdom.getElementsByTagName("dc:identifier"):
                if (dcid.getAttribute("id") == uidkey
                        and dcid.getAttribute("opf:scheme") == "uuid"):
                    self.old_uuid = dcid.firstChild.data
        return self.old_uuid

    def _require_old_uuid(self):
        """Return the uuid identifier or raise ValueError if there is none."""
        old_uuid = self.get_old_uuid()
        if not old_uuid:
            raise ValueError(
                "No uuid unique-identifier found; cannot derive font key")
        return old_uuid

    def get_idpf_key(self):
        """Return the IDPF key: SHA-1 of the identifier minus whitespace."""
        # idpf key:urn:uuid:221c69fe-29f3-4cb4-bb3f-58c430261cc6
        idpf_key = re.sub(r'[\x20\x09\x0d\x0a]', '', self._require_old_uuid())
        return hashlib.sha1(idpf_key.encode('utf-8')).digest()

    def get_adobe_key(self):
        """Return the Adobe key: the 16 raw bytes of the uuid."""
        # adobe key:221c69fe-29f3-4cb4-bb3f-58c430261cc6
        adobe_key = self._require_old_uuid().rpartition(':')[-1]  # skip urn:uuid:
        return uuid.UUID(adobe_key).bytes

    def get_decrypted_font_data(self, uri):
        """Return the bytes of uri, de-obfuscated if it is listed as encrypted."""
        font_data = self.get_file(uri)
        algorithm = self.get_encrypted_fontfiles().get(uri)
        if algorithm is not None:
            if algorithm == ADOBE_OBFUSCATION:
                key = self.get_adobe_key()
            else:
                key = self.get_idpf_key()
            font_data = self.decrypt_font_data(key, font_data, algorithm)
        return font_data

    def decrypt_font_data(self, key, data, algorithm):
        """XOR the obfuscated prefix of data with the repeating key.

        The prefix is 1024 bytes for Adobe and 1040 bytes for IDPF; the
        remainder of the file is returned unchanged.
        """
        crypt_len = 1024 if algorithm == ADOBE_OBFUSCATION else 1040
        crypt = bytearray(data[:crypt_len])
        keystream = cycle(iter(bytearray(key)))
        decrypt = bytes(bytearray(x ^ next(keystream) for x in crypt))
        return decrypt + data[crypt_len:]


# The same brokenish entity parsing in SGMLParser that inserts ';' after
# non-entities will also insert ';' incorrectly after number entities,
# including part of the next word if it's a-f.  Group 2 captures those
# wrongly swallowed characters so they can be put back.
# Also allows 5 digit decimal entities and 2 digit hex entities.
_NUMBER_ENTITY_RE = re.compile(
    r'&#(x[0-9a-fA-F]{,4}|[0-9]{,5})([0-9a-fA-F]*?);')

# not just \w or \S.  regexp from sgmllib.py (or equiv), SGMLParser, entityref
_NOT_ENTITY_RE = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')


def _unirepl(match):
    "Return the unicode string for a decimal or hex numeric entity match"
    number = match.group(1)
    if number.startswith('x'):
        radix = 16
        digits = number[1:]
    else:
        radix = 10
        digits = number
    try:
        return "%s%s" % (unichr(int(digits, radix)), match.group(2))
    except (ValueError, OverflowError):
        # This way, at least if there's more of entities out there
        # that fail, it doesn't blow the entire download.
        print("Numeric entity translation failed, skipping: &#%s%s"
              % (number, match.group(2)))
        return ""


def _replaceNumberEntities(data):
    """Replace numeric character references in data with their characters."""
    return _NUMBER_ENTITY_RE.sub(_unirepl, data)


def _replaceNotEntities(data):
    """Drop the ';' from anything that still looks like a named entity."""
    return _NOT_ENTITY_RE.sub(r'&\1', data)


def stripHTML(soup):
    """Return soup as plain text: tags removed, all entities decoded."""
    return removeAllEntities(re.sub(r'<[^>]+>', '', "%s" % soup)).strip()


def conditionalRemoveEntities(value):
    """Apply removeEntities to strings; return any other value unchanged."""
    if isinstance(value, string_types):
        return removeEntities(value).strip()
    return value


def removeAllEntities(text):
    """Like removeEntities, but also decode &lt; &gt; and &amp;."""
    return (removeEntities(text)
            .replace('&lt;', '<')
            .replace('&gt;', '>')
            .replace('&amp;', '&'))


def removeEntities(text):
    """Decode every entity in text except &lt; &gt; and &amp;.

    None yields ""; a non-string yields str(text).
    """
    if text is None:
        return ""
    if not isinstance(text, string_types):
        return str(text)

    try:
        t = unicode(text)
    except UnicodeEncodeError:
        try:
            t = text.encode('ascii', 'xmlcharrefreplace')
        except UnicodeEncodeError:
            t = text
    text = t

    # replace numeric versions of [&<>] with named versions,
    # then replace named versions with actual characters,
    text = re.sub(r'&#0*38;', '&amp;', text)
    text = re.sub(r'&#0*60;', '&lt;', text)
    text = re.sub(r'&#0*62;', '&gt;', text)

    # replace remaining numeric entities with unicode value
    text = _replaceNumberEntities(text)

    # replace named entities with their characters.  Longest-first order
    # puts an entity with ';' before the same one without.
    for entity in _ENTITIES_LONGEST_FIRST:
        value = entities[entity]
        try:
            text = text.replace(entity, value)
        except UnicodeDecodeError:
            text = text.replace(entity, value.decode('utf-8'))

    # SGMLParser, and in turn, BeautifulStoneSoup doesn't parse
    # entities terribly well and inserts (;) after something that
    # it thinks might be an entity.  AT&T becomes AT&T;  At this
    # point there should be *no* real entities left, so finding
    # these not-entities and removing the ';' should be safe.
    text = _replaceNotEntities(text)

    # &lt; &gt; and &amp; are the only html entities allowed in xhtml,
    # put those back.  (_replaceNotEntities stripped their ';' above.)
    return (text.replace('&', '&amp;')
            .replace('&amp;lt', '&lt;')
            .replace('&amp;gt', '&gt;'))


def _build_entity_table():
    """Return {entity text: character} for the HTML 4 named entities.

    lt and gt are left out so removeEntities keeps them escaped.  Latin-1
    entities plus amp and quot are also recognised without the ';'.
    """
    # NOTE(review): the hand-written table was reduced to an identity map
    # by the extraction that produced this copy, so it is rebuilt from the
    # standard library.  Confirm that the space-like entities (nbsp, ensp,
    # emsp, thinsp) and shy mapped to the same characters upstream.
    table = {}
    for name, codepoint in html_entities.name2codepoint.items():
        if name in ('lt', 'gt'):
            continue
        char = unichr(codepoint)
        table['&%s;' % name] = char
        if 160 <= codepoint <= 255 or name in ('amp', 'quot'):
            table['&%s' % name] = char
    for name in ('amp', 'copy', 'quot', 'reg'):
        char = table['&%s;' % name]
        table['&%s;' % name.upper()] = char
        table['&%s' % name.upper()] = char
    return table


entities = _build_entity_table()
_ENTITIES_LONGEST_FIRST = sorted(entities, reverse=True)


class SplitEpub:
    """Read an epub and write new epubs from selected parts of it.

    inputio is a path or file-like object accepted by zipfile.ZipFile.
    """

    def __init__(self, inputio):
        self.epub = ZipFile(inputio, 'r')
        self.content_dom = None
        self.content_relpath = None
        self.manifest_items = None
        self.guide_items = None
        self.toc_dom = None
        self.toc_relpath = None
        self.toc_map = None
        self.split_lines = None
        self.origauthors = []
        self.origtitle = None

    def close(self):
        """Close the input epub."""
        self.epub.close()

    def get_file(self, href):
        """Return the raw bytes of zip member href."""
        return self.epub.read(href)

    def get_content_dom(self):
        """Return the parsed .opf document named by META-INF/container.xml."""
        if self.content_dom is None:
            ## Find the .opf file.
            containerdom = parseString(self.epub.read("META-INF/container.xml"))
            rootfilenodelist = containerdom.getElementsByTagName("rootfile")
            rootfilename = rootfilenodelist[0].getAttribute("full-path")
            self.content_dom = parseString(self.epub.read(rootfilename))
            self.content_relpath = get_path_part(rootfilename)
        return self.content_dom

    def get_content_relpath(self):
        """Return the .opf directory; hrefs inside it are relative to it."""
        if self.content_relpath is None:
            self.get_content_dom()  # sets self.content_relpath also.
        return self.content_relpath

    def get_toc_relpath(self):
        """Return the toc.ncx directory (None if the epub has no NCX)."""
        if self.toc_relpath is None:
            self.get_manifest_items()  # sets self.toc_relpath also.
        return self.toc_relpath

    def get_manifest_items(self):
        """Return the manifest indexed both ways.

        "h:"+href maps to (id, media-type); "i:"+id maps to
        (href, media-type).  Also loads the NCX ToC when one is listed.
        """
        if self.manifest_items is None:
            self.manifest_items = {}
            relpath = self.get_content_relpath()
            for item in self.get_content_dom().getElementsByTagName("item"):
                fullhref = normpath(unquote(relpath + item.getAttribute("href")))
                item_id = item.getAttribute("id")
                media_type = item.getAttribute("media-type")
                self.manifest_items["h:" + fullhref] = (item_id, media_type)
                self.manifest_items["i:" + item_id] = (fullhref, media_type)
                if media_type == "application/x-dtbncx+xml":
                    # TOC file is only one with this type--as far as I know.
                    self.toc_relpath = get_path_part(fullhref)
                    self.toc_dom = parseString(self.epub.read(fullhref))
        return self.manifest_items

    def get_guide_items(self):
        """Return {href: (type, title)} for the .opf guide references."""
        if self.guide_items is None:
            self.guide_items = {}
            relpath = self.get_content_relpath()
            for item in self.get_content_dom().getElementsByTagName("reference"):
                fullhref = normpath(unquote(relpath + item.getAttribute("href")))
                self.guide_items[fullhref] = (item.getAttribute("type"),
                                              item.getAttribute("title"))
        return self.guide_items

    def get_toc_dom(self):
        """Return the parsed NCX document, or None if there is none."""
        if self.toc_dom is None:
            self.get_manifest_items()  # also sets self.toc_dom
        return self.toc_dom

    # dict() of href->[(text,anchor),...],...
    # eg: "file0001.html"->[("Introduction","anchor01"),("Chapter 1","anchor02")],...
    def get_toc_map(self):
        """Return {href: [(text, anchor), ...]} built from the NCX navPoints.

        Entries without an anchor are kept ahead of anchored entries for
        the same file.  An epub without an NCX yields an empty dict.
        """
        if self.toc_map is not None:
            return self.toc_map

        self.toc_map = {}
        toc_dom = self.get_toc_dom()
        if toc_dom is None:
            return self.toc_map

        relpath = self.get_toc_relpath()
        for navpoint in toc_dom.getElementsByTagName("navPoint"):
            src = normpath(unquote(
                relpath
                + navpoint.getElementsByTagName("content")[0].getAttribute("src")))
            if '#' in src:
                # maxsplit: a stray second '#' must not break the unpack.
                (href, anchor) = src.split("#", 1)
            else:
                (href, anchor) = (src, None)

            # The first of these in each navPoint should be the appropriate
            # one.  (may be others due to nesting.)
            try:
                text = unicode(navpoint.getElementsByTagName("text")[0].firstChild.data)
            except (IndexError, AttributeError):
                # no text element, or an empty one.
                text = ""

            entries = self.toc_map.setdefault(href, [])
            if anchor is None:
                # put file links ahead of anchor links.  Otherwise a
                # non-linear anchor link may take precedence, which will
                # confuse EpubSplit.  This can put split lines out of
                # order from the TOC, but the alternative is worse.
                ## Keep order of non-anchor entries to the same file.
                idx = 0
                while idx < len(entries) and entries[idx][1] is None:
                    idx += 1
                entries.insert(idx, (text, anchor))
            else:
                entries.append((text, anchor))
        return self.toc_map

    # list of dicts with href, anchor & toc text.
    # 'split lines' are all the points that the epub can be split on.
    # Offer a split at each spine file and each ToC point.
    def get_split_lines(self):
        """Build and return the list of split lines.

        Each line is a dict with href, anchor, toc (list of titles), id,
        type, num and sample; spine lines named in the guide also get
        guide.  Also records self.origtitle and self.origauthors.
        """
        metadom = self.get_content_dom()

        ## Save indiv book title
        try:
            self.origtitle = metadom.getElementsByTagName("dc:title")[0].firstChild.data
        except (IndexError, AttributeError):
            self.origtitle = "(Title Missing)"

        ## Save authors: creators with role 'aut' or with no role at all.
        for creator in metadom.getElementsByTagName("dc:creator"):
            if creator.firstChild is None:
                continue
            if (creator.getAttribute("opf:role") == "aut"
                    or not creator.hasAttribute("opf:role")):
                try:
                    author = creator.firstChild.data
                except AttributeError:
                    continue  # first child is not a text node.
                if author not in self.origauthors:
                    self.origauthors.append(author)
        if not self.origauthors:
            self.origauthors.append("(Authors Missing)")

        self.split_lines = []  # list of dicts with href, anchor and toc
        manifest = self.get_manifest_items()
        guide = self.get_guide_items()
        toc_map = self.get_toc_map()

        # spin on spine files.
        count = 0
        for itemref in metadom.getElementsByTagName("itemref"):
            idref = itemref.getAttribute("idref")
            (href, media_type) = manifest["i:" + idref]
            filedata = self.epub.read(href).decode('utf-8')

            current = {'href': href,
                       'anchor': None,
                       'toc': [],
                       'id': idref,
                       'type': media_type,
                       'num': count,
                       'sample': _sample(filedata)}
            if href in guide:
                current['guide'] = guide[href]
            self.split_lines.append(current)
            count += 1

            # For each toc entry for this file, check to see if there's
            # an anchor, if so, make a new split line.
            for (text, anchor) in toc_map.get(href, []):
                text = "%s" % text
                if anchor:
                    # anchor, need to split first, then reduce the sample.
                    current = {'href': href,
                               'anchor': anchor,
                               'toc': [],
                               'id': idref,
                               'type': media_type,
                               'num': count,
                               'sample': _sample(
                                   splitHtml(filedata, anchor, before=False))}
                    self.split_lines.append(current)
                    count += 1
                # There can be more than one toc to the same split line.
                # This won't find multiple toc to the same anchor yet.
                current['toc'].append(text)
        return self.split_lines

    def get_split_files(self, linenums):
        """Return the content files for the split lines in linenums.

        linenums is an iterable of split line numbers (ints or numeric
        strings).  Returns a list of [filename, id, type, filedata] with
        internal links rewritten to the new file names.
        """
        self.filecache = FileCache(self.get_manifest_items())

        # set include flag in split_lines.
        if not self.split_lines:
            self.get_split_lines()
        lines = self.split_lines

        lines_set = set(int(k) for k in linenums)
        for j, line in enumerate(lines):
            line['include'] = j in lines_set

        # loop through finding 'chunks' -- contiguous pieces in the
        # same file.  Each included file is at least one chunk, but if
        # parts are left out, one original file can end up being more
        # than one chunk.
        outchunks = []  # tuples (filename,start,end); 'end' is not inclusive.
        inchunk = False
        currentfile = None
        start = None
        for line in lines:
            if line['include']:
                if not inchunk:  # start new chunk
                    inchunk = True
                    currentfile = line['href']
                    start = line
                elif currentfile != line['href']:
                    # different file, new chunk.
                    outchunks.append((currentfile, start, line))
                    currentfile = line['href']
                    start = line
            elif inchunk:  # save previous chunk.
                outchunks.append((currentfile, start, line))
                inchunk = False
        # final chunk for when last in list is include.
        if inchunk:
            outchunks.append((currentfile, start, None))

        outfiles = []  # [filename,id,type,data] -- filename changed to unique
        for (href, start, end) in outchunks:
            filedata = self.epub.read(href).decode('utf-8')

            # discard before start if anchor.
            if start['anchor'] is not None:
                filedata = splitHtml(filedata, start['anchor'], before=False)

            # discard from end anchor on(inclusive), but only if same file.
            # If different file, keep rest of file.  If no 'end', then it
            # was the last chunk and went to the end of the last file.
            if (end is not None and end['anchor'] is not None
                    and end['href'] == href):
                filedata = splitHtml(filedata, end['anchor'], before=True)

            filename = self.filecache.add_content_file(href, filedata)
            outfiles.append([filename, start['id'], start['type'], filedata])

        # Spin through to replace internal URLs
        anchors = self.filecache.anchors
        for fl in outfiles:
            relpath = get_path_part(fl[0])
            soup = BeautifulSoup(fl[3], 'html5lib')
            changed = False
            for a in soup.findAll('a'):
                if not a.has_attr('href'):
                    continue
                path = normpath(unquote("%s%s" % (relpath, a['href'])))
                if path in anchors and anchors[path] != path:
                    a['href'] = anchors[path][len(relpath):]
                    changed = True
            if changed:
                fl[3] = unicode(soup)

        return outfiles

    def write_split_epub(self,
                         outputio,
                         linenums,
                         changedtocs=None,
                         authoropts=None,
                         titleopt=None,
                         descopt=None,
                         tags=None,
                         languages=None,
                         coverjpgpath=None):
        """Write a new epub containing the split lines in linenums.

        outputio    -- path or file-like object for the new epub.
        linenums    -- split line numbers to include.
        changedtocs -- {line num: [titles]} overriding ToC entries.
        authoropts  -- authors to record; default the original authors.
        titleopt    -- title; default the original title plus " Split".
        descopt     -- description; default names the original book.
        tags        -- dc:subject values.
        languages   -- dc:language values; default ['en'].
        coverjpgpath -- path of a jpg to add as cover, if any.
        """
        changedtocs = {} if changedtocs is None else changedtocs
        tags = [] if tags is None else tags

        files = self.get_split_files(linenums)

        ## Write mimetype file, must be first and uncompressed.
        ## Overwrite if existing output file.
        outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
        try:
            outputepub.debug = 3
            outputepub.writestr("mimetype", "application/epub+zip")
        finally:
            outputepub.close()

        if titleopt is None:
            titleopt = self.origtitle + " Split"
        uniqueid = "epubsplit-uid-%d" % time()  # real sophisticated uid scheme.

        ## Re-open file for content.
        outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
        try:
            outputepub.debug = 3
            self._write_container(outputepub)
            self._write_content(outputepub, files, uniqueid, titleopt,
                                authoropts, descopt, tags, languages,
                                coverjpgpath)
            self._write_toc(outputepub, uniqueid, titleopt, changedtocs)

            if coverjpgpath:
                # write, not write string.  Pulling from file.
                outputepub.write(coverjpgpath, "cover.jpg")
                outputepub.writestr("cover.xhtml", COVER_XHTML)

            # declares all the files created by Windows.  otherwise, when
            # it runs in appengine, windows unzips the files as 000 perms.
            for zf in outputepub.filelist:
                zf.create_system = 0
        finally:
            outputepub.close()

    def _write_container(self, outputepub):
        """Write META-INF/container.xml, which only points to content.opf."""
        containerdom = getDOMImplementation().createDocument(None, "container", None)
        containertop = containerdom.documentElement
        containertop.setAttribute("version", "1.0")
        containertop.setAttribute("xmlns", "urn:oasis:names:tc:opendocument:xmlns:container")
        rootfiles = containerdom.createElement("rootfiles")
        containertop.appendChild(rootfiles)
        rootfiles.appendChild(newTag(containerdom, "rootfile",
                                     {"full-path": "content.opf",
                                      "media-type": "application/oebps-package+xml"}))
        outputepub.writestr("META-INF/container.xml",
                            containerdom.toprettyxml(indent=' ', encoding='utf-8'))

    def _write_content(self, outputepub, files, uniqueid, titleopt,
                       authoropts, descopt, tags, languages, coverjpgpath):
        """Write the content and linked files, then content.opf."""
        contentdom = getDOMImplementation().createDocument(None, "package", None)
        package = contentdom.documentElement
        package.setAttribute("version", "2.0")
        package.setAttribute("xmlns", "http://www.idpf.org/2007/opf")
        package.setAttribute("unique-identifier", "epubsplit-id")

        metadata = newTag(contentdom, "metadata",
                          attrs={"xmlns:dc": "http://purl.org/dc/elements/1.1/",
                                 "xmlns:opf": "http://www.idpf.org/2007/opf"})
        metadata.appendChild(newTag(contentdom, "dc:identifier",
                                    text=uniqueid, attrs={"id": "epubsplit-id"}))
        metadata.appendChild(newTag(contentdom, "dc:title", text=titleopt))

        usedauthors = set()
        for author in (authoropts if authoropts else self.origauthors):
            if author not in usedauthors:
                usedauthors.add(author)
                metadata.appendChild(newTag(contentdom, "dc:creator",
                                            attrs={"opf:role": "aut"},
                                            text=author))

        metadata.appendChild(newTag(contentdom, "dc:contributor",
                                    text="epubsplit", attrs={"opf:role": "bkp"}))
        metadata.appendChild(newTag(contentdom, "dc:rights",
                                    text="Copyrights as per source stories"))

        for lang in (languages if languages else ['en']):
            metadata.appendChild(newTag(contentdom, "dc:language", text=lang))

        if not descopt:
            descopt = "Split from %s by %s." % (self.origtitle,
                                                ", ".join(self.origauthors))
        metadata.appendChild(newTag(contentdom, "dc:description", text=descopt))

        for tag in tags:
            metadata.appendChild(newTag(contentdom, "dc:subject", text=tag))

        package.appendChild(metadata)

        manifest = contentdom.createElement("manifest")
        package.appendChild(manifest)
        spine = newTag(contentdom, "spine", attrs={"toc": "ncx"})
        package.appendChild(spine)

        manifest.appendChild(newTag(contentdom, "item",
                                    attrs={'id': 'ncx',
                                           'href': 'toc.ncx',
                                           'media-type': 'application/x-dtbncx+xml'}))

        if coverjpgpath:
            metadata.appendChild(newTag(contentdom, "meta",
                                        {"name": "cover",
                                         "content": "coverimageid"}))
            guide = newTag(contentdom, "guide")
            guide.appendChild(newTag(contentdom, "reference",
                                     attrs={"type": "cover",
                                            "title": "Cover",
                                            "href": "cover.xhtml"}))
            package.appendChild(guide)

            manifest.appendChild(newTag(contentdom, "item",
                                        attrs={'id': "coverimageid",
                                               'href': "cover.jpg",
                                               'media-type': "image/jpeg"}))

            # Note that the id of the cover xhmtl *must* be 'cover'
            # for it to work on Nook.
            manifest.appendChild(newTag(contentdom, "item",
                                        attrs={'id': "cover",
                                               'href': "cover.xhtml",
                                               'media-type': "application/xhtml+xml"}))
            spine.appendChild(newTag(contentdom, "itemref",
                                     attrs={"idref": "cover",
                                            "linear": "yes"}))

        contentcount = 0
        for (filename, _origid, media_type, filedata) in files:
            if coverjpgpath and filename == "cover.xhtml":
                continue  # don't dup cover.

            outputepub.writestr(filename, filedata.encode('utf-8'))
            item_id = "a%d" % contentcount
            contentcount += 1
            manifest.appendChild(newTag(contentdom, "item",
                                        attrs={'id': item_id,
                                               'href': filename,
                                               'media-type': media_type}))
            spine.appendChild(newTag(contentdom, "itemref",
                                     attrs={"idref": item_id,
                                            "linear": "yes"}))

        fontdecrypter = FontDecrypter(self.epub, self.get_content_dom())
        # sorted: linkedfiles is a set; keep the output deterministic.
        for (linked, media_type) in sorted(self.filecache.linkedfiles):
            if coverjpgpath and linked == "cover.jpg":
                continue  # don't dup cover.

            try:
                linkeddata = self.get_file(linked)
                if linked in fontdecrypter.get_encrypted_fontfiles():
                    print("Decrypting font file: %s" % linked)
                    linkeddata = fontdecrypter.get_decrypted_font_data(linked)
                outputepub.writestr(linked, linkeddata)
            except Exception as e:
                print("Skipping linked file (%s)\nException: %s" % (linked, e))
                # Not written, so it must not be listed in the manifest.
                continue

            item_id = "a%d" % contentcount
            contentcount += 1
            manifest.appendChild(newTag(contentdom, "item",
                                        attrs={'id': item_id,
                                               'href': linked,
                                               'media-type': media_type}))

        contentxml = contentdom.toprettyxml(indent=' ')
        # tweak for brain damaged Nook STR.  Nook insists on name before
        # content, but minidom may emit the attributes sorted by name.
        contentxml = contentxml.replace(
            '<meta content="coverimageid" name="cover"/>',
            '<meta name="cover" content="coverimageid"/>')
        outputepub.writestr("content.opf", contentxml)

    def _write_toc(self, outputepub, uniqueid, titleopt, changedtocs):
        """Write toc.ncx with one navPoint per ToC title of included lines."""
        tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
        ncx = tocncxdom.documentElement
        ncx.setAttribute("version", "2005-1")
        ncx.setAttribute("xmlns", "http://www.daisy.org/z3986/2005/ncx/")
        head = tocncxdom.createElement("head")
        ncx.appendChild(head)
        for (name, content) in (("dtb:uid", uniqueid),
                                ("dtb:depth", "1"),
                                ("dtb:totalPageCount", "0"),
                                ("dtb:maxPageNumber", "0")):
            head.appendChild(newTag(tocncxdom, "meta",
                                    attrs={"name": name, "content": content}))

        docTitle = tocncxdom.createElement("docTitle")
        docTitle.appendChild(newTag(tocncxdom, "text", text=stripHTML(titleopt)))
        ncx.appendChild(docTitle)

        tocnavMap = tocncxdom.createElement("navMap")
        ncx.appendChild(tocnavMap)

        # come back to lines again for TOC because files only has files.
        anchors = self.filecache.anchors
        count = 1
        for line in self.split_lines:
            if not line['include']:
                continue
            # if changed, use only changed values.
            if line['num'] in changedtocs:
                line['toc'] = changedtocs[line['num']]

            # Point at the renamed file when the anchor was moved.
            src = line['href']
            if line['anchor']:
                src = anchors.get(line['href'] + "#" + line['anchor'], src)

            # can have more than one toc entry.
            for title in line['toc']:
                newnav = newTag(tocncxdom, "navPoint",
                                {"id": "a%03d" % count, "playOrder": "%d" % count})
                count += 1
                tocnavMap.appendChild(newnav)
                navlabel = newTag(tocncxdom, "navLabel")
                newnav.appendChild(navlabel)
                navlabel.appendChild(newTag(tocncxdom, "text", text=stripHTML(title)))
                newnav.appendChild(newTag(tocncxdom, "content", {"src": src}))

        outputepub.writestr("toc.ncx",
                            tocncxdom.toprettyxml(indent=' ', encoding='utf-8'))


class FileCache:
    """Track renamed content files, their anchors and the files they link.

    manifest_items is the dict returned by SplitEpub.get_manifest_items().
    """

    def __init__(self, manifest_items=None):
        self.manifest_items = {} if manifest_items is None else manifest_items
        self.oldnew = {}    # original href -> list of new file names
        self.newold = {}    # new file name -> original href
        self.anchors = {}   # original href#id -> new file name#id
        self.linkedfiles = set()  # (href, media-type)

        ## always include font files for embedded fonts
        for key, value in six.iteritems(self.manifest_items):
            if key.startswith('i:') and value[1] in FONT_MEDIA_TYPES:
                self.add_linked_file(value[0])

    def add_linked_file(self, href):
        """Record href (with its manifest media-type) as a linked file."""
        href = normpath(unquote(href))  # fix %20 & /../
        if ("h:" + href) in self.manifest_items:
            media_type = self.manifest_items["h:" + href][1]
        else:
            media_type = 'unknown'
        self.linkedfiles.add((href, media_type))

    def add_content_file(self, href, filedata):
        """Register one chunk of href and return the file name to write it as.

        The first chunk keeps the original name; later chunks of the same
        file are named <dir>/<n>-<file>.  Also records every id found in
        filedata and every image and stylesheet it references.
        """
        if href not in self.oldnew:
            self.oldnew[href] = []
            newfile = href
        else:
            newfile = "%s%d-%s" % (get_path_part(href),
                                   len(self.oldnew[href]),
                                   get_file_part(href))

        self.oldnew[href].append(newfile)
        self.newold[newfile] = href

        relpath = get_path_part(href)
        soup = BeautifulSoup(filedata, 'html5lib')

        for tag in soup.findAll():  # not just 'a', any tag.
            if tag.has_attr('id'):
                self.anchors[href + '#' + tag['id']] = newfile + '#' + tag['id']

        # 'image' elements with xlink:href are from baen epubs.
        for img in soup.findAll('img') + soup.findAll('image'):
            src = None
            if img.has_attr('src'):
                src = img['src']
            if img.has_attr('xlink:href'):
                src = img['xlink:href']
            if src:
                self.add_linked_file(relpath + src)
            else:
                logger.info("img tag without src in file:(%s) tag:(%s)" % (href, img))

        # link href="0.css" type="text/css"
        for style in soup.findAll('link', {'type': 'text/css'}):
            if style.has_attr('href'):
                self.add_linked_file(relpath + style['href'])

        return newfile


def _sample(text):
    """Return text cut to SAMPLE_LENGTH characters, marked with '...'."""
    if len(text) > SAMPLE_LENGTH:
        return text[:SAMPLE_LENGTH] + "..."
    return text


def splitHtml(data, tagid, before=False):
    """Return data cut at the element whose id is tagid.

    before=True keeps what precedes the element (the element itself is
    removed); before=False keeps the element and what follows it.  Data
    is returned unchanged when no element has that id.
    """
    soup = BeautifulSoup(data, 'lxml')
    splitpoint = soup.find(id=tagid)
    if splitpoint is None:
        return data

    if before:
        siblings_of = lambda node: node.findNextSiblings()
    else:
        siblings_of = lambda node: node.findPreviousSiblings()

    # remove the siblings on the discarded side of the split point, and
    # of each of its ancestors up to (not including) body.
    node = splitpoint
    while node and node.name != 'body':
        for n in siblings_of(node):
            n.extract()
        node = node.parent

    if before:
        splitpoint.extract()

    return re.sub(r'( *\r?\n)+', '\r\n', unicode(soup))


def get_path_part(n):
    """Return the directory of n with a trailing '/', or '' if none."""
    relpath = os.path.dirname(n)
    if relpath:
        relpath = relpath + "/"
    return relpath


def get_file_part(n):
    """Return the file name part of n."""
    return os.path.basename(n)


## Utility method for creating new tags.
def newTag(dom, name, attrs=None, text=None):
    """Create element name in dom with optional attributes and text."""
    tag = dom.createElement(name)
    if attrs is not None:
        for attr in attrs:
            tag.setAttribute(attr, attrs[attr])
    if text is not None:
        tag.appendChild(dom.createTextNode(text))
    return tag


def _build_option_parser(usage):
    """Return the OptionParser for the command line interface."""
    from optparse import OptionParser

    parser = OptionParser(usage + ''' [options] <input epub> [line numbers...]

Giving an epub without line numbers will return a list of line numbers: the
possible split points in the input file. Calling with line numbers will
generate an epub with each of the "lines" given included.''')
    parser.add_option("-o", "--output", dest="outputopt", default="split.epub",
                      help="Set OUTPUT file, Default: split.epub", metavar="OUTPUT")
    parser.add_option("--output-dir", dest="outputdiropt",
                      help="Set OUTPUT directory, Default: present working directory")
    parser.add_option('--split-by-section',
                      action='store_true', dest='split_by_section',
                      help='Create a new epub from each of the listed line sections instead of one containing all.  Splits all sections if no lines numbers are given.  Each split will be named <number>-<output name> and placed in the output-dir.  Sections without a Table of Contents entry will be included with the preceding section(s)')
    parser.add_option("-t", "--title", dest="titleopt", default=None,
                      help="Use TITLE as the metadata title.  Default: '<original epub title> Split' or ToC entry with --split-by-section", metavar="TITLE")
    parser.add_option("-d", "--description", dest="descopt", default=None,
                      help="Use DESC as the metadata description.  Default: 'Split from <epub title> by <author(s)>'.", metavar="DESC")
    parser.add_option("-a", "--author",
                      action="append", dest="authoropts", default=[],
                      help="Use AUTHOR as a metadata author, multiple authors may be given, Default: <All authors from original epub>", metavar="AUTHOR")
    parser.add_option("-g", "--tag",
                      action="append", dest="tagopts", default=[],
                      help="Include TAG as dc:subject tag, multiple tags may be given, Default: None", metavar="TAG")
    parser.add_option("-l", "--language",
                      action="append", dest="languageopts", default=[],
                      help="Include LANG as dc:language tag, multiple languages may be given, Default: en", metavar="LANG")
    parser.add_option("-c", "--cover", dest="coveropt", default=None,
                      help="Path to a jpg to use as cover image.", metavar="COVER")
    return parser


def _group_sections(lines, section_lines, default_title):
    """Group line numbers into (line numbers, title) sections.

    A line with ToC entries starts a new section titled by its first
    entry; a line without any joins the section before it.  The first
    line always starts a section, titled default_title if it has no ToC.
    """
    splitslist = []
    sectionlist = []
    for lineno in section_lines:
        toclist = lines[int(lineno)]['toc']
        if sectionlist and not toclist:
            # same list object as already stored in splitslist.
            sectionlist.append(lineno)
        else:
            ## take title from (first) ToC if available, else titleopt
            ## (_ Split internally if None)
            title = (toclist[0] if toclist else default_title)
            print("title: %s" % title)
            sectionlist = [lineno]
            splitslist.append((sectionlist, title))
    return splitslist


def main(argv, usage=None):
    """Command line entry point.

    argv is the argument list without the program name; usage optionally
    replaces the leading usage text.
    """
    if not usage:
        usage = 'usage: python %prog'
    parser = _build_option_parser(usage)
    (options, args) = parser.parse_args(argv)

    ## Add .epub if not already there.
    if not options.outputopt.lower().endswith(".epub"):
        options.outputopt = options.outputopt + ".epub"

    if not options.languageopts:
        options.languageopts = ['en']

    if not args:
        parser.print_help()
        return

    def write(outputfile, linenums, title):
        if options.outputdiropt:
            outputfile = os.path.join(options.outputdiropt, outputfile)
        print("output file: " + outputfile)
        epubO.write_split_epub(outputfile,
                               linenums,
                               authoropts=options.authoropts,
                               titleopt=title,
                               descopt=options.descopt,
                               tags=options.tagopts,
                               languages=options.languageopts,
                               coverjpgpath=options.coveropt)

    epubO = SplitEpub(args[0])
    try:
        lines = epubO.get_split_lines()

        if options.split_by_section:
            if len(args) > 1:
                section_lines = args[1:]
            else:
                section_lines = range(len(lines))

            splitslist = _group_sections(lines, section_lines, options.titleopt)
            for filecount, (sectionlist, title) in enumerate(splitslist, 1):
                write("%0.4d-%s" % (filecount, options.outputopt),
                      sectionlist, title)
        elif len(args) == 1:
            showlist = ['toc', 'guide', 'anchor', 'id', 'href']
            for count, line in enumerate(lines):
                print("\nLine Number: %d" % count)
                for s in showlist:
                    if s in line and line[s]:
                        print("\t%s: %s" % (s, line[s]))
        else:
            write(options.outputopt, args[1:], options.titleopt)
    finally:
        epubO.close()


if __name__ == "__main__":
    main(sys.argv[1:])