book_summarizer / epubsplit.py
Awell00's picture
feat!: add all files
feb2065 verified
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2021, Jim Miller'
__docformat__ = 'restructuredtext en'
import sys, re, os, traceback, copy
from posixpath import normpath
import logging
logger = logging.getLogger(__name__)
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
from xml.dom.minidom import parse, parseString, getDOMImplementation, Element
from time import time
import six
from six.moves.urllib.parse import unquote
from six import string_types, text_type as unicode
from six import unichr
from bs4 import BeautifulSoup
## font decoding code lifted from
## calibre/src/calibre/ebooks/conversion/plugins/epub_input.py
## copyright '2009, Kovid Goyal <[email protected]>'
## don't bug Kovid about this use of it.
ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'
from itertools import cycle


class FontDecrypter:
    """Undo Adobe / IDPF font obfuscation for fonts stored in an EPUB.

    ``epub`` only needs a ``read(name)`` method that raises ``KeyError``
    for a missing member (``zipfile.ZipFile`` behaves that way).
    ``content_dom`` is the parsed OPF document; the book's uuid identifier
    is taken from it and both obfuscation keys are derived from that uuid.
    """

    # Namespace of the XML-Encryption elements used inside encryption.xml.
    XMLENC_NS = 'http://www.w3.org/2001/04/xmlenc#'

    def __init__(self, epub, content_dom):
        self.epub = epub
        self.content_dom = content_dom
        # None means "encryption.xml not examined yet".  An empty dict is
        # a valid cached answer (no obfuscated fonts), so it must not be
        # used as the "not loaded" marker.
        self.encryption = None
        self.old_uuid = None

    def get_file(self, href):
        """Return the raw bytes of archive member ``href``."""
        return self.epub.read(href)

    def _enc_elements(self, node, localname):
        """Return the XML-Encryption ``localname`` elements below ``node``.

        The namespace-aware lookup matches whatever prefix (or default
        namespace) the file uses; the literal ``enc:`` tag name is kept
        as a fallback for files the namespace lookup does not match.
        """
        found = node.getElementsByTagNameNS(self.XMLENC_NS, localname)
        if not found:
            found = node.getElementsByTagName('enc:' + localname)
        return found

    def get_encrypted_fontfiles(self):
        """Return a dict of ``{member URI: obfuscation algorithm URI}``.

        Built once from META-INF/encryption.xml and cached.  A book
        without that file yields an empty dict.  Entries using an
        algorithm other than the Adobe or IDPF obfuscation are reported
        on stdout and left out.
        """
        if self.encryption is None:
            found = {}
            # <encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
            #             xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
            #             xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
            #   <enc:EncryptedData>
            #     <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
            #     <enc:CipherData>
            #       <enc:CipherReference URI="fonts/00017.ttf"/>
            #     </enc:CipherData>
            #   </enc:EncryptedData>
            # </encryption>
            try:
                raw = self.epub.read("META-INF/encryption.xml")
            except KeyError:
                # No encryption.xml in the archive: nothing is obfuscated.
                raw = None
            if raw is not None:
                encryptiondom = parseString(raw)
                for encdata in self._enc_elements(encryptiondom, 'EncryptedData'):
                    methods = self._enc_elements(encdata, 'EncryptionMethod')
                    algorithm = methods[0].getAttribute('Algorithm') if methods else ''
                    if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
                        print("Unknown font encryption: %s"%algorithm)
                        continue
                    for encref in self._enc_elements(encdata, 'CipherReference'):
                        found[encref.getAttribute('URI')] = algorithm
            # Only cache after a complete, successful scan.
            self.encryption = found
        return self.encryption

    def get_old_uuid(self):
        """Return the text of the OPF unique identifier, or None.

        Only a ``dc:identifier`` whose ``id`` matches the package's
        ``unique-identifier`` and whose ``opf:scheme`` is ``uuid`` is
        accepted.
        """
        if not self.old_uuid:
            contentdom = self.content_dom
            uidkey = contentdom.getElementsByTagName("package")[0].getAttribute("unique-identifier")
            for dcid in contentdom.getElementsByTagName("dc:identifier"):
                if (dcid.getAttribute("id") == uidkey
                        and dcid.getAttribute("opf:scheme") == "uuid"
                        and dcid.firstChild is not None):
                    self.old_uuid = dcid.firstChild.data
        return self.old_uuid

    def _require_uuid(self):
        """Return the book uuid or raise ValueError when there is none."""
        value = self.get_old_uuid()
        if not value:
            raise ValueError("No uuid unique-identifier found in the OPF; "
                             "cannot derive a font obfuscation key")
        return value

    def get_idpf_key(self):
        """Return the IDPF key: SHA-1 digest of the identifier without whitespace."""
        # idpf key:urn:uuid:221c69fe-29f3-4cb4-bb3f-58c430261cc6
        # idpf key:b'\xfb\xa9\x03N}\xae~\x12 \xaa\xe0\xc11\xe2\xe7\x1b\xf6\xa5\xcas'
        import hashlib
        idpf_key = self._require_uuid()
        # \x escapes (space, tab, CR, LF) mean the same thing in byte and
        # text string literals; \u escapes do not.
        idpf_key = re.sub('[\x20\x09\x0d\x0a]', '', idpf_key)
        return hashlib.sha1(idpf_key.encode('utf-8')).digest()

    def get_adobe_key(self):
        """Return the Adobe key: the 16 raw bytes of the book uuid."""
        # adobe key:221c69fe-29f3-4cb4-bb3f-58c430261cc6
        # adobe key:b'"\x1ci\xfe)\xf3L\xb4\xbb?X\xc40&\x1c\xc6'
        import uuid
        adobe_key = self._require_uuid().rpartition(':')[-1] # skip urn:uuid:
        return uuid.UUID(adobe_key).bytes

    def get_decrypted_font_data(self, uri):
        """Return the bytes of ``uri``, de-obfuscated when encryption.xml lists it."""
        font_data = self.get_file(uri)
        algorithm = self.get_encrypted_fontfiles().get(uri)
        if algorithm is not None:
            if algorithm == ADOBE_OBFUSCATION:
                key = self.get_adobe_key()
            else:
                key = self.get_idpf_key()
            font_data = self.decrypt_font_data(key, font_data, algorithm)
        return font_data

    def decrypt_font_data(self, key, data, algorithm):
        """XOR the obfuscated head of ``data`` with the repeating ``key``.

        The first 1024 bytes are processed for the Adobe algorithm and
        the first 1040 bytes for any other algorithm; the remainder of
        the file is returned unchanged.
        """
        is_adobe = algorithm == ADOBE_OBFUSCATION
        crypt_len = 1024 if is_adobe else 1040
        crypt = bytearray(data[:crypt_len])
        key = cycle(iter(bytearray(key)))
        decrypt = bytes(bytearray(x^next(key) for x in crypt))
        return decrypt + data[crypt_len:]
def _unirepl(match):
    """Return the replacement text for one numeric character reference.

    ``match`` comes from the pattern in ``_replaceNumberEntities``:
    group 1 holds the digits (prefixed with ``x`` when hexadecimal) and
    group 2 any trailing characters captured before the ``;``, which are
    passed through unchanged after the decoded character.

    A reference that cannot be decoded (no digits, or a code point that
    ``unichr`` rejects) is reported on stdout and replaced by ``""``.
    """
    digits = match.group(1)
    if digits.startswith('x'):
        radix = 16
        digits = digits[1:]
    else:
        radix = 10
    try:
        return "%s%s" % (unichr(int(digits, radix)), match.group(2))
    except (ValueError, OverflowError):
        # This way, at least if there's more of entities out there
        # that fail, it doesn't blow the entire download.
        # Group 1 already carries the 'x' of hex references, so the
        # message echoes the reference exactly as it was matched.
        print("Numeric entity translation failed, skipping: &#%s%s;"%(match.group(1),match.group(2)))
        return ""
def _replaceNumberEntities(data):
    """Replace numeric character references in ``data`` via ``_unirepl``."""
    # SGMLParser's brokenish entity parsing inserts ';' after things that
    # are not entities, and it does the same after number entities,
    # swallowing part of the next word when it starts with a-z:
    #   "Don't&#8212ever&#8212do&#8212that&#8212again," becomes
    #   "Don't&#8212e;ver&#8212d;o&#8212;that&#8212a;gain,"
    # Group 1 takes up to 4 hex digits (after 'x') or up to 5 decimal
    # digits, e.g. &#27861; or &#xE9;.  Group 2 picks up whatever hex-like
    # characters follow before the ';' so they can be kept as text.
    return re.sub(r'&#(x[0-9a-fA-F]{,4}|[0-9]{,5})([0-9a-fA-F]*?);',
                  _unirepl,
                  data)
def _replaceNotEntities(data):
    """Drop the ';' from every ``&name;`` sequence in ``data``.

    The name pattern is the entityref expression from sgmllib's
    SGMLParser (not just \\w or \\S): a letter followed by letters,
    digits, '-' or '.'.
    """
    return re.sub(r'&([a-zA-Z][-.a-zA-Z0-9]*);', r'&\1', data)
def stripHTML(soup):
    """Return the text of ``soup`` with tags removed and entities decoded.

    ``soup`` is rendered with ``"%s"`` first, so any object with a string
    form is accepted.  Surrounding whitespace is stripped from the result.
    """
    markup = "%s" % soup
    without_tags = re.sub(r'<[^>]+>', '', markup)
    return removeAllEntities(without_tags).strip()
def conditionalRemoveEntities(value):
    """Run ``removeEntities`` on strings; return anything else untouched.

    String results also have surrounding whitespace stripped.
    """
    if not isinstance(value, string_types):
        return value
    return removeEntities(value).strip()
def removeAllEntities(text):
    """Like ``removeEntities``, but also decode ``&lt;``, ``&gt;`` and ``&amp;``.

    ``removeEntities`` leaves those three escaped; here they are turned
    into the literal characters, in that order.
    """
    cleaned = removeEntities(text)
    for escaped, literal in (('&lt;', '<'), ('&gt;', '>'), ('&amp;', '&')):
        cleaned = cleaned.replace(escaped, literal)
    return cleaned
def removeEntities(text):
    """Decode character entities in ``text``, keeping only ``&amp;``,
    ``&lt;`` and ``&gt;`` escaped.

    ``None`` gives ``""``; a non-string value gives ``str(value)``.
    """
    if text is None:
        return ""
    if not isinstance(text, string_types):
        return str(text)

    try:
        converted = unicode(text) #.decode('utf-8')
    except UnicodeEncodeError:
        try:
            converted = text.encode ('ascii', 'xmlcharrefreplace')
        except UnicodeEncodeError:
            converted = text
    text = converted

    # Numeric forms of [&<>] become the named forms first, so the
    # numeric pass below cannot turn them into bare characters.
    for pattern, named in ((r'&#0*38;', '&amp;'),
                           (r'&#0*60;', '&lt;'),
                           (r'&#0*62;', '&gt;')):
        text = re.sub(pattern, named, text)

    # Remaining &#000; entities become their character, such as &#039; -> '
    text = _replaceNumberEntities(text)

    # Named entities from the module-level ``entities`` table become
    # their character, such as &mdash; -> the dash itself.  Descending
    # order puts each name with ';' ahead of the same name without it.
    for name in sorted(entities.keys(), reverse=True):
        char = entities[name]
        try:
            text = text.replace(name, char)
        except UnicodeDecodeError:
            # for the pound symbol in the entity table
            text = text.replace(name, char.decode('utf-8'))

    # SGMLParser (and so BeautifulStoneSoup) inserts ';' after anything
    # it thinks might be an entity: AT&T becomes AT&T;  By this point no
    # real entities should be left, so stripping the ';' from whatever
    # still looks like one should be safe.
    text = _replaceNotEntities(text)

    # &lt; &gt; and &amp; are the only html entities allowed in xhtml,
    # put those back.
    text = text.replace('&', '&amp;')
    text = text.replace('&amp;lt', '&lt;')
    return text.replace('&amp;gt', '&gt;')
# entity list from http://code.google.com/p/doctype/wiki/CharacterEntitiesConsistent
entities = { '&aacute;' : 'á',
'&Aacute;' : 'Á',
'&Aacute' : 'Á',
'&aacute' : 'á',
'&acirc;' : 'â',
'&Acirc;' : 'Â',
'&Acirc' : 'Â',
'&acirc' : 'â',
'&acute;' : '´',
'&acute' : '´',
'&AElig;' : 'Æ',
'&aelig;' : 'æ',
'&AElig' : 'Æ',
'&aelig' : 'æ',
'&agrave;' : 'à',
'&Agrave;' : 'À',
'&Agrave' : 'À',
'&agrave' : 'à',
'&alefsym;' : 'ℵ',
'&alpha;' : 'α',
'&Alpha;' : 'Α',
'&amp;' : '&',
'&AMP;' : '&',
'&AMP' : '&',
'&amp' : '&',
'&and;' : '∧',
'&ang;' : '∠',
'&aring;' : 'å',
'&Aring;' : 'Å',
'&Aring' : 'Å',
'&aring' : 'å',
'&asymp;' : '≈',
'&atilde;' : 'ã',
'&Atilde;' : 'Ã',
'&Atilde' : 'Ã',
'&atilde' : 'ã',
'&auml;' : 'ä',
'&Auml;' : 'Ä',
'&Auml' : 'Ä',
'&auml' : 'ä',
'&bdquo;' : '„',
'&beta;' : 'β',
'&Beta;' : 'Β',
'&brvbar;' : '¦',
'&brvbar' : '¦',
'&bull;' : '•',
'&cap;' : '∩',
'&ccedil;' : 'ç',
'&Ccedil;' : 'Ç',
'&Ccedil' : 'Ç',
'&ccedil' : 'ç',
'&cedil;' : '¸',
'&cedil' : '¸',
'&cent;' : '¢',
'&cent' : '¢',
'&chi;' : 'χ',
'&Chi;' : 'Χ',
'&circ;' : 'ˆ',
'&clubs;' : '♣',
'&cong;' : '≅',
'&copy;' : '©',
'&COPY;' : '©',
'&COPY' : '©',
'&copy' : '©',
'&crarr;' : '↵',
'&cup;' : '∪',
'&curren;' : '¤',
'&curren' : '¤',
'&dagger;' : '†',
'&Dagger;' : '‡',
'&darr;' : '↓',
'&dArr;' : '⇓',
'&deg;' : '°',
'&deg' : '°',
'&delta;' : 'δ',
'&Delta;' : 'Δ',
'&diams;' : '♦',
'&divide;' : '÷',
'&divide' : '÷',
'&eacute;' : 'é',
'&Eacute;' : 'É',
'&Eacute' : 'É',
'&eacute' : 'é',
'&ecirc;' : 'ê',
'&Ecirc;' : 'Ê',
'&Ecirc' : 'Ê',
'&ecirc' : 'ê',
'&egrave;' : 'è',
'&Egrave;' : 'È',
'&Egrave' : 'È',
'&egrave' : 'è',
'&empty;' : '∅',
'&emsp;' : ' ',
'&ensp;' : ' ',
'&epsilon;' : 'ε',
'&Epsilon;' : 'Ε',
'&equiv;' : '≡',
'&eta;' : 'η',
'&Eta;' : 'Η',
'&eth;' : 'ð',
'&ETH;' : 'Ð',
'&ETH' : 'Ð',
'&eth' : 'ð',
'&euml;' : 'ë',
'&Euml;' : 'Ë',
'&Euml' : 'Ë',
'&euml' : 'ë',
'&euro;' : '€',
'&exist;' : '∃',
'&fnof;' : 'ƒ',
'&forall;' : '∀',
'&frac12;' : '½',
'&frac12' : '½',
'&frac14;' : '¼',
'&frac14' : '¼',
'&frac34;' : '¾',
'&frac34' : '¾',
'&frasl;' : '⁄',
'&gamma;' : 'γ',
'&Gamma;' : 'Γ',
'&ge;' : '≥',
#'&gt;' : '>',
#'&GT;' : '>',
#'&GT' : '>',
#'&gt' : '>',
'&harr;' : '↔',
'&hArr;' : '⇔',
'&hearts;' : '♥',
'&hellip;' : '…',
'&iacute;' : 'í',
'&Iacute;' : 'Í',
'&Iacute' : 'Í',
'&iacute' : 'í',
'&icirc;' : 'î',
'&Icirc;' : 'Î',
'&Icirc' : 'Î',
'&icirc' : 'î',
'&iexcl;' : '¡',
'&iexcl' : '¡',
'&igrave;' : 'ì',
'&Igrave;' : 'Ì',
'&Igrave' : 'Ì',
'&igrave' : 'ì',
'&image;' : 'ℑ',
'&infin;' : '∞',
'&int;' : '∫',
'&iota;' : 'ι',
'&Iota;' : 'Ι',
'&iquest;' : '¿',
'&iquest' : '¿',
'&isin;' : '∈',
'&iuml;' : 'ï',
'&Iuml;' : 'Ï',
'&Iuml' : 'Ï',
'&iuml' : 'ï',
'&kappa;' : 'κ',
'&Kappa;' : 'Κ',
'&lambda;' : 'λ',
'&Lambda;' : 'Λ',
'&laquo;' : '«',
'&laquo' : '«',
'&larr;' : '←',
'&lArr;' : '⇐',
'&lceil;' : '⌈',
'&ldquo;' : '“',
'&le;' : '≤',
'&lfloor;' : '⌊',
'&lowast;' : '∗',
'&loz;' : '◊',
'&lrm;' : '‎',
'&lsaquo;' : '‹',
'&lsquo;' : '‘',
#'&lt;' : '<',
#'&LT;' : '<',
#'&LT' : '<',
#'&lt' : '<',
'&macr;' : '¯',
'&macr' : '¯',
'&mdash;' : '—',
'&micro;' : 'µ',
'&micro' : 'µ',
'&middot;' : '·',
'&middot' : '·',
'&minus;' : '−',
'&mu;' : 'μ',
'&Mu;' : 'Μ',
'&nabla;' : '∇',
'&nbsp;' : ' ',
'&nbsp' : ' ',
'&ndash;' : '–',
'&ne;' : '≠',
'&ni;' : '∋',
'&not;' : '¬',
'&not' : '¬',
'&notin;' : '∉',
'&nsub;' : '⊄',
'&ntilde;' : 'ñ',
'&Ntilde;' : 'Ñ',
'&Ntilde' : 'Ñ',
'&ntilde' : 'ñ',
'&nu;' : 'ν',
'&Nu;' : 'Ν',
'&oacute;' : 'ó',
'&Oacute;' : 'Ó',
'&Oacute' : 'Ó',
'&oacute' : 'ó',
'&ocirc;' : 'ô',
'&Ocirc;' : 'Ô',
'&Ocirc' : 'Ô',
'&ocirc' : 'ô',
'&OElig;' : 'Œ',
'&oelig;' : 'œ',
'&ograve;' : 'ò',
'&Ograve;' : 'Ò',
'&Ograve' : 'Ò',
'&ograve' : 'ò',
'&oline;' : '‾',
'&omega;' : 'ω',
'&Omega;' : 'Ω',
'&omicron;' : 'ο',
'&Omicron;' : 'Ο',
'&oplus;' : '⊕',
'&or;' : '∨',
'&ordf;' : 'ª',
'&ordf' : 'ª',
'&ordm;' : 'º',
'&ordm' : 'º',
'&oslash;' : 'ø',
'&Oslash;' : 'Ø',
'&Oslash' : 'Ø',
'&oslash' : 'ø',
'&otilde;' : 'õ',
'&Otilde;' : 'Õ',
'&Otilde' : 'Õ',
'&otilde' : 'õ',
'&otimes;' : '⊗',
'&ouml;' : 'ö',
'&Ouml;' : 'Ö',
'&Ouml' : 'Ö',
'&ouml' : 'ö',
'&para;' : '¶',
'&para' : '¶',
'&part;' : '∂',
'&permil;' : '‰',
'&perp;' : '⊥',
'&phi;' : 'φ',
'&Phi;' : 'Φ',
'&pi;' : 'π',
'&Pi;' : 'Π',
'&piv;' : 'ϖ',
'&plusmn;' : '±',
'&plusmn' : '±',
'&pound;' : '£',
'&pound' : '£',
'&prime;' : '′',
'&Prime;' : '″',
'&prod;' : '∏',
'&prop;' : '∝',
'&psi;' : 'ψ',
'&Psi;' : 'Ψ',
'&quot;' : '"',
'&QUOT;' : '"',
'&QUOT' : '"',
'&quot' : '"',
'&radic;' : '√',
'&raquo;' : '»',
'&raquo' : '»',
'&rarr;' : '→',
'&rArr;' : '⇒',
'&rceil;' : '⌉',
'&rdquo;' : '”',
'&real;' : 'ℜ',
'&reg;' : '®',
'&REG;' : '®',
'&REG' : '®',
'&reg' : '®',
'&rfloor;' : '⌋',
'&rho;' : 'ρ',
'&Rho;' : 'Ρ',
'&rlm;' : '‏',
'&rsaquo;' : '›',
'&rsquo;' : '’',
'&sbquo;' : '‚',
'&scaron;' : 'š',
'&Scaron;' : 'Š',
'&sdot;' : '⋅',
'&sect;' : '§',
'&sect' : '§',
'&shy;' : '­', # strange optional hyphenation control character, not just a dash
'&shy' : '­',
'&sigma;' : 'σ',
'&Sigma;' : 'Σ',
'&sigmaf;' : 'ς',
'&sim;' : '∼',
'&spades;' : '♠',
'&sub;' : '⊂',
'&sube;' : '⊆',
'&sum;' : '∑',
'&sup1;' : '¹',
'&sup1' : '¹',
'&sup2;' : '²',
'&sup2' : '²',
'&sup3;' : '³',
'&sup3' : '³',
'&sup;' : '⊃',
'&supe;' : '⊇',
'&szlig;' : 'ß',
'&szlig' : 'ß',
'&tau;' : 'τ',
'&Tau;' : 'Τ',
'&there4;' : '∴',
'&theta;' : 'θ',
'&Theta;' : 'Θ',
'&thetasym;' : 'ϑ',
'&thinsp;' : ' ',
'&thorn;' : 'þ',
'&THORN;' : 'Þ',
'&THORN' : 'Þ',
'&thorn' : 'þ',
'&tilde;' : '˜',
'&times;' : '×',
'&times' : '×',
'&trade;' : '™',
'&uacute;' : 'ú',
'&Uacute;' : 'Ú',
'&Uacute' : 'Ú',
'&uacute' : 'ú',
'&uarr;' : '↑',
'&uArr;' : '⇑',
'&ucirc;' : 'û',
'&Ucirc;' : 'Û',
'&Ucirc' : 'Û',
'&ucirc' : 'û',
'&ugrave;' : 'ù',
'&Ugrave;' : 'Ù',
'&Ugrave' : 'Ù',
'&ugrave' : 'ù',
'&uml;' : '¨',
'&uml' : '¨',
'&upsih;' : 'ϒ',
'&upsilon;' : 'υ',
'&Upsilon;' : 'Υ',
'&uuml;' : 'ü',
'&Uuml;' : 'Ü',
'&Uuml' : 'Ü',
'&uuml' : 'ü',
'&weierp;' : '℘',
'&xi;' : 'ξ',
'&Xi;' : 'Ξ',
'&yacute;' : 'ý',
'&Yacute;' : 'Ý',
'&Yacute' : 'Ý',
'&yacute' : 'ý',
'&yen;' : '¥',
'&yen' : '¥',
'&yuml;' : 'ÿ',
'&Yuml;' : 'Ÿ',
'&yuml' : 'ÿ',
'&zeta;' : 'ζ',
'&Zeta;' : 'Ζ',
'&zwj;' : '‍', # strange spacing control character, not just a space
'&zwnj;' : '‌', # strange spacing control character, not just a space
}
class SplitEpub:
    """Read an EPUB and write a new EPUB holding a chosen subset of it.

    The book is offered as a list of 'split lines': one per spine file,
    plus one per TOC entry that points at an anchor inside a spine file.
    ``get_split_lines`` builds that list and ``write_split_epub`` writes
    the lines selected by index into a new EPUB.

    Parsed documents and lookup tables are cached on the instance.  A
    cache attribute of ``None`` means "not loaded yet"; an empty dict is
    a valid loaded value.
    """

    # Longest text sample stored per split line before truncation.
    _SAMPLE_LENGTH = 1500

    def __init__(self, inputio):
        """Open ``inputio`` (anything ``zipfile.ZipFile`` accepts) for reading."""
        self.epub = ZipFile(inputio, 'r')
        self.content_dom = None
        self.content_relpath = None
        self.manifest_items = None
        self.guide_items = None
        self.toc_dom = None
        self.toc_relpath = None
        self.toc_map = None
        self.split_lines = None
        self.origauthors = []
        self.origtitle = None
        # Set by get_split_files().
        self.filecache = None

    def get_file(self,href):
        """Return the raw bytes of archive member ``href``."""
        return self.epub.read(href)

    def get_content_dom(self):
        """Return the parsed OPF document named by META-INF/container.xml."""
        if self.content_dom is None:
            ## Find the .opf file.
            container = self.epub.read("META-INF/container.xml")
            containerdom = parseString(container)
            rootfilenodelist = containerdom.getElementsByTagName("rootfile")
            rootfilename = rootfilenodelist[0].getAttribute("full-path")
            self.content_dom = parseString(self.epub.read(rootfilename))
            self.content_relpath = get_path_part(rootfilename)
        return self.content_dom

    def get_content_relpath(self):
        """Return the path part of the .opf file; hrefs inside it are relative to it."""
        if self.content_relpath is None:
            self.get_content_dom() # sets self.content_relpath also.
        return self.content_relpath

    def get_toc_relpath(self):
        """Return the path part of the toc.ncx file; hrefs inside it are relative to it.

        Stays ``None`` when the manifest has no NCX item.
        """
        if self.toc_relpath is None:
            self.get_manifest_items() # sets self.toc_relpath also.
        return self.toc_relpath

    def get_manifest_items(self):
        """Return the manifest as a dict with two kinds of keys.

        ``"h:"+href`` maps to ``(id, media-type)`` and ``"i:"+id`` maps to
        ``(href, media-type)``.  hrefs are unquoted, normalized and
        prefixed with the OPF path.  The NCX item, when found, is parsed
        here as a side effect (``toc_dom`` / ``toc_relpath``).
        """
        if self.manifest_items is None:
            items = {}
            for item in self.get_content_dom().getElementsByTagName("item"):
                fullhref=normpath(unquote(self.get_content_relpath()+item.getAttribute("href")))
                media_type = item.getAttribute("media-type")
                items["h:"+fullhref]=(item.getAttribute("id"),media_type)
                items["i:"+item.getAttribute("id")]=(fullhref,media_type)

                if media_type == "application/x-dtbncx+xml":
                    # TOC file is only one with this type--as far as I know.
                    self.toc_relpath = get_path_part(fullhref)
                    self.toc_dom = parseString(self.epub.read(fullhref))
            self.manifest_items = items
        return self.manifest_items

    def get_guide_items(self):
        """Return ``{href: (type, title)}`` for the OPF ``<reference>`` elements."""
        if self.guide_items is None:
            items = {}
            for item in self.get_content_dom().getElementsByTagName("reference"):
                fullhref=normpath(unquote(self.get_content_relpath()+item.getAttribute("href")))
                items[fullhref]=(item.getAttribute("type"),item.getAttribute("title"))
            self.guide_items = items
        return self.guide_items

    def get_toc_dom(self):
        """Return the parsed NCX document, or ``None`` when the book has none."""
        if self.toc_dom is None:
            self.get_manifest_items() # also sets self.toc_dom
        return self.toc_dom

    # dict() of href->[(text,anchor),...],...
    # eg: "file0001.html"->[("Introduction","anchor01"),("Chapter 1","anchor02")],...
    def get_toc_map(self):
        """Return ``{href: [(text, anchor), ...]}`` built from the NCX navPoints.

        ``anchor`` is ``None`` for entries that link to the file itself.
        A book without an NCX gives an empty dict.
        """
        if self.toc_map is None:
            toc_map = {}
            toc_dom = self.get_toc_dom()
            if toc_dom is None:
                navpoints = []
            else:
                navpoints = toc_dom.getElementsByTagName("navPoint")
            for navpoint in navpoints:
                src = normpath(unquote(self.get_toc_relpath()+navpoint.getElementsByTagName("content")[0].getAttribute("src")))
                if '#' in src:
                    # Split on the first '#' only, so a fragment that
                    # itself contains '#' does not break the unpacking.
                    (href,anchor)=src.split("#",1)
                else:
                    (href,anchor)=(src,None)

                # The first of these in each navPoint should be the appropriate one.
                # (may be others due to nesting.)
                try:
                    text = unicode(navpoint.getElementsByTagName("text")[0].firstChild.data)
                except (IndexError, AttributeError):
                    # No chapter title found in the TOC for this entry.
                    text = ""

                entries = toc_map.setdefault(href, [])
                if anchor is None:
                    # put file links ahead of anchor links.  Otherwise
                    # a non-linear anchor link may take precedence,
                    # which will confuse EpubSplit.  This will cause
                    # split lines to possibly be out of order from
                    # TOC, but the alternative is worse.  Should be a
                    # rare corner case.

                    ## Keep order of non-anchor entries to the same file.
                    idx = 0
                    while idx < len(entries) and entries[idx][1] is None: # [1] is anchor
                        idx += 1
                    entries.insert(idx,(text,anchor))
                else:
                    entries.append((text,anchor))
            self.toc_map = toc_map
        return self.toc_map

    def _make_sample(self, text):
        """Return ``text`` cut to the sample length, with '...' appended when cut."""
        if len(text) > self._SAMPLE_LENGTH:
            return text[:self._SAMPLE_LENGTH] + "..."
        return text

    # list of dicts with href, anchor & toc text.
    # 'split lines' are all the points that the epub can be split on.
    # Offer a split at each spine file and each ToC point.
    def get_split_lines(self):
        """Build and return the list of split-line dicts.

        Each dict has ``href``, ``anchor`` (``None`` for a whole-file
        line), ``toc`` (list of TOC titles), ``id``, ``type``, ``num``
        and ``sample``; spine-file lines listed in the OPF guide also get
        ``guide``.  Also records ``origtitle`` and ``origauthors``.
        """
        metadom = self.get_content_dom()

        ## Save indiv book title
        try:
            self.origtitle = metadom.getElementsByTagName("dc:title")[0].firstChild.data
        except (IndexError, AttributeError):
            self.origtitle = "(Title Missing)"

        ## Save authors.
        for creator in metadom.getElementsByTagName("dc:creator"):
            is_author = (creator.getAttribute("opf:role") == "aut"
                         or not creator.hasAttribute("opf:role"))
            # An empty element has no firstChild, and a child that is
            # not a text node has no .data; both are skipped.
            name = getattr(creator.firstChild, "data", None)
            if is_author and name is not None and name not in self.origauthors:
                self.origauthors.append(name)
        if not self.origauthors:
            self.origauthors.append("(Authors Missing)")

        self.split_lines = [] # list of dicts with href, anchor and toc
        # spin on spine files.
        count=0
        for itemref in metadom.getElementsByTagName("itemref"):
            idref = itemref.getAttribute("idref")
            (href,media_type) = self.get_manifest_items()["i:"+idref]
            # Decoded once; the anchor samples below reuse it.
            html = self.epub.read(href).decode('utf-8')

            current = {'href':href, 'anchor':None, 'toc':[]}
            guide_items = self.get_guide_items()
            if href in guide_items:
                current['guide'] = guide_items[href]
            current['id'] = idref
            current['type'] = media_type
            current['num'] = count
            current['sample'] = self._make_sample(html)
            self.split_lines.append(current)
            count += 1

            # For each toc entry of this file, check to see if there's
            # an anchor, if so, make a new split line.
            for (text,anchor) in self.get_toc_map().get(href, ()):
                # XXX for outputing to screen in CLI--hopefully won't need in plugin?
                try:
                    text = "%s"%text
                except Exception:
                    text = "(error text)"

                if anchor:
                    current = {'href':href,
                               'anchor':anchor,
                               'toc':[],
                               'id':idref,
                               'type':media_type,
                               'num':count,
                               # anchor, need to split first, then reduce to sample length.
                               'sample':self._make_sample(splitHtml(html,anchor,before=False))}
                    self.split_lines.append(current)
                    count += 1
                # There can be more than one toc to the same split line.
                # This won't find multiple toc to the same anchor yet.
                current['toc'].append(text)
        return self.split_lines

    # pass in list of line numbers(?)
    def get_split_files(self,linenums):
        """Return the content files for the selected split lines.

        ``linenums`` holds indexes into the split-line list (anything
        ``int()`` accepts).  The result is a list of
        ``[filename, id, type, filedata]`` lists; ``filename`` is made
        unique by the ``FileCache`` when one source file yields several
        pieces.  Links between the returned files are rewritten to the
        new names.
        """
        self.filecache = FileCache(self.get_manifest_items())

        # set include flag in split_lines.
        if not self.split_lines:
            self.get_split_lines()
        lines = self.split_lines

        lines_set = set(int(k) for k in linenums)
        for j, line in enumerate(lines):
            line['include'] = j in lines_set

        # loop through finding 'chunks' -- contiguous pieces in the
        # same file.  Each included file is at least one chunk, but if
        # parts are left out, one original file can end up being more
        # than one chunk.
        outchunks = [] # list of tuples=(filename,start,end) 'end' is not inclusive.
        inchunk = False
        currentfile = None
        start = None
        for line in lines:
            if line['include']:
                if not inchunk: # start new chunk
                    inchunk = True
                    currentfile = line['href']
                    start = line
                elif currentfile != line['href']:
                    # different file, new chunk.
                    outchunks.append((currentfile,start,line))
                    currentfile = line['href']
                    start = line
            elif inchunk: # not include: save previous chunk.
                outchunks.append((currentfile,start,line))
                inchunk = False

        # final chunk for when last in list is include.
        if inchunk:
            outchunks.append((currentfile,start,None))

        outfiles=[]  # lists, [filename,id,type,data] -- filename changed to unique
        for (href,start,end) in outchunks:
            filedata = self.epub.read(href).decode('utf-8')

            # discard before start if anchor.
            if start['anchor'] is not None:
                filedata = splitHtml(filedata,start['anchor'],before=False)

            # discard from end anchor on(inclusive), but only if same file.  If
            # different file, keep rest of file.  If no 'end', then it was the
            # last chunk and went to the end of the last file.
            if end is not None and end['anchor'] is not None and end['href']==href:
                filedata = splitHtml(filedata,end['anchor'],before=True)

            filename = self.filecache.add_content_file(href,filedata)
            outfiles.append([filename,start['id'],start['type'],filedata])

        # Spin through to replace internal URLs
        anchors = self.filecache.anchors
        for fl in outfiles:
            relpath = get_path_part(fl[0])
            soup = BeautifulSoup(fl[3],'html5lib')
            changed = False
            for a in soup.findAll('a'):
                if a.has_attr('href'):
                    path = normpath(unquote("%s%s"%(relpath,a['href'])))
                    if path in anchors and anchors[path] != path:
                        # Make the new target relative to this file again.
                        a['href'] = anchors[path][len(relpath):]
                        changed = True
            if changed:
                fl[3] = unicode(soup)

        return outfiles

    def write_split_epub(self,
                         outputio,
                         linenums,
                         changedtocs=None,
                         authoropts=None,
                         titleopt=None,
                         descopt=None,
                         tags=None,
                         languages=None,
                         coverjpgpath=None):
        """Write the split lines selected by ``linenums`` as a new EPUB.

        outputio     -- path or file object handed to ``zipfile.ZipFile``;
                        overwritten if it already exists.
        linenums     -- indexes of the split lines to include.
        changedtocs  -- optional ``{line num: [titles]}`` replacing the
                        TOC titles of those lines.
        authoropts   -- optional author list; original authors if empty.
        titleopt     -- optional title; original title + " Split" if None.
        descopt      -- optional description; a generated one if empty.
        tags         -- optional list of dc:subject values.
        languages    -- optional list of dc:language values; "en" if empty.
        coverjpgpath -- optional path of a JPEG to add as cover image,
                        together with a generated cover.xhtml page.
        """
        if changedtocs is None:
            changedtocs = {}
        if tags is None:
            tags = []

        files = self.get_split_files(linenums)

        ## Write mimetype file, must be first and uncompressed.
        ## Older versions of python(2.4/5) don't allow you to specify
        ## compression by individual file.
        ## Overwrite if existing output file.
        outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
        try:
            outputepub.debug = 3
            outputepub.writestr("mimetype", "application/epub+zip")
        finally:
            outputepub.close()

        ## Re-open file for content.
        outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
        try:
            outputepub.debug = 3

            ## Create META-INF/container.xml file.  The only thing it does is
            ## point to content.opf
            containerdom = getDOMImplementation().createDocument(None, "container", None)
            containertop = containerdom.documentElement
            containertop.setAttribute("version","1.0")
            containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
            rootfiles = containerdom.createElement("rootfiles")
            containertop.appendChild(rootfiles)
            rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
                                                                  "media-type":"application/oebps-package+xml"}))
            outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent=' ',encoding='utf-8'))

            #### ## create content.opf file.
            uniqueid="epubsplit-uid-%d" % time() # real sophisticated uid scheme.
            contentdom = getDOMImplementation().createDocument(None, "package", None)
            package = contentdom.documentElement

            package.setAttribute("version","2.0")
            package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
            package.setAttribute("unique-identifier","epubsplit-id")
            metadata=newTag(contentdom,"metadata",
                            attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
                                   "xmlns:opf":"http://www.idpf.org/2007/opf"})
            metadata.appendChild(newTag(contentdom,"dc:identifier",text=uniqueid,attrs={"id":"epubsplit-id"}))
            if titleopt is None:
                titleopt = self.origtitle+" Split"
            metadata.appendChild(newTag(contentdom,"dc:title",text=titleopt))

            if authoropts:
                useauthors=authoropts
            else:
                useauthors=self.origauthors

            # Each distinct author is written once, in first-seen order.
            usedauthors=set()
            for author in useauthors:
                if author not in usedauthors:
                    usedauthors.add(author)
                    metadata.appendChild(newTag(contentdom,"dc:creator",
                                                attrs={"opf:role":"aut"},
                                                text=author))

            metadata.appendChild(newTag(contentdom,"dc:contributor",text="epubsplit",attrs={"opf:role":"bkp"}))
            metadata.appendChild(newTag(contentdom,"dc:rights",text="Copyrights as per source stories"))

            if languages:
                for l in languages:
                    metadata.appendChild(newTag(contentdom,"dc:language",text=l))
            else:
                metadata.appendChild(newTag(contentdom,"dc:language",text="en"))

            if not descopt:
                description = newTag(contentdom,"dc:description",text="Split from %s by %s."%(self.origtitle,", ".join(self.origauthors)))
            else:
                description = newTag(contentdom,"dc:description",text=descopt)
            metadata.appendChild(description)

            for tag in tags:
                metadata.appendChild(newTag(contentdom,"dc:subject",text=tag))

            package.appendChild(metadata)

            manifest = contentdom.createElement("manifest")
            package.appendChild(manifest)
            spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
            package.appendChild(spine)

            manifest.appendChild(newTag(contentdom,"item",
                                        attrs={'id':'ncx',
                                               'href':'toc.ncx',
                                               'media-type':'application/x-dtbncx+xml'}))

            if coverjpgpath:
                # <meta name="cover" content="cover.jpg"/>
                metadata.appendChild(newTag(contentdom,"meta",{"name":"cover",
                                                               "content":"coverimageid"}))
                # at end of <package>:
                # <guide>
                # <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
                # </guide>
                guide = newTag(contentdom,"guide")
                guide.appendChild(newTag(contentdom,"reference",attrs={"type":"cover",
                                                                       "title":"Cover",
                                                                       "href":"cover.xhtml"}))
                package.appendChild(guide)

                manifest.appendChild(newTag(contentdom,"item",
                                            attrs={'id':"coverimageid",
                                                   'href':"cover.jpg",
                                                   'media-type':"image/jpeg"}))

                # Note that the id of the cover xhmtl *must* be 'cover'
                # for it to work on Nook.
                manifest.appendChild(newTag(contentdom,"item",
                                            attrs={'id':"cover",
                                                   'href':"cover.xhtml",
                                                   'media-type':"application/xhtml+xml"}))
                spine.appendChild(newTag(contentdom,"itemref",
                                         attrs={"idref":"cover",
                                                "linear":"yes"}))

            contentcount=0
            # The original item id is not reused; ids are regenerated as a<N>.
            for (filename,_origid,media_type,filedata) in files:
                # add to manifest and spine
                if coverjpgpath and filename == "cover.xhtml":
                    continue # don't dup cover.

                outputepub.writestr(filename,filedata.encode('utf-8'))
                item_id = "a%d"%contentcount
                contentcount += 1
                manifest.appendChild(newTag(contentdom,"item",
                                            attrs={'id':item_id,
                                                   'href':filename,
                                                   'media-type':media_type}))
                spine.appendChild(newTag(contentdom,"itemref",
                                         attrs={"idref":item_id,
                                                "linear":"yes"}))

            fontdecrypter = FontDecrypter(self.epub,self.get_content_dom())
            for (linked,media_type) in self.filecache.linkedfiles:
                # add to manifest
                if coverjpgpath and linked == "cover.jpg":
                    continue # don't dup cover.

                try:
                    linkeddata = self.get_file(linked)
                    if linked in fontdecrypter.get_encrypted_fontfiles():
                        print("Decrypting font file: %s"%linked)
                        linkeddata = fontdecrypter.get_decrypted_font_data(linked)
                    outputepub.writestr(linked,linkeddata)
                except Exception as e:
                    print("Skipping linked file (%s)\nException: %s"%(linked,e))
                    # A file that was not written must not be listed in
                    # the manifest either.
                    continue

                item_id = "a%d"%contentcount
                contentcount += 1
                manifest.appendChild(newTag(contentdom,"item",
                                            attrs={'id':item_id,
                                                   'href':linked,
                                                   'media-type':media_type}))

            contentxml = contentdom.toprettyxml(indent=' ') # ,encoding='utf-8'
            # tweak for brain damaged Nook STR.  Nook insists on name before content.
            contentxml = contentxml.replace('<meta content="coverimageid" name="cover"/>',
                                            '<meta name="cover" content="coverimageid"/>')
            outputepub.writestr("content.opf",contentxml)

            ## create toc.ncx file
            tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
            ncx = tocncxdom.documentElement
            ncx.setAttribute("version","2005-1")
            ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/")
            head = tocncxdom.createElement("head")
            ncx.appendChild(head)
            head.appendChild(newTag(tocncxdom,"meta",
                                    attrs={"name":"dtb:uid", "content":uniqueid}))
            depthnode = newTag(tocncxdom,"meta",
                               attrs={"name":"dtb:depth", "content":"1"})
            head.appendChild(depthnode)
            head.appendChild(newTag(tocncxdom,"meta",
                                    attrs={"name":"dtb:totalPageCount", "content":"0"}))
            head.appendChild(newTag(tocncxdom,"meta",
                                    attrs={"name":"dtb:maxPageNumber", "content":"0"}))

            docTitle = tocncxdom.createElement("docTitle")
            docTitle.appendChild(newTag(tocncxdom,"text",text=stripHTML(titleopt)))
            ncx.appendChild(docTitle)

            tocnavMap = tocncxdom.createElement("navMap")
            ncx.appendChild(tocnavMap)

            # come back to lines again for TOC because files only has files(gasp-shock!)
            count=1
            for line in self.split_lines:
                if not line['include']:
                    continue
                # if changed, use only changed values.
                if line['num'] in changedtocs:
                    line['toc'] = changedtocs[line['num']]
                # can have more than one toc entry.
                for title in line['toc']:
                    newnav = newTag(tocncxdom,"navPoint",
                                    {"id":"a%03d"%count,"playOrder":"%d" % count})
                    count += 1
                    tocnavMap.appendChild(newnav)
                    navlabel = newTag(tocncxdom,"navLabel")
                    newnav.appendChild(navlabel)
                    navlabel.appendChild(newTag(tocncxdom,"text",text=stripHTML(title)))
                    # Point at the renamed file when this anchor ended up
                    # in one; otherwise at the original href.
                    if line['anchor'] and line['href']+"#"+line['anchor'] in self.filecache.anchors:
                        src = self.filecache.anchors[line['href']+"#"+line['anchor']]
                    else:
                        src = line['href']
                    newnav.appendChild(newTag(tocncxdom,"content",
                                              {"src":src}))

            outputepub.writestr("toc.ncx",tocncxdom.toprettyxml(indent=' ',encoding='utf-8'))

            if coverjpgpath:
                # write, not write string.  Pulling from file.
                outputepub.write(coverjpgpath,"cover.jpg")

                outputepub.writestr("cover.xhtml",'''
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head><title>Cover</title><style type="text/css" title="override_css">
@page {padding: 0pt; margin:0pt}
body { text-align: center; padding:0pt; margin: 0pt; }
div { margin: 0pt; padding: 0pt; }
</style></head><body><div>
<img src="cover.jpg" alt="cover"/>
</div></body></html>
''')

            # declares all the files created by Windows.  otherwise, when
            # it runs in appengine, windows unzips the files as 000 perms.
            for zf in outputepub.filelist:
                zf.create_system = 0
        finally:
            # Close the archive even when writing failed part way.
            outputepub.close()
class FileCache:
    """Track output file names, anchors and linked resources for a split.

    oldnew      -- original href -> list of output names made from it.
    newold      -- output name -> original href.
    anchors     -- "origfile#id" -> "outputfile#id" for every element id
                   seen in the added content files.
    linkedfiles -- set of (href, media-type) for images, stylesheets and
                   fonts that the output needs.
    """

    # Manifest media types treated as embedded fonts.
    _FONT_MEDIA_TYPES = ('application/vnd.ms-opentype',
                         'application/x-font-ttf',
                         'application/x-font-truetype',
                         'application/font-sfnt')

    def __init__(self,manifest_items=None):
        """``manifest_items`` is the dict built by ``SplitEpub.get_manifest_items``."""
        # A fresh dict per instance; a literal {} default would be one
        # shared object stored on every instance.
        self.manifest_items = {} if manifest_items is None else manifest_items
        self.oldnew = {}
        self.newold = {}
        self.anchors = {}
        self.linkedfiles = set()

        ## always include font files for embedded fonts
        for key, value in self.manifest_items.items():
            if key.startswith('i:') and value[1] in self._FONT_MEDIA_TYPES:
                self.add_linked_file(value[0])

    def add_linked_file(self, href):
        """Record ``href`` as a linked resource.

        The media type comes from the manifest; it is ``'unknown'`` when
        the manifest has no entry for the normalized href.
        """
        href = normpath(unquote(href)) # fix %20 & /../
        if ("h:"+href) in self.manifest_items:
            media_type = self.manifest_items["h:"+href][1]
        else:
            media_type = 'unknown'
        self.linkedfiles.add((href,media_type))

    def add_content_file(self, href, filedata):
        """Register one piece of content file ``href`` and return its output name.

        The first piece keeps the original name; later pieces of the same
        file are named ``<path><n>-<file>``.  The piece's element ids are
        recorded in ``anchors`` and the images and stylesheets it refers
        to are added to ``linkedfiles``.
        """
        if href not in self.oldnew:
            self.oldnew[href]=[]
            newfile = href
        else:
            newfile = "%s%d-%s"%(get_path_part(href),
                                 len(self.oldnew[href]),
                                 get_file_part(href))

        self.oldnew[href].append(newfile)
        self.newold[newfile]=href

        soup = BeautifulSoup(filedata,'html5lib')
        relpath = get_path_part(href)

        # Any tag with an id can be a link target, not just 'a'.
        for tag in soup.findAll():
            if tag.has_attr('id'):
                self.anchors[href+'#'+tag['id']]=newfile+'#'+tag['id']

        # <image> from baen epub.
        # <image width="462" height="616" xlink:href="cover.jpeg"/>
        for img in soup.findAll('img') + soup.findAll('image'):
            src = None
            if img.has_attr('src'):
                src=img['src']
            # xlink:href wins when both attributes are present.
            if img.has_attr('xlink:href'):
                src=img['xlink:href']
            if src:
                self.add_linked_file(relpath+src)
            else:
                logger.info("img tag without src in file:(%s) tag:(%s)"%(href,img))

        # link href="0.css" type="text/css"
        for style in soup.findAll('link',{'type':'text/css'}):
            if style.has_attr('href'):
                self.add_linked_file(relpath+style['href'])

        return newfile
def splitHtml(data,tagid,before=False):
    """Cut an HTML document at the element whose id is tagid.

    data is parsed with BeautifulSoup's 'lxml' parser.  If no element has
    that id, data is returned untouched (not re-serialized).

    before=True keeps the part before the split point: the siblings that
    follow the split point, and those that follow each of its ancestors
    below <body>, are removed, and so is the split point itself.

    before=False keeps the split point and what follows it: the siblings
    that precede the split point, and those that precede each of its
    ancestors below <body>, are removed.

    Returns the remaining document as text, with every run of line breaks
    (and the spaces directly before them) collapsed to a single '\\r\\n'.
    """
    soup = BeautifulSoup(data,'lxml')
    splitpoint = soup.find(id=tagid)
    if splitpoint is None:
        return data

    if before:
        # remove all next siblings, at this level and at every enclosing
        # level up to (but not including) <body>.
        for sibling in splitpoint.find_next_siblings():
            sibling.extract()
        parent = splitpoint.parent
        while parent and parent.name != 'body':
            for sibling in parent.find_next_siblings():
                sibling.extract()
            parent = parent.parent
        # The split point belongs to the following piece, so drop it here.
        splitpoint.extract()
    else:
        # remove all prev siblings, at this level and at every enclosing
        # level up to (but not including) <body>.
        for sibling in splitpoint.find_previous_siblings():
            sibling.extract()
        parent = splitpoint.parent
        while parent and parent.name != 'body':
            for sibling in parent.find_previous_siblings():
                sibling.extract()
            parent = parent.parent

    return re.sub(r'( *\r?\n)+','\r\n',unicode(soup))
def get_path_part(n):
    """Return the directory portion of path n with a trailing '/'.

    A path with no directory component yields the empty string.
    """
    directory = os.path.dirname(n)
    if not len(directory):
        return directory
    return directory + "/"
def get_file_part(n):
    """Return the final component (file name) of path n."""
    _directory, filename = os.path.split(n)
    return filename
## Utility method for creating new tags.
def newTag(dom,name,attrs=None,text=None):
    """Create and return a new element called name using document dom.

    attrs, when given, is a mapping of attribute names to values that are
    set on the element.  text, when given, is appended as a single text
    node child.  The element is not attached to the document tree.
    """
    element = dom.createElement(name)
    if attrs is not None:
        for attr_name, attr_value in attrs.items():
            element.setAttribute(attr_name, attr_value)
    if text is not None:
        text_node = dom.createTextNode(text)
        element.appendChild(text_node)
    return element
def main(argv,usage=None):
    """Command-line entry point for listing split points or writing splits.

    argv is the argument list without the program name.  usage, when
    given, replaces the leading 'usage: python %prog' of the help text.

    Behaviour by arguments:
      * no positional argument: print the help text.
      * only an epub: print every possible split line with its non-empty
        toc/guide/anchor/id/href values.
      * an epub plus line numbers: write one epub containing those lines.
      * --split-by-section: write one epub per section, named
        NNNN-<output name>.  A listed line with no ToC entry is folded into
        the section before it; with no line numbers, all lines are used.

    Always returns None.
    """
    from optparse import OptionParser

    if not usage:
        # read in args, anything starting with -- will be treated as --<variable>=<value>
        usage = 'usage: python %prog'
    parser = OptionParser(usage+''' [options] <input epub> [line numbers...]
Giving an epub without line numbers will return a list of line numbers: the
possible split points in the input file. Calling with line numbers will
generate an epub with each of the "lines" given included.''')
    parser.add_option("-o", "--output", dest="outputopt", default="split.epub",
                      help="Set OUTPUT file, Default: split.epub", metavar="OUTPUT")
    parser.add_option("--output-dir", dest="outputdiropt",
                      help="Set OUTPUT directory, Default: present working directory")
    parser.add_option('--split-by-section',
                      action='store_true', dest='split_by_section',
                      help='Create a new epub from each of the listed line sections instead of one containing all. Splits all sections if no line numbers are given. Each split will be named <number>-<output name> and placed in the output-dir. Sections without a Table of Contents entry will be included with the preceding section(s)', )
    parser.add_option("-t", "--title", dest="titleopt", default=None,
                      help="Use TITLE as the metadata title. Default: '<original epub title> Split' or ToC entry with --split-by-section", metavar="TITLE")
    parser.add_option("-d", "--description", dest="descopt", default=None,
                      help="Use DESC as the metadata description. Default: 'Split from <epub title> by <author>'.", metavar="DESC")
    parser.add_option("-a", "--author",
                      action="append", dest="authoropts", default=[],
                      help="Use AUTHOR as a metadata author, multiple authors may be given, Default: <All authors from original epub>", metavar="AUTHOR")
    parser.add_option("-g", "--tag",
                      action="append", dest="tagopts", default=[],
                      help="Include TAG as dc:subject tag, multiple tags may be given, Default: None", metavar="TAG")
    parser.add_option("-l", "--language",
                      action="append", dest="languageopts", default=[],
                      help="Include LANG as dc:language tag, multiple languages may be given, Default: en", metavar="LANG")
    parser.add_option("-c", "--cover", dest="coveropt", default=None,
                      help="Path to a jpg to use as cover image.", metavar="COVER")

    (options, args) = parser.parse_args(argv)

    ## Add .epub if not already there.
    if not options.outputopt.lower().endswith(".epub"):
        options.outputopt=options.outputopt+".epub"

    if not options.languageopts:
        options.languageopts = ['en']

    if not args:
        parser.print_help()
        return

    def output_path(filename):
        # Place filename in --output-dir when one was given.
        if options.outputdiropt:
            return os.path.join(options.outputdiropt,filename)
        return filename

    epubO = SplitEpub(args[0])
    lines = epubO.get_split_lines()

    if options.split_by_section:
        if len(args) > 1:
            section_lines = args[1:]
        else:
            section_lines = range(len(lines))

        splitslist = []
        sectionlist = []
        for lineno in section_lines:
            toclist = lines[int(lineno)]['toc']
            if sectionlist and not toclist:
                # No ToC entry: extend the current section.  This is the
                # same list object already stored in splitslist.
                sectionlist.append(lineno)
            else:
                ## take title from (first) ToC if available, else titleopt (_ Split internally if None)
                title = (toclist[0] if toclist else options.titleopt)
                print("title: %s"%title)
                sectionlist=[lineno]
                splitslist.append((sectionlist,title))
        # Every section is queued at the moment it is started, so nothing
        # is left to flush here; appending the last section again would
        # write it out a second time.

        for filecount, (sectionlist, title) in enumerate(splitslist, 1):
            outputfile = output_path("%0.4d-%s"%(filecount,options.outputopt))
            print("output file: "+outputfile)
            epubO.write_split_epub(outputfile,
                                   sectionlist,
                                   authoropts=options.authoropts,
                                   titleopt=title,
                                   descopt=options.descopt,
                                   tags=options.tagopts,
                                   languages=options.languageopts,
                                   coverjpgpath=options.coveropt)
        return

    if len(args) == 1:
        # Only the epub was given: list the available split lines.
        showlist=['toc','guide','anchor','id','href']
        for count, line in enumerate(lines):
            print("\nLine Number: %d"%count)
            for s in showlist:
                if s in line and line[s]:
                    print("\t%s: %s"%(s,line[s]))
        return

    # Epub plus explicit line numbers: one combined output file.
    outputfile = output_path(options.outputopt)
    print("output file: "+outputfile)
    epubO.write_split_epub(outputfile,
                           args[1:],
                           authoropts=options.authoropts,
                           titleopt=options.titleopt,
                           descopt=options.descopt,
                           tags=options.tagopts,
                           languages=options.languageopts,
                           coverjpgpath=options.coveropt)
    return
# Script entry point: run the command-line interface with the arguments
# that follow the program name.  Nothing runs on import.
if __name__ == "__main__":
    main(sys.argv[1:])