Source code for ebooklib.utils
# This file is part of EbookLib.
# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
#
# EbookLib is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# EbookLib is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with EbookLib. If not, see <http://www.gnu.org/licenses/>.
import io
import mimetypes
import os
from lxml import etree
# Flipped to True the first time guess_type() runs, after it has initialised
# the mimetypes module and registered the ".xhtml" type, so that this setup
# is performed only once per process.
mimetype_initialised = False
[docs]
def debug(obj):
    """Pretty-print *obj* to standard output with a 4-space indent.

    Intended as a development aid; nothing is returned.
    """
    from pprint import pprint

    # The module-level helper builds the same PrettyPrinter(indent=4)
    # internally and writes to sys.stdout.
    pprint(obj, indent=4)
[docs]
def parse_string(s):
    """Parse an XML document and return the resulting lxml element tree.

    The document is parsed with a recovering parser that does not resolve
    entities.

    Args:
        s: The XML document. Text is encoded as UTF-8 before parsing; any
            object that cannot be encoded that way (for example ``bytes``)
            is handed to the parser as-is.

    Returns:
        The tree object produced by ``etree.parse``.

    Raises:
        Whatever ``etree.parse`` raises for a document it cannot recover.
        The parser error is propagated unchanged instead of being replaced
        by an unrelated ``TypeError`` from a second parse attempt.
    """
    parser = etree.XMLParser(recover=True, resolve_entities=False)

    # Only the text -> bytes conversion is guarded.  Keeping the parse call
    # outside the ``try`` means a genuine parse failure is reported once,
    # with its real cause, rather than being swallowed and retried.
    try:
        data = s.encode("utf-8")
    except (AttributeError, UnicodeError):
        # No usable ``encode`` (e.g. bytes) or not encodable: pass through.
        data = s

    return etree.parse(io.BytesIO(data), parser=parser)
[docs]
def parse_html_string(s):
    """Parse an HTML document with lxml and return its root element.

    Args:
        s: The HTML document to parse.

    Returns:
        The result of ``lxml.html.document_fromstring`` using an HTML
        parser configured with ``encoding="utf-8"``.
    """
    from lxml import html as lxml_html

    parser = lxml_html.HTMLParser(encoding="utf-8")
    return lxml_html.document_fromstring(s, parser=parser)
[docs]
def _ensure_mimetypes_initialised():
    """Initialise ``mimetypes`` and register ``.xhtml`` on first use only."""
    global mimetype_initialised
    if mimetype_initialised:
        return
    mimetypes.init()
    mimetypes.add_type("application/xhtml+xml", ".xhtml")
    mimetype_initialised = True


def guess_type(extenstion):
    """Guess the MIME type for a file name.

    On the first call the ``mimetypes`` module is initialised and the
    ``.xhtml`` extension is mapped to ``application/xhtml+xml``.

    Args:
        extenstion: Value passed straight to ``mimetypes.guess_type``
            (the parameter name keeps its historical spelling so keyword
            callers continue to work).

    Returns:
        The ``(type, encoding)`` tuple returned by ``mimetypes.guess_type``.
    """
    _ensure_mimetypes_initialised()
    return mimetypes.guess_type(extenstion)
[docs]
def create_pagebreak(pageref, label=None, html=True):
    """Build a ``<span>`` element marking a page break.

    The span carries an ``epub:type`` of ``pagebreak`` and uses the
    formatted *pageref* for both its ``title`` and ``id`` attributes.

    Args:
        pageref: Page reference; its formatted value becomes ``title`` and
            ``id``.
        label: Optional text placed inside the span. Ignored when falsy.
        html: When true (the default) return the serialised markup as a
            string; otherwise return the lxml element itself.

    Returns:
        A unicode string of the serialised span, or the element object.
    """
    from ebooklib.epub import NAMESPACES

    epub_ns = NAMESPACES["EPUB"]
    ref_text = "{pageref}".format(pageref=pageref)  # noqa: UP032

    attributes = {
        "{%s}type" % epub_ns: "pagebreak",  # noqa
        "title": ref_text,
        "id": ref_text,
    }
    span = etree.Element("span", attributes, nsmap={"epub": epub_ns})

    if label:
        span.text = label

    if not html:
        return span
    return etree.tostring(span, encoding="unicode")
[docs]
def get_pages(item):
    """Collect page markers found in the body of *item*.

    Every element in the parsed body that has an ``epub:type`` attribute
    and an ``id`` contributes one entry.

    The label of an entry is chosen in this order:
      1. the element's own text, stripped, when it is not empty;
      2. its ``aria-label`` attribute;
      3. the value returned by ``get_headers(elem)``;
      4. the element's ``id`` when the chosen label is empty or missing.

    Args:
        item: Object providing ``get_body_content()`` and ``get_name()``.

    Returns:
        A list of ``(item name, element id, label)`` tuples in document
        order.
    """
    found = []
    root = parse_html_string(item.get_body_content())

    for node in root.iter():
        if "epub:type" not in node.attrib:
            continue

        node_id = node.get("id")
        if node_id is None:
            continue

        # Prefer visible text, but only when something remains after stripping.
        stripped = node.text.strip() if node.text is not None else ""
        label = stripped if stripped != "" else None

        if label is None:
            label = node.get("aria-label")
        if label is None:
            label = get_headers(node)

        found.append((item.get_name(), node_id, label or node_id))

    return found
[docs]
def get_pages_for_items(items):
    """Gather the page markers of several items into one flat list.

    Args:
        items: Iterable of objects accepted by ``get_pages``.

    Returns:
        A single list containing the entries returned by ``get_pages`` for
        each item, in iteration order.
    """
    all_pages = []
    for document in items:
        all_pages.extend(get_pages(document))
    return all_pages
[docs]
class Directory(object):  # noqa: UP004
    """Read files located beneath a base directory on disk."""

    def __init__(self, directory_path):
        """Remember *directory_path* as the base for later reads."""
        self.directory_path = directory_path

    def read(self, subname):
        """Return the raw bytes of the file *subname* inside the directory.

        Args:
            subname: Path joined onto ``directory_path`` with
                ``os.path.join``.

        Returns:
            The complete file content as ``bytes``.
        """
        target = os.path.join(self.directory_path, subname)
        with open(target, "rb") as handle:
            content = handle.read()
        return content