Source code for ferenda.transformer
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from tempfile import mkdtemp
import os
import shutil
import re
import pkg_resources
from lxml import etree
from lxml.etree import XSLT
from ferenda import errors, util
# assumption: A transformer is initialized with a single template. If
# you want to use a different template, create a different
# transformer.
[docs]class Transformer(object):
"""Transforms parsed "pure content" documents into "browser-ready"
HTML5 files with site branding and navigation, using a template of
some kind.
:param transformertype: The engine to be used for transforming. Right now only ``"XSLT"`` is supported.
:type transformertype: str
:param template: The main template file.
:type template: str
:param templatedirs: Directories that may contain supporting templates used by the main template.
:type templatedirs: str
:param documentroot: The base directory for all generated files -- used to make relative references to CSS/JS files correct.
:type documentroot: str
:param config: Any configuration information used by the
transforming engine. Can be a path to a config
file, a python data structure, or anything else
compatible with the engine selected by
``transformertype``.
.. note::
An initialized Transformer object only transforms using the
template file provided at initialization. If you need to use
another template file, create another Transformer object.
"""
def __init__(self, transformertype,
template,
templatedirs,
documentroot=None,
config=None):
cls = {'XSLT': XSLTTransform,
'JINJA': JinjaTransform}[transformertype]
self.t = cls(template, templatedirs)
self.documentroot = documentroot
self.config = config
# valid parameters
# - annotationfile: intermediate/basefile.grit.xml
[docs] def transform(self, indata, depth, parameters=None, uritransform=None):
"""Perform the transformation. This method always operates on the
"native" datastructure -- this might be different depending on
the transformer engine. For XSLT, which is implemented through
lxml, its in- and outdata are lxml trees
If you need an engine-indepent API, use
:meth:`~ferenda.Transformer.transform_stream` or
:meth:`~ferenda.Transformer.transform_file` instead
:param indata: The document to be transformed
:param depth: The directory nesting level, compared to ``documentroot``
:type depth: int
:param parameters: Any parameters that should be provided to the
template
:type parameters: dict
:param uritransform: A function, when called with an URI,
returns a transformed URI/URL (such as
the relative path to a static file) --
used when transforming to files used for
static offline use.
:type uritransform: callable
:returns: The transformed document
"""
if parameters == None:
parameters = {}
# the provided configuration (might be a file or a python dict
# or anything else, depending on the transformer engine) will
# contain lists of JS/CSS resources. In order to make it
# possible to use relative links to these (needed for offline
# static HTML files), we first do a transformer
# engine-specific adaption of the configuration depending on
# the directory depth level of the outfile (as provided
# through the depth parameter), then we provide this adapted
# configuration to the transform call
if self.config:
adapted_config = self.t.getconfig(self.config, depth)
else:
adapted_config = None
outdata = self.t.transform(indata, adapted_config, parameters)
if uritransform:
self._transform_links(outdata.getroot(), uritransform)
return outdata
def _transform_links(self, tree, uritransform):
for part in tree:
# depth-first transformation seems the easiest
self._transform_links(part, uritransform)
if part.tag != "a":
continue
uri = part.get("href")
if not uri:
continue
part.set("href", uritransform(uri))
[docs] def transform_stream(self, instream, depth,
parameters=None, uritransform=None):
"""Accepts a file-like object, returns a file-like object."""
return self.t.native_to_stream(
self.transform(self.t.stream_to_native(instream),
depth,
parameters,
uritransform))
[docs] def transform_file(self, infile, outfile,
parameters=None, uritransform=None):
"""Accepts two filenames, reads from *infile*, writes to *outfile*."""
depth = self._depth(os.path.dirname(outfile),
self.documentroot+os.sep+"index.html")
helpful = os.environ.get('FERENDA_TRANSFORMDEBUG', False)
if helpful:
import logging
log = logging.getLogger("ferenda.transformer")
if self.config:
if os.path.exists(self.t.orig_template):
xslfile = self.t.orig_template
else:
import pkg_resources
xslfile = pkg_resources.resource_filename('ferenda', self.t.orig_template)
p = parameters.copy()
for key, value in p.items():
if key.endswith("file"):
p[key] = os.path.relpath(value,
os.path.dirname(xslfile))
p['configurationfile'] = self.t.getconfig(self.config, depth)
log.debug("Equiv: xsltproc --nonet %s %s %s > %s" %
(" ".join(['--stringparam %s "%s"' % (x, p[x]) for x in p]),
os.path.relpath(xslfile,
os.getcwd()),
infile, outfile))
else:
log.warning("self.config not set, cannot construct equivalent xsltproc command line")
self.t.native_to_file(self.transform(self.t.file_to_native(infile),
depth,
parameters,
uritransform),
outfile)
def _depth(self, outfiledir, root):
# NB: root must be a file in the root dir
return os.path.relpath(root, outfiledir).count("..")
class TransformerEngine(object):
def __init__(self, template, templatedirs):
pass
class XSLTTransform(TransformerEngine):
def __init__(self, template, templatedirs, **kwargs):
self.orig_template = template
self.orig_templatedirs = templatedirs # ?
self.format = True # FIXME: make configurable
self.templdir = self._setup_templates(template, templatedirs)
worktemplate = self.templdir + os.sep + os.path.basename(template)
assert os.path.exists(worktemplate)
parser = etree.XMLParser(remove_blank_text=self.format)
xsltree = etree.parse(worktemplate, parser)
try:
self._transformer = etree.XSLT(xsltree)
except etree.XSLTParseError as e:
raise errors.TransformError(str(e.error_log))
def __del__(self):
if os.path.exists(self.templdir):
# this had better be a tempdir!
shutil.rmtree(self.templdir)
# purpose: get all XSLT files (main and supporting) into one place
# (should support zipped eggs, even if setup.py don't)
# template: full path to actual template to be used
# templatedirs: directory of supporting XSLT templates
# returns: directory name of the place where all files ended up
def _setup_templates(self, template, templatedirs):
workdir = mkdtemp()
# copy everything to this temp dir
for d in templatedirs:
if os.path.exists(d) and os.path.isdir(d):
for f in os.listdir(d):
shutil.copy2(d + os.sep + f, workdir + os.sep + f)
elif pkg_resources.resource_isdir('ferenda', d):
for f in pkg_resources.resource_listdir('ferenda', d):
fp = pkg_resources.resource_stream('ferenda', d + "/" + f)
dest = workdir + os.sep + f
with open(dest, "wb") as dest_fp:
dest_fp.write(fp.read())
if os.path.basename(template) not in os.listdir(workdir):
shutil.copy2(template, workdir)
return workdir
# getconfig may return different data depending on engine -- in
# this case it creates a xml file and returns the path for it
def getconfig(self, configfile, depth):
filename = configfile
if depth != 0:
(base, ext) = os.path.splitext(configfile)
filename = "%(base)s-depth-%(depth)d%(ext)s" % locals()
if not util.outfile_is_newer([configfile], filename):
tree = etree.parse(configfile)
# adjust the relevant link attribute for some nodes
for xpath, attrib in (("stylesheets/link", "href"),
("javascripts/script", "src"),
(".//img", "src")):
for node in tree.findall(xpath):
# don't adjust absolute links
if not (re.match("(https?://|/)", node.get(attrib))):
node.set(attrib, "../" * depth + node.get(attrib))
tree.write(filename)
return filename
def transform(self, indata, config=None, parameters={}):
strparams = {}
if config:
# paths to be used with the document() function
# must use unix path separators
if os.sep == "\\":
config = config.replace(os.sep, "/")
# print("Tranform: Using config %s. Contents:" % config)
# print(util.readfile(config))
strparams['configurationfile'] = XSLT.strparam(config)
for key, value in parameters.items():
if key.endswith("file"):
# relativize path of file relative to the XSL file
# we'll be using. The mechanism could be clearer...
value = os.path.relpath(value, self.templdir)
if os.sep == "\\":
value = value.replace(os.sep, "/")
strparams[key] = XSLT.strparam(value)
try:
return self._transformer(indata, **strparams)
except etree.XSLTApplyError as e:
raise errors.TransformError(str(e))
if len(self._transformer.error_log) > 0:
raise errors.TransformError(str(_transformer.error_log))
# nativedata = lxml.etree
def native_to_file(self, nativedata, outfile):
res = self.html5_doctype_workaround(
etree.tostring(nativedata, pretty_print=self.format))
util.ensure_dir(outfile)
with open(outfile, "wb") as fp:
fp.write(res)
@staticmethod
def html5_doctype_workaround(indata):
# FIXME: This is horrible
if indata.startswith(b"<remove-this-tag>"):
found = False
endidx = -1
while not found:
if indata[endidx] == b"<" or indata[endidx] == 60:
found = True
else:
endidx -= 1
indata = b"<!DOCTYPE html>\n" + indata[17:endidx].strip()
return indata
def file_to_native(self, infile):
return etree.parse(infile)
# FIXME: hook in the transform_links step somehow?
class JinjaTransform(TransformerEngine):
pass
# client code
#
# doc.body = elements.Body()
# for r in res:
# doc.body.append(html.Div(
# [html.H2([elements.Link(r['title'], uri=r['uri'])]),
# r['text']], **{'class':'hit'}))
# pages = [html.P(["Results %(firstresult)s-%(lastresult)s of %(totalresults)s" % pager])]
# for pagenum in range(pager['pagecount']):
# if pagenum + 1 == pager['pagenum']:
# pages.append(html.Span([str(pagenum+1)],**{'class':'page'}))
# else:
# querystring['p'] = str(pagenum+1)
# url = environ['PATH_INFO'] + "?" + urlencode(querystring)
# pages.append(html.A([str(pagenum+1)],**{'class':'page',
# 'href':url}))
# doc.body.append(html.Div(pages, **{'class':'pager'}))
#
# transformer = TemplateTransformer(transformertype="XSLT",
# template="res/xsl/generic.xsl",
# templatedirs=["res/xsl"],
# documentroot="/var/www/site")
#
# newtree = transformer.transform_tree(doc.body.as_xhtml(),
# reldepth=1)
# fp.write(etree.tostring(newtree, pretty_print=True))
#
# -- or --
#
#
# util.writefile("indata.xhtml", doc.body.as_xhtml().serialize())
# transformer.transform("indata.xhtml", "/var/www/site/my/own/file.html")
#
# references to root resources in file.html are now on the form
# "../../css/main.css", since file.html is 2 levels deep compared to
# documentroot.
#