# Source code for ferenda.transformer

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from tempfile import mkdtemp
import os
import shutil
import re

import pkg_resources
from lxml import etree
from lxml.etree import XSLT

from ferenda import errors, util

# assumption: A transformer is initialized with a single template. If
# you want to use a different template, create a different
# transformer.

class Transformer(object):
    """Transforms parsed "pure content" documents into "browser-ready"
    HTML5 files with site branding and navigation, using a template of
    some kind.

    :param transformertype: The engine to be used for transforming. Right
                            now only ``"XSLT"`` is supported.
    :type  transformertype: str
    :param template: The main template file.
    :type  template: str
    :param templatedirs: Directories that may contain supporting templates
                         used by the main template.
    :type  templatedirs: list of str
    :param documentroot: The base directory for all generated files --
                         used to make relative references to CSS/JS files
                         correct.
    :type  documentroot: str
    :param config: Any configuration information used by the
                   transforming engine. Can be a path to a config
                   file, a python data structure, or anything else
                   compatible with the engine selected by
                   ``transformertype``.

    .. note::

       An initialized Transformer object only transforms using the
       template file provided at initialization. If you need to use
       another template file, create another Transformer object.

    """

    def __init__(self, transformertype, template, templatedirs,
                 documentroot=None, config=None):
        # An unknown transformertype raises KeyError from this lookup.
        cls = {'XSLT': XSLTTransform,
               'JINJA': JinjaTransform}[transformertype]
        self.t = cls(template, templatedirs)
        self.documentroot = documentroot
        self.config = config

    # valid parameters
    # - annotationfile: intermediate/basefile.grit.xml
    def transform(self, indata, depth, parameters=None, uritransform=None):
        """Perform the transformation. This method always operates on the
        "native" datastructure -- this might be different depending on
        the transformer engine. For XSLT, which is implemented through
        lxml, its in- and outdata are lxml trees.

        If you need an engine-independent API, use
        :meth:`~ferenda.Transformer.transform_stream` or
        :meth:`~ferenda.Transformer.transform_file` instead.

        :param indata: The document to be transformed
        :param depth: The directory nesting level, compared to
                      ``documentroot``
        :type  depth: int
        :param parameters: Any parameters that should be provided to the
                           template
        :type  parameters: dict
        :param uritransform: A function, when called with an URI, returns
                             a transformed URI/URL (such as the relative
                             path to a static file) -- used when
                             transforming to files used for static
                             offline use.
        :type  uritransform: callable
        :returns: The transformed document
        """
        if parameters is None:
            parameters = {}

        # the provided configuration (might be a file or a python dict
        # or anything else, depending on the transformer engine) will
        # contain lists of JS/CSS resources. In order to make it
        # possible to use relative links to these (needed for offline
        # static HTML files), we first do a transformer
        # engine-specific adaption of the configuration depending on
        # the directory depth level of the outfile (as provided
        # through the depth parameter), then we provide this adapted
        # configuration to the transform call
        if self.config:
            adapted_config = self.t.getconfig(self.config, depth)
        else:
            adapted_config = None
        outdata = self.t.transform(indata, adapted_config, parameters)
        if uritransform:
            self._transform_links(outdata.getroot(), uritransform)
        return outdata

    def _transform_links(self, tree, uritransform):
        """Rewrite, in place, the ``href`` of every ``a`` element below
        *tree* by passing it through *uritransform*.

        Elements without a (non-empty) ``href`` are left untouched. The
        *tree* element itself is not examined, only its descendants.
        """
        for part in tree:
            # depth-first transformation seems the easiest
            self._transform_links(part, uritransform)
            if part.tag != "a":
                continue
            uri = part.get("href")
            if not uri:
                continue
            part.set("href", uritransform(uri))

    def transform_stream(self, instream, depth, parameters=None,
                         uritransform=None):
        """Accepts a file-like object, returns a file-like object."""
        # NOTE(review): this relies on the engine providing
        # stream_to_native()/native_to_stream() -- confirm that the
        # selected engine implements them.
        return self.t.native_to_stream(
            self.transform(self.t.stream_to_native(instream),
                           depth,
                           parameters,
                           uritransform))

    def transform_file(self, infile, outfile, parameters=None,
                       uritransform=None):
        """Accepts two filenames, reads from *infile*, writes to *outfile*.

        The nesting depth is derived from the location of *outfile*
        relative to ``documentroot``, so ``documentroot`` must have been
        set. If the environment variable ``FERENDA_TRANSFORMDEBUG`` is
        set to a non-empty value, an equivalent ``xsltproc`` command line
        is logged before transforming.
        """
        depth = self._depth(os.path.dirname(outfile),
                            self.documentroot + os.sep + "index.html")
        if os.environ.get('FERENDA_TRANSFORMDEBUG', False):
            self._log_xsltproc_equivalent(infile, outfile, depth, parameters)
        self.t.native_to_file(
            self.transform(self.t.file_to_native(infile),
                           depth,
                           parameters,
                           uritransform),
            outfile)

    def _log_xsltproc_equivalent(self, infile, outfile, depth, parameters):
        """Log (at debug level) an ``xsltproc`` command line equivalent to
        the transformation about to be performed."""
        import logging
        log = logging.getLogger("ferenda.transformer")
        if not self.config:
            log.warning("self.config not set, cannot construct equivalent "
                        "xsltproc command line")
            return
        if os.path.exists(self.t.orig_template):
            xslfile = self.t.orig_template
        else:
            import pkg_resources
            xslfile = pkg_resources.resource_filename(
                'ferenda', self.t.orig_template)
        xsldir = os.path.dirname(xslfile)
        # parameters may legitimately be None here; work on a private
        # copy so the caller's dict is never modified
        p = {}
        for key, value in (parameters or {}).items():
            if key.endswith("file"):
                value = os.path.relpath(value, xsldir)
            p[key] = value
        p['configurationfile'] = self.t.getconfig(self.config, depth)
        stringparams = " ".join('--stringparam %s "%s"' % (key, value)
                                for key, value in p.items())
        log.debug("Equiv: xsltproc --nonet %s %s %s > %s",
                  stringparams,
                  os.path.relpath(xslfile, os.getcwd()),
                  infile,
                  outfile)

    def _depth(self, outfiledir, root):
        """Return how many directory levels *outfiledir* lies below the
        directory that contains *root*."""
        # NB: root must be a file in the root dir
        # Count whole ".." path components rather than the substring, so
        # that names which merely contain ".." are not miscounted.
        relative = os.path.relpath(root, outfiledir)
        return relative.split(os.sep).count(os.pardir)
class TransformerEngine(object):
    """Base class for the template engines that
    :class:`Transformer` delegates to. The constructor accepts the main
    template and the supporting template directories but does nothing
    with them."""

    def __init__(self, template, templatedirs):
        pass


class XSLTTransform(TransformerEngine):
    """Transformer engine that applies an XSLT stylesheet through lxml.

    All stylesheets (the main *template* and everything found in
    *templatedirs*) are copied into a private temporary directory, which
    is removed again when the object is garbage collected.

    :param template: Path to the main XSLT stylesheet.
    :param templatedirs: Directories (on disk, or resource directories
                         within the ``ferenda`` package) holding
                         supporting stylesheets.
    :raises ferenda.errors.TransformError: if the stylesheet cannot be
                                           compiled.
    """

    def __init__(self, template, templatedirs, **kwargs):
        self.orig_template = template
        self.orig_templatedirs = templatedirs  # ?
        self.format = True  # FIXME: make configurable
        self.templdir = self._setup_templates(template, templatedirs)
        worktemplate = self.templdir + os.sep + os.path.basename(template)
        assert os.path.exists(worktemplate)
        parser = etree.XMLParser(remove_blank_text=self.format)
        xsltree = etree.parse(worktemplate, parser)
        try:
            self._transformer = etree.XSLT(xsltree)
        except etree.XSLTParseError as e:
            raise errors.TransformError(str(e.error_log))

    def __del__(self):
        # templdir is missing if __init__ failed before it was assigned
        templdir = getattr(self, "templdir", None)
        if templdir and os.path.exists(templdir):
            # this had better be a tempdir!
            shutil.rmtree(templdir)

    # purpose: get all XSLT files (main and supporting) into one place
    #          (should support zipped eggs, even if don't)
    # template:     full path to actual template to be used
    # templatedirs: directory of supporting XSLT templates
    # returns:      directory name of the place where all files ended up
    def _setup_templates(self, template, templatedirs):
        """Copy *template* and the contents of every directory in
        *templatedirs* into a fresh temporary directory and return the
        path of that directory."""
        workdir = mkdtemp()
        try:
            # copy everything to this temp dir
            for d in templatedirs:
                if os.path.isdir(d):
                    for f in os.listdir(d):
                        shutil.copy2(d + os.sep + f, workdir + os.sep + f)
                elif pkg_resources.resource_isdir('ferenda', d):
                    for f in pkg_resources.resource_listdir('ferenda', d):
                        fp = pkg_resources.resource_stream('ferenda',
                                                           d + "/" + f)
                        try:
                            dest = workdir + os.sep + f
                            with open(dest, "wb") as dest_fp:
                                dest_fp.write(fp.read())
                        finally:
                            fp.close()

            if os.path.basename(template) not in os.listdir(workdir):
                shutil.copy2(template, workdir)
        except Exception:
            # don't leave a half-populated temp dir behind
            shutil.rmtree(workdir, ignore_errors=True)
            raise
        return workdir

    # getconfig may return different data depending on engine -- in
    # this case it creates a xml file and returns the path for it
    def getconfig(self, configfile, depth):
        """Return the path of a configuration file whose relative
        resource links are adjusted for output *depth* directory levels
        below the document root.

        For ``depth == 0`` *configfile* itself is returned. Otherwise a
        sibling file named ``<base>-depth-<depth><ext>`` is written
        (unless ``util.outfile_is_newer`` reports it as up to date) and
        its path returned.
        """
        if depth == 0:
            return configfile
        base, ext = os.path.splitext(configfile)
        filename = "%s-depth-%d%s" % (base, depth, ext)
        if not util.outfile_is_newer([configfile], filename):
            tree = etree.parse(configfile)
            prefix = "../" * depth
            # adjust the relevant link attribute for some nodes
            for xpath, attrib in (("stylesheets/link", "href"),
                                  ("javascripts/script", "src"),
                                  (".//img", "src")):
                for node in tree.findall(xpath):
                    link = node.get(attrib)
                    if link is None:
                        # nothing to adjust (eg. an inline script)
                        continue
                    # don't adjust absolute links
                    if not re.match("(https?://|/)", link):
                        node.set(attrib, prefix + link)
            tree.write(filename)
        return filename

    def transform(self, indata, config=None, parameters=None):
        """Apply the stylesheet to *indata* and return the result.

        :param indata: The lxml tree to transform.
        :param config: Path to the configuration file, passed to the
                       stylesheet as the ``configurationfile`` parameter.
        :param parameters: Extra stylesheet parameters. Values whose key
                           ends in ``file`` are made relative to the
                           directory holding the stylesheets.
        :type  parameters: dict
        :raises ferenda.errors.TransformError: if applying the stylesheet
                                               fails.
        """
        strparams = {}
        if config:
            # paths to be used with the document() function
            # must use unix path separators
            if os.sep == "\\":
                config = config.replace(os.sep, "/")
            strparams['configurationfile'] = XSLT.strparam(config)
        for key, value in (parameters or {}).items():
            if key.endswith("file"):
                # relativize path of file relative to the XSL file
                # we'll be using. The mechanism could be clearer...
                value = os.path.relpath(value, self.templdir)
                if os.sep == "\\":
                    value = value.replace(os.sep, "/")
            strparams[key] = XSLT.strparam(value)
        # A check of self._transformer.error_log used to follow this
        # try block, but it could never be reached (both paths leave the
        # function) and referenced an undefined name, so it was removed.
        try:
            return self._transformer(indata, **strparams)
        except etree.XSLTApplyError as e:
            raise errors.TransformError(str(e))

    # nativedata = lxml.etree
    def native_to_file(self, nativedata, outfile):
        """Serialize *nativedata* and write it to *outfile*."""
        res = self.html5_doctype_workaround(
            etree.tostring(nativedata, pretty_print=self.format))
        util.ensure_dir(outfile)
        with open(outfile, "wb") as fp:
            fp.write(res)

    @staticmethod
    def html5_doctype_workaround(indata):
        """If *indata* (bytes) starts with ``<remove-this-tag>``, strip
        that opening tag together with everything from the last ``<``
        onwards, and prepend an HTML5 doctype. Other input is returned
        unchanged."""
        # FIXME: This is horrible
        prefix = b"<remove-this-tag>"
        if indata.startswith(prefix):
            # the last "<" starts the closing tag of the wrapper element
            endidx = indata.rfind(b"<")
            indata = (b"<!DOCTYPE html>\n" +
                      indata[len(prefix):endidx].strip())
        return indata

    def file_to_native(self, infile):
        """Parse the file *infile* and return the resulting lxml tree."""
        return etree.parse(infile)

# FIXME: hook in the transform_links step somehow?
class JinjaTransform(TransformerEngine):
    """Placeholder engine: adds nothing to :class:`TransformerEngine`."""
    pass

# client code
#
# NOTE(review): the names used below (TemplateTransformer,
# transform_tree, reldepth) do not match the Transformer API defined in
# this file -- the example looks outdated; confirm before relying on it.
#
# doc.body = elements.Body()
# for r in res:
#     doc.body.append(html.Div(
#         [html.H2([elements.Link(r['title'], uri=r['uri'])]),
#          r['text']], **{'class':'hit'}))
# pages = [html.P(["Results %(firstresult)s-%(lastresult)s of %(totalresults)s" % pager])]
# for pagenum in range(pager['pagecount']):
#     if pagenum + 1 == pager['pagenum']:
#         pages.append(html.Span([str(pagenum+1)],**{'class':'page'}))
#     else:
#         querystring['p'] = str(pagenum+1)
#         url = environ['PATH_INFO'] + "?" + urlencode(querystring)
#         pages.append(html.A([str(pagenum+1)],**{'class':'page',
#                                                 'href':url}))
# doc.body.append(html.Div(pages, **{'class':'pager'}))
#
# transformer = TemplateTransformer(transformertype="XSLT",
#                                   template="res/xsl/generic.xsl",
#                                   templatedirs=["res/xsl"],
#                                   documentroot="/var/www/site")
#
# newtree = transformer.transform_tree(doc.body.as_xhtml(),
#                                      reldepth=1)
# fp.write(etree.tostring(newtree, pretty_print=True))
#
# -- or --
#
# util.writefile("indata.xhtml", doc.body.as_xhtml().serialize())
# transformer.transform("indata.xhtml", "/var/www/site/my/own/file.html")
#
# references to root resources in file.html are now on the form
# "../../css/main.css", since file.html is 2 levels deep compared to
# documentroot.
#