Source code for ferenda.decorators

# -*- coding: utf-8 -*-
"""Most of these decorators are intended to handle various aspects of
a complete :py:meth:`~ferenda.DocumentRepository.parse`
implementation. Normally you should only use the
:py:func:`~ferenda.decorators.managedparsing` decorator (if you even
override the basic implementation). If you create separate actions
aside from the standards (``download``, ``parse``, ``generate`` et
al), you should also use :py:func:`~ferenda.decorators.action` so that
manage.py will be able to call it.
"""
from __future__ import unicode_literals
from datetime import datetime
import codecs
import functools
import itertools
import os
import time

import six
from rdflib import Graph, URIRef
from layeredconfig import LayeredConfig

from ferenda import util
from ferenda import DocumentEntry
from ferenda.errors import DocumentRemovedError, ParseError
from ferenda.elements import serialize

[docs]def timed(f):
    """Automatically log a statement of how long the function call takes"""
    @functools.wraps(f)
    def wrapper(self, doc):
        start = time.time()
        ret = f(self, doc)
        # FIXME: We shouldn't log this if we don't actually do any
        # work. The easiest way is to make sure parseifneeded wraps
        # timed, not the other way round.

        # ALSO: the addition of "parse" here makes the decorator only
        # useful for the parse method. It'd be better to have the
        # decorator take a format string and a method to call to
        # log. But maybe the util.logtime context manager is better
        # suited for this usecase?
        if isinstance(self.config.processes, int) and self.config.processes > 1:
            self.log.info('%s: parse OK (%.3f sec) [pid %s]', doc.basefile, time.time() - start, os.getpid())
        else:
            self.log.info('%s: parse OK (%.3f sec)', doc.basefile, time.time() - start)
        return ret
    return wrapper


[docs]def recordlastdownload(f):
    """Automatically stores current time in ``self.config.lastdownload``
    """
    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        ret = f(self, *args, **kwargs)
        # only update the lastdownload for full downloads (if no
        # specific basefile was specified)
        if not args:
            self.config.lastdownload = datetime.now()
            LayeredConfig.write(self.config)
        return ret
    return wrapper


[docs]def parseifneeded(f):
    """Makes sure the parse function is only called if needed, i.e. if
    the outfile is nonexistent or older than the infile(s), or if the
    user has specified in the config file or on the command line that
    it should be re-generated."""
    @functools.wraps(f)
    def wrapper(self, doc):
        # note: We hardcode the use of .parseneeded() and the
        # 'parseforce' config option, which means that this decorator
        # can only be used sensibly with the .parse() function.
        force = (self.config.force is True or
                 self.config.parseforce is True)
        if not force and not self.parseneeded(doc.basefile):
            self.log.debug("%s: Skipped", doc.basefile)
            return True  # Signals that everything is OK
        else:
            self.log.debug("%s: Starting", doc.basefile)
            return f(self, doc)
    return wrapper


[docs]def render(f):
    """Handles the serialization of the :py:class:`~ferenda.Document`
    object to XHTML+RDFa and RDF/XML files. Must be used in
    conjunction with :py:func:`~ferenda.decorators.makedocument`.

    """
    # NOTE: The actual rendering is two lines of code. The bulk of
    # this function validates that the XHTML+RDFa file that we end up
    # with contains the exact same triples as is present in the doc
    # object (including both the doc.meta Graph and any other Graph
    # that might be present on any doc.body object). Also, this func
    # validates taht the documententry file has been properly filled,
    # which is sort of outside of the responsibility of this func,
    # but...
    
    def iterate_graphs(node):
        res = []
        if hasattr(node, 'meta') and node.meta is not None:
            res.append(node.meta)
        try:
            for subnode in node:
                if not isinstance(subnode, six.string_types):
                    res.extend(iterate_graphs(subnode))
        except TypeError: # node was not iterable
            pass
        return res

    @functools.wraps(f)
    def wrapper(self, doc):
        # call the actual function that creates the doc data
        ret = f(self, doc)
        # now render thath doc data as files (JSON, XHTML, RDF/XML)
        if self.config.serializejson == True:
            with self.store.open_serialized(doc.basefile, "wb") as fp:
                r = serialize(doc, format="json") # should be a (unicode) str
                fp.write(r.encode('utf-8'))
            self.log.debug("%s: Created %s" % (doc.basefile, self.store.serialized_path(doc.basefile)))

        updated = self.render_xhtml(doc, self.store.parsed_path(doc.basefile))
        if updated:
            self.log.debug("%s: Created %s" % (doc.basefile, self.store.parsed_path(doc.basefile)))

        # css file + background images + png renderings of text
        self.create_external_resources(doc)

        # Extract all triples on the XHTML/RDFa data to a separate
        # RDF/XML file
        distilled_graph = Graph()
        with codecs.open(self.store.parsed_path(doc.basefile),
                         encoding="utf-8") as fp:  # unicode
            distilled_graph.parse(data=fp.read(), format="rdfa",
                                  publicID=doc.uri)


        # The act of parsing from RDFa binds a lot of namespaces
        # in the graph in an unneccesary manner. Particularly it
        # binds both 'dc' and 'dcterms' to
        # 'http://purl.org/dc/terms/', which makes serialization
        # less than predictable. Blow these prefixes away.
        distilled_graph.bind("dc", URIRef("http://purl.org/dc/elements/1.1/"))
        distilled_graph.bind(
            "dcterms",
            URIRef("http://example.org/this-prefix-should-not-be-used"))

        util.ensure_dir(self.store.distilled_path(doc.basefile))
        with open(self.store.distilled_path(doc.basefile),
                  "wb") as distilled_file:
            # print("============distilled===============")
            # print(distilled_graph.serialize(format="turtle").decode('utf-8'))
            distilled_graph.serialize(distilled_file, format="pretty-xml")
        self.log.debug(
            '%s: %s triples extracted to %s', doc.basefile,
            len(distilled_graph), self.store.distilled_path(doc.basefile))

        # Validate that all required triples are present (we check
        # distilled_graph, but we could just as well check doc.meta)
        for p in self.required_predicates:
            x = distilled_graph.value(URIRef(doc.uri), p)
            if not x:
                self.log.warning("%s: Metadata is missing a %s triple" %
                                 (doc.basefile, distilled_graph.qname(p)))

        # Validate that all triples specified in doc.meta and any
        # .meta property on any body object is present in the
        # XHTML+RDFa file.
        for g in iterate_graphs(doc.body):
            doc.meta += g

        for triple in distilled_graph:
            # len_before = len(doc.meta)
            doc.meta.remove(triple)
            # len_after = len(doc.meta)

        if doc.meta:
            self.log.warning("%s: %d triple(s) from the original metadata was "
                             "not found in the serialized XHTML file:\n%s",
                             doc.basefile, len(doc.meta),
                             doc.meta.serialize(format="nt").decode('utf-8').strip())

        # Validate that entry.title and entry.id has been filled
        # (might be from doc.meta and doc.uri, might be other things
        entry = DocumentEntry(self.store.documententry_path(doc.basefile))
        if not entry.id:
            self.log.warning("%s: entry.id missing" % doc.basefile)
        if not entry.title:
            self.log.warning("%s: entry.title missing" % doc.basefile)
        return ret
    return wrapper


[docs]def handleerror(f):
    """Make sure any errors in :py:meth:`ferenda.DocumentRepository.parse`
    are handled appropriately and do not stop the parsing of all documents.
    """
    @functools.wraps(f)
    def wrapper(self, doc):
        try:
            return f(self, doc)
        except DocumentRemovedError as e:
            self.log.info(
                "%s: Document has been removed (%s)", doc.basefile, e)
            util.robust_remove(self.parsed_path(doc.basefile))
            return False
        except ParseError as e:
            self.log.error("%s: ParseError %s", doc.basefile, e)
            # FIXME: we'd like to use the shorter "if
            # ('fatalexceptions' in self.config" but a Mock we're
            # using in testDecorators.Decorators.test_handleerror does
            # not emulate this way of using the LayeredConfig
            # object. Until we rewrite the testcase better, this is
            # what we have to do.
            if (hasattr(self.config, 'fatalexceptions') and
                    self.config.fatalexceptions):
                raise
            else:
                return False
        except Exception:
            self.log.exception("parse of %s failed", doc.basefile)
            # FIXME: see above
            if (hasattr(self.config, 'fatalexceptions') and
                    self.config.fatalexceptions):
                raise
            else:
                return False
    return wrapper


[docs]def makedocument(f):
    """Changes the signature of the parse method to expect a Document
    object instead of a basefile string, and creates the object."""
    @functools.wraps(f)
    def wrapper(self, basefile):
        doc = self.make_document(basefile)
        return f(self, doc)
    return wrapper


[docs]def managedparsing(f):
    """Use all standard decorators for parse() in the correct order
    (:py:func:`~ferenda.decorators.makedocument`, :py:func:`~ferenda.decorators.parseifneeded`, :py:func:`~ferenda.decorators.timed`, :py:func:`~ferenda.decorators.render`)"""
    return makedocument(
        parseifneeded(
            # handleerror( # is this really a good idea?
            timed(
                render(f))))


[docs]def action(f):
    """Decorator that marks a class or instance method as runnable by
    :py:func:`ferenda.manager.run`
    """
    f.runnable = True
    return f


[docs]def downloadmax(f):
    """Makes any generator respect the ``downloadmax`` config parameter.

    """
    @functools.wraps(f)
    def wrapper(self, params):
        if 'downloadmax' in self.config:
            self.log.info("Downloading max %d documents" %
                          (self.config.downloadmax))
            generator = itertools.islice(f(self, params),
                                         self.config.downloadmax)
        else:
            self.log.debug("Downloading all the docs")
            generator = f(self, params)
        for value in generator:
            yield value
    return wrapper


[docs]def newstate(state):
    def real_decorator(f):
        setattr(f, 'newstate', state)
        return f
    return real_decorator