Source code for ferenda.devel

# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import sys
import os
from difflib import unified_diff
from tempfile import mkstemp
import inspect
import codecs

from rdflib import Graph, URIRef, RDF
import six
from six import text_type as str
from layeredconfig import LayeredConfig

from ferenda import TextReader, TripleStore, FulltextIndex
from ferenda.elements import serialize
from ferenda import decorators, util

class DummyStore(object):

    def __init__(self, path, **kwargs):
        pass  # pragma: no cover

    def list_basefiles_for(self, action, basedir=None):
        return []  # pragma: no cover


class Devel(object):

    """Collection of utility commands for developing docrepos.

    This module acts as a docrepo (and as such is easily callable from
    ``ferenda-manager.py``), but instead of ``download``, ``parse``,
    ``generate`` et al., contains various tool commands that are useful
    for developing and debugging your own docrepo classes.

    Use it by first enabling it::

        ./ferenda-build.py ferenda.Devel enable

    And then run individual tools like::

        ./ferenda-build.py devel dumprdf path/to/xhtml/rdfa.xhtml

    """

    alias = "devel"
    @decorators.action
    def dumprdf(self, filename, format="turtle"):
        """Extract all RDF data from a parsed file and dump it to stdout.

        :param filename: Full path of the parsed XHTML+RDFa file.
        :type filename: str
        :param format: The serialization format for RDF data (same as
                       for :py:meth:`rdflib.graph.Graph.serialize`)
        :type format: str

        Example::

            ./ferenda-build.py devel dumprdf path/to/xhtml/rdfa.xhtml nt

        """
        g = Graph()
        g.parse(data=util.readfile(filename), format="rdfa")
        # At least the turtle serializer creates UTF-8 data. Fix this!
        print(g.serialize(None, format=format).decode("utf-8"))
    @decorators.action
    def dumpstore(self, format="turtle"):
        """Extract all RDF data from the system triplestore and dump it
        to stdout using the specified format.

        :param format: The serialization format for RDF data (same as
                       for :py:meth:`ferenda.TripleStore.get_serialized`).
        :type format: str

        Example::

            ./ferenda-build.py devel dumpstore nt > alltriples.nt

        """
        # print("Creating store of type %s, location %s, repository %s" %
        #       (self.config.storetype, self.config.storelocation,
        #        self.config.storerepository))
        store = TripleStore.connect(self.config.storetype,
                                    self.config.storelocation,
                                    self.config.storerepository)
        print(store.get_serialized(format=format).decode('utf-8'))
    # Not really useful for anything other than finding bugs in ferenda
    # itself
    #
    # def testlog(self):
    #     """Logs a series of messages at various levels, to test that
    #     your client code logging configuration behaves as
    #     expected."""
    #     log = logging.getLogger(__name__)
    #     log.critical('Log message at CRITICAL level')
    #     log.error('Log message at ERROR level')
    #     log.warn('Log message at WARN level')
    #     log.info('Log message at INFO level')
    #     log.debug('Log message at DEBUG level')
    #     sub = logging.getLogger(__name__+'.sublogger')
    #     sub.critical('Sublog message at CRITICAL level')
    #     sub.error('Sublog message at ERROR level')
    #     sub.warn('Sublog message at WARN level')
    #     sub.info('Sublog message at INFO level')
    #     sub.debug('Sublog message at DEBUG level')
    @decorators.action
    def csvinventory(self, alias):
        """Create an inventory of documents, as a CSV file.

        Only documents that have been parsed and yielded some minimum
        amount of RDF metadata will be included.

        :param alias: Docrepo alias
        :type alias: str

        """
        predicates = ['basefile',
                      'subobjects',  # sections that have rdf:type
                      'rdf:type',
                      'dcterms:identifier',
                      'dcterms:title',
                      'dcterms:published',
                      'prov:wasGeneratedBy',
                      ]
        import csv
        if six.PY2:
            delimiter = b';'
            out = sys.stdout
        else:
            import codecs
            delimiter = ';'
            out = codecs.getwriter("latin-1")(sys.stdout.detach())
            out.errors = "replace"
        writer = csv.DictWriter(out, predicates, delimiter=delimiter)
        repo = self._repo_from_alias(alias)
        writer.writerow(dict([(p, p) for p in predicates]))
        for basefile in repo.store.list_basefiles_for("relate"):
            baseuri = URIRef(repo.canonical_uri(basefile))
            with repo.store.open_distilled(basefile) as fp:
                row = {'basefile': basefile}
                g = Graph().parse(fp, format="xml")
                for (p, o) in g.predicate_objects(baseuri):
                    qname = g.qname(p)
                    if qname in predicates:
                        if isinstance(o, URIRef):
                            row[qname] = g.qname(o)
                        else:
                            # The py2 csv module expects latin-1
                            # encoded bytestrings (for non-ascii
                            # values), while py3 csv (sensibly)
                            # expects unicode
                            fld = str(o)
                            if six.PY2:
                                fld = fld.encode("latin-1", errors="replace")
                            row[qname] = fld
                row['subobjects'] = len(list(g.subject_objects(RDF.type)))
                writer.writerow(row)
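    # Usage sketch ("myrepo" is a hypothetical docrepo alias); the CSV
    # is written to stdout, so redirect it to a file::
    #
    #     ./ferenda-build.py devel csvinventory myrepo > inventory.csv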
    def _repo_from_alias(self, alias):
        # (FIXME: This uses several undocumented APIs)
        mainconfig = self.config._parent
        assert mainconfig is not None, ("Devel must be initialized with "
                                        "a full set of configuration")
        repoconfig = getattr(mainconfig, alias)
        from ferenda import manager
        repocls = manager._load_class(getattr(repoconfig, 'class'))
        repo = repocls()
        repo.config = getattr(mainconfig, alias)
        # work in all parameters from get_default_options
        for key, val in repo.get_default_options().items():
            if key not in repo.config:
                LayeredConfig.set(repo.config, key, val, "defaults")
        repo.store = repo.documentstore_class(
            repo.config.datadir + os.sep + repo.alias,
            downloaded_suffix=repo.downloaded_suffix,
            storage_policy=repo.storage_policy)
        return repo
    @decorators.action
    def mkpatch(self, alias, basefile, description):
        """Create a patch file from downloaded or intermediate files.

        Before running this tool, you should hand-edit the intermediate
        file. If your docrepo doesn't use intermediate files, you
        should hand-edit the downloaded file instead. The tool will
        first stash away the intermediate (or downloaded) file, then
        re-run :py:meth:`~ferenda.DocumentRepository.parse` (or
        :py:meth:`~ferenda.DocumentRepository.download_single`) in
        order to get a new intermediate (or downloaded) file. It will
        then calculate the diff between these two versions and save it
        as a patch file in its proper place (as determined by
        ``config.patchdir``), where it will be picked up automatically
        by :py:meth:`~ferenda.DocumentRepository.patch_if_needed`.

        :param alias: Docrepo alias
        :type alias: str
        :param basefile: The basefile for the document to patch
        :type basefile: str

        Example::

            ./ferenda-build.py devel mkpatch myrepo basefile1 "Removed sensitive personal information"

        """
        # 1. initialize the docrepo indicated by "alias"
        repo = self._repo_from_alias(alias)

        # 2. find out if there is an intermediate file or only a
        # downloaded file for basefile
        if os.path.exists(repo.store.intermediate_path(basefile)):
            stage = "intermediate"
            outfile = repo.store.intermediate_path(basefile)
        else:
            stage = "download"
            outfile = repo.store.downloaded_path(basefile)

        # 2.1 stash a copy of the hand-edited file
        fileno, stash = mkstemp()
        with os.fdopen(fileno, "wb") as fp:
            fp.write(util.readfile(outfile, mode="rb"))

        # 2.2 if intermediate: run parse (with config.force=True) to
        # regenerate the intermediate file
        if stage == "intermediate":
            repo.config.force = True
            try:
                repo.parse(basefile)
            except Exception:
                # parse may throw an error (hopefully after creating
                # the intermediate file) -- that may be the reason for
                # patching in the first place
                pass
        # 2.3 if only downloaded: run download_single (with
        # config.refresh=True)
        else:
            repo.config.refresh = True
            repo.download_single(basefile)

        # 3. calculate the diff using difflib. Assume that
        # intermediate files use the same encoding as source files
        encoding = repo.source_encoding
        outfile_lines = codecs.open(outfile, encoding=encoding).readlines()
        stash_lines = codecs.open(stash, encoding=encoding).readlines()
        difflines = list(unified_diff(outfile_lines,
                                      stash_lines,
                                      outfile,
                                      stash))
        os.unlink(stash)

        # 4. calculate place of patch using docrepo.store
        patchstore = repo.documentstore_class(repo.config.patchdir +
                                              os.sep + repo.alias)
        patchpath = patchstore.path(basefile, "patches", ".patch")

        # 4.1 if the description is single-line, append it to the
        # first hunk's @@ control line
        if description.count("\n") == 0:
            for idx, line in enumerate(difflines):
                if line.startswith("@@") and line.endswith("@@\n"):
                    difflines[idx] = difflines[idx].replace(
                        "@@\n", "@@ " + description + "\n")
                    break
        else:
            # 4.2 if the description is multi-line, write it to a
            # corresponding .desc file
            descpath = patchstore.path(basefile, "patches", ".desc")
            util.writefile(descpath, description)

        # 4.3 write the patch itself
        patchcontent = "".join(difflines)
        if patchcontent:
            # write the patch using the same encoding as the
            # downloaded/intermediate files
            util.writefile(patchpath, patchcontent, encoding=encoding)
            # print("Created patch %s" % patchpath)
            return patchpath
        else:
            print("WARNING: patch would be empty, not creating it")
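    # A sketch of the full mkpatch workflow (the paths, alias and
    # basefile are examples only): hand-edit the intermediate file,
    # then let mkpatch diff it against a freshly regenerated version::
    #
    #     $ vi data/myrepo/intermediate/basefile1.xml
    #     $ ./ferenda-build.py devel mkpatch myrepo basefile1 "Corrected OCR errors"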
    @decorators.action
    def parsestring(self, string, citationpattern, uriformatter=None):
        """Parse a string using a named citationpattern and print the
        parse tree and optionally formatted uri(s) on stdout.

        :param string: The text to parse
        :type string: str
        :param citationpattern: The fully qualified name of a citationpattern
        :type citationpattern: str
        :param uriformatter: The fully qualified name of a uriformatter
        :type uriformatter: str

        .. note::

           This is not implemented yet

        Example::

            ./ferenda-build.py devel parsestring \\
                "According to direktiv 2007/42/EU, ..." \\
                ferenda.citationpatterns.eulaw

        """
        raise NotImplementedError
    @decorators.action
    def fsmparse(self, functionname, source):
        """Parse a list of text chunks using a named fsm parser and
        output the parse tree and final result to stdout.

        :param functionname: A function that returns a configured
                             :py:class:`~ferenda.FSMParser`
        :type functionname: str
        :param source: A file containing the text chunks, separated
                       by double newlines
        :type source: str

        """
        modulename, classname, methodname = functionname.rsplit(".", 2)
        __import__(modulename)
        m = sys.modules[modulename]
        for name, cls in inspect.getmembers(m, inspect.isclass):
            if name == classname:
                break
        method = getattr(cls, methodname)
        parser = method()
        parser.debug = True
        tr = TextReader(source)
        b = parser.parse(tr.getiterator(tr.readparagraph))
        print(serialize(b))
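    # Usage sketch; ``mymodule.MyRepo.get_parser`` and ``chunks.txt``
    # are hypothetical names. The function name is split (from the
    # right) into module, class and method::
    #
    #     ./ferenda-build.py devel fsmparse mymodule.MyRepo.get_parser chunks.txt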
    @decorators.action
    def queryindex(self, querystring):
        """Query the system fulltext index and return the IDs/URIs for
        matching documents.

        :param querystring: The query
        :type querystring: str

        """
        index = FulltextIndex.connect(self.config.indextype,
                                      self.config.indexlocation)
        rows = index.query(querystring)
        for row in rows:
            print("%s (%s): %s" % (row['identifier'], row['about'],
                                   row['text']))
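    # Usage sketch (the query string syntax depends on the configured
    # fulltext index)::
    #
    #     ./ferenda-build.py devel queryindex "municipal regulations"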
    @decorators.action
    def construct(self, template, uri, format="turtle"):
        """Run a SPARQL CONSTRUCT query against the system triplestore
        and print the resulting graph.

        :param template: The path to a file containing the query, with
                         ``%(uri)s`` as a placeholder for the URI
        :type template: str
        :param uri: The URI to substitute into the query
        :type uri: str
        :param format: The serialization format for the resulting graph
        :type format: str

        """
        sq = util.readfile(template) % {'uri': uri}
        ts = TripleStore.connect(self.config.storetype,
                                 self.config.storelocation,
                                 self.config.storerepository)
        print("# Constructing the following from %s, repository %s, type %s" %
              (self.config.storelocation,
               self.config.storerepository,
               self.config.storetype))
        print("".join(["# %s\n" % x for x in sq.split("\n")]))
        p = {}
        with util.logtime(print,
                          "# %(triples)s triples constructed in %(elapsed).3fs",
                          p):
            res = ts.construct(sq)
            p['triples'] = len(res)
            print(res.serialize(format=format).decode('utf-8'))
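    # A hypothetical CONSTRUCT template usable with the above command;
    # ``%(uri)s`` is replaced with the given URI before the query is
    # run (the filename and URI in the invocation are examples only)::
    #
    #     CONSTRUCT { <%(uri)s> ?p ?o . }
    #     WHERE     { <%(uri)s> ?p ?o . }
    #
    # Invocation::
    #
    #     ./ferenda-build.py devel construct describe.rq http://example.org/doc/1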
    @decorators.action
    def select(self, template, uri, format="json"):
        """Run a SPARQL SELECT query against the system triplestore and
        print the result.

        :param template: The path to a file containing the query, with
                         ``%(uri)s`` as a placeholder for the URI
        :type template: str
        :param uri: The URI to substitute into the query
        :type uri: str
        :param format: The result format (same as for
                       :py:meth:`ferenda.TripleStore.select`)
        :type format: str

        """
        sq = util.readfile(template) % {'uri': uri}
        ts = TripleStore.connect(self.config.storetype,
                                 self.config.storelocation,
                                 self.config.storerepository)
        print("# Selecting the following from %s, repository %s, type %s" %
              (self.config.storelocation,
               self.config.storerepository,
               self.config.storetype))
        print("".join(["# %s\n" % x for x in sq.split("\n")]))
        p = {}
        with util.logtime(print, "# Selected in %(elapsed).3fs", p):
            # res is a bytestring; select returns whatever is the
            # appropriately encoded version for the given format
            res = ts.select(sq, format=format)
            print(res.decode('utf-8'))
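    # A hypothetical SELECT template for the above, on the same
    # pattern::
    #
    #     SELECT ?p ?o
    #     WHERE  { <%(uri)s> ?p ?o . }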
    @decorators.action
    def destroyindex(self):
        """Destroy the system fulltext index completely."""
        f = FulltextIndex.connect(self.config.indextype,
                                  self.config.indexlocation,
                                  [])
        f.destroy()
        print("%s index at %s destroyed" % (self.config.indextype,
                                            self.config.indexlocation))
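    # Usage::
    #
    #     ./ferenda-build.py devel destroyindex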
    # FIXME: These are dummy implementations of methods and class
    # variables that manager.py expects all docrepos to have. We don't
    # want to have coverage counting these as missing lines, hence the
    # "pragma: no cover" comments.

    def __init__(self, config=None, **kwargs):
        self.store = DummyStore(None)
        self.config = config

    documentstore_class = DummyStore
    downloaded_suffix = ".html"
    storage_policy = "file"
    def get_default_options(self):
        return {}  # pragma: no cover

    def download(self):
        pass  # pragma: no cover

    def parse(self, basefile):
        pass  # pragma: no cover

    def relate(self, basefile):
        pass  # pragma: no cover

    def generate(self, basefile):
        pass  # pragma: no cover

    def toc(self, otherrepos):
        pass  # pragma: no cover

    def news(self, otherrepos):
        pass  # pragma: no cover

    def status(self):
        pass  # pragma: no cover

    @classmethod
    def setup(cls, action, config):
        pass  # pragma: no cover

    @classmethod
    def teardown(cls, action, config):
        pass  # pragma: no cover