Source code for ferenda.devel

# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import sys
import os
from difflib import unified_diff
from tempfile import mkstemp
import inspect
import codecs

from rdflib import Graph, URIRef, RDF
import six
from six import text_type as str
from layeredconfig import LayeredConfig

from ferenda import TextReader, TripleStore, FulltextIndex
from ferenda.elements import serialize
from ferenda import decorators, util

class DummyStore(object):

    def __init__(self, path, **kwargs):
        pass  # pragma: no cover

    def list_basefiles_for(self, action, basedir=None):
        return []  # pragma: no cover


class Devel(object):

    """Collection of utility commands for developing docrepos.

    This module acts as a docrepo (and as such is easily callable from
    ``ferenda-manager.py``), but instead of ``download``, ``parse``,
    ``generate`` et al., contains various tool commands that are useful
    for developing and debugging your own docrepo classes.

    Use it by first enabling it::

        ./ferenda-build.py ferenda.Devel enable

    And then run individual tools like::

        ./ferenda-build.py devel dumprdf path/to/xhtml/rdfa.xhtml

    """

    alias = "devel"
    @decorators.action
    def dumprdf(self, filename, format="turtle"):
        """Extract all RDF data from a parsed file and dump it to stdout.

        :param filename: Full path of the parsed XHTML+RDFa file.
        :type filename: str
        :param format: The serialization format for RDF data (same as
                       for :py:meth:`rdflib.graph.Graph.serialize`)
        :type format: str

        Example::

            ./ferenda-build.py devel dumprdf path/to/xhtml/rdfa.xhtml nt

        """
        g = Graph()
        g.parse(data=util.readfile(filename), format="rdfa")
        # At least the turtle serializer creates UTF-8 data. Fix this!
        print(g.serialize(None, format=format).decode("utf-8"))
    @decorators.action
    def dumpstore(self, format="turtle"):
        """Extract all RDF data from the system triplestore and dump it
        to stdout using the specified format.

        :param format: The serialization format for RDF data (same as
                       for :py:meth:`ferenda.TripleStore.get_serialized`).
        :type format: str

        Example::

            ./ferenda-build.py devel dumpstore nt > alltriples.nt

        """
        # print("Creating store of type %s, location %s, repository %s" %
        #       (self.config.storetype, self.config.storelocation,
        #        self.config.storerepository))
        store = TripleStore.connect(self.config.storetype,
                                    self.config.storelocation,
                                    self.config.storerepository)
        print(store.get_serialized(format=format).decode('utf-8'))
    # Not really useful for anything other than finding bugs in ferenda
    # itself
    #
    # def testlog(self):
    #     """Logs a series of messages at various levels, to test that
    #     your client code logging configuration behaves as
    #     expected."""
    #     log = logging.getLogger(__name__)
    #     log.critical('Log message at CRITICAL level')
    #     log.error('Log message at ERROR level')
    #     log.warn('Log message at WARN level')
    #     log.info('Log message at INFO level')
    #     log.debug('Log message at DEBUG level')
    #     sub = logging.getLogger(__name__+'.sublogger')
    #     sub.critical('Sublog message at CRITICAL level')
    #     sub.error('Sublog message at ERROR level')
    #     sub.warn('Sublog message at WARN level')
    #     sub.info('Sublog message at INFO level')
    #     sub.debug('Sublog message at DEBUG level')
    @decorators.action
    def csvinventory(self, alias):
        """Create an inventory of documents, as a CSV file.

        Only documents that have been parsed and yielded some minimum
        amount of RDF metadata will be included.

        :param alias: Docrepo alias
        :type alias: str

        """
        predicates = ['basefile',
                      'subobjects',  # sections that have rdf:type
                      'rdf:type',
                      'dcterms:identifier',
                      'dcterms:title',
                      'dcterms:published',
                      'prov:wasGeneratedBy',
                      ]
        import csv
        if six.PY2:
            delimiter = b';'
            out = sys.stdout
        else:
            import codecs
            delimiter = ';'
            out = codecs.getwriter("latin-1")(sys.stdout.detach())
            out.errors = "replace"
        writer = csv.DictWriter(out, predicates, delimiter=delimiter)
        repo = self._repo_from_alias(alias)
        writer.writerow(dict([(p, p) for p in predicates]))
        for basefile in repo.store.list_basefiles_for("relate"):
            baseuri = URIRef(repo.canonical_uri(basefile))
            with repo.store.open_distilled(basefile) as fp:
                row = {'basefile': basefile}
                g = Graph().parse(fp, format="xml")
                for (p, o) in g.predicate_objects(baseuri):
                    qname = g.qname(p)
                    if qname in predicates:
                        if isinstance(o, URIRef):
                            row[qname] = g.qname(o)
                        else:
                            # The py2 csv module expects latin-1
                            # encoded bytestrings (for non-ascii
                            # values), while py3 csv (sensibly)
                            # expects unicode
                            fld = str(o)
                            if six.PY2:
                                fld = fld.encode("latin-1", errors="replace")
                            row[qname] = fld
                row['subobjects'] = len(list(g.subject_objects(RDF.type)))
                writer.writerow(row)
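    # Usage sketch ("myrepo" is a hypothetical docrepo alias); the CSV
    # is written to stdout, so redirect it to a file::
    #
    #     ./ferenda-build.py devel csvinventory myrepo > inventory.csv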
    def _repo_from_alias(self, alias):
        # (FIXME: This uses several undocumented APIs)
        mainconfig = self.config._parent
        assert mainconfig is not None, ("Devel must be initialized with "
                                        "a full set of configuration")
        repoconfig = getattr(mainconfig, alias)
        from ferenda import manager
        repocls = manager._load_class(getattr(repoconfig, 'class'))
        repo = repocls()
        repo.config = getattr(mainconfig, alias)
        # work in all parameters from get_default_options
        for key, val in repo.get_default_options().items():
            if key not in repo.config:
                LayeredConfig.set(repo.config, key, val, "defaults")
        repo.store = repo.documentstore_class(
            repo.config.datadir + os.sep + repo.alias,
            downloaded_suffix=repo.downloaded_suffix,
            storage_policy=repo.storage_policy)
        return repo
    @decorators.action
    def mkpatch(self, alias, basefile, description):
        """Create a patch file from downloaded or intermediate files.

        Before running this tool, you should hand-edit the intermediate
        file. If your docrepo doesn't use intermediate files, you
        should hand-edit the downloaded file instead. The tool will
        first stash away the intermediate (or downloaded) file, then
        re-run :py:meth:`~ferenda.DocumentRepository.parse` (or
        :py:meth:`~ferenda.DocumentRepository.download_single`) in
        order to get a new intermediate (or downloaded) file. It will
        then calculate the diff between these two versions and save it
        as a patch file in its proper place (as determined by
        ``config.patchdir``), where it will be picked up automatically
        by :py:meth:`~ferenda.DocumentRepository.patch_if_needed`.

        :param alias: Docrepo alias
        :type alias: str
        :param basefile: The basefile for the document to patch
        :type basefile: str

        Example::

            ./ferenda-build.py devel mkpatch myrepo basefile1 "Removed sensitive personal information"

        """
        # 1. initialize the docrepo indicated by "alias"
        repo = self._repo_from_alias(alias)

        # 2. find out if there is an intermediate file or only a
        # downloaded file for basefile
        if os.path.exists(repo.store.intermediate_path(basefile)):
            stage = "intermediate"
            outfile = repo.store.intermediate_path(basefile)
        else:
            stage = "download"
            outfile = repo.store.downloaded_path(basefile)

        # 2.1 stash a copy of the hand-edited file
        fileno, stash = mkstemp()
        with os.fdopen(fileno, "wb") as fp:
            fp.write(util.readfile(outfile, mode="rb"))

        # 2.2 if intermediate: run parse (with config.force=True) to
        # regenerate the intermediate file
        if stage == "intermediate":
            repo.config.force = True
            try:
                repo.parse(basefile)
            except Exception:
                # parse may throw an error (hopefully after creating
                # the intermediate file) -- that may be the reason for
                # patching in the first place
                pass
        # 2.3 if only downloaded: run download_single (with
        # config.refresh=True)
        else:
            repo.config.refresh = True
            repo.download_single(basefile)

        # 3. calculate the diff using difflib. Assume that
        # intermediate files use the same encoding as source files
        encoding = repo.source_encoding
        outfile_lines = codecs.open(outfile, encoding=encoding).readlines()
        stash_lines = codecs.open(stash, encoding=encoding).readlines()
        difflines = list(unified_diff(outfile_lines,
                                      stash_lines,
                                      outfile,
                                      stash))
        os.unlink(stash)

        # 4. calculate place of patch using docrepo.store
        patchstore = repo.documentstore_class(repo.config.patchdir +
                                              os.sep + repo.alias)
        patchpath = patchstore.path(basefile, "patches", ".patch")

        # 4.1 if the description is single-line, append it to the
        # first hunk's @@ control line
        if description.count("\n") == 0:
            for idx, line in enumerate(difflines):
                if line.startswith("@@") and line.endswith("@@\n"):
                    difflines[idx] = difflines[idx].replace(
                        "@@\n", "@@ " + description + "\n")
                    break
        else:
            # 4.2 if the description is multi-line, write it to a
            # corresponding .desc file
            descpath = patchstore.path(basefile, "patches", ".desc")
            util.writefile(descpath, description)

        # 4.3 write the patch itself
        patchcontent = "".join(difflines)
        if patchcontent:
            # write the patch using the same encoding as the
            # downloaded/intermediate files
            util.writefile(patchpath, patchcontent, encoding=encoding)
            # print("Created patch %s" % patchpath)
            return patchpath
        else:
            print("WARNING: patch would be empty, not creating it")
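    # A sketch of the full mkpatch workflow (the paths, alias and
    # basefile are examples only): hand-edit the intermediate file,
    # then let mkpatch diff it against a freshly regenerated version::
    #
    #     $ vi data/myrepo/intermediate/basefile1.xml
    #     $ ./ferenda-build.py devel mkpatch myrepo basefile1 "Corrected OCR errors"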
    @decorators.action
    def parsestring(self, string, citationpattern, uriformatter=None):
        """Parse a string using a named citationpattern and print the
        parse tree and optionally formatted uri(s) on stdout.

        :param string: The text to parse
        :type string: str
        :param citationpattern: The fully qualified name of a citationpattern
        :type citationpattern: str
        :param uriformatter: The fully qualified name of a uriformatter
        :type uriformatter: str

        .. note::

           This is not implemented yet

        Example::

            ./ferenda-build.py devel parsestring \\
                "According to direktiv 2007/42/EU, ..." \\
                ferenda.citationpatterns.eulaw

        """
        raise NotImplementedError
    @decorators.action
    def fsmparse(self, functionname, source):
        """Parse a list of text chunks using a named fsm parser and
        output the parse tree and final result to stdout.

        :param functionname: A function that returns a configured
                             :py:class:`~ferenda.FSMParser`
        :type functionname: str
        :param source: A file containing the text chunks, separated
                       by double newlines
        :type source: str

        """
        modulename, classname, methodname = functionname.rsplit(".", 2)
        __import__(modulename)
        m = sys.modules[modulename]
        for name, cls in inspect.getmembers(m, inspect.isclass):
            if name == classname:
                break
        method = getattr(cls, methodname)
        parser = method()
        parser.debug = True
        tr = TextReader(source)
        b = parser.parse(tr.getiterator(tr.readparagraph))
        print(serialize(b))
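    # Usage sketch; ``mymodule.MyRepo.get_parser`` and ``chunks.txt``
    # are hypothetical names. The function name is split (from the
    # right) into module, class and method::
    #
    #     ./ferenda-build.py devel fsmparse mymodule.MyRepo.get_parser chunks.txt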
    @decorators.action
    def queryindex(self, querystring):
        """Query the system fulltext index and return the IDs/URIs for
        matching documents.

        :param querystring: The query
        :type querystring: str

        """
        index = FulltextIndex.connect(self.config.indextype,
                                      self.config.indexlocation)
        rows = index.query(querystring)
        for row in rows:
            print("%s (%s): %s" % (row['identifier'], row['about'],
                                   row['text']))
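    # Usage sketch (the query string syntax depends on the configured
    # fulltext index)::
    #
    #     ./ferenda-build.py devel queryindex "municipal regulations"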
    @decorators.action
    def construct(self, template, uri, format="turtle"):
        """Run a SPARQL CONSTRUCT query against the system triplestore
        and print the resulting graph.

        :param template: The path to a file containing the query, with
                         ``%(uri)s`` as a placeholder for the URI
        :type template: str
        :param uri: The URI to substitute into the query
        :type uri: str
        :param format: The serialization format for the resulting graph
        :type format: str

        """
        sq = util.readfile(template) % {'uri': uri}
        ts = TripleStore.connect(self.config.storetype,
                                 self.config.storelocation,
                                 self.config.storerepository)
        print("# Constructing the following from %s, repository %s, type %s" %
              (self.config.storelocation,
               self.config.storerepository,
               self.config.storetype))
        print("".join(["# %s\n" % x for x in sq.split("\n")]))
        p = {}
        with util.logtime(print,
                          "# %(triples)s triples constructed in %(elapsed).3fs",
                          p):
            res = ts.construct(sq)
            p['triples'] = len(res)
            print(res.serialize(format=format).decode('utf-8'))
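    # A hypothetical CONSTRUCT template usable with the above command;
    # ``%(uri)s`` is replaced with the given URI before the query is
    # run (the filename and URI in the invocation are examples only)::
    #
    #     CONSTRUCT { <%(uri)s> ?p ?o . }
    #     WHERE     { <%(uri)s> ?p ?o . }
    #
    # Invocation::
    #
    #     ./ferenda-build.py devel construct describe.rq http://example.org/doc/1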
    @decorators.action
    def select(self, template, uri, format="json"):
        """Run a SPARQL SELECT query against the system triplestore and
        print the result.

        :param template: The path to a file containing the query, with
                         ``%(uri)s`` as a placeholder for the URI
        :type template: str
        :param uri: The URI to substitute into the query
        :type uri: str
        :param format: The result format (same as for
                       :py:meth:`ferenda.TripleStore.select`)
        :type format: str

        """
        sq = util.readfile(template) % {'uri': uri}
        ts = TripleStore.connect(self.config.storetype,
                                 self.config.storelocation,
                                 self.config.storerepository)
        print("# Selecting the following from %s, repository %s, type %s" %
              (self.config.storelocation,
               self.config.storerepository,
               self.config.storetype))
        print("".join(["# %s\n" % x for x in sq.split("\n")]))
        p = {}
        with util.logtime(print, "# Selected in %(elapsed).3fs", p):
            # res is a bytestring; select returns whatever is the
            # appropriately encoded version for the given format
            res = ts.select(sq, format=format)
            print(res.decode('utf-8'))
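    # A hypothetical SELECT template for the above, on the same
    # pattern::
    #
    #     SELECT ?p ?o
    #     WHERE  { <%(uri)s> ?p ?o . }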
    @decorators.action
    def destroyindex(self):
        """Destroy the system fulltext index completely."""
        f = FulltextIndex.connect(self.config.indextype,
                                  self.config.indexlocation,
                                  [])
        f.destroy()
        print("%s index at %s destroyed" % (self.config.indextype,
                                            self.config.indexlocation))
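    # Usage::
    #
    #     ./ferenda-build.py devel destroyindex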
    # FIXME: These are dummy implementations of methods and class
    # variables that manager.py expects all docrepos to have. We don't
    # want to have coverage counting these as missing lines, hence the
    # "pragma: no cover" comments.

    def __init__(self, config=None, **kwargs):
        self.store = DummyStore(None)
        self.config = config

    documentstore_class = DummyStore
    downloaded_suffix = ".html"
    storage_policy = "file"
    def get_default_options(self):
        return {}  # pragma: no cover

    def download(self):
        pass  # pragma: no cover

    def parse(self, basefile):
        pass  # pragma: no cover

    def relate(self, basefile):
        pass  # pragma: no cover

    def generate(self, basefile):
        pass  # pragma: no cover

    def toc(self, otherrepos):
        pass  # pragma: no cover

    def news(self, otherrepos):
        pass  # pragma: no cover

    def status(self):
        pass  # pragma: no cover

    @classmethod
    def setup(cls, action, config):
        pass  # pragma: no cover

    @classmethod
    def teardown(cls, action, config):
        pass  # pragma: no cover