# Source code for ferenda.devel

# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *
import builtins

from ast import literal_eval
from bz2 import BZ2File
from collections import OrderedDict, defaultdict, Counter
from difflib import unified_diff
from datetime import datetime
from itertools import islice
from io import BytesIO, StringIO
from tempfile import mkstemp
from time import sleep
from operator import attrgetter
from pprint import pformat
import codecs
import fileinput
import inspect
import json
import logging
import os
import random
import re
import shutil
import sys
import time
import traceback
from wsgiref.util import request_uri
from urllib.parse import parse_qsl, urlencode

from rdflib import Graph, URIRef, RDF, Literal
from rdflib.namespace import DCTERMS
from layeredconfig import LayeredConfig, Defaults
from lxml import etree
from ferenda.thirdparty.patchit import PatchSet, PatchSyntaxError, PatchConflictError

from ferenda.compat import Mock
from ferenda import (TextReader, TripleStore, FulltextIndex, WSGIApp,
                     Document, DocumentRepository,
                     CompositeRepository, DocumentEntry, Transformer,
                     RequestHandler, ResourceLoader)
from ferenda.elements import serialize
from ferenda.elements.html import Body, P, H1, H2, H3, Form, Textarea, Input, Label, Button, Textarea, Br, Div, A, Pre, Code, UL, LI
from ferenda import decorators, util, manager

class DummyStore(object):
    """Placeholder store: keeps nothing and never lists any basefiles."""

    def __init__(self, path, **kwargs):
        """Accept a path and arbitrary keyword arguments; none are kept."""

    def list_basefiles_for(self, action, basedir=None, force=True):
        """Return an empty list regardless of the arguments given."""
        nothing = []
        return nothing  # pragma: no cover

class WSGIOutputHandler(logging.Handler):
    """Logging handler that pushes every formatted record to a writer callable.

    The streaming request handlers in this module pass in the callable
    returned by the WSGI ``start_response`` function, so that log output
    ends up in the HTTP response body as it is produced.

    :param writer: Callable invoked with one UTF-8 encoded line per record.
    """

    def __init__(self, writer):
        self.writer = writer
        super(WSGIOutputHandler, self).__init__()

    def emit(self, record):
        """Format ``record``, append a newline and hand it to the writer.

        An ``OSError`` raised by the writer is swallowed on purpose (see
        the comment below); any other exception propagates.
        """
        entry = self.format(record) + "\n"
        try:
            # the writer is fed bytes, since that is what a WSGI write
            # callable accepts
            self.writer(entry.encode("utf-8"))
        except OSError:
            # if self.writer has closed, it probably means that the
            # HTTP client has closed the connection. But we don't stop
            # for that.
            pass

class DevelHandler(RequestHandler):

    def supports(self, environ):
        """Tell whether the request path starts with ``/devel/``."""
        path = environ['PATH_INFO']
        return path.startswith("/devel/")

    def handle(self, environ):
        """Dispatch a ``/devel/<tool>`` request and render the result.

        Parameters are taken from the urlencoded request body for POST
        requests and from the query string for everything else. The
        second path segment selects the tool handler; its return value
        is rendered into a complete HTML page.

        :param environ: The WSGI environment of the request.
        :returns: A ``(fp, length, status, mimetype)`` tuple, where ``fp``
                  is a ``BytesIO`` holding the rendered page.
        :raises KeyError: If the second path segment names no known tool.
        """
        segments = [x for x in environ['PATH_INFO'].split("/") if x]
        if environ['REQUEST_METHOD'] == 'POST':
            # CONTENT_LENGTH may be present but empty, hence "or 0"
            content_length = int(environ.get('CONTENT_LENGTH') or 0)
            reqbody = environ['wsgi.input'].read(content_length)
            params = dict(parse_qsl(reqbody.decode("utf-8")))
        else:
            # without this branch the POST params were overwritten and
            # non-POST requests never got any params at all
            params = dict(parse_qsl(environ['QUERY_STRING']))

        handler = {'patch': self.handle_patch,
                   'logs': self.handle_logs,
                   'change-parse-options': self.handle_change_parse_options,
                   'build': self.handle_build,
                   'streaming-test': self.handle_streaming_test}[segments[1]]
        body = handler(environ, params)
        res = self._render(segments[1], body, request_uri(environ), self.repo.config)
        length = len(res)
        fp = BytesIO(res)
        return fp, length, 200, "text/html"

    def _render(self, title, body, uri, config, template="xsl/generic.xsl"):
        # Wraps ``body`` in a freshly made document, renders that to an
        # XHTML tree and runs the tree through a Transformer set up with
        # ``template``. The serialized result of etree.tostring (encoded
        # as UTF-8) is returned.
        #
        # NOTE(review): several statements in this copy of the method are
        # truncated (each one is flagged below), so the block does not
        # parse as it stands -- restore them from the upstream file.
        repo = DocumentRepository(config=config)
        doc = repo.make_document()
        doc.uri = uri
        # NOTE(review): the beginning of this statement is missing;
        # presumably it records ``title`` (tagged lang="sv") in the
        # document metadata -- confirm.
                      Literal(title, lang="sv")))
        doc.body = body
        xhtml = repo.render_xhtml_tree(doc)
        documentroot = repo.config.datadir
        # NOTE(review): the rest of the path list and the closing
        # brackets are missing.
        conffile = os.sep.join([documentroot, 'rsrc',
        # NOTE(review): the remaining Transformer arguments are missing.
        transformer = Transformer('XSLT', template, "xsl",
        urltransform = None
        # only build a URL transform function when a develurl is configured
        if 'develurl' in repo.config and repo.config.develurl:
            urltransform = repo.get_url_transform_func(develurl=repo.config.develurl)
        # number of "/"-separated parts of the URI, minus three
        depth = len(doc.uri.split("/")) - 3
        # NOTE(review): the remaining transform() arguments are missing;
        # presumably ``urltransform`` is passed here -- confirm.
        tree = transformer.transform(xhtml, depth,
        return etree.tostring(tree, encoding="utf-8")

    def stream(self, environ, start_response):
        """Route a streaming request to the matching ``*_stream`` handler.

        The handler is picked from the end of ``PATH_INFO``. If no
        handler matches, a ``500 Server error`` plain-text response is
        started and a one-element list with an explanation is returned.

        :param environ: The WSGI environment of the request.
        :param start_response: The WSGI ``start_response`` callable.
        :returns: Whatever the selected handler returns, or the error
                  message list.
        """
        if environ['PATH_INFO'].endswith('change-parse-options'):
            return self.handle_change_parse_options_stream(environ, start_response)
        elif environ['PATH_INFO'].endswith('streaming-test'):
            return self.handle_streaming_test_stream(environ, start_response)
        elif environ['PATH_INFO'].endswith('build'):
            return self.handle_build_stream(environ, start_response)
        else:
            # without this branch the fallback below was unreachable and
            # unmatched paths silently returned None
            start_response('500 Server error', [('Content-Type', 'text/plain')])
            return ['No streaming handler registered for PATH_INFO %s' % environ['PATH_INFO']]

    def _setup_streaming_logger(self, writer):
        # Creates a WSGIOutputHandler around ``writer`` and returns the
        # root logger.
        #
        # NOTE(review): this copy of the method is truncated in several
        # places (flagged below) and does not parse as it stands --
        # restore the missing lines from the upstream file.

        # these internal libs use logging to log things we rather not disturb the user with
        # NOTE(review): the rest of the logger-name list is missing.
        for logname in ['urllib3.connectionpool',
            log = logging.getLogger(logname)
            log.propagate = False

        wsgihandler = WSGIOutputHandler(writer)
        # NOTE(review): the statement this Formatter belongs to (presumably
        # a setFormatter call on wsgihandler) and its tail are missing.
            logging.Formatter("%(asctime)s [%(name)s] %(levelname)s %(message)s",
        rootlogger = logging.getLogger()
        # NOTE(review): the loop body is missing, as is whatever attaches
        # wsgihandler to a logger.
        for handler in rootlogger.handlers:
        return rootlogger
    def _shutdown_streaming_logger(self, rootlogger):
        # Walks over a copy of the root logger's handler list and picks
        # out the WSGIOutputHandler instances.
        for h in list(rootlogger.handlers):
            # NOTE(review): the body of this ``if`` is missing from this
            # copy; presumably the handler is closed and/or removed from
            # ``rootlogger`` here -- restore from the upstream file.
            if isinstance(h, WSGIOutputHandler):

    def handle_build(self, environ, params):
        # With params: returns a page whose <pre> element points (via its
        # 'src' attribute) at this same path with the params plus
        # stream=true in the query string. Without params: returns a page
        # with a form for entering repo, action, basefile and the
        # --all/--force/--refresh flags.
        #
        # NOTE(review): this copy is truncated -- the ``else:`` that
        # separates the two ``return`` statements, the tail of the first
        # Body(...) and the head of the second one (the Form/Div openers)
        # are missing, so the block does not parse as it stands.
        if params:
            # defaultdict(str, ...) makes absent keys format as ""
            params = defaultdict(str, params)
            # NOTE(review): ``label`` is not used in the visible code, and
            # %(sefresh)s matches no form field (the checkbox below is
            # named "refresh"), so it always formats as "" -- looks like a
            # typo, confirm.
            label = "Running %(repo)s %(action)s %(basefile)s %(all)s %(force)s %(sefresh)s" % params
            params["stream"] = "true"
            streamurl = environ['PATH_INFO'] + "?" + urlencode(params)
            return Body([H2(["ferenda-build"]),
                         Pre(**{'class': 'pre-scrollable',
                                'id': 'streaming-log-output',
                                'src': streamurl})
            return Body([
                          Div([Label(["repo"], **{'for': "repo", 'class': "sr-only"}),
                               Input(**{'type': "text", 'id': "repo", 'name': "repo", 'placeholder': "repo", 'class': "form-control"}),
                               Label(["action"], **{'for': "action", 'class': "sr-only"}),
                               Input(**{'type': "text", 'id': "action", 'name': "action", 'placeholder': "action", 'class': "form-control"}),
                               Label(["basefile"], **{'for': "basefile", 'class': "sr-only"}),
                               Input(**{'type': "text", 'id': "basefile", 'name': "basefile", 'placeholder': "basefile", 'class': "form-control"})
                          ], **{'class': 'form-group'}),
                         Div([Input(**{'type': "checkbox", 'id': "all", 'name': "all", 'value': "--all"}),
                              Label(["--all"], **{'for': "all"}),
                              Input(**{'type': "checkbox", 'id': "force", 'name': "force", 'value': "--force"}),
                              Label(["--force"], **{'for': "force"}),
                              Input(**{'type': "checkbox", 'id': "refresh", 'name': "refresh", 'value': "--refresh"}),
                              Label(["--refresh"], **{'for': "refresh"}),
                              Button(["Build"], **{'type': "submit", 'class': "btn btn-default"})
                         ], **{'class': 'form-group'})
                      ], **{'class': 'form-inline'})])])

    def handle_build_stream(self, environ, start_response):
        # Starts a 200 response with buffering disabled, routes logging to
        # the response writer and assembles an argument list from the
        # query string parameters.
        #
        # NOTE(review): this copy is truncated (flagged below) and does
        # not parse as it stands -- restore from the upstream file.
        content_type = 'application/octet-stream'
        writer = start_response('200 OK', [('Content-Type', content_type),
                                           ('X-Accel-Buffering', 'no')]) 
        rootlogger = self._setup_streaming_logger(writer)
        # NOTE(review): two statements are fused on the next line; the
        # string was presumably the argument of a log call on ``log``.
        log = logging.getLogger(__name__)"Running ...")
        params = dict(parse_qsl(environ['QUERY_STRING']))
        # keep the values of the parameters that are present and non-empty,
        # in this fixed order
        argv = [params[x] for x in ('repo', 'action', 'basefile', 'all', 'force', 'refresh') if params.get(x)]
        # NOTE(review): the ``try:`` and its body (whatever consumes
        # ``argv``) are missing, as is everything that uses ``msg`` and
        # the cleanup that "ok we're done" refers to.
        except Exception as e:
            exc_type, exc_value, tb = sys.exc_info()
            tblines = traceback.format_exception(exc_type, exc_value, tb)
            msg = "\n".join(tblines)
            # ok we're done
        return []

    def handle_streaming_test(self, environ, params):
        """Build the page for the streaming test.

        The page holds a heading and a ``<pre>`` element whose ``src``
        attribute is the current path with ``?stream=true`` appended.
        ``params`` is not used.
        """
        heading = H2(["Streaming test"])
        stream_src = environ['PATH_INFO'] + "?stream=true"
        pre_attrs = {'class': 'pre-scrollable',
                     'id': 'streaming-log-output',
                     'src': stream_src}
        log_view = Pre(**pre_attrs)
        return Body([Div([heading, log_view])])

    def handle_streaming_test_stream(self, environ, start_response):
        """Emit a few log messages at different levels into the response.

        Logging is routed to the response writer for the duration of the
        call, and the streaming logger is shut down again afterwards even
        if a log call fails.

        :param environ: The WSGI environment of the request (not read).
        :param start_response: The WSGI ``start_response`` callable.
        :returns: An empty list; all output goes through the writer.
        """
        # using this instead of text/plain prevent chrome from
        # buffering at the beginning. There are three ways
        # of overcoming this: The "X-Content-Type-Options: nosniff"
        # header, sending at least 1024 bytes of data right away, or
        # using a non text/plain content-type. The latter seems the
        # easiest.
        # NOTE(review): the URLs the original comments referred to are
        # missing from this copy of the file.
        content_type = 'application/octet-stream'
        # the second header disables nginx/uwsgi buffering so that
        # results are actually streamed to the client
        writer = start_response('200 OK', [('Content-Type', content_type),
                                           ('X-Accel-Buffering', 'no'),
                                           ('X-Content-Type-Options', 'nosniff')])
        rootlogger = self._setup_streaming_logger(writer)
        log = logging.getLogger(__name__)
        try:
            log.info("1024 bytes of start data: " + "x" * 1024)
            log.debug("Debug messages should work")
            sleep(1)
            log.info("Info messages should work")
            log.warning("Warnings should, unsurprisingly, work")
        finally:
            # detach the response-bound handler again so it does not
            # outlive this request
            self._shutdown_streaming_logger(rootlogger)
        return []

    def handle_change_parse_options(self, environ, params):
        # this method changes the options and creates a response page
        # that, in turn, does an ajax request that ends up calling
        # handle_change_parse_options_stream
        #
        # NOTE(review): this copy is truncated in several places (flagged
        # below) and does not parse as it stands -- restore from the
        # upstream file.
        assert params
        assert environ['REQUEST_METHOD'] == 'POST'
        repo = params['repo']
        subrepo = params['subrepo']
        basefile = params['basefile']
        newvalue = params['newvalue']
        reason = params['reason']
        inst = self.repo._repo_from_alias(repo)
        # NOTE(review): the resource name looks cut off after "options/".
        optionsfile = inst.resourceloader.filename("options/")
        # the text that identifies the options line for this repo/basefile
        want = '("%s", "%s"):' % (repo, basefile)
        lineidx = None
        out = ""
        # copy the file line by line, rewriting the matching line(s)
        with open(optionsfile) as f:
            for idx, line in enumerate(f):
                if want in line:
                    lineidx = idx
                    # NOTE(review): the call this pattern was passed to
                    # (presumably a regex search) is missing.
                    currentvalue =': "([^"]+)",', line).group(1)
                    line = line.replace(currentvalue, newvalue)
                    # record the reason as a trailing comment on the line
                    line = line.rstrip() + " # " + reason + "\n"
                out += line
        util.writefile(optionsfile, out)
        # now we must invalidate the cached property
        if 'parse_options' in inst.__dict__:
            del inst.__dict__['parse_options']
        # NOTE(review): ``lineidx`` is 0 (falsy) when the match is on the
        # first line of the file, which this test treats as "not found"
        # -- confirm whether ``is not None`` was intended.
        if lineidx:
            # NOTE(review): the format arguments are missing.
            datasrc = "%s?repo=%s&subrepo=%s&basefile=%s&stream=true" % (
            res = [H2(["Changing options for %s in repo %s" % (basefile, repo)]),
                   # Pre([pformat(environ)]),
                   # NOTE(review): the old/new value elements of this
                   # paragraph are missing.
                   P(["Changed option at line %s from " % lineidx,
                      " to ",
                   P(["Now downloading and processing (please be patient...)"]),
                   Pre(**{'class': 'pre-scrollable',
                          'id': 'streaming-log-output',
                          'src': datasrc})]
            # NOTE(review): an ``else:`` is missing before this assignment,
            # and the paragraph below is cut off as well.
            res = [H2(["Couldn't change options for %s in repo %s" % (basefile, repo)]),
                   P(["Didn't manage to find a line matching ",
                      " in ",
        # NOTE(review): the rest of the return expression is missing.
        return Body([

    def handle_change_parse_options_stream(self, environ, start_response):
        # Starts a 200 response with buffering disabled, routes logging to
        # the response writer, and sets up the repo (and optionally a
        # subrepo) named in the query string with loglevel DEBUG.
        #
        # NOTE(review): this copy is truncated (flagged below) and does
        # not parse as it stands -- restore from the upstream file.
        writer = start_response('200 OK', [('Content-Type', 'application/octet-stream'),
                                           ('X-Accel-Buffering', 'no')]) 
        rootlogger = self._setup_streaming_logger(writer)
        # now do the work
        params = dict(parse_qsl(environ['QUERY_STRING']))
        repoconfig = getattr(self.repo.config._parent, params['repo'])
        repoconfig.loglevel = "DEBUG"
        repo = self.repo._repo_from_alias(params['repo'], repoconfig=repoconfig)
        if 'subrepo' in params:
            subrepoconfig = getattr(self.repo.config._parent, params['subrepo'])
            subrepoconfig.loglevel = "DEBUG"
            subrepo = self.repo._repo_from_alias(params['subrepo'], repoconfig=subrepoconfig)
            # NOTE(review): an ``else:`` is missing before the next line;
            # as written the subrepo is always replaced by ``repo``.
            subrepo = repo
        basefile = params['basefile']
        # NOTE(review): the ``try:`` is missing, and each of the four
        # string lines below has lost the call it was an argument of
        # (presumably a log call) along with the processing step that
        # followed it.
  "Downloading %s" % basefile)
            subrepo.config.refresh = True  # the repo might have a partial download, eg of index HTML page but without PDF document
            # sleep(1)
  "Parsing %s" % basefile)
            # sleep(1)
  "Relating %s" % basefile)
            # sleep(1)
  "Generating %s" % basefile)
            # sleep(1)
        except Exception as e:
            exc_type, exc_value, tb = sys.exc_info()
            tblines = traceback.format_exception(exc_type, exc_value, tb)
            msg = "\n".join(tblines)
            # NOTE(review): whatever used ``msg`` and the cleanup that
            # "ok we're done" refers to are missing.
            # ok we're done
        return []

    def handle_patch(self, environ, params):
        # Three cases are visible: no params (a form asking for repo and
        # basefile), POST (create a patch from the submitted file
        # contents, optionally parse and generate) and otherwise (show the
        # file, with any existing patch applied, in an edit form).
        #
        # NOTE(review): this copy is heavily truncated -- many ``else:``
        # and ``try:`` lines, several right-hand sides and a number of
        # element openers/closers are missing (the main spots are flagged
        # below), so the block does not parse as it stands. Restore from
        # the upstream file before changing anything here.
        def open_intermed_text(repo, basefile, mode="rb"):
            # Opens the intermediate file if it exists (through BZ2File
            # when config.compress is "bz2"), otherwise another file.
            # NOTE(review): both path expressions below are missing.
            intermediatepath =
            opener = open
            if repo.config.compress == "bz2":
                intermediatepath += ".bz2"
                opener = BZ2File
            if os.path.exists(intermediatepath):
                stage = "intermediate"
                outfile = intermediatepath
                # NOTE(review): an ``else:`` is missing before this line.
                stage = "download"
                outfile =
            fp = opener(outfile, mode)
            return fp
        def format_exception():
            # the traceback of the exception currently being handled, as
            # one string
            exc_type, exc_value, tb = sys.exc_info()
            tblines = traceback.format_exception(exc_type, exc_value, tb)
            tbstr = "\n".join(tblines)
            return tbstr

        if not params:
            # start page: list available patches maybe? form with repo names and textbox for basefile?
            # NOTE(review): the Form/Div openers of this page are missing.
            res = Body([
                    H2(["Create a new patch"]),
                            Label(["repo"], **{'for': 'repo'}),
                            Input(**{'type':"text", 'id': "repo", 'name': "repo", 'class': "form-control"}),
                            Label(["basefile"], **{'for': 'basefile'}),
                            Input(**{'type':"text", 'id': "basefile", 'name': "basefile", 'class': "form-control"})],
                            **{'class': 'form-group'}),
                        Button(["Create"], **{'type': "submit", 'class': "btn btn-default"})],
                     action=environ['PATH_INFO'], method="GET")
            return res
            # NOTE(review): an ``else:`` is missing before this line.
            alias = params['repo']
            basefile = params['basefile']
            repo = self.repo._repo_from_alias(alias)
            patchstore = repo.documentstore_class(repo.config.patchdir +
                                                  os.sep + repo.alias)
            patchpath = patchstore.path(basefile, "patches", ".patch")
            if environ['REQUEST_METHOD'] == 'POST':
                # fp = open_intermed_text(repo, basefile, mode="wb")
                # FIXME: Convert CRLF -> LF. We should determine from
                # existing intermed file what the correct lineending
                # convention is
                # fp.write(params['filecontents'].replace("\r\n", "\n").encode(repo.source_encoding))
                # fp.close()
                self.repo.mkpatch(repo, basefile, params.get('description',''),
                                  params['filecontents'].replace("\r\n", "\n"))
                log = []
                if params.get('parse') == "true":
                    repo.config.force = True
                    log.append(P(["Parsing %s" % basefile]))
                    # NOTE(review): the ``try:`` and the parse call are
                    # missing, as is most of the except body.
                        log.append(P(["Parsing successful"]))
                    except Exception:
                        # a failed parse turns off the generate step below
                        params['generate'] = "false"

                if params.get('generate') == "true":
                    repo.config.force = True
                    log.append(P(["Generating %s" % basefile]))
                    # NOTE(review): the ``try:``, the generate call and
                    # the except body are missing.
                        log.append(P(["Generation successful: ",
                                     A([basefile], href=repo.canonical_uri(basefile))]))
                    except Exception:

                if os.path.exists(patchpath):
                    patchcontent = util.readfile(patchpath)
                    # NOTE(review): the tail of this Body(...), the
                    # ``else:`` and the tail of the second one are missing.
                    res = Body([
                            H2(["patch generated at %s" % patchpath]),
                            P("Contents of the new patch"),
                    res = Body([
                        Div([H2(["patch was not generated"])]),
                return res
                # NOTE(review): an ``else:`` is missing before this line.
                print("load up intermediate file, display it in a textarea + textbox for patchdescription")
                fp = open_intermed_text(repo, basefile)
                outfile = util.name_from_fp(fp)
                # NOTE(review): the expression assigned to ``text`` is
                # missing.
                text =
                patchdescription = None
                if os.path.exists(patchpath) and params.get('ignoreexistingpatch') != 'true':
                    ignorepatchlink = "%s?%s&ignoreexistingpatch=true" % (environ['PATH_INFO'], environ['QUERY_STRING'])
                    # NOTE(review): the opener call of this ``with`` and
                    # the argument of codecs.decode are missing, as is the
                    # ``try:`` that the except clause further down
                    # belongs to.
                    with, 'r', encoding=repo.source_encoding) as pfp:
                        if repo.config.patchformat == 'rot13':
                            pfp = StringIO(codecs.decode(, "rot13"))
                            ps = PatchSet.from_stream(pfp)
                            lines = text.split("\n")
                            offsets = ps.patches[0].adjust(lines)
                            text = "\n".join(ps.patches[0].merge(lines))
                            if ps.patches[0].hunks[0].comment:
                                patchdescription = ps.patches[0].hunks[0].comment
                                # NOTE(review): an ``else:`` is missing
                                # before this line.
                                patchdescription = ""
                            instructions = Div([
                                P(["Existing patch at %s has been applied (" % patchpath,
                                   A("ignore existing patch", href=ignorepatchlink), ")"]),
                                P(["Contents of that patch, for reference"]),
                            if any(offsets):
                                instructions.append(P("Patch did not apply cleanly, the following adjustments were made: %s" % offsets))
                        except (PatchSyntaxError, PatchConflictError) as e:
                            instructions = Div([
                                P(["Existing patch at %s could not be applied (" % patchpath,
                                   A("ignore existing patch", href=ignorepatchlink), ")"]),
                                P("The error was:"),
                            patchdescription = ""
                    # NOTE(review): an ``else:`` is missing before this
                    # line.
                    instructions = P(["Change the original data as needed"])

                # the extra \n before filecontents text is to
                # compensate for a missing \n introduced by the
                # textarea tag
                # NOTE(review): several element openers of this form
                # (Div, Input, Label) and its closing brackets are missing.
                res = Body([
                    H2(["Editing %s" % outfile]),
                        Form([Textarea(["\n"+text], **{'id': 'filecontents',
                                                  'name': 'filecontents',
                                                  'cols': '80',
                                                  'rows': '30',
                                                  'class': 'form-control'}),
                                  Label(["Description of patch"], **{'for': 'description'}),
                                           'name': 'description',
                                           'value': patchdescription,
                                           'class': 'form-control'})
                                  ], **{'class': 'form-group'}),
                                      Input(**{'type': 'checkbox',
                                               'id': 'parse',
                                               'name': 'parse',
                                               'checked': 'checked',
                                               'value': 'true',
                                               'class': 'form-check-input'}),
                                      "Parse resulting file"], **{'class': 'form-check-label'})],
                                  **{'class': 'form-check'}),
                                      Input(**{'type': 'checkbox',
                                               'id': 'generate',
                                               'name': 'generate',
                                               'checked': 'checked',
                                               'value': 'true',
                                               'class': 'form-check-input'}),
                                      "Generate HTML from results of parse"], **{'class': 'form-check-label'})],
                                  **{'class': 'form-check'}),
                              Input(id="repo", type="hidden", name="repo", value=alias),
                              Input(id="basefile", type="hidden", name="basefile", value=basefile),
                              Button(["Create patch"], **{'type': 'submit',
                                                          'class': 'btn btn-default'})],
                             action=environ['PATH_INFO'], method="POST"
                return res
        # return fp, length, status, mimetype

    def analyze_log(self, filename, listerrors=False):
        # Reads a log file whose lines split on spaces into timestamp,
        # module, level and message, tallies ERROR entries per module and
        # per source location, and returns the report as a string.
        #
        # NOTE(review): this copy is truncated (flagged below) and does
        # not parse as it stands -- restore from the upstream file.
        modules = defaultdict(int)
        locations = defaultdict(int)
        locationmsg = {}
        errors = []
        output = StringIO()
        with open(filename) as fp:
            for line in fp:
                # NOTE(review): the ``try:`` and the body of the except
                # clause are missing; presumably lines with fewer than
                # four fields are skipped.
                    timestamp, module, level, message = line.split(" ", 3)
                except ValueError:
                if level == "ERROR":
                    # messages logged on the root logger: the first word
                    # of the message is used as the module instead
                    if module == "root":
                        module = message.split(" ", 1)[0]
                    modules[module] += 1
                    # NOTE(review): the search call is missing here, and
                    # the pattern itself may be damaged as well.
                    m ="\([\w/]\d+\)", message)
                    if m:
                        # NOTE(review): the right-hand side is missing.
                        location =
                        locations[location] += 1
                        # remember the first message seen per location
                        if location not in locationmsg:
                            locationmsg[location] = message.strip()
                    if listerrors:
                        m = re.match("([\w\.]+) (\w+) ([^ ]*) failed", message)
                        # NOTE(review): the body of this ``if`` is
                        # missing; presumably it appends a (repo, basefile)
                        # pair to ``errors``.
                        if m:
        if listerrors:
            for repo, basefile in errors:
                print(repo,basefile, file=output)
            # NOTE(review): an ``else:`` is missing before the summary
            # below.
            print("Top error modules:", file=output)
            self.printdict(modules, file=output)
            print("Top error messages:", file=output)
            self.printdict(locations, locationmsg, file=output)
        return output.getvalue()

    def printdict(self, d, labels=None, file=sys.stdout):
        """Print a dict with int values, sorted by those values, largest first.

        Each entry is written as one line: the count right-aligned in
        four columns, a space, then the label.

        :param d: Mapping of key to int count.
        :param labels: Optional mapping of key to the text to show for
                       it; when omitted or empty the key itself is shown.
        :param file: File-like object to print to.
        :raises KeyError: If ``labels`` is given but lacks a key of ``d``.
        """
        for k in sorted(d, key=d.get, reverse=True):
            # without the else branch the label was always overwritten
            # by the key
            if labels:
                lbl = labels[k]
            else:
                lbl = k
            print("%4d %s" % (d[k], lbl), file=file)

    re_message_loc = re.compile
    def analyze_buildstats(self, logfilename):
        # Counts, per action and module, the log lines that report a
        # successful download/parse/relate/generate/transformlinks, and
        # returns a summary with one line per action.
        #
        # NOTE(review): this copy is truncated (flagged below) and does
        # not parse as it stands -- restore from the upstream file.
        output = StringIO()
        counters = defaultdict(Counter)
        # NOTE(review): this pattern may be damaged in this copy.
        msgloc = re.compile(" \([\w/]\d+\)").search
        eventok = re.compile("[^ ]+: (download|parse|relate|generate|transformlinks) OK").match
        with open(logfilename) as fp:
            for line in fp:
                # NOTE(review): the ``try:`` and the body of the except
                # clause are missing; presumably lines with fewer than
                # four fields are skipped.
                    timestamp, module, level, message = line.split(" ", 3)
                except ValueError:
                # cut the message off where msgloc matches
                m = msgloc(message)
                if m:
                    message = message[:m.start()]
                m = eventok(message)
                if m:
                    # NOTE(review): the right-hand side is missing;
                    # presumably the action name captured by eventok.
                    action =
                    counters[action][module] += 1
        # known actions sort in processing order; sortkeys.get returns
        # None for any other key
        sortkeys = defaultdict(int,
                               {"download": -5,
                                "parse": -4,
                                "relate": -3,
                                "generate": -2,
                                "transformlinks": -1})
        actions = sorted(counters.keys(), key=sortkeys.get)  # maybe sort in a reasonable order?
        if actions:
            # pad the action names to the width of the longest one
            alength = max([len(a) for a in actions])
            formatstring = "%-" + str(alength) + "s: %d (%s)\n"
            for action in actions:
                actionsum = sum(counters[action].values())
                modcounts = ", ".join(["%s: %s" % (k, v) for k, v in sorted(counters[action].items())])
                output.write(formatstring % (action, actionsum, modcounts))
            # download: 666 (sfs 421, prop 42, soukb 12)
            # parse:    555 (sfs 400, prop 0,  sou 12)
            # relate:   500 (sfs 140, prop 0,  sou 12)
            # generate: 450 (sfs 130, prop 0,  sou 12)
            # NOTE(review): an ``else:`` is missing before the next line.
            output.write("[no successful processing actions found]\n")
        return output.getvalue()

    def handle_logs(self, environ, params):
        # Without params: returns a list of the *.log files in the log
        # directory, newest name first. With a 'file' param: returns build
        # statistics, error statistics and the contents of that log file.
        #
        # NOTE(review): this copy is truncated (flagged below) and does
        # not parse as it stands -- restore from the upstream file.
        logdir = self.repo.config.datadir + os.sep + "logs"
        def elapsedtime(f):
            # difference between the timestamps (first space-separated
            # field, %H:%M:%S) of the first and the last line of the file
            with open(f) as fp:
                first = fp.readline()
                # NOTE(review): the next two lines have lost their calls;
                # presumably the file is positioned about 500 characters
                # from its end and the last complete line is read.
       - 500)
                last ="\n")[-2]
            start = datetime.strptime(first.split(" ")[0], "%H:%M:%S")
            end = datetime.strptime(last.split(" ")[0], "%H:%M:%S")
            return end - start  # FIXME: Handle wraparound

        def firstline(f):
            with open(logdir+os.sep+f) as fp:
                # trim uninteresting things from start and end
                l = fp.readline().split(" ", 3)[-1].rsplit(" (", 1)[0]
                if l.strip():
                    return l
                    # NOTE(review): an ``else:`` is missing before this
                    # line.
                    return "[log is empty?]"
        def linkelement(f):
            # list item: link to the log, its first message and its size
            href = environ['PATH_INFO'] + "?file=" + f
            return LI([A(f, href=href), " ", Code([firstline(f)]), " (%.2f kb)" % (os.path.getsize(logdir+os.sep+f) / 1024)])

        if not params:
            logfiles = sorted([f for f in os.listdir(logdir) if f.endswith(".log")], reverse=True)
            return Body([
                Div([UL([linkelement(f) for f in logfiles])])])
        elif 'file' in params:
            start = time.time()
            # only accept names of the form DDDDDDDD-DDDDDD.log
            assert re.match("\d{8}-\d{6}.log$", params['file']), "invalid log file name"
            logfilename = logdir+os.sep+params['file']
            buildstats = self.analyze_buildstats(logfilename)
            errorstats = self.analyze_log(logfilename)
            if not errorstats:
                errorstats = "[analyze_log didn't return any output?]"
            logcontents = util.readfile(logfilename)
            elapsed = elapsedtime(logfilename)
            # NOTE(review): the elements that display ``buildstats`` and
            # ``errorstats`` (and the Div opener) are missing.
            return Body([
                     P(["Log processed in %.3f s. The logged action took %.0f s." % (time.time() - start, elapsed.total_seconds())]),
                     Pre([logcontents], **{'class': 'logviewer'})])])

[docs]class Devel(object): """Collection of utility commands for developing docrepos. This module acts as a docrepo (and as such is easily callable from ````), but instead of ``download``, ``parse``, ``generate`` et al, contains various tool commands that is useful for developing and debugging your own docrepo classes. Use it by first enabling it:: ./ ferenda.Devel enable And then run individual tools like:: ./ devel dumprdf path/to/xhtml/rdfa.xhtml """ alias = "devel"
[docs] @decorators.action def dumprdf(self, filename, format="turtle"): """Extract all RDF data from a parsed file and dump it to stdout. :param filename: Full path of the parsed XHTML+RDFa file. :type filename: str :param format: The serialization format for RDF data (same as for :py:meth:`rdflib.graph.Graph.serialize`) :type format: str Example:: ./ devel dumprdf path/to/xhtml/rdfa.xhtml nt """ print = builtins.print g = Graph() g.parse(data=util.readfile(filename), format="rdfa") # At least the turtle serializer creates UTF-8 data. Fix this! print((g.serialize(None, format=format).decode("utf-8")))
[docs] @decorators.action def dumpstore(self, format="turtle"): """Extract all RDF data from the system triplestore and dump it to stdout using the specified format. :param format: The serialization format for RDF data (same as for :py:meth:`ferenda.TripleStore.get_serialized`). :type format: str Example:: ./ devel dumpstore nt > alltriples.nt """ # print("Creating store of type %s, location %s, repository %s" % # (self.config.storetype, self.config.storelocation, self.config.storerepository)) print = builtins.print store = TripleStore.connect(self.config.storetype, self.config.storelocation, self.config.storerepository) print(store.get_serialized(format=format).decode('utf-8'))
# Not really useful for anything than finding bugs in ferenda itself # # def testlog(self): # """Logs a series of messages at various levels, to test that # your client code logging configuration behaves as # expectedly.""" # log = logging.getLogger(__name__) # log.critical('Log message at CRITICAL level') # log.error('Log message at ERROR level') # log.warning('Log message at WARNING level') #'Log message at INFO level') # log.debug('Log message at DEBUG level') # sub = logging.getLogger(__name__+'.sublogger') # sub.critical('Sublog message at CRITICAL level') # sub.error('Sublog message at ERROR level') # sub.warning('Sublog message at WARNING level') #'Sublog message at INFO level') # sub.debug('Sublog message at DEBUG level')
[docs] @decorators.action def csvinventory(self, alias, predicates=None): """Create an inventory of documents, as a CSV file. Only documents that have been parsed and yielded some minimum amount of RDF metadata will be included. :param alias: Docrepo alias :type alias: str """ if predicates is None: predicates = ['basefile', 'subobjects', # sections that have rdf:type 'rdf:type', 'dcterms:identifier', 'dcterms:title', 'dcterms:published', 'prov:wasGeneratedBy', ] else: # predicates are given as a comma separated list, eg ./ devel csvinventory kkv rpubl:malnummer,rpubl:avgorandedatum,rinfoex:instanstyp,rinfoex:domstol,rinfoex:upphandlande,rinfoex:leverantor,rinfoex:arendetyp,rinfoex:avgorande predicates = predicates.split(",") import csv # if six.PY2: # delimiter = b';' # out = sys.stdout # else: import codecs delimiter = ';' out = codecs.getwriter("latin-1")(sys.stdout.detach()) out.errors = "replace" writer = csv.DictWriter(out, predicates, delimiter=delimiter) repo = self._repo_from_alias(alias) writer.writerow(dict([(p, p) for p in predicates])) for basefile in"relate"): baseuri = URIRef(repo.canonical_uri(basefile)) with as fp: row = {} if 'basefile' in predicates: row['basefile'] = basefile g = Graph().parse(fp, format="xml") for (p, o) in g.predicate_objects(baseuri): qname = g.qname(p) if qname in predicates: if isinstance(o, URIRef): row[qname] = g.qname(o) else: # it seems py2 CSV modue expects latin-1 # encoded bytestrings (for non-ascii # values), while py3 CSV expects unicode # (sensibly) fld = str(o) # if six.PY2: # fld = fld.encode("latin-1", errors="replace") row[qname] = fld if 'subobjects' in predicates: row['subobjects'] = len(list(g.subject_objects(RDF.type))) writer.writerow(row)
def _repo_from_alias(self, alias, datadir=None, repoconfig=None):
    """Instantiate and configure the docrepo registered under *alias*.

    :param alias: Docrepo alias, looked up as a subsection of the main
                  configuration
    :type  alias: str
    :param datadir: Directory the repo's store should use. Defaults
                    to ``<config.datadir>/<repo.alias>``.
    :type  datadir: str
    :param repoconfig: Configuration section to read the repo class
                       name from. Defaults to the section named
                       *alias* in the main configuration.
    :returns: A repo instance with ``config`` and store datadir set
    :raises AttributeError: if the main config has no section *alias*
    """
    # (FIXME: This uses several undocumented APIs)
    mainconfig = self.config._parent
    assert mainconfig is not None, "Devel must be initialized with a full set of configuration"
    if repoconfig is None:
        repoconfig = getattr(mainconfig, alias)
    from ferenda import manager
    repocls = manager._load_class(getattr(repoconfig, 'class'))
    repo = repocls()
    # NOTE(review): repoconfig is only used to find the class; the
    # instance always gets the section from mainconfig. Kept as-is.
    repo.config = getattr(mainconfig, alias)
    # work in all parameters from get_default_options
    for key, val in repocls.get_default_options().items():
        if key not in repo.config:
            LayeredConfig.set(repo.config, key, val, "defaults")
    if datadir is None:
        datadir = repo.config.datadir + os.sep + repo.alias
    # NOTE(review): the assignment target was lost in extraction;
    # repo.store.datadir is inferred from how the other methods in
    # this class redirect a repo to another directory -- confirm.
    repo.store.datadir = datadir
    return repo
@decorators.action
def mkpatch(self, alias, basefile, description, patchedtext=None):
    """Create a patch file from downloaded or intermediate files.

    Before running this tool, you should hand-edit the intermediate
    file. If your docrepo doesn't use intermediate files, you should
    hand-edit the downloaded file instead. The tool will first stash
    away the intermediate (or downloaded) file, then re-run
    :py:meth:`~ferenda.DocumentRepository.parse` (or
    :py:meth:`~ferenda.DocumentRepository.download_single`) in order
    to get a new intermediate (or downloaded) file. It will then
    calculate the diff between these two versions and save it as a
    patch file in its proper place (as determined by
    ``config.patchdir``), where it will be picked up automatically by
    :py:meth:`~ferenda.DocumentRepository.patch_if_needed`.

    :param alias: Docrepo alias (or an already initialized repo)
    :type  alias: str
    :param basefile: The basefile for the document to patch
    :type  basefile: str
    :param description: Description of the patch. A single-line
                        description is appended to the first hunk
                        header, a multi-line one is written to a
                        separate ``.desc`` file.
    :type  description: str
    :param patchedtext: The edited text. If given, the file on disk
                        is assumed to be untouched.
    :type  patchedtext: str
    :returns: The path of the created patch, or None if the patch
              would have been empty

    Example::

        ./ferenda-build.py devel mkpatch myrepo basefile1 "Removed sensitive personal information"
    """
    # 1. initialize the docrepo indicated by "alias"
    # alias might sometimes be the initialized repo so check for that first...
    if isinstance(alias, str):
        repo = self._repo_from_alias(alias)
    else:
        repo = alias

    # 2. find out if there is an intermediate file or downloaded
    # file for basefile.
    # FIXME: unify this with open_intermed_patchedtext in handle_patch
    # NOTE(review): both path expressions below were lost in
    # extraction and are reconstructed from the variable names and
    # the "stage" logic -- confirm against the upstream file.
    intermediatepath = repo.store.intermediate_path(basefile)
    if repo.config.compress == "bz2":
        intermediatepath += ".bz2"
    if os.path.exists(intermediatepath):
        stage = "intermediate"
        outfile = intermediatepath
    else:
        stage = "download"
        outfile = repo.store.downloaded_path(basefile)

    if patchedtext:
        # If we provide the new patchedtext as a parameter (assumed
        # to be unicode text, not a bytestring), the existing
        # intermediate file is assumed to be untouched
        patchedtext_lines = patchedtext.split("\n")
        patchedtext_path = ""
    else:
        # but if we don't, the existing intermediate file is assumed
        # to be edited in-place, and we need to stash it away, then
        # regenerate a pristine version of the intermediate file
        # NOTE(review): the nesting of the regeneration step under
        # this branch is inferred from the comments, as the original
        # indentation was lost.
        fileno, patchedtext_path = mkstemp()
        with os.fdopen(fileno, "wb") as fp:
            patchedtext_lines = util.readfile(
                outfile, encoding=repo.source_encoding).split("\n")
            fp.write("\n".join(patchedtext_lines).encode(repo.source_encoding))

        # 2.1 if intermediate: after stashing a copy of the
        # intermediate file, delete the original and run
        # parse(config.force=True) to regenerate the intermediate file
        if stage == "intermediate":
            repo.config.force = True
            util.robust_remove(intermediatepath)
            repo.config.ignorepatch = True
            try:
                repo.parse(basefile)
            except Exception:
                # maybe this throws an error (hopefully after creating
                # the intermediate file)? may be the reason for
                # patching in the first place?
                pass
            finally:
                # always restore the setting, even if parse failed
                repo.config.ignorepatch = False
        # 2.2 if only downloaded: stash a copy, run
        # download_single(config.refresh=True)
        else:
            repo.config.refresh = True
            repo.download_single(basefile)

    # 2.9 re-add line endings to patchedtext_lines
    if patchedtext_lines[-1] == "":
        # remove last phantom line caused by splitting "foo\nbar\n"
        # -- this should only be two lines!
        patchedtext_lines.pop()
    patchedtext_lines = [x + "\n" for x in patchedtext_lines]

    # 3. calculate the diff using difflib.
    # Assume that intermediate files use the same encoding as
    # source files
    if repo.config.compress == "bz2":
        opener = BZ2File
    else:
        opener = open
    encoding = repo.source_encoding
    with opener(outfile, mode="rb") as fp:
        outfile_lines = [l.decode(encoding) for l in fp.readlines()]
    difflines = list(unified_diff(outfile_lines,
                                  patchedtext_lines,
                                  outfile,
                                  patchedtext_path))
    if patchedtext_path and os.path.exists(patchedtext_path):
        os.unlink(patchedtext_path)

    # 4. calculate place of patch using the repo's patch store
    patchstore = repo.documentstore_class(repo.config.patchdir +
                                          os.sep + repo.alias)
    patchpath = patchstore.path(basefile, "patches", ".patch")

    # 4.1 If comment is single-line, append it on the first hunk's
    # @@-control line
    if description.count("\n") == 0:
        for idx, line in enumerate(difflines):
            if line.startswith("@@") and line.endswith("@@\n"):
                difflines[idx] = difflines[idx].replace("@@\n",
                                                        "@@ " + description + "\n")
                break
    else:
        # 4.2 if comment is not single-line, write it to the
        # corresponding .desc file
        descpath = patchstore.path(basefile, "patches", ".desc")
        util.writefile(descpath, description)

    # 4.3 write patch
    patchcontent = "".join(difflines)
    if patchcontent:
        if repo.config.patchformat == "rot13":
            print("rot13:ing the patch at %s" % patchpath)
            patchcontent = codecs.encode(patchcontent, "rot13")
        # write the patch using the same encoding as the
        # downloaded/intermediate files
        util.writefile(patchpath, patchcontent, encoding=encoding)
        return patchpath
    else:
        print("WARNING: patch would be empty, not creating it")
@decorators.action
def parsestring(self, string, citationpattern, uriformatter=None):
    """Parse a string with a named citationpattern, printing the parse
    tree (and optionally the formatted URIs) on stdout.

    :param string: The text to parse
    :type  string: str
    :param citationpattern: The fully qualified name of a citationpattern
    :type  citationpattern: str
    :param uriformatter: The fully qualified name of a uriformatter
    :type  uriformatter: str
    :raises NotImplementedError: always

    .. note::

       This is not implemented yet

    Example::

        ./ferenda-build.py devel parsestring \\
            "According to direktiv 2007/42/EU, ..." \\
            ferenda.citationpatterns.eulaw
    """
    # Placeholder until the feature exists.
    raise NotImplementedError()
@decorators.action
def fsmparse(self, functionname, source):
    """Parse a list of text chunks using a named fsm parser and
    output the parse tree and final result to stdout.

    :param functionname: Dotted name (``module.Class.method``) of a
                         function that returns a configured
                         :py:class:`~ferenda.FSMParser`
    :type  functionname: str
    :param source: A file containing the text chunks, separated
                   by double newlines
    :type  source: str
    :raises ValueError: if the module has no class with the given name
    """
    print = builtins.print
    modulename, classname, methodname = functionname.rsplit(".", 2)
    __import__(modulename)
    m = sys.modules[modulename]
    for name, cls in inspect.getmembers(m, inspect.isclass):
        if name == classname:
            break
    else:
        # Without this, a misspelt class name would silently pick
        # whichever class happened to be listed last in the module.
        raise ValueError("No class named %s in module %s" %
                         (classname, modulename))
    method = getattr(cls, methodname)
    parser = method()
    parser.debug = True
    tr = TextReader(source)
    b = parser.parse(tr.getiterator(tr.readparagraph))
    print(serialize(b))
@decorators.action
def queryindex(self, querystring):
    """Run a query against the system fulltext index and print the
    label, URI and text of every matching document.

    :param querystring: The query
    :type  querystring: str
    """
    emit = builtins.print
    # No docrepos are passed to the index connection.
    no_repos = []
    cfg = self.config
    index = FulltextIndex.connect(cfg.indextype, cfg.indexlocation, no_repos)
    hits, _pager = index.query(querystring)
    for hit in hits:
        fields = (hit['label'], hit['uri'], hit['text'])
        emit("%s (%s): %s" % fields)
@decorators.action
def construct(self, template, uri, format="turtle"):
    """Run the specified SPARQL CONSTRUCT query.

    The query is read from the file *template*, with ``%(uri)s``
    interpolated, and the resulting graph is printed in *format*.
    """
    emit = builtins.print
    query = util.readfile(template) % {'uri': uri}
    cfg = self.config
    store = TripleStore.connect(cfg.storetype,
                                cfg.storelocation,
                                cfg.storerepository)
    emit("# Constructing the following from %s, repository %s, type %s" %
         (cfg.storelocation, cfg.storerepository, cfg.storetype))
    # echo the query itself, each line prefixed as a comment
    commented = ["# %s\n" % queryline for queryline in query.split("\n")]
    emit("".join(commented))
    stats = {}
    with util.logtime(emit,
                      "# %(triples)s triples constructed in %(elapsed).3fs",
                      stats):
        graph = store.construct(query)
        stats['triples'] = len(graph)
        emit(graph.serialize(format=format).decode('utf-8'))
@decorators.action
def select(self, template, uri, format="json"):
    """Run the specified SPARQL SELECT query.

    The query is read from the file *template*, with ``%(uri)s``
    interpolated, and the result is printed on stdout.
    """
    sq = util.readfile(template) % {'uri': uri}
    ts = TripleStore.connect(self.config.storetype,
                             self.config.storelocation,
                             self.config.storerepository)
    print = builtins.print
    print("# Constructing the following from %s, repository %s, type %s" %
          (self.config.storelocation,
           self.config.storerepository,
           self.config.storetype))
    print("".join(["# %s\n" % x for x in sq.split("\n")]))
    p = {}
    with util.logtime(print,
                      "# Selected in %(elapsed).3fs",
                      p):
        # NOTE(review): the call expression was lost in extraction;
        # ts.select(sq, ...) is reconstructed from the otherwise
        # unused ts and sq and the surviving keyword argument.
        res = ts.select(sq, format=format)
        # res must be a bytestring: select should return whatever is
        # the appropriately encoded version for the given format.
        print(res.decode('utf-8'))
@decorators.action
def destroyindex(self):
    """Clear all data in the fulltext search index."""
    cfg = self.config
    index = FulltextIndex.connect(cfg.indextype, cfg.indexlocation, [])
    index.destroy()
    # report which index was removed
    summary = (cfg.indextype, cfg.indexlocation)
    print("%s index at %s destroyed" % summary)
@decorators.action
def clearstore(self):
    """Clear all data in the current triplestore."""
    cfg = self.config
    triplestore = TripleStore.connect(cfg.storetype,
                                      cfg.storelocation,
                                      cfg.storerepository)
    # count before and after, so that the report shows the effect
    before = triplestore.triple_count()
    triplestore.clear()
    after = triplestore.triple_count()
    print("%s triplestore at %s %s cleared (was %s triples, now %s)" %
          (cfg.storetype, cfg.storelocation, cfg.storerepository,
           before, after))
@decorators.action
def wsgi(self, path="/"):
    """Runs WSGI calls in-process.

    Builds a WSGI app from every configured docrepo (except Devel
    itself), issues a request for *path* and writes the response
    body to stdout.
    """
    from ferenda import manager
    globalconfig = self.config._parent

    # instantiate every configured repo class, except ourselves
    repos = []
    for section in globalconfig._subsections.values():
        if not hasattr(section, 'class'):
            continue
        classname = getattr(section, 'class')
        if classname == 'ferenda.Devel':
            continue
        repocls = manager._load_class(classname)
        repos.append(manager._instantiate_class(repocls, globalconfig))

    if 'develurl' in globalconfig:
        url = globalconfig.develurl
    else:
        url = globalconfig.url
    app = WSGIApp(repos, manager._find_config_file(), url=url)

    accept = 'text/xml, application/xml, application/xhtml+xml, text/html;q=0.9, text/plain;q=0.8, image/png,*/*;q=0.5'
    # everything after the first "?" (if any) is the query string
    pathinfo, _, querystring = path.partition("?")
    environ = {
        'HTTP_ACCEPT': accept,
        'PATH_INFO': pathinfo,
        'SERVER_NAME': 'localhost',
        'SERVER_PORT': '8000',
        'QUERY_STRING': querystring,
        'wsgi.url_scheme': 'http',
    }
    start_response = Mock()
    for chunk in app(environ, start_response):
        text = chunk.decode("utf-8") if isinstance(chunk, bytes) else chunk
        sys.stdout.write(text)
@decorators.action
def samplerepo(self, alias, sourcedir, sourcerepo=None, destrepo=None, samplesize=None):
    """Copy a random selection of documents from an external docrepo
    to the current datadir.

    :param alias: Docrepo alias
    :param sourcedir: Data directory of the external docrepo
    :param sourcerepo: Already initialized source repo (optional)
    :param destrepo: Already initialized destination repo (optional)
    :param samplesize: Number of documents to copy. Falls back to
                       ``config.samplesize`` and then to 10.
    """
    if not samplesize:
        if 'samplesize' in self.config:
            samplesize = int(self.config.samplesize)
        else:
            samplesize = 10
    if sourcerepo is None:
        sourcerepo = self._repo_from_alias(alias, sourcedir)
    if destrepo is None:
        destrepo = self._repo_from_alias(alias)
    randomsample = True
    # NOTE(review): both listing calls below were lost in extraction
    # and are reconstructed from the surviving '"parse")' argument.
    if randomsample:
        basefiles = list(sourcerepo.store.list_basefiles_for("parse"))
        samplesize = min([len(basefiles), samplesize])
        basefiles = random.sample(basefiles, samplesize)
    else:
        basefiles = islice(sourcerepo.store.list_basefiles_for("parse"),
                           0, samplesize)
    for basefile in basefiles:
        if isinstance(sourcerepo, CompositeRepository):
            # NOTE(review): this re-creates the source repo with the
            # default datadir; kept from the original, but it looks
            # suspicious -- confirm which datadir subrepos should use.
            sourcerepo = self._repo_from_alias(alias)
            for cls in sourcerepo.subrepos:
                subsourcerepo = sourcerepo.get_instance(cls)
                subdestrepo = destrepo.get_instance(cls)
                try:
                    # try each subrepo in turn; the original passed
                    # the parent repos here, which made every
                    # iteration identical
                    self._samplebasefile(subsourcerepo, subdestrepo, basefile)
                    break  # everything OK, no need to copy more
                except IOError:  # or whatever could happen
                    pass  # try the next one or bail
            else:
                print("None of the subrepos had basefile %s" % basefile)
        else:
            self._samplebasefile(sourcerepo, destrepo, basefile)
@decorators.action
def copyrepos(self, sourcedir, basefilelist):
    """Copy some specified documents to the current datadir.

    The documents are specified in BASEFILELIST, and copied from
    the external directory SOURCEDIR.

    BASEFILELIST is a text file with one document per line, either
    as ``alias basefile`` or as a python tuple literal
    ``("alias", "basefile")``. Text after a ``#`` and blank lines
    are ignored. To be used with, eg, a list of failing documents
    extracted from a log file::

        $ ./ferenda-build.py devel copyrepos /path/to/big/external/datadir errors.txt
    """
    entries = []
    with open(basefilelist) as fp:
        for line in fp:
            if line.startswith("("):
                entries.append(literal_eval(line))
                continue
            # remove comments
            line = line.rsplit("#", 1)[0].strip()
            if not line:  # remove blank lines
                continue
            parts = line.split(" ", 1)
            if len(parts) != 2:
                # would otherwise crash the unpacking below
                print("WARNING: ignoring malformed line %r" % line)
                continue
            entries.append(parts)
    destrepos = {}
    sourcerepos = {}
    for (alias, basefile) in entries:
        if alias not in destrepos:
            try:
                newdest = self._repo_from_alias(alias)
                newsource = self._repo_from_alias(alias, sourcedir + os.sep + alias)
            except AttributeError:  # means the repo alias was wrong
                continue
            # register both at once so that the caches can't get out
            # of sync if only the second lookup fails
            destrepos[alias] = newdest
            sourcerepos[alias] = newsource
        destrepo = destrepos[alias]
        sourcerepo = sourcerepos[alias]
        if isinstance(sourcerepo, CompositeRepository):
            # copy from the first subrepo that has the document
            for cls in sourcerepo.subrepos:
                subsourcerepo = sourcerepo.get_instance(cls)
                # NOTE(review): the assignment target and the argument
                # of os.path.exists were lost in extraction; both are
                # reconstructed from context -- confirm.
                subsourcerepo.store.datadir = (sourcedir + os.sep +
                                               subsourcerepo.alias)
                if os.path.exists(subsourcerepo.store.downloaded_path(basefile)):
                    subdestrepo = destrepo.get_instance(cls)
                    self._samplebasefile(subsourcerepo, subdestrepo, basefile)
                    break
        else:
            self._samplebasefile(sourcerepo, destrepo, basefile)
def _samplebasefile(self, sourcerepo, destrepo, basefile):
    """Copy the files of a single document from one repo to another.

    Copies the downloaded file(s), the intermediate file (if
    present), a separate register file (if present) and the
    document entry file (if present).

    NOTE(review): every ``*.store.*`` expression in this method was
    lost when the source was extracted. They are reconstructed from
    the variable names (``src``/``dst``, ``isrc``/``idst``), the
    surviving arguments and the comments, and must be confirmed
    against the upstream file.
    """
    print("  %s: copying %s" % (sourcerepo.alias, basefile))
    src = sourcerepo.store.downloaded_path(basefile)
    dst = destrepo.store.downloaded_path(basefile)
    if os.path.splitext(src)[1] != os.path.splitext(dst)[1]:
        # FIXME: for multi-suffix repos this will yield an incorrect
        # suffix (eg ".zip")
        dst = os.path.splitext(dst)[0] + os.path.splitext(src)[1]
    isrc = sourcerepo.store.intermediate_path(basefile)
    if sourcerepo.config.compress == "bz2":
        isrc += ".bz2"
    idst = destrepo.store.intermediate_path(basefile)
    if destrepo.config.compress == "bz2":
        idst += ".bz2"
    copy = shutil.copy2
    # presumably the source repo's policy; verify
    if sourcerepo.store.storage_policy == "dir":
        # each document is a directory: replace the whole directory
        src = os.path.dirname(src)
        dst = os.path.dirname(dst)
        isrc = os.path.dirname(isrc)
        idst = os.path.dirname(idst)
        if os.path.exists(dst):
            shutil.rmtree(dst)
        if os.path.exists(idst):
            shutil.rmtree(idst)
        copy = shutil.copytree
    util.ensure_dir(dst)
    try:
        copy(src, dst)
        if os.path.exists(isrc):
            util.ensure_dir(idst)
            copy(isrc, idst)
    except FileNotFoundError as e:
        print("WARNING: %s" % e)

    # NOTE: For SFS (and only SFS), there exists separate register
    # files under data/sfs/register/1998/204.html. Maybe we should
    # use storage_policy="dir" and handle those things as
    # attachments?
    registersrc = sourcerepo.store.path(basefile, "register", ".html")
    if os.path.exists(registersrc):
        dst = destrepo.store.path(basefile, "register", ".html")
        util.ensure_dir(dst)
        shutil.copy2(registersrc, dst)

    # also copy the docentry json file
    entrysrc = sourcerepo.store.documententry_path(basefile)
    if os.path.exists(entrysrc):
        entrydst = destrepo.store.documententry_path(basefile)
        util.ensure_dir(entrydst)
        shutil.copy2(entrysrc, entrydst)
@decorators.action
def samplerepos(self, sourcedir):
    """Copy a random selection of external documents to the current
    datadir - for all docrepos.

    :param sourcedir: External data directory, containing one
                      subdirectory per docrepo alias
    """
    # NOTE(review): the module name was lost in extraction;
    # "lagen.nu" is inferred -- confirm against the upstream file.
    from lagen.nu import Static
    if 'samplesize' in self.config:
        samplesize = int(self.config.samplesize)
    else:
        samplesize = 10
    # blacklist static because of how it hardcodes .store.staticdir
    # -- leads to copy attempts with identical src and dst
    classes = {Static}
    for alias in self.config._parent._subsections:
        if alias == self.alias:  # ie "devel"
            continue
        destrepo = self._repo_from_alias(alias)
        if destrepo.__class__ in classes:
            print("...skipping class %r" % destrepo.__class__)
            continue
        section = self.config._parent._subsections[alias]
        if 'parse' in section and section.parse in (False, 'False'):
            print("...skipping class %r (parse=False)" % destrepo.__class__)
            continue
        if isinstance(destrepo, CompositeRepository):
            sourcerepo = self._repo_from_alias(alias)
            for cls in destrepo.subrepos:
                subdestrepo = destrepo.get_instance(cls)
                if isinstance(subdestrepo, CompositeRepository):
                    # NOTE(review): the start of this message was
                    # lost in extraction; "...giving" is inferred.
                    print("...giving up on nested compositerepository")
                    continue
                if subdestrepo.__class__ in classes:
                    print("...skipping class %r" % subdestrepo.__class__)
                    continue
                classes.add(subdestrepo.__class__)
                subsourcerepo = sourcerepo.get_instance(cls)
                assert id(subdestrepo) != id(subsourcerepo)
                # NOTE(review): assignment target and the value of
                # aliasdir reconstructed from context -- confirm.
                subsourcerepo.store.datadir = (sourcedir + os.sep +
                                               subsourcerepo.alias)
                # separate name, so that the outer loop variable
                # isn't clobbered
                subalias = subsourcerepo.alias
                aliasdir = subsourcerepo.store.datadir
                print("%s/%s: Copying docs from %s" %
                      (sourcerepo.alias, subalias, aliasdir))
                # split the sample evenly between the subrepos
                self.samplerepo(subalias, aliasdir, subsourcerepo, subdestrepo,
                                samplesize=round(samplesize /
                                                 len(destrepo.subrepos)))
        else:
            classes.add(destrepo.__class__)
            aliasdir = sourcedir + os.sep + alias
            print("%s: Copying docs from %s" % (alias, aliasdir))
            self.samplerepo(alias, aliasdir)
@decorators.action
def statusreport(self, alias=None):
    """Generate report on which files parse()d OK, with errors, or failed.

    Creates a servable HTML file containing information about how
    the last parse went for each doc in the given repo (or all
    repos if none given). The report is written to
    ``<datadir>/status/status.html``; the durations of each action
    are also dumped to a JSON file per repo.

    NOTE(review): the ``repo.store.*`` and ``log.info(`` calls in
    this method were lost when the source was extracted and are
    reconstructed from context -- confirm against the upstream file.
    """
    log = logging.getLogger("devel")
    if alias:
        repos = [self._repo_from_alias(alias)]
    else:
        repos = [self._repo_from_alias(alias)
                 for alias in self.config._parent._subsections]
    root = etree.fromstring("<status></status>")

    for repo in sorted(repos, key=attrgetter("alias")):
        # Find out if this repo is outwardly-responsible for
        # parsing -- we check against "False" as well since
        # LayeredConfig may lack typing info for this setting and
        # so interprets the value in the .ini file as a str, not a
        # bool...
        if 'parse' in repo.config and repo.config.parse in (False, "False"):
            continue

        # listing basefiles for the action "news" gives us
        # everything that has a docentry file.
        basefiles = list(repo.store.list_basefiles_for("news"))
        if not basefiles:
            continue
        repo_el = etree.SubElement(root, "repo", {"alias": repo.alias})
        successcnt = warncnt = failcnt = removecnt = errcnt = 0
        durations = defaultdict(dict)
        for basefile in basefiles:
            entrypath = repo.store.documententry_path(basefile)
            if not os.path.exists(entrypath):
                log.warning("%s/%s: file %s doesn't exist" %
                            (repo.alias, basefile, entrypath))
                errcnt += 1
                continue
            elif os.path.getsize(entrypath) == 0:
                log.warning("%s/%s: file %s is 0 bytes" %
                            (repo.alias, basefile, entrypath))
                errcnt += 1
                continue
            try:
                entry = DocumentEntry(entrypath)
            except ValueError as e:
                log.error("%s/%s: %s %s" %
                          (repo.alias, basefile, e.__class__.__name__, e))
                errcnt += 1
                continue
            if not entry.status:  # an empty dict
                log.warning("%s/%s: file %s has no status sub-dict" %
                            (repo.alias, basefile, entrypath))
                errcnt += 1
                continue
            if ("parse" in entry.status and
                    "success" in entry.status["parse"] and
                    entry.status["parse"]["success"] == "removed"):
                log.debug("%s/%s: document was removed in parse" %
                          (repo.alias, basefile))
                durations["parse"][basefile] = -1
                continue
            doc_el = etree.SubElement(repo_el, "basefile", {"id": basefile})
            # FIXME: we should sort the entries in a reasonable way, eg
            # "download"/"parse"/"relate"/"generate"/any custom
            # action, probably through a custom key func
            for action in sorted(entry.status):
                status = entry.status[action]
                if not status:
                    log.warning("%s/%s: file %s has no status data for action %s" %
                                (repo.alias, basefile, entrypath, action))
                    continue
                if "success" in status and status["success"] == "removed":
                    # this special truthy value indicates that
                    # everything went as OK as it could, but the
                    # actual document doesn't exist (anymore) so we
                    # don't feature it in our overview.
                    #
                    # FIXME: Can this ever be reached, seemingly
                    # as we check for entry.status.parse.success
                    # == "removed" above, and no other action
                    # could produce a removed status?
                    durations[action][basefile] = -1
                    removecnt += 1
                    continue
                durations[action][basefile] = status["duration"]
                action_el = etree.SubElement(doc_el, "action",
                                             {"id": action,
                                              "success": str(status["success"]),
                                              "duration": str(status["duration"]),
                                              "date": str(status["date"])})
                if status["success"]:
                    successcnt += 1
                else:
                    failcnt += 1
                if "warnings" in status:
                    warncnt += 1
                # add additional (optional) text data if present
                for optional in ("warnings", "error", "traceback"):
                    if optional in status:
                        opt_el = etree.SubElement(action_el, optional)
                        opt_el.text = status[optional]
        log.info("%s: %s processed, %s ok (%s w/ warnings), %s failed, %s removed. %s corrupted entries." %
                 (repo.alias, len(basefiles), successcnt, warncnt,
                  failcnt, removecnt, errcnt))
        durationspath = repo.store.path(".durations", "entries", ".json",
                                        storage_policy="file")
        with open(durationspath, "w") as fp:
            json.dump(durations, fp, indent=4)

    conffile = os.path.abspath(
        os.sep.join([self.config.datadir, 'rsrc', 'resources.xml']))
    # use the resourceloader of the first repo that has one
    resourceloader = [x.resourceloader for x in repos
                      if hasattr(x, 'resourceloader')][0]
    transformer = Transformer('XSLT', "xsl/statusreport.xsl", "xsl",
                              resourceloader=resourceloader,
                              config=conffile)
    xhtmltree = transformer.transform(root, depth=1)
    outfile = os.sep.join([self.config.datadir, 'status', 'status.html'])
    util.ensure_dir(outfile)
    with open(outfile, "wb") as fp:
        fp.write(etree.tostring(xhtmltree, encoding="utf-8", pretty_print=True))
    log.info("Wrote %s" % outfile)
# FIXME: These are dummy implementations of methods and class
# variables that the rest of the framework (presumably manager.py)
# expects all docrepos to have. We don't want to have coverage
# counting these as missing lines, hence the pragma: no cover
# comments.

def __init__(self, config=None, **kwargs):
    """Set up a dummy store, the configuration and the request handler.

    :param config: Configuration object. If omitted, one is built
                   from the keyword arguments.
    """
    # NOTE(review): the assignment target was lost in extraction;
    # self.store is inferred from the DummyStore stand-in -- confirm.
    self.store = DummyStore(None)
    if config is None:
        config = LayeredConfig(Defaults(kwargs))
    self.config = config
    self.requesthandler = DevelHandler(self)

# class-level stand-ins for attributes that real docrepos provide
documentstore_class = DummyStore
downloaded_suffix = ".html"
storage_policy = "file"
ns = {}
resourceloader = ResourceLoader()
@classmethod
def get_default_options(cls):
    """Return the default options of this repo: none."""
    return dict()  # pragma: no cover
def download(self):
    """Dummy implementation: does nothing."""
    return None  # pragma: no cover
def parse(self, basefile):
    """Dummy implementation: does nothing."""
    return None  # pragma: no cover
def relate(self, basefile):
    """Dummy implementation: does nothing."""
    return None  # pragma: no cover
def generate(self, basefile):
    """Dummy implementation: does nothing."""
    return None  # pragma: no cover
def toc(self, otherrepos):
    """Dummy implementation: does nothing."""
    return None  # pragma: no cover
def news(self, otherrepos):
    """Dummy implementation: does nothing."""
    return None  # pragma: no cover
def tabs(self):
    """Return the tabs this repo contributes: none."""
    no_tabs = []
    return no_tabs
def footer(self):
    """Return the footer links this repo contributes: none."""
    no_links = []
    return no_links
def facets(self):
    """Return the facets this repo defines: none."""
    no_facets = []
    return no_facets
def basefile_from_uri(self, uri):
    """Return None for every *uri*: this repo handles no documents."""
    basefile = None
    return basefile
@classmethod
def setup(cls, action, config, *args, **kwargs):
    """Dummy implementation: does nothing."""
    return None  # pragma: no cover
@classmethod
def teardown(cls, action, config, *args, **kwargs):
    """Dummy implementation: does nothing."""
    return None  # pragma: no cover