Source code for ferenda.resources

# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

from io import BytesIO
import inspect
import json
import logging
import os

from lxml import etree
from lxml.builder import ElementMaker
from rdflib import URIRef, Literal, BNode, Graph, Namespace, RDF, RDFS
from rdflib.namespace import FOAF, SKOS
BIBO = Namespace("http://purl.org/ontology/bibo/")
from layeredconfig import LayeredConfig, Defaults

from ferenda import DocumentRepository, ResourceLoader
from ferenda import util, errors


[docs]class Resources(object):

    """Creates and manages various assets/resources needed for web serving.
    """

    def __init__(self, repos, resourcedir, **kwargs):
        # FIXME: document what kwargs could be (particularly 'combineresources')
        self.repos = repos
        self.resourcedir = resourcedir
        defaults = DocumentRepository.get_default_options()
        defaults.update(kwargs)
        self.config = LayeredConfig(Defaults(defaults))
        # the below call to setup_logger alters the logging level of
        # the root logger, which can't be good practice. Also, we
        # should probably not log to the root logger, but rather to
        # ferenda.resources.
        #
        # from ferenda.manager import setup_logger
        # self.log = setup_logger()
        self.log = logging.getLogger("ferenda.resources")
        # FIXME: How should we set up a global loadpath from the
        # individual repos?
        loadpaths = [ResourceLoader.make_loadpath(repo) for repo in repos]
        loadpath = ["."]  # cwd always has priority -- makes sense?
        for subpath in loadpaths:
            for p in subpath:
                if p not in loadpath:
                    loadpath.append(p)
        self.resourceloader = ResourceLoader(*loadpath)

[docs]    def make(self,
             css=True,
             js=True,
             img=True,
             xml=True,
             api=None):
        res = {}
        if api is None:
            api = not self.config.staticsite
        if css:
            res['css'] = self.make_css()
        if js:
            res['js'] = self.make_js()
        if img:
            res['img'] = self.make_img()
        if xml:
            res['xml'] = self.make_resources_xml(res.get('css', []), res.get('js', []))
        if api:
            res['json'] = self.make_api_files()

        # finally, normalize paths according to os.path.sep
        # conventions
        if os.sep == "\\":
            for part in res:
                result = []
                for x in res[part]:
                    if x.startswith("http://") or x.startswith("https://"):
                        result.append(x)
                    else:
                        result.append(x.replace('/', os.sep))
                res[part] = result
        return res

[docs]    def make_css(self):
        import cssmin
        combinefile = None
        if self.config.combineresources:
            combinefile = os.sep.join([self.resourcedir, 'css', 'combined.css'])
        return self._make_files(
            'cssfiles', self.resourcedir + os.sep + 'css', combinefile, cssmin.cssmin)

[docs]    def make_js(self):
        # slimit provides better perf, but isn't py3 compatible
        # import slimit
        # js = slimit.minify(
        #     jsbuffer.getvalue(), mangle=True, mangle_toplevel=True)
        import jsmin
        combinefile = None
        if self.config.combineresources:
            combinefile = os.sep.join([self.resourcedir, 'js', 'combined.js'])
        return self._make_files(
            'jsfiles', self.resourcedir + os.sep + 'js', combinefile, jsmin.jsmin)

[docs]    def make_img(self):
        return self._make_files('imgfiles', self.resourcedir + os.sep + 'img')

[docs]    def make_resources_xml(self, cssfiles, jsfiles):
        E = ElementMaker()  # namespace = None, nsmap={None: ...}
        root = E.configuration(
            E.sitename(self.config.sitename),
            E.sitedescription(self.config.sitedescription),
            E.url(self.config.url),
            E.tabs(*self._links('tabs')),
            E.footerlinks(*self._links('footer')),
            E.stylesheets(*self._li_wrap(cssfiles, 'link', 'href', rel="stylesheet")),
            E.javascripts(*self._li_wrap(jsfiles, 'script', 'src', text=" "))
        )

        if not self.config.staticsite:
            root.append(
                E.search(
                    E.endpoint(self.config.searchendpoint)
                )
            )

        outfile = self.resourcedir + os.sep + "resources.xml"
        util.writefile(
            outfile,
            etree.tostring(
                root,
                encoding="utf-8",
                pretty_print=True).decode("utf-8"))
        self.log.info("Wrote %s" % outfile)
        return [self._filepath_to_urlpath(outfile, 1)]

    # FIXME: When creating <script> elements, must take care not to
    # create self-closing tags (like by creating a single space text
    # node)
    def _li_wrap(self, items, container, attribute, text=None, **kwargs):
        elements = []
        for item in items:
            kwargs[attribute] = item
            e = etree.Element(container, **kwargs)
            e.text = text
            elements.append(e)
        return elements

    def _links(self, methodname):
        E = ElementMaker()
        elements = []
        for repo in self.repos:
            alias = repo.alias
            items = getattr(repo, methodname)()
            self.log.debug("Adding %(methodname)s from docrepo %(alias)s" %
                           locals())
            elements.extend(self._links_listitems(items))
        return elements

    def _links_listitems(self, listitems):
        E = ElementMaker()
        elements = []
        for item in listitems:
            if len(item) == 2:
                (label, url) = item
                sublists = None
            else:
                (label, url, sublists) = item
            self.log.debug(
                " - %(label)s (%(url)s)" % locals())
            if url:
                li = E.li(E.a({'href': url}, label))
            else:
                li = E.li(label)
            if sublists:
                subelements = []
                for sublist in sublists:
                    subelements.extend(self._links_listitems(sublist))
                li.append(E.ul(*subelements))
            elements.append(li)
        return elements

    def _make_files(self, option, filedir, combinefile=None, combinefunc=None):
        urls = []
        buf = BytesIO()
        processed = set()
        # eg. self.config.cssfiles
        if getattr(self.config, option):  # it's possible to set eg
                                          # cssfiles=None when
                                          # creating the Resources
                                          # object
            for f in getattr(self.config, option):
                urls.append(self._process_file(f, buf, filedir, "ferenda.ini"))
                processed.add(f)
        for repo in self.repos:
            # FIXME: create a more generic way of optionally
            # signalling to a repo that "Hey, now it's time to create
            # your resources if you can"
            if repo.__class__.__name__ == "SFS" and option == "imgfiles":
                self.log.info("calling into SFS._makeimages()")
                LayeredConfig.set(repo.config, 'imgfiles', repo._makeimages())
            for f in getattr(repo.config, option):
                if f in processed:
                    continue
                urls.append(self._process_file(f, buf, filedir, repo.alias))
                processed.add(f)
        urls = list(filter(None, urls))
        if combinefile:
            txt = buf.getvalue().decode('utf-8')
            util.writefile(combinefile, combinefunc(txt))
            return [self._filepath_to_urlpath(combinefile, 2)]
        else:
            return urls

    def _process_file(self, filename, buf, destdir, origin=""):
        """
        Helper function to concatenate or copy CSS/JS (optionally
        processing them with e.g. Scss) or other files to correct place
        under the web root directory.

        :param filename: The name (relative to the ferenda package) of the file
        :param buf: A buffer into which the contents of the file is written
                    (if combineresources == True)
        :param destdir: The directory into which the file will be copied
                        (unless combineresources == True)
        :param origin: The source of the configuration that specifies this file
        :returns: The URL path of the resulting file, relative to the web root
                  (or None if combineresources == True)
        :rtype: str
        """
        if filename.startswith("http://") or filename.startswith("https://"):
            if self.config.combineresources:
                raise errors.ConfigurationError(
                    "makeresources: Can't use combineresources=True in combination with external js/css URLs (%s)" % filename)
            self.log.debug("Using external url %s" % filename)
            return filename
        try: 
            fp = self.resourceloader.openfp(filename, binary=True)
        except errors.ResourceNotFound:
            self.log.warning("file %(filename)s (specified in %(origin)s)"
                             " doesn't exist" % locals())
            return None

        (base, ext) = os.path.splitext(filename)

        if self.config.combineresources:
            self.log.debug("combining %s into buffer" % filename)
            d = fp.read()
            buf.write(d)
            fp.close()
            return None
        else:
            # FIXME: don't copy (at least not log) if the outfile
            # already exists.
            # self.log.debug("writing %s out to %s" % (filename, destdir))
            outfile = destdir + os.sep + os.path.basename(filename)
            if (os.path.islink(outfile) and
                os.path.relpath(os.path.join(os.path.dirname(outfile),
                                             os.readlink(outfile))) == util.name_from_fp(fp)):
                self.log.warning("%s is a symlink to source file %s, won't overwrite" % (outfile, util.name_from_fp(fp)))
            else:
                util.ensure_dir(outfile)
                with open(outfile, "wb") as fp2:
                    fp2.write(fp.read())
                fp.close()
            return self._filepath_to_urlpath(outfile, 2)

[docs]    def make_api_files(self):
        # this should create the following files under resourcedir
        # api/context.json (aliased to /json-ld/context.json if legacyapi)
        # api/terms.json (aliased to /var/terms.json if legacyapi)
        # api/common.json (aliased to /var/common.json if legacyapi)
        # MAYBE api/ui/  - copied from ferenda/res/ui
        files = []
        context = os.sep.join([self.resourcedir, "api", "context.json"])
        if self.config.legacyapi:
            self.log.info("Creating API files for legacyapi")
            contextpath = "/json-ld/context.json"
            termspath = "/var/terms"
            commonpath = "/var/common"
        else:
            # FIXME: create correct URL path
            contextpath = "/rsrc/api/context.json"
            termspath = "/rsrc/api/terms.json"
            commonpath = "/rsrc/api/common.json"
        util.ensure_dir(context)
        with open(context, "w") as fp:
            contextdict = self._get_json_context()
            s = json.dumps({"@context": contextdict}, separators=(', ', ': '),
                           indent=4, sort_keys=True)
            fp.write(s)
        files.append(self._filepath_to_urlpath(context, 2))

        common = os.sep.join([self.resourcedir, "api", "common.json"])
        terms = os.sep.join([self.resourcedir, "api", "terms.json"])

        for (filename, func, urlpath) in ((common, self._get_common_graph, commonpath),
                                          (terms,  self._get_term_graph,   termspath)):
            g = func(self.config.url + urlpath[1:])
            d = json.loads(g.serialize(format="json-ld", context=contextdict,
                                       indent=4).decode("utf-8"))
            # d might not contain a @context (if contextdict == {}, ie
            # no repos are given)
            if '@context' in d:
                d['@context'] = contextpath
            if self.config.legacyapi:
                d = self._convert_legacy_jsonld(d, self.config.url + urlpath[1:])
            with open(filename, "w") as fp:
                s = json.dumps(d, indent=4, separators=(', ', ': '), sort_keys=True)
                fp.write(s)
                
            files.append(self._filepath_to_urlpath(filename, 2))

        if self.config.legacyapi:
            # copy ui explorer app to <url>/rsrc/ui/ -- this does not get
            # included in files
            targetdir = os.sep.join([self.resourcedir, "ui"])
            self.resourceloader.extractdir("ui", targetdir)
        return files

    def _convert_legacy_jsonld(self, indata, rooturi):
        # the json structure should be a top node containing only
        # @context, iri (localhost:8000/var/terms), type (foaf:Document)
        # and topic - a list of dicts, where each dict looks like:
        #
        # {"iri" : "referatserie",
        #  "comment" : "Anger vilken referatserie som referatet eventuellt tillhör.",
        #  "label" : "Referatserie",
        #  "type" : "DatatypeProperty"}
        out = {}
        topics = []

        # the property containing the id/uri for the
        # record may be under @id or iri, depending on
        # whether self.config.legacyapi was in effect for
        # _get_json_context()
        if self.config.legacyapi:
            idfld = 'iri'
        else:
            idfld = '@id'

        # indata might be a mapping containing a list of mappings
        # under @graph, or it might just be the actual list.
        wantedlist = None
        if isinstance(indata, list):
            wantedlist = indata
        else:
            for topkey, topval in indata.items():
                if topkey == "@graph":
                    wantedlist = topval
                    break

        if not wantedlist:
            self.log.warning(
                "Couldn't find list of mappings in %s, topics will be empty" %
                indata)
        else:
            shortened = {}
            for subject in sorted(wantedlist, key=lambda x: x["iri"]):
                if subject[idfld] == rooturi:
                    for key, value in subject.items():
                        if key in (idfld, 'foaf:topic'):
                            continue
                        out[key] = value
                else:
                    for key in subject:
                        if isinstance(subject[key], list):
                            # make sure multiple values are sorted for
                            # the same reason as below
                            subject[key].sort()

                    # FIXME: We want to use just the urileaf for
                    # legacyapi clients (ie Standard instead of
                    # bibo:Standard) but to be proper json-ld, this
                    # requires that we define contexts for this. Which
                    # we don't (yet)
                    if ("iri" in subject and
                            ":" in subject["iri"] and
                            "://" not in subject["iri"]):
                        short = subject["iri"].split(":", 1)[1]
                        if short in shortened:
                            self.log.warning(
                                "Cannot shorten IRI %s -> %s, already defined (%s)" %
                                (subject["iri"], short, shortened[short]))
                            del subject["iri"]  # skips adding this to topics
                        else:
                            shortened[short] = subject["iri"]
                            subject["iri"] = short
                    if "iri" in subject and subject["iri"]:
                        topics.append(subject)

        # make sure the triples are in a predictable order, so we can
        # compare on the JSON level for testing
        out['topic'] = sorted(topics, key=lambda x: x[idfld])
        out['iri'] = rooturi
        if '@context' in indata:
            out['@context'] = indata['@context']
        return out

    def _get_json_context(self):
        data = {}
        # step 1: define all prefixes
        for repo in self.repos:
            for (prefix, ns) in repo.ns.items():
                if prefix in data:
                    assert data[prefix] == str(
                        ns), "Conflicting URIs for prefix %s" % prefix
                else:
                    data[prefix] = str(ns)

        # foaf and rdfs must always be defined prefixes
        data["foaf"] = "http://xmlns.com/foaf/0.1/"
        data["rdfs"] = "http://www.w3.org/2000/01/rdf-schema#"

        # the legacy api client expects some terms to be available using
        # shortened forms (eg 'label' instead of 'rdfs:label'), so we must
        # define them in our context
        if self.config.legacyapi:
            data['iri'] = "@id"
            data['type'] = "@type"
            data['label'] = 'rdfs:label'
            data['name'] = 'foaf:name'
            data['altLabel'] = 'skos:altLabel'
            # data["@language"] = "en" # how to set this? majority vote of
            # repos / documents? note that it's
            # only a default.
        return data

    def _get_term_graph(self, graphuri):
        # produce a rdf graph of the terms (classes and properties) in
        # the vocabs we're using. This should preferably entail
        # loading the vocabularies (stored as RDF/OWL documents), and
        # expressing all the things that are owl:*Property, owl:Class,
        # rdf:Property and rdf:Class. As an intermediate step, we
        # could have preprocessed rdf graphs (stored in
        # res/vocab/dcterms.ttl, res/vocab/bibo.ttl etc) derived from the
        # vocabularies and pull them in like we pull in namespaces in
        # self.ns The rdf graph should be rooted in an url (eg
        # http://localhost:8080/var/terms, and then have each term as
        # a foaf:topic. Each term should be described with its
        # rdf:type, rdfs:label (most important!) and possibly
        # rdfs:comment
        root = URIRef(graphuri)
        g = Graph()
        g.add((root, RDF.type, FOAF.Document))
        bigg = Graph()
        paths = set()
        for repo in self.repos:
            for p, ns in repo.ns.items():
                if p in ("rdf", "rdfs", "owl"):
                    continue
                g.bind(p, ns)
                resourcename = "vocab/%s.ttl" % p
                if repo.resourceloader.exists(resourcename):
                    ontopath = repo.resourceloader.filename(resourcename)
                    if ontopath not in paths:
                        self.log.debug("Loading vocabulary %s" % ontopath)
                        with open(ontopath) as onto:
                            bigg.parse(onto, format="turtle")
                        paths.add(ontopath)

        g.bind("foaf", "http://xmlns.com/foaf/0.1/")
        for (s, p, o) in bigg:
            if p in (RDF.type, RDFS.label, RDFS.comment):
                if isinstance(s, BNode): # occurs in the def of foaf:member
                    continue 
                g.add((root, FOAF.topic, s))  # unless we've already added it?
                if isinstance(o, Literal):  # remove language typing info
                    o = Literal(str(o))
                g.add((s, p, o))  # control duplicates somehow
        return g

    def _get_common_graph(self, graphuri):
        # create a graph with foaf:names for all entities (publishers,
        # publication series etc) that our data mentions.
        root = URIRef(graphuri)
        g = Graph()
        g.bind("skos", SKOS)
        g.bind("foaf", FOAF)
        g.add((root, RDF.type, FOAF.Document))
        paths = set()
        bigg = Graph()
        for repo in self.repos:
            for cls in inspect.getmro(repo.__class__):
                if hasattr(cls, "alias"):
                    resourcename = "extra/%s.ttl" % cls.alias
                    if repo.resourceloader.exists(resourcename):
                        commonpath = repo.resourceloader.filename(resourcename)
                        if commonpath not in paths:
                            self.log.debug("loading data %s" % commonpath)
                            with open(commonpath) as common:
                                bigg.parse(common, format="turtle")
                            paths.add(commonpath)
        for (s, p, o) in bigg:
            if p in (FOAF.name, SKOS.prefLabel,
                     SKOS.altLabel, BIBO.identifier):
                g.add((root, FOAF.topic, s))
                # strip any typing/langtagging (because of reasons)
                if isinstance(o, Literal):
                    o = Literal(str(o))
                g.add((s, p, o))
                # try to find a type
                g.add((s, RDF.type, bigg.value(s, RDF.type)))
        return g

    def _filepath_to_urlpath(self, path, keep_segments=2):
        """
        :param path: the full or relative filepath to transform into a urlpath
        :param keep_segments: the number of directory segments to keep (the ending filename is always kept)
        """
        # data/repo/rsrc/js/main.js, 3 -> repo/rsrc/js/main.js
        # /var/folders/tmp4q6b1g/rsrc/resources.xml, 1 -> rsrc/resources.xml
        # C:\docume~1\owner\locals~1\temp\tmpgbyuk7\rsrc\css\test.css, 2 - rsrc/css/test.css
        path = path.replace(os.sep, "/")
        urlpath = "/".join(path.split("/")[-(keep_segments + 1):])
        # print("_filepath_to_urlpath (%s): %s -> %s" % (keep_segments, path, urlpath))
        return urlpath