Source code for ferenda.compositerepository

# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

import os
import time
import logging
from collections import defaultdict, OrderedDict

from ferenda import DocumentRepository, DocumentStore
from ferenda import util, errors
from ferenda.decorators import updateentry

[docs]class CompositeStore(DocumentStore):
    """Custom store for CompositeRepository objects."""

    def __init__(self, datadir,
                 storage_policy="file",
                 compression=None,
                 docrepo_instances=None):
        self.datadir = datadir  # docrepo.datadir + docrepo.alias
        self.storage_policy = storage_policy
        if not docrepo_instances:
            docrepo_instances = OrderedDict()
        self.docrepo_instances = docrepo_instances
        self.basefiles = defaultdict(set)

[docs]    def list_basefiles_for(self, action, basedir=None, force=True):
        if not basedir:
            basedir = self.datadir
        # if action in ("parse", "news"): # NB: since symlinks from
        # <mainrepo>/entries to <subrepo>/entries is now created as
        # part of parse (in .copy_parsed), and possibly even as part
        # of download (see lagen.nu.myndfskr), we only need to query
        # subrepos prior to the parse step
        if action in ("parse"): 
            documents = set()
            for cls, inst in self.docrepo_instances.items():
                for basefile in inst.store.list_basefiles_for(action, force=force):
                    self.basefiles[cls].add(basefile)
                    if basefile not in documents:
                        documents.add(basefile)
                        yield basefile
        else:
            for basefile in super(CompositeStore,
                                  self).list_basefiles_for(action, basedir, force):
                yield basefile


[docs]class CompositeRepository(DocumentRepository):
    """Acts as a proxy for a list of sub-repositories.

    Calls the download() method for each of the included
    subrepos. Parse calls each subrepos parse() method in order until
    one succeeds, unless config.failfast is True. In that case any
    errors from the first subrepo is re-raised.

    """

    subrepos = ()  # list of classes

    """List of respository classes to use."""
    documentstore_class = CompositeStore
    extrabases = ()
    """List of mixin classes to add to each subrepo class."""

    supress_subrepo_logging = True

[docs]    def get_instance(self, instanceclass):
        if instanceclass not in self._instances:
            if hasattr(self, '_config'):
                config = self.config
            else:
                config = None

            # FIXME: this instance will be using a default
            # ResourceLoader, eg if a subrepo is at foo/bar.py, only
            # foo/bar/res will we in that resourceloaders path. This
            # causes problems, primarily if our CompositeRepository is
            # subclassed to somewhere else, eg subclass/bar.py -- we
            # might want to use resources at subclass/res instead.
            inst = instanceclass(config)
            # if we don't have a config object yet, the created
            # instance is just temporary -- don't save it
            if hasattr(self, '_config') and self.supress_subrepo_logging:
                # if the composite object has loglevel INFO, make the
                # subrepo have a slightly higher loglevel to avoid
                # creating almost-duplicate logging entries like:
                #
                # <time> subrepo1 INFO basefile: parse OK (2.42 sec)
                # <time> comprepo INFO basefile: parse OK (2.52 sec)
                #
                # Although not if the subrepo itself has subrepos
                #
                # FIXME: This is not good when using a compositerepo
                # for downloading
                if (self.log.getEffectiveLevel() == logging.INFO and
                    inst.log.getEffectiveLevel() == logging.INFO and
                    not isinstance(inst, CompositeRepository)):
                    inst.log.setLevel(inst.log.getEffectiveLevel() + 1)
            self._instances[instanceclass] = inst
        return self._instances[instanceclass]

    def __init__(self, config=None, **kwargs):
        self._instances = OrderedDict()
        # after this, self.config WILL be set (regardless of whether a
        # config object was provided or not
        super(CompositeRepository, self).__init__(config, **kwargs)

        newsubrepos = []
        for c in self.subrepos:  # populate self._instances
            if self.extrabases:
                bases = [x for x in self.extrabases if x not in c.__bases__]
                bases.append(c)
                c = type(c.__name__, tuple(bases), dict(c.__dict__))
                newsubrepos.append(c)
            if self.loadpath:
                c.loadpath = self.loadpath
            self.get_instance(c)
        if newsubrepos:
            self.subrepos = newsubrepos
        cls = self.documentstore_class

        self.store = cls(self.config.datadir + os.sep + self.alias,
                         storage_policy=self.storage_policy,
                         docrepo_instances=self._instances)
        if self.downloaded_suffix != ".html" and self.store.downloaded_suffixes == [".html"]:
            self.store.downloaded_suffixes = [self.downloaded_suffix]


[docs]    @classmethod
    def get_default_options(cls):
        # 1. Get options from superclass (NB: according to MRO...)
        opts = super(CompositeRepository, cls).get_default_options()
        # 2. Add extra options that ONLY exists in subrepos
        for c in cls.subrepos:
            for k, v in c.get_default_options().items():
                if k not in opts:
                    opts[k] = v
        # 3. add the extra 'failfast' option
        opts['failfast'] = False
        return opts

    # FIXME: we have no real need for this property getter override
    # (it's exactly the same as DocumentRepository.config) itself, but
    # since we want to override the setter, we need to use this to
    # define config.setter
    @property
    def config(self):
        return self._config

    @config.setter
    def config(self, config):
        # FIXME: This doesn't work (AttributeError: 'super' object has
        # no attribute 'config'), so we just copy the entire method
        # super(CompositeRepository, self).config = config
        self._config = config
        self.store = self.documentstore_class(
            config.datadir + os.sep + self.alias,
            storage_policy=self.storage_policy,
            docrepo_instances=self._instances)

[docs]    def download(self, basefile=None):
        for c in self.subrepos:
            inst = self.get_instance(c)
            # make sure that our store has access to our now
            # initialized subrepo objects
            if c not in self.store.docrepo_instances:
                self.store.docrepo_instances[c] = inst
            try:
                ret = inst.download(basefile)
            except Exception as e:  # be resilient
                loc = util.location_exception(e)
                self.log.error("download for %s failed: %s (%s)" % (c.alias, e, loc))
                ret = False
            if basefile and ret:
                # we got the doc we want, we're done!
                return

    # NOTE: this impl should NOT use the @managedparsing decorator --
    # but it can use @updateentry to catch warnings and errors thrown
    # by a subrepo
[docs]    @updateentry("parse")
    def parse(self, basefile):
        # first, check if we really need to parse. If any subrepo
        # returns that .store.needed(...., "parse") is false and we
        # have parsed file in the mainrepo, then we're done. This is
        # mainly to avoid the log message below (to be in line with
        # expected repo behaviour of not logging anything at severity
        # INFO if no real work was done), it does not noticably affect
        # performance
        force = (self.config.force is True or
                 self.config.parseforce is True)
        if not force:
            for c in self.subrepos:
                inst = self.get_instance(c)
                needed = inst.store.needed(basefile, "parse")
                if not needed and os.path.exists(self.store.parsed_path(basefile)):
                    self.log.debug("%s: Skipped" % basefile)
                    return True  # signals everything OK

        start = time.time()
        ret = False
        for inst in self.get_preferred_instances(basefile):
            try:
                ret = inst.parse(basefile)
            # Any error thrown (errors.ParseError or something
            # else) means we try next subrepo -- unless we want to
            # fail fast with a nice stacktrace during debugging.
            except Exception as e:
                if self.config.failfast:
                    raise
                else:
                    self.log.debug("%s: parse with %s failed: %s" %
                                   (basefile,
                                    inst.qualified_class_name(),
                                    str(e)))
                    ret = False
            if ret:
                break
            
        if ret:
            oldbasefile = basefile
            if ret is not True and ret != basefile:
                # this is a signal that parse discovered that the
                # basefile was adjusted. We should raise
                # DocumentRenamedError at the very end to get
                # updateentry do the right thing.
                basefile = ret
                # Also, touch the old parsed path so we don't
                # regenerate.
                with self.store.open_parsed(oldbasefile, "w"):
                    pass
                

            self.copy_parsed(basefile, inst)
            self.log.info("%(basefile)s parse OK (%(elapsed).3f sec)",
                          {'basefile': basefile,
                           'elapsed': time.time() - start})

            if basefile != oldbasefile:
                msg = "%s: In subrepo %s basefile turned out to really be %s" % (
                    oldbasefile, inst.qualified_class_name(), basefile)
                raise errors.DocumentRenamedError(True, msg, oldbasefile, basefile)
            return ret
        else:
            # subrepos should only contain those repos that actually
            # had a chance of parsing (basefile in
            # self.store.basefiles[c])
            subrepos_lbl = ", ".join([self.get_instance(x).qualified_class_name()
                                      for x in self.subrepos if basefile in self.store.basefiles[x]])
            if subrepos_lbl:
                raise errors.ParseError(
                    "No instance of %s was able to parse %s" %
                    (subrepos_lbl, basefile))
            else:
                raise errors.ParseError(
                    "No available instance (out of %s) had basefile %s" %
                    (len(self.subrepos), basefile))
                

[docs]    def get_preferred_instances(self, basefile):
        for c in self.subrepos:
            inst = self.get_instance(c)
            if (basefile in self.store.basefiles[c] or
                os.path.exists(inst.store.downloaded_path(basefile))):
                yield(inst)

[docs]    def copy_parsed(self, basefile, instance):
        # If the distilled and parsed links are recent, assume that
        # all external resources are OK as well
        
        if (not self.config.force and 
            util.outfile_is_newer([instance.store.distilled_path(basefile)],
                                  self.store.distilled_path(basefile)) and
            util.outfile_is_newer([instance.store.parsed_path(basefile)],
                                  self.store.parsed_path(basefile))):
            self.log.debug("%s: Attachments are (likely) up-to-date" % basefile)
            return

        util.link_or_copy(instance.store.documententry_path(basefile),
                          self.store.documententry_path(basefile))

        util.link_or_copy(instance.store.distilled_path(basefile),
                          self.store.distilled_path(basefile))

        util.link_or_copy(instance.store.parsed_path(basefile),
                          self.store.parsed_path(basefile))

        cnt = 0
        if instance.store.storage_policy == "dir":
            for attachment in instance.store.list_attachments(basefile, "parsed"):
                cnt += 1
                src = instance.store.parsed_path(basefile, attachment=attachment)
                target = self.store.parsed_path(basefile, attachment=attachment)
                util.link_or_copy(src, target)
            if cnt:
                self.log.debug("%s: Linked %s attachments from %s to %s" %
                               (basefile,
                                cnt,
                                os.path.dirname(instance.store.parsed_path(basefile)),
                                os.path.dirname(self.store.parsed_path(basefile))))