Source code for ferenda.compositerepository

# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

import os
import time
import logging
from collections import defaultdict, OrderedDict

from ferenda import DocumentRepository, DocumentStore
from ferenda import util, errors
from ferenda.decorators import updateentry

class CompositeStore(DocumentStore):
    """Custom store for CompositeRepository objects.

    In addition to the regular store attributes, this store keeps a
    mapping of sub-repository classes to their instances
    (``docrepo_instances``) and remembers, per sub-repository class,
    which basefiles that sub-repository reported for the ``parse``
    action (``basefiles``).
    """

    def __init__(self, datadir, storage_policy="file", compression=None,
                 docrepo_instances=None):
        """Set up the store.

        :param datadir: Root directory for this store.
        :param storage_policy: Stored as-is on ``self.storage_policy``.
        :param compression: Accepted for signature compatibility; this
                            constructor does not store or use it.
        :param docrepo_instances: Mapping of sub-repository class ->
                                  sub-repository instance. The mapping
                                  object itself is kept (not copied).
                                  If ``None``, a new empty
                                  ``OrderedDict`` is used.
        """
        # NOTE(review): the DocumentStore constructor is deliberately
        # not called here (the original did not call it either) --
        # confirm that no base-class initialisation is required.
        self.datadir = datadir  # docrepo.datadir + docrepo.alias
        self.storage_policy = storage_policy
        # Test against None rather than truthiness: a caller may hand
        # us a mapping that is still empty and fill it in later. A
        # truthiness test would silently swap that mapping for a
        # private one, and the store would never see the instances
        # added afterwards.
        if docrepo_instances is None:
            docrepo_instances = OrderedDict()
        self.docrepo_instances = docrepo_instances
        self.basefiles = defaultdict(set)

    def list_basefiles_for(self, action, basedir=None, force=True):
        """Yield the basefiles that are candidates for ``action``.

        For the ``"parse"`` action, every sub-repository's own store is
        queried (in ``docrepo_instances`` order). Each basefile is
        recorded in ``self.basefiles[<subrepo class>]`` and yielded
        once, the first time it is seen. For every other action the
        call is delegated to the ``DocumentStore`` implementation.

        :param action: Name of the action, e.g. ``"parse"``.
        :param basedir: Directory passed on to the base implementation;
                        defaults to ``self.datadir`` when falsy.
        :param force: Passed on unchanged to the underlying
                      ``list_basefiles_for`` calls.
        """
        if not basedir:
            basedir = self.datadir
        # NB: since symlinks from <mainrepo>/entries to
        # <subrepo>/entries are now created as part of parse (in
        # .copy_parsed), and possibly even as part of download (see
        # lagen.nu.myndfskr), we only need to query subrepos prior to
        # the parse step (this used to include "news" as well).
        #
        # This must be an equality test: ("parse") without a trailing
        # comma is just the string "parse", so "in" would be a
        # substring test.
        if action == "parse":
            documents = set()
            for cls, inst in self.docrepo_instances.items():
                for basefile in inst.store.list_basefiles_for(action,
                                                              force=force):
                    self.basefiles[cls].add(basefile)
                    if basefile not in documents:
                        documents.add(basefile)
                        yield basefile
        else:
            for basefile in super(CompositeStore,
                                  self).list_basefiles_for(action,
                                                           basedir,
                                                           force):
                yield basefile
class CompositeRepository(DocumentRepository):
    """Proxy repository that delegates to a list of sub-repositories.

    ``download()`` is forwarded to every sub-repository listed in
    ``subrepos``. ``parse()`` tries the sub-repositories one at a time,
    in order, until one of them succeeds -- unless ``config.failfast``
    is true, in which case the first error raised by a sub-repository
    is re-raised instead of being swallowed.
    """

    subrepos = ()  # list of classes
    """List of respository classes to use."""

    documentstore_class = CompositeStore

    extrabases = ()
    """List of mixin classes to add to each subrepo class."""

    # When true, a sub-repository logging at INFO gets its level bumped
    # one step if this repository also logs at INFO (see get_instance).
    supress_subrepo_logging = True

    def get_instance(self, instanceclass):
        """Return the cached instance of ``instanceclass``.

        The instance is created on first request, using this
        repository's config object if one has been set (otherwise
        ``None``), and stored in ``self._instances``.

        :param instanceclass: A sub-repository class.
        :returns: The instance of that class.
        """
        if instanceclass in self._instances:
            return self._instances[instanceclass]

        has_config = hasattr(self, '_config')
        subconfig = self.config if has_config else None
        # FIXME: this instance will be using a default
        # ResourceLoader, eg if a subrepo is at foo/bar.py, only
        # foo/bar/res will be in that resourceloader's path. This
        # causes problems, primarily if our CompositeRepository is
        # subclassed to somewhere else, eg subclass/bar.py -- we
        # might want to use resources at subclass/res instead.
        inst = instanceclass(subconfig)

        # NOTE(review): an older comment here said that an instance
        # created before a config object exists is temporary and should
        # not be saved, but the instance is in fact always cached.
        if has_config and self.supress_subrepo_logging:
            # If the composite object has loglevel INFO, make the
            # subrepo have a slightly higher loglevel to avoid
            # creating almost-duplicate logging entries like:
            #
            # <time> subrepo1 INFO basefile: parse OK (2.42 sec)
            # <time> comprepo INFO basefile: parse OK (2.52 sec)
            #
            # Although not if the subrepo itself has subrepos.
            #
            # FIXME: This is not good when using a compositerepo
            # for downloading
            both_at_info = (
                self.log.getEffectiveLevel() == logging.INFO and
                inst.log.getEffectiveLevel() == logging.INFO)
            if both_at_info and not isinstance(inst, CompositeRepository):
                inst.log.setLevel(inst.log.getEffectiveLevel() + 1)

        self._instances[instanceclass] = inst
        return inst

    def __init__(self, config=None, **kwargs):
        """Create the repository and one instance per sub-repository.

        If ``extrabases`` is non-empty, each sub-repository class is
        replaced by a dynamically created class that also inherits
        from the mixins not already among its direct bases, and
        ``self.subrepos`` is rebound to the list of these new classes.

        :param config: Config object, passed on to the base class.
        :param kwargs: Passed on to the base class constructor.
        """
        self._instances = OrderedDict()
        # After this, self.config WILL be set (regardless of whether a
        # config object was provided or not).
        super(CompositeRepository, self).__init__(config, **kwargs)

        mixed_in = []
        for subrepo in self.subrepos:  # populate self._instances
            if self.extrabases:
                missing = [base for base in self.extrabases
                           if base not in subrepo.__bases__]
                subrepo = type(subrepo.__name__,
                               tuple(missing) + (subrepo,),
                               dict(subrepo.__dict__))
                mixed_in.append(subrepo)
            if self.loadpath:
                subrepo.loadpath = self.loadpath
            self.get_instance(subrepo)
        if mixed_in:
            self.subrepos = mixed_in

        # Re-create the store now that self._instances is populated.
        self.store = self.documentstore_class(
            self.config.datadir + os.sep + self.alias,
            storage_policy=self.storage_policy,
            docrepo_instances=self._instances)
        if (self.downloaded_suffix != ".html" and
                self.store.downloaded_suffixes == [".html"]):
            self.store.downloaded_suffixes = [self.downloaded_suffix]

    @classmethod
    def get_default_options(cls):
        """Return the default options for this repository class.

        Starts from the superclass options, adds every option that is
        defined by a sub-repository but not already present, and sets
        ``failfast`` to ``False``.
        """
        # 1. Get options from superclass (NB: according to MRO...)
        opts = super(CompositeRepository, cls).get_default_options()
        # 2. Add extra options that ONLY exist in subrepos
        for subrepo in cls.subrepos:
            for key, value in subrepo.get_default_options().items():
                if key in opts:
                    continue
                opts[key] = value
        # 3. Add the extra 'failfast' option
        opts['failfast'] = False
        return opts

    # FIXME: we have no real need for this property getter override
    # (it's exactly the same as DocumentRepository.config) itself, but
    # since we want to override the setter, we need to use this to
    # define config.setter
    @property
    def config(self):
        """The config object of this repository."""
        return self._config

    @config.setter
    def config(self, config):
        # FIXME: This doesn't work (AttributeError: 'super' object has
        # no attribute 'config'), so we just copy the entire method
        # super(CompositeRepository, self).config = config
        self._config = config
        store_root = config.datadir + os.sep + self.alias
        self.store = self.documentstore_class(
            store_root,
            storage_policy=self.storage_policy,
            docrepo_instances=self._instances)

    def download(self, basefile=None):
        """Call ``download()`` on each sub-repository in turn.

        An exception from a sub-repository is logged as an error and
        treated as a falsy result. When ``basefile`` is given, the loop
        stops at the first sub-repository that returns a truthy result.

        :param basefile: Optional basefile, passed on unchanged to
                         each sub-repository.
        """
        for subrepo in self.subrepos:
            inst = self.get_instance(subrepo)
            # Make sure that our store has access to our now
            # initialized subrepo objects.
            if subrepo not in self.store.docrepo_instances:
                self.store.docrepo_instances[subrepo] = inst
            try:
                ret = inst.download(basefile)
            except Exception as e:  # be resilient
                loc = util.location_exception(e)
                self.log.error("download for %s failed: %s (%s)" %
                               (subrepo.alias, e, loc))
                ret = False
            if basefile and ret:
                # We got the doc we want, we're done!
                return

    # NOTE: this impl should NOT use the @managedparsing decorator --
    # but it can use @updateentry to catch warnings and errors thrown
    # by a subrepo
    @updateentry("parse")
    def parse(self, basefile):
        """Parse ``basefile`` with the first sub-repository that can.

        :param basefile: The basefile to parse.
        :returns: ``True`` if parsing was skipped, otherwise the truthy
                  value returned by the sub-repository that succeeded.
        :raises errors.DocumentRenamedError: If the successful
            sub-repository returned a different basefile.
        :raises errors.ParseError: If no sub-repository succeeded.
        """
        # First, check if we really need to parse. If any subrepo
        # returns that .store.needed(...., "parse") is false and we
        # have a parsed file in the mainrepo, then we're done. This is
        # mainly to avoid the log message below (to be in line with
        # expected repo behaviour of not logging anything at severity
        # INFO if no real work was done); it does not noticeably
        # affect performance.
        forced = (self.config.force is True or
                  self.config.parseforce is True)
        if not forced:
            for subrepo in self.subrepos:
                candidate = self.get_instance(subrepo)
                if candidate.store.needed(basefile, "parse"):
                    continue
                if os.path.exists(self.store.parsed_path(basefile)):
                    self.log.debug("%s: Skipped" % basefile)
                    return True  # signals everything OK

        start = time.time()
        ret = False
        for inst in self.get_preferred_instances(basefile):
            try:
                ret = inst.parse(basefile)
            except Exception as e:
                # Any error thrown (errors.ParseError or something
                # else) means we try next subrepo -- unless we want to
                # fail fast with a nice stacktrace during debugging.
                if self.config.failfast:
                    raise
                self.log.debug("%s: parse with %s failed: %s" %
                               (basefile,
                                inst.qualified_class_name(),
                                str(e)))
                ret = False
            if ret:
                break

        if not ret:
            # The label should only contain those repos that actually
            # had a chance of parsing (basefile in
            # self.store.basefiles[c]).
            had_basefile = [self.get_instance(x).qualified_class_name()
                            for x in self.subrepos
                            if basefile in self.store.basefiles[x]]
            subrepos_lbl = ", ".join(had_basefile)
            if subrepos_lbl:
                raise errors.ParseError(
                    "No instance of %s was able to parse %s" %
                    (subrepos_lbl, basefile))
            raise errors.ParseError(
                "No available instance (out of %s) had basefile %s" %
                (len(self.subrepos), basefile))

        oldbasefile = basefile
        if ret is not True and ret != basefile:
            # This is a signal that parse discovered that the
            # basefile was adjusted. We should raise
            # DocumentRenamedError at the very end to get
            # updateentry to do the right thing.
            basefile = ret
            # Also, touch the old parsed path so we don't
            # regenerate.
            with self.store.open_parsed(oldbasefile, "w"):
                pass
        self.copy_parsed(basefile, inst)
        self.log.info("%(basefile)s parse OK (%(elapsed).3f sec)",
                      {'basefile': basefile,
                       'elapsed': time.time() - start})
        if basefile != oldbasefile:
            msg = "%s: In subrepo %s basefile turned out to really be %s" % (
                oldbasefile, inst.qualified_class_name(), basefile)
            raise errors.DocumentRenamedError(True, msg, oldbasefile,
                                              basefile)
        return ret

    def get_preferred_instances(self, basefile):
        """Yield the sub-repository instances that may parse ``basefile``.

        An instance qualifies if its class is recorded in
        ``self.store.basefiles`` as having the basefile, or if its
        store's downloaded path for the basefile exists. Instances are
        yielded in ``self.subrepos`` order.
        """
        for subrepo in self.subrepos:
            inst = self.get_instance(subrepo)
            listed = basefile in self.store.basefiles[subrepo]
            if listed or os.path.exists(
                    inst.store.downloaded_path(basefile)):
                yield inst

    def copy_parsed(self, basefile, instance):
        """Link or copy a sub-repository's parse results into our store.

        Handles the document entry, the distilled file and the parsed
        file, plus any parsed attachments when the sub-repository's
        store uses the ``"dir"`` storage policy. Does nothing when
        ``config.force`` is falsy and our distilled and parsed files
        are both newer than the sub-repository's.

        :param basefile: The basefile whose files should be copied.
        :param instance: The sub-repository instance that parsed it.
        """
        # If the distilled and parsed links are recent, assume that
        # all external resources are OK as well.
        if not self.config.force:
            distilled_fresh = util.outfile_is_newer(
                [instance.store.distilled_path(basefile)],
                self.store.distilled_path(basefile))
            if distilled_fresh and util.outfile_is_newer(
                    [instance.store.parsed_path(basefile)],
                    self.store.parsed_path(basefile)):
                self.log.debug("%s: Attachments are (likely) up-to-date" %
                               basefile)
                return

        for accessor in ("documententry_path",
                         "distilled_path",
                         "parsed_path"):
            util.link_or_copy(getattr(instance.store, accessor)(basefile),
                              getattr(self.store, accessor)(basefile))

        if instance.store.storage_policy != "dir":
            return

        cnt = 0
        for attachment in instance.store.list_attachments(basefile,
                                                          "parsed"):
            cnt += 1
            src = instance.store.parsed_path(basefile,
                                             attachment=attachment)
            target = self.store.parsed_path(basefile,
                                            attachment=attachment)
            util.link_or_copy(src, target)
        if cnt:
            self.log.debug(
                "%s: Linked %s attachments from %s to %s" %
                (basefile, cnt,
                 os.path.dirname(instance.store.parsed_path(basefile)),
                 os.path.dirname(self.store.parsed_path(basefile))))