Source code for ferenda.sources.general.skeleton

# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

import sys
import re
from urllib.parse import urljoin

from rdflib import Graph, URIRef, Literal, RDF, RDFS
from lxml import etree
import requests

from ferenda import DocumentRepository, TripleStore


class Skeleton(DocumentRepository):

    """Utility docrepo to fetch all RDF data from a triplestore (either
    our own triple store, or a remote one, fetched through the combined
    ferenda atom feed), find those resources that are referred to but
    not present in the data (usually older documents that are not
    available in electronic form), and create "skeleton entries" for
    those resources.

    """
    alias = "closet"
    start_url = "http://rinfo.demo.lagrummet.se/feed/current"
    downloaded_suffix = ".nt"

    def download(self):
        graph = self.download_from_triplestore()
        # or, alternatively
        graph = self.download_from_atom()

    def download_from_triplestore(self):
        # Find every triple whose object is a resource that never
        # occurs as a subject, i.e. resources that are referenced but
        # not described anywhere in the store.
        sq = """SELECT ?something ?references ?uri
                WHERE {
                    ?something ?references ?uri .
                    FILTER NOT EXISTS { ?uri ?p ?o }
                }"""
        store = TripleStore(self.config.storetype,
                            self.config.storelocation,
                            self.config.storerepository)
        with self.store.open_downloaded("biggraph", "w") as fp:
            for row in store.select(sq):
                fp.write("<%(something)s> <%(references)s> <%(uri)s> .\n" % row)

    def download_from_atom(self):
        refresh = self.config.force
        feed_url = self.start_url
        ns = 'http://www.w3.org/2005/Atom'
        done = False
        biggraph = Graph()
        biggraph.bind("dcterms", self.ns['dcterms'])
        biggraph.bind("rpubl", self.ns['rpubl'])
        while not done:
            self.log.info("Feed: %s" % feed_url)
            tree = etree.fromstring(requests.get(feed_url).content)
            for entry in tree.findall('{%s}entry' % (ns)):
                try:
                    self.log.info("  Examining entry")
                    rdf_url = None
                    for node in entry:
                        if (node.tag == "{%s}link" % ns and
                                node.get('type') == 'application/rdf+xml'):
                            rdf_url = urljoin(feed_url, node.get("href"))
                        elif (node.tag == "{%s}content" % ns and
                              node.get('type') == 'application/rdf+xml'):
                            rdf_url = urljoin(feed_url, node.get("src"))
                    if rdf_url:
                        self.log.info("    RDF: %s" % rdf_url)
                        g = Graph()
                        g.parse(data=requests.get(rdf_url).text, format="xml")
                        # Keep only triples whose object is a URIRef
                        # within our own URI space
                        for triple in g:
                            s, p, o = triple
                            if (not isinstance(o, URIRef) or
                                    not str(o).startswith(self.config.url)):
                                g.remove(triple)
                        self.log.debug("      Adding %s triples" % len(g))
                        biggraph += g
                except KeyboardInterrupt:
                    raise
                except:
                    e = sys.exc_info()[1]
                    self.log.error("ERROR: %s" % e)
            done = True
            for link in list(tree.findall('{%s}link' % (ns))):
                self.log.info("  Examining link")
                if link.get('rel') == 'prev-archive':
                    feed_url = urljoin(feed_url, link.get("href"))
                    done = False
            # done = True
        self.log.info("Done downloading")
        with self.store.open_downloaded("biggraph", "wb") as fp:
            fp.write(biggraph.serialize(format="nt"))

    def parse(self, basefile, otherrepos=[]):
        # Find out possible skeleton entries by loading the entire
        # graph of resource references, and find resources that only
        # exist as objects.
        #
        # Note: if we used download_from_triplestore we know that this
        # list is clean -- we could just iterate the graph w/o
        # filtering
        g = Graph()
        self.log.info("Parsing %s" % basefile)
        g.parse(self.store.downloaded_path(basefile), format="nt")
        self.log.info("Compiling object set")
        # create a uri -> True dict mapping
        objects = dict.fromkeys(
            [str(o).split("#")[0] for (s, p, o) in g], True)
        self.log.info("Compiling subject set")
        subjects = dict.fromkeys(
            [str(s).split("#")[0] for (s, p, o) in g], True)
        self.log.info("%s objects, %s subjects. Iterating through existing "
                      "objects" % (len(objects), len(subjects)))
        for o in objects:
            if not o.startswith(self.config.url):
                continue
            if '9999:999' in o:
                continue
            if o in subjects:
                continue
            for repo in otherrepos:
                skelbase = repo.basefile_from_uri(o)
                if skelbase:
                    skel = repo.triples_from_uri(o)  # need to impl
                    with self.store.open_distilled(skelbase, "wb") as fp:
                        fp.write(skel.serialize(format="pretty-xml"))
                    self.log.info("Created skel for %s" % o)
        return True

    # FIXME: Move this to SwedishLegalSource -- also unify
    # triples_from_uri with SwedishLegalSource.infer_metadata(basefile)
    RATTSFALL = 1
    KONSOLIDERAD = 2
    FORESKRIFT = 3
    PROPOSITION = 4
    UTREDNING = 5

    def triples_from_uri(self, uri):

        types = {self.RATTSFALL:
                 self.ns['rpubl']["Rattsfallsreferat"],
                 self.KONSOLIDERAD:
                 self.ns['rpubl']["KonsolideradGrundforfattning"],
                 self.FORESKRIFT:
                 self.ns['rpubl']["Myndighetsforeskrift"],
                 self.PROPOSITION:
                 self.ns['rpubl']["Proposition"],
                 self.UTREDNING:
                 self.ns['rpubl']["Utredningsbetankande"],
                 }

        # Maps keys used by the internal dictionaries that LegalRef
        # constructs, which in turn are modelled after production rule
        # names in the EBNF grammar.
        predicate = {"type": RDF.type,
                     "rf": self.ns['rpubl']["rattsfallspublikation"],
                     "fs": self.ns['rpubl']["forfattningssamling"],
                     "artal": self.ns['rpubl']["artal"],
                     "lopnummer": self.ns['rpubl']["lopnummer"],
                     "sidnummer": self.ns['rpubl']["sidnummer"],
                     "arsutgava": self.ns['rpubl']["arsutgava"],
                     "kapitel": self.ns['rpubl']["kapitel"],
                     "paragraf": self.ns['rpubl']["paragraf"],
                     "identifier": self.ns['dcterms']["identifier"],
                     }

        patterns = {self.RATTSFALL:
                    re.compile(
                        r"http://rinfo.lagrummet.se/publ/rf/(?P<rf>\w+)/(?P<arsutgava>\d+)(/|)(?P<sep>[s:])(_(?P<sidnummer>\d+)|(?P<lopnummer>\d+))").match,
                    self.KONSOLIDERAD:
                    # NB: These shouldn't have any
                    # rpubl:forfattningssamling triples.
                    re.compile(
                        r"http://rinfo.lagrummet.se/publ/sfs/(?P<arsutgava>\d{4}):(?P<lopnummer>\w+)#?(k_(?P<kapitel>[0-9a-z]+))?(p_(?P<paragraf>[0-9a-z]+))?").match,
                    self.FORESKRIFT:
                    re.compile(
                        r"http://rinfo.lagrummet.se/publ/(?P<fs>[\w-]+fs)/(?P<arsutgava>\d{4}):(?P<lopnummer>\w+)").match,
                    self.UTREDNING:
                    re.compile(
                        r"http://rinfo.lagrummet.se/publ/(?P<utr>(sou|ds))/(?P<arsutgava>\d{4}(/\d{2}|)):(?P<lopnummer>\w+)").match,
                    self.PROPOSITION:
                    re.compile(
                        r"http://rinfo.lagrummet.se/publ/(?P<prop>prop)/(?P<arsutgava>\d{4}(/\d{2}|)):(?P<lopnummer>\w+)").match
                    }

        identifier = {self.RATTSFALL:
                      "%(rf)s %(arsutgava)s%(sep)s%(lopnummer)s",
                      self.KONSOLIDERAD:
                      "SFS %(arsutgava)s:%(lopnummer)s",
                      self.FORESKRIFT:
                      "%(fs)s %(arsutgava)s:%(lopnummer)s",
                      self.PROPOSITION:
                      "Prop. %(arsutgava)s:%(lopnummer)s",
                      self.UTREDNING:
                      "%(utr)s. %(arsutgava)s:%(lopnummer)s"
                      }

        dictionary = None
        for (pid, pattern) in list(patterns.items()):
            m = pattern(uri)
            if m:
                dictionary = m.groupdict()
                dictionary["type"] = pid
                break

        if not dictionary:
            raise ValueError("Can't parse URI %s" % uri)

        graph = Graph()
        for key, value in list(self.ns.items()):
            graph.bind(key, value)
        subj = URIRef(uri)
        for key in dictionary:
            if dictionary[key] is None:
                continue
            if key.startswith("_"):
                continue
            if key == "type":
                graph.add((subj, RDF.type, URIRef(types[dictionary[key]])))
            elif key in ("fs", "rf", "utr"):
                uri = "http://rinfo.lagrummet.se/serie/%s/%s" % (
                    key, dictionary[key])
                graph.add((subj, predicate[key], URIRef(uri)))
            elif key == "prop":
                pass
                # uri = "http://rinfo.lagrummet.se/serie/%s" % key
                # graph.add((subj, predicate[key], URIRef(uri)))
            elif key == "sep":
                pass
            else:
                graph.add((subj, predicate[key], Literal(dictionary[key])))

        id_templ = identifier[dictionary["type"]]
        if 'sep' in dictionary and dictionary['sep'] == "s":
            # Extra handling of NJA URIs
            dictionary['sep'] = " s. "
            dictionary['lopnummer'] = dictionary['sidnummer']
        for key in ('fs', 'rf'):
            if key in dictionary:
                dictionary[key] = dictionary[key].upper()

        graph.add(
            (subj, predicate["identifier"], Literal(id_templ % dictionary)))
        # The rdfs:comment notes (in Swedish) that the document is not
        # available in electronic form in the legal information system.
        graph.add(
            (subj, RDFS.comment,
             Literal("Detta dokument finns inte i elektronisk form "
                     "i rättsinformationssystemet")))
        return graph
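
A minimal usage sketch of triples_from_uri, illustrating the skeleton graph built for a single referenced-but-missing resource. The repo instantiation, the example URI, and the assumption that the 'rpubl' and 'dcterms' prefixes are present in the repo's namespace mapping are illustrative assumptions, not something this source file guarantees.

    from ferenda.sources.general.skeleton import Skeleton

    repo = Skeleton()  # assumed: a default-configured repo object works here
    # Example URI for a consolidated statute (assumed, for illustration only)
    uri = "http://rinfo.lagrummet.se/publ/sfs/1999:175"
    skel = repo.triples_from_uri(uri)
    for triple in skel:
        print(triple)
    # Roughly expected triples: rdf:type rpubl:KonsolideradGrundforfattning,
    # dcterms:identifier "SFS 1999:175", rpubl:arsutgava "1999",
    # rpubl:lopnummer "175", and an rdfs:comment noting that the document
    # is not available in electronic form.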