# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from future import standard_library
standard_library.install_aliases()
from builtins import *

import re
from urllib.parse import urljoin

import requests
from lxml import etree
from rdflib import Graph, URIRef, Literal, RDF, RDFS

from ferenda import DocumentRepository, TripleStore
class Skeleton(DocumentRepository):
"""Utility docrepo to fetch all RDF data from a triplestore (either
our triple store, or a remote one, fetched through the combined
ferenda atom feed), find out those resources that are referred
to but not present in the data (usually older documents that
are not available in electronic form), and create "skeleton
entries" for those resources.
"""
alias = "closet"
start_url = "http://rinfo.demo.lagrummet.se/feed/current"
downloaded_suffix = ".nt"
    def download(self):
        graph = self.download_from_triplestore()
        # or, alternatively:
        graph = self.download_from_atom()
    def download_from_triplestore(self):
        # select all resources that occur as objects but never as
        # subjects, i.e. documents that are referred to but not
        # present in the store
        sq = """SELECT ?something ?references ?uri
                WHERE { ?something ?references ?uri .
                        FILTER NOT EXISTS { ?uri ?p ?o } }"""
        store = TripleStore.connect(self.config.storetype,
                                    self.config.storelocation,
                                    self.config.storerepository)
        with self.store.open_downloaded("biggraph", "w") as fp:
            for row in store.select(sq, format="python"):
                fp.write("<%(something)s> <%(references)s> <%(uri)s> .\n" % row)
    def download_from_atom(self):
        refresh = self.config.force
        feed_url = self.start_url
        ns = 'http://www.w3.org/2005/Atom'
        done = False
        biggraph = Graph()
        biggraph.bind("dcterms", self.ns['dcterms'])
        biggraph.bind("rpubl", self.ns['rpubl'])
        while not done:
            self.log.info("Feed: %s" % feed_url)
            # etree.parse() expects a file or filename, so parse the
            # fetched bytes directly instead
            tree = etree.fromstring(requests.get(feed_url).content)
            for entry in tree.findall('{%s}entry' % (ns)):
                try:
                    self.log.info(" Examining entry")
                    rdf_url = None
                    for node in entry:
                        if (node.tag == "{%s}link" % ns and
                                node.get('type') == 'application/rdf+xml'):
                            rdf_url = urljoin(feed_url, node.get("href"))
                        elif (node.tag == "{%s}content" % ns and
                              node.get('type') == 'application/rdf+xml'):
                            rdf_url = urljoin(feed_url, node.get("src"))
                    if rdf_url:
                        self.log.info(" RDF: %s" % rdf_url)
                        g = Graph()
                        g.parse(data=requests.get(rdf_url).text, format="xml")
                        # keep only triples whose objects are URIRefs
                        # within our own URI space; iterate over a copy
                        # since we remove triples from g as we go
                        for triple in list(g):
                            s, p, o = triple
                            if (not isinstance(o, URIRef) or
                                    not str(o).startswith(self.config.url)):
                                g.remove(triple)
                        self.log.debug(" Adding %s triples" % len(g))
                        biggraph += g
                except KeyboardInterrupt:
                    raise
                except Exception as e:
                    self.log.error("ERROR: %s" % e)
            done = True
            for link in list(tree.findall('{%s}link' % (ns))):
                self.log.info(" Examining link")
                if link.get('rel') == 'prev-archive':
                    # continue with the previous archive page of the feed
                    feed_url = urljoin(feed_url, link.get("href"))
                    done = False
                    # done = True
        self.log.info("Done downloading")
        with self.store.open_downloaded("biggraph", "wb") as fp:
            fp.write(biggraph.serialize(format="nt"))
    def parse(self, basefile, otherrepos=()):
        # Find possible skeleton entries by loading the entire graph
        # of resource references, and finding resources that only
        # exist as objects.
        #
        # Note: if we used download_from_triplestore we know that this
        # list is clean -- we could just iterate the graph w/o filtering
        g = Graph()
        self.log.info("Parsing %s" % basefile)
        g.parse(self.store.downloaded_path(basefile), format="nt")
        self.log.info("Compiling object set")
        # the set of every URI (fragment stripped) occurring as an object
        objects = set(str(o).split("#")[0] for (s, p, o) in g)
        self.log.info("Compiling subject set")
        # ... and every URI (fragment stripped) occurring as a subject
        subjects = set(str(s).split("#")[0] for (s, p, o) in g)
        self.log.info("%s objects, %s subjects. Iterating through existing objects" %
                      (len(objects), len(subjects)))
        for o in objects:
            if not o.startswith(self.config.url):
                continue
            if '9999:999' in o:
                continue
            if o in subjects:
                continue
            for repo in otherrepos:
                skelbase = repo.basefile_from_uri(o)
                if skelbase:
                    skel = repo.triples_from_uri(o)  # need to impl
                    with self.store.open_distilled(skelbase, "wb") as fp:
                        fp.write(skel.serialize(format="pretty-xml"))
                    self.log.info("Created skel for %s" % o)
        return True
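    # To illustrate the filtering above (illustrative URIs): if the
    # downloaded graph contains only the statement
    #
    #   <http://example.org/res/sfs/1960:729> dcterms:references
    #       <http://example.org/res/sfs/1686:1125> .
    #
    # then sfs/1686:1125 occurs as an object but never as a subject,
    # so a skeleton entry is created for it (provided some repo in
    # otherrepos recognizes the URI).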
    # FIXME: Move this to SwedishLegalSource -- also unify
    # triples_from_uri with SwedishLegalSource.infer_metadata(basefile)
    RATTSFALL = 1
    KONSOLIDERAD = 2
    FORESKRIFT = 3
    PROPOSITION = 4
    UTREDNING = 5
    def triples_from_uri(self, uri):
        types = {self.RATTSFALL: self.ns['rpubl']["Rattsfallsreferat"],
                 self.KONSOLIDERAD: self.ns['rpubl']["KonsolideradGrundforfattning"],
                 self.FORESKRIFT: self.ns['rpubl']["Myndighetsforeskrift"],
                 self.PROPOSITION: self.ns['rpubl']["Proposition"],
                 self.UTREDNING: self.ns['rpubl']["Utredningsbetankande"],
                 }
        # Maps keys used by the internal dictionaries that LegalRef
        # constructs, which in turn are modelled after production rule
        # names in the EBNF grammar.
        predicate = {"type": RDF.type,
                     "rf": self.ns['rpubl']["rattsfallspublikation"],
                     "fs": self.ns['rpubl']["forfattningssamling"],
                     # assumption: utredning series links use rpubl:utrSerie
                     # (the "utr" branch below needs some mapping here)
                     "utr": self.ns['rpubl']["utrSerie"],
                     "artal": self.ns['rpubl']["artal"],
                     "lopnummer": self.ns['rpubl']["lopnummer"],
                     "sidnummer": self.ns['rpubl']["sidnummer"],
                     "arsutgava": self.ns['rpubl']["arsutgava"],
                     "kapitel": self.ns['rpubl']["kapitel"],
                     "paragraf": self.ns['rpubl']["paragraf"],
                     "identifier": self.ns['dcterms']["identifier"],
                     }
        patterns = {self.RATTSFALL:
                    re.compile(
                        r"http://rinfo.lagrummet.se/publ/rf/(?P<rf>\w+)/(?P<arsutgava>\d+)(/|)(?P<sep>[s:])(_(?P<sidnummer>\d+)|(?P<lopnummer>\d+))").match,
                    self.KONSOLIDERAD:
                    # NB: These shouldn't have any
                    # rpubl:forfattningssamling triples.
                    re.compile(
                        r"http://rinfo.lagrummet.se/publ/sfs/(?P<arsutgava>\d{4}):(?P<lopnummer>\w+)#?(k_(?P<kapitel>[0-9a-z]+))?(p_(?P<paragraf>[0-9a-z]+))?").match,
                    self.FORESKRIFT:
                    re.compile(
                        r"http://rinfo.lagrummet.se/publ/(?P<fs>[\w-]+fs)/(?P<arsutgava>\d{4}):(?P<lopnummer>\w+)").match,
                    self.UTREDNING:
                    re.compile(
                        r"http://rinfo.lagrummet.se/publ/(?P<utr>(sou|ds))/(?P<arsutgava>\d{4}(/\d{2}|)):(?P<lopnummer>\w+)").match,
                    self.PROPOSITION:
                    re.compile(
                        r"http://rinfo.lagrummet.se/publ/(?P<prop>prop)/(?P<arsutgava>\d{4}(/\d{2}|)):(?P<lopnummer>\w+)").match
                    }
        identifier = {self.RATTSFALL: "%(rf)s %(arsutgava)s%(sep)s%(lopnummer)s",
                      self.KONSOLIDERAD: "SFS %(arsutgava)s:%(lopnummer)s",
                      self.FORESKRIFT: "%(fs)s %(arsutgava)s:%(lopnummer)s",
                      self.PROPOSITION: "Prop. %(arsutgava)s:%(lopnummer)s",
                      self.UTREDNING: "%(utr)s. %(arsutgava)s:%(lopnummer)s"
                      }
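        # For example (illustrative URIs): a case-law URI like
        #   http://rinfo.lagrummet.se/publ/rf/nja/2009/s_695
        # matches the RATTSFALL pattern, while a statute URI like
        #   http://rinfo.lagrummet.se/publ/sfs/1999:175
        # matches KONSOLIDERAD. Note that pattern order matters: an
        # sfs URI would also match the FORESKRIFT pattern, so this
        # relies on dicts preserving insertion order (Python 3.7+).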
        dictionary = None
        for (pid, pattern) in list(patterns.items()):
            m = pattern(uri)
            if m:
                dictionary = m.groupdict()
                dictionary["type"] = pid
                break
        if not dictionary:
            raise ValueError("Can't parse URI %s" % uri)
        graph = Graph()
        for key, value in list(self.ns.items()):
            graph.bind(key, value)
        subj = URIRef(uri)
        for key in dictionary:
            if dictionary[key] is None:
                continue
            if key.startswith("_"):
                continue
            if key == "type":
                graph.add((subj, RDF.type, URIRef(types[dictionary[key]])))
            elif key in ("fs", "rf", "utr"):
                # link to the publication series the document appears in
                serie_uri = "http://rinfo.lagrummet.se/serie/%s/%s" % (
                    key, dictionary[key])
                graph.add((subj, predicate[key], URIRef(serie_uri)))
            elif key == "prop":
                pass
                # serie_uri = "http://rinfo.lagrummet.se/serie/%s" % key
                # graph.add((subj, predicate[key], URIRef(serie_uri)))
            elif key == "sep":
                pass
            else:
                graph.add((subj, predicate[key], Literal(dictionary[key])))
        id_templ = identifier[dictionary["type"]]
        if 'sep' in dictionary and dictionary['sep'] == "s":
            # Extra handling of NJA URIs, where "s_695" denotes a page
            # number rather than an ordinal number
            dictionary['sep'] = " s. "
            dictionary['lopnummer'] = dictionary['sidnummer']
        for key in ('fs', 'rf'):
            if key in dictionary:
                dictionary[key] = dictionary[key].upper()
        graph.add(
            (subj, predicate["identifier"], Literal(id_templ % dictionary)))
        # "This document is not available in electronic form in the
        # legal information system"
        graph.add(
            (subj, RDFS.comment, Literal("Detta dokument finns inte i elektronisk form i rättsinformationssystemet")))
        return graph
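    # A sketch of what triples_from_uri is expected to produce for a
    # case-law URI (illustrative URI; Turtle output abbreviated):
    #
    #   >>> g = repo.triples_from_uri(
    #   ...     "http://rinfo.lagrummet.se/publ/rf/nja/2009/s_695")
    #   >>> print(g.serialize(format="n3"))
    #   <http://rinfo.lagrummet.se/publ/rf/nja/2009/s_695>
    #       a rpubl:Rattsfallsreferat ;
    #       rpubl:rattsfallspublikation <http://rinfo.lagrummet.se/serie/rf/nja> ;
    #       rpubl:arsutgava "2009" ;
    #       rpubl:sidnummer "695" ;
    #       dcterms:identifier "NJA 2009 s. 695" ;
    #       rdfs:comment "Detta dokument finns inte i elektronisk form..." .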