# Source code for ferenda.sources.general.keyword

# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

# system libraries
import re
import os
from collections import defaultdict
import unicodedata
import gzip

# 3rdparty libs
import requests
from lxml import etree
from rdflib import Literal, Namespace
import yaml

# my libs
from ferenda import util
from ferenda import DocumentRepository, TripleStore, DocumentStore, Describer
from ferenda.decorators import managedparsing
from ferenda.elements import Body

class KeywordStore(DocumentStore):
    """Document store for keyword terms.

    Files are sharded under their initial letter on disk, and ":" (which
    may be problematic in filesystem paths) is percent-encoded.
    """

    def basefile_to_pathfrag(self, basefile):
        # Shard under the initial letter, eg "Avtal" => "A/Avtal", and
        # percent-encode ":" because it messes with the filesystem (maybe?)
        shard = basefile[0]
        escaped = basefile.replace(":", "%3A")
        return "{}/{}".format(shard, escaped)

    def pathfrag_to_basefile(self, pathfrag):
        # Inverse of basefile_to_pathfrag: decode "%3A" back to ":" and
        # strip the single-letter shard prefix.
        decoded = super(KeywordStore, self).pathfrag_to_basefile(
            pathfrag).replace("%3A", ":")
        _, basefile = decoded.split("/", 1)
        return basefile

[docs]class Keyword(DocumentRepository): """Implements support for 'keyword hubs', conceptual resources which themselves aren't related to any document, but to which other documents are related. As an example, if a docrepo has documents that each contains a set of keywords, and the docrepo parse implementation extracts these keywords as ``dcterms:subject`` resources, this docrepo creates a document resource for each of those keywords. The main content for the keyword may come from the :class:`~ferenda.sources.general.MediaWiki` docrepo, and all other documents in any of the repos that refer to this concept resource are automatically listed. """ # FIXME be more comprehensible alias = "keyword" downloaded_suffix = ".txt" download_archive = False documentstore_class = KeywordStore xslt_template = "xsl/keyword.xsl" rdf_type = Namespace(util.ns['skos']).Concept namespaces = ['skos', 'prov', 'dcterms'] invalid_term_start = [".", "/", ":"] invalid_term_end = [".", ","] term_max_len = 100 term_min_len = 2 def __init__(self, config=None, **kwargs): super(Keyword, self).__init__(config, **kwargs) self.mediawikirepo = None if self.config._parent and hasattr(self.config._parent, 'mediawiki'): from ferenda.manager import _load_class classname = getattr(self.config._parent.mediawiki, 'class') cls = _load_class(classname) self.mediawikirepo = cls(self.config._parent.mediawiki, keywordrepo=self) # extra functions -- subclasses can add / remove from this self.termset_funcs = [self.download_termset_mediawiki, self.download_termset_wikipedia] # self.termset_funcs = [] @classmethod def get_default_options(cls): opts = super(Keyword, cls).get_default_options() # The API endpoint URLs change with MW language opts['mediawikiexport'] = 'http://localhost/wiki/Special:Export/%s(basefile)' opts[ 'wikipediatitles'] = '' return opts def canonical_uri(self, basefile): # keywords often contain spaces -- convert to underscore to get nicer URIs return super(Keyword, 
self).canonical_uri(basefile.replace(" ", "_")) def basefile_from_uri(self, uri): # do the inverse conversion from canonical_uri. NOTE: if your # Keyword-derived repo might handle keywords that contain "_", # you need to have some other basefile <-> uri strategy. ret = super(Keyword, self).basefile_from_uri(uri) if ret: ret = ret.replace("_", " ") return ret def download(self, basefile=None): # Get all "term sets" (used dcterms:subject Objects, wiki pages # describing legal concepts, swedish wikipedia pages...) terms = defaultdict(dict) # 1) Query the triplestore for all dcterms:subject triples (is this # semantically sensible for a "download" action -- the content # isn't really external?) -- term set "subjects" (these come # from both court cases and legal definitions in law text) sq = """ PREFIX dcterms:<> PREFIX rdfs:<> SELECT ?uri ?subject ?label WHERE { {?uri dcterms:subject ?subject . } OPTIONAL {?subject rdfs:label ?label . } } """ store = TripleStore.connect(self.config.storetype, self.config.storelocation, self.config.storerepository) results =, "python") for row in results: if 'label' in row: label = row['label'] else: label = self.basefile_from_uri(row['subject']) if label is None: self.log.warning("could not determine keyword from %s" % row['subject']) continue sanitized = self.sanitize_term(label) if sanitized: if sanitized not in terms: terms[sanitized]['subjects'] = [] terms[sanitized]['subjects'].append(row['uri']) self.log.debug("Retrieved %s subject terms from triplestore" % len(terms)) for termset_func in self.termset_funcs: termset_func(terms) for term in terms: term = self.sanitize_term(term) if not term: continue oldterms = "" termpath = if os.path.exists(termpath): oldterms = yaml.load(util.readfile(termpath)) if terms[term] != oldterms: util.ensure_dir(termpath) util.writefile(termpath, yaml.dump(terms[term], default_flow_style=False))"%s: in %s termsets" % (term, len(terms[term]))) else: self.log.debug("%s: skipped" % term) def 
sanitize_term(self, term): # sanity checking -- not everything can be a legit # keyword. Must be under 100 chars and not start with . or / term = util.normalize_space(term) if (self.term_max_len >= len(term) >= self.term_min_len and term[0] not in self.invalid_term_start and term[-1] not in self.invalid_term_end): return term # else return None def download_termset_mediawiki(self, terms): if 'mediawikidump' in self.config: # 2) Download the dump from # -- term set "mediawiki" xml = etree.parse(requests.get(self.config.mediawikidump).text) wikinamespaces = [] MW_NS = "{%s}" % xml.getroot().nsmap[None] for ns_el in xml.findall("//" + MW_NS + "namespace"): wikinamespaces.append(ns_el.text) for page_el in xml.findall(MW_NS + "page"): title = page_el.find(MW_NS + "title").text if title == "Huvudsida": continue if ":" in title and title.split(":")[0] in wikinamespaces: continue # only process pages in the main namespace terms[title]['mediawiki'] = True elif self.mediawikirepo: for term in"parse"): terms[term]['mediawiki'] = True else: self.log.error("Neither mediawikidump or mediawikirepo is defined, " "can't download mediawiki terms") self.log.debug("Retrieved subject terms from wiki, now have %s terms" % len(terms)) def _download_termset_mediawiki_titles(self): # subclasses might want to override this to filter stuff out # from the set of titles for title in"parse"): yield title def download_termset_wikipedia(self, terms): # 3) Download the Wikipedia dump from # # -- term set "wikipedia" filename = + "/downloaded/wikititles.gz" updated = self.download_if_needed(self.config.wikipediatitles, None, archive=self.download_archive, filename=filename) with, mode='rt', encoding="utf-8") as fp: # to avoid creating a term for every page on wikipedia, # only register those terms that have already been # featured in another termset. 
This means that this # function must run last of all termset funcs for term in fp: term = term.strip() if term in terms: terms[term]['wikipedia'] = True self.log.debug("Retrieved terms from wikipedia, now have %s terms" % len(terms)) @managedparsing def parse(self, doc): # create a dummy txt d = Describer(doc.meta, doc.uri) d.rdftype(self.rdf_type) d.value(self.ns['dcterms'].title, Literal(doc.basefile, lang=doc.lang)) d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name()) doc.body = Body() # can be empty, all content in doc.meta self.parse_entry_update(doc) return True re_tagstrip = re.compile(r'<[^>]*>') # FIXME: This is copied verbatim from -- maybe it could go # into DocumentRepository or util? (or possibly triplestore?) def store_select(self, store, query_template, uri, context=None): params = {'uri': uri, 'context': context} with, "rb") as fp: sq ='utf-8') % params # Only supports (or needs) uniongraph if self.config.storetype == "FUSEKI": if context: kwargs = {'uniongraph': False} else: kwargs = {'uniongraph': True} else: kwargs = {} return, "python", **kwargs) def time_store_select( self, store, query_template, basefile, context=None, label="things"): values = {'basefile': basefile, 'label': label, 'count': None} uri = self.canonical_uri(basefile) msg = ("%(basefile)s: selected %(count)s %(label)s " "(%(elapsed).3f sec)") with util.logtime(self.log.debug, msg, values): result = self.store_select(store, query_template, uri, context) values['count'] = len(result) return result # FIXME: translate this to be consistent with construct_annotations # (e.g. 
return a RDF graph through one or a few SPARQL queries), # not a XML monstrosity def prep_annotation_file(self, basefile): uri = self.canonical_uri(basefile) keyword = basefile store = TripleStore.connect(self.config.storetype, self.config.storelocation, self.config.storerepository) # Use SPARQL queries to create a rdf graph (to be used by the # xslt transform) containing the wiki authored # dcterms:description for this term. FIXME: This isn't a real # RDF graph yet. wikidesc = self.time_store_select(store, "sparql/keyword_subjects.rq", basefile, None, "descriptions") # compatibility hack to enable lxml to process qnames for namespaces def ns(string): if ":" in string: prefix, tag = string.split(":", 1) return "{%s}%s" % (str(self.ns[prefix]), tag) # FIXME: xhv MUST be part of nsmap if 'xhtml' not in self.ns: self.ns['xhtml'] = "" root_node = etree.Element(ns("rdf:RDF"), nsmap=self.ns) main_node = etree.SubElement(root_node, ns("rdf:Description")) main_node.set(ns("rdf:about"), uri) for d in wikidesc: desc_node = etree.SubElement(main_node, ns("dcterms:description")) xhtmlstr = "<div xmlns=''>%s</div>" % (d['desc']) # xhtmlstr = xhtmlstr.replace( # ' xmlns=""', '') desc_node.append(etree.fromstring(xhtmlstr.encode('utf-8'))) # subclasses override this to add extra annotations from other # sources self.prep_annotation_file_termsets(basefile, main_node) treestring = etree.tostring(root_node, encoding="utf-8", pretty_print=True) with, mode="wb") as fp: fp.write(treestring) return def prep_annotation_file_termsets(self, basefile, main_node): pass