# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
unicode_literals, print_function)
from builtins import *
from datetime import date, datetime
from rdflib import URIRef, Namespace
from rdflib.namespace import RDF, RDFS, DC, SKOS, FOAF, DCTERMS
SCHEMA = Namespace("http://schema.org/")
BIBO = Namespace("http://purl.org/ontology/bibo/")
from ferenda import fulltextindex # to get the IndexedType classes
from ferenda import util
class Facet(object):
    """Create a facet from the given rdftype and some optional parameters.

    :param rdftype: The type of facet being created
    :type  rdftype: rdflib.term.URIRef
    :param label: A template for the label property of TocPageset objects
                  created from this facet
    :type  label: str
    :param pagetitle: A template for the title property of TocPage objects
                      created from this facet
    :type  pagetitle: str
    :param indexingtype: Object specifying how to store the data selected
                         by this facet in the fulltext index
    :type  indexingtype: ferenda.fulltextindex.IndexedType
    :param selector: A function that takes *(row, binding, resource_graph)*
                     and returns a string acting as a category of some kind
    :type  selector: callable
    :param key: A function that takes *(row, binding, resource_graph)* and
                returns a string usable for sorting
    :type  key: callable
    :param toplevel_only: Whether this facet should be applied to documents
                          only, or any named (ie. given an URI) fragment of
                          a document.
    :type  toplevel_only: bool
    :param use_for_toc: Whether this facet should be used for TOC generation
    :type  use_for_toc: bool
    :param use_for_feed: Whether this facet should be used for newsfeed
                         generation
    :type  use_for_feed: bool
    :param selector_descending: Whether the values returned by ``selector``
                                should be presented in lexical descending
                                order
    :type  selector_descending: bool
    :param key_descending: Whether documents, when sorted through the ``key``
                           function, should be presented in reverse order.
    :type  key_descending: bool
    :param multiple_values: Whether more than one instance of the ``rdftype``
                            value should be processed (such as multiple
                            keywords each specified by one ``dcterms:subject``
                            triple).
    :type  multiple_values: bool
    :param dimension_type: The general type of this facet -- can be ``"type"``
                           (values are ``rdf:type``), ``"ref"`` (values are
                           URIs), ``"year"`` (values are xsd:datetime or
                           similar), or ``"value"`` (values are string
                           literals).
    :type  dimension_type: str
    :param dimension_label: An alternate label for this facet to be used if
                            the ``selector`` logic is more transformative
                            than selectional (ie. if it transforms dates to
                            True or False values depending on whether they're
                            April 1st, you might set this to "aprilfirst")
    :type  dimension_label: str
    :param identificator: A function that takes *(row, binding,
                          resource_graph)* and returns an identifier-like
                          string usable as an id string or URL segment.
    :type  identificator: callable

    If optional parameters aren't provided, then appropriate values are
    selected if rdftype is one of some common rdf properties:

    =================== ======================================================
    facet               description
    =================== ======================================================
    rdf:type            Grouped by :py:meth:`~rdflib.graph.Graph.qname` of the
                        ``rdf:type`` of the document, eg. ``foaf:Document``.
                        Not used for toc
    ------------------- ------------------------------------------------------
    dcterms:title       Grouped by first "sortable" letter, eg for a document
                        titled "The Little Prince" returns "l". Is used as a
                        facet for the API, but it's debatable if it's useful
    ------------------- ------------------------------------------------------
    dcterms:identifier  Also grouped by first sortable letter. When indexing,
                        the resulting fulltext index field has a high boost
                        value, which increases the chances of this document
                        ranking high when one searches for its identifier.
    ------------------- ------------------------------------------------------
    dcterms:abstract    Not used for toc
    ------------------- ------------------------------------------------------
    dc:creator          Should be a free-text (string literal) value
    ------------------- ------------------------------------------------------
    dcterms:publisher   Should be a URIRef
    ------------------- ------------------------------------------------------
    dcterms:references
    ------------------- ------------------------------------------------------
    dcterms:issued      Used for grouping documents published/issued in the
                        same year
    ------------------- ------------------------------------------------------
    dc:subject          A document can have multiple dc:subjects and all are
                        indexed/processed
    ------------------- ------------------------------------------------------
    dcterms:subject     Works like dc:subject, but the value should be a
                        URIRef
    ------------------- ------------------------------------------------------
    schema:free         A boolean value
    =================== ======================================================

    This class contains a number of classmethods that can be used as
    arguments to ``selector`` and ``key``, eg

    >>> from rdflib import Namespace
    >>> MYVOCAB = Namespace("http://example.org/vocab/")
    >>> f = Facet(MYVOCAB.enactmentDate, selector=Facet.year)
    >>> f.selector({'myvocab_enactmentDate': '2014-07-06'},
    ...            'myvocab_enactmentDate')
    '2014'
    """

    # Class-wide memo for resourcelabel(), keyed on
    # (row value, resource_graph.identifier).
    _resourcecache = {}

    @classmethod
    def defaultselector(cls, row, binding, resource_graph=None):
        """This returns ``row[binding]`` without any transformation.

        >>> row = {"rdf_type": "http://purl.org/ontology/bibo/Book",
        ...        "dcterms_title": "A Tale of Two Cities",
        ...        "dcterms_issued": "1859-04-30",
        ...        "dcterms_publisher": "http://example.org/chapman_hall",
        ...        "schema_free": "true"}
        >>> Facet.defaultselector(row, "dcterms_title")
        'A Tale of Two Cities'
        """
        return row[binding]

    @classmethod
    def defaultidentificator(cls, row, binding, resource_graph=None):
        """This returns ``row[binding]`` run through a simple slug-like
        transformation (lowercased, spaces replaced by hyphens).

        >>> row = {"rdf_type": "http://purl.org/ontology/bibo/Book",
        ...        "dcterms_title": "A Tale of Two Cities",
        ...        "dcterms_issued": "1859-04-30",
        ...        "dcterms_publisher": "http://example.org/chapman_hall",
        ...        "schema_free": "true"}
        >>> Facet.defaultidentificator(row, "dcterms_title")
        'a-tale-of-two-cities'
        """
        return row[binding].lower().replace(" ", "-")

    @classmethod
    def year(cls, row, binding='dcterms_issued', resource_graph=None):
        """This returns the year part of ``row[binding]``.

        ``row[binding]`` may be a ``date``/``datetime`` object or a string
        shaped like ``'2014-06-05T12:00:00'``, ``'2014-06-05'``,
        ``'2014-06'`` or ``'2014'``. A string of any other length raises
        ``KeyError``; a string of a supported length that does not match
        the corresponding format raises ``ValueError``.

        >>> row = {"rdf_type": "http://purl.org/ontology/bibo/Book",
        ...        "dcterms_title": "A Tale of Two Cities",
        ...        "dcterms_issued": "1859-04-30",
        ...        "dcterms_publisher": "http://example.org/chapman_hall",
        ...        "schema_free": "true"}
        >>> Facet.year(row, "dcterms_issued")
        '1859'
        """
        d = row[binding]
        if not isinstance(d, (datetime, date)):
            datestring = d
            # The string length selects the parse format: assume a
            # date(time) like '2014-06-05T12:00:00', '2014-06-05',
            # '2014-06' or a bare year like '2014'.
            formatstring = {19: "%Y-%m-%dT%H:%M:%S",
                            10: "%Y-%m-%d",
                            7: "%Y-%m",
                            4: "%Y"}[len(datestring)]
            d = datetime.strptime(datestring, formatstring)
        return str(d.year)

    @classmethod
    def booleanvalue(cls, row, binding='schema_free', resource_graph=None):
        """Returns True iff row[binding] == "true", False otherwise. If
        row[binding] already is a bool, it is returned unchanged.

        >>> row = {"rdf_type": "http://purl.org/ontology/bibo/Book",
        ...        "dcterms_title": "A Tale of Two Cities",
        ...        "dcterms_issued": "1859-04-30",
        ...        "dcterms_publisher": "http://example.org/chapman_hall",
        ...        "schema_free": "true"}
        >>> Facet.booleanvalue(row, "schema_free")
        True
        """
        value = row[binding]
        if isinstance(value, bool):
            return value
        # only 'true' is True, everything else is False
        return value == 'true'

    @classmethod
    def titlesortkey(cls, row, binding='dcterms_title', resource_graph=None):
        """Returns a version of row[binding] suitable for sorting. The
        function :py:func:`~ferenda.util.title_sortkey` is used for
        string transformation.

        >>> row = {"rdf_type": "http://purl.org/ontology/bibo/Book",
        ...        "dcterms_title": "A Tale of Two Cities",
        ...        "dcterms_issued": "1859-04-30",
        ...        "dcterms_publisher": "http://example.org/chapman_hall",
        ...        "schema_free": "true"}
        >>> Facet.titlesortkey(row, "dcterms_title")
        'ataleoftwocities'
        """
        return util.title_sortkey(row[binding])

    @classmethod
    def firstletter(cls, row, binding='dcterms_title', resource_graph=None):
        """Returns the first letter of row[binding], transformed into a
        sortable string. If the sort key is empty, ``"-"`` is returned.

        >>> row = {"rdf_type": "http://purl.org/ontology/bibo/Book",
        ...        "dcterms_title": "A Tale of Two Cities",
        ...        "dcterms_issued": "1859-04-30",
        ...        "dcterms_publisher": "http://example.org/chapman_hall",
        ...        "schema_free": "true"}
        >>> Facet.firstletter(row, "dcterms_title")
        'a'
        """
        titlesortkey = cls.titlesortkey(row, binding)
        if titlesortkey:
            return titlesortkey[0]
        # Handle the degenerate case where title consists entirely of
        # non-letters (eg. "---").
        return "-"

    @classmethod
    def resourcelabel(cls, row, binding='dcterms_publisher', resource_graph=None):
        """Lookup a suitable text label for row[binding] in resource_graph.

        A fixed list of label-like predicates is tried in order and the
        first non-empty value is returned as a string. If none of them
        yields a value, or if no ``resource_graph`` is given,
        ``row[binding]`` itself is returned. Results obtained from a graph
        are cached per *(value, graph identifier)*.

        >>> row = {"rdf_type": "http://purl.org/ontology/bibo/Book",
        ...        "dcterms_title": "A Tale of Two Cities",
        ...        "dcterms_issued": "1859-04-30",
        ...        "dcterms_publisher": "http://example.org/chapman_hall",
        ...        "schema_free": "true"}
        >>> import rdflib
        >>> resources = rdflib.Graph().parse(format="turtle", data=\"""
        ... @prefix foaf: <http://xmlns.com/foaf/0.1/> .
        ...
        ... <http://example.org/chapman_hall> a foaf:Organization;
        ...     foaf:name "Chapman & Hall" .
        ...
        ... \""")
        >>> Facet.resourcelabel(row, "dcterms_publisher", resources)
        'Chapman & Hall'
        """
        if resource_graph is None:
            # Nothing to look the label up in: fall back to the raw value
            # (the same result as when the graph holds no label). Not
            # cached, since there is no graph identifier to key on.
            return row[binding]
        # FIXME: if the graph changes in between calls, the cache
        # won't be invalidated and give incorrect results
        k = (row[binding], resource_graph.identifier)
        if k in cls._resourcecache:
            return cls._resourcecache[k]
        uri = URIRef(row[binding])
        for pred in (RDFS.label, SKOS.prefLabel, SKOS.altLabel, DCTERMS.title,
                     DCTERMS.alternative, FOAF.name, BIBO.identifier):
            found = resource_graph.value(uri, pred)
            if found:
                cls._resourcecache[k] = str(found)
                return cls._resourcecache[k]
        # No label-like property found: use (and cache) the value as-is.
        cls._resourcecache[k] = row[binding]
        return cls._resourcecache[k]

    @classmethod
    def sortresource(cls, row, binding='dcterms_publisher', resource_graph=None):
        """Returns a sortable version of the resource label for
        ``row[binding]``. The ``row`` passed in is left unmodified.

        >>> row = {"rdf_type": "http://purl.org/ontology/bibo/Book",
        ...        "dcterms_title": "A Tale of Two Cities",
        ...        "dcterms_issued": "1859-04-30",
        ...        "dcterms_publisher": "http://example.org/chapman_hall",
        ...        "schema_free": "true"}
        >>> import rdflib
        >>> resources = rdflib.Graph().parse(format="turtle", data=\"""
        ... @prefix foaf: <http://xmlns.com/foaf/0.1/> .
        ...
        ... <http://example.org/chapman_hall> a foaf:Organization;
        ...     foaf:name "Chapman & Hall" .
        ...
        ... \""")
        >>> Facet.sortresource(row, "dcterms_publisher", resources)
        'chapmanhall'
        """
        label = cls.resourcelabel(row, binding, resource_graph)
        # Feed the label through titlesortkey via a throwaway row, so that
        # the caller's row keeps its original value (typically a URI) for
        # any selector/identificator that runs on it afterwards.
        return cls.titlesortkey({binding: label}, binding)

    @classmethod
    def term(cls, row, binding='dcterms_publisher', resource_graph=None):
        """Returns the leaf part of the URI found in ``row[binding]``. If
        no leaf can be determined, ``row[binding]`` with spaces replaced by
        underscores is returned instead.

        >>> row = {"rdf_type": "http://purl.org/ontology/bibo/Book",
        ...        "dcterms_title": "A Tale of Two Cities",
        ...        "dcterms_issued": "1859-04-30",
        ...        "dcterms_publisher": "http://example.org/chapman_hall",
        ...        "schema_free": "true"}
        >>> Facet.term(row, "dcterms_publisher")
        'chapman_hall'
        """
        ret = util.uri_leaf(row[binding])
        if not ret:
            # FIXME: get a logger and complain. but also get something
            # that can act as a URI fragment
            ret = row[binding].replace(" ", "_")
        return ret

    @classmethod
    def qname(cls, row, binding='rdf_type', resource_graph=None):
        """Returns the qname of the rdf URIref contained in row[binding], as
        determined by the namespace prefixes registered in
        resource_graph.

        >>> row = {"rdf_type": "http://purl.org/ontology/bibo/Book",
        ...        "dcterms_title": "A Tale of Two Cities",
        ...        "dcterms_issued": "1859-04-30",
        ...        "dcterms_publisher": "http://example.org/chapman_hall",
        ...        "schema_free": "true"}
        >>> import rdflib
        >>> resources = rdflib.Graph()
        >>> resources.bind("bibo", "http://purl.org/ontology/bibo/")
        >>> Facet.qname(row, "rdf_type", resources)
        'bibo:Book'
        """
        u = URIRef(row[binding])
        return resource_graph.qname(u)

    @classmethod
    def resourcelabel_or_qname(cls, row, binding='rdf_type', resource_graph=None):
        """Returns the label for ``row[binding]`` as found by
        :py:meth:`resourcelabel`, or, if that only gives back the
        unchanged ``row[binding]`` value, its :py:meth:`qname` instead.
        """
        res = cls.resourcelabel(row, binding, resource_graph)
        if res == row[binding]:  # couldn't find a real label, try qname instead
            res = cls.qname(row, binding, resource_graph)
        return res

    # define a number of default values, used if the user does not
    # explicitly specify indexingtype/selector/key. The class body only
    # holds a placeholder; the mapping is looked up as self.defaults in
    # __init__.
    defaults = None

    # formatting directives for label/pagetitle:
    # %(criteria)s = The human-readable criteria for sorting/dividing/faceting, eg "date of publication", "document title" or "publisher"
    # %(selected)s = The selected value, eg "2014", "A", "O'Reilly and Associates Publishing, inc."
    # %(selected_uri)s = For resource-type values, the underlying URI, eg "http://example.org/ext/publisher/oreilly"
    # NOTE(review): the fallback templates in __init__ use %(term)s, not
    # %(criteria)s -- confirm which key the code that formats these
    # templates actually supplies.

    def __init__(self,
                 rdftype=DCTERMS.title,
                 # any rdflib.URIRef -- should be called 'rdfpredicate'??
                 label=None,  # toclabel
                 pagetitle=None,
                 indexingtype=None,   # if not given, determined by rdftype
                 selector=None,       # - "" -
                 key=None,            # - "" -
                 identificator=None,  # - "" - (normally same as selector)
                 toplevel_only=None,  # - "" -
                 use_for_toc=None,    # - "" -
                 use_for_feed=None,   # - "" -
                 selector_descending=None,
                 key_descending=None,
                 multiple_values=None,
                 dimension_type=None,  # could be determined by indexingtype
                 dimension_label=None
                 ):
        def _finddefault(provided, rdftype, argumenttype, default):
            # Precedence: explicitly provided value, then the per-rdftype
            # entry in self.defaults, then the generic fallback.
            if provided is not None:
                return provided
            if rdftype in self.defaults and argumenttype in self.defaults[rdftype]:
                return self.defaults[rdftype][argumenttype]
            # since self.defaults doesn't contain meaningless defaults
            # (like selector for rdf:type) it's not a good UI to warn
            # about falling back here. Might need to add more data to
            # self.defaults in order to enable such a warning.
            return default

        self.rdftype = rdftype
        self.label = _finddefault(label, rdftype, 'label', "Sorted by %(term)s")
        self.pagetitle = _finddefault(
            pagetitle,
            rdftype,
            'pagetitle',
            "Documents where %(term)s = %(selected)s")
        self.indexingtype = _finddefault(
            indexingtype,
            rdftype,
            'indexingtype',
            fulltextindex.Text())
        self.selector = _finddefault(selector, rdftype, 'selector', self.defaultselector)
        self.key = _finddefault(key, rdftype, 'key', self.defaultselector)
        self.identificator = _finddefault(
            identificator,
            rdftype,
            'identificator',
            self.defaultidentificator)
        self.toplevel_only = _finddefault(toplevel_only, rdftype, 'toplevel_only', False)
        self.use_for_toc = _finddefault(use_for_toc, rdftype, 'use_for_toc', False)
        self.use_for_feed = _finddefault(use_for_feed, rdftype, 'use_for_feed', False)
        self.selector_descending = _finddefault(
            selector_descending,
            rdftype,
            'selector_descending',
            False)
        self.key_descending = _finddefault(key_descending, rdftype, 'key_descending', False)
        self.multiple_values = _finddefault(
            multiple_values,
            rdftype,
            'multiple_values',
            False)
        self.dimension_type = _finddefault(dimension_type, rdftype, 'dimension_type', None)
        # dimension_label should only be provided if an unusual
        # selector for a rdftype is used (eg is_april_fools() for
        # dcterms:issued), therefore no rdftype-dependent default.
        self.dimension_label = dimension_label

    def __repr__(self):
        # Show every non-callable instance attribute, sorted by name;
        # selector/key/identificator are callables and therefore left out.
        dictrepr = "".join(
            (" %s=%r" %
             (k, v) for k, v in sorted(
                 self.__dict__.items()) if not callable(v)))
        return ("<%s%s>" % (self.__class__.__name__, dictrepr))

    def __eq__(self, other):
        # Comparing with something that isn't a Facet (eg. None) is
        # delegated back to Python instead of failing on a missing
        # attribute.
        if not isinstance(other, Facet):
            return NotImplemented
        # compare only those properties that affects the SET of
        # selected data using this facet
        return (self.rdftype == other.rdftype and
                self.dimension_type == other.dimension_type and
                self.dimension_label == other.dimension_label and
                self.selector == other.selector)

    def __ne__(self, other):
        # Spelled out so that != mirrors __eq__ even on interpreters that
        # don't derive it automatically.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
# Per-rdftype default settings, consulted by Facet.__init__ for every
# argument the caller leaves as None. Assigned after the class statement
# because the values refer to Facet's own classmethods. A setting that is
# missing from an entry falls back to the generic default in __init__.
Facet.defaults = {
    RDF.type: {
        'indexingtype': fulltextindex.URI(),
        'toplevel_only': False,
        'use_for_toc': False,
        'use_for_feed': True,
        'selector': Facet.resourcelabel_or_qname,
        'identificator': Facet.term,
        # NOTE(review): the Facet docstring lists "type" (not "term") as
        # the dimension_type for rdf:type values -- confirm which one
        # consumers of dimension_type expect.
        'dimension_type': "term",
        'pagetitle': 'All %(selected)s documents',
    },
    DCTERMS.title: {
        'indexingtype': fulltextindex.Text(boost=4),
        'toplevel_only': False,
        'use_for_toc': True,
        'selector': Facet.firstletter,
        'key': Facet.titlesortkey,
        'identificator': Facet.firstletter,
        'dimension_type': None,  # or "value",
        'pagetitle': 'Documents starting with "%(selected)s"',
    },
    DCTERMS.identifier: {
        'indexingtype': fulltextindex.Label(boost=16),
        'toplevel_only': False,
        'use_for_toc': False,  # typically no info that isn't already in title
        'selector': Facet.firstletter,
        'key': Facet.titlesortkey,
        'identificator': Facet.firstletter,
    },
    DCTERMS.abstract: {
        'indexingtype': fulltextindex.Text(boost=2),
        'toplevel_only': True,
        'use_for_toc': False,
    },
    DC.creator: {
        'indexingtype': fulltextindex.Label(),
        'toplevel_only': True,
        'use_for_toc': True,
        'selector': Facet.defaultselector,
        'key': Facet.titlesortkey,
        'dimension_type': "value",
    },
    DCTERMS.publisher: {
        'indexingtype': fulltextindex.Resource(),
        'toplevel_only': True,
        'use_for_toc': True,
        'use_for_feed': True,
        'selector': Facet.resourcelabel,
        'key': Facet.resourcelabel,
        'identificator': Facet.term,
        'dimension_type': 'ref',
        'pagetitle': 'Documents published by %(selected)s',
    },
    DCTERMS.references: {  # NB: this is a single URI reference w/o label
        'indexingtype': fulltextindex.URI(),
        'use_for_toc': False,
    },
    DCTERMS.issued: {
        'label': "Sorted by publication year",
        'pagetitle': "Documents published in %(selected)s",
        'indexingtype': fulltextindex.Datetime(),
        'toplevel_only': True,
        'use_for_toc': True,
        'selector': Facet.year,
        'key': Facet.defaultselector,
        'identificator': Facet.year,
        'selector_descending': False,
        'key_descending': False,
        'dimension_type': "year",
    },
    DC.subject: {
        # eg. one or more string literals (not URIRefs),
        'indexingtype': fulltextindex.Keyword(),
        'multiple_values': True,
        'toplevel_only': True,
        'use_for_toc': True,
        'selector': Facet.defaultselector,
        'key': Facet.defaultselector,
        'dimension_type': 'value',
    },
    DCTERMS.subject: {
        # eg. one or more URIRefs + labels
        'indexingtype': fulltextindex.Resource(),
        'multiple_values': True,
        'toplevel_only': True,
        'use_for_toc': True,
        'selector': Facet.resourcelabel,
        'key': Facet.resourcelabel,
        'identificator': Facet.term,
        'dimension_type': 'ref',
    },
    SCHEMA.free: {  # "A flag to signal that the publication is accessible for free."
        'indexingtype': fulltextindex.Boolean(),
        'toplevel_only': True,
        'use_for_toc': True,
        'use_for_feed': True,
        'selector': Facet.booleanvalue,
        'key': Facet.defaultselector,
        'dimension_type': 'value',
    },
}