Source code for ferenda.sources.tech.rfc

# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

import re
import os
from datetime import datetime, date
from itertools import islice
import requests
import requests.exceptions

from rdflib import URIRef, Graph, XSD
from rdflib.term import _PythonToXSD
from pyparsing import Word, CaselessLiteral, Optional, nums

from ferenda import DocumentRepository
from ferenda import TextReader, Describer, FSMParser, CitationParser, URIFormatter, Facet
from ferenda import util
from ferenda.decorators import action, recordlastdownload, managedparsing, downloadmax
from ferenda.elements import Body, Heading, Preformatted, Paragraph, UnorderedList, ListItem, Section, Subsection, Subsubsection, UnicodeElement, CompoundElement, Link, serialize
from ferenda.errors import ParseError


class RFCHeader(UnicodeElement):
    pass


class DocTitle(UnicodeElement):
    pass


class Pagebreak(CompoundElement):
    pass


class PreambleSection(CompoundElement):
    tagname = "div"
    counter = 0

    def _get_classname(self):
        return self.__class__.__name__.lower()
    classname = property(_get_classname)

    # FIXME: For Sections (incl Sub- and Subsubsections), we use
    # decorate_bodyparts to create .uri and .meta properties, which
    # then the default as_xhtml serializes neatly. For this, we set
    # the attributes "manually". This is not consistent. We should
    # probably extend decorate_bodyparts to handle preamblesections as
    # well.
    def as_xhtml(self, uri, parent_uri=None):
        element = super(PreambleSection, self).as_xhtml(uri, parent_uri)
        element.set('property', 'dcterms:title')
        element.set('content', self.title)
        element.set('typeof', 'bibo:DocumentPart')
        self.__class__.counter += 1
        element.set('about', uri + "#PS" + str(self.__class__.counter))
        # alternate uri strategy
        # element.set('about', uri + "#" + self.title.lower().replace(" ", "_"))
        return element


[docs]class RFC(DocumentRepository):
    alias = "rfc"
    start_url = "http://www.ietf.org/download/rfc-index.txt"
    document_url_template = "http://tools.ietf.org/rfc/rfc%(basefile)s.txt"
    document_url_regex = "http://tools.ietf.org/rfc/rfc(?P<basefile>\w+).txt"
    downloaded_suffix = ".txt"
    namespaces = ('rdf',  # always needed
                  'dcterms',
                  # title, identifier, etc (could be replaced by equiv bibo prop?)
                  'bibo',  # Standard and DocumentPart classes, chapter prop
                  'xsd',  # datatypes
                  'prov',  # for :wasGeneratedBy
                  ('rfc', 'http://example.org/ontology/rfc/')  # custom (fake) ontology
                  )
    sparql_annotations = "sparql/rfc-annotations.rq"
    xslt_template = "xsl/rfc.xsl"
    rdf_type = [URIRef("http://example.org/ontology/rfc/RFC"),
                URIRef("http://example.org/ontology/rfc/BCP"),
                URIRef("http://example.org/ontology/rfc/STD"),
                URIRef("http://example.org/ontology/rfc/FYI")]

    # NOTES:
    #
    # Like many large document collections that has existed for a long
    # time, the RFCs aren't using a thurough standard formatting,
    # particularly for older RFC. Here are examples:
    #
    # Most older RFCs - headers are often all-caps (can be handled by
    # parser or constructor)
    #
    # RFC 759 - Header is totally different (can be patched), header
    #           has a IEN: 113 field
    # RFC 869 - Header is totally different (RFC 759-like), most
    #           paragraphs are double-spaced
    # RFC 889 - Header isn't right-justified but instead ragged
    # RFC 909 - Header is totally different (RFC 759-like), document
    #           headings are centered (can be handled by alternate
    #           recognizer)

    @action
    @recordlastdownload
    def download(self, basefile=None):
        """Download rfcs starting from http://www.ietf.org/download/rfc-index.txt"""
        if basefile and self.document_url_template:
            return self.download_single(basefile)
        res = requests.get(self.start_url)
        indextext = res.text
        reader = TextReader(
            string=indextext,
            linesep=TextReader.UNIX)  # see TextReader class
        iterator = reader.getiterator(reader.readparagraph)
        for (basefile, url) in self.download_get_basefiles(iterator):
            try:
                if not os.path.exists(self.store.downloaded_path(basefile)):
                    self.download_single(basefile)
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 404:
                    # create a empty dummy file in order to
                    # avoid looking for it over and over again:
                    with open(self.store.downloaded_path(basefile), "w"):
                        pass

    @downloadmax
    def download_get_basefiles(self, source):
        for p in reversed(list(source)):
            if re.match("^(\d{4}) ", p):  # looks like a RFC number
                if not "Not Issued." in p:  # Skip RFC known to not exist
                    basefile = str(int(p[:4]))  # eg. '0822' -> '822'
                    yield (basefile, None)

    @staticmethod  # so as to be easily called from command line
    def get_parser(basefile="0"):

        # recognizers, constructors and helpers are created as nested
        # ordinary functions, but could just as well be staticmethods
        # (or module-global functions)

        def is_rfcheader(parser, chunk=None, lenient=True):
            if not chunk:
                chunk = parser.reader.peek()
            (leftlines, rightlines, linelens) = _splitcolumns(chunk)
            # all rfc headers are at least 2 lines long (eg. rfc 889)
            if len(linelens) < 2:
                return False
            targetlen = linelens[0]
            for (idx, length) in enumerate(linelens):
                if rightlines[idx] == "" and length > 40:
                    return False
                elif rightlines[idx] != "" and length != targetlen and not lenient:
                    return False
                    # Most modern RFC has justified right margin
                    # (which is what this test targets) but some older
                    # RFCs (like 889) have ragged right margin (or
                    # rather left-justified two columns). However,
                    # since make_rfcheader checks next chunk as well
                    # (if there is a spurious double newline right in
                    # the middle of the header, which is a thing that
                    # has happened (RFC 6912)), this recognizer has a
                    # lenient and a non-lenient mode.
            return True

        # FIXME: use this in parse_header as well
        def _splitcolumns(chunk):
            linelens = []
            leftlines = []
            rightlines = []
            for line in chunk.split("\n"):
                linelens.append(len(line))
                if "   " in line:
                    (left, right) = line.split("   ", 1)
                else:
                    (left, right) = line, ""
                leftlines.append(left)
                rightlines.append(right)
            return (leftlines, rightlines, linelens)

        def is_doctitle(parser, chunk=None):
            return True

        def is_pagebreak(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            return ('\f' in chunk)

        def is_header(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            stripchunk = chunk.strip()
            # a header should be non-emtpy, be on a single line, not
            # end with "." and not start with an indent.
            if ((stripchunk != "") and
                (len(stripchunk.split("\n")) == 1) and
                (not stripchunk.endswith('.')) and
                    (not chunk.startswith(' '))):
                return True

        def is_section(parser, chunk=None):
            (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
            return section_segments_count(ordinal) == 1

        def is_subsection(parser, chunk=None):
            (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
            return section_segments_count(ordinal) == 2

        def is_subsubsection(parser, chunk=None):
            (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
            return section_segments_count(ordinal) == 3

        def is_preformatted(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            # all paragraphs start with a three space indent -- start
            # by removing this
            stripped = "\n".join([x[3:] for x in chunk.split("\n")])
            # replace double spaces after end of sentences to avoid
            # false positives:
            stripped = stripped.replace(".  ", ". ")
            # If any double spaces left, probably preformatted text
            # (eg. tables etc). Same if several periods are present
            # (indicative of leaders in TOCs)
            return ("  " in stripped or
                    "...." in stripped or
                    ". . . " in stripped)

        def is_bnf(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
                return (is_preformatted(parser, chunk) and " = " in chunk)

        def is_paragraph(parser, chunk=None):
            return True

        def is_ul_listitem(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            return chunk.strip().startswith("o  ")

        def is_definition_title(parser, chunk=None):
            # looks like header but starts indented
            return False

        def is_definition(parser, chunk=None):
            # entire p is indented 6 spaces instead of 3. But if it
            # follows a ul li, problably continuation of that.
            return False

        def make_body(parser):
            return parser.make_children(Body())
        setattr(make_body, 'newstate', 'body')

        def make_preamble_section(parser):
            s = PreambleSection(title=parser.reader.next())
            return parser.make_children(s)
        setattr(make_preamble_section, 'newstate', 'preamble-section')

        # used for older rfcs
        def make_abstract(parser):
            s = PreambleSection(title="(Abstract)")
            return parser.make_children(s)
        setattr(make_abstract, 'newstate', 'preamble-section')

        def skip_pagebreak(parser):
            chunk = parser.reader.next()
            lastline = chunk.split("\n")[-1]
            parts = re.split("  +", lastline)
            if len(parts) > 2:
                return Pagebreak(shorttitle=parts[1])
            else:
                return None

        def make_header(parser):
            chunk = parser.reader.next()
            h = Heading(chunk.strip())
            return h

        def make_paragraph(parser):
            chunk = p.reader.next()
            return Paragraph([" ".join(chunk.split())])

        def make_preformatted(parser):
            chunk = p.reader.next()
            return Preformatted([chunk])

        def make_bnf(parser):
            chunk = p.reader.next()
            return Preformatted([chunk], **{'class': 'bnf'})

        def make_section(parser):
            (secnumber,
             title,
             identifier) = analyze_sectionstart(parser,
                                                parser.reader.next())
            s = Section(ordinal=secnumber,
                        title=title,
                        identifier=identifier)
            return parser.make_children(s)
        setattr(make_section, 'newstate', 'section')

        def make_subsection(parser):
            (secnumber,
             title,
             identifier) = analyze_sectionstart(parser,
                                                parser.reader.next())
            s = Subsection(ordinal=secnumber,
                           title=title,
                           identifier=identifier)
            return parser.make_children(s)
        setattr(make_subsection, 'newstate', 'subsection')

        def make_subsubsection(parser):
            (secnumber,
             title,
             identifier) = analyze_sectionstart(parser,
                                                parser.reader.next())
            s = Subsubsection(ordinal=secnumber,
                              title=title,
                              identifier=identifier)
            return parser.make_children(s)
        setattr(make_subsubsection, 'newstate', 'subsubsection')

        def make_unordered_list(parser):
            (listtype, ordinal, separator, rest) = analyze_listitem(parser.reader.peek())
            ol = UnorderedList(type=listtype)  # should
            ol.append(parser.make_child(make_listitem, "listitem"))
            return parser.make_children(ol)
        setattr(make_unordered_list, 'newstate', 'unorderedlist')

        def make_listitem(parser):
            chunk = parser.reader.next()
            (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
            li = ListItem(ordinal=ordinal)
            li.append(rest)
            return parser.make_children(li)
        setattr(make_listitem, 'newstate', 'listitem')

        def make_rfcheader(parser):
            headerchunk = parser.reader.next()
            if is_rfcheader(parser, lenient=False):
                headerchunk += "\n" + parser.reader.next()
            return RFCHeader(headerchunk)

        def make_doctitle(parser):
            return DocTitle(parser.reader.next())

        # Some helpers for the above
        def section_segments_count(s):
            return ((s is not None) and
                    len(list(filter(None, s.split(".")))))

        # Matches
        # "1 Blahonga" => ("1","Blahonga", "RFC 1234, section 1")
        # "1.2.3. This is a subsubsection" => ("1.2.3", "This is a subsection", "RFC 1234, section 1.2.3")
        # "   Normal paragraph" => (None, "   Normal paragraph", None)
        re_sectionstart = re.compile("^(\d[\.\d]*) +(.*[^\.])$").match

        def analyze_sectionstart(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            m = re_sectionstart(chunk)
            if m:
                ordinal = m.group(1).rstrip(".")
                title = m.group(2)
                identifier = "RFC %s, section %s" % (basefile, ordinal)
                return (ordinal, title, identifier)
            else:
                return (None, chunk, None)

        def analyze_listitem(chunk):
            # returns: same as list-style-type in CSS2.1, sans
            # 'georgian', 'armenian' and 'greek', plus 'dashed'
            listtype = ordinal = separator = None

            # FIXME: Tighten these patterns to RFC conventions
            # match "1. Foo..." or "14) bar..." but not "4 This is a heading"
            if chunk.startswith("   o  "):
                return ("disc", None, None, chunk[6:])

            return (listtype, ordinal, separator, chunk)  # None * 3

        p = FSMParser()

        p.set_recognizers(is_pagebreak,
                          is_rfcheader,
                          is_doctitle,
                          is_section,
                          is_subsection,
                          is_subsubsection,
                          is_header,
                          is_ul_listitem,
                          is_preformatted,
                          is_definition_title,
                          is_definition,
                          is_paragraph)
        # start_state: "body" or "rfcheader", then "title", then
        # "preamble" (consisting of preamblesections that has title
        # (eg "Abstract", "Status of This Memo" + content), then "section".
        commonstates = ("section", "subsection", "subsubsection")
        p.set_transitions({("body", is_rfcheader): (make_rfcheader, "doctitle"),
                           ("doctitle", is_doctitle): (make_doctitle, "preamble"),
                           ("preamble", is_header): (make_preamble_section, "preamble-section"),
                           ("preamble", is_paragraph): (make_abstract, "preamble-section"),
                           ("preamble-section", is_paragraph): (make_paragraph, None),
                           ("preamble-section", is_header): (False, None),
                           ("preamble-section", is_pagebreak): (skip_pagebreak, None),
                           ("preamble-section", is_section): (False, "after-preamble"),
                           ("after-preamble", is_section): (make_section, "section"),
                           ("section", is_subsection): (make_subsection, "subsection"),
                           ("section", is_section): (False, None),
                           ("subsection", is_subsubsection): (make_subsubsection, "subsubsection"),
                           ("subsection", is_subsection): (False, None),
                           ("subsection", is_section): (False, None),
                           ("subsubsection", is_subsubsection): (False, None),
                           ("subsubsection", is_subsection): (False, None),
                           ("subsubsection", is_section): (False, None),
                           (commonstates, is_ul_listitem): (make_unordered_list, "ul-list"),
                           ("ul-list", is_ul_listitem): (make_listitem, "listitem"),
                           ("ul-list", is_paragraph): (False, None),
                           ("listitem", is_paragraph): (False, None),
                           (commonstates, is_bnf): (make_bnf, None),
                           (commonstates, is_preformatted): (make_preformatted, None),
                           (commonstates, is_paragraph): (make_paragraph, None),
                           (commonstates, is_pagebreak): (skip_pagebreak, None),
                           })
        p.initial_state = "body"
        p.initial_constructor = make_body
        return p

    def make_citation_parser(self):
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                uri += "#S" + parts['Sec'].rstrip(".")
            return uri
        section_citation = (CaselessLiteral("section") + Word(
            nums + ".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = (
            Optional("[") + "RFC" + Word(nums).setResultsName("RFC") + Optional("]")).setResultsName("RFCRef")
        section_rfc_citation = (
            section_citation +
            "of" +
            rfc_citation).setResultsName("SecRFCRef")
        citparser = CitationParser(section_rfc_citation,
                                   section_citation,
                                   rfc_citation)
        citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                             ("SecRef", rfc_uriformatter),
                                             ("RFCRef", rfc_uriformatter)))
        return citparser

    @action
    @managedparsing
    def parse(self, doc):
        """Parse downloaded documents into structured XML and RDF."""

        reader = TextReader(self.store.downloaded_path(doc.basefile),
                            linesep=TextReader.UNIX)
        # Some more preprocessing: Remove the faux-bold formatting
        # used in some RFCs (using repetitions of characters
        # interleaved with backspace control sequences). Note: that
        # is '\b' as in backspace, not r'\b' as in word boundary
        # docstring = re.sub('.\b','',docstring)
        cleanparagraphs = (re.sub('.\b', '', x) for x in
                           reader.getiterator(reader.readparagraph))
        
        parser = self.get_parser(doc.basefile)

        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        parser.debug = self.config.fsmdebug
        doc.body = parser.parse(cleanparagraphs)

        header = doc.body.pop(0)  # body.findByClass(RFCHeader)
        title = " ".join(doc.body.pop(0).split())  # body.findByClass(DocHeader)
        for part in doc.body:
            if isinstance(part, PreambleSection) and part.title == "Table of Contents":
                doc.body.remove(part)
                break

        # create (RDF) metadata for document Note: The provided
        # basefile may be incorrect -- let whatever is in the header
        # override
        realid = self.get_rfc_num(header)
        if not realid:  # eg RFC 100 -- fallback to basefile in that case
            realid = doc.basefile
        doc.uri = self.canonical_uri(realid)
        desc = Describer(doc.meta, doc.uri)
        desc.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        desc.value(self.ns['dcterms'].title, title, lang="en")
        self.parse_header(header, desc)
        # parse_header might have set .rdftype, but if not:
        try:
            desc.getrdftype()
        except KeyError:
            desc.rdftype(self.ns['rfc'].RFC)

        if not desc.getvalues(self.ns['dcterms'].identifier):
            desc.value(self.ns['dcterms'].identifier, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # process body - remove the temporary Pagebreak objects, after
        # having extracted the shortTitle found in them
        shorttitle = self.cleanup_body(doc.body)
        if shorttitle and (desc.getvalue(self.ns['dcterms'].title) != shorttitle):
            desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

        # process body - add good metadata
        citparser = self.make_citation_parser()
        doc.body = citparser.parse_recursive(doc.body)
        PreambleSection.counter = 0
        # self.decorate_bodyparts(doc.body,doc.uri)
        if self.config.fsmdebug:
            print(serialize(doc.body))
        return True

    def cleanup_body(self, part):
        shorttitle = None
        newparts = []  # a copy of the children w/o any Pagebreaks
        for subpart in part:
            if isinstance(subpart, Pagebreak):
                shorttitle = subpart.shorttitle
            else:
                if isinstance(subpart, str):
                    pass
                else:
                    short = self.cleanup_body(subpart)
                    if shorttitle is None:
                        shorttitle = short
                newparts.append(subpart)
        part[:] = newparts
        return shorttitle

    def get_rfc_num(self, header):
        lines = header.split("\n")
        left = [x.split("   ", 1)[0].strip() for x in lines]
        for line in left[1:]:
            if ":" not in line:
                continue
            (key, val) = (x.strip() for x in line.split(": "))
            if key == "Request for Comments":
                # only return integer part
                return re.sub("\D", "", val)

        raise ParseError("Couldn't find RFC number in header")

    def parse_header(self, header, desc):
        # split header in left-hand and right-hand side, and line by line
        lines = header.split("\n")
        left = [x.split("   ", 1)[0].strip() for x in lines]
        right = [x.split("   ", 1)[1].strip() for x in lines if "   " in x]
        # first line of lefthand side is publishing organization (?)
        publisher_label = left[0]
        try:
            desc.rel(self.ns['dcterms'].publisher,
                     self.lookup_resource(publisher_label))
        except KeyError:
            self.log.warning("Couldn't look up a proper resource URI for %s" %
                             publisher_label)
            desc.value(self.ns['dcterms'].publisher,
                       publisher_label)

        # following lefthand side are key-value headers
        for line in left[1:]:
            if line.strip() == "":
                continue
            if ": " not in line:
                self.log.warning("Cannot treat %r as a key-value header" % line)
                continue

            (key, value) = (x.strip() for x in line.split(": "))
            if key == "Request for Comments":
                # make sure we only extract the numeric part --
                # normally value should be numeric, but we've seen
                # "RFC 1006", "#154" and there are doubtless other
                # variants
                value = re.sub("\D", "", value)
                if value:  # eg RFC 100
                    desc.value(self.ns['dcterms'].identifier, "RFC %s" % value)
            elif key == "Category":
                try:
                    desc.rel(self.ns['dcterms'].subject,
                             self.lookup_resource(value, predicate=self.ns['bibo'].identifier))
                except KeyError:
                    self.log.warning("Couldn't look up a proper resource URI for %s" %
                                     value)
                    desc.value(self.ns['dcterms'].subject, value)
            elif key == "ISSN":
                desc.value(self.ns['dcterms'].issn, value)
            elif key in ("Updates", "Obsoletes"):
                pred = {'Updates': self.ns['rfc'].updates,
                        'Obsoletes': self.ns['rfc'].obsoletes}[key]

                for valuepart in value.split(", "):
                    rfcmatch = re.search('\d+', valuepart)
                    if rfcmatch:
                        uri = self.canonical_uri(rfcmatch.group(0))
                        desc.rel(pred, uri)
                    else:
                        self.log.warning("Can't pick out RFC number from line %s" % line)
            elif key == "BCP":
                desc.value(self.ns['rfc'].bcpnumber, value)
                desc.rdftype(self.ns['rfc'].BCP)
            elif key == "STD":
                desc.value(self.ns['rfc'].stdnumber, value)
                desc.rdftype(self.ns['rfc'].FYI)
            elif key == "FYI":
                desc.value(self.ns['rfc'].fyinumber, value)
                desc.rdftype(self.ns['rfc'].FYI)
            else:
                # Unknown headers seen: BCP, STD, FYI
                self.log.warning("Unknown header key %s (value %s)" % (key, value))

        # For right hand side, any line beginning with a single letter
        # followed by '. ' is probably a name
        for line in right:
            if re.match("[A-Z]\. ", line):
                desc.value(self.ns['dcterms'].creator, line)
            elif re.match("\w+ \d{4}$", line):
                # NOTE: this requires english locale!
                with util.switch_locale():
                    # Parses eg "May 2001" - which should not be
                    # converted to a regular date, as we're missing
                    # the date information. Use xsd:gYearMonth.
                    dt = datetime.strptime(line, "%B %Y")
                d = util.gYearMonth(dt.year, dt.month)
                # rdflib.term now misconverts XSD.gYearMonth to a
                # YYYY-MM-DD string, when we really want a YYYY-MM
                # string. Install a custom converter to handle
                # this. (We can't use the official rdflib.term.bind
                # API, since it installs the converter last, so that
                # the (incorrect) date converter is run first
                # (util.gYearMonth derives from date))
                if _PythonToXSD[0][0] != util.gYearMonth:
                    _PythonToXSD.insert(0, (util.gYearMonth, (str, XSD.gYearMonth)))
                desc.value(self.ns['dcterms'].issued, d,
                           datatype=XSD.gYearMonth)
            else:
                # company affiliation - include that separate from
                # personal author identity
                desc.value(self.ns['dcterms'].rightsHolder, line)

    def facets(self):
        def select_rfcnum(row, binding, resource_graph):
            # "RFC 6998" -> "6900"
            return row[binding][4:-2] + "00"

        return [Facet(self.ns['rdf'].type),
                Facet(self.ns['dcterms'].identifier,
                      label="Sorted by RFC #",
                      pagetitle="RFC %(selected)s00-%(selected)s99",
                      selector=select_rfcnum,
                      use_for_toc=True),
                Facet(self.ns['dcterms'].title),
                Facet(self.ns['dcterms'].publisher,
                      label="Sorted by stream",
                      pagetitle="The %(selected)s stream"),
                Facet(self.ns['dcterms'].issued),
                Facet(self.ns['dcterms'].subject,  # should be rfc:category not dcterms:status?
                      label="Sorted by status",
                      pagetitle="Status: %(selected)s")]

    def toc_item(self, binding, row):
        return [row['dcterms_identifier'] + ": ",
                Link(row['dcterms_title'],
                     uri=row['uri'])]

    def news_criteria(self):
        from ferenda import Describer, NewsCriteria

        # function that returns a closure, which acts as a custom
        # selector function for the NewsCriteria objects.
        def selector_for(category):
            def selector(entry):
                graph = Graph()
                with self.store.open_distilled(entry.basefile) as fp:
                    graph.parse(data=fp.read())
                desc = Describer(graph, entry.id)
                return desc.getrel(self.ns['dcterms'].subject) == category
            return selector

        return [NewsCriteria('all', 'All RFCs'),
                NewsCriteria('informational', 'Informational RFCs',
                             selector=selector_for("http://localhost:8000/dataset/rfc/informational")),
                NewsCriteria('bcp', 'Best Current Practice RFCs',
                             selector=selector_for("http://localhost:8000/dataset/rfc/best-current-practice")),
                NewsCriteria('experimental', 'Experimental RFCs',
                             selector=selector_for("http://localhost:8000/dataset/rfc/experimental")),
                NewsCriteria('historic', 'Historic RFCs',
                             selector=selector_for("http://localhost:8000/dataset/rfc/historic")),
                NewsCriteria('unknown', 'Unknown',
                             selector=selector_for("http://localhost:8000/dataset/rfc/unknown")),
                NewsCriteria('standards', 'Standards Track RFCs',
                             selector=selector_for("http://localhost:8000/dataset/rfc/standard"))]

    def frontpage_content(self, primary=False):
        from rdflib import URIRef
        items = ""
        for entry in islice(self.news_entries(), 5):
            graph = Graph()
            with self.store.open_distilled(entry.basefile) as fp:
                graph.parse(data=fp.read())

            data = {
                'identifier': graph.value(URIRef(entry.id), self.ns['dcterms'].identifier).toPython(),
                'uri': entry.id,
                'title': entry.title}
            items += '<li>%(identifier)s <a href="%(uri)s">%(title)s</a></li>' % data
        return ("""<h2><a href="%(uri)s">Request for comments</a></h2>
                   <p>A complete archive of RFCs in Linked Data form. Contains %(doccount)s documents.</p>
                   <p>Latest 5 documents:</p>
                   <ul>
                      %(items)s
                   </ul>""" % {'uri': self.dataset_uri(),
                               'items': items,
                               'doccount': len(list(self.store.list_basefiles_for("_postgenerate")))})

    def tabs(self):
        return [("RFCs", self.dataset_uri())]