# Source code for ferenda.sources.general.sitenews

from datetime import datetime
import codecs
import os
import re
import tempfile

import bs4
from rdflib import URIRef, Literal, Namespace
from rdflib.namespace import DCTERMS, RDF
SCHEMA = Namespace("http://schema.org/")

from ferenda import DocumentRepository, Facet, Feedset, DocumentEntry
from ferenda import util
from ferenda.elements import UnorderedList, ListItem, Body
from ferenda.elements.html import elements_from_soup, Div, DL, DT, DD, Img, A
from ferenda.decorators import managedparsing

class Sitenews(DocumentRepository):
    """Generates a set of news documents from a single text file.

    This is a simple way of creating a feed of news about the site
    itself, with permalinks for individual posts and an Atom feed for
    subscribing in a feed reader.

    The text file is loaded by `ferenda.ResourceLoader`, so it can be
    placed in any resource directory for any repo used. By default,
    the resource name is "static/sitenews.txt", but this can be changed
    with `config.newsfile`.

    The text file should be structured with each post/entry having a
    header line, followed by an empty line, then the body of the
    post. The body ends when a new header line (or EOF) is
    encountered. The header line should be formatted like
    `<YYYY-MM-DD HH:MM:SS> <Entry title>`, i.e. an ISO 8601-style
    date and time separated by a space.

    The body should be a regular HTML fragment.

    """

    # Repository alias; toc() includes it in its per-repo log message.
    alias = "sitenews"
    # NOTE(review): presumably the suffix that store.downloaded_path() gives
    # the per-entry files written by download() -- confirm against the store.
    downloaded_suffix = ".txt"
    # rdf:type that parse() assigns to every news entry.
    rdf_type = SCHEMA.BlogPosting  # or maybe just schema:Article
    # NOTE(review): namespaces, sparql_annotations and news_sortkey are not
    # read by any code visible in this class; presumably they are consumed
    # by DocumentRepository -- confirm there.
    namespaces = ['rdf', 'rdfs', 'xsd', 'xsi', 'dcterms', 'prov', 'schema']
    sparql_annotations = None
    news_sortkey = 'published'
    # Link text that parse_entry_summary() appends to the summary of entries
    # whose body holds more than the abstract; a false value disables the link.
    readmore_label = 'Read more...'

    @classmethod
    def get_default_options(cls):
        """Return the inherited default options extended with ``newsfile``.

        ``newsfile`` names the resource that holds all news entries; its
        default is ``static/sitenews.txt``.
        """
        defaults = super(Sitenews, cls).get_default_options()
        defaults['newsfile'] = 'static/sitenews.txt'
        return defaults

    # Bound ``match`` method of the compiled header-line pattern: group 1 is
    # the ``YYYY-MM-DD HH:MM:SS`` timestamp, group 2 is the rest of the line
    # (the entry title). download() calls it on every line of the news file
    # to detect where a new entry starts.
    re_news_subjectline = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (.*)').match

    def download(self):
        """Split the news file into one downloaded file per news entry.

        The resource named by ``config.newsfile`` is read as UTF-8. Every
        line matching ``re_news_subjectline`` starts a new entry, whose
        basefile is the header timestamp expressed as whole seconds since
        the epoch. Each entry is first written (as UTF-8) to a temporary
        file, which is then handed to ``util.replace_if_different``
        together with the entry's downloaded path; an info message is
        logged whenever that call returns a true value.

        Lines that appear before the first header line belong to no entry
        and are skipped, and a news file without any header line produces
        no entries at all.

        Any exception raised while splitting is propagated after the
        current output handle has been closed and a leftover temporary
        file has been removed.
        """
        newsfile = self.resourceloader.filename(self.config.newsfile)
        ofp = temppath = path = basefile = None
        try:
            with codecs.open(newsfile, encoding="utf-8") as fp:
                for line in fp:
                    m = self.re_news_subjectline(line)
                    if m:
                        if ofp is not None:
                            # a new header ends the previous entry
                            ofp.close()
                            if util.replace_if_different(temppath, path):
                                self.log.info("%s: creating news item" % basefile)
                        d = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
                        # NOTE: d is a naive datetime, so timestamp()
                        # interprets it as local time.
                        basefile = str(int(d.timestamp()))
                        path = self.store.downloaded_path(basefile)
                        util.ensure_dir(path)
                        fileno, temppath = tempfile.mkstemp(text=True)
                        # The input is decoded as UTF-8, so write UTF-8 as
                        # well instead of relying on the locale's encoding.
                        ofp = os.fdopen(fileno, "w", encoding="utf-8")
                    if ofp is None:
                        # text before the first header line belongs to no entry
                        continue
                    ofp.write(line)
            if ofp is not None:
                # flush the last entry of the file
                ofp.close()
                if util.replace_if_different(temppath, path):
                    self.log.info("%s: download OK (creating news item)" % basefile)
        except Exception:
            # Don't leave an open handle or a stray temp file behind when
            # splitting fails half-way; the error itself is re-raised.
            if ofp is not None:
                ofp.close()
            # The temp file may no longer exist once it has been handed to
            # util.replace_if_different, hence the existence check.
            if temppath is not None and os.path.exists(temppath):
                os.unlink(temppath)
            raise
                

    @managedparsing
    def parse(self, doc):
        """Fill ``doc`` from the downloaded text file of one news entry.

        The file is split at the first blank line into a header line
        (``<date> <time> <title>``) and an HTML body. The entry's
        rdf:type, ``dcterms:issued`` and ``dcterms:title`` are added to
        ``doc.meta``; the body is wrapped in a ``sitenews-item`` div and
        converted with ``elements_from_soup``; and the first element of
        that div is wrapped in a ``Div`` marked up as ``dcterms:abstract``.
        Finally the document entry's ``published`` timestamp is set and
        saved.

        Returns ``True``.
        """
        raw = util.readfile(self.store.downloaded_path(doc.basefile))
        header, content = raw.split("\n\n", 1)
        day, clock, heading = header.split(" ", 2)
        published = datetime.strptime("%s %s" % (day, clock), "%Y-%m-%d %H:%M:%S")

        # timestamp goes into dcterms:issued, title into dcterms:title
        subject = URIRef(doc.uri)
        doc.meta.add((subject, RDF.type, self.rdf_type))
        doc.meta.add((subject, DCTERMS.issued, Literal(published)))
        doc.meta.add((subject, DCTERMS.title, Literal(heading, lang=doc.lang)))

        # parse the body fragment inside a wrapper div
        markup = "<div class='sitenews-item'>" + content + "</div>"
        soup = bs4.BeautifulSoup(markup, "lxml")
        doc.body = elements_from_soup(soup.body)

        # mark the first element of the entry as dcterms:abstract (XMLLiteral)
        first_element = doc.body[0][0]
        doc.body[0][0] = Div([first_element],
                             datatype="rdf:XMLLiteral",
                             property="dcterms:abstract")

        # The abstract must also be present in doc.meta right away:
        # parse_entry_summary() reads it from there.
        abstract_source = content.split("\n\n")[0]
        doc.meta.add((subject, DCTERMS.abstract,
                      Literal(abstract_source, datatype=RDF.XMLLiteral)))

        self.parse_entry_update(doc)  # need to set published and possibly updated
        entry = DocumentEntry(self.store.documententry_path(doc.basefile))
        entry.published = published
        entry.save()
        return True

    def parse_entry_summary(self, doc):
        """Return the feed summary for ``doc``.

        The summary is the ``dcterms:abstract`` value stored in
        ``doc.meta``. When ``readmore_label`` is set and the entry's
        wrapper element (``doc.body[0]``) holds more than one child, a
        link to the entry's canonical URI, labelled with
        ``readmore_label``, is inserted before every ``</p>`` of the
        abstract and the result is returned as a new literal with the
        same datatype.

        The abstract is returned unchanged when ``readmore_label`` is
        empty or the entry consists of the abstract only; ``None`` is
        returned when ``doc.meta`` holds no abstract.
        """
        summary = doc.meta.value(URIRef(doc.uri), DCTERMS.abstract)
        # Nothing to decorate without an abstract, and no link to add
        # when the label has been disabled.
        if summary is None or not self.readmore_label:
            return summary
        if len(doc.body[0]) > 1:
            permalink = self.canonical_uri(doc.basefile)
            readmore_link = " <a href='%s'>%s</a>" % (permalink, self.readmore_label)
            summarytext = summary.replace("</p>", "%s</p>" % readmore_link)
            summary = Literal(summarytext, datatype=summary.datatype)
        return summary


    def facets(self):
        """Return this repo's facets: a single facet on ``dcterms:issued``."""
        issued_facet = Facet(DCTERMS.issued)
        return [issued_facet]

    # Title that toc() passes to toc_generate_page() for the index page.
    toc_title = "All news feeds"
    def toc(self, otherrepos):
        """Generate one index page listing the news feeds of all repos.

        This repo and every repo in ``otherrepos`` is considered. A repo
        is skipped when its ``config.tabs`` is false or when its
        ``tabs()`` returns nothing. For each remaining repo, the label of
        its first tab is paired with one ``toc_item`` per feed found in
        the repo's news feedsets, and a summary line is logged. The
        resulting list is passed to ``toc_generate_page`` to create a
        single page called ``index``, titled ``toc_title``.

        :param otherrepos: list of the other repositories to include
        """
        documentlist = []
        # create just one single page: no leftnav, contains only a
        # sort-of nested list
        for repo in [self] + otherrepos:
            if not repo.config.tabs:
                continue
            tabs = repo.tabs()
            if not tabs:
                # if a repo doesn't provide any tabs, it's probably
                # mostly for internal use (like static and mediawiki)
                # -- let's just skip it
                continue
            feeds = []
            documentlist.append((tabs[0][0], feeds))
            feedsets = repo.news_feedsets(repo.news_facet_entries(),
                                          repo.facets())
            for feedset in feedsets:
                for feed in feedset.feeds:
                    row = {'title': feed.title,
                           'uri': repo.dataset_uri(param=feed.binding,
                                                   value=feed.slug,
                                                   feed=True),
                           'feeduri': repo.dataset_uri(param=feed.binding,
                                                       value=feed.slug,
                                                       feed=".atom")}
                    feeds.append(self.toc_item('title', row))
            self.log.info("sitenews.toc: Added %s feeds in %s feedsets for %s" %
                          (len(feeds), len(feedsets), repo.alias))
        self.toc_generate_page(None, None, documentlist, [], "index", title=self.toc_title)

    def toc_item(self, binding, row):
        """Return the pair of links that represents one feed on the index page.

        The first link wraps the Atom icon and points at ``row['feeduri']``;
        the second shows ``row[binding]`` and points at ``row['uri']``.
        """
        atom_icon = Img(alt="Atom feed",
                        src="/rsrc/img/atom.png",
                        width=14,
                        height=14)
        feed_link = A([atom_icon], href=row['feeduri'])
        page_link = A(row[binding], href=row['uri'])
        return [feed_link, page_link]
        
    def toc_generate_page_body(self, documentlist, nav):
        """Build the page body: ``nav`` followed by a definition list of feeds.

        :param documentlist: iterable of ``(label, doclist)`` pairs; each
                             label becomes a ``DT`` and each item of its
                             doclist a ``DD``
        :param nav: navigation element placed first in the body
        :returns: a ``Body`` containing ``nav`` and the definition list
        """
        dl = DL(**{'class': 'dl-horizontal'})
        for label, doclist in documentlist:
            dl.append(DT(label))
            for doc in doclist:
                dl.append(DD(doc))
        return Body([nav, dl])

    def tabs(self):
        """Return the "News" tab pointing at the dataset URI.

        An empty list is returned when ``config.tabs`` is false.
        """
        if not self.config.tabs:
            return []
        return [("News", self.dataset_uri())]