Source code for ferenda.sources.general.wiki

# -*- coding: utf-8 -*-
from __future__ import unicode_literals

# system
from tempfile import mktemp
import random
import re
import os
from six import text_type as str
from six import binary_type as bytes

# 3rdparty
from lxml import etree
from rdflib import Namespace, URIRef, Literal
import requests

# mine
from ferenda import DocumentRepository, DocumentStore
from ferenda import util
from ferenda.errors import ConfigurationError
from ferenda.sources.general import Keyword
# from keywords import Keyword

try:
    from ferenda.thirdparty.mw import Parser, Semantics, Settings, Preprocessor
except ImportError as e:
    import sys
    if sys.version_info < (2, 7):
        raise RuntimeError("ferenda.sources.general.Wiki is not supported under python 2.6: %s" % str(e))
    else:
        raise e # dunno
        
import unicodedata

class MediaWikiStore(DocumentStore):
    def basefile_to_pathfrag(self, basefile):
        return basefile.replace(":", os.sep).replace(" ", "_")

    def pathfrag_to_basefile(self, pathfrag):
        # This unicode normalization turns "a" + U+0308 (COMBINING
# DIAERESIS) into an honest 'ä'. This is an issue on mac file
        # systems. FIXME: should this be a part of
        # DocumentStore.pathfrag_to_basefile?
        return unicodedata.normalize("NFC", pathfrag.replace("_", " ").replace(os.sep, ":"))
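
    # Illustrative round-trip sketch (not part of the original module; the
    # datadir below is a placeholder). basefile_to_pathfrag and
    # pathfrag_to_basefile are inverses of each other, mapping wiki page
    # titles to filesystem fragments and back (assuming a POSIX os.sep):
    #
    #   store = MediaWikiStore(datadir="data/mediawiki")
    #   store.basefile_to_pathfrag("Kategori:Allmän handling")
    #   # -> "Kategori/Allmän_handling"
    #   store.pathfrag_to_basefile("Kategori/Allmän_handling")
    #   # -> "Kategori:Allmän handling" (NFC-normalized)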


class MediaWiki(DocumentRepository):

    """Downloads content from a Mediawiki system and converts it to
    annotations on other documents.

    For efficient downloads, this docrepo requires that there exists
    an XML dump (created by `dumpBackup.php
    <http://www.mediawiki.org/wiki/Manual:DumpBackup.php>`_) of the
    mediawiki contents that can be fetched over HTTP/HTTPS. Configure
    the location of this dump using the ``mediawikidump`` parameter::

        [mediawiki]
        class = ferenda.sources.general.MediaWiki
        mediawikidump = http://localhost/wiki/allpages-dump.xml

    .. note::

       This docrepo relies on the smc.mw module, which doesn't work on
       python 2.6, only 2.7 and newer.

    """
    alias = "mediawiki"
    downloaded_suffix = ".xml"
    documentstore_class = MediaWikiStore
    rdf_type = Namespace(util.ns['skos']).Concept
    keyword_class = Keyword
    namespaces = ['rdf', 'skos', 'prov', 'dcterms']

    def __init__(self, config=None, **kwargs):
        super(MediaWiki, self).__init__(config, **kwargs)
        if self.config._parent and hasattr(self.config._parent, 'keyword'):
            self.keywordrepo = self.keyword_class(self.config._parent.keyword)
        else:
            self.keywordrepo = self.keyword_class()

    def get_default_options(self):
        opts = super(MediaWiki, self).get_default_options()
        # The API endpoint URLs change with MW language
        opts['mediawikiexport'] = 'http://localhost/wiki/Special:Export/%(basefile)s'
        opts['mediawikidump'] = 'http://localhost/wiki/allpages-dump.xml'
        opts['mediawikinamespaces'] = ['Category']
        # process pages in this namespace (as well as pages in the
        # default namespace)
        return opts

    def download(self, basefile=None):
        if basefile:
            return self.download_single(basefile)
        if self.config.mediawikidump:
            xmldumppath = self.store.path('dump', 'downloaded', '.xml')
            try:
                resp = requests.get(self.config.mediawikidump)
                self.log.info("Loaded XML dump from %s" %
                              self.config.mediawikidump)
                with self.store._open(xmldumppath, mode="wb") as fp:
                    fp.write(resp.content)
            except Exception:
                # try to load a previously downloaded dump instead
                pass
            # xml = etree.parse(resp.content)
            xml = etree.parse(xmldumppath)
        else:
            raise ConfigurationError("config.mediawikidump not set")

        MW_NS = "{%s}" % xml.getroot().nsmap[None]
        wikinamespaces = []
        # FIXME: Find out the proper value of MW_NS
        for ns_el in xml.findall("//" + MW_NS + "namespace"):
            wikinamespaces.append(ns_el.text)

        # Get list of existing basefiles - if any of those
        # does not appear in the XML dump, remove them afterwards
        basefiles = list(self.store.list_basefiles_for("parse"))
        total = written = 0
        for page_el in xml.findall(MW_NS + "page"):
            basefile = page_el.find(MW_NS + "title").text
            if basefile == "Huvudsida":
                continue
            if ":" in basefile and basefile.split(":")[0] in wikinamespaces:
                (namespace, localtitle) = basefile.split(":", 1)
                if namespace not in self.config.mediawikinamespaces:
                    continue
            writefile = False
            p = self.store.downloaded_path(basefile)
            newcontent = etree.tostring(page_el, encoding="utf-8")
            if not os.path.exists(p):
                writefile = True
            else:
                oldcontent = util.readfile(p, "rb")
                if newcontent != oldcontent:
                    writefile = True
            if writefile:
                util.ensure_dir(p)
                with open(p, "wb") as fp:
                    fp.write(newcontent)
                self.log.info("%s: extracting from XML dump" % basefile)
                written += 1
            if basefile in basefiles:
                del basefiles[basefiles.index(basefile)]
            total += 1

        if 'dump' in basefiles:  # never remove
            del basefiles[basefiles.index('dump')]
        for b in basefiles:
            self.log.info("%s: removing stale document" % b)
            util.robust_remove(self.store.downloaded_path(b))
        self.log.info("Examined %s documents, wrote %s of them" %
                      (total, written))
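
    # Rough sketch (an assumption, not taken from the original source) of the
    # dump document that download() iterates over: a standard MediaWiki
    # dumpBackup.php export, where the <namespace> elements sit under
    # <siteinfo> and each <page> carries its wikitext in <revision>/<text>.
    # The export version in the xmlns is just an example.
    #
    #   <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/">
    #     <siteinfo>
    #       <namespaces>
    #         <namespace key="14">Category</namespace>
    #         ...
    #       </namespaces>
    #     </siteinfo>
    #     <page>
    #       <title>Some concept</title>
    #       <revision>
    #         <text>'''Some concept''' is ...</text>
    #       </revision>
    #     </page>
    #   </mediawiki>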

    def download_single(self, basefile):
        # download a single term, for speed
        url = self.config.mediawikiexport % {'basefile': basefile}
        self.download_if_needed(url, basefile)

    re_anchors = re.compile('(<a.*?</a>)', re.DOTALL)
    re_anchor = re.compile('<a[^>]*>(.*)</a>', re.DOTALL)
    re_tags = re.compile('(</?[^>]*>)', re.DOTALL)

    # NOTE: What is this thing, really? Is it a wiki document by
    # itself, or is it metadata about a concept identified by a
    # keyword / label?
    def parse_metadata_from_soup(self, soup, doc):
        super(MediaWiki, self).parse_metadata_from_soup(soup, doc)
        # remove dcterms:identifier because it's pointless
        doc.meta.remove((URIRef(doc.uri),
                         self.ns['dcterms'].identifier,
                         Literal(doc.basefile)))

    def parse_document_from_soup(self, soup, doc):
        wikitext = soup.find("text").text
        parser = self.get_wikiparser()
        settings = self.get_wikisettings()
        semantics = self.get_wikisemantics(parser, settings)
        preprocessor = self.get_wikipreprocessor(settings)
        # the main responsibility of the preprocessor is to expand templates
        wikitext = preprocessor.expand(doc.basefile, wikitext)
        xhtml = parser.parse(wikitext, "document", filename=doc.basefile,
                             semantics=semantics, trace=False)
        doc.body = self.postprocess(doc, xhtml)
        return None

    def canonical_uri(self, basefile):
        # by default, a wiki page is expected to describe a
        # concept/keyword -- so we use our associated Keyword repo to
        # find its uri.
        return self.keywordrepo.canonical_uri(basefile)

    def get_wikiparser(self):
        return Parser(parseinfo=False, whitespace='', nameguard=False)

    def get_wikisemantics(self, parser, settings):
        return WikiSemantics(parser, settings)

    def get_wikisettings(self):
        return WikiSettings(lang=self.lang)

    def get_wikipreprocessor(self, settings):
        return WikiPreprocessor(settings)

    def postprocess(self, doc, xhtmltree, toplevel_property=True):
        body = xhtmltree.getchildren()[0]
        # render_xhtml_tree will add @about
        if toplevel_property:
            # shouldn't add these in postprocess_commentary mode
            body.set("property", "dcterms:description")
            body.set("datatype", "rdf:XMLLiteral")
        containerdiv = etree.Element("div")
        for child in body:
            body.remove(child)
            containerdiv.append(child)
        body.append(containerdiv)

        # find any links that indicate that this concept has the
        # dcterms:subject of something (typically indicated by
        # Category tags)
        for subjectlink in xhtmltree.findall(".//a[@rel='dcterms:subject']"):
            # add metadata
            doc.meta.add((URIRef(doc.uri),
                          self.ns['dcterms'].subject,
                          URIRef(subjectlink.get("href"))))
            # remove from tree
            parent = subjectlink.getparent()
            parent.remove(subjectlink)
            # if the containing element is empty, remove as well
            if not (len(parent) or parent.text or parent.tail):
                parent.getparent().remove(parent)

        # convert xhtmltree to a ferenda.Elements tree
        root = self.elements_from_node(xhtmltree)
        return root[0]

    def elements_from_node(self, node):
        from ferenda.elements.html import _tagmap
        assert node.tag in _tagmap
        element = _tagmap[node.tag](**node.attrib)
        if node.text and node.text.strip():
            element.append(str(node.text))
        for child in node:
            if isinstance(child, str):
                element.append(str(child))
            else:
                subelement = self.elements_from_node(child)
                if subelement is not None:
                    element.append(subelement)
                if child.tail and child.tail.strip():
                    element.append(str(child.tail))
        return element

    @classmethod
    def generate_all_setup(cls, config):
        # This is not a document repository that produces its own
        # pages -- rather, it creates description metadata (through
        # download/parse/relate) that other repos (primarily Keyword)
        # can use. Therefore, we return False in this setup method to
        # signify that no work needs to be done.
        return False

    def toc(self, otherrepos=[]):
        # and no toc either
        return

    def news(self, otherrepos=[]):
        # nor newsfeeds
        return

    def tabs(self):
        return []

    def frontpage_content(self, primary=False):
        return

    # # differ from the default relate_triples in that it uses a different
    # # context for every basefile and clears this beforehand.
    # # Note that a basefile can contain statements
    # # about multiple and changing subjects, so it's not trivial to erase all
    # # statements that stem from a basefile w/o a dedicated context.
    # def relate_triples(self, basefile):
    #     context = self.dataset_uri() + "#" + basefile.replace(" ", "_")
    #     ts = self._get_triplestore()
    #     with util.logtime(self.log.debug,
    #                       "%(basefile)s: Added %(rdffile)s to context %(context)s (%(elapsed).3f sec)",
    #                       {'basefile': basefile,
    #                        'context': context,
    #                        'rdffile': self.store.distilled_path(basefile),
    #                        'triplestore': self.config.storelocation}):
    #         data = open(self.store.distilled_path(basefile)).read()
    #         ts.clear(context=context)
    #         ts.add_serialized(data, format="xml", context=context)
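
# Illustrative subclassing sketch (not part of the original module; all names
# below are hypothetical). A site-specific docrepo can adapt the wiki
# handling by overriding the get_wiki* hooks and the keyword_class attribute
# defined above, e.g.:
#
#   class MyProjectWiki(MediaWiki):
#       alias = "mywiki"
#       keyword_class = MyProjectKeyword  # a hypothetical Keyword subclass
#
#       def get_wikisettings(self):
#           # force a specific language instead of relying on self.lang
#           return WikiSettings(lang="sv")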


class WikiSemantics(Semantics):

    def document(self, ast):
        html = super(WikiSemantics, self).document(ast)
        # Remove the newly-created toc. If postprocess_toc were a
        # Semantics method we could just override it in this subclass;
        # as it is, we have to rip it out after the fact.
        toc = html.find(".//div[@id='toc']")
        if toc is not None:
            toc.getparent().remove(toc)
        return html

    def internal_link(self, ast):
        el = super(WikiSemantics, self).internal_link(ast)
        target = "".join(ast.target).strip()
        name = self.settings.canonical_page_name(target)
        if name[0].prefix == 'category':
            el.set("rel", "dcterms:subject")
        return el


class WikiSettings(Settings):

    def make_url(self, name, **kwargs):
        uri = super(WikiSettings, self).make_url(name, **kwargs)
        return uri


class WikiPreprocessor(Preprocessor):

    def get_template(self, namespace, pagename):
        # FIXME: This is a special hack for supporting
        # {{DISPLAYTITLE}} (not a proper template? Check if smc.mw is
        # supposed to have support for wgAllowDisplayTitle)
        if pagename.startswith("DISPLAYTITLE:"):
            pagename = "DISPLAYTITLE"
        if namespace.prefix != "template":
            return None
        tmpl = self.settings.templates.get((namespace.prefix, pagename), None)
        return tmpl
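
# Minimal usage sketch (an assumption, not from the original source): with a
# reachable XML dump, the repo can also be exercised directly from Python
# rather than through the ferenda-build.py command line. The datadir and URL
# below are placeholders.
#
#   repo = MediaWiki(datadir="data",
#                    mediawikidump="http://localhost/wiki/allpages-dump.xml")
#   repo.download()  # fetches the dump and writes one XML file per wiki page
#   for basefile in repo.store.list_basefiles_for("parse"):
#       repo.parse(basefile)  # wikitext -> XHTML body + RDF description metadata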