# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from copy import copy
from six import text_type as str
from ferenda.elements import LinkSubject
class CitationParser(object):
    """Finds citations to documents and other resources in text
    strings. Each type of citation is specified by a
    `pyparsing <http://pyparsing.wikispaces.com/Documentation>`_
    grammar, and for each found citation a URI can be constructed
    using a :py:class:`~ferenda.URIFormatter` object.

    :param grammars: The grammar(s) for the citations that this
                     parser should find, in order of priority.
    :type  grammars: list of ``pyparsing.ParserElement`` objects

    Usage:

    >>> from pyparsing import Word,nums
    >>> rfc_grammar = ("RFC " + Word(nums).setResultsName("rfcnumber")).setResultsName("rfccite")
    >>> pep_grammar = ("PEP" +  Word(nums).setResultsName("pepnumber")).setResultsName("pepcite")
    >>> citparser = CitationParser(rfc_grammar, pep_grammar)
    >>> res = citparser.parse_string("The WSGI spec (PEP 333) references RFC 2616 (The HTTP spec)")
    >>> # res is a list of strings and/or (string, pyparsing.ParseResults) tuples
    >>> from ferenda import URIFormatter
    >>> from ferenda.elements import Link
    >>> f = URIFormatter(('rfccite',
    ...                   lambda p: "http://www.rfc-editor.org/rfc/rfc%(rfcnumber)s" % p),
    ...                  ('pepcite',
    ...                   lambda p: "http://www.python.org/dev/peps/pep-0%(pepnumber)s/" % p))
    >>> citparser.set_formatter(f)
    >>> res = citparser.parse_recursive(["The WSGI spec (PEP 333) references RFC 2616 (The HTTP spec)"])
    >>> res == ['The WSGI spec (', Link('PEP 333',uri='http://www.python.org/dev/peps/pep-0333/'), ') references ', Link('RFC 2616',uri='http://www.rfc-editor.org/rfc/rfc2616'), ' (The HTTP spec)']
    True
    """

    def __init__(self, *grammars):
        self._grammars = []
        for grammar in grammars:
            self.add_grammar(grammar)
        # No formatter is available until set_formatter() is called;
        # parse_recursive needs one as soon as it finds a citation.
        self._formatter = None

    def set_formatter(self, formatter):
        """Specify the object used to turn found citations into URIs
        in :py:meth:`~ferenda.CitationParser.parse_recursive`.

        :param formatter: An object with a ``format(parseresult)``
                          method returning a URI, or a false value
                          when no URI can be constructed.
        :type  formatter: :py:class:`~ferenda.URIFormatter`
        """
        self._formatter = formatter

    def add_grammar(self, grammar):
        """Add another grammar.

        Grammars are tried in the order they were added.

        :param grammar: The grammar to add
        :type  grammar: ``pyparsing.ParserElement``
        """
        self._grammars.append(grammar)

    def parse_string(self, string, predicate="dcterms:references"):
        """Find any citations in a text string, using the configured grammars.

        The grammars are applied one after another. Text that an
        earlier grammar has already identified as a citation is not
        scanned again by later grammars.

        :param string: Text to parse for citations
        :type  string: str
        :param predicate: Not used by this method.
        :returns: strings (for parts of the input text that do not contain
                  any citation) and/or tuples (for found citation) consisting
                  of (string, ``pyparsing.ParseResult``)
        :rtype: list
        """
        nodes = [string]  # returned as-is when no grammars are configured
        for grammar in self._grammars:
            res = []
            for node in nodes:
                if not isinstance(node, str):
                    # a (text, match) tuple produced by an earlier grammar
                    res.append(node)
                    continue
                pos = 0  # end offset of the previous match within node
                for match, before, after in grammar.scanString(node):
                    if before > pos:
                        res.append(node[pos:before])
                    res.append((node[before:after], match))
                    pos = after
                if pos < len(node):
                    res.append(node[pos:])
            nodes = res
        return nodes

    def parse_recursive(self, part, predicate="dcterms:references"):
        """Traverse a nested tree of elements, finding citations in
        any strings contained in the tree. Found citations are marked
        up as :py:class:`~ferenda.elements.Link` elements with the uri
        constructed by the :py:class:`~ferenda.URIFormatter` set by
        :py:meth:`~ferenda.CitationParser.set_formatter`.

        Containers are modified in place (their children are replaced
        by the parsed children) and returned. A string is returned as
        a list of text and link nodes. Citations for which the
        formatter yields no URI are left as ordinary text, and
        adjacent pieces of text are merged into a single node. Nodes
        that are neither strings nor iterable are returned unchanged.

        A formatter must have been set before any citation is
        encountered; otherwise the attempt to format it raises
        ``AttributeError``.

        :param part: The root element of the structure to parse
        :type  part: list
        :param predicate: Passed on as the ``predicate`` of every
                          link element that is created.
        :returns: a correspondingly nested structure.
        :rtype: list
        """
        if not isinstance(part, str):
            try:
                subparts = iter(part)
            except TypeError:
                # A leaf that is neither text nor a container: there is
                # nothing to search, so keep it as it is.
                return part
            res = []
            for subpart in subparts:
                if isinstance(subpart, str):
                    # a string may expand into several sibling nodes
                    res.extend(self.parse_recursive(subpart, predicate))
                else:
                    res.append(self.parse_recursive(subpart, predicate))
            # replace our existing subparts/children with the combined
            # result of parse_recursive
            part[:] = res
            return part

        # ok, simple string
        # FIXME: We need to keep track of the URI for the part
        # of the document we're in, so that we can resolve
        # partial/relative references
        res = []
        pending = []  # consecutive text pieces not yet added to res
        for node in self.parse_string(part, predicate):
            if isinstance(node, str):
                pending.append(node)
            elif isinstance(node, tuple):
                (text, parseresult) = node
                uri = self._formatter.format(parseresult)
                if uri:
                    self._emit_text(part, pending, res)
                    res.append(LinkSubject(
                        text, uri=uri, predicate=predicate))
                else:
                    # no URI could be built: treat the citation as
                    # ordinary text so it merges with its neighbours
                    pending.append(text)
        self._emit_text(part, pending, res)
        return res

    @staticmethod
    def _emit_text(part, pending, res):
        """Append the merged text in *pending* to *res* as one node of
        the same type as *part*, then empty *pending*."""
        if not pending:
            return
        text = "".join(pending)
        del pending[:]
        # Exact type check on purpose: subclasses of str must be cloned.
        if type(part) is str:
            res.append(text)
            return
        # handle str-derived types by instantiating that type and
        # cloning its properties, so:
        #
        #     Header("foo 123 baz", lvl=2)
        #
        # can result in:
        #
        #     [Header("foo ", lvl=2),
        #      Link("123", uri="..."),
        #      Header(" baz", lvl=2)]
        replacement = type(part)(text)
        replacement.__dict__ = copy(part.__dict__)
        res.append(replacement)