# Source code for (module name missing in the extracted page)

# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import re
from collections import deque, defaultdict

from rdflib import URIRef

from ferenda import Describer
from ferenda import DocumentRepository
from ferenda.elements import Body, OrderedList, Paragraph, UnorderedList
from ferenda.elements import CompoundElement, OrdinalElement

# More TODO: create test/files/repo/eut/source/all.json like
# {'': 'treaties.html'}
# Create downloaded/tfeu.html and teu.html (same content or possibly a bit shortened)
# Create distilled/tfeu.ttl and distilled/teu.ttl, should include structural elements
# Create parsed/tfeu.xhtml and parsed/teu.xhtml.
# Then done!

# The general outline of a treaty is:
# <Body> C
#   <Paragraph> C (unicode/Link) - starting and ending titles
#   <Preamble> C
#     <Paragraph> - the typographic term, aka "Stycke"
#   <Part> CO - not present for TEU
#     <Title> CO
#       <Chapter> CO
#         <Section> CO
#           <Article> CO
#             <Subarticle> CO
#                <Paragraph> C
#                  <unicode>
#                  <Link>
#                <UnorderedList leader="dash"> C
#                  <ListItem> C
#                <OrderedList type="letter"> CO

# or should we have a class method ontology_uri, complementary to canonical_uri/dataset_uri ?
# Prefix that is prepended to local names ("Article", "Treaty", ...) when the
# rdf_type URIRefs in this module are built. Currently empty.
vocab_uri = ""

class PreambleRecital(CompoundElement, OrdinalElement):
    """Container for the treaty preamble.

    ``EurlexTreaties.make_preamble`` creates one instance (with
    ``title="PREAMBLE"``) and appends a Paragraph for every preamble line.
    """

# the most toplevel structural element. Only used for TFEU, not TEU

class Part(CompoundElement, OrdinalElement):
    """A "PART ONE" .. "PART SEVEN" division of a treaty.

    Built by ``EurlexTreaties.make_part``, which appends titles, articles
    and stray paragraphs to it.
    """

# nb: this is completely different from ferenda.elements.Title -- this title is a toplevel
# structural element that encompasses chapters, sections, articles etc

class Title(CompoundElement, OrdinalElement):
    """A "TITLE <roman numeral>" division of a treaty.

    Built by ``EurlexTreaties.make_title``, which appends chapters, articles
    and stray paragraphs to it.
    """

class Chapter(CompoundElement, OrdinalElement):
    """A "CHAPTER <number>" division of a treaty.

    Built by ``EurlexTreaties.make_chapter``, which appends sections,
    articles and stray paragraphs to it.
    """

class Section(CompoundElement, OrdinalElement):
    """A "SECTION <number>" division of a treaty.

    Built by ``EurlexTreaties.make_section``, which appends articles and
    stray paragraphs to it.
    """

class Article(CompoundElement, OrdinalElement):
    """A numbered treaty article; an addressable element labelled "A"."""
    # FIXME: extend CompoundElement.as_xhtml to check for rdf_type and use it as an @about
    # attribute (using a make_graph() graph to qname it)
    rdf_type = URIRef("%sArticle" % vocab_uri)
    # Presence of fragment_label is what marks an element as addressable
    # when the document tree is post-processed.
    fragment_label = "A"

class Subarticle(CompoundElement, OrdinalElement):
    """A numbered subdivision ("1. ", "2. ", ...) of an article, labelled "P"."""
    rdf_type = URIRef("%sSubArticle" % vocab_uri)
    fragment_label = "P"

class ListItem(CompoundElement):
    """A single item of an ordered or unordered list, labelled "L".

    It has no ordinal of its own, so its position among siblings of the
    same type is used when an id is constructed for it.
    """
    rdf_type = URIRef("%sListItem" % vocab_uri)
    fragment_label = "L"

class EurlexTreaties(DocumentRepository):
    """Handles the foundation treaties of the European union.

    Two basefiles are handled, ``teu`` and ``tfeu``. The downloaded
    document is split into a deque of text lines, which a set of
    ``make_*`` methods consume recursively to build a tree of structural
    elements (parts, titles, chapters, sections, articles, subarticles and
    lists). ``process_body`` then assigns ids and URIs to every element
    that carries a ``fragment_label``.
    """

    # overrides of superclass variables
    alias = "eut"  # European Union Treaties
    start_url = ""
    document_url_template = ""
    rdf_type = URIRef(vocab_uri + "Treaty")

    #
    # Downloading
    def download(self, basefile=None):
        """Download the source document once for each of the two treaties.

        :param basefile: accepted but not used; both treaties are always
                         downloaded.
        """
        # NB: The very same document contains both TEU and TFEU. We download it twice
        # (wasting some storage space) and let parse() pick out the relevant parts.
        self.download_single("teu")
        self.download_single("tfeu")

    #
    # Parsing -- FIXME: this should be easily ported to FSMParser
    #
    # Each recognizer is the bound match/search method of a compiled
    # pattern, so it is called as self.re_xxx(line) and returns a match
    # object or None.
    re_part = re.compile(r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN)$").match
    re_title = re.compile(r"TITLE ([IVX]+)$").match
    re_chapter = re.compile(r"CHAPTER (\d+)$").match
    re_section = re.compile(r"SECTION (\d+)$").match
    re_article = re.compile(r"Article (\d+)$").match
    re_subarticle = re.compile(r"^(\d+)\. ").search
    re_unorderedliststart = re.compile(r"^- ").search
    re_orderedliststart = re.compile(r"^\(\w\) ").search
    re_romanliststart = re.compile(r"^\([ivx]+\) ").search

    ordinal_list = ('ONE', 'TWO', 'THREE', 'FOUR', 'FIVE', 'SIX',
                    'SEVEN', 'EIGHT', 'NINE', 'TEN', 'ELEVEN', 'TWELVE')
    # Maps the spelled-out ordinal to its number: 'ONE' -> 1, 'TWO' -> 2, ...
    ordinal_dict = dict(zip(ordinal_list, range(1, len(ordinal_list) + 1)))

    def parse_metadata_from_soup(self, soup, doc):
        """Record the RDF type and the English title of the treaty.

        :param soup: the parsed source document (not used here)
        :param doc: the document object; ``doc.uri`` is set from
                    ``doc.basefile`` if it is empty, and ``doc.meta``
                    receives the metadata
        """
        if not doc.uri:
            doc.uri = self.canonical_uri(doc.basefile)
        desc = Describer(doc.meta, doc.uri)
        desc.rdftype(self.rdf_type)
        # The basefile lives on the document object; there is no bare
        # ``basefile`` name in scope in this method.
        if doc.basefile == "teu":
            desc.value(self.ns['dcterms'].title,
                       "Treaty on European Union",
                       lang="en")
        elif doc.basefile == "tfeu":
            desc.value(self.ns['dcterms'].title,
                       "Treaty on the Functioning of the European Union",
                       lang="en")

    def parse_document_from_soup(self, soup, doc):
        """Build ``doc.body`` from the part of the source that holds this treaty.

        The source marks the boundaries with text nodes consisting of 50
        dashes. The text of every following ``<p>`` sibling is collected
        until the next such marker is reached.

        :param soup: the parsed source document
        :param doc: the document object whose ``body`` is populated
        :raises ValueError: if ``doc.basefile`` is neither "teu" nor "tfeu"
        """
        separator = "-" * 50
        if doc.basefile == "teu":
            startnode = soup.findAll(text=separator)[1].parent
        elif doc.basefile == "tfeu":
            startnode = soup.findAll(text=separator)[2].parent
        else:
            raise ValueError("Unknown basefile %r (expected 'teu' or 'tfeu')"
                             % doc.basefile)

        lines = deque()
        for p in startnode.findNextSiblings("p"):
            if p.string == separator:
                # NOTE(review): the logging call was reconstructed; the
                # original call target was lost -- confirm the level.
                self.log.debug("found the end")
                break
            if p.string:
                lines.append(str(p.string))

        doc.body = self.make_body(lines)
        self.process_body(doc.body, '', doc.uri)

    def make_body(self, lines):
        """Consume all of ``lines`` and return the top-level Body element.

        Lines that start no preamble, title or part become plain
        Paragraphs (the starting and ending titles of the treaty).
        """
        b = Body()
        while lines:
            line = lines.popleft()
            if line == "PREAMBLE":
                b.append(self.make_preamble(lines))
            elif self.re_title(line):
                # push the heading back; the sub-parser expects to read it
                lines.appendleft(line)
                b.append(self.make_title(lines))
            elif self.re_part(line):
                lines.appendleft(line)
                b.append(self.make_part(lines))
            else:
                b.append(Paragraph([line]))
        return b

    def make_preamble(self, lines):
        """Collect paragraphs into a PreambleRecital until a part or title starts."""
        p = PreambleRecital(title="PREAMBLE")
        while lines:
            line = lines.popleft()
            if self.re_part(line) or self.re_title(line):
                lines.appendleft(line)
                return p
            p.append(Paragraph([line]))
        self.log.warning("make_preamble ran out of lines!")
        return p

    def make_part(self, lines):
        """Build a Part from its heading line, its title line and its content.

        Returns when the next part heading is found (which is left in
        ``lines``) or when ``lines`` is exhausted.
        """
        partnumber = lines.popleft()
        ordinal = self.ordinal_dict[self.re_part(partnumber).group(1)]
        parttitle = lines.popleft()
        p = Part(ordinal=ordinal, ordinaltitle=partnumber, title=parttitle)
        while lines:
            line = lines.popleft()
            if self.re_part(line):
                lines.appendleft(line)
                return p
            elif self.re_title(line):
                lines.appendleft(line)
                p.append(self.make_title(lines))
            elif self.re_article(line):
                lines.appendleft(line)
                p.append(self.make_article(lines))
            else:
                p.append(Paragraph([line]))
                self.log.warning(
                    "make_part appended naked Paragraph '%s...'", line[:25])
        return p

    def make_title(self, lines):
        """Build a Title from its heading line, its title line and its content.

        Returns when a part or title heading is found (left in ``lines``)
        or when ``lines`` is exhausted.
        """
        titlenumber = lines.popleft()
        # NOTE(review): _from_roman is not defined in this file; presumably
        # it is provided by the superclass -- confirm.
        ordinal = self._from_roman(self.re_title(titlenumber).group(1))
        titletitle = lines.popleft()
        t = Title(ordinal=ordinal, ordinaltitle=titlenumber, title=titletitle)
        while lines:
            line = lines.popleft()
            if self.re_part(line) or self.re_title(line):
                lines.appendleft(line)
                return t
            elif self.re_chapter(line):
                lines.appendleft(line)
                t.append(self.make_chapter(lines))
            elif self.re_article(line):
                lines.appendleft(line)
                t.append(self.make_article(lines))
            else:
                t.append(Paragraph([line]))
                self.log.warning(
                    "make_title appended naked Paragraph '%s...'", line[:25])
        return t

    def make_chapter(self, lines):
        """Build a Chapter from its heading line, its title line and its content.

        Returns when a part, title or chapter heading is found (left in
        ``lines``) or when ``lines`` is exhausted.
        """
        chapternumber = lines.popleft()
        ordinal = int(self.re_chapter(chapternumber).group(1))
        chaptertitle = lines.popleft()
        c = Chapter(ordinal=ordinal, ordinaltitle=chapternumber,
                    title=chaptertitle)
        while lines:
            line = lines.popleft()
            if (self.re_part(line) or
                    self.re_title(line) or
                    self.re_chapter(line)):
                lines.appendleft(line)
                return c
            elif self.re_section(line):
                lines.appendleft(line)
                c.append(self.make_section(lines))
            elif self.re_article(line):
                lines.appendleft(line)
                c.append(self.make_article(lines))
            else:
                c.append(Paragraph([line]))
                self.log.warning(
                    "make_chapter appended naked Paragraph '%s...'", line[:25])
        return c

    def make_section(self, lines):
        """Build a Section from its heading line, its title line and its content.

        Returns when a part, title, chapter or section heading is found
        (left in ``lines``) or when ``lines`` is exhausted.
        """
        sectionnumber = lines.popleft()
        ordinal = int(self.re_section(sectionnumber).group(1))
        sectiontitle = lines.popleft()
        s = Section(ordinal=ordinal, ordinaltitle=sectionnumber,
                    title=sectiontitle)
        while lines:
            line = lines.popleft()
            if (self.re_part(line) or
                    self.re_title(line) or
                    self.re_chapter(line) or
                    self.re_section(line)):
                lines.appendleft(line)
                return s
            elif self.re_article(line):
                lines.appendleft(line)
                s.append(self.make_article(lines))
            else:
                s.append(Paragraph([line]))
                self.log.warning(
                    "make_section appended naked Paragraph '%s...'", line[:25])
        return s

    def make_article(self, lines):
        """Build an Article from its heading line and its content.

        An optional following line starting with "(ex Article" is stored
        as ``exarticlenumber``. Content consists of subarticles, lists and
        paragraphs. Returns when any structural heading (part, title,
        chapter, section, article) is found or ``lines`` is exhausted.
        """
        articlenumber = lines.popleft()
        ordinal = int(self.re_article(articlenumber).group(1))
        # NOTE(review): the logging call was reconstructed; the original
        # call target was lost -- confirm the level.
        self.log.debug("Making article: %s", ordinal)
        # Peek rather than pop, so that an article heading at the very end
        # of the input does not raise IndexError.
        if lines and lines[0].startswith("(ex Article"):
            exarticlenumber = lines.popleft()
            a = Article(ordinal=ordinal, ordinaltitle=articlenumber,
                        exarticlenumber=exarticlenumber)
        else:
            a = Article(ordinal=ordinal, ordinaltitle=articlenumber)

        while lines:
            line = lines.popleft()
            if (self.re_part(line) or
                    self.re_title(line) or
                    self.re_chapter(line) or
                    self.re_section(line) or
                    self.re_article(line)):
                lines.appendleft(line)
                return a
            elif self.re_subarticle(line):
                lines.appendleft(line)
                a.append(self.make_subarticle(lines))
            elif self.re_unorderedliststart(line):
                lines.appendleft(line)
                a.append(self.make_unordered_list(lines, "dash"))
            elif self.re_orderedliststart(line):
                lines.appendleft(line)
                a.append(self.make_ordered_list(lines, "lower-alpha"))
            else:
                a.append(Paragraph([line]))
        return a

    def make_subarticle(self, lines):
        """Build a Subarticle starting at the numbered line at the head of ``lines``.

        The numbered line itself stays in ``lines`` and becomes the first
        Paragraph of the subarticle. Returns when a structural heading or
        a subarticle with a different number is found, or when ``lines``
        is exhausted.
        """
        # Only peek: the numbered line is consumed by the loop below.
        subarticlenum = int(self.re_subarticle(lines[0]).group(1))
        s = Subarticle(ordinal=subarticlenum)
        while lines:
            line = lines.popleft()
            if (self.re_part(line) or
                    self.re_title(line) or
                    self.re_chapter(line) or
                    self.re_section(line) or
                    self.re_article(line)):
                lines.appendleft(line)
                return s
            m = self.re_subarticle(line)
            if m and int(m.group(1)) != subarticlenum:
                lines.appendleft(line)
                return s
            elif self.re_unorderedliststart(line):
                lines.appendleft(line)
                s.append(self.make_unordered_list(lines, "dash"))
            elif self.re_orderedliststart(line):
                lines.appendleft(line)
                s.append(self.make_ordered_list(lines, "lower-alpha"))
            else:
                # this is OK
                s.append(Paragraph([line]))
        return s

    def make_unordered_list(self, lines, style):
        """Collect consecutive "- " lines into an UnorderedList of ListItems."""
        ul = UnorderedList(style=style)
        while lines:
            line = lines.popleft()
            if not self.re_unorderedliststart(line):
                lines.appendleft(line)
                return ul
            ul.append(ListItem([line]))
        return ul

    def make_ordered_list(self, lines, style):
        """Collect "(a) " / "(i) " lines into an OrderedList of ListItems.

        :param style: "lower-alpha" or "lower-roman". A roman item found
                      in an alpha list starts a nested roman sublist; an
                      alpha item found in a roman list ends that sublist.
        """
        ol = OrderedList(style=style)
        while lines:
            line = lines.popleft()
            # try romanliststart before orderedliststart -- (i) matches
            # both, but is likely the former
            if self.re_romanliststart(line):
                if style == "lower-roman":
                    ol.append(ListItem([line]))
                else:
                    lines.appendleft(line)
                    ol.append(self.make_ordered_list(lines, "lower-roman"))
            elif self.re_orderedliststart(line):
                if style == "lower-alpha":
                    ol.append(ListItem([line]))
                else:
                    # we were in a roman-style sublist, so we should pop up
                    lines.appendleft(line)
                    return ol
            else:
                lines.appendleft(line)
                return ol
        return ol

    # Post-process the document tree in a recursive fashion in order to:
    #
    # Find addressable units (resources that should have unique URI:s,
    # e.g. articles and subarticles) and construct IDs for them, like
    # "A7", "A25(b)(ii)" (or A25S1P2N2 or...?)
    #
    # How should we handle Articles themselves -- they have individual
    # CELEX numbers and therefore URIs (but subarticles don't)?
    def process_body(self, element, prefix, baseuri):
        """Recursively assign ``id`` and ``attrs`` to addressable elements.

        :param element: the element whose children are processed; plain
                        strings are leaves and are skipped
        :param prefix: the id fragment accumulated from the ancestors
        :param baseuri: the URI accumulated from the ancestors
        """
        if isinstance(element, str):
            return
        # Per-type running count of children, used as the ordinal for
        # addressable elements that have none of their own.
        counters = defaultdict(int)
        for p in element:
            counters[type(p)] += 1
            if hasattr(p, 'fragment_label'):  # this is an addressable resource
                elementtype = p.fragment_label
                if hasattr(p, 'ordinal'):
                    elementordinal = p.ordinal
                else:
                    elementordinal = counters[type(p)]
                fragment = "%s%s%s" % (prefix, elementtype, elementordinal)
                if elementtype == "A":
                    uri = "%s%03d" % (baseuri, elementordinal)
                else:
                    uri = "%s%s%s" % (baseuri, elementtype, elementordinal)
                p.id = fragment
                # The element classes in this module name the attribute
                # rdf_type (with an underscore).
                p.attrs = {'id': p.id,
                           'about': uri,
                           'typeof': p.rdf_type}
                if elementtype == "A":
                    # descendants of an article are fragments of its URI
                    uri += "#"
            else:
                fragment = prefix
                uri = baseuri
            self.process_body(p, fragment, uri)