# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
from collections import defaultdict, deque

from rdflib import URIRef

from ferenda import Describer, DocumentRepository
from ferenda.elements import (Body, CompoundElement, OrderedList,
                              OrdinalElement, Paragraph, UnorderedList)
# More TODO: create test/files/repo/eut/source/all.json like
# {'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:C:2008:115:0001:01:EN:HTML': 'treaties.html'}
#
# Create downloaded/tfeu.html and teu.html (same content or possibly a bit shortened)
#
# Create distilled/tfeu.ttl and distilled/teu.ttl, should include structural elements
#
# Create parsed/tfeu.xhtml and parsed/teu.xhtml.
#
# Then done!
# The general outline of a treaty is:
# <Body> C
# <Paragraph> C (unicode/Link) - starting and ending titles
# <Preamble> C
# <Paragraph> - the typographic term, aka "Stycke"
# <Part> CO - not present for TEU
# <Title> CO
# <Chapter> CO
# <Section> CO
# <Article> CO
# <Subarticle> CO
# <Paragraph> C
# <unicode>
# <Link>
# <UnorderedList leader="dash"> C
# <ListItem> C
# <OrderedList type="letter"> CO
# or should we have a class method ontology_uri, complementary to canonical_uri/dataset_uri?
# Base URI of the vocabulary used in this module; the rdf_type URIs of the
# element classes and of the repository are built by appending a term name.
vocab_uri = "http://lagen.nu/eurlex#"
class PreambleRecital(CompoundElement, OrdinalElement):
    """Container element for the preamble of a treaty."""
class Part(CompoundElement, OrdinalElement):
    """The most top-level structural element. Only used for TFEU, not TEU."""
class Title(CompoundElement, OrdinalElement):
    """Top-level structural element encompassing chapters, sections, articles etc.

    NB: this is completely different from ferenda.elements.Title.
    """
class Chapter(CompoundElement, OrdinalElement):
    """Structural element introduced by a "CHAPTER <number>" heading."""
class Section(CompoundElement, OrdinalElement):
    """Structural element introduced by a "SECTION <number>" heading."""
class Article(CompoundElement, OrdinalElement):
    """A numbered article; an addressable resource with fragment label "A"."""

    # FIXME: extend CompoundElement.as_xhtml to check for rdf_type and use it
    # as an @about attribute (using a make_graph() graph to qname it)
    rdf_type = URIRef(vocab_uri + "Article")
    fragment_label = "A"
class Subarticle(CompoundElement, OrdinalElement):
    """A numbered subdivision of an article; fragment label "P"."""

    rdf_type = URIRef(vocab_uri + "SubArticle")
    fragment_label = "P"
class ListItem(CompoundElement):
    """A single item of an ordered or unordered list; fragment label "L"."""

    rdf_type = URIRef(vocab_uri + "ListItem")
    fragment_label = "L"
class EurlexTreaties(DocumentRepository):
    """Handles the foundation treaties of the European union.

    Two basefiles are handled: "teu" (Treaty on European Union) and
    "tfeu" (Treaty on the Functioning of the European Union). Both are
    contained in the same source document; parsing picks out the relevant
    part, turns its text lines into a tree of structural elements
    (parts, titles, chapters, sections, articles, subarticles and lists)
    and finally assigns ids/URIs to the addressable elements.
    """

    # overrides of superclass variables
    alias = "eut"  # European Union Treaties
    start_url = "http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:C:2008:115:0001:01:EN:HTML"
    document_url_template = "http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:C:2008:115:0001:01:EN:HTML#%(basefile)s"
    rdf_type = URIRef(vocab_uri + "Treaty")

    #
    # Downloading
    def download(self, basefile=None):
        """Download the source document once for each treaty.

        :param basefile: accepted for interface compatibility; not used.
        """
        # NB: The very same document contains both TEU and TFEU. We download
        # it twice (wasting some storage space) and let parse() pick out the
        # relevant parts.
        self.download_single("teu")
        self.download_single("tfeu")

    #
    # Parsing -- FIXME: this should be easily ported to FSMParser
    #
    # Line recognizers. Each one is the bound match/search method of a
    # compiled pattern, so it is called with a line and returns a match
    # object or None. Raw strings keep the regex escapes intact.
    re_part = re.compile(r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN)$").match
    re_title = re.compile(r"TITLE ([IVX]+)$").match
    re_chapter = re.compile(r"CHAPTER (\d+)$").match
    re_section = re.compile(r"SECTION (\d+)$").match
    re_article = re.compile(r"Article (\d+)$").match
    re_subarticle = re.compile(r"^(\d+)\. ").search
    re_unorderedliststart = re.compile(r"^- ").search
    re_orderedliststart = re.compile(r"^\(\w\) ").search
    re_romanliststart = re.compile(r"^\([ivx]+\) ").search

    # Maps the spelled-out ordinal of a part heading to its number.
    ordinal_list = ('ONE', 'TWO', 'THREE', 'FOUR', 'FIVE', 'SIX', 'SEVEN',
                    'EIGHT', 'NINE', 'TEN', 'ELEVEN', 'TWELVE')
    ordinal_dict = dict(zip(ordinal_list, range(1, len(ordinal_list) + 1)))

    def parse_metadata_from_soup(self, soup, doc):
        """Record the rdf type and the English title of the treaty.

        :param soup: the parsed source document (not used here).
        :param doc: the document being built; its ``uri`` is set from
            ``canonical_uri`` if empty, and its ``meta`` graph is described.
        """
        if not doc.uri:
            doc.uri = self.canonical_uri(doc.basefile)
        desc = Describer(doc.meta, doc.uri)
        desc.rdftype(self.rdf_type)
        # the basefile lives on the document; there is no local "basefile"
        if doc.basefile == "teu":
            desc.value(self.ns['dcterms'].title,
                       "Treaty on European Union", lang="en")
        elif doc.basefile == "tfeu":
            desc.value(self.ns['dcterms'].title,
                       "Treaty on the Functioning of the European Union",
                       lang="en")

    def parse_document_from_soup(self, soup, doc):
        """Extract the text lines of one treaty and build ``doc.body``.

        The treaties are delimited in the source by text nodes consisting
        of 50 dashes; the second such node starts TEU and the third starts
        TFEU. All following sibling ``<p>`` elements with simple string
        content are collected until the next delimiter.

        :param soup: the parsed source document.
        :param doc: the document being built; ``doc.body`` is assigned.
        :raises ValueError: if ``doc.basefile`` is neither "teu" nor "tfeu".
        """
        separator = "-" * 50
        if doc.basefile == "teu":
            startnode = soup.findAll(text=separator)[1].parent
        elif doc.basefile == "tfeu":
            startnode = soup.findAll(text=separator)[2].parent
        else:
            raise ValueError("Unknown basefile %r" % doc.basefile)

        lines = deque()
        for p in startnode.findNextSiblings("p"):
            if p.string == separator:
                self.log.info("found the end")
                break
            if p.string:
                lines.append(str(p.string))
        doc.body = self.make_body(lines)
        self.process_body(doc.body, '', doc.uri)

    def make_body(self, lines):
        """Build the top-level Body element, consuming all of ``lines``.

        :param lines: deque of text lines; emptied by this call.
        :returns: a Body containing the preamble, titles, parts and any
            other lines as plain paragraphs.
        """
        b = Body()
        while lines:
            line = lines.popleft()
            if line == "PREAMBLE":
                b.append(self.make_preamble(lines))
            elif self.re_title(line):
                # the sub-parser expects to see the heading line itself
                lines.appendleft(line)
                b.append(self.make_title(lines))
            elif self.re_part(line):
                lines.appendleft(line)
                b.append(self.make_part(lines))
            else:
                b.append(Paragraph([line]))
        return b

    def make_preamble(self, lines):
        """Collect paragraphs until the first part or title heading.

        :param lines: deque of text lines, positioned after "PREAMBLE".
        :returns: a PreambleRecital; the terminating heading is left
            in ``lines``.
        """
        p = PreambleRecital(title="PREAMBLE")
        while lines:
            line = lines.popleft()
            if self.re_part(line) or self.re_title(line):
                lines.appendleft(line)
                return p
            p.append(Paragraph([line]))
        self.log.warning("make_preamble ran out of lines!")
        return p

    def make_part(self, lines):
        """Build a Part from its heading line, its title line and contents.

        :param lines: deque whose first line matches ``re_part`` and whose
            second line is the title of the part.
        :returns: the Part; a following part heading is left in ``lines``.
        """
        partnumber = lines.popleft()
        ordinal = self.ordinal_dict[self.re_part(partnumber).group(1)]
        parttitle = lines.popleft()
        p = Part(ordinal=ordinal, ordinaltitle=partnumber, title=parttitle)
        while lines:
            line = lines.popleft()
            if self.re_part(line):
                lines.appendleft(line)
                return p
            elif self.re_title(line):
                lines.appendleft(line)
                p.append(self.make_title(lines))
            elif self.re_article(line):
                lines.appendleft(line)
                p.append(self.make_article(lines))
            else:
                p.append(Paragraph([line]))
                self.log.warning(
                    "make_part appended naked Paragraph '%s...'", line[:25])
        return p

    def make_title(self, lines):
        """Build a Title from its heading line, its title line and contents.

        :param lines: deque whose first line matches ``re_title`` and whose
            second line is the title text.
        :returns: the Title; a following part or title heading is left
            in ``lines``.
        """
        titlenumber = lines.popleft()
        # NOTE(review): _from_roman is not defined in the visible part of
        # this class; presumably provided elsewhere -- confirm.
        ordinal = self._from_roman(self.re_title(titlenumber).group(1))
        titletitle = lines.popleft()
        t = Title(ordinal=ordinal, ordinaltitle=titlenumber, title=titletitle)
        while lines:
            line = lines.popleft()
            if self.re_part(line) or self.re_title(line):
                lines.appendleft(line)
                return t
            elif self.re_chapter(line):
                lines.appendleft(line)
                t.append(self.make_chapter(lines))
            elif self.re_article(line):
                lines.appendleft(line)
                t.append(self.make_article(lines))
            else:
                t.append(Paragraph([line]))
                self.log.warning(
                    "make_title appended naked Paragraph '%s...'", line[:25])
        return t

    def make_chapter(self, lines):
        """Build a Chapter from its heading line, title line and contents.

        :param lines: deque whose first line matches ``re_chapter`` and
            whose second line is the chapter title.
        :returns: the Chapter; a following part, title or chapter heading
            is left in ``lines``.
        """
        chapternumber = lines.popleft()
        ordinal = int(self.re_chapter(chapternumber).group(1))
        chaptertitle = lines.popleft()
        c = Chapter(
            ordinal=ordinal, ordinaltitle=chapternumber, title=chaptertitle)
        while lines:
            line = lines.popleft()
            if (self.re_part(line) or
                    self.re_title(line) or
                    self.re_chapter(line)):
                lines.appendleft(line)
                return c
            elif self.re_section(line):
                lines.appendleft(line)
                c.append(self.make_section(lines))
            elif self.re_article(line):
                lines.appendleft(line)
                c.append(self.make_article(lines))
            else:
                c.append(Paragraph([line]))
                self.log.warning(
                    "make_chapter appended naked Paragraph '%s...'",
                    line[:25])
        return c

    def make_section(self, lines):
        """Build a Section from its heading line, title line and contents.

        :param lines: deque whose first line matches ``re_section`` and
            whose second line is the section title.
        :returns: the Section; a following part, title, chapter or section
            heading is left in ``lines``.
        """
        sectionnumber = lines.popleft()
        ordinal = int(self.re_section(sectionnumber).group(1))
        sectiontitle = lines.popleft()
        s = Section(
            ordinal=ordinal, ordinaltitle=sectionnumber, title=sectiontitle)
        while lines:
            line = lines.popleft()
            if (self.re_part(line) or
                    self.re_title(line) or
                    self.re_chapter(line) or
                    self.re_section(line)):
                lines.appendleft(line)
                return s
            elif self.re_article(line):
                lines.appendleft(line)
                s.append(self.make_article(lines))
            else:
                s.append(Paragraph([line]))
                self.log.warning(
                    "make_section appended naked Paragraph '%s...'",
                    line[:25])
        return s

    def make_article(self, lines):
        """Build an Article from its heading line and contents.

        An optional line starting with "(ex Article" directly after the
        heading is stored as ``exarticlenumber``.

        :param lines: deque whose first line matches ``re_article``.
        :returns: the Article; any following structural heading
            (part, title, chapter, section, article) is left in ``lines``.
        """
        articlenumber = lines.popleft()
        ordinal = int(self.re_article(articlenumber).group(1))
        self.log.info("Making article: %s" % ordinal)
        # Peek instead of pop so that an article heading on the very last
        # line does not raise IndexError.
        if lines and lines[0].startswith("(ex Article"):
            a = Article(ordinal=ordinal, ordinaltitle=articlenumber,
                        exarticlenumber=lines.popleft())
        else:
            a = Article(ordinal=ordinal, ordinaltitle=articlenumber)
        while lines:
            line = lines.popleft()
            if (self.re_part(line) or
                    self.re_title(line) or
                    self.re_chapter(line) or
                    self.re_section(line) or
                    self.re_article(line)):
                lines.appendleft(line)
                return a
            elif self.re_subarticle(line):
                lines.appendleft(line)
                a.append(self.make_subarticle(lines))
            elif self.re_unorderedliststart(line):
                lines.appendleft(line)
                a.append(self.make_unordered_list(lines, "dash"))
            elif self.re_orderedliststart(line):
                lines.appendleft(line)
                a.append(self.make_ordered_list(lines, "lower-alpha"))
            else:
                a.append(Paragraph([line]))
        return a

    def make_subarticle(self, lines):
        """Build a Subarticle starting at a line such as "1. ...".

        :param lines: deque whose first line matches ``re_subarticle``.
        :returns: the Subarticle; a following structural heading or a
            subarticle line with a different number is left in ``lines``.
        """
        # The numbered line stays in the deque: it is also the first
        # paragraph of the subarticle and is consumed by the loop below.
        subarticlenum = int(self.re_subarticle(lines[0]).group(1))
        s = Subarticle(ordinal=subarticlenum)
        while lines:
            line = lines.popleft()
            if (self.re_part(line) or
                    self.re_title(line) or
                    self.re_chapter(line) or
                    self.re_section(line) or
                    self.re_article(line)):
                lines.appendleft(line)
                return s
            submatch = self.re_subarticle(line)
            if submatch and int(submatch.group(1)) != subarticlenum:
                lines.appendleft(line)
                return s
            elif self.re_unorderedliststart(line):
                lines.appendleft(line)
                s.append(self.make_unordered_list(lines, "dash"))
            elif self.re_orderedliststart(line):
                lines.appendleft(line)
                s.append(self.make_ordered_list(lines, "lower-alpha"))
            else:
                # this is OK
                s.append(Paragraph([line]))
        return s

    def make_unordered_list(self, lines, style):
        """Collect consecutive "- " lines into an UnorderedList.

        :param lines: deque positioned at the first list line.
        :param style: value passed on as the ``style`` of the list.
        :returns: the UnorderedList; the first non-list line is left
            in ``lines``.
        """
        ul = UnorderedList(style=style)
        while lines:
            line = lines.popleft()
            if not self.re_unorderedliststart(line):
                lines.appendleft(line)
                return ul
            ul.append(ListItem([line]))
        return ul

    def make_ordered_list(self, lines, style):
        """Collect "(a) ..." or "(i) ..." lines into an OrderedList.

        A roman-numbered run inside a "lower-alpha" list becomes a nested
        "lower-roman" list.

        :param lines: deque positioned at the first list line.
        :param style: "lower-alpha" or "lower-roman".
        :returns: the OrderedList; the line that ended it is left
            in ``lines``.
        """
        ol = OrderedList(style=style)
        while lines:
            line = lines.popleft()
            # try romanliststart before orderedliststart -- (i) matches
            # both, but is likely the former
            if self.re_romanliststart(line):
                if style == "lower-roman":
                    ol.append(ListItem([line]))
                else:
                    lines.appendleft(line)
                    ol.append(self.make_ordered_list(lines, "lower-roman"))
            elif self.re_orderedliststart(line):
                if style == "lower-alpha":
                    ol.append(ListItem([line]))
                else:
                    # we were in a roman-style sublist, so we should pop up
                    lines.appendleft(line)
                    return ol
            else:
                lines.appendleft(line)
                return ol
        return ol

    # Post-process the document tree in a recursive fashion in order to:
    #
    # Find addressable units (resources that should have unique URI:s,
    # e.g. articles and subarticles) and construct IDs for them, like
    # "A7", "A25(b)(ii)" (or A25S1P2N2 or...?)
    #
    # How should we handle Articles themselves -- they have individual
    # CELEX numbers and therefore URIs (but subarticles don't)?
    def process_body(self, element, prefix, baseuri):
        """Recursively assign ``id`` and ``attrs`` to addressable elements.

        An element is addressable if it has a ``fragment_label``. Its
        number is its ``ordinal`` if it has one, otherwise its running
        count among siblings of the same type.

        :param element: the element whose children are processed; plain
            strings are ignored.
        :param prefix: fragment id of the enclosing addressable element.
        :param baseuri: URI of the enclosing addressable element.
        """
        if isinstance(element, str):
            return
        counters = defaultdict(int)
        for p in element:
            counters[type(p)] += 1
            if hasattr(p, 'fragment_label'):  # an addressable resource
                elementtype = p.fragment_label
                if hasattr(p, 'ordinal'):
                    elementordinal = p.ordinal
                else:
                    elementordinal = counters[type(p)]
                fragment = "%s%s%s" % (prefix, elementtype, elementordinal)
                if elementtype == "A":
                    uri = "%s%03d" % (baseuri, elementordinal)
                else:
                    uri = "%s%s%s" % (baseuri, elementtype, elementordinal)
                p.id = fragment
                # the element classes name this attribute rdf_type
                p.attrs = {'id': p.id,
                           'about': uri,
                           'typeof': p.rdf_type}
                if elementtype == "A":
                    # children of an article hang off the article URI
                    # as fragments
                    uri += "#"
            else:
                fragment = prefix
                uri = baseuri
            self.process_body(p, fragment, uri)