# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from builtins import *
# A number of different classes each fetching the same data from
# different sources (and with different data formats and data fidelity)
import os
import re
import functools
import codecs
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from rdflib import Literal, URIRef, Namespace
from rdflib.namespace import DCTERMS, XSD, RDFS
import requests
from . import (SwedishLegalSource, SwedishLegalStore, FixedLayoutSource,
Trips, Regeringen, RPUBL, Offtryck)
from .elements import *
from ferenda import CompositeRepository, CompositeStore
from ferenda import TextReader
from ferenda import util
from ferenda import PDFAnalyzer, Facet, FSMParser
from ferenda.decorators import downloadmax, recordlastdownload, newstate
from ferenda.elements import Body, Heading, ListItem, Paragraph
from ferenda.errors import DocumentRemovedError
from ferenda.compat import urljoin
from ferenda.pdfreader import Page
def dir_sanitize_identifier(identifier):
    """Normalize a kommittédirektiv identifier to the form ``Dir. YYYY:N``.

    Common sanitizer shared by the direktiv repositories in this module.
    A series of known irregular prefixes and separators are rewritten
    (eg ``"Direktiv 2006.44"``, ``"Dir:1994:111"``, ``"dir.1994:111"``),
    after which the result is validated.

    :param identifier: The raw identifier string. Falsy values (``None``
                       or an empty string) are returned unchanged.
    :returns: A ``Literal`` wrapping the normalized identifier, or the
              falsy input as-is.
    :raises ValueError: If the identifier, after mangling, does not match
                        ``Dir. YYYY:N`` with a year starting with 19 or 20
                        and a 1-3 digit number without leading zero.
    """
    if not identifier:
        return identifier  # allow infer_identifier to do its magic later

    # Rewrite the known prefix variants, in this exact order, since each
    # step may produce input for a later one.
    if identifier.startswith("Direktiv "):
        identifier = identifier.replace("Direktiv ", "Dir. ")
    if identifier.startswith("Dir. dir. "):
        identifier = identifier.replace("dir. ", "")
    if identifier.startswith("Dir "):
        identifier = identifier.replace("Dir ", "Dir. ")
    if identifier.startswith("dir."):
        identifier = identifier.replace("dir.", "Dir.")
    if identifier.startswith("Dir:"):
        identifier = identifier.replace("Dir:", "Dir.")

    # "Dir.1994:111" -> "Dir. 1994:111"
    if re.match(r"Dir\.\d+", identifier):
        identifier = "Dir. " + identifier[4:]

    # "Dir. 2006.44" -> "Dir. 2006:44"
    if re.match(r"Dir\. \d+\.\d+", identifier):
        # Replace the rightmost "." with a ":" by doing a single
        # left-to-right replacement on the reversed string.
        identifier = identifier[::-1].replace(".", ":", 1)[::-1]

    if not identifier.startswith("Dir. "):
        identifier = "Dir. " + identifier

    if not re.match(r"Dir\. (19|20)\d{2}:[1-9]\d{0,2}$", identifier):
        raise ValueError("Irregular identifier %s (after mangling)" % identifier)
    return Literal(identifier)
# custom style analyzer
class DirAnalyzer(PDFAnalyzer):
    """Style analyzer customized for kommittédirektiv PDF documents."""

    # direktiv has no footers
    footer_significance_threshold = 0

    def analyze_styles(self, styles):
        """Map the font styles found in a document to semantic roles.

        :param styles: Counter-like object of font styles; must support
                       ``most_common()`` and ``keys()``.
        :returns: dict with the keys ``default`` and ``title`` and, when
                  such styles could be identified, ``h1`` and ``h2``. Each
                  value is the result of ``self.fontdict`` for the style.
        """
        # The most frequently used style is taken to be the body text.
        body_style = styles.most_common(1)[0][0]
        result = {'default': self.fontdict(body_style)}

        # Rank every style by font size, biggest first:
        #   largest      - the text "Kommittédirektiv" on the frontpage
        #   2nd largest  - the title
        #   3rd largest  - the id (eg "Dir. 2014:158") on the frontpage
        #   4th largest  - h1 (same size as body text but bold)
        #   5th largest  - h2 (same size as body text but italic)
        # The largest one is of no interest, so only ranks 2-5 are kept.
        by_size = sorted(styles.keys(), key=self.fontsize_key, reverse=True)
        candidates = by_size[1:5]

        heading1 = None
        heading2 = None
        if len(candidates) < 3:
            # only happens for dir 1991:49, which does not use any
            # header styles except the title style
            title_style, _id_style = candidates
        elif len(candidates) < 4:
            # might be the case if no h2:s are ever used
            title_style, _id_style, heading1 = candidates
        else:
            title_style, _id_style, heading1, heading2 = candidates
            if heading2 == body_style:
                # what looked like h2 was really the default style,
                # meaning no h2:s are used in the doc
                heading2 = None

        result['title'] = self.fontdict(title_style)
        for role, style in (('h1', heading1), ('h2', heading2)):
            if style:
                result[role] = self.fontdict(style)
        return result
class Continuation(object):
    """Empty class without any attributes or behaviour of its own."""
class DirTrips(Trips):
    """Downloads Direktiv in plain text format from http://rkrattsbaser.gov.se/dir/"""

    alias = "dirtrips"
    start_url = "http://rkrattsbaser.gov.se/dir/adv?sort=asc"
    document_url_template = "http://rkrattsbaser.gov.se/dir?bet=%(basefile)s"
    rdf_type = RPUBL.Kommittedirektiv
    urispace_segment = "dir"
    storage_policy = "file"

    @recordlastdownload
    def download(self, basefile=None):
        """Download one document, or run the inherited bulk download.

        :param basefile: If given, only this document is downloaded.
        :returns: Whatever the inherited ``download`` returns.

        For a bulk download where ``config.lastdownload`` is set and
        ``config.refresh`` is not, the start URL is narrowed with a
        ``UDAT`` range from 30 days before the last download until today.
        """
        if basefile:
            return super(DirTrips, self).download(basefile)
        if ('lastdownload' in self.config and self.config.lastdownload
                and not self.config.refresh):
            startdate = self.config.lastdownload - timedelta(days=30)
            # NOTE(review): this appends to self.start_url in place, so a
            # second bulk download on the same instance would append the
            # parameter once more.
            self.start_url += "&UDAT=%s+till+%s" % (
                startdate.strftime("%Y-%m-%d"),
                datetime.now().strftime("%Y-%m-%d"))
        return super(DirTrips, self).download()

    def downloaded_to_intermediate(self, basefile, attachment=None):
        """Return the result of ``self._extract_text`` for *basefile*."""
        return self._extract_text(basefile)

    def extract_head(self, fp, basefile):
        """Read and return the header part that precedes the body text.

        The header is everything before the first run of 64 dashes found
        within the first 2048 units read from *fp*. On return, *fp* is
        positioned after that separator.

        :raises ValueError: If no such separator is found.
        """
        textheader = fp.read(2048)
        if not isinstance(textheader, str):
            # Depending on whether the fp is opened through standard
            # open() or bz2.BZ2File() in self.parse_open(), it might
            # return bytes or unicode strings. This seem to be a
            # problem in BZ2File (or how we use it). Just roll with it.
            textheader = bytes(textheader)
            textheader = textheader.decode(self.source_encoding, errors="ignore")
        idx = textheader.index("-" * 64)
        header = textheader[:idx]
        # Skip past the header plus 66 more positions -- presumably the 64
        # dashes and a two-character line ending (TODO confirm).
        fp.seek(len(header) + 66)
        return header

    def extract_metadata(self, rawheader, basefile):  # -> dict
        """Turn the raw text header into a dict of metadata.

        The first header paragraph is expected to be ``<title>,
        <identifier>``; following paragraphs are ``<key>: <value>`` pairs
        where the key must be ``Departement`` or ``Beslut``.

        :raises DocumentRemovedError: If the title is "Utgår".
        :raises KeyError: If a header key is not one of the known ones.
        """
        predicates = {'Departement': "rpubl:departement",
                      'Beslut': "rpubl:beslutsdatum"}
        headers = [x.strip() for x in rawheader.split("\n\n") if x.strip()]
        title, identifier = headers[0].rsplit(", ", 1)
        d = self.metadata_from_basefile(basefile)
        d.update({'dcterms:identifier': identifier.strip(),
                  'dcterms:title': title.strip()})
        if d['dcterms:title'] == "Utgår":
            raise DocumentRemovedError("%s: Removed" % basefile,
                                       dummyfile=self.store.parsed_path(basefile))
        for header in headers[1:]:
            # Split on the first colon only, so that a value which itself
            # contains a colon does not break the unpacking.
            key, val = header.split(":", 1)
            d[predicates[key.strip()]] = val.strip()
        d["dcterms:publisher"] = self.lookup_resource("Regeringskansliet")
        if "rpubl:beslutsdatum" in d:
            d["dcterms:issued"] = d["rpubl:beslutsdatum"]  # best we can do
        return d

    def sanitize_identifier(self, identifier):
        """Normalize *identifier* using ``dir_sanitize_identifier``."""
        return dir_sanitize_identifier(identifier)

    def sanitize_rubrik(self, rubrik):
        """Return the title as a Swedish-language ``Literal``.

        A leading ``/r2/ `` marker is removed.

        :raises DocumentRemovedError: If the title is "Utgår".
        """
        if rubrik == "Utgår":
            raise DocumentRemovedError()
        rubrik = re.sub("^/r2/ ", "", rubrik)
        return Literal(rubrik, lang="sv")

    def extract_body(self, fp, basefile):
        """Read the rest of *fp* and return it wrapped in a ``TextReader``."""
        rawtext = fp.read()
        if isinstance(rawtext, bytes):  # happens when creating the intermediate file
            rawtext = rawtext.decode(self.source_encoding)
        # remove whitespace on otherwise empty lines
        rawtext = re.sub("\n\t\n", "\n\n", rawtext)
        reader = TextReader(string=rawtext,
                            linesep=TextReader.UNIX)
        return reader

    def parse_body_parseconfigs(self):
        """Return the names of the parse configurations, in order."""
        return ("default", "simple")

    def get_parser(self, basefile, sanitized, parseconfig="default"):
        """Build an ``FSMParser`` for direktiv text and return its ``parse``.

        :param parseconfig: ``"simple"`` leaves out the recognizers for
                            numbered sections and subsections; any other
                            value uses the full set of recognizers.
        """

        # -- recognizers -------------------------------------------------
        def is_header(parser):
            p = parser.reader.peek()
            # older direktiv sources start with dir number
            if re.match(r'Dir\.? \d{4}:\d+$', p):
                return False
            return (headerlike(p) and
                    not is_strecksats(parser, parser.reader.peek(2)))

        def is_strecksats(parser, chunk=None):
            if chunk is None:
                chunk = parser.reader.peek()
            return chunk.startswith(("--", "- "))

        def is_section(parser):
            (ordinal, headingtype, title) = analyze_sectionstart(parser)
            if ordinal:
                return headingtype == "h1"

        def is_subsection(parser):
            (ordinal, headingtype, title) = analyze_sectionstart(parser)
            if ordinal:
                return headingtype == "h2"

        def is_paragraph(parser):
            return True

        # -- constructors ------------------------------------------------
        @newstate('body')
        def make_body(parser):
            return parser.make_children(Body())

        @newstate('section')
        def make_section(parser):
            chunk = parser.reader.next()
            ordinal, headingtype, title = analyze_sectionstart(parser, chunk)
            s = Avsnitt(ordinal=ordinal, title=title)
            return parser.make_children(s)

        @newstate('strecksats')
        def make_strecksatslista(parser):
            ul = Strecksatslista()
            li = make_listitem(parser)
            ul.append(li)
            res = parser.make_children(ul)
            return res

        def make_listitem(parser):
            chunk = parser.reader.next()
            s = str(chunk)
            if " " in s:
                # assume text before first space is the bullet
                s = s.split(" ", 1)[1]
            else:
                # assume the bullet is a single char
                s = s[1:]
            return Strecksatselement([s])

        def make_header(parser):
            return Heading([parser.reader.next()])

        def make_paragraph(parser):
            return Paragraph([parser.reader.next()])

        @newstate('unorderedsection')
        def make_unorderedsection(parser):
            s = UnorderedSection(title=parser.reader.next().strip())
            return parser.make_children(s)

        # -- helpers -----------------------------------------------------
        def headerlike(p):
            # Header-like text starts with a character that changes when
            # lowercased, is shorter than 150 characters and does not end
            # with a full stop -- unless that full stop belongs to one of
            # the abbreviations "m.m." / "m.fl." (with or without space).
            return (p[0].lower() != p[0]
                    and len(p) < 150
                    and not (p.endswith(".") and
                             not p.endswith(("m.m.", "m. m.",
                                             "m.fl.", "m. fl."))))

        re_sectionstart = re.compile(r"^(\d[\.\d]*) +([A-ZÅÄÖ].*)$").match

        def analyze_sectionstart(parser, chunk=None):
            """returns (ordinal, headingtype, text) if it looks like a section
            heading, (None, None, chunk) otherwise."""
            if chunk is None:
                chunk = parser.reader.peek()
            m = re_sectionstart(chunk)
            if m and headerlike(m.group(2)):
                # The heading level is derived from the number of dots in
                # the ordinal: "1" -> h1, "1.2" -> h2 and so on.
                return (m.group(1),
                        "h" + str(m.group(1).count(".") + 1),
                        m.group(2).strip())
            else:
                return None, None, chunk

        # -- parser setup ------------------------------------------------
        p = FSMParser()
        if parseconfig == "simple":
            recognizers = [is_header, is_strecksats, is_paragraph]
        else:
            recognizers = [is_section,
                           is_subsection,
                           is_header,
                           is_strecksats,
                           is_paragraph]
        p.set_recognizers(*recognizers)
        commonstates = ("body", "section", "subsection", "unorderedsection")
        p.set_transitions({(commonstates, is_paragraph): (make_paragraph, None),
                           (commonstates, is_strecksats): (make_strecksatslista, "strecksats"),
                           (commonstates, is_header): (make_unorderedsection, "unorderedsection"),
                           (commonstates, is_section): (make_section, "section"),
                           ("unorderedsection", is_header): (False, None),
                           ("unorderedsection", is_section): (False, None),
                           ("strecksats", is_paragraph): (False, None),
                           ("strecksats", is_strecksats): (make_listitem, None),
                           ("section", is_header): (False, None),
                           ("section", is_section): (False, None),
                           ("section", is_subsection): (make_section, "subsection"),
                           ("subsection", is_subsection): (False, None),
                           ("subsection", is_section): (False, None)})
        p.initial_state = "body"
        p.initial_constructor = make_body
        p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
        return p.parse

    def tokenize(self, reader):
        """Return an iterator over the paragraphs of *reader*."""
        return reader.getiterator(reader.readparagraph)
class DirAsp(FixedLayoutSource):
    """Downloads Direktiv in PDF format from http://rkrattsdb.gov.se/kompdf/"""

    alias = "dirasp"
    # FIXME: these url should start with http://rkrattsdb.gov.se/, but
    # on at least some systems we have some IPv4/IPv6 problems with
    # that URI similar to what required the config.ipbasedurls option
    # in trips.py -- maybe we need something similar here (or fix our
    # systems at a lower level...)
    start_url = "http://193.188.157.100/kompdf/search.asp"
    document_url = "http://193.188.157.100/KOMdoc/%(yy)02d/%(yy)02d%(num)04d.PDF"
    source_encoding = "iso-8859-1"
    rdf_type = RPUBL.Kommittedirektiv
    storage_policy = "dir"
    # these defs are to play nice with SwedishLegalSource.get_parser
    KOMMITTEDIREKTIV = "dir"
    PROPOSITION = SOU = DS = None
    document_type = KOMMITTEDIREKTIV
    urispace_segment = "dir"

    def download(self, basefile=None):
        """Download one document, or every document found through search.

        :param basefile: If given, only this document is downloaded (using
                         the inherited implementation).

        In bulk mode, the department names are read from the ``<option>``
        elements of the search form and one search per department is made.
        """
        if basefile:
            return super(DirAsp, self).download(basefile)
        resp = requests.get(self.start_url)
        soup = BeautifulSoup(resp.text, "lxml")
        depts = [opt['value'] for opt in soup.find_all("option", value=True)]
        for basefile, url in self.download_get_basefiles(depts):
            # since the server doesn't support conditional caching and
            # direktivs are basically never updated once published, we
            # avoid even calling download_single if we already have
            # the doc -- unless a refresh has been explicitly requested.
            if (self.config.refresh or
                    not os.path.exists(self.store.downloaded_path(basefile))):
                self.download_single(basefile, url)

    @downloadmax
    def download_get_basefiles(self, depts):
        """Yield ``(basefile, url)`` for every search hit, per department.

        :param depts: Iterable of department names to search for.
        """
        for dept in depts:
            resp = requests.post(urljoin(self.start_url, 'sql_search_rsp.asp'),
                                 {'departement': dept.encode('latin-1'),
                                  'kom_nr': '',
                                  'title': '',
                                  'ACTION': ' SÖK '.encode('latin-1')})
            soup = BeautifulSoup(resp.text, "lxml")
            hits = list(soup.find_all(True, text=re.compile(r'(\d{4}:\d+)')))
            self.log.debug("Searching for dept %s, %d results" % (dept, len(hits)))
            for hit in hits:
                link = hit.find_parent("a")
                if link is None:
                    # Matching text outside of any link cannot be turned
                    # into a document URL, so it is skipped.
                    self.log.debug("Skipping hit %r (not inside a link)" % hit)
                    continue
                # convert 2006:02 to 2006:2 for consistency
                segments = re.search(r"(\d+):(\d+)", link.text).groups()
                basefile = ":".join([str(int(x)) for x in segments])
                # we use link.absolute_url rather than relying on our
                # own basefile -> url code in remote_url. It seems
                # that in least one case the URL formatting rule is
                # not followed by the system...
                yield basefile, urljoin(self.start_url, link['href'])

    def remote_url(self, basefile):
        """Construct the PDF URL for *basefile* from ``document_url``.

        Uses the two last digits of a four-digit year and the number
        following the colon, eg "2006:2" gives yy=6 and num=2.
        """
        yy = int(basefile[2:4])
        num = int(basefile[5:])
        return self.document_url % {'yy': yy, 'num': num}

    def metadata_from_basefile(self, basefile):
        """Extend the inherited metadata with year and serial number.

        *basefile* is split on its first colon into ``rpubl:arsutgava``
        and ``rpubl:lopnummer``.
        """
        a = super(DirAsp, self).metadata_from_basefile(basefile)
        a["rpubl:arsutgava"], a["rpubl:lopnummer"] = basefile.split(":", 1)
        return a

    def infer_identifier(self, basefile):
        """Return the identifier ``"Dir. <basefile>"``."""
        return "Dir. %s" % basefile

    def postprocess_doc(self, doc):
        """Pick up title and decision date, and rebuild ``doc.body``.

        The textbox following the one reading "Kommittédirektiv" is added
        as ``dcterms:title``, and the date of a textbox starting with
        "Beslut vid regeringssammanträde den " as ``dcterms:issued``. The
        body is replaced with a new ``Body`` where every ``Page`` object
        is represented by a ``Sidbrytning`` and everything else is kept.
        """
        date_prefix = "Beslut vid regeringssammanträde den "

        def never_glue(x, y, z):
            # Keep every textbox separate.
            return False

        next_is_title = False
        newbody = Body()
        for para in doc.body.textboxes(gluefunc=never_glue, pageobjects=True):
            strpara = str(para).strip()
            if strpara == "Kommittédirektiv":
                next_is_title = True
            elif next_is_title:
                doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(strpara)))
                next_is_title = False
            elif strpara.startswith(date_prefix):
                datestr = strpara[len(date_prefix):]
                if datestr.endswith("."):
                    datestr = datestr[:-1]
                doc.meta.add((URIRef(doc.uri), DCTERMS.issued,
                              Literal(self.parse_swedish_date(datestr),
                                      datatype=XSD.date)))
            if isinstance(para, Page):
                newbody.append(Sidbrytning(ordinal=para.number,
                                           width=para.width,
                                           height=para.height,
                                           src=para.src))
            else:
                newbody.append(para)
        doc.body = newbody
class DirRegeringen(Regeringen):
    """Downloads Direktiv in PDF format from http://www.regeringen.se/"""

    alias = "dirregeringen"
    cssfiles = ['pdfview.css']
    jsfiles = ['pdfviewer.js']
    # Patterns for finding a basefile ("YYYY:N") in text; the strict one
    # requires the exact "Dir. " prefix, the lax one makes it optional.
    re_basefile_strict = re.compile(r'Dir\. (\d{4}:\d+)')
    re_basefile_lax = re.compile(r'(?:[Dd]ir\.?|) ?(\d{4}:\d+)')
    # Patterns for finding year and number in a document URL.
    re_urlbasefile_strict = re.compile(r"kommittedirektiv/\d+/\d+/[a-z]*\.?-?(\d{4})(\d+)-?/$")
    re_urlbasefile_lax = re.compile(r"kommittedirektiv/\d+/\d+/.*?(\d{4})_?(\d+)")
    rdf_type = RPUBL.Kommittedirektiv
    document_type = Regeringen.KOMMITTEDIREKTIV

    def sanitize_identifier(self, identifier):
        """Normalize *identifier* using ``dir_sanitize_identifier``."""
        return dir_sanitize_identifier(identifier)

    def infer_identifier(self, basefile):
        """Return the identifier ``"Dir. <basefile>"``."""
        return "Dir. %s" % basefile
# Inherits list_basefiles_for from CompositeStore and
# basefile_to_pathfrag from SwedishLegalStore.
class DirektivStore(CompositeStore, SwedishLegalStore):
    """Store that merely combines ``CompositeStore`` and ``SwedishLegalStore``.

    Nothing is added or overridden here; all behaviour comes from the two
    base classes.
    """
# Does parsing, generating etc from base files:
class Direktiv(CompositeRepository, FixedLayoutSource):
    """A composite repository containing ``DirTrips``, ``DirAsp`` and ``DirRegeringen``."""

    subrepos = DirRegeringen, DirAsp, DirTrips
    alias = "dir"
    xslt_template = "xsl/forarbete.xsl"
    storage_policy = "dir"
    rdf_type = RPUBL.Kommittedirektiv
    documentstore_class = DirektivStore
    sparql_annotations = "sparql/describe-with-subdocs.rq"
    sparql_expect_results = False

    # NB: The same logic as in
    # ferenda.sources.legal.se.{Regeringen,Riksdagen}.metadata_from_basefile
    # news() needs to be able to compute URI from basefile, so we need
    # to reimplement this logic. Maybe that's stupid as there should
    # already be a distilled RDF file available in
    # distilled/[BASEFILE].rdf...
    def metadata_from_basefile(self, basefile):
        """Extend the inherited metadata with year and serial number.

        *basefile* is split on its first colon into ``rpubl:arsutgava``
        and ``rpubl:lopnummer``.
        """
        a = super(Direktiv, self).metadata_from_basefile(basefile)
        a["rpubl:arsutgava"], a["rpubl:lopnummer"] = basefile.split(":", 1)
        return a

    def facets(self):
        """Return the inherited facets plus one for ``dcterms:title``.

        The added facet is created with ``toplevel_only=False``.
        """
        return super(Direktiv, self).facets() + [Facet(DCTERMS.title,
                                                       toplevel_only=False)]