# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from builtins import *
# From python stdlib
import re
import os
from datetime import datetime, timedelta
# 3rd party modules
import lxml.html
import requests
from rdflib import Literal, URIRef
from rdflib.namespace import SKOS, XSD, DCTERMS, FOAF
from bs4 import BeautifulSoup
# My own stuff
from ferenda import FSMParser, DocumentEntry
from ferenda import decorators, util
from ferenda.elements import Body, Paragraph
from ferenda.errors import DownloadError
from . import RPUBL
from .fixedlayoutsource import FixedLayoutSource, FixedLayoutStore
from .swedishlegalsource import UnorderedSection
from .elements import *
class JOStore(FixedLayoutStore):
def basefile_to_pathfrag(self, basefile):
# store data using years as top-level dir, even though the
# diarienummer are constructed the other way round.
#
# "1000-2004" => "2004/1000"
# "6356-2012" => "2012/6356"
if "-" not in basefile:
return super(JOStore, self).basefile_to_pathfrag(basefile)
no, year = basefile.split("-")
return "%s/%s" % (year, no)
def pathfrag_to_basefile(self, pathfrag):
# "2004/1000" => "1000-2004"
# "2012/6356" => "6356-2012"
        year, no = pathfrag.split("/")
return "%s-%s" % (no, year)
class JO(FixedLayoutSource):
"""Hanterar beslut från Riksdagens Ombudsmän, www.jo.se
Modulen hanterar hämtande av beslut från JOs webbplats i PDF samt
omvandlande av dessa till XHTML.
"""
alias = "jo"
start_url = "http://www.jo.se/sv/JO-beslut/Soka-JO-beslut/?query=*&pn=1"
    document_url_regex = r"http://www.jo.se/PageFiles/(?P<dummy>\d+)/(?P<basefile>\d+-\d+)(?P<junk>[,%\d\-]*)\.pdf"
headnote_url_template = "http://www.jo.se/sv/JO-beslut/Soka-JO-beslut/?query=%(basefile)s&pn=1"
rdf_type = RPUBL.VagledandeMyndighetsavgorande
storage_policy = "dir"
downloaded_suffix = ".pdf"
documentstore_class = JOStore
urispace_segment = "avg/jo"
xslt_template = "xsl/avg.xsl"
sparql_annotations = "sparql/avg-annotations.rq"
sparql_expect_results = False
def metadata_from_basefile(self, basefile):
attribs = super(JO, self).metadata_from_basefile(basefile)
attribs["rpubl:diarienummer"] = basefile
attribs["dcterms:publisher"] = self.lookup_resource(
'JO', SKOS.altLabel)
return attribs
@decorators.action
@decorators.recordlastdownload
def download(self, basefile=None, url=None):
if basefile:
if not url:
entry = DocumentEntry(self.store.documententry_path(basefile))
url = entry.orig_url
if url:
return self.download_single(basefile, url)
else:
raise DownloadError("%s doesn't support downloading single basefiles w/o page URL" %
self.__class__.__name__)
self.session = requests.session()
if ('lastdownload' in self.config and
self.config.lastdownload and
not self.config.refresh):
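            # re-scan a 30-day overlap before the last recorded
            # download (presumably to catch decisions published late or
            # backdated since the last run)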
startdate = self.config.lastdownload - timedelta(days=30)
self.start_url += "&from=%s" % datetime.strftime(startdate, "%Y-%m-%d")
for basefile, url in self.download_get_basefiles(self.start_url):
self.download_single(basefile, url)
@decorators.downloadmax
def download_get_basefiles(self, start_url):
# FIXME: try to download a single result HTML page, since
# there are a few metadata props there.
done = False
url = start_url
pagecount = 1
self.log.debug("Starting at %s" % start_url)
while not done:
nextpage = None
assert "pn=%s" % pagecount in url
soughtnext = url.replace("pn=%s" % pagecount,
"pn=%s" % (pagecount + 1))
self.log.debug("Getting page #%s" % pagecount)
resp = requests.get(url)
tree = lxml.html.document_fromstring(resp.text)
tree.make_links_absolute(url, resolve_base_href=True)
for element, attribute, link, pos in tree.iterlinks():
m = re.match(self.document_url_regex, link)
if m:
yield m.group("basefile"), link
elif link == soughtnext:
nextpage = link
pagecount += 1
if nextpage:
url = nextpage
else:
done = True
def download_single(self, basefile, url):
ret = super(JO, self).download_single(basefile, url)
if ret or self.config.refresh:
headnote_url = self.headnote_url_template % {'basefile': basefile}
resp = requests.get(headnote_url)
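            # the result page contains "1 totalt antal träffar" ("1 hit
            # in total") exactly when the diarienummer query matched a
            # single decision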
if "1 totalt antal träffar" in resp.text:
                # don't save the entire 100+ KB HTML mess when we only
                # want a little 6 KB piece. Disk space is cheap, but not
                # infinite.
soup = BeautifulSoup(resp.text, "lxml").find("div", "MidContent")
soup.find("ol", "breadcrumb").decompose()
soup.find("div", id="SearchSettings").decompose()
with self.store.open_downloaded(basefile, mode="wb", attachment="headnote.html") as fp:
fp.write(soup.prettify().encode("utf-8"))
self.log.debug("%s: downloaded headnote from %s" %
(basefile, headnote_url))
else:
self.log.warning("Could not find unique headnote for %s at %s" %
(basefile, headnote_url))
return ret
def source_url(self, basefile):
return ("http://www.jo.se/sv/JO-beslut/Soka-JO-beslut/"
"?query=%(basefile)s&caseNumber=%(basefile)s" % locals())
def extract_head(self, fp, basefile):
if "headnote.html" in list(self.store.list_attachments(basefile,
"downloaded")):
with self.store.open_downloaded(basefile,
attachment="headnote.html") as fp:
return BeautifulSoup(fp, "lxml")
# else: return None
def infer_identifier(self, basefile):
return "JO dnr %s" % basefile.replace("/", "-")
def extract_metadata(self, rawhead, basefile):
d = self.metadata_from_basefile(basefile)
if rawhead: # sometimes there's no headnote.html
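            # map the Swedish metadata labels on the headnote page to
            # RDF properties: Ämbetsberättelse (the annual report
            # citation), Beslutsdatum (decision date), Diarienummer
            # (case number)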
for label, key in {"Ämbetsberättelse": 'dcterms:bibliographicCitation',
"Beslutsdatum": 'dcterms:issued',
"Diarienummer": 'rpubl:diarienummer'}.items():
labelnode = rawhead.find(text=re.compile("%s:" % label))
if labelnode:
d[key] = util.normalize_space(labelnode.next_sibling.text)
            # this data might contain spurious spaces due to <span
            # class="Definition"> tags -- see e.g. 3128-2002. Data in
            # the document itself is preferable.
d["dcterms:title"] = util.normalize_space(rawhead.find("h2").text)
return d
def polish_metadata(self, attribs, infer_nodes=True):
resource = super(JO, self).polish_metadata(attribs, infer_nodes)
# add a known foaf:name for the publisher to our polished graph
# FIXME/NOTE: In swedishlegalsource.ttl, the foaf:name is
# given as Riksdagens ombudsmän (and thus is used in TOC
# generation). I think that "Justitieombudsmannen" is better.
resource.value(DCTERMS.publisher).add(FOAF.name, Literal("Justitieombudsmannen", lang="sv"))
return resource
def postprocess_doc(self, doc):
def helper(node, meta):
for subnode in list(node):
if isinstance(subnode, Meta):
kwargs = {'lang': getattr(subnode, 'lang', None),
'datatype': getattr(subnode, 'datatype', None)}
for s in subnode:
# A meta node for pred rpubl:diarienummer
# might have two str nodes -- we must not skip
# one
#
# if doc.meta.value(URIRef(doc.uri), subnode.predicate):
# continue
# But if we find a dcterms:title, we throw the
# old one (probably gotten from the
# headnote.html) out, as it's probably lower
# quality
if subnode.predicate == DCTERMS.title:
oldtitle = doc.meta.value(URIRef(doc.uri), DCTERMS.title)
if oldtitle:
doc.meta.remove((URIRef(doc.uri), DCTERMS.title, oldtitle))
l = Literal(s, **kwargs)
meta.add((URIRef(doc.uri), subnode.predicate, l))
node.remove(subnode)
elif isinstance(subnode, list):
helper(subnode, meta)
helper(doc.body, doc.meta)
d = doc.meta.value(URIRef(doc.uri), RPUBL.avgorandedatum)
# only use the dcterms:issued value from the document if we
# don't already have one from the metadata
if d and not doc.meta.value(URIRef(doc.uri), DCTERMS.issued):
doc.meta.add((URIRef(doc.uri), DCTERMS.issued, d))
def tokenize(self, reader):
def gluecondition(textbox, nextbox, prevbox):
linespacing = nextbox.height / 1.5 # allow for large linespacing
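            # worked example (illustrative numbers): if nextbox is
            # 12 pt high, the allowed gap is 8 pt (12 / 1.5), so
            # nextbox is glued onto textbox when it starts no more than
            # 8 pt below textbox's bottom edge (top + height) and the
            # font sizes match; a larger gap, or a font-size change,
            # starts a new textbox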
return (textbox.font.size == nextbox.font.size and
textbox.top + textbox.height + linespacing >= nextbox.top)
return reader.textboxes(gluecondition)
def get_parser(self, basefile, sanitized, parseconfig="default"):
def is_heading(parser):
return parser.reader.peek().font.size == 17
def is_dnr(parser):
chunk = parser.reader.peek()
            if (chunk.font.size == 12 and
                    re.match(r'\d+-\d{2,4}', str(chunk))):
                return True
def is_datum(parser):
chunk = parser.reader.peek()
            if (chunk.font.size == 12 and
                    re.match(r'\d{4}-\d{2}-\d{2}', str(chunk))):
                return True
def is_nonessential(parser):
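            # textboxes in the top or bottom page margin (headers,
            # footers, page numbers); the thresholds are y coordinates
            # in PDF points, presumably tuned to JO's page layout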
chunk = parser.reader.peek()
if chunk.top >= 1159 or chunk.top <= 146:
return True
def is_abstract(parser):
if str(parser.reader.peek()).startswith("Beslutet i korthet:"):
return True
def is_section(parser):
chunk = parser.reader.peek()
strchunk = str(chunk)
if chunk.font.size == 14 and chunk[0].tag == "b" and not strchunk.endswith("."):
return True
def is_blockquote(parser):
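            # material indented past x=255 (in PDF points; is_normal
            # below uses the same threshold) is taken to be quoted
            # text, which JO decisions set off by indentation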
chunk = parser.reader.peek()
if chunk.left >= 255:
return True
def is_normal(parser):
chunk = parser.reader.peek()
if chunk.left < 255:
return True
def is_paragraph(parser):
return True
@decorators.newstate("body")
def make_body(parser):
return parser.make_children(Body())
def make_heading(parser):
# h = Heading(str(parser.reader.next()).strip())
h = Meta([str(parser.reader.next()).strip()],
predicate=DCTERMS.title,
lang="sv")
return h
@decorators.newstate("abstract")
def make_abstract(parser):
a = Abstract([Paragraph(parser.reader.next())])
return parser.make_children(a)
@decorators.newstate("section")
def make_section(parser):
s = UnorderedSection(title=str(parser.reader.next()).strip())
return parser.make_children(s)
@decorators.newstate("blockquote")
def make_blockquote(parser):
b = Blockquote()
return parser.make_children(b)
def make_paragraph(parser):
            # A Paragraph containing PDFReader.Textelement objects will
            # render these as <span> elements (the default rendering). A
            # PDFReader.Textbox object containing the same will render
            # unstyled Textelements as plain strings, cutting down on
            # unnecessary <span> elements. However, Textboxes themselves
            # render with unnecessary @style and @class attributes,
            # which we don't want. For now, let's stick with Paragraphs
            # as containers and maybe later figure out how to get
            # PDFReader.Textelements to render themselves sanely.
#
# p = parser.reader.next()
p = Paragraph(parser.reader.next())
return p
def make_datum(parser):
datestr = str(parser.reader.next()).strip()
year = int(datestr.split("-")[0])
if 2100 > year > 1970:
parser.remove_recognizer(is_datum)
d = [datestr]
return Meta(d, predicate=RPUBL.avgorandedatum,
datatype=XSD.date)
else:
self.log.warning("Year in %s doesn't look valid" % datestr)
return None
def make_dnr(parser):
parser.remove_recognizer(is_dnr)
        ds = str(parser.reader.next()).strip().split(" ")
return Meta(ds, predicate=RPUBL.diarienummer)
def skip_nonessential(parser):
parser.reader.next() # return nothing
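        # wire up the FSM: the recognizers classify the next chunk, and
        # the transition table maps (state, recognizer) to (constructor,
        # new state); a False constructor means the chunk doesn't belong
        # in the current state, which then returns to its parent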
p = FSMParser()
p.initial_state = "body"
p.initial_constructor = make_body
p.set_recognizers(is_datum,
is_dnr,
is_nonessential,
is_heading,
is_abstract,
is_section,
is_normal,
is_blockquote,
is_paragraph)
p.set_transitions({("body", is_heading): (make_heading, None),
("body", is_nonessential): (skip_nonessential, None),
("body", is_datum): (make_datum, None),
("body", is_dnr): (make_dnr, None),
("body", is_abstract): (make_abstract, "abstract"),
("body", is_section): (make_section, "section"),
("body", is_blockquote): (make_blockquote, "blockquote"),
("body", is_paragraph): (make_paragraph, None),
("abstract", is_paragraph): (make_paragraph, None),
("abstract", is_section): (False, None),
("abstract", is_dnr): (False, None),
("abstract", is_datum): (False, None),
("section", is_paragraph): (make_paragraph, None),
("section", is_nonessential): (skip_nonessential, None),
("section", is_section): (False, None),
("section", is_blockquote): (make_blockquote, "blockquote"),
("section", is_datum): (make_datum, None),
("section", is_dnr): (make_dnr, None),
("blockquote", is_blockquote): (make_paragraph, None),
("blockquote", is_nonessential): (skip_nonessential, None),
("blockquote", is_section): (False, None),
("blockquote", is_normal): (False, None),
("blockquote", is_datum): (make_datum, None),
("blockquote", is_dnr): (make_dnr, None),
})
p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
return p.parse
_default_creator = "Riksdagens ombudsmän"
def _relate_fulltext_value_rootlabel(self, desc):
return desc.getvalue(DCTERMS.identifier)
def tabs(self):
if self.config.tabs:
return [("JO", self.dataset_uri())]
else:
return []