Source code for ferenda.sources.legal.se.jk

# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

import re
import os
from datetime import datetime, timedelta
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from rdflib import Literal
from rdflib.namespace import SKOS, DCTERMS, FOAF

from . import SwedishLegalSource, SwedishLegalStore, RPUBL
from .elements import *
from .swedishlegalsource import AnonStycke
from ferenda import FSMParser, DocumentEntry
from ferenda import util, errors
from ferenda.decorators import downloadmax, recordlastdownload, newstate
from ferenda.elements import Body


class JKStore(SwedishLegalStore):
    def basefile_to_pathfrag(self, basefile):
        # store data using years as top-level dir by extracting the
        # year from the middle of the diarienummer:
        # "3541-97-21" => "1997/3541-97-21"
        # "3497-06-40" => "2006/3497-06-40"
        if "-" not in basefile:
            return super(JKStore, self).basefile_to_pathfrag(basefile)
        no, year, dtype = basefile.split("-")
        if int(year) > 50:  # arbitrary cutoff
            year = "19" + year
        else:
            year = "20" + year
        return "%s/%s" % (year, basefile)

    def pathfrag_to_basefile(self, pathfrag):
        # "1997/3541-97-21" => "3541-97-21"
        # "2006/3497-06-40" => "3497-06-40"
        year, basefile = pathfrag.split(os.sep)
        return basefile
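
    # A minimal round-trip sketch of the two methods above (the
    # constructor argument is an assumption; see ferenda.DocumentStore
    # for the actual signature):
    #
    #     store = JKStore("data/jk/downloaded")
    #     store.basefile_to_pathfrag("3541-97-21")       # "1997/3541-97-21"
    #     store.pathfrag_to_basefile("2006/3497-06-40")  # "3497-06-40"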


class JK(SwedishLegalSource):
    alias = "jk"
    start_url = "http://www.jk.se/beslut-och-yttranden/"
    document_url_regex = "http://www.jk.se/Beslut/(?P<kategori>[\w\-]+)/(?P<basefile>\d+\-\d+\-\d+).aspx"
    rdf_type = RPUBL.VagledandeMyndighetsavgorande
    documentstore_class = JKStore
    urispace_segment = "avg/jk"
    download_iterlinks = False
    xslt_template = "xsl/avg.xsl"
    sparql_annotations = "sparql/avg-annotations.rq"
    sparql_expect_results = False

    @recordlastdownload
    def download(self, basefile=None, reporter=None):
        if basefile:
            resp = self.session.post(self.start_url,
                                     data={'diarienummer': basefile})
            soup = BeautifulSoup(resp.text, "lxml")
            link = soup.find("div", "ruling-results").find(
                "a", href=re.compile("/beslut-och-yttranden/"))
            if not link:
                raise errors.DownloadFileNotFoundError(basefile)
            url = urljoin(self.start_url, link["href"])
            return self.download_single(basefile, url)
        else:
            return super(JK, self).download(basefile, reporter)

    def download_get_first_page(self):
        # this'll yield a single page with every decision ever. This
        # is inefficient, but their webdevs have broken pagination,
        # so...
        data = {'page': '9999'}
        self.log.debug("Starting at %s" % self.start_url)
        resp = requests.post(self.start_url, data=data)
        return resp

    @downloadmax
    def download_get_basefiles(self, source):
        document_url_regex = re.compile("/(?P<basefile>\d+\-\d+\-\d+)/$")
        soup = BeautifulSoup(source, "lxml")
        for link in soup.find_all("a", href=document_url_regex):
            basefile = document_url_regex.search(link["href"]).group("basefile")
            yield basefile, urljoin(self.start_url, link["href"])
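
    # A rough illustration of the basefile extraction performed above,
    # using a made-up href of the kind the listing page is assumed to
    # contain:
    #
    #     >>> m = re.search(r"/(?P<basefile>\d+\-\d+\-\d+)/$",
    #     ...               "/beslut-och-yttranden/4008-16-31/")
    #     >>> m.group("basefile")
    #     '4008-16-31'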

    def adjust_basefile(self, doc, orig_uri):
        pass  # See comments in swedishlegalsource.py

    def source_url(self, basefile):
        # this source does not have any predictable URLs, so we check
        # whether we made a note of the URL when we ran download()
        entry = DocumentEntry(self.store.documententry_path(basefile))
        return entry.orig_url.replace(" ", "%20")

    def metadata_from_basefile(self, basefile):
        attribs = super(JK, self).metadata_from_basefile(basefile)
        attribs["rpubl:diarienummer"] = basefile
        attribs["dcterms:publisher"] = self.lookup_resource(
            'JK', SKOS.altLabel)
        return attribs

    def extract_head(self, fp, basefile):
        return BeautifulSoup(fp.read(), "lxml")

    def infer_identifier(self, basefile):
        return "JK %s" % basefile

    def extract_metadata(self, soup, basefile):
        # for the following decisions, we only have downloaded
        # versions that use an older HTML template. These decisions
        # are for some reason no longer available at www.jk.se:
        #
        # 1075-97-40, 1263-97-21, 2600-97-21, 3541-97-21, 396-98-22,
        # 402-98-30, 358-01-40, 3272-01-65, 930-03-21, 2226-03-40,
        # 3059-03-40, 3775-04-35, 4839-06-40, 5458-07-41, 6372-07-31,
        # 3131-11-31, 3261-13-40 and 2356-15-28
        #
        # So we have an alternate scraping strategy for these.
        content = soup.find("div", "content")
        if content:
            title = content.find("h2").get_text()
            metadata = content.find("div", "date").get_text()
            # eg "Diarienr: 4008-16-31 / Beslutsdatum: 12 jul 2016"
            diarienummer = metadata.split()[1]
            beslutsdatum = metadata.rsplit(": ", 1)[1]
        else:
            # old HTML template
            title = soup.find("h1", "besluttitle").get_text()
            beslutsdatum = soup.find(
                "span", class_="label",
                text="Beslutsdatum").find_next_sibling("span").get_text()
            diarienummer = soup.find(
                "span", class_="label",
                text="Diarienummer").find_next_sibling("span").get_text()
        # in some rare cases the diarienummer is given like <span
        # class="data">3391-14-30 3696-14-30</span>, in which case we
        # choose the last one. But sometimes it's given as
        # "4243-13-40 m.fl.", in which case we choose the first one.
        if " " in diarienummer:
            idx = 0 if "m.fl." in diarienummer else -1
            diarienummer = diarienummer.split(" ")[idx]
        if diarienummer != basefile:
            self.log.warning("%s: Different diarienummer %s found in "
                             "document" % (basefile, diarienummer))
        a = self.metadata_from_basefile(basefile)
        a.update({"dcterms:title": title,
                  "dcterms:publisher": self.lookup_resource("JK", SKOS.altLabel),
                  "rpubl:beslutsdatum": beslutsdatum,
                  "dcterms:issued": beslutsdatum,
                  "rpubl:diarienummer": diarienummer,
                  "dcterms:identifier": self.infer_identifier(diarienummer)})
        return a

    def polish_metadata(self, attribs, infer_nodes=True):
        resource = super(JK, self).polish_metadata(attribs, infer_nodes)
        # add a known foaf:name for the publisher to our polished graph
        resource.value(DCTERMS.publisher).add(
            FOAF.name, Literal("Justitiekanslern", lang="sv"))
        return resource

    def extract_body(self, fp, basefile):
        # NB: extract_head already did this (so fp will have been
        # read to the end -- we need to seek(0))
        fp.seek(0)
        soup = BeautifulSoup(fp.read(), "lxml")
        main = soup.find("div", "content")
        if main:
            main.find("div", "actions").extract()
            main.find("div", "date").extract()
            main.find("h2").extract()
        else:
            # old HTML template -- see comment in extract_metadata
            main = soup.find("div", id="mainContent")
            main.find("div", id="breadcrumbcontainer").extract()
            main.find("h1", "besluttitle").extract()
            main.find("div", "beslutmetadatacontainer").decompose()
        return main

    def tokenize(self, main):
        # list all tags (x.name) that aren't empty (x.get_text().strip())
        return main.find_all(lambda x: x.name and x.get_text().strip())

    def get_parser(self, basefile, sanitized_body, parseconfig="default"):
        # a typical decision structure:
        #
        # [h1] Justitiekanslerns beslut
        #      ... text ...
        # [h2] Ärendet (h3)
        # [h3] Bakgrund (p/em)
        #      ... text ...
        # [h3] Anspråket
        #      ... text ...
        # [h3 class="reglering"] Rättslig reglering m.m. (p/strong)
        # [h2] Justitiekanslerns bedömning
        # [h3] Skadestånd
        # [h3] Tillsyn
        def is_section(parser):
            return parser.reader.peek().name == "h3"

        def is_subsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and list(chunk.children)[0].name == "em"

        def is_special_subsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and list(chunk.children)[0].name == "strong"

        def is_subsubsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and list(chunk.children)[0].name == "u"

        def is_paragraph(parser):
            return True

        @newstate('body')
        def make_body(parser):
            return parser.make_children(Body())

        @newstate('section')
        def make_section(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        @newstate('subsection')
        def make_subsection(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        @newstate('special_subsection')
        def make_special_subsection(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        @newstate('subsubsection')
        def make_subsubsection(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        def make_paragraph(parser):
            # FIXME: this strips out formatting tags. NB: Now this is
            # an SFS stycke that has fragment_label, id/uri and other
            # crap. Let's see if it still works!
            return AnonStycke([parser.reader.next().get_text()])

        p = FSMParser()
        p.set_recognizers(is_section,
                          is_subsection,
                          is_subsubsection,
                          is_paragraph)
        p.set_transitions({
            ("body", is_section): (make_section, "section"),
            ("section", is_section): (False, None),
            ("section", is_subsection): (make_subsection, "subsection"),
            ("section", is_special_subsection): (make_special_subsection,
                                                 "special_subsection"),
            ("subsection", is_section): (False, None),
            ("subsection", is_subsection): (False, None),
            ("subsection", is_special_subsection): (False, None),
            ("subsection", is_subsubsection): (make_subsection, "subsubsection"),
            ("special_subsection", is_section): (False, None),
            ("special_subsection", is_subsection): (False, None),
            ("special_subsection", is_subsubsection): (make_subsubsection,
                                                       "subsubsection"),
            ("subsubsection", is_section): (False, None),
            ("subsubsection", is_special_subsection): (False, None),
            ("subsubsection", is_subsection): (False, None),
            ("subsubsection", is_subsubsection): (False, None),
            (("body", "section", "subsection", "subsubsection"),
             is_paragraph): (make_paragraph, None)
        })
        p.initial_state = "body"
        p.initial_constructor = make_body
        p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
        return p.parse
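
    # A rough sketch of the element tree the parser returned above
    # builds for the typical structure outlined at the top of
    # get_parser (simplified, not verified output; AnonSektion and
    # AnonStycke come from the element imports at the top of the
    # module):
    #
    #     Body([
    #         AnonStycke(["... text ..."]),            # plain tags become paragraphs
    #         AnonSektion([                            # each h3 opens a section
    #             AnonStycke(["... text ..."]),
    #             AnonSektion([...], title="..."),     # a leading p/em opens a subsection
    #         ], title="Anspråket"),
    #         ...
    #     ])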

    _default_creator = "Justitiekanslern"

    def _relate_fulltext_value_rootlabel(self, desc):
        return desc.getvalue(DCTERMS.identifier)

    def tabs(self):
        if self.config.tabs:
            return [("JK", self.dataset_uri())]
        else:
            return []
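

# This repository is normally driven through ferenda-build.py in a
# ferenda project directory, roughly along these lines (project setup
# and the exact class path are assumptions, not verified here):
#
#     ./ferenda-build.py ferenda.sources.legal.se.jk.JK enable
#     ./ferenda-build.py jk download
#     ./ferenda-build.py jk parse --all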