# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from builtins import *
import re
import os
from datetime import datetime
import time
# 3rd party
from bs4 import BeautifulSoup
import requests
import requests.exceptions
from rdflib import Literal, URIRef
from rdflib.namespace import DCTERMS, FOAF
# My own stuff
from ferenda import util
from ferenda.errors import DownloadError
from ferenda.elements import Body
from ferenda.decorators import downloadmax, recordlastdownload
from . import FixedLayoutSource, FixedLayoutStore, RPUBL
class ARNStore(FixedLayoutStore):
"""Customized DocumentStore that handles multiple download suffixes
and transforms YYYY-NNN basefiles to YYYY/NNN pathfrags"""
def basefile_to_pathfrag(self, basefile):
return basefile.replace("-", "/")
def pathfrag_to_basefile(self, pathfrag):
return pathfrag.replace("/", "-", 1)
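    # A minimal illustration of the mapping described in the docstring
    # above (the concrete basefile is hypothetical):
    #
    #   basefile_to_pathfrag("1992-123")  ->  "1992/123"
    #   pathfrag_to_basefile("1992/123")  ->  "1992-123"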
class ARN(FixedLayoutSource):
"""Hanterar referat från Allmänna Reklamationsnämnden, www.arn.se.
Modulen hanterar hämtande av referat från ARNs webbplats, omvandlande
av dessa till XHTML1.1+RDFa, samt transformering till browserfärdig
HTML5.
"""
alias = "arn"
# xslt_template = "xsl/arn.xsl"
start_url = ("http://adokweb.arn.se/digiforms/sessionInitializer?"
"processName=SearchRefCasesProcess")
documentstore_class = ARNStore
rdf_type = RPUBL.VagledandeMyndighetsavgorande
storage_policy = "dir"
urispace_segment = "avg/arn"
xslt_template = "xsl/avg.xsl"
sparql_annotations = "sparql/avg-annotations.rq"
sparql_expect_results = False
def metadata_from_basefile(self, basefile):
attribs = super(ARN, self).metadata_from_basefile(basefile)
attribs["rpubl:diarienummer"] = basefile
attribs["dcterms:publisher"] = self.lookup_resource(
'Allmänna reklamationsnämnden')
return attribs
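    # For illustration, metadata_from_basefile("1992-123") (hypothetical
    # basefile) is expected to return the superclass attributes plus roughly:
    #
    #   {"rpubl:diarienummer": "1992-123",
    #    "dcterms:publisher": <resource for Allmänna reklamationsnämnden>,
    #    ...}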
@recordlastdownload
def download(self, basefile=None):
if basefile:
raise DownloadError("Downloading single basefiles is not supported")
self.session = requests.Session()
resp = self.session.get(self.start_url)
soup = BeautifulSoup(resp.text, "lxml")
action = soup.find("form")["action"]
if ('lastdownload' in self.config and
self.config.lastdownload and
not self.config.refresh):
d = self.config.lastdownload
datefrom = '%d-%02d-%02d' % (d.year, d.month, d.day)
dateto = '%d-01-01' % (d.year + 1)
else:
            # start from the beginning; the search window is advanced one
            # year at a time in download_get_basefiles
datefrom = '1992-01-01'
dateto = '1993-01-01'
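        # Example of the resulting window (assuming lastdownload is a date
        # such as 2014-03-01 and refresh is unset): datefrom == "2014-03-01"
        # and dateto == "2015-01-01". On a first run the window instead
        # starts at 1992, and download_get_basefiles then advances it one
        # year at a time.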
params = {
'/root/searchTemplate/decision': 'obegransad',
'/root/searchTemplate/decisionDateFrom': datefrom,
'/root/searchTemplate/decisionDateTo': dateto,
'/root/searchTemplate/department': 'alla',
'/root/searchTemplate/journalId': '',
'/root/searchTemplate/searchExpression': '',
'_cParam0': 'method=search',
'_cmdName': 'cmd_process_next',
'_validate': 'page'
}
for basefile, url, fragment in self.download_get_basefiles((action, params)):
if (self.config.refresh or
(not os.path.exists(self.store.downloaded_path(basefile)))):
self.download_single(basefile, url, fragment)
@downloadmax
def download_get_basefiles(self, args):
action, params = args
done = False
self.log.debug("Retrieving all results from %s to %s" %
(params['/root/searchTemplate/decisionDateFrom'],
params['/root/searchTemplate/decisionDateTo']))
paramcopy = dict(params)
while not done:
# First we need to use the files argument to send the POST
# request as multipart/form-data
req = requests.Request(
"POST", action, cookies=self.session.cookies, files=paramcopy).prepare()
            # Then we need to remove the filename parameters from req.body
            # (in an unsupported manner) in order not to upset the sensitive
            # server
body = req.body
if isinstance(body, bytes):
body = body.decode() # should be pure ascii
            req.body = re.sub(
                r'; filename="[\w\-/]+"', '', body).encode()
req.headers['Content-Length'] = str(len(req.body))
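            # For context: with the files argument, requests builds multipart
            # parts along the lines of
            #
            #   Content-Disposition: form-data; name="_cmdName"; filename="_cmdName"
            #
            # and the re.sub above strips the '; filename="..."' portion,
            # leaving only the name parameter.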
# And finally we have to allow RFC-violating redirects for POST
resp = False
remaining_attempts = 5
while (not resp) and (remaining_attempts > 0):
try:
resp = self.session.send(req, allow_redirects=True)
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
self.log.warning(
"Failed to POST %s: error %s (%s remaining attempts)" % (action, e, remaining_attempts))
remaining_attempts -= 1
time.sleep(1)
soup = BeautifulSoup(resp.text, "lxml")
for link in soup.find_all(
"input", "standardlink", onclick=re.compile("javascript:window.open")):
                # remove the 'javascript:window.open(' call around the url
                url = link['onclick'][24:-2]
                # this probably won't break...
                fragment = link.find_parent("table").find_parent("table")
                basefile = fragment.find_all("div", "strongstandardtext")[1].text
yield basefile, url, fragment
if soup.find("input", value="Nästa sida"):
self.log.debug("Now retrieving next page in current search")
paramcopy = {'_cParam0': "method=nextPage",
'_validate': "none",
'_cmdName': "cmd_process_next"}
else:
fromYear = int(params['/root/searchTemplate/decisionDateFrom'][:4])
if fromYear >= datetime.now().year:
done = True
else:
# advance one year
                    params['/root/searchTemplate/decisionDateFrom'] = "%s-01-01" % (fromYear + 1)
                    params['/root/searchTemplate/decisionDateTo'] = "%s-01-01" % (fromYear + 2)
self.log.debug("Now retrieving all results from %s to %s" %
(params['/root/searchTemplate/decisionDateFrom'],
params['/root/searchTemplate/decisionDateTo']))
paramcopy = dict(params)
# restart the search, so that poor digiforms
# doesn't get confused
resp = False
remaining_attempts = 5
while (not resp) and (remaining_attempts > 0):
try:
resp = self.session.get(self.start_url)
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
                            self.log.warning(
                                "Failed to GET %s: error %s (%s remaining attempts)" %
                                (self.start_url, e, remaining_attempts))
remaining_attempts -= 1
time.sleep(1)
soup = BeautifulSoup(resp.text, "lxml")
action = soup.find("form")["action"]
def download_name_file(self, tmpfile, basefile, assumedfile):
with open(tmpfile, "rb") as fp:
sig = fp.read(4)
        if sig == b'\xffWPC':
            doctype = ".wpd"   # WordPerfect
        elif sig == b'\xd0\xcf\x11\xe0':
            doctype = ".doc"   # OLE2 compound document (legacy MS Word)
        elif sig == b'PK\x03\x04':
            doctype = ".docx"  # ZIP container (OOXML)
        elif sig == b'{\\rt':
            doctype = ".rtf"   # RTF (files start with "{\rtf")
        elif sig == b'%PDF':
            doctype = ".pdf"
        else:
            self.log.warning(
                "%s has unknown signature %r -- don't know what kind of file it is" % (tmpfile, sig))
            doctype = ".pdf"  # fall back to treating it as PDF
return self.store.path(basefile, 'downloaded', doctype)
def download_single(self, basefile, url, fragment):
ret = super(ARN, self).download_single(basefile, url)
if ret:
# the HTML fragment from the search result page contains
# metadata not available in the main document, so save it
# as fragment.html
with self.store.open_downloaded(basefile, mode="wb",
attachment="fragment.html") as fp:
fp.write(str(fragment).encode("utf-8"))
return ret
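    # Note: the fragment.html attachment written by download_single() is what
    # extract_head() reads back during parsing, since the downloaded document
    # itself lacks most of this metadata.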
def remote_url(self, basefile):
# it's not possible to construct stable URLs to document
# resources. Thank you Digiforms.
return None
def extract_head(self, fp, basefile):
        # the fp contains the PDF file, but most of the metadata is in the
        # stored HTML fragment attachment, so we open that separately.
fragment = self.store.downloaded_path(basefile, attachment="fragment.html")
return BeautifulSoup(util.readfile(fragment, encoding="utf-8"), "lxml")
def extract_metadata(self, soup, basefile):
d = self.metadata_from_basefile(basefile)
def nextcell(key):
cell = soup.find(text=key)
if cell:
return cell.find_parent("td").find_next_sibling("td").get_text().strip()
else:
raise KeyError("Could not find cell key %s" % key)
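        # nextcell assumes that the saved fragment contains a small table
        # where a cell labelled "Änr", "Avgörande" or "Avdelning" is followed
        # by a sibling cell holding the value, roughly (values hypothetical):
        #
        #   <tr><td>Änr</td><td>1992-123</td></tr>
        #   <tr><td>Avgörande</td><td>1993-05-03</td></tr>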
d.update({'dcterms:identifier': self.infer_identifier(basefile),
'rpubl:arendenummer': nextcell("Änr"),
'rpubl:diarienummer': nextcell("Änr"),
'rpubl:avgorandedatum': nextcell("Avgörande"),
'dcterms:subject': nextcell("Avdelning"),
'dcterms:title': soup.table.find_all("tr")[3].get_text(),
'dcterms:issued': nextcell("Avgörande")
})
assert d["rpubl:diarienummer"] == basefile, "Doc metadata differs from basefile"
return d
def sanitize_metadata(self, attribs, basefile):
# remove trailing "Avgörande 1993-05-03; 92-2571"
if attribs['dcterms:title'].strip():
attribs['dcterms:title'] = Literal(
re.sub("Avgörande \d+-\d+-\d+; \d+-\d+\.?",
"", util.normalize_space(attribs['dcterms:title'])),
lang="sv")
        else:
            # no real content -- delete it and fill the value with stuff
            # from the document later (see postprocess_doc)
            del attribs['dcterms:title']
return attribs
def polish_metadata(self, attribs, infer_nodes=True):
resource = super(ARN, self).polish_metadata(attribs, infer_nodes)
# add a known foaf:name for the publisher to our polished graph
resource.value(DCTERMS.publisher).add(FOAF.name, Literal("Allmänna reklamationsnämnden", lang="sv"))
return resource
def infer_identifier(self, basefile):
return "ARN %s" % basefile
def get_parser(self, basefile, sanitized, parseconfig="default"):
return lambda stream: Body(list(stream))
def tokenize(self, reader):
        def gluecondition(textbox, nextbox, prevbox):
            # Glue two textboxes into the same paragraph if they use the same
            # font (family and size), the next box starts within ~7 units of
            # the bottom of the current box, and the boxes appear in
            # top-to-bottom order.
            linespacing = 7
            res = (textbox.font.family == nextbox.font.family and
                   textbox.font.size == nextbox.font.size and
                   textbox.top + textbox.height + linespacing >= nextbox.top and
                   nextbox.top > prevbox.top)
            return res
return reader.textboxes(gluecondition)
def postprocess_doc(self, doc):
for box in doc.body:
del box.top
del box.left
del box.width
del box.height
del box.fontid
if not doc.meta.value(URIRef(doc.uri), DCTERMS.title):
# The title of the document wasn't in the HTML
# fragment. Use first line of the document instead (and
# scrub trailing "Avgörande 1993-05-03; 92-2571" the
# normal way.
t = {'dcterms:title': str(doc.body[0])}
t = self.sanitize_metadata(t, doc.basefile)
doc.meta.add((URIRef(doc.uri), DCTERMS.title, t['dcterms:title']))
def create_external_resources(self, doc):
pass
_default_creator = "Allmänna reklamationsnämnden"
def _relate_fulltext_value_rootlabel(self, desc):
return desc.getvalue(DCTERMS.identifier)
def tabs(self):
if self.config.tabs:
return [("ARN", self.dataset_uri())]
else:
return []