# Source code for (module name lost in HTML extraction)

# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

import re
import os
from datetime import datetime
import time

# 3rd party
from bs4 import BeautifulSoup
import requests
import requests.exceptions
from rdflib import Literal, URIRef
from rdflib.namespace import DCTERMS, FOAF

# My own stuff
from ferenda import util
from ferenda.errors import DownloadError
from ferenda.elements import Body
from ferenda.decorators import downloadmax, recordlastdownload
from . import FixedLayoutSource, FixedLayoutStore, RPUBL

class ARNStore(FixedLayoutStore):

    """Customized DocumentStore that handles multiple download suffixes
    and translates YYYY-NNN basefiles to YYYY/NNN path fragments."""

    def basefile_to_pathfrag(self, basefile):
        # "1992-1234" -> "1992/1234" (every dash becomes a slash)
        return "/".join(basefile.split("-"))

    def pathfrag_to_basefile(self, pathfrag):
        # "1992/1234" -> "1992-1234" (only the first slash is mapped back)
        return "-".join(pathfrag.split("/", 1))

[docs]class ARN(FixedLayoutSource): """Hanterar referat från Allmänna Reklamationsnämnden, Modulen hanterar hämtande av referat från ARNs webbplats, omvandlande av dessa till XHTML1.1+RDFa, samt transformering till browserfärdig HTML5. """ alias = "arn" # xslt_template = "xsl/arn.xsl" start_url = ("" "processName=SearchRefCasesProcess") documentstore_class = ARNStore rdf_type = RPUBL.VagledandeMyndighetsavgorande storage_policy = "dir" urispace_segment = "avg/arn" xslt_template = "xsl/avg.xsl" sparql_annotations = "sparql/avg-annotations.rq" sparql_expect_results = False def metadata_from_basefile(self, basefile): attribs = super(ARN, self).metadata_from_basefile(basefile) attribs["rpubl:diarienummer"] = basefile attribs["dcterms:publisher"] = self.lookup_resource( 'Allmänna reklamationsnämnden') return attribs @recordlastdownload def download(self, basefile=None): if basefile: raise DownloadError("Downloading single basefiles is not supported") self.session = requests.Session() resp = self.session.get(self.start_url) soup = BeautifulSoup(resp.text, "lxml") action = soup.find("form")["action"] if ('lastdownload' in self.config and self.config.lastdownload and not self.config.refresh): d = self.config.lastdownload datefrom = '%d-%02d-%02d' % (d.year, d.month, dateto = '%d-01-01' % (d.year + 1) else: # only fetch one year at a time datefrom = '1992-01-01' dateto = '1993-01-01' params = { '/root/searchTemplate/decision': 'obegransad', '/root/searchTemplate/decisionDateFrom': datefrom, '/root/searchTemplate/decisionDateTo': dateto, '/root/searchTemplate/department': 'alla', '/root/searchTemplate/journalId': '', '/root/searchTemplate/searchExpression': '', '_cParam0': 'method=search', '_cmdName': 'cmd_process_next', '_validate': 'page' } for basefile, url, fragment in self.download_get_basefiles((action, params)): if (self.config.refresh or (not os.path.exists( self.download_single(basefile, url, fragment) @downloadmax def download_get_basefiles(self, args): action, 
params = args done = False self.log.debug("Retrieving all results from %s to %s" % (params['/root/searchTemplate/decisionDateFrom'], params['/root/searchTemplate/decisionDateTo'])) paramcopy = dict(params) while not done: # First we need to use the files argument to send the POST # request as multipart/form-data req = requests.Request( "POST", action, cookies=self.session.cookies, files=paramcopy).prepare() # Then we need to remove filename # from req.body in an unsupported manner in order not to # upset the sensitive server body = req.body if isinstance(body, bytes): body = body.decode() # should be pure ascii req.body = re.sub( '; filename="[\w\-\/]+"', '', body).encode() req.headers['Content-Length'] = str(len(req.body)) # And finally we have to allow RFC-violating redirects for POST resp = False remaining_attempts = 5 while (not resp) and (remaining_attempts > 0): try: resp = self.session.send(req, allow_redirects=True) except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e: self.log.warning( "Failed to POST %s: error %s (%s remaining attempts)" % (action, e, remaining_attempts)) remaining_attempts -= 1 time.sleep(1) soup = BeautifulSoup(resp.text, "lxml") for link in soup.find_all( "input", "standardlink", onclick=re.compile("")): url = link['onclick'][ 24:- 2] # remove '' call around the url # this probably wont break... 
fragment = link.find_parent("table").find_parent("table") basefile = fragment.find_all("div", "strongstandardtext")[1].text yield basefile, url, fragment if soup.find("input", value="Nästa sida"): self.log.debug("Now retrieving next page in current search") paramcopy = {'_cParam0': "method=nextPage", '_validate': "none", '_cmdName': "cmd_process_next"} else: fromYear = int(params['/root/searchTemplate/decisionDateFrom'][:4]) if fromYear >= done = True else: # advance one year params[ '/root/searchTemplate/decisionDateFrom'] = "%s-01-01" % str(fromYear + 1) params[ '/root/searchTemplate/decisionDateTo'] = "%s-01-01" % str(fromYear + 2) self.log.debug("Now retrieving all results from %s to %s" % (params['/root/searchTemplate/decisionDateFrom'], params['/root/searchTemplate/decisionDateTo'])) paramcopy = dict(params) # restart the search, so that poor digiforms # doesn't get confused resp = False remaining_attempts = 5 while (not resp) and (remaining_attempts > 0): try: resp = self.session.get(self.start_url) except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e: self.log.warning( "Failed to POST %s: error %s (%s remaining attempts)" % (action, e, remaining_attempts)) remaining_attempts -= 1 time.sleep(1) soup = BeautifulSoup(resp.text, "lxml") action = soup.find("form")["action"] def download_name_file(self, tmpfile, basefile, assumedfile): with open(tmpfile, "rb") as fp: sig = if sig == b'\xffWPC': doctype = ".wpd" elif sig == b'\xd0\xcf\x11\xe0': doctype = ".doc" elif sig == b'PK\x03\x04': doctype = ".docx" elif sig == b'{\\rt': doctype = ".rtf" elif sig == b'%PDF': doctype = ".pdf" else: self.log.warning( "%s has unknown signature %r -- don't know what kind of file it is" % (filename, sig)) doctype = ".pdf" # don't do anything return, 'downloaded', doctype) def download_single(self, basefile, url, fragment): ret = super(ARN, self).download_single(basefile, url) if ret: # the HTML fragment from the search result page contains # metadata not 
available in the main document, so save it # as fragment.html with, mode="wb", attachment="fragment.html") as fp: fp.write(str(fragment).encode("utf-8")) return ret def remote_url(self, basefile): # it's not possible to construct stable URLs to document # resources. Thank you Digiforms. return None def extract_head(self, fp, basefile): # the fp contains the PDF file, but most of the metadata is in # stored HTML fragment attachment. So we open that separately. fragment =, attachment="fragment.html") return BeautifulSoup(util.readfile(fragment, encoding="utf-8"), "lxml") def extract_metadata(self, soup, basefile): d = self.metadata_from_basefile(basefile) def nextcell(key): cell = soup.find(text=key) if cell: return cell.find_parent("td").find_next_sibling("td").get_text().strip() else: raise KeyError("Could not find cell key %s" % key) d.update({'dcterms:identifier': self.infer_identifier(basefile), 'rpubl:arendenummer': nextcell("Änr"), 'rpubl:diarienummer': nextcell("Änr"), 'rpubl:avgorandedatum': nextcell("Avgörande"), 'dcterms:subject': nextcell("Avdelning"), 'dcterms:title': soup.table.find_all("tr")[3].get_text(), 'dcterms:issued': nextcell("Avgörande") }) assert d["rpubl:diarienummer"] == basefile, "Doc metadata differs from basefile" return d def sanitize_metadata(self, attribs, basefile): # remove trailing "Avgörande 1993-05-03; 92-2571" if attribs['dcterms:title'].strip(): attribs['dcterms:title'] = Literal( re.sub("Avgörande \d+-\d+-\d+; \d+-\d+\.?", "", util.normalize_space(attribs['dcterms:title'])), lang="sv") else: del attribs['dcterms:title'] # no real content -- delete # it and fill the value with # stuff from the document # later. 
return attribs def polish_metadata(self, attribs, infer_nodes=True): resource = super(ARN, self).polish_metadata(attribs, infer_nodes) # add a known foaf:name for the publisher to our polished graph resource.value(DCTERMS.publisher).add(, Literal("Allmänna reklamationsnämnden", lang="sv")) return resource def infer_identifier(self, basefile): return "ARN %s" % basefile def get_parser(self, basefile, sanitized, parseconfig="default"): return lambda stream: Body(list(stream)) def tokenize(self, reader): def gluecondition(textbox, nextbox, prevbox): linespacing = 7 res = ( == and textbox.font.size == nextbox.font.size and + textbox.height + linespacing >= and > return res return reader.textboxes(gluecondition) def postprocess_doc(self, doc): for box in doc.body: del del box.left del box.width del box.height del box.fontid if not doc.meta.value(URIRef(doc.uri), DCTERMS.title): # The title of the document wasn't in the HTML # fragment. Use first line of the document instead (and # scrub trailing "Avgörande 1993-05-03; 92-2571" the # normal way. t = {'dcterms:title': str(doc.body[0])} t = self.sanitize_metadata(t, doc.basefile) doc.meta.add((URIRef(doc.uri), DCTERMS.title, t['dcterms:title'])) def create_external_resources(self, doc): pass _default_creator = "Allmänna reklamationsnämnden" def _relate_fulltext_value_rootlabel(self, desc): return desc.getvalue(DCTERMS.identifier) def tabs(self): if self.config.tabs: return [("ARN", self.dataset_uri())] else: return []