Source code for ferenda.sources.legal.se.sou

# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *


import re
import os
import logging
import unicodedata
from datetime import datetime
from urllib.parse import urljoin

import requests.exceptions
from rdflib import URIRef, Literal, Graph, Namespace
from rdflib.namespace import SKOS, DC, RDF, XSD, DCTERMS
BIBO = Namespace("http://purl.org/ontology/bibo/")
from bs4 import BeautifulSoup
import lxml.html
from cached_property import cached_property

from ferenda import (PDFAnalyzer, CompositeRepository, DocumentEntry,
                     PDFDocumentRepository, CompositeStore, Facet, DocumentStore)
from ferenda import util, decorators, errors
from ferenda.pdfreader import StreamingPDFReader
from . import Regeringen, SwedishLegalSource, FixedLayoutSource, SwedishLegalStore, Offtryck, RPUBL
from .swedishlegalsource import lazyread

def sou_sanitize_identifier(identifier):
    if not identifier:
        return identifier  # allow infer_identifier to do its magic later
    if not re.match(r"SOU (19|20)\d{2}:[1-9]\d{0,2}$", identifier):
        raise ValueError("Irregular identifier %s (after mangling)" % identifier)
    return Literal(identifier)

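# Rough illustration (not part of the original module) of what the
# sanitizer accepts: a well-formed identifier comes back as an RDF
# Literal, an empty value is passed through, and anything else raises.
#
#   sou_sanitize_identifier("SOU 2016:12")  # -> Literal("SOU 2016:12")
#   sou_sanitize_identifier("")             # -> "" (left to infer_identifier)
#   sou_sanitize_identifier("SOU 16:12")    # raises ValueError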
class SOUAnalyzer(PDFAnalyzer):
    # SOU running headers can contain quite a bit of text, 3% (60 chars for avg page of 2000)
    header_significance_threshold = 0.03
    # footers less so (but more than the default 0.2%): 1%
    footer_significance_threshold = 0.01

    # h1/h2 styles can be a bit rare though, particularly in older
    # material which only uses a different size for h1:s (0.07% is
    # enough)
    style_significance_threshold = 0.0007

    gluefunc = None

    def guess_pagenumber(self, page, probable_pagenumber=1):
        if self.scanned_source:
            # KB scans have predictable page numbering -- the first
            # three pdf pages are fake cover, real cover and inlay --
            # the logical page 1 starts at physical page 4. Assume no
            # breaks in pagination, since the actual pagenumber is
            # rarely (if ever) included in the actual OCR information
            # (!)
            if probable_pagenumber == 4 and not hasattr(self, 'paginate_cover_accounted'):
                self.paginate_cover_accounted = True
                return 1
            else:
                return None
        else:
            return super(SOUAnalyzer, self).guess_pagenumber(page, probable_pagenumber)
            

    @cached_property
    def documents(self):
        def titleish(pageidx):
            # return the largest text element found on the page (first
            # one in case of a tie) -- that's probably the title on
            # the page
            iterator = self.pdf.textboxes(self.gluefunc, startpage=pageidx, pagecount=1) if self.gluefunc else self.pdf[pageidx]
            candidate = None
            for te in iterator:
                if candidate is None or (str(te)[0].isupper() and te.font.size > candidate.font.size):
                    candidate = te
            return candidate
        documents = []
        currentdoc = 'frontmatter'
        prev_pagesrc = None
        pageidx_offset = 0
        for pageidx, page in enumerate(self.pdf):
            # FIXME: Generalize this way of detecting a multi-volume
            # document (as opposed to a single document split into
            # multiple PDF files).
            if page.src != prev_pagesrc and 'del-2' in page.src:
                if currentdoc == 'endregister' and len(page.as_plaintext()) < 1000:
                    # this is probably a single document split into two
                    currentdoc = 'main' # maybe 
                else:
                    # this is probably a multi-volume document 
                    currentdoc = 'frontmatter'
                pageidx_offset = pageidx
            # Sanity check: 
            if pageidx - pageidx_offset > 8 and currentdoc == 'frontmatter':
                logging.getLogger("pdfanalyze").warning("missed the transition from frontmatter to main")
                # act as if there never was any frontmatter -- all pages
                # are considered part of the main content.
                currentdoc = "main"
                documents[0][-1] = "main"
            pgtitle = titleish(pageidx)
            if pgtitle is not None:
                pgtitle = str(pgtitle).strip()
                if re.match("(Till [sS]|S)tatsrådet ", pgtitle):
                    currentdoc = "main"
                elif pgtitle in ("Innehåll", "Innehållsförteckning", "Innehåll del 2"):
                    currentdoc = "main"
                elif re.match(r"Statens offentliga utredningar \d+", pgtitle):
                    currentdoc = 'endregister'
            styles = self.count_styles(pageidx, 1)
            # find the most dominant style on the page. If it uses the
            # EU font (even if it's the second most dominant), it's a
            # separate section.
            if styles and [s for s in styles.most_common(2) if s[0][0].startswith("EUAlbertina")]:
                currentdoc = 'eudok'
            elif currentdoc == "eudok":
                currentdoc = "main"
            
            # update the current document segment tuple or start a new one
            if documents and documents[-1][2] == currentdoc:
                documents[-1][1] += 1
            else:
                documents.append([pageidx, 1, currentdoc])
            prev_pagesrc = page.src
        return documents

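# Shape of the value returned by the documents property above, with
# made-up numbers: a list of [first_pageidx, pagecount, label] segments,
# e.g. [[0, 4, 'frontmatter'], [4, 230, 'main'], [234, 10, 'endregister']]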

class SOURegeringen(Regeringen):
    alias = "souregeringen"
    re_basefile_strict = re.compile(r'SOU (\d{4}:\d+)')
    re_basefile_lax = re.compile(r'(?:SOU|) ?(\d{4}:\d+)', re.IGNORECASE)
    re_urlbasefile_strict = re.compile(r"statens-offentliga-utredningar/\d+/\d+/[a-z]*\.?-?(\d{4})(\d+)-?/$")
    re_urlbasefile_lax = re.compile(r"statens-offentliga-utredningar/\d+/\d+/.*?(\d{4})_?(\d+)")
    rdf_type = RPUBL.Utredningsbetankande
    document_type = Regeringen.SOU

    def canonical_uri(self, basefile):
        year, ordinal = basefile.split(":")
        attrib = {'rpubl:arsutgava': year,
                  'rpubl:lopnummer': ordinal,
                  'rpubl:utrSerie': self.lookup_resource("SOU", SKOS.altLabel),
                  'rdf:type': self.rdf_type}
        resource = self.attributes_to_resource(attrib)
        return self.minter.space.coin_uri(resource)

    def sanitize_identifier(self, identifier):
        return sou_sanitize_identifier(identifier)

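# Illustrative sketch (not part of the original module) of what the URL
# regexes above are meant to capture; the path below is hypothetical:
#
#   m = SOURegeringen.re_urlbasefile_strict.search(
#       "statens-offentliga-utredningar/2016/06/sou.-201612-/")
#   m.groups()  # -> ('2016', '12'), i.e. year and ordinal
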
class SOUKBStore(SwedishLegalStore):
    downloaded_suffixes = [".pdf", ".rdf"]

class SOUKB(Offtryck, PDFDocumentRepository):
    alias = "soukb"
    storage_policy = "dir"
    downloaded_suffix = ".pdf"
    basefile_regex = r"(?P<basefile>\d{4}:\d+)"
    start_url = "http://regina.kb.se/sou/"
    download_reverseorder = True
    rdf_type = RPUBL.Utredningsbetankande
    urispace_segment = "sou"
    # A bit nonsensical, but required for SwedishLegalSource.get_parser
    document_type = SOU = True
    PROPOSITION = DS = KOMMITTEDIREKTIV = False
    documentstore_class = SOUKBStore

    @classmethod
    def get_default_options(cls):
        opts = super(SOUKB, cls).get_default_options()
        opts['ocr'] = True
        return opts

    def download(self, basefile=None):
        if basefile:
            resp = self.session.get(self.start_url)
            tree = lxml.html.document_fromstring(resp.text)
            tree.make_links_absolute(self.start_url, resolve_base_href=True)
            source = tree.iterlinks()
            # 1. look through download_get_basefiles for basefile
            for (b, url) in self.download_get_basefiles(source):
                if b == basefile:
                    return self.download_single(basefile, url)
            else:
                self.log.error("%s: Couldn't find requested basefile" % basefile)
        else:
            return super(SOUKB, self).download()

    @decorators.downloadmax
    def download_get_basefiles(self, source):
        # modified copy of DocumentRepository.download_get_basefiles
        # that also yields the link title, based on the assumption
        # that this is valuable to download_single.
        yielded = set()
        if self.download_reverseorder:
            source = reversed(list(source))
        for (element, attribute, link, pos) in source:
            # Also make sure the link is not external (SOU 1997:119
            # links to the external site regeringen.se for some reason...)
            if "kb.se/" not in link:
                continue
            basefile = None
            # Two-step process: first examine the link text to see if
            # basefile_regex matches. If not, examine the link URL to
            # see if document_url_regex matches.
            if (self.basefile_regex and
                    element.text and
                    re.search(self.basefile_regex, element.text)):
                m = re.search(self.basefile_regex, element.text)
                basefile = m.group("basefile")
            if basefile and (basefile, link) not in yielded:
                yielded.add((basefile, link))
                yield (basefile, (link, element.tail.strip()))

    def download_single(self, basefile, url):
        if self.get_parse_options(basefile) == "skip":
            raise errors.DocumentSkippedError(
                "%s should not be downloaded according to options.py" % basefile)
        rdffilename = self.store.downloaded_path(basefile, attachment="index.rdf")
        if (self.get_parse_options(basefile) == "metadataonly" and
                os.path.exists(rdffilename) and
                not self.config.refresh):
            # it is kind of bad that we can even get here in these
            # cases (if an rdffile exists, and an empty index.pdf
            # exists, shouldn't download() skip that file?). Right now
            # it ignores empty files and passes them to
            # download_single.
            return False
        # url is really a 2-tuple
        url, title = url
        resp = self.session.get(url)
        soup = BeautifulSoup(resp.text, "lxml")
        pdflink = soup.find("a", href=re.compile(r".*\.pdf$"))
        pdfurl = pdflink.get("href")
        thumburl = urljoin(url, soup.find("img", "tumnagel").get("src"))
        librisid = url.rsplit("-")[1]
        rdfurl = "http://data.libris.kb.se/open/bib/%s.rdf" % librisid
        filename = self.store.downloaded_path(basefile)
        created = not os.path.exists(filename)
        updated = False
        # download rdf metadata before actual content
        try:
            # it appears that URLs like
            # http://data.libris.kb.se/open/bib/8351225.rdf now return
            # empty responses. Until we find out the proper RDF
            # endpoint URLs, we should check and warn for this (and
            # infer a minimal RDF by hand from what we can, eg
            # dc:title from the link text)
            self.download_if_needed(rdfurl, basefile,
                                    filename=rdffilename,
                                    archive=False)
            if os.path.getsize(rdffilename) == 0:
                self.log.warning("%s: %s returned 0 response, infer RDF" %
                                 (basefile, rdfurl))
                base = URIRef("http://libris.kb.se/resource/bib/%s" % librisid)
                fakegraph = Graph()
                fakegraph.bind("dc", str(DC))
                fakegraph.add((base, DC.title, Literal(title, lang="sv")))
                year = basefile.split(":")[0]  # Libris uses str type
                fakegraph.add((base, DC.date, Literal(year)))
                with open(rdffilename, "wb") as fp:
                    fakegraph.serialize(fp, format="pretty-xml")
        except requests.exceptions.HTTPError as e:
            self.log.error("Failed to load attachment: %s" % e)
            raise
        if self.get_parse_options(basefile) == "metadataonly":
            self.log.debug("%s: Marked as 'metadataonly', not downloading actual PDF file" % basefile)
            with self.store.open_downloaded(basefile, "w") as fp:
                pass
        else:
            if self.download_if_needed(pdfurl, basefile) or self.config.refresh:
                if created:
                    self.log.info("%s: download OK from %s" % (basefile, pdfurl))
                else:
                    self.log.info(
                        "%s: download OK (new version) from %s" % (basefile, pdfurl))
                updated = True
                try:
                    self.download_if_needed(
                        thumburl, basefile,
                        filename=self.store.downloaded_path(
                            basefile, attachment="thumb.jpg"))
                except requests.exceptions.HTTPError as e:
                    self.log.error("Failed to load attachment: %s" % e)
                    raise
            else:
                self.log.debug("%s: exists and is unchanged" % basefile)
        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url  # or pdfurl?
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()
        return updated

    def source_url(self, basefile):
        # this source does not have any predictable URLs, so we try to
        # find out if we made a note of the URL when we ran download()
        # FIXME: This code is repeated in jk.py and regeringen.py --
        # maybe we should let the default impl of source_url try this
        # strategy if eg self.remote_url is None?
        entry = DocumentEntry(self.store.documententry_path(basefile))
        return entry.orig_url

    def metadata_from_basefile(self, basefile):
        attrib = super(SOUKB, self).metadata_from_basefile(basefile)
        year, ordinal = basefile.split(":")
        attrib["rpubl:arsutgava"] = year
        attrib["rpubl:lopnummer"] = ordinal
        attrib["rpubl:utrSerie"] = self.lookup_resource("SOU", SKOS.altLabel)
        return attrib

    @lazyread
    def downloaded_to_intermediate(self, basefile, attachment=None):
        intermediate_path = self.store.intermediate_path(basefile)
        intermediate_dir = os.path.dirname(intermediate_path)
        keep_xml = "bz2" if self.config.compress == "bz2" else True
        reader = StreamingPDFReader()
        kwargs = {'filename': self.store.downloaded_path(basefile, attachment=attachment),
                  'workdir': intermediate_dir,
                  'images': self.config.pdfimages,
                  'keep_xml': keep_xml}
        if self.config.ocr:
            kwargs['ocr_lang'] = 'swe'
        return reader.convert(**kwargs)

    def extract_head(self, fp, basefile):
        return None  # "rawhead" is never used

    def extract_metadata(self, rawhead, basefile):
        metadata = util.readfile(self.store.downloaded_path(
            basefile, attachment="index.rdf"))
        # For some reason these RDF files might use canonical
        # decomposition form (NFD), which is less optimal. Fix this.
        metadata = unicodedata.normalize("NFC", metadata)
        sourcegraph = Graph().parse(data=metadata)
        rooturi = sourcegraph.value(predicate=RDF.type, object=BIBO.Book)
        if rooturi is None:
            # then just try to identify the main uri and use that
            subjects = set(sourcegraph.subjects())
            if len(subjects) == 1:
                rooturi = next(iter(subjects))
        title = sourcegraph.value(subject=rooturi, predicate=DC.title)
        issued = sourcegraph.value(subject=rooturi, predicate=DC.date)
        if isinstance(issued, str):
            # sometimes dc:date is weird like "1976[1974]" (SOU 1974:42)
            if len(issued) != 4:
                self.log.warning("expected issued date as single 4-digit year, got %s" % issued)
                # fall back on an approximation based on the basefile
                issued = basefile.split(":")[0]
            issued = Literal(util.gYear(int(issued)), datatype=XSD.gYear)
        attribs = self.metadata_from_basefile(basefile)
        attribs["dcterms:title"] = title
        if issued:
            attribs["dcterms:issued"] = issued
        return attribs

    def sanitize_metadata(self, props, doc):
        if props.get('dcterms:title') and " : betänkande" in props['dcterms:title']:
            props['dcterms:title'] = props['dcterms:title'].rsplit(" : ")[0]
        return props

    def sanitize_identifier(self, identifier):
        return sou_sanitize_identifier(identifier)

    def extract_body(self, fp, basefile):
        reader = StreamingPDFReader()
        parser = "ocr" if self.config.ocr else "xml"
        reader.read(fp, parser=parser)
        for page in reader:
            page.src = "index.pdf"  # FIXME: don't hardcode the filename
        return reader

    def sanitize_body(self, rawbody):
        sanitized = super(SOUKB, self).sanitize_body(rawbody)
        # Offtryck.sanitize_body will set self.analyzer. Everything from
        # KB is scanned, even though the PDF includes OCR information,
        # so the real source is not a hOCR file.
        sanitized.analyzer.scanned_source = True
        return sanitized

    def create_external_resources(self, doc):
        pass

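# For orientation only (the values are made up): download_get_basefiles
# above yields tuples of the form
#
#   ("1922:1", ("http://regina.kb.se/sou/...", "Betänkande om ..."))
#
# i.e. the "url" argument that download_single receives is really a
# (link, title) 2-tuple, which is why it unpacks it with
# "url, title = url" before fetching the landing page.
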
# inherit list_basefiles_for from CompositeStore, basefile_to_pathfrag
# from SwedishLegalStore
class SOUStore(CompositeStore, SwedishLegalStore):
    pass

class SOU(CompositeRepository, FixedLayoutSource):
    alias = "sou"
    rdf_type = RPUBL.Utredningsbetankande
    subrepos = (SOURegeringen, SOUKB)
    urispace_segment = "sou"
    urispace_segment_legacy = "utr/sou"
    documentstore_class = SOUStore
    xslt_template = "xsl/forarbete.xsl"
    sparql_annotations = "sparql/describe-with-subdocs.rq"
    sparql_expect_results = False

    # NB: The same logic as in
    # ferenda.sources.legal.se.{Regeringen,Riksdagen}.metadata_from_basefile
    def metadata_from_basefile(self, basefile):
        a = super(SOU, self).metadata_from_basefile(basefile)
        a["rpubl:arsutgava"], a["rpubl:lopnummer"] = basefile.split(":", 1)
        a["rpubl:utrSerie"] = self.lookup_resource("SOU", SKOS.altLabel)
        return a

    def facets(self):
        return super(SOU, self).facets() + [Facet(DCTERMS.title)]

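# A minimal, illustrative way of driving this composite repository from
# Python (the datadir value and the basefile are made up; a real setup
# normally goes through ferenda-build.py and a configured project):
#
#   repo = SOU(datadir="data")
#   repo.download()        # each subrepo (SOURegeringen, SOUKB) downloads
#   repo.parse("2016:12")  # parsed by the first subrepo that can handle it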