Source code for ferenda.sources.legal.se.propositioner

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
import os
from datetime import datetime
import codecs

from bs4 import BeautifulSoup
from lxml import etree
import requests
from layeredconfig import LayeredConfig
from six import text_type as str

from ferenda import util, errors
from ferenda.elements import UnicodeElement, CompoundElement, \
    Heading, Preformatted, Paragraph, Section, Link, ListItem, \
    serialize
from ferenda import CompositeRepository, CompositeStore
from ferenda import PDFDocumentRepository
from ferenda import Describer
from ferenda import TextReader
from ferenda import PDFReader
from ferenda import DocumentEntry
from ferenda.decorators import managedparsing
from . import Trips, NoMoreLinks
from . import Regeringen
from . import Riksdagen
from . import RPUBL
from . import SwedishLegalSource, SwedishLegalStore
from .swedishlegalsource import offtryck_parser, offtryck_gluefunc

class PropRegeringen(Regeringen):
    alias = "propregeringen"
    re_basefile_strict = re.compile(r'Prop. (\d{4}/\d{2,4}:\d+)')
    re_basefile_lax = re.compile(
        r'(?:Prop\.?|) ?(\d{4}/\d{2,4}:\d+)', re.IGNORECASE)
    rdf_type = RPUBL.Proposition
    document_type = Regeringen.PROPOSITION
    # sparql_annotations = "res/sparql/prop-annotations.rq"
    sparql_annotations = None  # don't even bother creating an annotation file
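
# Illustrative only, not part of the original module: how the two basefile
# regexes above behave. The strict variant expects a "Prop. " prefix, while
# the lax variant also accepts lowercased or prefix-less forms:
#
#   >>> PropRegeringen.re_basefile_strict.search("Prop. 2012/13:152").group(1)
#   '2012/13:152'
#   >>> PropRegeringen.re_basefile_lax.search("prop 2012/13:152").group(1)
#   '2012/13:152'
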
class PropTrips(Trips):
    alias = "proptrips"
    base = "THWALLAPROP"
    app = "prop"

    basefile_regex = "(?P<basefile>\d+/\d+:\d+)$"

    download_params = [{'maxpage': 101, 'app': app, 'base': base}]

    downloaded_suffix = ".html"
    rdf_type = RPUBL.Proposition
    storage_policy = "dir"

    def get_default_options(self):
        opts = super(PropTrips, self).get_default_options()
        opts['lastbase'] = "THWALLAPROP"
        return opts

    # don't use @recordlastdownload -- download_get_basefiles_page
    # should set self.config.lastbase instead
    def download(self, basefile=None):
        if basefile:
            return super(PropTrips, self).download(basefile)
        else:
            r = None
            if ('lastbase' in self.config and
                    self.config.lastbase and
                    not self.config.refresh):
                now = datetime.now()
                maxbase = "PROPARKIV%02d%02d" % (now.year % 100,
                                                 (now.year + 1) % 100)
                while self.config.lastbase != maxbase:
                    # override "THWALLAPROP" with eg "PROPARKIV0809"
                    self.download_params[0]['base'] = self.config.lastbase
                    # download_get_basefiles_page sets lastbase as it
                    # goes along
                    r = super(PropTrips, self).download()
            else:
                r = super(PropTrips, self).download()
            LayeredConfig.write(self.config)  # assume we have data to write
            return r

    def _basefile_to_base(self, basefile):
        # 1992/93:23 -> "PROPARKIV9293"
        # 1999/2000:23 -> "PROPARKIV9900"
        (y1, y2, idx) = re.split("[:/]", basefile)
        return "PROPARKIV%02d%02d" % (int(y1) % 100, int(y2) % 100)

    def download_get_basefiles_page(self, pagetree):
        # feed the lxml tree into beautifulsoup by serializing it to a
        # string -- is there a better way?
        soup = BeautifulSoup(etree.tostring(pagetree))
        for tr in soup.findAll("tr"):
            if ((not tr.find("a")) or
                    not re.match(self.basefile_regex, tr.find("a").text)):
                # FIXME: Maybe re.search instead of .match to find
                # "Prop. 2012/13:152"
                continue

            # First, look at desc (third td):
            descnodes = [util.normalize_space(x)
                         for x in tr.find_all("td")[2]
                         if isinstance(x, str)]
            bilaga = None
            if len(descnodes) > 1:
                if descnodes[1].startswith("Bilaga:"):
                    bilaga = util.normalize_space(
                        descnodes[0].split(",")[-1])
            desc = "\n".join(descnodes)

            # then, find basefile (second td)
            tds = tr.find_all("td")
            td = tds[1]
            basefile = td.a.text
            assert re.match(self.basefile_regex, basefile)

            basefile = self.sanitize_basefile(basefile)

            # assume entries are strictly sorted from ancient to
            # recent. Therefore, as soon as we encounter a new time
            # period (eg 1998/99) we can update self.config.lastbase
            self.config.lastbase = self._basefile_to_base(basefile)

            url = td.a['href']
            # self.download_single(basefile, refresh=refresh, url=url)

            # and, if present, extra files (in td 4+5)
            extraurls = []
            for td in tr.findAll("td")[3:]:
                extraurls.append(td.a['href'])

            # we slightly abuse the protocol between
            # download_get_basefiles and this generator -- instead of
            # yielding just two strings, we yield two tuples with some
            # extra information that download_single will need.
            yield (basefile, bilaga), (url, extraurls)

        nextpage = None
        for element, attribute, link, pos in pagetree.iterlinks():
            if element.text and element.text.strip() == "Fler poster":
                nextpage = link
        raise NoMoreLinks(nextpage)

    document_url_template = ("http://rkrattsbaser.gov.se/cgi-bin/thw?"
                             "${HTML}=prop_lst&${OOHTML}=prop_dok"
                             "&${SNHTML}=prop_err&${HILITE}=1&${MAXPAGE}=26"
                             "&${TRIPSHOW}=format=THW"
                             "&${SAVEHTML}=/prop/prop_form2.html"
                             "&${BASE}=%(base)s&${FREETEXT}=&${FREETEXT}="
                             "&PRUB=&DOK=p&PNR=%(pnr)s&ORG=")

    def remote_url(self, basefile):
        base = self._basefile_to_base(basefile)
        pnr = basefile.split(":")[1]
        return self.document_url_template % {'base': base, 'pnr': pnr}

    def download_single(self, basefile, url=None):
        if url is None:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return

        # unpack the tuples we may receive instead of plain strings
        mainattachment = None
        if isinstance(basefile, tuple):
            basefile, attachment = basefile
            if attachment:
                mainattachment = attachment + ".html"
        if isinstance(url, tuple):
            url, extraurls = url

        updated = created = False
        checked = True
        filename = self.store.downloaded_path(basefile,
                                              attachment=mainattachment)
        created = not os.path.exists(filename)
        # since the server doesn't support conditional caching and
        # propositioner are basically never updated once published, we
        # avoid even calling download_if_needed if we already have
        # the doc.
        if (os.path.exists(self.store.downloaded_path(basefile)) and
                not self.config.refresh):
            self.log.debug("%s: already exists" % basefile)
            return
        if self.download_if_needed(url, basefile, filename=filename):
            if created:
                self.log.info("%s: downloaded from %s" % (basefile, url))
            else:
                self.log.info(
                    "%s: downloaded new version from %s" % (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)

        for url in extraurls:
            if url.endswith('msword.application'):
                # NOTE: We cannot be sure that this is actually a Word
                # (CDF) file. For older files it might be a WordPerfect
                # file (.wpd) or an RTF file, for newer it might be a
                # .docx. We cannot be sure until we've downloaded it.
                # So we quickly read the first 4 bytes
                r = requests.get(url, stream=True)
                sig = r.raw.read(4)
                # r.raw.close()
                # bodyidx = head.index("\n\n")
                # sig = head[bodyidx:bodyidx+4]
                if sig == b'\xffWPC':             # WordPerfect
                    doctype = ".wpd"
                elif sig == b'\xd0\xcf\x11\xe0':  # OLE2/CDF container
                    doctype = ".doc"
                elif sig == b'PK\x03\x04':        # ZIP container (OOXML)
                    doctype = ".docx"
                elif sig == b'{\\rt':             # RTF
                    doctype = ".rtf"
                else:
                    self.log.error(
                        "%s: Attached file has signature %r "
                        "-- don't know what type this is" % (basefile, sig))
                    continue
            elif url.endswith('pdf.application'):
                doctype = ".pdf"
            else:
                self.log.warning("Unknown doc type %s" %
                                 url.split("=")[-1])
                doctype = None
            if doctype:
                if attachment:
                    filename = self.store.downloaded_path(
                        basefile, attachment=attachment + doctype)
                else:
                    filename = self.store.downloaded_path(
                        basefile, attachment="index" + doctype)
                self.log.debug("%s: downloading attachment %s" %
                               (basefile, filename))
                self.download_if_needed(url, basefile, filename=filename)

        if mainattachment is None:
            entry = DocumentEntry(self.store.documententry_path(basefile))
            now = datetime.now()
            entry.orig_url = url
            if created:
                entry.orig_created = now
            if updated:
                entry.orig_updated = now
            if checked:
                entry.orig_checked = now
            entry.save()

        return updated

    # Correct some invalid identifiers spotted in the wild:
    # 1999/20 -> 1999/2000
    # 2000/2001 -> 2000/01
    # 1999/98 -> 1999/2000
    def sanitize_basefile(self, basefile):
        (y1, y2, idx) = re.split("[:/]", basefile)
        assert len(y1) == 4, \
            "Basefile %s is invalid beyond sanitization" % basefile
        if y1 == "1999" and y2 != "2000":
            sanitized = "1999/2000:" + idx
            self.log.warning("Basefile given as %s, correcting to %s" %
                             (basefile, sanitized))
        elif (y1 != "1999" and
              (len(y2) != 2 or  # eg "2000/001"
               int(y1[2:]) + 1 != int(y2))):  # eg "2000/2001"
            sanitized = "%s/%02d:%s" % (y1, int(y1[2:]) + 1, idx)
            self.log.warning("Basefile given as %s, correcting to %s" %
                             (basefile, sanitized))
        else:
            sanitized = basefile
        return sanitized

    # For parsing:
    # 1993/94 and 1994/95 have only plaintext
    # 1995/96 to 2006/07 have plaintext + doc
    # 2007/08 onwards have plaintext, doc and pdf
    @managedparsing
    def parse(self, doc):
        try:
            # prefer PDF or Word files over the plaintext-containing
            # HTML files
            # FIXME: PDF or Word files are now stored as attachments
            htmlfile = self.store.downloaded_path(doc.basefile)
            pdffile = self.store.path(doc.basefile, 'downloaded', '.pdf')
            wordfiles = (self.store.path(doc.basefile, 'downloaded', '.doc'),
                         self.store.path(doc.basefile, 'downloaded', '.docx'),
                         self.store.path(doc.basefile, 'downloaded', '.wpd'),
                         self.store.path(doc.basefile, 'downloaded', '.rtf'))
            # check if ANY of these exist (any(), not filter(), which
            # would always be truthy on python 3)
            if not any(os.path.exists(f)
                       for f in wordfiles + (htmlfile, pdffile)):
                raise errors.NoDownloadedFileError(
                    "File '%s' (or any .pdf/.doc/.docx/.wpd/.rtf variant) "
                    "not found" % htmlfile)
            wordfile = None
            for f in wordfiles:
                if os.path.exists(f):
                    wordfile = f

            doc.uri = self.canonical_uri(doc.basefile)
            d = Describer(doc.meta, doc.uri)
            d.rdftype(self.rdf_type)

            # if we lack a .pdf file, use Open/LibreOffice to convert any
            # .wpd or .doc file to .pdf first
            if wordfile and not os.path.exists(pdffile):
                intermediate_pdf = self.store.path(
                    doc.basefile, "intermediate", ".pdf")
                if not os.path.exists(intermediate_pdf):
                    cmdline = "%s --headless -convert-to pdf -outdir '%s' %s" % (
                        self.config.get('soffice', 'soffice'),
                        os.path.dirname(intermediate_pdf),
                        wordfile)
                    self.log.debug("%s: Converting to PDF: %s" %
                                   (doc.basefile, cmdline))
                    (ret, stdout, stderr) = util.runcmd(
                        cmdline, require_success=True)
                pdffile = intermediate_pdf

            if os.path.exists(pdffile):
                self.log.debug("%s: Using %s" % (doc.basefile, pdffile))
                intermediate_dir = os.path.dirname(
                    self.store.path(doc.basefile, 'intermediate', '.foo'))
                keep_xml = "bz2" if self.config.compress == "bz2" else True
                pdfreader = PDFReader(filename=pdffile,
                                      workdir=intermediate_dir,
                                      keep_xml=keep_xml)
                self.parse_from_pdfreader(pdfreader, doc)
            else:
                downloaded_path = self.store.downloaded_path(doc.basefile)
                intermediate_path = self.store.path(
                    doc.basefile, 'intermediate', '.txt')
                self.log.debug("%s: Using %s (%s)" %
                               (doc.basefile, downloaded_path,
                                intermediate_path))
                if not os.path.exists(intermediate_path):
                    html = codecs.open(
                        downloaded_path, encoding="iso-8859-1").read()
                    util.writefile(intermediate_path,
                                   util.extract_text(html, '<pre>', '</pre>'),
                                   encoding="utf-8")
                textreader = TextReader(intermediate_path, encoding="utf-8")
                self.parse_from_textreader(textreader, doc)

            # How to represent that one XHTML doc was created from
            # plaintext, and another from PDF? create a bnode
            # representing the source prov:wasDerivedFrom and set its
            # dcterms:format to correct mime type
            d.value(self.ns['prov'].wasGeneratedBy,
                    self.qualified_class_name())
            self.infer_triples(d, doc.basefile)
            return True
        except Exception as e:
            err = errors.ParseError(str(e))
            if isinstance(e, IOError):
                err.dummyfile = self.store.parsed_path(doc.basefile)
            raise err

    def parse_from_pdfreader(self, pdfreader, doc):
        parser = offtryck_parser(preset='proposition')
        doc.body = parser.parse(pdfreader.textboxes(offtryck_gluefunc))

    def parse_from_textreader(self, textreader, doc):
        describer = Describer(doc.meta, doc.uri)
        for p in textreader.getiterator(textreader.readparagraph):
            # print("Handling %r (%s)" % (p[:40], len(doc.body)))
            if not p.strip():
                continue
            elif not doc.body and 'Obs! Dokumenten i denna databas kan vara ofullständiga.' in p:
                continue
            elif not doc.body and p.strip().startswith("Dokument:"):
                # We already know this
                continue
            elif not doc.body and p.strip().startswith("Titel:"):
                describer.value(self.ns['dcterms'].title,
                                util.normalize_space(p[7:]))
            else:
                doc.body.append(Preformatted([p]))
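
# Illustrative examples, not part of the original module, of how the
# PropTrips helpers above compose a TRIPS query URL (instantiating with
# no arguments assumes a default configuration):
#
#   >>> repo = PropTrips()
#   >>> repo._basefile_to_base("1992/93:23")
#   'PROPARKIV9293'
#   >>> repo._basefile_to_base("1999/2000:23")
#   'PROPARKIV9900'
#   >>> repo.sanitize_basefile("1999/98:1")   # logs a warning
#   '1999/2000:1'
#   >>> "PNR=23" in repo.remote_url("1992/93:23")
#   True
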
class PropRiksdagen(Riksdagen):
    alias = "propriksdagen"
    rdf_type = RPUBL.Proposition
    document_type = Riksdagen.PROPOSITION

# inherit list_basefiles_for from CompositeStore, basefile_to_pathfrag
# from SwedishLegalStore
class PropositionerStore(CompositeStore, SwedishLegalStore):
    pass


class Propositioner(CompositeRepository, SwedishLegalSource):
    subrepos = PropRegeringen, PropTrips, PropRiksdagen
    alias = "prop"
    xslt_template = "res/xsl/forarbete.xsl"
    storage_policy = "dir"
    rdf_type = RPUBL.Proposition
    documentstore_class = PropositionerStore
    sparql_annotations = None  # don't even bother creating an annotation file

    def tabs(self, primary=False):
        return [('Förarbeten', self.dataset_uri())]
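
# A minimal usage sketch, not part of the original module. It assumes a
# standard ferenda setup; "data" and the basefile below are placeholders.
# The composite repository tries each subrepo in turn, so a single parse()
# call can draw on whichever source actually has the document:
#
#   from ferenda.sources.legal.se.propositioner import Propositioner
#   repo = Propositioner(datadir="data")
#   repo.download()            # runs download for each of the subrepos
#   repo.parse("2012/13:152")  # uses the first subrepo that succeeds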