# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
import os
from datetime import datetime
import codecs
from bs4 import BeautifulSoup
from lxml import etree
import requests
from six import text_type as str
from ferenda import util, errors
from ferenda.elements import UnicodeElement, CompoundElement, \
Heading, Preformatted, Paragraph, Section, Link, ListItem, \
serialize
from ferenda import CompositeRepository, CompositeStore
from ferenda import PDFDocumentRepository
from ferenda import Describer
from ferenda import TextReader
from ferenda import PDFReader
from ferenda import DocumentEntry
from ferenda import LayeredConfig
from ferenda.decorators import managedparsing
from . import Trips, NoMoreLinks
from . import Regeringen
from . import Riksdagen
from . import RPUBL
from . import SwedishLegalSource, SwedishLegalStore
from .swedishlegalsource import offtryck_parser, offtryck_gluefunc
class PropRegeringen(Regeringen):
    """Propositions handled through the ``Regeringen`` base repository.

    Only class-level configuration is overridden here; all behaviour
    comes from ``Regeringen``.
    """
    alias = "propregeringen"

    # Strict form: the literal prefix "Prop." followed by the identifier,
    # eg "Prop. 2012/13:152". The dot is escaped so that it only matches
    # a real full stop and not any character.
    re_basefile_strict = re.compile(r'Prop\. (\d{4}/\d{2,4}:\d+)')

    # Lax form: the prefix (with or without the dot) is optional and
    # matching is case-insensitive.
    re_basefile_lax = re.compile(
        r'(?:Prop\.?|) ?(\d{4}/\d{2,4}:\d+)', re.IGNORECASE)

    rdf_type = RPUBL.Proposition
    document_type = Regeringen.PROPOSITION
class PropTrips(Trips):
    """Propositions handled through the ``Trips`` base repository.

    Downloading walks result pages for a given "base" (by default
    ``THWALLAPROP``) and remembers, in ``self.config.lastbase``, the
    archive base (eg ``PROPARKIV0809``) of the most recently seen
    document. Parsing prefers a PDF (downloaded, or converted from a
    word processor attachment) and falls back to the plaintext embedded
    in the downloaded HTML file.
    """
    alias = "proptrips"
    base = "THWALLAPROP"
    app = "prop"
    # Raw string, so that the \d escapes are handed to the regex engine
    # untouched instead of being (invalid) string escapes.
    basefile_regex = r"(?P<basefile>\d+/\d+:\d+)$"
    download_params = [{'maxpage': 101, 'app': app, 'base': base}]
    downloaded_suffix = ".html"
    rdf_type = RPUBL.Proposition
    storage_policy = "dir"

    def get_default_options(self):
        """Return the inherited default options plus a ``lastbase`` entry."""
        opts = super(PropTrips, self).get_default_options()
        opts['lastbase'] = "THWALLAPROP"
        return opts

    # don't use @recordlastdownload -- download_get_basefiles_page
    # should set self.config.lastbase instead
    def download(self, basefile=None):
        """Download a single document, or everything that is available.

        :param basefile: If given, only this document is downloaded
                         (delegated to the base class).
        :returns: Whatever the base class ``download`` returned for the
                  last pass, or ``None`` if no pass was needed.
        """
        if basefile:
            return super(PropTrips, self).download(basefile)

        result = None
        if (hasattr(self.config, 'lastbase') and
                self.config.lastbase and
                not self.config.refresh):
            now = datetime.now()
            # Same zero-padded format as basefile_to_base() in
            # download_get_basefiles_page produces, so that the two
            # values can actually compare equal.
            maxbase = "PROPARKIV%02d%02d" % (now.year % 100,
                                             (now.year + 1) % 100)
            while self.config.lastbase != maxbase:
                startbase = self.config.lastbase
                # override "THWALLAPROP" with eg "PROPARKIV0809". Work
                # on a per-instance copy so that the class-level default
                # (shared by all instances) is left untouched.
                params = dict(self.download_params[0])
                params['base'] = startbase
                self.download_params = ([params] +
                                        list(self.download_params[1:]))
                # download_get_basefiles_page sets lastbase as it goes
                # along
                result = super(PropTrips, self).download()
                if self.config.lastbase == startbase:
                    # A full pass did not move us forward; another
                    # identical pass would not either, so stop instead
                    # of looping forever.
                    break
        else:
            result = super(PropTrips, self).download()
        LayeredConfig.write(self.config)  # assume we have data to write
        return result

    def download_get_basefiles_page(self, pagetree):
        """Yield the documents listed on one result page.

        For every table row whose first link text matches
        ``basefile_regex`` this yields
        ``((basefile, bilaga), (url, extraurls))``. As a side effect,
        ``self.config.lastbase`` is updated for each row.

        :param pagetree: The lxml tree of the result page.
        :raises NoMoreLinks: Always, once all rows have been yielded.
                             It carries the link whose text is
                             "Fler poster", or ``None`` if the page has
                             no such link.
        """
        def basefile_to_base(basefile):
            # 1992/93:23 -> "PROPARKIV9293"
            # 1999/2000:23 -> "PROPARKIV9900"
            (y1, y2, idx) = re.split("[:/]", basefile)
            return "PROPARKIV%02d%02d" % (int(y1) % 100, int(y2) % 100)

        # feed the lxml tree into beautifulsoup by serializing it to a
        # string -- is there a better way?
        soup = BeautifulSoup(etree.tostring(pagetree))
        for tr in soup.find_all("tr"):
            firstlink = tr.find("a")
            if (not firstlink or
                    not re.match(self.basefile_regex, firstlink.text)):
                # FIXME: Maybe re.search instead of .match to find
                # "Prop. 2012/13:152"
                continue
            tds = tr.find_all("td")

            # First, look at desc (third td): only its direct text
            # children are of interest.
            descnodes = [util.normalize_space(x) for x in tds[2]
                         if isinstance(x, str)]
            bilaga = None
            if len(descnodes) > 1 and descnodes[1].startswith("Bilaga:"):
                bilaga = util.normalize_space(descnodes[0].split(",")[-1])

            # then, find basefile (second td)
            td = tds[1]
            basefile = td.a.text
            assert re.match(self.basefile_regex, basefile)
            basefile = self.sanitize_basefile(basefile)

            # assume entries are strictly sorted from ancient to
            # recent. Therefore, as soon as we encounter a new time
            # period (eg 1998/99) we can update self.config.lastbase
            self.config.lastbase = basefile_to_base(basefile)
            url = td.a['href']

            # and, if present, extra files (in td 4+5). Cells without a
            # usable link are skipped rather than crashing the listing.
            extraurls = [extra.a['href'] for extra in tds[3:]
                         if extra.a is not None and extra.a.get('href')]

            # we slightly abuse the protocol between
            # download_get_basefiles and this generator -- instead of
            # yielding just two strings, we yield two tuples with some
            # extra information that download_single will need.
            yield (basefile, bilaga), (url, extraurls)

        nextpage = None
        for element, attribute, link, pos in pagetree.iterlinks():
            if element.text and element.text.strip() == "Fler poster":
                nextpage = link
        raise NoMoreLinks(nextpage)

    def download_single(self, basefile, url):
        """Download one document and any extra files that go with it.

        :param basefile: Either a plain basefile string or a
                         ``(basefile, attachment)`` tuple as yielded by
                         ``download_get_basefiles_page``.
        :param url: Either a plain URL string or a ``(url, extraurls)``
                    tuple as yielded by ``download_get_basefiles_page``.
        :returns: ``True`` if the main file was downloaded (new or
                  changed), ``False`` otherwise.
        """
        # Defaults for when plain strings (not tuples) are passed.
        attachment = None
        mainattachment = None
        extraurls = []

        # unpack the tuples we may receive instead of plain strings
        if isinstance(basefile, tuple):
            basefile, attachment = basefile
            if attachment:
                mainattachment = attachment + ".html"
        if isinstance(url, tuple):
            url, extraurls = url

        updated = False
        filename = self.store.downloaded_path(basefile,
                                              attachment=mainattachment)
        created = not os.path.exists(filename)

        # since the server doesn't support conditional caching and
        # propositioner are basically never updated once published, we
        # avoid even calling download_if_needed if we already have
        # the doc. The check is made on the file we are about to fetch
        # (which for an attachment is not the main document file).
        if not created and not self.config.refresh:
            self.log.debug("%s: already exists" % basefile)
            return False

        if self.download_if_needed(url, basefile, filename=filename):
            if created:
                self.log.info("%s: downloaded from %s" % (basefile, url))
            else:
                self.log.info(
                    "%s: downloaded new version from %s" % (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)

        # Leading four bytes of the file formats we know how to handle.
        signatures = {b'\xffWPC': ".wpd",
                      b'\xd0\xcf\x11\xe0': ".doc",
                      b'PK\x03\x04': ".docx",
                      b'{\\rt': ".rtf"}

        # A separate loop variable is used so that ``url`` keeps
        # referring to the main document when the entry is saved below.
        for extraurl in extraurls:
            if extraurl.endswith('msword.application'):
                # NOTE: We cannot be sure that this is
                # actually a Word (CDF) file. For older files
                # it might be a WordPerfect file (.wpd) or a
                # RTF file, for newer it might be a .docx. We
                # cannot be sure until we've downloaded it.
                # So we quickly read the first 4 bytes
                response = requests.get(extraurl, stream=True)
                try:
                    sig = response.raw.read(4)
                finally:
                    # streamed responses hold on to their connection
                    # until explicitly closed
                    response.close()
                doctype = signatures.get(sig)
                if doctype is None:
                    self.log.error(
                        "%s: Attached file has signature %r -- don't know what type this is" % (basefile, sig))
                    continue
            elif extraurl.endswith('pdf.application'):
                doctype = ".pdf"
            else:
                self.log.warning("Unknown doc type %s" %
                                 extraurl.split("=")[-1])
                doctype = None

            if doctype:
                if attachment:
                    filename = self.store.downloaded_path(
                        basefile, attachment=attachment + doctype)
                else:
                    filename = self.store.downloaded_path(
                        basefile, attachment="index" + doctype)
                self.log.debug("%s: downloading attachment %s" %
                               (basefile, filename))
                self.download_if_needed(extraurl, basefile,
                                        filename=filename)

        # Only the main document (not an attachment) gets its document
        # entry written.
        if mainattachment is None:
            entry = DocumentEntry(self.store.documententry_path(basefile))
            now = datetime.now()
            entry.orig_url = url
            if created:
                entry.orig_created = now
            if updated:
                entry.orig_updated = now
            # reaching this point always means the remote was checked
            entry.orig_checked = now
            entry.save()

        return updated

    # Correct some invalid identifiers spotted in the wild:
    # 1999/20 -> 1999/2000
    # 2000/2001 -> 2000/01
    # 1999/98 -> 1999/2000
    def sanitize_basefile(self, basefile):
        """Return ``basefile`` with a malformed second year corrected.

        :param basefile: Identifier of the form ``YYYY/YY:N`` (or
                         ``YYYY/YYYY:N``).
        :returns: The corrected identifier, or ``basefile`` itself if
                  nothing needed correcting. A warning is logged
                  whenever a correction is made.
        :raises AssertionError: If the first year is not four characters.
        :raises ValueError: If ``basefile`` does not split into exactly
                            three parts on ``/`` and ``:``.
        """
        (y1, y2, idx) = re.split("[:/]", basefile)
        assert len(
            y1) == 4, "Basefile %s is invalid beyond sanitization" % basefile

        if y1 == "1999":
            # the only valid second year for 1999 is the four-digit 2000
            sanitized = basefile if y2 == "2000" else "1999/2000:" + idx
        elif (len(y2) != 2 or                  # eg "2000/001"
              int(y1[2:]) + 1 != int(y2)):     # eg "1998/97"
            sanitized = "%s/%02d:%s" % (y1, int(y1[2:]) + 1, idx)
        else:
            sanitized = basefile

        if sanitized != basefile:
            self.log.warning("Basefile given as %s, correcting to %s" %
                             (basefile, sanitized))
        return sanitized

    # For parsing:
    # 1993/94 and 1994/95 have only plaintext
    #   (NOTE(review): the original note said "1999/94" -- presumably a
    #   typo for 1993/94; confirm)
    # 1995/96 to 2006/07 have plaintext + doc
    # 2007/08 onwards have plaintext, doc and pdf
    @managedparsing
    def parse(self, doc):
        """Parse the downloaded files for ``doc.basefile`` into ``doc``.

        A PDF is preferred. If none was downloaded but a word processor
        file was, that file is converted to PDF with soffice first. If
        no PDF can be had, the ``<pre>`` section of the downloaded HTML
        file is parsed as plaintext.

        :param doc: The document object to populate.
        :returns: ``True`` on success.
        :raises ferenda.errors.ParseError: Wrapping any exception raised
            during parsing (including the "no downloaded file" case).
        """
        try:
            # prefer PDF or Word files over the plaintext-containing
            # HTML files
            # FIXME: PDF or Word files are now stored as attachments
            htmlfile = self.store.downloaded_path(doc.basefile)
            pdffile = self.store.path(doc.basefile, 'downloaded', '.pdf')
            wordfiles = tuple(
                self.store.path(doc.basefile, 'downloaded', suffix)
                for suffix in ('.doc', '.docx', '.wpd', '.rtf'))

            # check if ANY of these exist. (any() rather than filter():
            # on Python 3 a filter object is always truthy.)
            candidates = wordfiles + (htmlfile, pdffile)
            if not any(os.path.exists(f) for f in candidates):
                raise errors.NoDownloadedFileError(
                    "File '%s' (or any .pdf/.doc/.docx/.wpd/.rtf variant) not found" % htmlfile)

            # if several word processor files exist, the last one in
            # the tuple above wins
            wordfile = None
            for f in wordfiles:
                if os.path.exists(f):
                    wordfile = f

            doc.uri = self.canonical_uri(doc.basefile)
            d = Describer(doc.meta, doc.uri)
            d.rdftype(self.rdf_type)

            # if we lack a .pdf file, use Open/LibreOffice to convert any
            # .wpd or .doc file to .pdf first
            if wordfile and not os.path.exists(pdffile):
                intermediate_pdf = self.store.path(
                    doc.basefile, "intermediate", ".pdf")
                if not os.path.exists(intermediate_pdf):
                    cmdline = "%s --headless -convert-to pdf -outdir '%s' %s" % (
                        self.config.get('soffice', 'soffice'),
                        os.path.dirname(intermediate_pdf),
                        wordfile)
                    self.log.debug(
                        "%s: Converting to PDF: %s" % (doc.basefile, cmdline))
                    (ret, stdout, stderr) = util.runcmd(
                        cmdline, require_success=True)
                pdffile = intermediate_pdf

            if os.path.exists(pdffile):
                self.log.debug("%s: Using %s" % (doc.basefile, pdffile))
                intermediate_dir = os.path.dirname(
                    self.store.path(doc.basefile, 'intermediate', '.foo'))
                pdfreader = PDFReader()
                keep_xml = "bz2" if self.config.compress == "bz2" else True
                pdfreader.read(pdffile, intermediate_dir, keep_xml=keep_xml)
                self.parse_from_pdfreader(pdfreader, doc)
            else:
                downloaded_path = self.store.downloaded_path(doc.basefile)
                intermediate_path = self.store.path(
                    doc.basefile, 'intermediate', '.txt')
                self.log.debug("%s: Using %s (%s)" % (doc.basefile,
                                                      downloaded_path,
                                                      intermediate_path))
                if not os.path.exists(intermediate_path):
                    # the context manager makes sure the file handle is
                    # released even if reading fails
                    with codecs.open(downloaded_path,
                                     encoding="iso-8859-1") as fp:
                        html = fp.read()
                    util.writefile(intermediate_path, util.extract_text(
                        html, '<pre>', '</pre>'), encoding="utf-8")
                textreader = TextReader(intermediate_path, encoding="utf-8")
                self.parse_from_textreader(textreader, doc)

            # How to represent that one XHTML doc was created from
            # plaintext, and another from PDF? create a bnode
            # representing the source prov:wasDerivedFrom and set its
            # dcterms:format to correct mime type
            d.value(self.ns['prov'].wasGeneratedBy,
                    self.qualified_class_name())
            self.infer_triples(d, doc.basefile)
            return True
        except Exception as e:
            err = errors.ParseError(str(e))
            if isinstance(e, IOError):
                err.dummyfile = self.store.parsed_path(doc.basefile)
            raise err

    def parse_from_pdfreader(self, pdfreader, doc):
        """Set ``doc.body`` from the text boxes of an already-read PDF."""
        parser = offtryck_parser(preset='proposition')
        doc.body = parser.parse(pdfreader.textboxes(offtryck_gluefunc))

    def parse_from_textreader(self, textreader, doc):
        """Populate ``doc`` from the plaintext version of a proposition.

        Before any body content has been collected, the boilerplate
        notice and the "Dokument:" paragraph are dropped and the
        "Titel:" paragraph is recorded as ``dcterms:title``. Every other
        non-blank paragraph is appended to ``doc.body`` as a
        ``Preformatted`` element.
        """
        describer = Describer(doc.meta, doc.uri)
        for p in textreader.getiterator(textreader.readparagraph):
            stripped = p.strip()
            if not stripped:
                continue
            if not doc.body:
                # header section -- nothing has been added to the body yet
                if 'Obs! Dokumenten i denna databas kan vara ofullständiga.' in p:
                    continue
                if stripped.startswith("Dokument:"):
                    # We already know this
                    continue
                if stripped.startswith("Titel:"):
                    # slice the stripped text, so that leading whitespace
                    # in the paragraph cannot shift the cut-off point
                    title = stripped[len("Titel:"):].strip()
                    describer.value(self.ns['dcterms'].title,
                                    util.normalize_space(title))
                    continue
            doc.body.append(Preformatted([p]))
class PropRiksdagen(Riksdagen):
    """Propositions handled through the ``Riksdagen`` base repository.

    Only class-level configuration is overridden here; all behaviour
    comes from ``Riksdagen``.
    """
    alias = "propriksdagen"
    rdf_type = RPUBL.Proposition
    document_type = Riksdagen.PROPOSITION
class PropositionerStore(CompositeStore, SwedishLegalStore):
    """Document store used by the composite ``Propositioner`` repository.

    Adds nothing of its own: ``list_basefiles_for`` is inherited from
    ``CompositeStore`` and ``basefile_to_pathfrag`` from
    ``SwedishLegalStore``.
    """
class Propositioner(CompositeRepository, SwedishLegalSource):
    """Composite repository combining the three proposition sources."""

    # The individual repositories this composite is built from.
    subrepos = (PropRegeringen, PropTrips, PropRiksdagen)

    alias = "prop"
    rdf_type = RPUBL.Proposition
    storage_policy = "dir"
    documentstore_class = PropositionerStore
    xslt_template = "paged.xsl"

    def tabs(self, primary=False):
        """Return a single (label, url) tab pointing at the dataset URI."""
        label = 'Förarbeten'
        target = self.dataset_uri()
        return [(label, target)]