Source code for ferenda.sources.legal.se.dv

# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *
"""Hanterar domslut (detaljer och referat) från Domstolsverket. Data
hämtas fran DV:s (ickepublika) FTP-server, eller fran lagen.nu."""

# system libraries (incl six-based renames)
from bz2 import BZ2File
from collections import defaultdict
from datetime import datetime, timedelta
from ftplib import FTP
from io import BytesIO
from time import mktime
from urllib.parse import urljoin, urlparse
import codecs
import itertools
import logging
import os
import re
import tempfile
import zipfile

# 3rdparty libs
from cached_property import cached_property
from rdflib import Namespace, URIRef, Graph, RDF, RDFS, BNode
from rdflib.namespace import DCTERMS, SKOS, FOAF
import requests
import lxml.html
from lxml import etree
from bs4 import BeautifulSoup, NavigableString
try:
    # this is an optional dependency that only works on py3 and which
    # is only needed when multiple processes write to a single shared
    # file (generated/uri.map) over NFS
    from flufl.lock import Lock
except ImportError:
    Lock = None
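
# Illustrative sketch (not part of the original module): the note above is
# about several processes appending to a single shared file
# (generated/uri.map) over NFS. Assuming flufl.lock is available, guarding
# such an append with its Lock context manager could look roughly like
# this; the helper name and lock-file path are hypothetical.
def _append_to_shared_mapfile(mapfile, line):
    """Append one line to a shared map file, locking it if flufl.lock exists."""
    if Lock is not None:
        # All writers contend on the same <mapfile>.lock file.
        with Lock(mapfile + ".lock"):
            with codecs.open(mapfile, "a", encoding="utf-8") as fp:
                fp.write(line)
    else:
        # Fallback for single-process setups where locking is not needed.
        with codecs.open(mapfile, "a", encoding="utf-8") as fp:
            fp.write(line)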


# my libs
from ferenda import (Document, DocumentStore, Describer, WordReader, FSMParser, Facet)
from ferenda.decorators import newstate, action
from ferenda import util, errors, fulltextindex
from ferenda.sources.legal.se.legalref import LegalRef
from ferenda.elements import (Body, Paragraph, CompoundElement, OrdinalElement,
                              Heading, Link)

from ferenda.elements.html import Strong, Em, Div, P
from . import SwedishLegalSource, SwedishCitationParser, RPUBL
from .elements import *


PROV = Namespace(util.ns['prov'])


class DVStore(DocumentStore):

    """Customized DocumentStore.
    """
    downloaded_suffixes = [".docx", ".doc"]
    
    def basefile_to_pathfrag(self, basefile):
        return basefile

    def pathfrag_to_basefile(self, pathfrag):
        return pathfrag
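
# Illustrative sketch (not part of the original module): DV basefiles such as
# "HDO/T3467-96_1" already carry a court-code prefix and a slash, so the
# identity mappings above keep that structure unchanged as the on-disk path
# fragment. The datadir value below is a hypothetical example, assuming
# DocumentStore takes the data directory as its first argument.
def _dvstore_pathfrag_example():
    store = DVStore("data/dv")
    assert store.basefile_to_pathfrag("HDO/T3467-96_1") == "HDO/T3467-96_1"
    assert store.pathfrag_to_basefile("HDO/T3467-96_1") == "HDO/T3467-96_1"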


class KeywordContainsDescription(errors.FerendaException):
    def __init__(self, keywords, descriptions):
        self.keywords = keywords
        self.descriptions = descriptions

class DuplicateReferatDoc(errors.DocumentRemovedError):
    pass
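
# Illustrative sketch (not part of the original module): further down,
# sanitize_sokord() raises KeywordContainsDescription when some "Sökord"
# entries read like free-text descriptions rather than keywords, and
# sanitize_metadata() catches it to keep both parts. The handling pattern
# looks roughly like this (the repo, sokordstring and basefile arguments
# are hypothetical placeholders).
def _split_sokord_example(repo, sokordstring, basefile):
    try:
        keywords = repo.sanitize_sokord(sokordstring, basefile)
        description = None
    except KeywordContainsDescription as e:
        keywords = e.keywords  # entries that still look like proper keywords
        description = " / ".join(e.descriptions)  # wordy entries, usable as a title
    return keywords, description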

        
class DV(SwedishLegalSource):
    """Handles legal cases, in report form, from primarily final
    instance courts. Cases are fetched from Domstolsverkets FTP server
    for "Vägledande avgöranden", and are converted from doc/docx format.
    """
    alias = "dv"
    downloaded_suffix = ".zip"
    rdf_type = (RPUBL.Rattsfallsreferat, RPUBL.Rattsfallsnotis)
    documentstore_class = DVStore
    # This is very similar to SwedishLegalSource.required_predicates,
    # only DCTERMS.title has been changed to RPUBL.referatrubrik (and if
    # our validating function grokked that rpubl:referatrubrik
    # rdfs:isSubpropertyOf dcterms:title, we wouldn't need this). Also, we
    # removed dcterms:issued because there is no actual way of getting
    # this data (apart from like the file time stamps). On further
    # thinking, we remove RPUBL.referatrubrik as it's not present (or
    # required) for rpubl:Rattsfallsnotis
    required_predicates = [RDF.type, DCTERMS.identifier, PROV.wasGeneratedBy]

    DCTERMS = Namespace(util.ns['dcterms'])
    sparql_annotations = "sparql/dv-annotations.rq"
    sparql_expect_results = False
    xslt_template = "xsl/dv.xsl"

    @classmethod
    def relate_all_setup(cls, config, *args, **kwargs):
        # FIXME: If this was an instancemethod, we could use
        # self.store methods instead
        parsed_dir = os.path.sep.join([config.datadir, 'dv', 'parsed'])
        mapfile = os.path.sep.join(
            [config.datadir, 'dv', 'generated', 'uri.map'])
        log = logging.getLogger(cls.alias)
        if (not util.outfile_is_newer(util.list_dirs(parsed_dir, ".xhtml"),
                                      mapfile)) or config.force:
            re_xmlbase = re.compile('<head about="([^"]+)"')
            log.info("Creating uri.map file")
            cnt = 0
            # also remove any uri-<client>-<pid>.map files that might be
            # laying around
            for m in util.list_dirs(os.path.dirname(mapfile), ".map"):
                if m == mapfile:
                    continue
                util.robust_remove(m)
            util.robust_remove(mapfile + ".new")
            util.ensure_dir(mapfile)
            # FIXME: Not sure utf-8 is the correct codec for us -- it
            # might be iso-8859-1 (it's to be used by mod_rewrite).
            with codecs.open(mapfile + ".new", "w", encoding="utf-8") as fp:
                paths = set()
                for f in util.list_dirs(parsed_dir, ".xhtml"):
                    if not os.path.getsize(f):
                        # skip empty files
                        continue
                    # get basefile from f in the simplest way
                    basefile = f[len(parsed_dir) + 1:-6]
                    head = codecs.open(f, encoding='utf-8').read(1024)
                    m = re_xmlbase.search(head)
                    if m:
                        path = urlparse(m.group(1)).path
                        if path in paths:
                            log.warning("Path %s is already in map" % path)
                            continue
                        assert path
                        assert basefile
                        if config.mapfiletype == "nginx":
                            fp.write("%s\t/dv/generated/%s.html;\n" %
                                     (path, basefile))
                        else:
                            # remove prefix "/dom/" from path
                            path = path.replace("/%s/" % cls.urispace_segment, "", 1)
                            fp.write("%s\t%s\n" % (path, basefile))
                        cnt += 1
                        paths.add(path)
                    else:
                        log.warning(
                            "%s: Could not find valid head[@about] in %s" %
                            (basefile, f))
            util.robust_rename(mapfile + ".new", mapfile)
            log.info("uri.map created, %s entries" % cnt)
        else:
            log.debug("Not regenerating uri.map")
            pass
        return super(DV, cls).relate_all_setup(config, *args, **kwargs)

    # def relate(self, basefile, otherrepos): pass

    @classmethod
    def get_default_options(cls):
        opts = super(DV, cls).get_default_options()
        opts['ftpuser'] = ''  # None # Doesn't work great since Defaults is a typesource...
        opts['ftppassword'] = ''  # None
        opts['mapfiletype'] = 'apache'  # or nginx
        return opts

    def canonical_uri(self, basefile):
        # The canonical URI for HDO/B3811-03 should be
        # https://lagen.nu/dom/nja/2004s510. We can't know
        # this URI before we parse the document.
Once we have, we can # find the first rdf:type = rpubl:Rattsfallsreferat (or # rpubl:Rattsfallsnotis) and get its url. # # FIXME: It would be simpler and faster to read # DocumentEntry(self.store.entry_path(basefile))['id'], but # parse does not yet update the DocumentEntry once it has the # canonical uri/id for the document. p = self.store.distilled_path(basefile) if not os.path.exists(p): raise ValueError("No distilled file for basefile %s at %s" % (basefile, p)) with self.store.open_distilled(basefile) as fp: g = Graph().parse(data=fp.read()) for uri, rdftype in g.subject_objects(predicate=RDF.type): if rdftype in (RPUBL.Rattsfallsreferat, RPUBL.Rattsfallsnotis): return str(uri) raise ValueError("Can't find canonical URI for basefile %s in %s" % (basefile, p)) # we override make_document to avoid having it calling # canonical_uri prematurely def make_document(self, basefile=None): doc = Document() doc.basefile = basefile doc.meta = self.make_graph() doc.lang = self.lang doc.body = Body() doc.uri = None # can't know this yet return doc urispace_segment = "rf" expected_cases = {"ra": 1993, "nja": 1981, "rh": 1993, "ad": 1993, "mod": 1999, "md": 2004} # override to account for the fact that there is no 1:1 # correspondance between basefiles and uris def basefile_from_uri(self, uri): def build_basefilemap(path, filename): if self.config.mapfiletype == "nginx": # chop of leading "/dom/" path = path[len(self.urispace_segment)+2:] # /dv/generated/HDO/T-254.html; => HDO/T-254 filename = filename[14:-6] self._basefilemap[path] = filename basefile = super(DV, self).basefile_from_uri(uri) # the "basefile" at this point is just the remainder of the # URI (eg "nja/1995s362"). Create a lookup table to find the # real basefile (eg "HDO/Ö463-95_1") if basefile: if not hasattr(self, "_basefilemap"): self._basefilemap = {} self.readmapfile(build_basefilemap) if basefile in self._basefilemap: return self._basefilemap[basefile] else: # this will happen for older cases for which we don't # have any files. We invent URI-derived basefiles for # these to gain a sort of skeleton entry for those, # which we can use to track eg. frequently referenced # older cases. # however, we check if we OUGHT to have a basefile # (because it's recent enough) and warn. 
court, year = basefile.split("/", 1) year=int(year[:4]) if court not in self.expected_cases or self.expected_cases[court] <= year: self.log.warning("%s: Could not find corresponding basefile" % uri) return basefile.replace(":", "/") def readmapfile(self, callback): mapfile = self.store.path("uri", "generated", ".map") util.ensure_dir(mapfile) if self.config.clientname: mapfiles = list(util.list_dirs(os.path.dirname(mapfile), ".map")) else: mapfiles = [mapfile] if self.config.mapfiletype == "nginx": regex = "/%s/(.*)\t/dv/generated/(.*).html;" % self.urispace_segment else: idx = len(self.urispace_base) + len(self.urispace_segment) + 2 regex = "(.*)\t(.*)" append_path = True for mapfile in mapfiles: if os.path.exists(mapfile): with codecs.open(mapfile, encoding="utf-8") as fp: for line in fp: path, filename = line.strip().split("\t", 1) ret = callback(path, filename) if ret is not None: return ret def download(self, basefile=None): if basefile is not None: raise ValueError("DV.download cannot process a basefile parameter") # recurse =~ download everything, which we do if refresh is # specified OR if we've never downloaded before recurse = False # if self.config.lastdownload has not been set, it has only # the type value, so self.config.lastdownload will raise # AttributeError. Should it return None instead? if self.config.refresh or 'lastdownload' not in self.config: recurse = True self.downloadcount = 0 # number of files extracted from zip files # (not number of zip files) try: if self.config.ftpuser: self.download_ftp("", recurse, self.config.ftpuser, self.config.ftppassword) else: self.log.warning( "Config variable ftpuser not set, downloading from secondary source (https://lagen.nu/dv/downloaded/) instead") self.download_www("", recurse) except errors.MaxDownloadsReached: # ok we're done! pass def download_ftp(self, dirname, recurse, user=None, password=None, connection=None): self.log.debug('Listing contents of %s' % dirname) lines = [] if not connection: connection = FTP('ftp.dom.se') connection.login(user, password) connection.cwd(dirname) connection.retrlines('LIST', lines.append) for line in lines: parts = line.split() filename = parts[-1].strip() if line.startswith('d') and recurse: self.download_ftp(filename, recurse, connection=connection) elif line.startswith('-'): basefile = os.path.splitext(filename)[0] if dirname: basefile = dirname + "/" + basefile # localpath = self.store.downloaded_path(basefile) localpath = self.store.path(basefile, 'downloaded/zips', '.zip') if os.path.exists(localpath) and not self.config.refresh: pass # we already got this else: util.ensure_dir(localpath) self.log.debug('Fetching %s to %s' % (filename, localpath)) connection.retrbinary('RETR %s' % filename, # FIXME: retrbinary calls .close()? 
open(localpath, 'wb').write) self.process_zipfile(localpath) connection.cwd('/') def download_www(self, dirname, recurse): url = 'https://lagen.nu/dv/downloaded/%s' % dirname self.log.debug('Listing contents of %s' % url) resp = requests.get(url) iterlinks = lxml.html.document_fromstring(resp.text).iterlinks() for element, attribute, link, pos in iterlinks: if link.startswith("/"): continue elif link.endswith("/") and recurse: self.download_www(link, recurse) elif link.endswith(".zip"): basefile = os.path.splitext(link)[0] if dirname: basefile = dirname + basefile # localpath = self.store.downloaded_path(basefile) localpath = self.store.path(basefile, 'downloaded/zips', '.zip') if os.path.exists(localpath) and not self.config.refresh: pass # we already got this else: absolute_url = urljoin(url, link) self.log.debug('Fetching %s to %s' % (link, localpath)) resp = requests.get(absolute_url) with self.store.open(basefile, "downloaded/zips", ".zip", "wb") as fp: fp.write(resp.content) self.process_zipfile(localpath) # eg. HDO_T3467-96.doc or HDO_T3467-96_1.doc re_malnr = re.compile(r'([^_]*)_([^_\.]*)()_?(\d*)(\.docx?)') # eg. HDO_T3467-96_BYTUT_2010-03-17.doc or # HDO_T3467-96_BYTUT_2010-03-17_1.doc or # HDO_T254-89_1_BYTUT_2009-04-28.doc (which is sort of the # same as the above but the "_1" goes in a different place) re_bytut_malnr = re.compile( r'([^_]*)_([^_\.]*)_?(\d*)_BYTUT_\d+-\d+-\d+_?(\d*)(\.docx?)') re_tabort_malnr = re.compile( r'([^_]*)_([^_\.]*)_?(\d*)_TABORT_\d+-\d+-\d+_?(\d*)(\.docx?)') # temporary helper @action def process_all_zipfiles(self): self.downloadcount = 0 zippath = self.store.path('', 'downloaded/zips', '') # Peocess zips in subdirs first (HDO, ADO # etc), then in numerics-only order. mykey = lambda v: (-len(v.split(os.sep)), "".join(c for c in v if c.isnumeric())) zipfiles = sorted(util.list_dirs(zippath, suffix=".zip"), key=mykey) for zipfilename in zipfiles: self.log.info("%s: Processing..." 
% zipfilename) self.process_zipfile(zipfilename) @action def process_zipfile(self, zipfilename): """Extract a named zipfile into appropriate documents""" removed = replaced = created = untouched = 0 if not hasattr(self, 'downloadcount'): self.downloadcount = 0 try: zipf = zipfile.ZipFile(zipfilename, "r") except zipfile.BadZipfile as e: self.log.error("%s is not a valid zip file: %s" % (zipfilename, e)) return for bname in zipf.namelist(): if not isinstance(bname, str): # py2 # Files in the zip file are encoded using codepage 437 name = bname.decode('cp437') else: name = bname if "_notis_" in name: base, suffix = os.path.splitext(name) segments = base.split("_") coll, year = segments[0], segments[1] # Extract this doc as a temp file -- we won't be # creating an actual permanent file, but let # extract_notis extract individual parts of this file # to individual basefiles fp = tempfile.NamedTemporaryFile("wb", suffix=suffix, delete=False) filebytes = zipf.read(bname) fp.write(filebytes) fp.close() tempname = fp.name r = self.extract_notis(tempname, year, coll) created += r[0] untouched += r[1] os.unlink(tempname) else: name = os.path.split(name)[1] if 'BYTUT' in name: m = self.re_bytut_malnr.match(name) elif 'TABORT' in name: m = self.re_tabort_malnr.match(name) else: m = self.re_malnr.match(name) if m: (court, malnr, opt_referatnr, referatnr, suffix) = ( m.group(1), m.group(2), m.group(3), m.group(4), m.group(5)) assert ((suffix == ".doc") or (suffix == ".docx") ), "Unknown suffix %s in %r" % (suffix, name) if referatnr: basefile = "%s/%s_%s" % (court, malnr, referatnr) elif opt_referatnr: basefile = "%s/%s_%s" % (court, malnr, opt_referatnr) else: basefile = "%s/%s" % (court, malnr) basefile = basefile.strip() # to avoid spurious trailing spaces in the filename before the file suffix outfile = self.store.path(basefile, 'downloaded', suffix) if "TABORT" in name: self.log.info("%s: Removing" % basefile) if not os.path.exists(outfile): self.log.warning("%s: %s doesn't exist" % (basefile, outfile)) else: os.unlink(outfile) removed += 1 elif "BYTUT" in name: self.log.info("%s: download OK (replacing with new)" % basefile) if not os.path.exists(outfile): self.log.warning("%s: %s doesn't exist" % (basefile, outfile)) replaced += 1 else: self.log.info("%s: download OK (unpacking)" % basefile) if os.path.exists(outfile): untouched += 1 continue else: created += 1 if not "TABORT" in name: data = zipf.read(bname) with self.store.open(basefile, "downloaded", suffix, "wb") as fp: fp.write(data) # Make the unzipped files have correct timestamp zi = zipf.getinfo(bname) dt = datetime(*zi.date_time) ts = mktime(dt.timetuple()) os.utime(outfile, (ts, ts)) self.downloadcount += 1 # fix HERE if ('downloadmax' in self.config and self.config.downloadmax and self.downloadcount >= self.config.downloadmax): raise errors.MaxDownloadsReached() else: self.log.warning('Could not interpret filename %r i %s' % (name, os.path.relpath(zipfilename))) self.log.debug('Processed %s, created %s, replaced %s, removed %s, untouched %s files' % (os.path.relpath(zipfilename), created, replaced, removed, untouched)) def extract_notis(self, docfile, year, coll="HDO"): def find_month_in_previous(basefile): # The big word file with all notises might not # start with a month name -- try to find out # current month by examining the previous notis # (belonging to a previous word file). # # FIXME: It's possible that the two word files might be # different types (eg docx and doc). 
In that case the # resulting file will contain both OOXML and DocBook tags. self.log.warning( "No month specified in %s, attempting to look in previous file" % basefile) # HDO/2009_not_26 -> HDO/2009_not_25 tmpfunc = lambda x: str(int(x.group(0)) - 1) prev_basefile = re.sub('\d+$', tmpfunc, basefile) prev_path = self.store.intermediate_path(prev_basefile) avd_p = None if os.path.exists(prev_path): with self.store.open_intermediate(prev_basefile, "rb") as fp: soup = BeautifulSoup(fp.read(), "lxml") tmp = soup.find(["w:p", "para"]) if re_avdstart.match(tmp.get_text().strip()): avd_p = tmp if not avd_p: raise errors.ParseError( "Cannot find value for month in %s (looked in %s)" % (basefile, prev_path)) return avd_p # Given a word document containing a set of "notisfall" from # either HD or HFD (earlier RegR), spit out a constructed # intermediate XML file for each notis and create a empty # placeholder downloaded file. The empty file should never be # opened since parse_open will prefer the constructed # intermediate file. if coll == "HDO": re_notisstart = re.compile( "(?P<day>Den \d+:[ae]. |)(?P<ordinal>\d+)\s*\.\s*\((?P<malnr>\w\s\d+-\d+)\)", flags=re.UNICODE) re_avdstart = re.compile( "(Januari|Februari|Mars|April|Maj|Juni|Juli|Augusti|September|Oktober|November|December)$") else: # REG / HFD re_notisstart = re.compile( "[\w\: ]*Lnr:(?P<court>\w+) ?(?P<year>\d+) ?not ?(?P<ordinal>\d+)", flags=re.UNICODE) re_avdstart = None created = untouched = 0 intermediatefile = os.path.splitext(docfile)[0] + ".xml" with open(intermediatefile, "wb") as fp: filetype = WordReader().read(docfile, fp) soup = BeautifulSoup(util.readfile(intermediatefile), "lxml") os.unlink(intermediatefile) # won't be needed past this point if filetype == "docx": p_tag = "w:p" xmlns = ' xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"' else: p_tag = "para" xmlns = '' iterator = soup.find_all(p_tag, limit=2147483647) basefile = None fp = None avd_p = None day = None intermediate_path = None for p in iterator: t = p.get_text().strip() if re_avdstart: # keep track of current month, store that in avd_p m = re_avdstart.match(t) if m: avd_p = p continue m = re_notisstart.match(t) if m: ordinal = m.group("ordinal") try: if m.group("day"): day = m.group("day") else: # inject current day in the first text node of # p (which should inside of a <emphasis # role="bold" or equivalent). subnode = None # FIXME: is this a deprecated method? for c in p.recursiveChildGenerator(): if isinstance(c, NavigableString): c.string.replace_with(day + str(c.string)) break except IndexError: pass if intermediate_path: previous_intermediate_path = intermediate_path basefile = "%(coll)s/%(year)s_not_%(ordinal)s" % locals() self.log.info("%s: Extracting from %s file" % (basefile, filetype)) created += 1 downloaded_path = self.store.path(basefile, 'downloaded', '.' + filetype) util.ensure_dir(downloaded_path) with open(downloaded_path, "w"): pass # just create an empty placeholder file -- # parse_open will load the intermediate file # anyway. 
if fp: fp.write(b"</body>\n") fp.close() util.ensure_dir(self.store.intermediate_path(basefile)) fp = self.store.open_intermediate(basefile, mode="wb") bodytag = '<body%s>' % xmlns fp.write(bodytag.encode("utf-8")) if filetype != "docx": fp.write(b"\n") if coll == "HDO" and not avd_p: avd_p = find_month_in_previous(basefile) if avd_p: fp.write(str(avd_p).encode("utf-8")) if fp: fp.write(str(p).encode("utf-8")) if filetype != "docx": fp.write(b"\n") if fp: # should always be the case fp.write(b"</body>\n") fp.close() else: self.log.error("%s/%s: No notis were extracted (%s)" % (coll, year, docfile)) return created, untouched re_delimSplit = re.compile("[;,] ?").split labels = {'Rubrik': DCTERMS.description, 'Domstol': DCTERMS.creator, 'Målnummer': RPUBL.malnummer, 'Domsnummer': RPUBL.domsnummer, 'Diarienummer': RPUBL.diarienummer, 'Avdelning': RPUBL.domstolsavdelning, 'Referat': DCTERMS.identifier, 'Avgörandedatum': RPUBL.avgorandedatum, } def remote_url(self, basefile): # There is no publicly available URL where the source document # could be fetched. return None def adjust_basefile(self, doc, orig_uri): pass # See comments in swedishlegalsource.py def parse_open(self, basefile, attachment=None): intermediate_path = self.store.intermediate_path(basefile) if not os.path.exists(intermediate_path): fp = self.downloaded_to_intermediate(basefile) else: fp = self.store.open_intermediate(basefile, "rb") # Determine if the previously-created intermediate files # came from .doc or OOXML (.docx) sources by sniffing the # first bytes. start = fp.read(6) assert isinstance(start, bytes), "fp seems to have been opened in a text-like mode" if start in (b"<w:doc", b"<body "): filetype = "docx" elif start in (b"<book ", b"<book>", b"<body>"): filetype = "doc" else: raise ValueError("Can't guess filetype from %r" % start) fp.seek(0) self.filetype = filetype return self.patch_if_needed(fp, basefile) def downloaded_to_intermediate(self, basefile, attachment=None): assert "_not_" not in basefile, "downloaded_to_intermediate can't handle Notisfall %s" % basefile docfile = self.store.downloaded_path(basefile) intermediatefile = self.store.intermediate_path(basefile) if os.path.getsize(docfile) == 0: raise errors.ParseError("%s: Downloaded file %s is empty, %s should have " "been created by download() but is missing!" % (basefile, docfile, intermediatefile)) wr = WordReader() fp = self.store.open_intermediate(basefile, mode="wb") self.filetype = wr.read(docfile, fp, simplify=True) # FIXME: Do something with filetype if it's not what we expect fp.close() if hasattr(fp, 'utime'): os.utime(self.store.intermediate_path(basefile), (fp.utime, fp.utime)) # re-open in read mode -- since we can't open a compressed # file in read-write mode return self.store.open_intermediate(basefile, mode="rb") def extract_head(self, fp, basefile): filetype = self.filetype patched = fp.read() # rawhead is a simple dict that we'll later transform into a # rdflib Graph. rawbody is a list of plaintext strings, each # representing a paragraph. 
if "not" in basefile: rawhead, rawbody = self.parse_not(patched, basefile, filetype) elif filetype == "docx": rawhead, rawbody = self.parse_ooxml(patched, basefile) else: rawhead, rawbody = self.parse_antiword_docbook(patched, basefile) # stash the body away for later reference self._rawbody = rawbody return rawhead def extract_metadata(self, rawhead, basefile): # we have already done all the extracting in extract_head return rawhead def parse_entry_title(self, doc): # FIXME: The primary use for entry.title is to generate # feeds. Should we construct a feed-friendly title here # (rpubl:referatrubrik is often too wordy, dcterm:identifier + # dcterms:subject might be a better choice -- also notisfall # does not have any rpubl:referatrubrik) title = doc.meta.value(URIRef(doc.uri), RPUBL.referatrubrik) if title: return str(title) def extract_body(self, fp, basefile): return self._rawbody def sanitize_body(self, rawbody): result = [] seen_delmal = {} # ADO 1994 nr 102 to nr 113 have double \n between *EVERY # LINE*, not between every paragraph. Lines are short, less # than 60 chars. This leads to is_heading matching almost # every chunk. The weirdest thing is that the specific line # starting with "Ledamöter: " do NOT exhibit this trait... Try # to detect and undo. if (isinstance(rawbody[0], str) and # Notisfall rawbody is a list of lists... max(len(line) for line in rawbody if not line.startswith("Ledamöter: ")) < 60): self.log.warning("Source has double newlines between every line, attempting " "to reconstruct sections") newbody = [] currentline = "" for idx, line in enumerate(rawbody): if (line.isupper() or # this is a obvious header (idx + 1 < len(rawbody) and rawbody[idx+1].isupper()) or # next line is a obvious header (idx + 1 < len(rawbody) and # line is short and a probable sentence enter + next line starts with a new sentence len(line) < 45 and line[-1] in (".", "?", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9") and rawbody[idx+1][0].isupper()) or (idx + 1 < len(rawbody) and re.match("\d\.\s+[A-ZÅÄÖ]", rawbody[idx+1])) # next line seem to be a ordered paragraph ): newbody.append(currentline + "\n" + line) currentline = "" else: currentline += "\n" + line rawbody = newbody for idx, x in enumerate(rawbody): if isinstance(x, str): # detect and fix smushed numbered sections which MD # has, eg "18Marknadsandelar är..." -> # "18. Marknadsandelar är..." x = re.sub("^(\d{1,3})([A-ZÅÄÖ])", r"\1. \2", x) m = re.match("(KÄRANDE|SVARANDE|SAKEN)([A-ZÅÄÖ].*)", x) if m: # divide smushed-together headings like MD has, # eg. "SAKENMarknadsföring av bilverkstäder..." x = [m.group(1), m.group(2)] else: # match smushed-together delmål markers like in "(jfr # 1990 s 772 och s 796)I" and "Domslut HD fastställer # HovR:ns domslut.II" # # But since we apparently need to handle spaces before # "I", we might get false positives with sentences like # "...och Dalarna. I\ndistributionsrörelsen # sysselsattes...". Try to avoid this by checking for # probable sentence start in next line m = re.match("(.*[\.\) ])(I+)$", x, re.DOTALL) if (m and rawbody[idx+1][0].isupper() and not re.search("mellandomstema I+$", x, flags=re.IGNORECASE)): x = [m.group(1), m.group(2)] else: x = [x] for p in x: m = re.match("(I{1,3}|IV)\.? 
?(|\(\w+\-\d+\))$", p) if m: seen_delmal[m.group(1)] = True result.append(Paragraph([p])) else: result.append(Paragraph(x)) # Many referats that are split into delmål lack the first # initial "I" that signifies the start of the first delmål # (but do have "II", "III" and possibly more) if seen_delmal and "I" not in seen_delmal: self.log.warning("Inserting missing 'I' for first delmal") result.insert(0, Paragraph(["I"])) return result def parse_not(self, text, basefile, filetype): basefile_regex = re.compile("(?P<type>\w+)/(?P<year>\d+)_not_(?P<ordinal>\d+)") referat_templ = {'REG': 'RÅ %(year)s not %(ordinal)s', 'HDO': 'NJA %(year)s not %(ordinal)s', 'HFD': 'HFD %(year)s not %(ordinal)s'} head = {} body = [] m = basefile_regex.match(basefile).groupdict() coll = m['type'] head["Referat"] = referat_templ[coll] % m soup = BeautifulSoup(text, "lxml") if filetype == "docx": ptag = "w:p", "para" # support intermediate files with a # mix of OOXML/DocBook else: ptag = "para", "w:p" iterator = soup.find_all(ptag, limit=2147483647) if coll == "HDO": # keep this in sync w extract_notis re_notisstart = re.compile( "(?:Den (?P<avgdatum>\d+):[ae].\s+|)(?P<ordinal>\d+)\s*\.\s*\((?P<malnr>\w[ \xa0]\d+-\d+)\)", flags=re.UNICODE) re_avgdatum = re_malnr = re_notisstart re_lagrum = re_sokord = None # headers consist of the first two chunks. (month, then # date+ordinal+malnr) header = iterator.pop(0), iterator[0] # need to re-read the second line later curryear = m['year'] currmonth = self.swedish_months[header[0].get_text().strip().lower()] secondline = util.normalize_space(header[-1].get_text()) m = re_notisstart.match(secondline) if m: head["Rubrik"] = secondline[m.end():].strip() if curryear == "2003": # notisfall in this year lack # newline between heading and # actual text, so we use a # heuristic to just match # first sentence m2 = re.search("\. [A-ZÅÄÖ]", head["Rubrik"]) if m2: # now we know where the first sentence ends. Only keep that. head["Rubrik"] = head["Rubrik"][:m2.start()+1] else: # "REG", "HFD" # keep in sync like above re_notisstart = re.compile( "[\w\: ]*Lnr:(?P<court>\w+) ?(?P<year>\d+) ?not ?(?P<ordinal>\d+)") re_malnr = re.compile(r"[AD][:-] ?(?P<malnr>\d+\-\d+)") # the avgdatum regex attempts to include valid dates, eg # not "2770-71-12".It's also somewhat tolerant of # formatting mistakes, eg accepts " :03-06-16" instead of # "A:03-06-16" re_avgdatum = re.compile(r"[AD ]: ?(?P<avgdatum>\d{2,4}\-[01]\d\-\d{2})") re_sokord = re.compile("Uppslagsord: ?(?P<sokord>.*)", flags=re.DOTALL) re_lagrum = re.compile("Lagrum: ?(?P<lagrum>.*)", flags=re.DOTALL) # headers consists of the first five or six # chunks. Doesn't end until "^Not \d+." header = [] done = False while not done and iterator: line = iterator[0].get_text().strip() # can possibly be "Not 1a." (RÅ 1994 not 1) or # "Not. 109." (RÅ 1998 not 109). There might be a # space separating the notis from the next sentence, # but there might also not be! if re.match("Not(is|)\.? \d+[abc]?\.? ?", line): done = True if ". - " in line[:2000]: # Split out "Not 56" and the first # sentence up to ". -", signalling that is the # equiv of referatrubrik rubr = line.split(". - ", 1)[0] rubr = re.sub("Not(is|)\.? \d+[abc]?\.? 
", "", rubr) head['Rubrik'] = rubr else: if line.endswith("Notisen har utgått."): raise errors.DocumentRemovedError(basefile, dummyfile=self.store.parsed_path(basefile)) else: tmp = iterator.pop(0) if tmp.get_text().strip(): # REG specialcase if header and header[-1].get_text().strip() == "Lagrum:": # get the first bs4.element.Tag child children = [x for x in tmp.children if hasattr(x, 'get_text')] if children: header[-1].append(children[0]) else: header.append(tmp) if not done: raise errors.ParseError("Cannot find notis number in %s" % basefile) if coll == "HDO": head['Domstol'] = "Högsta Domstolen" elif coll == "HFD": head['Domstol'] = "Högsta förvaltningsdomstolen" elif coll == "REG": head['Domstol'] = "Regeringsrätten" else: raise errors.ParseError("Unsupported: %s" % coll) for node in header: t = util.normalize_space(node.get_text()) # if not malnr, avgdatum found, look for those for fld, key, rex in (('Målnummer', 'malnr', re_malnr), ('Avgörandedatum', 'avgdatum', re_avgdatum), ('Lagrum', 'lagrum', re_lagrum), ('Sökord', 'sokord', re_sokord)): if not rex: continue m = rex.search(t) if m and m.group(key): if fld in ('Lagrum'): # Sökord is split by sanitize_metadata head[fld] = self.re_delimSplit(m.group(key)) else: head[fld] = m.group(key) if coll == "HDO" and 'Avgörandedatum' in head: head[ 'Avgörandedatum'] = "%s-%02d-%02d" % (curryear, currmonth, int(head['Avgörandedatum'])) # Do a basic conversion of the rest (bodytext) to Element objects # # This is generic enough that it could be part of WordReader for node in iterator: line = [] if filetype == "doc": subiterator = node elif filetype == "docx": subiterator = node.find_all("w:r", limit=2147483647) for part in subiterator: if part.name: t = part.get_text() else: t = str(part) # convert NavigableString to pure string # if not t.strip(): # continue if filetype == "doc" and part.name == "emphasis": # docbook if part.get("role") == "bold": if line and isinstance(line[-1], Strong): line[-1][-1] += t else: line.append(Strong([t])) else: if line and isinstance(line[-1], Em): line[-1][-1] += t else: line.append(Em([t])) # ooxml elif filetype == "docx" and part.find("w:rpr") and part.find("w:rpr").find(["w:b", "w:i"]): rpr = part.find("w:rpr") if rpr.find("w:b"): if line and isinstance(line[-1], Strong): line[-1][-1] += t else: line.append(Strong([t])) elif rpr.find("w:i"): if line and isinstance(line[-1], Em): line[-1][-1] += t else: line.append(Em([t])) else: if line and isinstance(line[-1], str): line[-1] += t else: line.append(t) if line: body.append(line) return head, body def parse_ooxml(self, text, basefile): soup = BeautifulSoup(text, "lxml") head = {} # Högst uppe på varje domslut står domstolsnamnet ("Högsta # domstolen") följt av referatnumret ("NJA 1987 # s. 113"). firstfield = soup.find("w:t") # Ibland är domstolsnamnet uppsplittat på två # w:r-element. Bäst att gå på all text i # föräldra-w:tc-cellen firstfield = firstfield.find_parent("w:tc") head['Domstol'] = firstfield.get_text(strip=True) nextfield = firstfield.find_next("w:tc") head['Referat'] = nextfield.get_text(strip=True) # Hitta övriga enkla metadatafält i sidhuvudet for key in self.labels: if key in head: continue node = soup.find(text=re.compile(key + ':')) if not node: # FIXME: should warn for missing Målnummer iff # Domsnummer is not present, and vice versa. 
But at # this point we don't have all fields if key not in ('Diarienummer', 'Domsnummer', 'Avdelning', 'Målnummer'): self.log.warning("%s: Couldn't find field %r" % (basefile, key)) continue txt = "".join([n.get_text() for n in node.find_next("w:t").find_parent("w:p").find_all("w:t", limit=2147483647)]) if txt.strip(): # skippa fält med tomma strängen-värden (eller bara whitespace) head[key] = txt # Hitta sammansatta metadata i sidhuvudet for key in ["Lagrum", "Rättsfall"]: node = soup.find(text=re.compile(key + ':')) if node: textnodes = node.find_parent('w:tc').find_next_sibling('w:tc') if not textnodes: continue items = [] for textnode in textnodes.find_all('w:t', limit=2147483647): t = textnode.get_text(strip=True) if t: items.append(t) if items: head[key] = items # The main text body of the verdict body = [] for p in soup.find(text=re.compile('EFERAT')).find_parent( 'w:tr').find_next_sibling('w:tr').find_all('w:p', limit=2147483647): ptext = '' for e in p.find_all("w:t", limit=2147483647): ptext += e.string body.append(ptext) # Finally, some more metadata in the footer if soup.find(text=re.compile(r'Sökord:')): head['Sökord'] = soup.find( text=re.compile(r'Sökord:')).find_next('w:t').get_text(strip=True) if soup.find(text=re.compile('^\s*Litteratur:\s*$')): n = soup.find(text=re.compile('^\s*Litteratur:\s*$')) head['Litteratur'] = n.findNext('w:t').get_text(strip=True) return head, body def parse_antiword_docbook(self, text, basefile): soup = BeautifulSoup(text, "lxml") head = {} header_elements = soup.find("para") header_text = '' for el in header_elements.contents: if hasattr(el, 'name') and el.name == "informaltable": break else: header_text += el.string # Högst uppe på varje domslut står domstolsnamnet ("Högsta # domstolen") följt av referatnumret ("NJA 1987 # s. 113"). Beroende på worddokumentet ser dock XML-strukturen # olika ut. 
Det vanliga är att informationen finns i en # pipeseparerad paragraf: parts = [x.strip() for x in header_text.split("|")] if len(parts) > 1: head['Domstol'] = parts[0] head['Referat'] = parts[1] else: # alternativ står de på första raden i en informaltable row = soup.find("informaltable").tgroup.tbody.row.find_all('entry') head['Domstol'] = row[0].get_text(strip=True) head['Referat'] = row[1].get_text(strip=True) # Hitta övriga enkla metadatafält i sidhuvudet for key in self.labels: node = soup.find(text=re.compile(key + ':')) if node: txt = node.find_parent('entry').find_next_sibling( 'entry').get_text(strip=True) if txt: head[key] = txt # Hitta sammansatta metadata i sidhuvudet for key in ["Lagrum", "Rättsfall"]: node = soup.find(text=re.compile(key + ':')) if node: head[key] = [] textchunk = node.find_parent( 'entry').find_next_sibling('entry').string for line in [util.normalize_space(x) for x in textchunk.split("\n\n")]: if line: head[key].append(line) body = [] for p in soup.find(text=re.compile('REFERAT')).find_parent('tgroup').find_next_sibling( 'tgroup').find('entry').get_text(strip=True).split("\n\n"): body.append(p) # Hitta sammansatta metadata i sidfoten head['Sökord'] = soup.find(text=re.compile('Sökord:')).find_parent( 'entry').next_sibling.next_sibling.get_text(strip=True) if soup.find(text=re.compile('^\s*Litteratur:\s*$')): n = soup.find(text=re.compile('^\s*Litteratur:\s*$')).find_parent( 'entry').next_sibling.next_sibling.get_text(strip=True) head['Litteratur'] = n return head, body # correct broken/missing metadata def sanitize_metadata(self, head, basefile): basefile_regex = re.compile('(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)') nja_regex = re.compile( "NJA ?(\d+) ?s\.? ?(\d+) *\( ?(?:NJA|) ?[ :]?(\d+) ?: ?(\d+)") date_regex = re.compile("(\d+)[^\d]+(\d+)[^\d]+(\d+)") referat_regex = re.compile( "(?P<type>[A-ZÅÄÖ]+)[^\d]*(?P<year>\d+)[^\d]+(?P<ordinal>\d+)") referat_templ = {'ADO': 'AD %(year)s nr %(ordinal)s', 'AD': '%(type)s %(year)s nr %(ordinal)s', 'MDO': 'MD %(year)s:%(ordinal)s', 'NJA': '%(type)s %(year)s s. %(ordinal)s', None: '%(type)s %(year)s:%(ordinal)s' } # 0. strip whitespace for k, v in head.items(): if isinstance(v, str): head[k] = v.strip() # 1. Attempt to fix missing Referat if not head.get("Referat"): # For some courts (MDO, ADO) it's possible to reconstruct a missing # Referat from the basefile m = basefile_regex.match(basefile) if m and m.group("type") in ('ADO', 'MDO'): head["Referat"] = referat_templ[m.group("type")] % (m.groupdict()) # 2. Correct known problems with Domstol not always being correctly specified if "Hovrättenför" in head["Domstol"] or "Hovrättenöver" in head["Domstol"]: head["Domstol"] = head["Domstol"].replace("Hovrätten", "Hovrätten ") try: # if this throws a KeyError, it's not canonically specified self.lookup_resource(head["Domstol"], cutoff=1) except KeyError: # lookup URI with fuzzy matching, then turn back to canonical label head["Domstol"] = self.lookup_label(str(self.lookup_resource(head["Domstol"], warn=False))) # 3. Convert head['Målnummer'] to a list. Occasionally more than one # Malnummer is provided (c.f. AD 1994 nr 107, AD # 2005-117, AD 2003-111) in a comma, semicolo or space # separated list. AD 2006-105 even separates with " och ". 
# # HOWEVER, "Ö 2475-12" must not be converted to ["Ö2475-12"], not ['Ö', '2475-12'] if head.get("Målnummer"): if head["Målnummer"][:2] in ('Ö ', 'B ', 'T '): head["Målnummer"] = [head["Målnummer"].replace(" ", "")] else: res = [] for v in re.split("och|,|;|\s", head['Målnummer']): if v.strip(): res.append(v.strip()) head['Målnummer'] = res # 4. Create a general term for Målnummer or Domsnummer to act # as a local identifier if head.get("Målnummer"): head["_localid"] = head["Målnummer"] elif head.get("Domsnummer"): # NB: localid needs to be a list head["_localid"] = [head["Domsnummer"]] else: raise errors.ParseError("Required key (Målnummer/Domsnummer) missing") # 5. For NJA, Canonicalize the identifier through a very # forgiving regex and split of the alternative identifier # as head['_nja_ordinal'] # # "NJA 2008 s 567 (NJA 2008:86)"=>("NJA 2008 s 567", "NJA 2008:86") # "NJA 2011 s. 638(NJA2011:57)" => ("NJA 2011 s 638", "NJA 2001:57") # "NJA 2012 s. 16(2012:2)" => ("NJA 2012 s 16", "NJA 2012:2") if "NJA" in head["Referat"] and " not " not in head["Referat"]: m = nja_regex.match(head["Referat"]) if m: head["Referat"] = "NJA %s s %s" % (m.group(1), m.group(2)) head["_nja_ordinal"] = "NJA %s:%s" % (m.group(3), m.group(4)) else: raise errors.ParseError("Unparseable NJA ref '%s'" % head["Referat"]) # 6 Canonicalize referats: Fix crap like "AD 2010nr 67", # "AD2011 nr 17", "HFD_2012 ref.58", "RH 2012_121", "RH2010 # :180", "MD:2012:5", "MIG2011:14", "-MÖD 2010:32" and many # MANY more if " not " not in head["Referat"]: # notiser always have OK Referat m = referat_regex.search(head["Referat"]) if m: if m.group("type") in referat_templ: head["Referat"] = referat_templ[m.group("type")] % m.groupdict() else: head["Referat"] = referat_templ[None] % m.groupdict() elif basefile.split("/")[0] in ('ADO', 'MDO'): # FIXME: The same logic as under 1, duplicated m = basefile_regex.match(basefile) head["Referat"] = referat_templ[m.group("type")] % (m.groupdict()) else: raise errors.ParseError("Unparseable ref '%s'" % head["Referat"]) # 7. Convert Sökord string to an actual list if head.get("Sökord"): try: res = self.sanitize_sokord(head["Sökord"], basefile) except KeywordContainsDescription as e: res = e.keywords rubrik = " / ".join(e.descriptions) # I don't really know if this turns out to be a good # idea, but let's try it if "Rubrik" in head and "_not_" in basefile: self.log.debug("%s: Changing rubrik %s -> %s (is that better?)" % (basefile, head["Rubrik"], rubrik)) head["Rubrik"] = rubrik head["Sökord"] = res # 8. Convert Avgörandedatum to a sensible value in the face of # irregularities like '2010-11 30', '2011 03-23' '2011- # 01-27', '2009.08.28' or '07-12-28' m = date_regex.match(head["Avgörandedatum"]) if m: if len(m.group(1)) < 4: if int(m.group(1)) >= 80: # '80-01-01' => '1980-01-01', year = '19' + m.group(1) else: # '79-01-01' => '2079-01-01', year = '20' + m.group(1) else: year = m.group(1) head["Avgörandedatum"] = "%s-%s-%s" % (year, m.group(2), m.group(3)) else: raise errors.ParseError("Unparseable date %s" % head["Avgörandedatum"]) # 9. Done! 
return head # Strings that might look like descriptions but actually are legit # keywords (albeit very wordy keywords) sokord_whitelist = ("Rättsprövning enligt lagen (2006:304) om rättsprövning av vissa regeringsbeslut",) def sanitize_sokord(self, sokordstring, basefile): def capitalize(s): # remove any non-word char start (like "- ", which # sometimes occur due to double-dashes) s = re.sub("^\W+", "", s) return util.ucfirst(s) def probable_description(s): # FIXME: try to determine if this is sentence-like # (eg. containing common verbs like "ansågs", containing # more than x words etc) return (s not in self.sokord_whitelist) and len(s) >= 50 res = [] descs = [] if basefile.startswith("XXX/"): delimiter = "," else: delimiter = ";" for s in sokordstring.split(delimiter): s = util.normalize_space(s) if not s: continue # normalize the delimiter between the main keyword and # subkeyword for some common variations like "Allmän # handling, allmän handling eller inte", "Allmän handling? # (brev till biskop i pastoral angelägenhet)" or "Allmän # handling -övriga frågor?" if " - " not in s: s = re.sub("(Allmän handling|Allmän försäkring|Arbetsskadeförsäkring|Besvärsrätt|Byggnadsmål|Plan- och bygglagen|Förhandsbesked|Resning)[:,?]?\s+\(?(.*?)\)?$", r"\1 - \2", s) subres = [] substrings = s.split(" - ") for idx, subs in enumerate(substrings): # often, what should be keywords is more of # descriptions, never occurring more than once. Try to # identify these pseudo-descriptions (which sometimes # are pretty good as descriptions go) if not probable_description(subs): subres.append(capitalize(subs)) else: if idx + 1 != len(substrings): self.log.warning("%s: Found probable description %r in sökord, but not at last position" % (basefile, subs)) descs.append(capitalize(subs)) res.append(tuple(subres)) if descs: # the remainder is not a legit keyword term. However, it # might be a useful title. Communicate it back to the # caller (but if we have several, omit shorter substrings # of longer descs) descs.sort(key=len) descs = [desc for idx, desc in enumerate(descs) if (idx + 1) == len(descs) or desc not in descs[idx+1]] raise KeywordContainsDescription(res, descs) return res @cached_property def rattsfall_parser(self): return SwedishCitationParser(LegalRef(LegalRef.RATTSFALL, LegalRef.EURATTSFALL), self.minter, self.commondata) @cached_property def lagrum_parser(self): return SwedishCitationParser(LegalRef(LegalRef.LAGRUM, LegalRef.EULAGSTIFTNING), self.minter, self.commondata) @cached_property def litteratur_parser(self): return SwedishCitationParser(LegalRef(LegalRef.FORARBETEN), self.minter, self.commondata) # create nice RDF from the sanitized metadata def polish_metadata(self, head): def ref_to_uri(ref): nodes = self.rattsfall_parser.parse_string(ref) assert isinstance(nodes[0], Link), "Can't make URI from '%s'" % ref return nodes[0].uri def split_nja(value): return [x[:-1] for x in value.split("(")] # 1. 
mint uris and create the two Describers we'll use graph = self.make_graph() refuri = ref_to_uri(head["Referat"]) refdesc = Describer(graph, refuri) for malnummer in head['_localid']: bnodetmp = BNode() gtmp = Graph() gtmp.bind("rpubl", RPUBL) gtmp.bind("dcterms", DCTERMS) dtmp = Describer(gtmp, bnodetmp) dtmp.rdftype(RPUBL.VagledandeDomstolsavgorande) dtmp.value(RPUBL.malnummer, malnummer) dtmp.value(RPUBL.avgorandedatum, head['Avgörandedatum']) dtmp.rel(DCTERMS.publisher, self.lookup_resource(head["Domstol"])) resource = dtmp.graph.resource(bnodetmp) domuri = self.minter.space.coin_uri(resource) domdesc = Describer(graph, domuri) # 2. convert all strings in head to proper RDF (well, some is # converted to mixed-content lists and stored in self._bodymeta self._bodymeta = defaultdict(list) for label, value in head.items(): if label == "Rubrik": value = util.normalize_space(value) refdesc.value(RPUBL.referatrubrik, value, lang="sv") domdesc.value(DCTERMS.title, value, lang="sv") elif label == "Domstol": with domdesc.rel(DCTERMS.publisher, self.lookup_resource(value)): # NB: Here, we take the court name as provided in # the source document as the value for the # foaf:name triple. We could also look it up in # self.commondata, but since we already have it... domdesc.value(FOAF.name, value) elif label == "Målnummer": for v in value: # FIXME: In these cases (multiple målnummer, which # primarily occurs with AD), we should really # create separate domdesc objects (there are two # verdicts, just summarized in one document) domdesc.value(RPUBL.malnummer, v) elif label == "Domsnummer": domdesc.value(RPUBL.domsnummer, value) elif label == "Diarienummer": domdesc.value(RPUBL.diarienummer, value) elif label == "Avdelning": domdesc.value(RPUBL.avdelning, value) elif label == "Referat": for pred, regex in {'rattsfallspublikation': r'([^ ]+)', 'arsutgava': r'(\d{4})', 'lopnummer': r'\d{4}(?:\:| nr | not )(\d+)', 'sidnummer': r's.? ?(\d+)'}.items(): m = re.search(regex, value) if m: if pred == 'rattsfallspublikation': uri = self.lookup_resource(m.group(1), predicate=SKOS.altLabel, cutoff=1) refdesc.rel(RPUBL[pred], uri) else: refdesc.value(RPUBL[pred], m.group(1)) refdesc.value(DCTERMS.identifier, value) elif label == "_nja_ordinal": refdesc.value(DCTERMS.bibliographicCitation, value) m = re.search(r'\d{4}(?:\:| nr | not )(\d+)', value) if m: refdesc.value(RPUBL.lopnummer, m.group(1)) elif label == "Avgörandedatum": domdesc.value(RPUBL.avgorandedatum, self.parse_iso_date(value)) # The following metadata (Lagrum, Rättsfall and # Litteratur) is handled slightly differently -- it's not # added to the RDF graph (doc.meta) that will later be # addede to the XHTML <head> section. Instead, it's saved # in a struct which will later be added to doc.body. 
This # is so that we can represent mixed content metadata # (links, linktexts and unlinked text) elif label == "Lagrum": for i in value: # better be list not string # if (re.search("\d+/\d+/(EU|EG|EEG)", i) or # re.search("\((EU|EG|EEG)\) nr \d+/\d+", i) or # " direktiv" in i or " förordning" in i): # self.log.warning("%s(%s): Lagrum ref to EULaw: '%s'" % # (head.get("Referat"), head.get("Målnummer"), i)) self._bodymeta[label].append(self.lagrum_parser.parse_string(i, predicate="rpubl:lagrum")) elif label == "Rättsfall": for i in value: self._bodymeta[label].append(self.rattsfall_parser.parse_string(i, predicate="rpubl:rattsfallshanvisning")) elif label == "Litteratur": if value: for i in value.split(";"): self._bodymeta[label].append(self.litteratur_parser.parse_string(i, predicate="dcterms:relation")) elif label == "Sökord": for s in value: self.add_keyword_to_metadata(domdesc, s) # 3. mint some owl:sameAs URIs -- but only if not using canonical URIs # (moved to lagen.nu.DV) # 4. Add some same-for-everyone properties refdesc.rel(DCTERMS.publisher, self.lookup_resource('Domstolsverket')) if 'not' in head['Referat']: refdesc.rdftype(RPUBL.Rattsfallsnotis) else: refdesc.rdftype(RPUBL.Rattsfallsreferat) domdesc.rdftype(RPUBL.VagledandeDomstolsavgorande) refdesc.rel(RPUBL.referatAvDomstolsavgorande, domuri) refdesc.value(PROV.wasGeneratedBy, self.qualified_class_name()) self._canonical_uri = refuri return refdesc.graph.resource(refuri) def add_keyword_to_metadata(self, domdesc, keyword): # Canonical uris don't define a URI space for # keywords/concepts. Instead refer to bnodes # with rdfs:label set (cf. rdl/documentation/exempel/ # documents/publ/Domslut/HD/2009/T_170-08.rdf). # # Subclasses that has an idea of how to create a URI for a # keyword/concept might override this. assert isinstance(keyword, tuple), "Keyword %s should have been a tuple of sub-keywords (possible 1-tuple)" with domdesc.rel(DCTERMS.subject): # if subkeywords, create a label like "Allmän handling»Begärd handling saknades" domdesc.value(RDFS.label, "»".join(keyword), lang=self.lang) def postprocess_doc(self, doc): if self.config.mapfiletype == "nginx": path = urlparse(doc.uri).path else: idx = len(self.urispace_base) + len(self.urispace_segment) + 2 path = doc.uri[idx:] def map_append_needed(mapped_path, filename): if mapped_path == path: # This means that a previously parsed, basefile # already maps to the same URI (eg because a referat # of multiple dom documents occur as several different # (identical) basefiles. If it's a different basefile # (and not just the same parsed twice), raise # DuplicateReferatDoc try: # convert generated path to a basefile if possible basefile = filename.split("/",3)[-1].split(".")[0] except: basefile = filename if doc.basefile != basefile: raise DuplicateReferatDoc(basefile, dummyfile=self.store.parsed_path(doc.basefile)) return False # the result of readmapfile can be either False (the case in # question already appeared in the uri.map file(s), or None # (the case was not found anywhere -- we should # append it to the appropriate uri.map file) append_needed = self.readmapfile(map_append_needed) if append_needed is not False: if self.config.clientname: # in a distributed setting, use a # uri-<clientname>-<pid>.map, eg # "uri-sophie-4435.map", to avoid corruption of a # single file by multiple writer, or slowness due to # lock contention. 
mapfile = self.store.path("uri", "generated", ".%s.%s.map" % (self.config.clientname, os.getpid())) else: mapfile = self.store.path("uri", "generated", ".map") with codecs.open(mapfile, "a", encoding="utf-8") as fp: if self.config.mapfiletype == "nginx": fp.write("%s\t/dv/generated/%s.html;\n" % (path, doc.basefile)) else: fp.write("%s\t%s\n" % (path, doc.basefile)) if hasattr(self, "_basefilemap"): delattr(self, "_basefilemap") # NB: This cannot be made to work 100% as there is not a 1:1 # mapping between basefiles and URIs since multiple basefiles # might map to the same URI (those basefiles should be # equivalent though). Examples: # HDO/B883-81_1 -> https://lagen.nu/rf/nja/1982s350 -> HDO/B882-81_1 # HFD/1112-14 -> https://lagen.nu/rf/hfd/2014:35 -> HFD/1113-14 # # However, we detect that above and throw a # DuplicateReferatDoc error for the second (or third, or # fourth...) basefile encountered. computed_basefile = self.basefile_from_uri(doc.uri) assert doc.basefile == computed_basefile, "%s -> %s -> %s" % (doc.basefile, doc.uri, computed_basefile) # remove empty Instans objects (these can happen when both a # separate heading, as well as a clue in a paragraph, # indicates a new court). roots = [] for node in doc.body: if isinstance(node, Delmal): roots.append(node) if roots == []: roots.append(doc.body) for root in roots: for node in list(root): if isinstance(node, Instans) and len(node) == 0: # print("Removing Instans %r" % node.court) root.remove(node) # add information from _bodymeta to doc.body bodymeta = Div(**{'class': 'bodymeta', 'about': str(doc.meta.value(URIRef(doc.uri), RPUBL.referatAvDomstolsavgorande))}) for k, v in sorted(self._bodymeta.items()): d = Div(**{'class': k}) for i in v: d.append(P(i)) bodymeta.append(d) doc.body.insert(0, bodymeta) def infer_identifier(self, basefile): p = self.store.distilled_path(basefile) if not os.path.exists(p): raise ValueError("No distilled file for basefile %s at %s" % (basefile, p)) with self.store.open_distilled(basefile) as fp: g = Graph().parse(data=fp.read()) uri = self.canonical_uri(basefile) return str(g.value(URIRef(uri), DCTERMS.identifier)) def parse_body_parseconfigs(self): return ("default", "simple") # @staticmethod def get_parser(self, basefile, sanitized, parseconfig="default"): re_courtname = re.compile( "^(Högsta domstolen|Hovrätten (över|för)[A-ZÅÄÖa-zåäö ]+|([A-ZÅÄÖ][a-zåäö]+ )(tingsrätt|hovrätt))(|, mark- och miljödomstolen|, Mark- och miljööverdomstolen)$") # productions = {'karande': '..', # 'court': '..', # 'date': '..'} # at parse time, initialize matchers rx = ( {'name': 'fr-överkl', 're': '(?P<karanden>[\w\.\(\)\- ]+) överklagade (beslutet|domen) ' 'till (?P<court>(Förvaltningsrätten|Länsrätten|Kammarrätten) i \w+(| län)' '(|, migrationsdomstolen|, Migrationsöverdomstolen)|' 'Högsta förvaltningsdomstolen)( \((?P<date>\d+-\d+-\d+), ' '(?P<constitution>[\w\.\- ,]+)\)|$)', 'method': 'match', 'type': ('instans',), 'court': ('REG', 'HFD', 'MIG')}, {'name': 'fr-dom', 're': '(?P<court>(Förvaltningsrätten|' 'Länsrätten|Kammarrätten) i \w+(| län)' '(|, migrationsdomstolen|, Migrationsöverdomstolen)|' 'Högsta förvaltningsdomstolen) \((?P<date>\d+-\d+-\d+), ' '(?P<constitution>[\w\.\- ,]+)\)', 'method': 'match', 'type': ('dom',), 'court': ('REG', 'HFD', 'MIG')}, {'name': 'tr-dom', 're': '(?P<court>TR:n|Tingsrätten|HovR:n|Hovrätten|Mark- och miljödomstolen) \((?P<constitution>[\w\.\- ,]+)\) (anförde|fastställde|stadfäste|meddelade) (följande i |i beslut i |i |)(dom|beslut) (d\.|d|den) (?P<date>\d+ \w+\.? 
\d+)', 'method': 'match', 'type': ('dom',), 'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')}, {'name': 'hd-dom', 're': 'Målet avgjordes efter huvudförhandling (av|i) (?P<court>HD) \((?P<constitution>[\w:\.\- ,]+)\),? som', 'method': 'match', 'type': ('dom',), 'court': ('HDO',)}, {'name': 'hd-dom2', 're': '(?P<court>HD) \((?P<constitution>[\w:\.\- ,]+)\) meddelade den (?P<date>\d+ \w+ \d+) följande', 'method': 'match', 'type': ('dom',), 'court': ('HDO',)}, {'name': 'hd-fastst', 're': '(?P<court>HD) \((?P<constitution>[\w:\.\- ,]+)\) (beslöt|fattade (slutligt|följande slutliga) beslut)', 'method': 'match', 'type': ('dom',), 'court': ('HDO',)}, {'name': 'mig-dom', 're': '(?P<court>Kammarrätten i Stockholm, Migrationsöverdomstolen) \((?P<date>\d+-\d+-\d+), (?P<constitution>[\w\.\- ,]+)\)', 'method': 'match', 'type': ('dom',), 'court': ('MIG',)}, {'name': 'miv-forstainstans', 're': '(?P<court>Migrationsverket) avslog (ansökan|ansökningarna) den (?P<date>\d+ \w+ \d+) och beslutade att', 'method': 'match', 'type': ('dom',), 'court': ('MIG',)}, {'name': 'miv-forstainstans-2', 're': '(?P<court>Migrationsverket) avslog den (?P<date>\d+ \w+ \d+) A:s ansökan och beslutade att', 'method': 'match', 'type': ('dom',), 'court': ('MIG',)}, {'name': 'mig-dom-alt', 're': 'I sin dom avslog (?P<court>Förvaltningsrätten i Stockholm, migrationsdomstolen) \((?P<date>\d+- ?\d+-\d+), (?P<constitution>[\w\.\- ,]+)\)', 'method': 'match', 'type': ('dom',), 'court': ('MIG',)}, {'name': 'allm-åkl', 're': 'Allmän åklagare yrkade (.*)vid (?P<court>(([A-ZÅÄÖ]' '[a-zåäö]+ )+)(TR|tingsrätt))', 'method': 'match', 'type': ('instans',), 'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')}, {'name': 'stämning', 're': 'stämning å (?P<svarande>.*) vid (?P<court>(([A-ZÅÄÖ]' '[a-zåäö]+ )+)(TR|tingsrätt))', 'method': 'search', 'type': ('instans',), 'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')}, {'name': 'ansökan', 're': 'ansökte vid (?P<court>(([A-ZÅÄÖ][a-zåäö]+ )+)' '(TR|tingsrätt)) om ', 'method': 'search', 'type': ('instans',), 'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')}, {'name': 'riksåkl', 're': 'Riksåklagaren väckte i (?P<court>HD|HovR:n (över|för) ' '([A-ZÅÄÖ][a-zåäö]+ )+|[A-ZÅÄÖ][a-zåäö]+ HovR) åtal', 'method': 'match', 'type': ('instans',), 'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')}, {'name': 'tr-överkl', 're': '(?P<karande>[\w\.\(\)\- ]+) (fullföljde talan|' 'överklagade) (|TR:ns dom.*)i (?P<court>HD|(HovR:n|hovrätten) ' '(över|för) (Skåne och Blekinge|Västra Sverige|Nedre ' 'Norrland|Övre Norrland)|(Svea|Göta) (HovR|hovrätt))', 'method': 'match', 'type': ('instans',), 'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')}, {'name': 'fullfölj-överkl', 're': '(?P<karanden>[\w\.\(\)\- ]+) fullföljde sin talan$', 'method': 'match', 'type': ('instans',)}, {'name': 'myndighetsansökan', 're': 'I (ansökan|en ansökan|besvär) hos (?P<court>\w+) ' '(om förhandsbesked|yrkade)', 'method': 'match', 'type': ('instans',), 'court': ('REG', 'HFD')}, {'name': 'myndighetsbeslut', 're': '(?P<court>\w+) beslutade (därefter |)(den (?P<date>\d+ \w+ \d+)|' '[\w ]+) att', 'method': 'match', 'type': ('instans',), 'court': ('REG', 'HFD', 'MIG')}, {'name': 'myndighetsbeslut2', 're': '(?P<court>[\w ]+) (bedömde|vägrade) i (bistånds|)beslut' ' (|den (?P<date>\d+ \w+ \d+))', 'method': 'match', 'type': ('instans',), 'court': ('REG', 'HFD')}, {'name': 'hd-revision', 're': '(?P<karanden>[\w\.\(\)\- ]+) sökte revision och yrkade(,' 'i första hand,|, såsom hans talan fick förstås,|,|) att 
(?P<court>HD|)', 'method': 'match', 'type': ('instans',), 'court': ('HDO',)}, {'name': 'hd-revision2', 're': '(?P<karanden>[\w\.\(\)\- ]+) sökte revision$', 'method': 'match', 'type': ('instans',), 'court': 'HDO'}, {'name': 'hd-revision3', 're': '(?P<karanden>[\w\.\(\)\- ]+) sökte revision och framställde samma yrkanden', 'method': 'match', 'type': ('instans',), 'court': 'HDO'}, {'name': 'överklag-bifall', 're': '(?P<karanden>[\w\.\(\)\- ]+) (anförde besvär|' 'överklagade) och yrkade bifall till (sin talan i ' '(?P<prevcourt>HovR:n|TR:n)|)', 'method': 'match', 'type': ('instans',), 'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')}, {'name': 'överklag-2', 're': '(?P<karanden>[\w\.\(\)\- ]+) överklagade ' '(för egen del |)och yrkade (i själva saken |)att ' '(?P<court>HD|HovR:n|kammarrätten|Regeringsrätten|)', 'method': 'match', 'type': ('instans',)}, {'name': 'överklag-3', 're': '(?P<karanden>[\w\.\(\)\- ]+) överklagade (?P<prevcourt>' '\w+)s (beslut|omprövningsbeslut|dom)( i ersättningsfrågan|) (hos|till) ' '(?P<court>[\w\, ]+?)( och yrkade| och anförde|, som| \(Sverige\)|$)', 'method': 'match', 'type': ('instans',)}, {'name': 'överklag-4', 're': '(?!Även )(?P<karanden>(?!HD fastställer)[\w\.\(\)\- ]+) överklagade ((?P<prevcourt>\w+)s (beslut|dom)|beslutet|domen)( och|$)', 'method': 'match', 'type': ('instans',)}, {'name': 'hd-ansokan', 're': '(?P<karanden>[\w\.\(\)\- ]+) anhöll i ansökan som inkom ' 'till (?P<court>HD) d \d+ \w+ \d+', 'method': 'match', 'type': ('instans',), 'court': ('HDO',)}, {'name': 'hd-skrivelse', 're': '(?P<karanden>[\w\.\(\)\- ]+) anförde i en till ' '(?P<court>HD) den \d+ \w+ \d+ ställd', 'method': 'match', 'type': ('instans',), 'court': ('HDO',)}, {'name': 'överklag-5', 're': '(?!Även )(?P<karanden>[\w\.\(\)\- ]+?) överklagade ' '(?P<prevcourt>\w+)s (dom|domar)', 'method': 'match', 'type': ('instans',)}, {'name': 'överklag-6', 're': '(?P<karanden>[\w\.\(\)\- ]+) överklagade domen till ' '(?P<court>\w+)($| och yrkade)', 'method': 'match', 'type': ('instans',)}, {'name': 'myndighetsbeslut3', 're': 'I sitt beslut den (?P<date>\d+ \w+ \d+) avslog ' '(?P<court>\w+)', 'method': 'match', 'type': ('instans',), 'court': ('REG', 'HFD', 'MIG')}, {'name': 'domskal', 're': "(Skäl|Domskäl|HovR:ns domskäl|Hovrättens domskäl)(\. |$)", 'method': 'match', 'type': ('domskal',)}, {'name': 'domskal-ref', 're': "(Tingsrätten|TR[:\.]n|Hovrätten|HD|Högsta förvaltningsdomstolen) \([^)]*\) (meddelade|anförde|fastställde|yttrade)", 'method': 'match', 'type': ('domskal',)}, {'name': 'domskal-dom-fr', # a simplified copy of fr-överkl 're': '(?P<court>(Förvaltningsrätten|' 'Länsrätten|Kammarrätten) i \w+(| län)' '(|, migrationsdomstolen|, Migrationsöverdomstolen)|' 'Högsta förvaltningsdomstolen) \((?P<date>\d+-\d+-\d+), ' '(?P<constitution>[\w\.\- ,]+)\),? 
yttrade', 'method': 'match', 'type': ('domskal',)}, {'name': 'domslut-standalone', 're': '(Domslut|(?P<court>Hovrätten|HD|hd|Högsta förvaltningsdomstolen):?s avgörande)$', 'method': 'match', 'type': ('domslut',)}, {'name': 'domslut-start', 're': '(?P<court>[\w ]+(domstolen|rätten))s avgörande$', 'method': 'match', 'type': ('domslut',)} ) court = basefile.split("/")[0] matchers = defaultdict(list) matchersname = defaultdict(list) for pat in rx: if 'court' not in pat or court in pat['court']: for t in pat['type']: # print("Adding pattern %s to %s" % (pat['name'], t)) matchers[t].append( getattr( re.compile( pat['re'], re.UNICODE), pat['method'])) matchersname[t].append(pat['name']) def is_delmal(parser, chunk=None): # should handle "IV", "I (UM1001-08)" and "I." etc # but not "I DEFINITION" or "Inledning"... if chunk: strchunk = str(chunk) else: strchunk = str(parser.reader.peek()).strip() if len(strchunk) < 20: m = re.match("(I{1,3}|IV)\.? ?(|\(\w+\-\d+\))$", strchunk) if m: res = {'id': m.group(1)} if m.group(2): res['malnr'] = m.group(2)[1:-1] return res return {} def is_instans(parser, chunk=None): """Determines whether the current position starts a new instans part of the report. """ chunk = parser.reader.peek() strchunk = str(chunk) res = analyze_instans(strchunk) # sometimes, HD domskäl is written in a way that mirrors # the referat of the lower instance (eg. "1. Optimum # ansökte vid Lunds tingsrätt om stämning mot..."). If the # instans progression goes from higher->lower court, # something is amiss. if (hasattr(parser, 'current_instans') and parser.current_instans.court == "Högsta domstolen" and isinstance(res.get('court'), str) and "tingsrätt" in res['court']): return False if res: # in some referats, two subsequent chunks both matches # analyze_instans, even though they refer to the _same_ # instans. 
Check to see if that is the case if (hasattr(parser, 'current_instans') and hasattr(parser.current_instans, 'court') and parser.current_instans.court and is_equivalent_court(res['court'], parser.current_instans.court)): return {} else: return res elif parser._state_stack == ['body']: # if we're at root level, *anything* starts a new instans return True else: return {} def is_equivalent_court(newcourt, oldcourt): # should handle a bunch of cases # >>> is_equivalent_court("Göta Hovrätt", "HovR:n") # True # >>> is_equivalent_court("HD", "Högsta domstolen") # True # >>> is_equivalent_court("Linköpings tingsrätt", "HovR:n") # False # >>> is_equivalent_court(True, "Högsta domstolen") # True # if newcourt is True: # return newcourt newcourt = canonicalize_court(newcourt) oldcourt = canonicalize_court(oldcourt) if newcourt is True and str(oldcourt) in ('Högsta domstolen'): # typically an effect of both parties appealing to the # supreme court return True if newcourt == oldcourt: return True else: return False def canonicalize_court(courtname): if isinstance(courtname, bool): return courtname # we have no idea which court this # is, only that it is A court else: return courtname.replace( "HD", "Högsta domstolen").replace("HovR", "Hovrätt") def is_heading(parser): chunk = parser.reader.peek() strchunk = str(chunk) if not strchunk.strip(): return False # a heading is reasonably short and does not end with a # period (or other sentence ending typography) return len(strchunk) < 140 and not (strchunk.endswith(".") or strchunk.endswith(":") or strchunk.startswith("”")) def is_betankande(parser): strchunk = str(parser.reader.peek()) return strchunk in ("Målet avgjordes efter föredragning.", "HD avgjorde målet efter föredragning.") def is_dom(parser): strchunk = str(parser.reader.peek()) res = analyze_dom(strchunk) return res def is_domskal(parser): strchunk = str(parser.reader.peek()) res = analyze_domskal(strchunk) return res def is_domslut(parser): strchunk = str(parser.reader.peek()) return analyze_domslut(strchunk) def is_skiljaktig(parser): strchunk = str(parser.reader.peek()) return re.match( "(Justitie|Kammarrätts)råde[nt] ([^\.]*) var (skiljaktig|av skiljaktig mening)", strchunk) def is_tillagg(parser): strchunk = str(parser.reader.peek()) return re.match( "Justitieråde[nt] ([^\.]*) (tillade för egen del|gjorde för egen del ett tillägg)", strchunk) def is_endmeta(parser): strchunk = str(parser.reader.peek()) return re.match("HD:s (beslut|dom|domar) meddela(de|d|t): den", strchunk) def is_paragraph(parser): return True # Turns out, this is really difficult if you consider # abbreviations. This particular heuristic splits on periods # only (Sentences ending with ? or ! are rare in legal text) # and only if followed by a capital letter (ie next sentence) # or EOF. Does not handle things like "Mr. Smith" but that's # also rare in swedish text. However, needs to handle "Linder, # Erliksson, referent, och C. Bohlin", so another heuristic is # that the sentence before can't end in a single capital # letter. def split_sentences(text): text = util.normalize_space(text) text += " " return [x.strip() for x in re.split("(?<![A-ZÅÄÖ])\. (?=[A-ZÅÄÖ]|$)", text)] def analyze_instans(strchunk): res = {} # Case 1: Fixed headings indicating new instance if re_courtname.match(strchunk): res['court'] = strchunk res['complete'] = True return res else: # case 2: common wording patterns indicating new # instance # "H.T. sökte revision och yrkade att HD måtte fastställa" => # <Instans name="HD"><str>H.T. 
            # sökte revision och yrkade att <PredicateSubject rel="HD"
            # uri="http://lagen.nu/org/2008/hogsta-domstolen/">HD</PredicateSubject>
            # <div class="instans" rel="dc:creator" href="..."
            # the needed sentence is usually the 1st or 2nd
            # (occasionally the 3rd); searching further yields a risk of
            # false positives.
            sentences = split_sentences(strchunk)[:3]
            # In rare cases (HDO/T50-91_1) a chunk might be a
            # Domskäl, but the second sentence looks like the
            # start of an Instans. Since recognizers run in a
            # particular order (is_instans before is_domskal), we
            # must detect this false positive here.
            domskal_match = matchers['domskal'][matchersname['domskal'].index('domskal')]
            if domskal_match(sentences[0]):
                return res
            for sentence in sentences:
                for (r, rname) in zip(matchers['instans'], matchersname['instans']):
                    m = r(sentence)
                    if m:
                        # print("analyze_instans: Matcher '%s' succeeded on '%s'" % (rname, sentence))
                        mg = m.groupdict()
                        if 'court' in mg and mg['court']:
                            res['court'] = mg['court'].strip()
                        else:
                            res['court'] = True
                        # if 'prevcourt' in mg and mg['prevcourt']:
                        #     res['prevcourt'] = mg['prevcourt'].strip()
                        if 'date' in mg and mg['date']:
                            parse_swed = DV().parse_swedish_date
                            parse_iso = DV().parse_iso_date
                            try:
                                res['date'] = parse_swed(mg['date'])
                            except ValueError:
                                res['date'] = parse_iso(mg['date'])
                        return res
            return res

        def analyze_dom(strchunk):
            res = {}
            # special case for referat that are nothing but straight
            # verdict documents.
            if strchunk.strip() == "SAKEN":
                return {'court': True}
            # probably only the 1st sentence is interesting
            for sentence in split_sentences(strchunk)[:1]:
                for (r, rname) in zip(matchers['dom'], matchersname['dom']):
                    m = r(sentence)
                    if m:
                        # print("analyze_dom: Matcher '%s' succeeded on '%s': %r" % (rname, sentence, m.groupdict()))
                        mg = m.groupdict()
                        if 'court' in mg and mg['court']:
                            res['court'] = mg['court'].strip()
                        if 'date' in mg and mg['date']:
                            parse_swed = DV().parse_swedish_date
                            parse_iso = DV().parse_iso_date
                            try:
                                res['date'] = parse_swed(mg['date'])
                            except ValueError:
                                try:
                                    res['date'] = parse_iso(mg['date'])
                                except ValueError:
                                    pass  # or res['date'] = mg['date']??
# if 'constitution' in mg: # res['constitution'] = parse_constitution(mg['constitution']) return res return res def analyze_domskal(strchunk): res = {} # only 1st sentence for sentence in split_sentences(strchunk)[:1]: for (r, rname) in zip(matchers['domskal'], matchersname['domskal']): m = r(sentence) if m: # print("analyze_domskal: Matcher '%s' succeeded on '%s'" % (rname, sentence)) res['domskal'] = True return res return res def analyze_domslut(strchunk): res = {} # only 1st sentence for sentence in split_sentences(strchunk)[:1]: for (r, rname) in zip(matchers['domslut'], matchersname['domslut']): m = r(sentence) if m: # print("analyze_domslut: Matcher '%s' succeeded on '%s'" % (rname, sentence)) mg = m.groupdict() if 'court' in mg and mg['court']: res['court'] = mg['court'].strip() else: res['court'] = True return res return res def parse_constitution(strchunk): res = [] for thing in strchunk.split(", "): if thing in ("ordförande", "referent"): res[-1]['position'] = thing elif thing.startswith("ordförande ") or thing.startswith("ordf "): pos, name = thing.split(" ", 1) if name.startswith("t f lagmannen"): title, name = name[:13], name[14:] elif name.startswith("hovrättsrådet"): title, name = name[:13], name[14:] else: title = None r = {'name': name, 'position': pos, 'title': title} if 'title' not in r: del r['title'] res.append(r) else: name = thing res.append({'name': name}) # also filter nulls return res # FIXME: This and make_paragraph ought to be expressed as # generic functions in the ferenda.fsmparser module @newstate('body') def make_body(parser): return parser.make_children(Body()) @newstate('delmal') def make_delmal(parser): attrs = is_delmal(parser, parser.reader.next()) if hasattr(parser, 'current_instans'): delattr(parser, 'current_instans') d = Delmal(ordinal=attrs['id'], malnr=attrs.get('malnr')) return parser.make_children(d) @newstate('instans') def make_instans(parser): chunk = parser.reader.next() strchunk = str(chunk) idata = analyze_instans(strchunk) # idata may be {} if the special toplevel rule in is_instans applied if 'complete' in idata: i = Instans(court=strchunk) court = strchunk elif 'court' in idata and idata['court'] is not True: i = Instans([chunk], court=idata['court']) court = idata['court'] else: i = Instans([chunk], court=parser.defaultcourt) court = parser.defaultcourt if court is None: court = "" # we might need to calculate the courts len() below # FIXME: ugly hack, but is_instans needs access to this # object... parser.current_instans = i res = parser.make_children(i) # might need to adjust the court parameter based on better # information in the parse tree for child in res: if isinstance(child, Dom) and hasattr(child, 'court'): # longer courtnames are better if len(str(child.court)) > len(court): i.court = child.court return res def make_heading(parser): # a heading is by definition a single line return Heading(parser.reader.next()) @newstate('betankande') def make_betankande(parser): b = Betankande() b.append(parser.reader.next()) return parser.make_children(b) @newstate('dom') def make_dom(parser): # fix date, constitution etc. 
            # Note peek() instead of next() -- this is so is_domskal can
            # have a chance at the same data
            ddata = analyze_dom(str(parser.reader.peek()))
            d = Dom(avgorandedatum=ddata.get('date'),
                    court=ddata.get('court'),
                    malnr=ddata.get('caseid'))
            return parser.make_children(d)

        @newstate('domskal')
        def make_domskal(parser):
            d = Domskal()
            return parser.make_children(d)

        @newstate('domslut')
        def make_domslut(parser):
            d = Domslut()
            return parser.make_children(d)

        @newstate('skiljaktig')
        def make_skiljaktig(parser):
            s = Skiljaktig()
            s.append(parser.reader.next())
            return parser.make_children(s)

        @newstate('tillagg')
        def make_tillagg(parser):
            t = Tillagg()
            t.append(parser.reader.next())
            return parser.make_children(t)

        @newstate('endmeta')
        def make_endmeta(parser):
            m = Endmeta()
            m.append(parser.reader.next())
            return parser.make_children(m)

        def make_paragraph(parser):
            chunk = parser.reader.next()
            strchunk = str(chunk)
            if not strchunk.strip():  # filter out empty things
                return None
            if parser.has_ordered_paras and ordered(strchunk):
                # FIXME: Cut the ordinal from chunk somehow
                if isinstance(chunk, Paragraph):
                    chunks = list(chunk)
                    chunks[0] = re.sub("^\s*\d+\. ", "", chunks[0])
                    p = OrderedParagraph(chunks, ordinal=ordered(strchunk))
                else:
                    chunk = re.sub("^\s*\d+\. ", "", chunk)
                    p = OrderedParagraph([chunk], ordinal=ordered(strchunk))
            else:
                if isinstance(chunk, Paragraph):
                    p = chunk
                else:
                    p = Paragraph([chunk])
            return p

        def ordered(chunk):
            """Given a string that might be an ordered paragraph, return
            the ordinal if so, or None otherwise.
            """
            # most ordered paras use "18. Blahonga". But when quoting
            # EU law, sometimes "18 Blahonga". Treat these the same.
            # NOTE: It should not match eg "24hPoker är en
            # bolagskonstruktion..." (HDO/B2760-09)
            m = re.match("(\d+)\.?\s", chunk)
            if m:
                return m.group(1)

        def transition_domskal(symbol, statestack):
            if 'betankande' in statestack:
                # Ugly hack: mangle the statestack so that *next time*
                # we encounter an is_domskal, we pop the statestack,
                # but for now we push to it.
                # FIXME: This made TestDV.test_parse_HDO_O2668_07 fail
                # since is_dom wasn't amongst the possible recognizers
                # when "HD (...) fattade slutligt beslut i enlighet
                # [...]" was up. I don't know if this logic is needed
                # anymore, but removing it does not cause test
                # failures.
# statestack[statestack.index('betankande')] = "__done__" return make_domskal, "domskal" else: # here's where we pop the stack return False, None p = FSMParser() # "dom" should not really be a commonstate (it should # theoretically alwawys be followed by domskal or maybe # domslut) but in some cases, the domskal merges with the # start of dom in such a way that we can't transition into # domskal right away (eg HovR:s dom in HDO/B10-86_1 and prob # countless others) commonstates = ( "body", "delmal", "instans", "dom", "domskal", "domslut", "betankande", "skiljaktig", "tillagg") if parseconfig == "simple": p.set_recognizers(is_paragraph) p.set_transitions({ ("body", is_paragraph): (make_paragraph, None) }) p.has_ordered_paras = False else: p.set_recognizers(is_delmal, is_endmeta, is_instans, is_dom, is_betankande, is_domskal, is_domslut, is_skiljaktig, is_tillagg, is_heading, is_paragraph) p.set_transitions({ ("body", is_delmal): (make_delmal, "delmal"), ("body", is_instans): (make_instans, "instans"), ("body", is_endmeta): (make_endmeta, "endmeta"), ("delmal", is_instans): (make_instans, "instans"), ("delmal", is_delmal): (False, None), ("delmal", is_endmeta): (False, None), ("instans", is_betankande): (make_betankande, "betankande"), ("instans", is_domslut): (make_domslut, "domslut"), ("instans", is_dom): (make_dom, "dom"), ("instans", is_instans): (False, None), ("instans", is_skiljaktig): (make_skiljaktig, "skiljaktig"), ("instans", is_tillagg): (make_tillagg, "tillagg"), ("instans", is_delmal): (False, None), ("instans", is_endmeta): (False, None), # either (make_domskal, "domskal") or (False, None) ("betankande", is_domskal): transition_domskal, ("betankande", is_domslut): (make_domslut, "domslut"), ("betankande", is_dom): (False, None), ("__done__", is_domskal): (False, None), ("__done__", is_skiljaktig): (False, None), ("__done__", is_tillagg): (False, None), ("__done__", is_delmal): (False, None), ("__done__", is_endmeta): (False, None), ("__done__", is_domslut): (make_domslut, "domslut"), ("dom", is_domskal): (make_domskal, "domskal"), ("dom", is_domslut): (make_domslut, "domslut"), ("dom", is_instans): (False, None), ("dom", is_skiljaktig): (False, None), # Skiljaktig mening is not considered # part of the dom, but rather an appendix ("dom", is_tillagg): (False, None), ("dom", is_endmeta): (False, None), ("dom", is_delmal): (False, None), ("domskal", is_delmal): (False, None), ("domskal", is_domslut): (False, None), ("domskal", is_instans): (False, None), ("domslut", is_delmal): (False, None), ("domslut", is_instans): (False, None), ("domslut", is_domskal): (False, None), ("domslut", is_skiljaktig): (False, None), ("domslut", is_tillagg): (False, None), ("domslut", is_endmeta): (False, None), ("domslut", is_dom): (False, None), ("skiljaktig", is_domslut): (False, None), ("skiljaktig", is_instans): (False, None), ("skiljaktig", is_skiljaktig): (False, None), ("skiljaktig", is_tillagg): (False, None), ("skiljaktig", is_delmal): (False, None), ("skiljaktig", is_endmeta): (False, None), ("tillagg", is_tillagg): (False, None), ("tillagg", is_delmal): (False, None), ("tillagg", is_endmeta): (False, None), ("endmeta", is_paragraph): (make_paragraph, None), (commonstates, is_heading): (make_heading, None), (commonstates, is_paragraph): (make_paragraph, None), }) # only NJA and MD cases (distinguished by the first three # chars of basefile) can have ordered paragraphs p.has_ordered_paras = basefile[:3] in ('HDO', 'MDO') # parser configuration that is identical between the 'default' # and 
'simple' parser p.initial_state = "body" p.initial_constructor = make_body p.debug = os.environ.get('FERENDA_FSMDEBUG', False) # In some cases it's difficult to determine court from document alone. p.defaultcourt = {'PMD': 'Patent- och marknadsöverdomstolen', 'MMD': 'Mark- och miljööverdomstolen'}.get(basefile.split("/")[0]) # return p return p.parse # FIXME: Get this information from self.commondata and the slugs # file. However, that data does not contain lower-level # courts. For now, we use the slugs used internally at # Domstolsverket, but lower-case. Also, this list does not attempt # to bridge when a court changes name (eg. LST and FST are # distinct, even though they refer to the "same" court). In the # case of adminstrative decisions, this also includes slugs for # commmon administrative agencies. courtslugs = { "Skatterättsnämnden": "SRN", "Skatteverket": "SKV", "Migrationsverket": "MIV", "PTS": "PTS", "Attunda tingsrätt": "TAT", "Blekinge tingsrätt": "TBL", "Bollnäs TR": "TBOL", "Borås tingsrätt": "TBOR", "Eskilstuna tingsrätt": "TES", "Eslövs TR": "TESL", "Eksjö TR": "TEK", "Falu tingsrätt": "TFA", "Försäkringskassan": "FSK", "Förvaltningsrätten i Göteborg": "FGO", "Förvaltningsrätten i Göteborg, migrationsdomstolen": "MGO", "Förvaltningsrätten i Malmö": "FMA", "Förvaltningsrätten i Malmö, migrationsdomstolen": "MFM", "Förvaltningsrätten i Stockholm": "FST", "Förvaltningsrätten i Stockholm, migrationsdomstolen": "MFS", "Gotlands tingsrätt": "TGO", "Gävle tingsrätt": "TGA", "Göta hovrätt": "HGO", "Göteborgs TR": "TGO", "Göteborgs tingsrätt": "TGO", "Halmstads tingsrätt": "THA", "Helsingborgs tingsrätt": "THE", "Hudiksvalls tingsrätt": "THU", "Jönköpings tingsrätt": "TJO", "Kalmar tingsrätt": "TKA", "Kammarrätten i Sundsvall": "KSU", "Kristianstads tingsrätt": "TKR", "Linköpings tingsrätt": "TLI", "Ljusdals TR": "TLJ", "Luleå tingsrätt": "TLU", "Lunds tingsrätt": "TLU", "Lycksele tingsrätt": "TLY", "Länsrätten i Dalarnas län": "LDA", "Länsrätten i Göteborg": "LGO", "Länsrätten i Jämtlands län": "LJA", "Länsrätten i Kopparbergs län": "LKO", "Länsrätten i Kronobergs län": "LKR", "Länsrätten i Malmöhus län": "LMAL", "Länsrätten i Mariestad": "LMAR", "Länsrätten i Norbottens län": "LNO", "Länsrätten i Skaraborgs län": "LSK", "Länsrätten i Skåne län": "LSK", "Länsrätten i Stockholms län": "LST", "Länsrätten i Stockholms län, migrationsdomstolen": "MLS", "Länsrätten i Södermanlands län": "LSO", "Länsrätten i Uppsala län": "LUP", "Länsrätten i Vänersborg": "LVAN", "Länsrätten i Värmlands län": "LVAR", "Länsrätten i Västerbottens län": "LVAB", "Länsrätten i Västmanlands län": "LVAL", "Länsrätten i Älvsborgs län": "LAL", "Malmö TR": "TMA", "Malmö tingsrätt": "TMA", "Mariestads tingsrätt": "TMAR", "Mora tingsrätt": "TMO", "Nacka tingsrätt": "TNA", "Norrköpings tingsrätt": "TNO", "Nyköpings tingsrätt": "TNY", "Skaraborgs tingsrätt": "TSK", "Skövde TR": "TSK", "Solna tingsrätt": "TSO", "Stockholms TR": "TST", "Stockholms tingsrätt": "TST", "Sundsvalls tingsrätt": "TSU", "Svea hovrätt, Mark- och miljööverdomstolen": "MHS", "Södertälje tingsrätt": "TSE", "Södertörns tingsrätt": "TSN", "Södra Roslags TR": "TSR", "Uddevalla tingsrätt": "TUD", "Umeå tingsrätt": "TUM", "Uppsala tingsrätt": "TUP", "Varbergs tingsrätt": "TVAR", "Vänersborgs tingsrätt": "TVAN", "Värmlands tingsrätt": "TVARM", "Västmanlands tingsrätt": "TVAS", "Växjö tingsrätt": "TVA", "Ångermanlands tingsrätt": "TAN", "Örebro tingsrätt": "TOR", "Östersunds tingsrätt": "TOS", "Kammarrätten i Jönköping": "KJO", "Kammarrätten i 
Göteborg": "KGO", "Kammarrätten i Stockholm": "KST", "Göta HovR": "HGO", "HovR:n för Nedre Norrland": "HNN", "HovR:n för Västra Sverige": "HVS", "HovR:n för Övre Norrland": "HON", "HovR:n över Skåne och Blekinge": "HSB", "Hovrätten för Nedre Norrland": "HNN", "Hovrätten för Västra Sverige": "HVS", "Hovrätten för Västra Sverige": "HVS", "Hovrätten för Övre Norrland": "HON", "Hovrätten över Skåne och Blekinge": "HSB", "Hovrätten över Skåne och Blekinge": "HSB", "Svea HovR": "HSV", "Svea hovrätt": "HSV", # supreme courts generally use abbrevs established by # Vägledande rättsfall. "Kammarrätten i Stockholm, Migrationsöverdomstolen": "MIG", "Migrationsöverdomstolen": "MIG", "Högsta förvaltningsdomstolen": "HFD", "Regeringsrätten": "REGR", # REG is "Regeringen" "Högsta domstolen": "HDO", "HD": "HDO", "arbetsdomstolen": "ADO", "Mark- och miljööverdomstolen": "MMD", "Patentbesvärsrätten": "PBR", "Patent- och marknadsöverdomstolen": "PMÖD", # for when the type of court, but not the specific court, is given "HovR:n": "HovR", "Hovrätten": "HovR", "Kammarrätten": "KamR", "Länsrätten": "LR", "TR:n": "TR", "Tingsrätten": "TR", "länsrätten": "LR", "miljödomstolen": "MID", "tingsrätten": "TR", "Länsstyrelsen": "LST", "Marknadsdomstolen": "MD", "Migrationsdomstolen": "MID", "fastighetsdomstolen": "FD", "förvaltningsrätten": "FR", "hovrätten": "HovR", "kammarrätten": "KamR", # additional courts/agencies "Alingsås TR": "TAL", "Alingsås tingsrätt": "TAL", "Arbetslöshetskassan": "ALK", "Arvika TR": "TAR", "Banverket": "BAN", "Bodens TR": "TBO", "Bollnäs tingsrätt": "TBO", "Borås TR": "TBO", "Byggnadsnämnden": "BYN", "Datainspektionen": "DI", "Eksjö tingsrätt": "TEK", "Energimyndigheten": "ENM", "Enköpings TR": "TEN", "Enköpings tingsrätt": "TEN", "Eskilstuna TR": "TES", "Falköpings TR": "TFA", "Falköpings tingsrätt": "TFA", "Falu TR": "TFA", "Fastighetsmäklarnämnden": "FMN", "Fastighetstaxeringsnämnden": "FTN", "Finansinspektionen": "FI", "Forskarskattenämnden": "FSN", "Förvaltningsrätten i Falun": "FFA", "Förvaltningsrätten i Härnösand": "FHA", "Förvaltningsrätten i Jönköping": "FJO", "Förvaltningsrätten i Karlstad": "FKA", "Förvaltningsrätten i Linköping": "FLI", "Förvaltningsrätten i Luleå": "FLU", "Förvaltningsrätten i Luleå, migrationsdomstolen": "FLUM", "Förvaltningsrätten i Skåne län": "FSK", "Förvaltningsrätten i Umeå": "FUM", "Förvaltningsrätten i Uppsala": "FUP", "Förvaltningsrätten i Växjö": "FVA", "Gotlands TR": "TGO", "Gällivare TR": "TGÄ", "Gällivare tingsrätt": "TGÄ", "Gävle TR": "TGÄ", "Hallsbergs TR": "THA", "Halmstads TR": "THA", "Handens TR": "THA", "Handens tingsrätt": "THA", "Haparanda TR": "THA", "Haparanda tingsrätt": "THA", "Hedemora TR": "THE", "Helsingborgs TR": "THE", "Huddinge TR": "THU", "Huddinge tingsrätt": "THU", "Hudiksvalls TR": "THU", "Härnösands TR": "THÄ", "Härnösands tingsrätt": "THÄ", "Hässleholms TR": "THÄ", "Hässleholms tingsrätt": "THÄ", "Invandrarverket": "INV", "Jakobsbergs TR": "TJA", "Jordbruksverket": "JBV", "Jämtbygdens TR": "TJÄ", "Jönköpings TR": "TJÖ", "Kalmar TR": "TKA", "Kammarkollegiet": "KK", "Karlshamns TR": "TKA", "Karlskoga TR": "TKA", "Karlskoga tingsrätt": "TKA", "Karlskrona TR": "TKA", "Karlskrona tingsrätt": "TKA", "Karlstads TR": "TKA", "Karlstads tingsrätt": "TKA", "Katrineholms TR": "TKA", "Katrineholms tingsrätt": "TKA", "Klippans TR": "TKL", "Klippans tingsrätt": "TKL", "Koncessionsnämnden för miljöskydd": "KFM", "Kriminalvården": "KRV", "Kristianstads TR": "TKR", "Kristinehamns TR": "TKR", "Kristinehamns tingsrätt": "TKR", 
"Kyrkogårdsnämnden": "KGN", "Kyrkogårdsstyrelsen": "KGS", "Köpings TR": "TKÖ", "Landskrona TR": "TLA", "Landskrona tingsrätt": "TLA", "Leksands TR": "TLE", "Lidköpings TR": "TLI", "Lidköpings tingsrätt": "TLI", "Lindesbergs TR": "TLI", "Linköpings TR": "TLI", "Ljungby TR": "TLJ", "Ljungby tingsrätt": "TLJ", "Ludvika TR": "TLU", "Ludvika tingsrätt": "TLU", "Luleå TR": "TLU", "Lunds TR": "TLU", "Lycksele TR": "TLY", "Läkemedelsverket": "LMV", "Länsrätten i Blekinge län": "LBL", "Länsrätten i Gotlands län": "LGO", "Länsrätten i Gävleborgs län": "LGÄ", "Länsrätten i Göteborg, migrationsdomstolen": "LGÖ", "Länsrätten i Hallands län": "LHA", "Länsrätten i Jönköpings län": "LJÖ", "Länsrätten i Kalmar län": "LKA", "Länsrätten i Kristianstads län": "LKR", "Länsrätten i Norrbottens län": "LNO", "Länsrätten i Skåne": "LSK", "Länsrätten i Skåne län, migrationsdomstolen": "LSK", "Länsrätten i Stockholm": "LST", "Länsrätten i Stockholm län": "LST", "Länsrätten i Stockholm, migrationsdomstolen": "LSTM", "Länsrätten i Västernorrlands län": "LVÄ", "Länsrätten i Örebro län": "LÖR", "Länsrätten i Östergötlands län": "LÖS", "Länsstyrelsen i Dalarnas län": "LSTD", "Länsstyrelsen i Stockholms län": "LSTS", "Mariestads TR": "TMA", "Mjölby TR": "TMJ", "Mora TR": "TMO", "Motala TR": "TMO", "Mölndals TR": "TMÖ", "Mölndals tingsrätt": "TMÖ", "Nacka TR": "TNA", "Nacka tingsrätt, mark- och miljödomstolen": "TNAM", "Nacka tingsrätt, miljödomstolen": "TNAM", "Norrköpings TR": "TNO", "Norrtälje tingsrätt": "TNO", "Nyköpings TR": "TNY", "Omsorgsnämnden i Trollhättans kommun": "OMS", "Oskarshamns TR": "TOS", "Oskarshamns tingsrätt": "TOS", "Piteå TR": "TPI", "Polismyndigheten": "POL", "RTV": "RTV", "Regeringen": "REG", "Revisorsnämnden": "REV", "Ronneby TR": "TRO", "Ronneby tingsrätt": "TRO", "Rättsskyddscentralen": "RSC", "Sala TR": "TSA", "Sandvikens TR": "TSA", "Simrishamns TR": "TSI", "Sjuhäradsbygdens TR": "TSJ", "Sjuhäradsbygdens tingsrätt": "TSJ", "Skattemyndigheten": "SKM", "Skattemyndigheten i Luleå": "SKML", "Skattverket": "SKV", "Skellefteå TR": "TSK", "Skellefteå tingsrätt": "TSK", "Skövde tingsrätt": "TSK", "Socialnämnden": "SON", "Socialstyrelsen": "SOS", "Sollefteå TR": "TSO", "Sollentuna TR": "TSO", "Sollentuna tingsrätt": "TSO", "Solna TR": "TSO", "Statens jordbruksverk": "SJV", "Stenungsunds TR": "TST", "Stenungsunds tingsrätt": "TST", "Stockholm tingsrätt": "TST", "Strömstads tingsrätt": "TST", "Sundsvalls TR": "TSU", "Sunne TR": "TSU", "Sunne tingsrätt": "TSU", "Svegs TR": "TSV", "Södertälje TR": "TSÖ", "Södra Roslags tingsrätt": "TSÖ", "Sölvesborgs TR": "TSÖ", "Tierps TR": "TTI", "Tierps tingsrätt": "TTI", "Trafiknämnden": "TRN", "Transportstyrelsen": "TS", "Trelleborgs TR": "TTR", "Trelleborgs tingsrätt": "TTR", "Trollhättans TR": "TTR", "Tullverket": "TV", "Uddevalla TR": "TUD", "Umeå TR": "TUM", "Umeå tingsrätt, mark- och miljödomstolen": "TUMM", "Ungdomsstyrelsen": "US", "Uppsala TR": "TUP", "Varbergs TR": "TVA", "Vattenöverdomstolen": "VÖD", "Vänersborgs TR": "TVÄ", "Vänersborgs tingsrätt, Miljödomstolen": "TVÄM", "Vänersborgs tingsrätt, mark- och miljödomstolen": "TVÄM", "Värnamo TR": "TVÄ", "Värnamo tingsrätt": "TVÄ", "Västerviks TR": "TVÄ", "Västerås TR": "TVÄ", "Västerås tingsrätt": "TVÄ", "Växjö TR": "TVÄ", "Växjö tingsrätt, mark- och miljödomstolen": "TVÄM", "Växjö tingsrätt, miljödomstolen": "TVÄM", "Ystads TR": "TYS", "Ystads tingsrätt": "TYS", "hovrätten för Västra Sverige": "HVS", "kammarrätten i Göteborg": "KGO", "länsrätten i Skåne län, migrationsdomstolen": "LSKM", "länsstyrelsen": 
"LST", "migrationsdomstolen": "MD", "regeringen": "REG", "skattemyndigheten": "SKM", "Ängelholms TR": "TÄN", "Åmåls TR": "TÅM", "Örebro TR": "TÖR", "Örnsköldsviks tingsrätt": "TÖR", "Östersunds TR": "TÖS" } def construct_id(self, node, state): if isinstance(node, Delmal): state = dict(state) node.uri = state['uri'] + "#" + node.ordinal elif isinstance(node, Instans): if node.court: state = dict(state) courtslug = self.courtslugs.get(node.court, "XXX") if courtslug == "XXX": self.log.warning("%s No slug defined for court %s" % (state["basefile"], node.court)) if "#" not in state['uri']: state['uri'] += "#" else: state['uri'] += "/" node.uri = state['uri'] + courtslug else: return state elif isinstance(node, OrderedParagraph): separator = "/" if "#" in state['uri'] else "#" node.uri = state['uri'] + separator + "P" + node.ordinal return state elif isinstance(node, (Body, Dom, Domskal)): return state else: return None state['uri'] = node.uri return state def visitor_functions(self, basefile): return ((self.construct_id, {'uri': self._canonical_uri, 'basefile': basefile}), ) def facets(self): # NOTE: it's important that RPUBL.rattsfallspublikation is the # first facet (toc_pagesets depend on it) def myselector(row, binding, resource_graph=None): return (util.uri_leaf(row['rpubl_rattsfallspublikation']), row['rpubl_arsutgava']) # FIXME: This isn't used anymore -- when was it used and by which facet? def mykey(row, binding, resource_graph=None): if binding == "main": # we'd really like # rpubl:VagledandeDomstolsavgorande/rpubl:avgorandedatum, # but that requires modifying facet_query return row['update'] else: return util.split_numalpha(row['dcterms_identifier']) return [Facet(RPUBL.rattsfallspublikation, indexingtype=fulltextindex.Resource(), use_for_toc=True, use_for_feed=True, selector=myselector, # => ("ad","2001"), ("nja","1981") key=Facet.resourcelabel, identificator=Facet.defaultselector, dimension_type='ref'), Facet(RPUBL.referatrubrik, indexingtype=fulltextindex.Text(boost=4), toplevel_only=True, use_for_toc=False), Facet(DCTERMS.identifier, use_for_toc=False), Facet(RPUBL.arsutgava, indexingtype=fulltextindex.Label(), use_for_toc=False, selector=Facet.defaultselector, key=Facet.defaultselector, dimension_type='value'), Facet(RDF.type, use_for_toc=False, use_for_feed=True, # dimension_label="main", # FIXME: # dimension_label must be calculated as rdf_type # or else the data from faceted_data() won't be # usable by wsgi.stats # key= # FIXME add useful key method for sorting docs identificator=lambda x, y, z: None), Facet(RPUBL.avgorandedatum, # we need this data when # creating feeds, but not # to sort/group by use_for_toc=False, use_for_feed=False) ] + self.standardfacets def facet_query(self, context): query = super(DV, self).facet_query(context) # FIXME: This is really hacky, but the rpubl:avgorandedatum # that we need is not a property of the root resource, but # rather a linked resource. So we postprocess the query to get # at that linked resource return query.replace("?uri rpubl:avgorandedatum ?rpubl_avgorandedatum", "?uri rpubl:referatAvDomstolsavgorande ?domuri . 
?domuri rpubl:avgorandedatum ?rpubl_avgorandedatum")

    def _relate_fulltext_resources(self, body):
        res = []
        uris = set()
        for r in body.findall(".//*[@about]"):
            if r.get("class") == "bodymeta":
                continue
            if r.get("about") not in uris:
                uris.add(r.get("about"))
                res.append(r)
        return [body] + res

    _relate_fulltext_value_cache = {}

    def _relate_fulltext_value(self, facet, resource, desc):
        def rootlabel(desc):
            return desc.getvalue(DCTERMS.identifier)
        if facet.dimension_label in ("label", "creator", "issued"):
            # "creator" and "issued" should be identical for the root
            # resource and all contained subresources. "label" can
            # change slightly.
            resourceuri = resource.get("about")
            rooturi = resourceuri.split("#")[0]
            if "#" not in resourceuri:
                l = rootlabel(desc)
                desc.about(desc.getrel(RPUBL.referatAvDomstolsavgorande))
                self._relate_fulltext_value_cache[rooturi] = {
                    "creator": desc.getrel(DCTERMS.publisher),
                    "issued": desc.getvalue(RPUBL.avgorandedatum),
                    "label": l
                }
                desc.about(resourceuri)
            v = self._relate_fulltext_value_cache[rooturi][facet.dimension_label]
            if facet.dimension_label == "label" and "#" in resourceuri:
                if desc.getvalues(DCTERMS.creator):
                    court = desc.getvalue(DCTERMS.creator)
                else:
                    court = resource.get("about").split("#")[1]
                # v = "%s (%s)" % (v, court)
                v = court
            return facet.dimension_label, v
        else:
            return super(DV, self)._relate_fulltext_value(facet, resource, desc)

    def tabs(self):
        return [("Vägledande rättsfall", self.dataset_uri())]
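
# ----------------------------------------------------------------------
# Illustrative sketches of the techniques used above. They are not part
# of the parser; all names prefixed with "_" and all example inputs are
# hypothetical stand-ins.
#
# The rx table at the top of the parser builds its recognizers from plain
# data: each entry carries a regex, the re method to use ("match" or
# "search") and the recognizer types it feeds, and the loop over rx binds
# getattr(re.compile(...), method) so that every type gets a list of
# ready-to-call matchers. A minimal sketch of that technique; the
# patterns below are made up, not real rx entries.
import re
from collections import defaultdict

_example_rx = (
    {'name': 'toy-instans',
     're': 'överklagade .* till (?P<court>\w+)',
     'method': 'search', 'type': ('instans',)},
    {'name': 'toy-domskal',
     're': '(Skäl|Domskäl)(\. |$)',
     'method': 'match', 'type': ('domskal',)},
)

def _build_matchers(patterns):
    matchers = defaultdict(list)
    matchersname = defaultdict(list)
    for pat in patterns:
        for t in pat['type']:
            # bind .match or .search on the compiled pattern
            matchers[t].append(getattr(re.compile(pat['re'], re.UNICODE),
                                       pat['method']))
            matchersname[t].append(pat['name'])
    return matchers, matchersname

# usage sketch:
#   matchers, names = _build_matchers(_example_rx)
#   m = matchers['instans'][0]("A.B. överklagade domen till HD")
#   m.groupdict()  -> {'court': 'HD'}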
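
# is_delmal above accepts only short chunks consisting of a roman numeral
# I-IV, optionally followed by a period or a parenthesized case number --
# "I", "II.", "I (UM1001-08)" -- while rejecting ordinary text that merely
# begins with "I" (such as "Inledning"). A standalone sketch of that
# check, using the same regex:
def _delmal_sketch(strchunk):
    strchunk = strchunk.strip()
    if len(strchunk) < 20:
        m = re.match("(I{1,3}|IV)\.? ?(|\(\w+\-\d+\))$", strchunk)
        if m:
            res = {'id': m.group(1)}
            if m.group(2):
                res['malnr'] = m.group(2)[1:-1]
            return res
    return {}

# usage sketch:
#   _delmal_sketch("I (UM1001-08)")  -> {'id': 'I', 'malnr': 'UM1001-08'}
#   _delmal_sketch("IV.")            -> {'id': 'IV'}
#   _delmal_sketch("Inledning")      -> {}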
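
# is_equivalent_court above canonicalizes abbreviated court names before
# comparing ("HD" -> "Högsta domstolen", "HovR" -> "Hovrätt"), and treats
# the value True ("some court, we just don't know which") as equivalent to
# Högsta domstolen, since that typically happens when both parties appeal
# to the supreme court. A simplified standalone sketch (the original uses
# a one-element `in` test where this sketch uses ==):
def _is_equivalent_court_sketch(newcourt, oldcourt):
    def canonicalize(courtname):
        if isinstance(courtname, bool):
            return courtname  # an unknown but existing court
        return courtname.replace("HD", "Högsta domstolen").replace("HovR", "Hovrätt")
    newcourt = canonicalize(newcourt)
    oldcourt = canonicalize(oldcourt)
    if newcourt is True and str(oldcourt) == "Högsta domstolen":
        return True
    return newcourt == oldcourt

# usage sketch:
#   _is_equivalent_court_sketch("HD", "Högsta domstolen")         -> True
#   _is_equivalent_court_sketch("Linköpings tingsrätt", "HovR:n") -> False
#   _is_equivalent_court_sketch(True, "Högsta domstolen")         -> True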
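
# split_sentences above relies on a single regex heuristic: split on a
# period followed by whitespace and an upper-case letter (or end of
# text), but never when the period is preceded by a lone capital letter,
# so that initials such as "C." or "H.T." do not end a sentence. A
# standalone sketch of the same split (the real function normalizes
# whitespace via util.normalize_space); the example inputs mirror the
# cases discussed in the comments above.
def _split_sentences_sketch(text):
    text = " ".join(text.split()) + " "  # stand-in for util.normalize_space
    return [x.strip() for x in
            re.split("(?<![A-ZÅÄÖ])\. (?=[A-ZÅÄÖ]|$)", text)
            if x.strip()]

# usage sketch:
#   _split_sentences_sketch("H.T. sökte revision. HD biföll talan.")
#     -> ['H.T. sökte revision', 'HD biföll talan']
#   _split_sentences_sketch("Linder, Erliksson, referent, och C. Bohlin")
#     -> ['Linder, Erliksson, referent, och C. Bohlin']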
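
# The FSMParser wiring above follows one pattern throughout: a recognizer
# per structural element, a constructor per element (decorated with
# @newstate when it opens a new parser state), and a transition table
# keyed on (state, recognizer) whose values are either (constructor,
# newstate), (False, None) meaning "this chunk belongs to an enclosing
# state, pop back", or a callable such as transition_domskal that decides
# at parse time. A minimal two-recognizer sketch in the same style as the
# "simple" parseconfig above; the element choice and example chunks are
# hypothetical.
from ferenda import FSMParser
from ferenda.decorators import newstate
from ferenda.elements import Body, Heading, Paragraph

def _toy_parser():
    def is_heading(parser):
        return str(parser.reader.peek()).isupper()

    def is_paragraph(parser):
        return True

    @newstate('body')
    def make_body(parser):
        return parser.make_children(Body())

    def make_heading(parser):
        return Heading([parser.reader.next()])

    def make_paragraph(parser):
        return Paragraph([parser.reader.next()])

    p = FSMParser()
    p.set_recognizers(is_heading, is_paragraph)
    p.set_transitions({("body", is_heading): (make_heading, None),
                       ("body", is_paragraph): (make_paragraph, None)})
    p.initial_state = "body"
    p.initial_constructor = make_body
    return p

# usage sketch:
#   body = _toy_parser().parse(["DOMSKÄL", "Tingsrätten gör följande bedömning."])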
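
# construct_id above mints fragment URIs for the parsed tree: a Delmal
# appends "#<ordinal>" to the case URI, an Instans appends the slug from
# courtslugs (after "#" if no fragment exists yet, otherwise after "/"),
# and an OrderedParagraph appends "P<ordinal>". A simplified sketch of
# the resulting URIs; the case URI below is hypothetical.
def _fragment_uri_sketch(caseuri, delmal=None, courtslug=None, parno=None):
    uri = caseuri
    if delmal:
        uri += "#" + delmal
    if courtslug:
        uri += ("/" if "#" in uri else "#") + courtslug
    if parno:
        uri += ("/" if "#" in uri else "#") + "P" + str(parno)
    return uri

# usage sketch:
#   _fragment_uri_sketch("https://example.org/dom/nja/2001s1", "I", "HDO", 12)
#     -> 'https://example.org/dom/nja/2001s1#I/HDO/P12'
#   _fragment_uri_sketch("https://example.org/dom/nja/2001s1", None, "HDO")
#     -> 'https://example.org/dom/nja/2001s1#HDO'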