# a RequestHandler is part of a docrepo and responsible for
# determining if the docrepo can respond to a particular request, and
# for determining the physical path of the file corresponding to that
# request.
from wsgiref.util import request_uri
import re
import os
import sys
from io import BytesIO
from functools import partial
from urllib.parse import urlparse, unquote, parse_qsl
import mimetypes
import traceback
from rdflib import Graph
from ferenda.thirdparty import httpheader
from ferenda import util
from ferenda.errors import RequestHandlerError
class RequestHandler(object):
    """Resolve requests and URIs to files or generated data for one docrepo.

    A handler wraps a single document repository (``self.repo``). It
    decides whether a request is addressed to that repo
    (:meth:`supports`), maps a URI to a file on disk (:meth:`path`), and
    produces the parts of a response (:meth:`handle`).
    """

    # file suffix -> MIME type
    _mimesuffixes = {'xhtml': 'application/xhtml+xml',
                     'rdf': 'application/rdf+xml',
                     'atom': 'application/atom+xml'}
    # MIME type -> rdflib serializer name
    _rdfformats = {'application/rdf+xml': 'pretty-xml',
                   'text/turtle': 'turtle',
                   'application/n-triples': 'nt',
                   'application/json': 'json-ld'}
    # rdflib serializer name -> MIME type (inverse of _rdfformats)
    _revformats = {v: k for k, v in _rdfformats.items()}
    # file suffix -> rdflib serializer name
    _rdfsuffixes = {'rdf': 'pretty-xml',
                    'ttl': 'turtle',
                    'nt': 'nt',
                    'json': 'json-ld'}
    # MIME type -> name of the repo.store method yielding the file path
    _mimemap = {'text/html': 'generated_path',
                'application/xhtml+xml': 'parsed_path',
                'application/rdf+xml': 'distilled_path'}
    # file suffix -> name of the repo.store method yielding the file path
    _suffixmap = {'xhtml': 'parsed_path',
                  'rdf': 'distilled_path'}

    def __init__(self, repo):
        """Bind this handler to the document repository *repo*."""
        self.repo = repo

    def dataset_params_from_uri(self, uri):
        """Extract the parameters of a dataset URI belonging to this repo.

        With ``repo.config.url == "http://example.org/"`` and
        ``repo.alias == "base"``:

        * ``http://example.org/dataset/base?title=a`` gives
          ``{'param': 'title', 'value': 'a'}``
        * ``http://example.org/dataset/base`` gives ``{}``
        * ``http://example.org/dataset/base/feed/title`` gives
          ``{'feed': True}``

        For feed URIs that also carry a query string, ``'param'`` holds
        everything before the first ``=`` (including the ``feed?``
        part); only ``'value'`` is used by the rest of this class.

        :param uri: the full dataset URI
        :returns: a dict of parameters (possibly empty), or ``None`` if
                  the URI is not a dataset URI handled by this repo
                  (which is different from ``{}``)
        """
        wantedprefix = self.repo.config.url + "dataset/" + self.repo.alias
        if not uri.startswith(wantedprefix):
            return None
        rest = uri[len(wantedprefix):]
        # the alias must end here -- otherwise a repo with alias "base"
        # would also claim URIs for a repo with alias "base2"
        if rest and rest[0] not in "/?.":
            return None
        if rest and "?" not in uri and "/feed" not in uri:
            return None
        # skip the separator that follows the alias
        path = rest[1:]
        params = {}
        if path.startswith("feed"):
            params['feed'] = True
        if "=" in path:
            param, value = path.split("=", 1)
            params['param'] = param
            params['value'] = value
        return params

    def basefile_params_from_basefile(self, basefile):
        """Return the query parameters embedded in *basefile* as a dict.

        Everything after the first ``?`` is parsed as a query string;
        without a ``?`` an empty dict is returned.
        """
        if "?" not in basefile:
            return {}
        return dict(parse_qsl(basefile.split("?", 1)[1]))

    def supports(self, environ):
        """Returns True iff this particular handler supports this particular request.

        The decision is made by comparing the third ``/``-separated
        segment of ``environ['PATH_INFO']`` with ``self.repo.alias``.
        """
        segments = environ['PATH_INFO'].split("/", 3)
        if len(segments) <= 2:
            return False
        # with PATH_INFO like /dataset/base.rdf, we still want the
        # alias to check to be "base", not "base.rdf"
        reponame = segments[2]
        # this segment might contain suffix or parameters -- remove
        # them before comparison. The pattern matches the trailing run
        # of characters after the last "." or "?".
        m = re.search(r'[^\.\?]*$', reponame)
        if m and m.start() > 0:
            reponame = reponame[:m.start() - 1]
        return reponame == self.repo.alias

    def supports_uri(self, uri):
        """Like :meth:`supports`, but takes a URI instead of an environ."""
        return self.supports({'PATH_INFO': urlparse(uri).path})

    def path(self, uri):
        """Returns the physical path that the provided URI resolves
        to. Returns None if this requesthandler does not support the
        given URI, or the URI doesn't resolve to a static file.
        """
        suffix = None
        if urlparse(uri).path.startswith("/dataset/"):
            params = self.dataset_params_from_uri(uri)
            if ".atom" in uri:
                suffix = "atom"
                environ = {}
            else:
                environ = {"HTTP_ACCEPT": "text/html"}
            contenttype = self.contenttype(environ, uri, None, params, suffix)
            pathfunc = self.get_dataset_pathfunc(environ, params,
                                                 contenttype, suffix)
            if pathfunc:
                return pathfunc()
            return None

        params = self.basefile_params_from_basefile(uri)
        if params:
            uri = uri.split("?")[0]
        basefile = self.repo.basefile_from_uri(uri)
        if basefile is None:
            return None
        if 'format' in params:
            suffix = params['format']
        else:
            if 'attachment' in params:
                leaf = params['attachment']
            else:
                leaf = uri.split("/")[-1]
            if "." in leaf:
                suffix = leaf.rsplit(".", 1)[1]
        environ = {}
        if not suffix:
            # no suffix: resolve as if a browser asked for the HTML version
            environ['HTTP_ACCEPT'] = "text/html"
        contenttype = self.contenttype(environ, uri, basefile, params, suffix)
        pathfunc = self.get_pathfunc(environ, basefile, params,
                                     contenttype, suffix)
        if pathfunc:
            return pathfunc(basefile)
        return None

    def request_uri(self, environ):
        """Reconstruct the requested URI, mapped onto ``repo.config.url``.

        The raw URI is built by :func:`wsgiref.util.request_uri`,
        re-decoded from latin-1 to UTF-8 and unquoted. If the repo
        config has ``develurl`` set, that prefix is replaced by
        ``config.url``. If ``acceptalldomains`` is set, scheme and host
        are replaced by ``config.url`` unconditionally.
        """
        rawuri = request_uri(environ)
        uri = unquote(rawuri.encode("latin-1").decode("utf-8"))
        if getattr(self.repo.config, 'develurl', None):
            # in some circumstances, we might want to set develurl to
            # https://... while the actual uri provided will be
            # http://... (eg. due to TLS-terminating proxies and other
            # things), so we change the protocol of the request to
            # match the protocol as specified by config.develurl
            uriproto = uri.split("://")[0]
            develproto = self.repo.config.develurl.split("://")[0]
            if uriproto != develproto:
                # uriproto is by construction a prefix of uri
                uri = develproto + uri[len(uriproto):]
            uri = uri.replace(self.repo.config.develurl,
                              self.repo.config.url)
        if getattr(self.repo.config, 'acceptalldomains', False):
            # eg if the request_uri is http://localhost:8080/docs/1
            # (and config.develurl is not set or doesn't match this),
            # and config.url is https://example.org/, change
            # request_uri to https://example.org/docs/1
            uri = self.repo.config.url + uri.split("/", 3)[-1]
        return uri

    def handle(self, environ):
        """Provides a response to a particular request by returning a tuple
        *(fp, length, status, mimetype)*, where *fp* is an open file of the
        document to be returned.

        :raises RequestHandlerError: if the request URI can be resolved
            neither to a basefile nor to a dataset of this repo
        """
        segments = environ['PATH_INFO'].split("/", 3)
        # a PATH_INFO without any "/" has only one segment
        is_dataset = len(segments) > 1 and segments[1] == "dataset"
        uri = self.request_uri(environ)
        if "?" in uri:
            uri, querystring = uri.rsplit("?", 1)
        else:
            querystring = None
        suffix = None
        if is_dataset:
            basefile = None
            tmpuri = uri
            # strip a file suffix (".rdf", ".atom", ...) before parsing
            if "." in uri.split("/")[-1]:
                tmpuri = tmpuri.rsplit(".", 1)[0]
            if querystring:
                tmpuri += "?" + querystring
            params = self.dataset_params_from_uri(tmpuri)
            if params is None:
                # without this check, the 'in params' tests below would
                # fail with a TypeError
                raise RequestHandlerError(
                    "%s couldn't resolve %s to a dataset" %
                    (self.repo.alias, uri))
        else:
            basefile = self.repo.basefile_from_uri(uri)
            if not basefile:
                raise RequestHandlerError(
                    "%s couldn't resolve %s to a basefile" %
                    (self.repo.alias, uri))
            if querystring:
                params = dict(parse_qsl(querystring))
            else:
                params = self.basefile_params_from_basefile(basefile)
        if 'format' in params:
            suffix = params['format']
        else:
            if 'attachment' in params:
                leaf = params['attachment']
            else:
                leaf = uri.split("/")[-1]
            if "." in leaf:
                suffix = leaf.rsplit(".", 1)[1]
        contenttype = self.contenttype(environ, uri, basefile, params, suffix)
        if is_dataset:
            path, data = self.lookup_dataset(environ, params,
                                             contenttype, suffix)
        else:
            path, data = self.lookup_resource(environ, basefile, params,
                                              contenttype, suffix)
        return self.prep_request(environ, path, data, contenttype)

    def contenttype(self, environ, uri, basefile, params, suffix):
        """Determine the content type to respond with.

        Checked in order: an exact ``Accept`` header found in
        ``_mimemap`` (other than ``text/html``), the suffix via
        ``_mimesuffixes``, an exact ``Accept`` header found in
        ``_rdfformats``, the suffix via ``_rdfsuffixes``, the suffix via
        :data:`mimetypes.types_map`, and finally ``text/html`` if there
        is no suffix and content negotiation prefers it.

        :returns: a MIME type string, or ``None`` if none applies
        """
        accept = environ.get('HTTP_ACCEPT')
        preferred = None
        if accept:
            # do proper content-negotiation, but make sure
            # application/xhtml+xml ISN'T one of the available options (as
            # modern browsers may prefer it to text/html, and our
            # application/xhtml+xml isn't what they want) -- ie we only
            # serve application/xhtml+xml if a client specifically only
            # asks for that. Yep, that's a big FIXME.
            available = ("text/html",)  # add to this?
            preferred = httpheader.acceptable_content_type(
                accept, available, ignore_wildcard=False)
        contenttype = None
        if accept != "text/html" and accept in self._mimemap:
            contenttype = accept
        elif suffix in self._mimesuffixes:
            contenttype = self._mimesuffixes[suffix]
        elif accept in self._rdfformats:
            contenttype = accept
        elif suffix in self._rdfsuffixes:
            contenttype = self._revformats[self._rdfsuffixes[suffix]]
        elif suffix and "." + suffix in mimetypes.types_map:
            contenttype = mimetypes.types_map["." + suffix]
        elif ((not suffix) and
              preferred and
              preferred[0].media_type == "text/html"):
            contenttype = preferred[0].media_type
        return contenttype

    def _render_page_image(self, repo, method, basefile, params):
        """Make sure an image of one page of a document exists on disk.

        Runs ``pdftoppm`` and ``convert`` through ``util.runcmd`` unless
        the image file already exists. If anything fails, an image
        containing the error message is written instead.

        :param repo: the repo whose store holds the files
        :param method: store method giving the path of the source file
        :returns: the attachment name of the image, relative to
                  ``repo.store.intermediate_path(basefile)``
        """
        baseattach = None
        try:
            if "attachment" in params:
                sourcefile = method(basefile,
                                    attachment=params["attachment"])
            else:
                sourcefile = method(basefile)
            # we might run this on a host to where we haven't
            # transferred the downloaded files -- try to
            # re-acquire them now that someone wants to watch
            # them.
            if not os.path.exists(sourcefile):
                repo.download(basefile)
            # explicit checks rather than assert: asserts are stripped
            # when python runs with -O, and these values end up in a
            # command line
            if not params["page"].isdigit():
                raise ValueError("%s is not a digit" % params["page"])
            if params["format"] not in ("png", "jpg"):
                raise ValueError("%s is not a valid image format" %
                                 params["format"])
            baseattach = "page_%s.%s" % (params["page"], params["format"])
            if "attachment" in params:
                baseattach = "%s_%s" % (params["attachment"], baseattach)
            outfile = repo.store.intermediate_path(basefile,
                                                   attachment=baseattach)
            if not os.path.exists(outfile):
                # pdftoppm is given a file name root and the result is
                # read back from <root>.png. Derive the root from the
                # real extension so that this works for .jpg and for
                # paths that contain ".png" elsewhere.
                tmproot = os.path.splitext(outfile)[0] + ".tmp"
                tmpfile = tmproot + ".png"
                # NOTE(review): these command lines are assembled from
                # request parameters and file paths. How util.runcmd
                # quotes or splits them is not visible here -- confirm
                # that crafted values can't inject shell syntax.
                # params['page'] is 0-based, pdftoppm is 1-based
                cmdline = "pdftoppm -f %s -singlefile -png %s %s" % (
                    int(params["page"]) + 1, sourcefile, tmproot)
                util.runcmd(cmdline, require_success=True)
                cmdline = "convert %s -trim %s" % (tmpfile, outfile)
                util.runcmd(cmdline, require_success=True)
                os.unlink(tmpfile)
        except Exception as e:
            # deliberately broad: whatever went wrong is rendered into
            # an image so that the client gets something to look at
            if not baseattach:
                baseattach = "page_error.png"
            outfile = repo.store.intermediate_path(basefile,
                                                   attachment=baseattach)
            errormsg = "%s\n%s: %s" % (
                "".join(traceback.format_tb(sys.exc_info()[2])),
                e.__class__.__name__, str(e))
            errormsg = errormsg.replace("\n", "\\n").replace("'", "\\'")
            cmdline = 'convert label:"%s" %s' % (errormsg, outfile)
            util.runcmd(cmdline, require_success=True)
        return baseattach

    def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
        """Given the parameters, return a function that will, given a
        basefile, produce the proper path to that basefile. If the
        parameters indicate a version of the resource that does not
        exist as a static file on disk (like ".../basefile/data.rdf"),
        returns None

        :raises ValueError: if ``params['repo']`` names a repo that is
            not a subrepo of this handler's repo
        :raises KeyError: if ``params['dir']`` is not one of
            ``downloaded``, ``intermediate`` or ``parsed``
        """
        # try to lookup pathfunc from contenttype (or possibly suffix,
        # or maybe params)
        if "repo" in params:
            # this must be a CompositeRepository that has the
            # get_instance method
            for cls in self.repo.subrepos:
                if cls.alias == params['repo']:
                    repo = self.repo.get_instance(cls)
                    break
            else:
                raise ValueError("No '%s' repo is a subrepo of %s" %
                                 (params['repo'], self.repo.alias))
        else:
            repo = self.repo

        if "dir" in params:
            method = {'downloaded': repo.store.downloaded_path,
                      'intermediate': repo.store.intermediate_path,
                      'parsed': repo.store.parsed_path}[params["dir"]]
            if "page" in params and "format" in params:
                baseattach = self._render_page_image(repo, method,
                                                     basefile, params)
                # return right away -- we really don't want to
                # partial() this method again below
                return partial(repo.store.intermediate_path,
                               attachment=baseattach)
        elif contenttype in self._mimemap and not basefile.endswith("/data"):
            method = getattr(repo.store, self._mimemap[contenttype])
        elif suffix in self._suffixmap and not basefile.endswith("/data"):
            method = getattr(repo.store, self._suffixmap[suffix])
        elif ("attachment" in params and contenttype and
              mimetypes.guess_extension(contenttype)):
            # contenttype is checked first since guess_extension can't
            # handle None
            method = repo.store.generated_path
        else:
            return None
        if "attachment" in params:
            method = partial(method, attachment=params["attachment"])
        return method

    def get_dataset_pathfunc(self, environ, params, contenttype, suffix):
        """Return a no-argument function giving the path of a dataset file.

        HTML and Atom content types map to pages under ``toc/`` or
        ``feed/``, and ``application/n-triples`` maps to the dump file.
        Only *contenttype* and *params* are consulted; *environ* and
        *suffix* are accepted but not used.

        :returns: a callable, or ``None`` if no static file corresponds
                  to the request
        """
        filesuffix = {"text/html": "html",
                      "application/atom+xml": "atom"}.get(contenttype, None)
        if filesuffix:
            if params:
                if 'feed' in params:
                    if 'param' in params:
                        pseudobasefile = "feed/%s.%s" % (params['value'],
                                                         filesuffix)
                    else:
                        pseudobasefile = "feed/main.%s" % filesuffix
                else:
                    pseudobasefile = "toc/%s/%s.html" % (params['param'],
                                                         params['value'])
            else:
                pseudobasefile = "toc/index.html"
            return partial(self.repo.store.resourcepath, pseudobasefile)
        elif contenttype == "application/n-triples":
            return partial(self.repo.store.resourcepath, "distilled/dump.nt")
        return None

    def lookup_resource(self, environ, basefile, params, contenttype, suffix):
        """Find the file, or generate the data, for a document resource.

        :returns: a tuple *(path, data)*. *path* is set when a static
                  file corresponds to the request. Otherwise *data*
                  holds a serialized RDF graph if an RDF format was
                  requested, and is ``None`` if not.
        """
        pathfunc = self.get_pathfunc(environ, basefile, params,
                                     contenttype, suffix)
        if pathfunc:
            return pathfunc(basefile), None

        # no static file exists, we need to call code to produce data
        data = None
        extended = basefile.endswith("/data")
        if extended:
            basefile = basefile[:-len("/data")]
        if contenttype in self._rdfformats or suffix in self._rdfsuffixes:
            g = Graph()
            g.parse(self.repo.store.distilled_path(basefile))
            if extended:
                annotation_graph = self.repo.annotation_file_to_graph(
                    self.repo.store.annotation_path(basefile))
                g += annotation_graph
            if contenttype in self._rdfformats:
                data = g.serialize(format=self._rdfformats[contenttype])
            else:
                data = g.serialize(format=self._rdfsuffixes[suffix])
        return None, data

    def lookup_dataset(self, environ, params, contenttype, suffix):
        """Find the file, or generate the data, for a dataset request.

        :returns: a tuple *(path, data)*. *path* is set when
                  :meth:`get_dataset_pathfunc` knows a static file.
                  Otherwise *data* holds the dump re-serialized in the
                  requested RDF format; both are ``None`` if neither
                  applies.
        """
        data = None
        path = None
        pathfunc = self.get_dataset_pathfunc(environ, params,
                                             contenttype, suffix)
        if pathfunc:
            path = pathfunc()
        elif contenttype in self._rdfformats:
            # only contenttype is consulted here: contenttype() has
            # already taken the suffix into account
            g = Graph()
            g.parse(self.repo.store.resourcepath("distilled/dump.nt"),
                    format="nt")
            data = g.serialize(format=self._rdfformats[contenttype])
        return path, data

    def prep_request(self, environ, path, data, contenttype):
        """Turn a lookup result into *(fp, length, status, contenttype)*.

        An existing *path* is opened in binary mode; the caller is
        responsible for closing the returned file. Otherwise non-empty
        *data* is wrapped in a ``BytesIO``. If neither is available, a
        406 response with a short HTML message is returned.
        """
        if path and os.path.exists(path):
            status = 200
            # FIXME: These are not terribly well designed flow control
            # mechanisms
            if path.endswith("page_error.png"):
                status = 500
            elif path.endswith(".404"):
                status = 404
            fp = open(path, 'rb')
            return (fp,
                    os.path.getsize(path),
                    status,
                    contenttype)
        elif data:
            # Graph.serialize may hand back text rather than bytes,
            # depending on the rdflib version
            if isinstance(data, str):
                data = data.encode("utf-8")
            return (BytesIO(data),
                    len(data),
                    200,
                    contenttype)
        else:
            from html import escape
            # the Accept header is client-controlled, so escape it
            # before putting it into markup
            accept = escape(environ.get('HTTP_ACCEPT', 'text/html'))
            msg = ("<h1>406</h1>No acceptable media found for <tt>%s</tt>" %
                   accept)
            body = msg.encode('utf-8')
            return (BytesIO(body),
                    len(body),
                    406,
                    "text/html")