# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from contextlib import contextmanager
import shutil
import os
from tempfile import NamedTemporaryFile
import filecmp
import six
from six.moves.urllib_parse import quote, unquote
from ferenda import util
from ferenda import errors
class DocumentStore(object):
    """Unifies handling of reading and writing of various data files
    during the ``download``, ``parse`` and ``generate`` stages.

    :param datadir: The root directory (including docrepo path segment)
                    where files are stored.
    :type datadir: str
    :param downloaded_suffix: File suffix for the main source document
                              format. Determines the suffix of
                              downloaded files.
    :type downloaded_suffix: str
    :param storage_policy: Some repositories have documents in several
                           formats, documents split amongst several
                           files or embedded resources. If
                           ``storage_policy`` is set to ``dir``, then
                           each document gets its own directory (the
                           default filename being ``index`` + suffix),
                           otherwise each doc gets stored as a file in
                           a directory with other files. Affects
                           :py:meth:`~ferenda.DocumentStore.path`
                           (and therefore all other ``*_path`` methods)
    :type storage_policy: str
    """

    def __init__(self, datadir, downloaded_suffix=".html",
                 storage_policy="file"):
        # datadir is expected to be docrepo.datadir + docrepo.alias
        self.datadir = datadir
        self.downloaded_suffix = downloaded_suffix
        self.storage_policy = storage_policy
@contextmanager
def _open(self, filename, mode):
if "w" in mode:
fp = NamedTemporaryFile(mode, delete=False)
fp.realname = filename
try:
yield fp
finally:
tempname = fp.name
fp.close()
if not os.path.exists(filename) or not filecmp.cmp(tempname, filename):
util.ensure_dir(filename)
shutil.move(tempname, filename)
else:
os.unlink(tempname)
else:
if "a" in mode and not os.path.exists(filename):
util.ensure_dir(filename)
fp = open(filename, mode)
yield fp
# TODO: Maybe this is a worthwhile extension to the API? Could ofc
# easily be done everywhere where a non-document related path is
# needed.
[docs] def resourcepath(self, resourcename):
return self.datadir + os.sep + resourcename.replace("/", os.sep)
[docs] def path(self, basefile, maindir, suffix, version=None, attachment=None, storage_policy=None):
"""Calculate a full filesystem path for the given parameters.
:param basefile: The basefile of the resource we're calculating a filename for
:type basefile: str
:param maindir: The stage of processing, e.g. ``downloaded`` or ``parsed``
:type maindir: str
:param suffix: Appropriate file suffix, e.g. ``.txt`` or ``.pdf``
:param version: Optional. The archived version id
:type version: str
:param attachment: Optional. Any associated file needed by the main file.
:type attachment: str
:param storage_policy: Optional. Used to override `storage_policy` if needed
:type attachment: str
.. note::
This is a generic method with many parameters. In order to
keep your code tidy and and loosely coupled to the actual
storage policy, you should use methods like
:meth:`~ferenda.DocumentStore.downloaded_path` or :meth:`~ferenda.DocumentStore.parsed_path` when
possible.
Example:
>>> d = DocumentStore(datadir="/tmp/base")
>>> realsep = os.sep
>>> os.sep = "/"
>>> d.path('123/a', 'parsed', '.xhtml') == '/tmp/base/parsed/123/a.xhtml'
True
>>> d.storage_policy = "dir"
>>> d.path('123/a', 'parsed', '.xhtml') == '/tmp/base/parsed/123/a/index.xhtml'
True
>>> d.path('123/a', 'downloaded', None, 'r4711', 'appendix.txt') == '/tmp/base/archive/downloaded/123/a/r4711/appendix.txt'
True
>>> os.sep = realsep
:param basefile: The basefile for which to calculate the path
:type basefile: str
:param maindir: The processing stage directory (normally ``downloaded``, ``parsed``, or ``generated``)
:type maindin: str
:param suffix: The file extension including period (i.e. ``.txt``, not ``txt``)
:type suffix: str
:param version: Optional, the archived version id
:type version: str
:param attachment: Optional. Any associated file needed by the main file. Requires that ``storage_policy`` is set to ``dir``. ``suffix`` is ignored if this parameter is used.
:type attachment: str
:returns: The full filesystem path
:rtype: str
"""
pathfrag = self.basefile_to_pathfrag(basefile)
if not storage_policy:
storage_policy = self.storage_policy
if version:
v_pathfrag = self.basefile_to_pathfrag(version)
segments = [self.datadir,
'archive', maindir, pathfrag, v_pathfrag]
else:
segments = [self.datadir, maindir, pathfrag]
if storage_policy == "dir":
if attachment:
for illegal in ':/':
if illegal in attachment:
raise errors.AttachmentNameError(
"Char '%s' in attachment name '%s' not allowed" % (illegal, attachment))
segments.append(attachment)
else:
segments.append("index" + suffix)
else:
if attachment != None:
raise errors.AttachmentPolicyError(
"Can't add attachments (name %s) if "
"storage_policy != 'dir'" % attachment)
segments[-1] += suffix
unixpath = "/".join(segments)
if os.sep == "/":
return unixpath
else:
return unixpath.replace("/", os.sep)
[docs] @contextmanager
def open(self, basefile, maindir, suffix, mode="r", version=None, attachment=None):
"""Context manager that opens files for reading or writing. The
parameters are the same as for
:meth:`~ferenda.DocumentStore.path`, and the note is
applicable here as well -- use
:meth:`~ferenda.DocumentStore.open_downloaded`,
:meth:`~ferenda.DocumentStore.open_parsed` et al if possible.
Example:
>>> store = DocumentStore(datadir="/tmp/base")
>>> with store.open('123/a', 'parsed', '.xhtml', mode="w") as fp:
... res = fp.write("hello world")
>>> os.path.exists("/tmp/base/parsed/123/a.xhtml")
True
"""
filename = self.path(basefile, maindir, suffix, version, attachment)
fp = NamedTemporaryFile(mode, delete=False)
fp.realname = filename
try:
yield fp
finally:
tempname = fp.name
fp.close()
if not os.path.exists(filename) or not filecmp.cmp(tempname, filename):
util.ensure_dir(filename)
shutil.move(tempname, filename)
else:
os.unlink(tempname)
[docs] def list_basefiles_for(self, action, basedir=None):
"""Get all available basefiles that can be used for the
specified action.
:param action: The action for which to get available
basefiles (``parse``, ``relate``, ``generate``
or ``news``)
:type action: str
:param basedir: The base directory in which to search for
available files. If not provided, defaults to
``self.datadir``.
:type basedir: str
:returns: All available basefiles
:rtype: generator
"""
if not basedir:
basedir = self.datadir
directory = None
if action == "parse":
directory = os.path.sep.join((basedir, "downloaded"))
if self.storage_policy == "dir":
# If each document is stored in a separate directory,
# there is usually other auxillary files (attachments
# and whatnot) in that directory as well. Make sure we
# only yield a single file from each directory. By
# convention, the main file is called index.html,
# index.pdf or whatever.
# print("storage_policy dir: %s" % self.storage_policy)
suffix = os.sep + "index" + self.downloaded_suffix
else:
# print("storage_policy file: %s" % self.storage_policy)
suffix = self.downloaded_suffix
elif action == "relate":
directory = os.path.sep.join((basedir, "distilled"))
suffix = ".rdf"
elif action == "generate":
directory = os.path.sep.join((basedir, "parsed"))
if self.storage_policy == "dir":
suffix = os.sep + "index.xhtml"
else:
suffix = ".xhtml"
elif action == "news":
directory = os.path.sep.join((basedir, "entries"))
suffix = ".json"
# FIXME: fake action, needed for get_status. replace with
# something more elegant
elif action in ("_postgenerate"):
directory = os.path.sep.join((basedir, "generated"))
suffix = ".html"
if not directory:
raise ValueError("No directory calculated for action %s" % action)
if not os.path.exists(directory):
return
# FIXME: Some stores need a more sophisticated way of filtering than this.
for x in util.list_dirs(directory, suffix, reverse=True):
# ignore empty files placed by download (which may
# have done that in order to avoid trying to
# re-download nonexistent resources)
if os.path.exists(x) and os.path.getsize(x) > 0:
# get a pathfrag from full path
# suffixlen = len(suffix) if self.storage_policy == "file" else len(suffix) + 1
suffixlen = len(suffix)
x = x[len(directory) + 1:-suffixlen]
yield self.pathfrag_to_basefile(x)
[docs] def list_versions(self, basefile, action=None):
"""Get all archived versions of a given basefile.
:param basefile: The basefile to list archived versions for
:type basefile: str
:param action: The type of file to look for (either
``downloaded``, ``parsed`` or ``generated``. If
``None``, look for all types.
:type action: str
:returns: All available versions for that basefile
:rtype: generator
"""
if action:
assert action in ('downloaded', 'parsed', 'generated'), "Action %s invalid" % action
actions = (action,)
else:
actions = ('downloaded', 'parsed', 'generated')
basedir = self.datadir
pathfrag = self.basefile_to_pathfrag(basefile)
yielded_basefiles = []
for action in actions:
directory = os.sep.join((basedir, "archive",
action, pathfrag))
if not os.path.exists(directory):
continue
for x in util.list_dirs(directory, reverse=False):
if os.path.exists(x):
# /datadir/base/archive/downloaded/basefile/version.html
# => version.html
x = x[len(directory) + 1:]
if self.storage_policy == "dir":
# version/index.html => version
x = os.sep.join(x.split(os.sep)[:-1])
else:
# version.html => version
x = os.path.splitext(x)[0]
if os.sep in x:
# we didn't find an archived file for
# basefile, instead we found an archived file
# for another basefile that startswith our
# basefile (eg '123' and '123/a', and we found
# '123/a/4.html')
continue
# print("Found file %r %r" % (x, self.pathfrag_to_basefile(x)))
basefile = self.pathfrag_to_basefile(x)
if basefile not in yielded_basefiles:
yielded_basefiles.append(basefile)
yield basefile
[docs] def list_attachments(self, basefile, action, version=None):
"""Get all attachments for a basefile in a specified state
:param action: The state (type of file) to look for (either
``downloaded``, ``parsed`` or ``generated``. If
``None``, look for all types.
:type action: str
:param basefile: The basefile to list attachments for
:type basefile: str
:param version: The version of the basefile to list attachments for. If None, list attachments for the current version.
:type version: str
:returns: All available attachments for the basefile
:rtype: generator
"""
if self.storage_policy != "dir":
raise errors.AttachmentPolicyError("Can't list attachments if storage_policy != 'dir'")
basedir = self.datadir
# pathfrag = self.pathfrag_to_basefile(basefile) # that can't be right?
pathfrag = self.basefile_to_pathfrag(basefile)
if version:
v_pathfrag = self.basefile_to_pathfrag(version)
directory = os.sep.join((basedir, "archive", action, pathfrag, v_pathfrag))
else:
directory = os.sep.join((basedir, action, pathfrag))
# FIXME: Similar map exists in list_basefiles_for and in other
# places throughout the code. Should subclasses be able to
# control suffixes beyond the simple self.downloaded_suffix
# mechanism?
suffixmap = {'downloaded': self.downloaded_suffix,
'parsed': '.xhtml',
'generated': '.html'}
mainfile = "index" + suffixmap[action]
for x in util.list_dirs(directory, reverse=False):
# /datadir/base/downloaded/basefile/attachment.txt => attachment.txt
x = x[len(directory) + 1:]
if x != mainfile:
yield x
[docs] def basefile_to_pathfrag(self, basefile):
"""Given a basefile, returns a string that can safely be used
as a fragment of the path for any representation of that
file. The default implementation recognizes a number of
characters that are unsafe to use in file names and replaces
them with HTTP percent-style encoding.
Example:
>>> d = DocumentStore("/tmp")
>>> realsep = os.sep
>>> os.sep = "/"
>>> d.basefile_to_pathfrag('1998:204') == '1998/%3A204'
True
>>> os.sep = realsep
If you wish to override how document files are stored in
directories, you can override this method, but you should make
sure to also override
:py:meth:`~ferenda.DocumentStore.pathfrag_to_basefile` to
work as the inverse of this method.
:param basefile: The basefile to encode
:type basefile: str
:returns: The encoded path fragment
:rtype: str
"""
safe = '/;@&=+,'
if six.PY2:
# urllib.quote in python 2 cannot handle unicode values
# for the s parameter (2.6 cannot even handle unicode
# values for the safe parameter). FIXME: We should create
# a shim as ferenda.compat.quote and use that
basefile = basefile.encode('utf-8')
safe = safe.encode('ascii') # pragma: no cover
return quote(basefile, safe=safe).replace('%', os.sep + '%')
[docs] def pathfrag_to_basefile(self, pathfrag):
"""Does the inverse of
:py:meth:`~ferenda.DocumentStore.basefile_to_pathfrag`,
that is, converts a fragment of a file path into the
corresponding basefile.
:param pathfrag: The path fragment to decode
:type pathfrag: str
:returns: The resulting basefile
:rtype: str
"""
if os.sep == "\\":
pathfrag = pathfrag.replace("\\", "/")
return unquote(pathfrag.replace('/%', '%'))
[docs] def archive(self, basefile, version):
"""Moves the current version of a document to an archive. All
files related to the document are moved (downloaded, parsed,
generated files and any existing attachment files).
:param basefile: The basefile of the document to archive
:type basefile: str
:param version: The version id to archive under
:type version: str
"""
for meth in (self.downloaded_path, self.documententry_path,
self.parsed_path, self.serialized_path,
self.distilled_path,
self.annotation_path, self.generated_path):
# FIXME: what about intermediate? Ignore them as they
# should be able to be regenerated at any time?
src = meth(basefile)
dest = meth(basefile, version)
if self.storage_policy == "dir" and meth in (self.downloaded_path,
self.parsed_path,
self.generated_path):
src = os.path.dirname(src)
dest = os.path.dirname(dest)
if not os.path.exists(src):
continue
if os.path.exists(dest):
raise errors.ArchivingError(
"Archive destination %s for basefile %s version %s already exists!" % (dest, basefile, version))
# self.log.debug("Archiving %s to %s" % (src,dest))
# print("Archiving %s to %s" % (src,dest))
util.ensure_dir(dest)
shutil.move(src, dest)
[docs] def downloaded_path(self, basefile, version=None, attachment=None):
"""Get the full path for the downloaded file for the given
basefile (and optionally archived version and/or attachment
filename).
:param basefile: The basefile for which to calculate the path
:type basefile: str
:param version: Optional. The archived version id
:type version: str
:param attachment: Optional. Any associated file needed by the main file.
:type attachment: str
:returns: The full filesystem path
:rtype: str
"""
return self.path(basefile, 'downloaded',
self.downloaded_suffix, version, attachment)
[docs] def open_downloaded(self, basefile, mode="r", version=None, attachment=None):
"""Opens files for reading and writing,
c.f. :meth:`~ferenda.DocumentStore.open`. The parameters are
the same as for
:meth:`~ferenda.DocumentStore.downloaded_path`.
"""
filename = self.downloaded_path(basefile, version, attachment)
return self._open(filename, mode)
[docs] def documententry_path(self, basefile, version=None):
"""Get the full path for the documententry JSON file for the given
basefile (and optionally archived version).
:param basefile: The basefile for which to calculate the path
:type basefile: str
:param version: Optional. The archived version id
:type version: str
:returns: The full filesystem path
:rtype: str
"""
return self.path(basefile, 'entries', '.json', version, storage_policy="file")
[docs] def parsed_path(self, basefile, version=None, attachment=None):
"""Get the full path for the parsed XHTML file for the given
basefile.
:param basefile: The basefile for which to calculate the path
:type basefile: str
:param version: Optional. The archived version id
:type version: str
:param attachment: Optional. Any associated file needed by the
main file (created by
:py:meth:`~ferenda.DocumentRepository.parse`)
:type attachment: str
:returns: The full filesystem path
:rtype: str
"""
return self.path(basefile, 'parsed', '.xhtml',
version, attachment)
[docs] def open_parsed(self, basefile, mode="r", version=None, attachment=None):
"""Opens files for reading and writing,
c.f. :meth:`~ferenda.DocumentStore.open`. The parameters are
the same as for
:meth:`~ferenda.DocumentStore.parsed_path`.
"""
filename = self.parsed_path(basefile, version, attachment)
return self._open(filename, mode)
[docs] def serialized_path(self, basefile, version=None, attachment=None):
"""Get the full path for the serialized JSON file for the given
basefile.
:param basefile: The basefile for which to calculate the path
:type basefile: str
:param version: Optional. The archived version id
:type version: str
:returns: The full filesystem path
:rtype: str
"""
return self.path(basefile, 'serialized', '.json',
version, storage_policy="file")
[docs] def open_serialized(self, basefile, mode="r", version=None):
"""Opens files for reading and writing,
c.f. :meth:`~ferenda.DocumentStore.open`. The parameters are
the same as for
:meth:`~ferenda.DocumentStore.serialized_path`.
"""
filename = self.serialized_path(basefile, version)
return self._open(filename, mode)
[docs] def distilled_path(self, basefile, version=None):
"""Get the full path for the distilled RDF/XML file for the given
basefile.
:param basefile: The basefile for which to calculate the path
:type basefile: str
:param version: Optional. The archived version id
:type version: str
:returns: The full filesystem path
:rtype: str
"""
return self.path(basefile, 'distilled', '.rdf',
version, storage_policy="file")
[docs] def open_distilled(self, basefile, mode="r", version=None):
"""Opens files for reading and writing,
c.f. :meth:`~ferenda.DocumentStore.open`. The parameters are
the same as for
:meth:`~ferenda.DocumentStore.distilled_path`.
"""
filename = self.distilled_path(basefile, version)
return self._open(filename, mode)
[docs] def generated_path(self, basefile, version=None, attachment=None):
"""Get the full path for the generated file for the given
basefile (and optionally archived version and/or attachment
filename).
:param basefile: The basefile for which to calculate the path
:type basefile: str
:param version: Optional. The archived version id
:type version: str
:param attachment: Optional. Any associated file needed by the main file.
:type attachment: str
:returns: The full filesystem path
:rtype: str
"""
return self.path(basefile, 'generated', '.html',
version, attachment)
# Removed this method until I find a reason to use it
#
# def open_generated(self, basefile, mode="r", version=None, attachment=None):
# """Opens files for reading and writing,
# c.f. :meth:`~ferenda.DocumentStore.open`. The parameters are
# the same as for
# :meth:`~ferenda.DocumentStore.generated_path`.
#
# """
# filename = self.generated_path(basefile, version, attachment)
# return self._open(filename, mode)
[docs] def annotation_path(self, basefile, version=None):
"""Get the full path for the annotation file for the given
basefile (and optionally archived version).
:param basefile: The basefile for which to calculate the path
:type basefile: str
:param version: Optional. The archived version id
:type version: str
:returns: The full filesystem path
:rtype: str
"""
return self.path(basefile, 'annotations', '.grit.xml',
version, storage_policy="file")
[docs] def open_annotation(self, basefile, mode="r", version=None):
"""Opens files for reading and writing,
c.f. :meth:`~ferenda.DocumentStore.open`. The parameters are
the same as for
:meth:`~ferenda.DocumentStore.annotation_path`."""
filename = self.annotation_path(basefile, version)
return self._open(filename, mode)
[docs] def dependencies_path(self, basefile):
"""Get the full path for the dependency file for the given
basefile
:param basefile: The basefile for which to calculate the path
:type basefile: str
:returns: The full filesystem path
:rtype: str
"""
return self.path(basefile, 'deps', '.txt', storage_policy="file")
[docs] def open_dependencies(self, basefile, mode="r"):
"""Opens files for reading and writing,
c.f. :meth:`~ferenda.DocumentStore.open`. The parameters are
the same as for
:meth:`~ferenda.DocumentStore.dependencies_path`."""
filename = self.dependencies_path(basefile)
return self._open(filename, mode)
[docs] def atom_path(self, basefile):
"""Get the full path for the atom file for the given
basefile
.. note::
This is used by :meth:`ferenda.DocumentRepository.news` and
does not really operate on "real" basefiles. It might be
removed. You probably shouldn't use it unless you override
:meth:`~ferenda.DocumentRepository.news`
:param basefile: The basefile for which to calculate the path
:type basefile: str
:returns: The full filesystem path
:rtype: str
"""
return self.path(basefile, 'feed', '.atom', storage_policy="file")