Source code for ferenda.pdfreader

# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

from bz2 import BZ2File
from glob import glob
from io import BytesIO
from time import sleep
import itertools
import logging
import os
import re
import shutil
import subprocess
import tempfile
import warnings
import unicodedata
from lxml import etree
from lxml.builder import ElementMaker
from layeredconfig import LayeredConfig, Defaults
from cached_property import cached_property

from ferenda import util, errors
from ferenda.fsmparser import Peekable
from ferenda.elements import serialize
from ferenda.elements import UnicodeElement, CompoundElement, OrdinalElement

E = ElementMaker(namespace="http://www.w3.org/1999/xhtml",
                 nsmap={None: "http://www.w3.org/1999/xhtml"})

class PDFReader(CompoundElement):

    """Parses PDF files and makes the content available as an object
    hierarchy. Calling the :py:meth:`~ferenda.PDFReader.read` method
    returns a :py:class:`ferenda.pdfreader.PDFFile` object, which is a
    list of :py:class:`ferenda.pdfreader.Page` objects, each of which is
    a list of :py:class:`ferenda.pdfreader.Textbox` objects, each of
    which is a list of :py:class:`ferenda.pdfreader.Textelement` objects.

    .. note::

       This class depends on the command line tool pdftohtml from
       `poppler <http://poppler.freedesktop.org/>`_.

       The class can also handle any other type of document (such as
       Word/OOXML/WordPerfect/RTF) that OpenOffice or LibreOffice
       handles, by first converting it to PDF using the ``soffice``
       command line tool (which then must be in your ``$PATH``).

       If the PDF contains only scanned pages (without any OCR
       information), the pages can be run through the ``tesseract``
       command line tool (which, again, needs to be in your ``$PATH``).
       You need to provide the main language of the document as the
       ``ocr_lang`` parameter, and you need to have installed the
       tesseract language files for that language.

    """

    ################################################################
    # properties and methods relating to the initialization of the
    # PDFReader object

    detect_footnotes = True

    def __init__(self, pages=None, filename=None, workdir=None, images=True,
                 convert_to_pdf=False, keep_xml=True, ocr_lang=None,
                 fontspec=None, textdecoder=None):
        """Initializes a PDFReader object from an existing PDF file. After
        initialization, the PDFReader contains a list of
        :py:class:`~ferenda.pdfreader.Page` objects.

        :param pages: Internal parameter. You should not specify this.
                      Specify all other parameters using keywords.
        :param filename: The full path to the PDF file (or, if
                         ``convert_to_pdf`` is set, any other document file)
        :param workdir: A directory where intermediate files (particularly
                        background PNG files) are stored. If not provided,
                        a temporary directory will be created and be
                        available as the ``workdir`` property of the object.
        :param convert_to_pdf: If filename is any type of document other
                               than PDF, attempt to first convert it to PDF
                               using the ``soffice`` command line tool (from
                               OpenOffice/LibreOffice).
        :type convert_to_pdf: bool
        :param keep_xml: If False, remove the intermediate XML
                         representation of the PDF that gets created in
                         ``workdir``. If True, keep it around to speed up
                         subsequent parsing operations. If set to the
                         special value ``"bz2"``, keep it but compress it
                         with :py:mod:`bz2`.
        :type keep_xml: bool
        :param ocr_lang: If provided, PDFReader will extract scanned images
                         from the PDF file and run an OCR program on them,
                         using ``ocr_lang`` as the language hint. (Note that
                         this is not necessarily an IETF language tag like
                         "sv" or "en-GB", but rather whatever the underlying
                         ``tesseract`` program uses.)
        :type ocr_lang: str

        """
        self.log = logging.getLogger('pdfreader')
        if pages:
            # special-case: The object has been initialized as a
            # regular list (by deserialize), we have no need to
            # parse and create pages.
return if not filename: return # another specialcase: create an empty object so # that we can call the ._tesseract in other # scenarios self.fontspec = fontspec or {} self.filename = filename self.workdir = workdir if self.workdir is None: self.workdir = tempfile.mkdtemp() if textdecoder is None: self._textdecoder = BaseTextDecoder() else: self._textdecoder = textdecoder # FIXME: For testing, we'd like to avoid this conversation if # we already have the real_convertedfile that we'll end up # with, in order to not convert to PDF needlessly if convert_to_pdf: newfilename = workdir + os.sep + \ os.path.splitext(os.path.basename(filename))[0] + ".pdf" if not os.path.exists(newfilename): util.ensure_dir(newfilename) cmdline = "soffice --headless --convert-to pdf --outdir '%s' %s" % ( workdir, filename) self.log.debug("%s: Converting to PDF: %s" % (filename, cmdline)) (ret, stdout, stderr) = util.runcmd( cmdline, require_success=True) filename = newfilename assert os.path.exists(filename), "PDF %s not found" % filename basename = os.path.basename(filename) stem = os.path.splitext(basename)[0] if ocr_lang: suffix = ".hocr.html" converter = self._tesseract converter_extra = {'lang': ocr_lang} parser = self._parse_hocr else: suffix = ".xml" converter = self._pdftohtml converter_extra = {'images': images, 'keeppdffile': convert_to_pdf} parser = self._parse_xml convertedfile = os.sep.join([workdir, stem + suffix]) if keep_xml == "bz2": real_convertedfile = convertedfile + ".bz2" else: real_convertedfile = convertedfile tmpfilename = os.sep.join([workdir, basename]) # copying the filename to the workdir is only needed if we use # PDFReader._pdftohtml if not util.outfile_is_newer([filename], real_convertedfile): util.copy_if_different(filename, tmpfilename) # this is the expensive operation res = converter(tmpfilename, workdir, **converter_extra) # print("contents of workdir %s after conversion: %r" % (workdir, os.listdir(workdir))) if keep_xml == "bz2": with open(convertedfile, mode="rb") as rfp: # BZ2File supports the with statement in py27+, # but we support py2.6 wfp = BZ2File(real_convertedfile, "wb") wfp.write(rfp.read()) wfp.close() os.unlink(convertedfile) else: # keep_xml = True pass else: # print("outfile_is_newer returned True: real_convertedfile: %s (%s)" % (real_convertedfile, os.path.exists(real_convertedfile))) pass if not os.path.exists(real_convertedfile): print("%s don't exist -- parsing will fail!" % real_convertedfile) print("%s has the following files: %s" % (workdir, os.listdir(workdir))) # it's important that we open the file as a bytestream since # we might do byte-level manipulation in _parse_xml. 
if keep_xml == "bz2": fp = BZ2File(real_convertedfile) else: fp = open(real_convertedfile, "rb") res = parser(fp) fp.close() if keep_xml == False: os.unlink(convertedfile) return res def _tesseract(self, pdffile, workdir, lang, hocr=True): root = os.path.splitext(os.path.basename(pdffile))[0] # step 0: copy the pdf into a temp dir (which is probably on # local disk, saving us some network traffic if the pdf file # is huge and on a NFS mount somewhere) tmpdir = tempfile.mkdtemp() tmppdffile = os.sep.join([tmpdir, os.path.basename(pdffile)]) util.copy_if_different(pdffile, tmppdffile) # step 1: find the number of pages cmd = "pdfinfo %s" % tmppdffile (returncode, stdout, stderr) = util.runcmd(cmd, require_success=True) m = re.search(r"Pages:\s+(\d+)", stdout) number_of_pages = int(m.group(1)) self.log.debug("%(root)s.pdf has %(number_of_pages)s pages" % locals()) # step 2: extract the images (should be one per page), 10 # pages at a time (pdfimages flakes out on larger loads) to_int = int for idx, i in enumerate(range(int(number_of_pages / 10) + 1)): frompage = (i * 10) + 1 topage = min((i + 1) * 10, number_of_pages) if frompage > topage: continue # if the PDF contains embedded JPG images, extract them # as-is. Other embedded formats (JPEG2000, JBIG2, CCITT) # are converted to PNG. cmd = "pdfimages -png -j -p -f %(frompage)s -l %(topage)s %(tmppdffile)s %(tmpdir)s/%(root)s" % locals( ) self.log.debug("- running " + cmd) (returncode, stdout, stderr) = util.runcmd(cmd, require_success=True) # step 2.1: convert and combine the recently extracted # images (which can be ppm, jpg, ccitt or whatever) into a # new tif (so that we add 10 pages at a time to the tif, # as imagemagick can create a number of pretty large files # for each page, so converting 200 images will fill 10 G # of your temp space -- which we'd like to avoid) imagefiles = glob("%(tmpdir)s/%(root)s-*" % locals()) if len(imagefiles) < topage - (frompage - 1): self.log.warning("Expected to find %s images from running '%s', found %s" % ( topage - (frompage - 1), cmd, len(imagefiles))) # it's entirely possible (for pdf containing real # blank pages, ie w/o scanned data, that we can # end up with zero pages. We'll just have to go # on. if imagefiles: cmd = "convert %(tmpdir)s/%(root)s-* -compress Zip %(tmpdir)s/%(root)s_tmp%(idx)04d.tif" % locals() self.log.debug("- running " + cmd) (returncode, stdout, stderr) = util.runcmd(cmd, require_success=True) # step 2.2: Remove extracted image files now that they're in the .tif for f in imagefiles: os.unlink(f) # Step 3: Combine all the 10-page tifs into a giant tif using tiffcp # FIXME: This can take more than 60 seconds, in which case # nginx (when running this code via change-parse-options) # might time out. One way would be to use raw subprocess.Popen # and occasionally write status updates to the log (like with # tesseract below). tiffcp itself doesn't seem to be very # verbose though. cmd = "tiffcp -c zip %(tmpdir)s/%(root)s_tmp*.tif %(tmpdir)s/%(root)s.tif" % locals() self.log.debug("- running " + cmd) # (returncode, stdout, stderr) = util.runcmd(cmd, require_success=True) process = subprocess.Popen(cmd, shell=True) timer = 0 while process.poll() == None: timer += 1 if not (timer % 100): self.log.debug("tiffcp processing: %s s elapsed..." 
% str(timer/10)) sleep(0.1) returncode = process.poll() if returncode != 0: stdout, stderr = process.communicate() raise errors.ExternalCommandError(stderr) # Step 3: OCR the giant tif file to create a .hocr.html file # Note that -psm 1 (automatic page segmentation with # orientation and script detection) requires the installation # of tesseract-ocr-3.01.osd.tar.gz usehocr = "hocr" if hocr else "" suffix = ".hocr" if hocr else "" pagebreaks = "-c include_page_breaks=1" if not hocr else "" # Tesseract 4.0 removes this option cmd = "tesseract %(tmpdir)s/%(root)s.tif %(tmpdir)s/%(root)s%(suffix)s -l %(lang)s -psm 1 %(usehocr)s %(pagebreaks)s" % locals( ) self.log.debug("running " + cmd) # run the command in a more involved way so that we can log its' progress process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) outputs = [] for line in iter(process.stdout.readline, b''): output = line.decode("utf-8", errors="ignore").strip() outputs.append(output) if output.startswith("Page "): self.log.debug("OCR processed: %s" % output) returncode = process.poll() if returncode != 0: raise errors.ExternalCommandError(outputs[-1]) if hocr: # Step 4: Later versions of tesseract adds a automatic .hocr # suffix, while earlier versions add a automatic .html. Other # parts of the code expects the .html suffix, so we check to # see if we have new-tesseract behaviour and compensate. if os.path.exists("%(tmpdir)s/%(root)s%(suffix)s.hocr" % locals()): util.robust_rename("%(tmpdir)s/%(root)s%(suffix)s.hocr" % locals(), "%(tmpdir)s/%(root)s%(suffix)s.html" % locals()) # Step 5: Move our hOCR file to the workdir, then cleanup util.robust_rename("%(tmpdir)s/%(root)s%(suffix)s.html" % locals(), "%(workdir)s/%(root)s%(suffix)s.html" % locals()) else: util.robust_rename("%(tmpdir)s/%(root)s.txt" % locals(), "%(workdir)s/%(root)s.txt" % locals()) shutil.rmtree(tmpdir) def _pdftohtml(self, tmppdffile, workdir, images, keeppdffile): root = os.path.splitext(os.path.basename(tmppdffile))[0] try: if images: # two pass coding: First use -c (complex) to extract # background pictures, then use -xml to get easy-to-parse # text with bounding boxes. cmd = "pdftohtml -nodrm -c %s" % tmppdffile self.log.debug("Converting with images: %s" % cmd) (returncode, stdout, stderr) = util.runcmd(cmd, require_success=True) # print("1: ran %s (%s), stdout %r, stderr %r" % (cmd, returncode, stdout, stderr)) # print("contents of %s is now %r" % (workdir, os.listdir(workdir))) # we won't need the html files, or the blank PNG files for f in os.listdir(workdir): if f.startswith(root) and f.endswith(".html"): os.unlink(workdir + os.sep + f) elif f.startswith(root) and f.endswith(".png"): # this checks the number of unique colors in the # bitmap. If there's only one color, we don't need # the file (returncode, stdout, stderr) = util.runcmd( 'convert %s -format "%%k" info:' % (workdir + os.sep + f)) if stdout.strip() == "1": os.unlink(workdir + os.sep + f) else: self.log.debug("Keeping non-blank image %s" % f) # imgflag = "-i" if not images else "" # Change in how we treat images: As we've extracted # background pictures above, we don't really need to # extract each individual image again. Also, for some PDFs # (FFFS 2011:34, an image is generated for most every # non-text dot, resulting in thousands of images per # page. So always ignore images. imgflag = "-i" # Without -fontfullname, all fonts are just reported as # having family="Times"... 
# Without -hidden, some scanned-and-OCR:ed files turn up # empty cmd = "pdftohtml -nodrm -xml -fontfullname -hidden %s %s" % (imgflag, tmppdffile) self.log.debug("Converting: %s" % cmd) (returncode, stdout, stderr) = util.runcmd(cmd, require_success=True) # print("2: ran %s (%s), stdout %r, stderr %r" % (cmd, returncode, stdout, stderr)) # print("contents of %s is now %r" % (workdir, os.listdir(workdir))) xmlfile = os.path.splitext(tmppdffile)[0] + ".xml" # if pdftohtml fails (if it's an old version that doesn't # support the fullfontname flag) it still uses returncode # 0! Only way to know if it failed is to inspect stderr # and look for if the xml file wasn't created. if stderr and not os.path.exists(xmlfile): raise errors.ExternalCommandError(stderr) fontinfofile = "%s.fontinfo" % xmlfile maxlen = os.statvfs(os.path.dirname(fontinfofile)).f_namemax if maxlen < len(os.path.basename(fontinfofile)): fontinfofile = os.path.dirname(fontinfofile) + os.sep + os.path.basename(fontinfofile)[:maxlen] cmd = "pdffonts %s > %s" % (tmppdffile, fontinfofile) self.log.debug("Getting font info: %s" % cmd) (returncode, stdout, stderr) = util.runcmd(cmd, require_success=True) # print("3: ran %s (%s), stdout %r, stderr %r" % (cmd, returncode, stdout, stderr)) # print("contents of %s is now %r" % (workdir, os.listdir(workdir))) finally: if not keeppdffile: os.unlink(tmppdffile) assert not os.path.exists(tmppdffile), "tmppdffile still there:" + tmppdffile dims = r"bbox (?P<left>\d+) (?P<top>\d+) (?P<right>\d+) (?P<bottom>\d+)" re_dimensions = re.compile(dims).search def _parse_hocr(self, fp, dummy=None): if dummy: warnings.warn("filenames passed to _parse_hocr are now ignored", DeprecationWarning) def dimensions(s): m = self.re_dimensions(s) return dict([(k, round(int(v) / px_per_point)) for (k, v) in m.groupdict().items()]) tree = etree.parse(fp) for pageelement in tree.findall( "//{http://www.w3.org/1999/xhtml}div[@class='ocr_page']"): pageheight_in_inch = 11.69 # A4 page -- FIXME: use real page dimensions pointsize = 1 / 72 pageheight_in_points = pageheight_in_inch / pointsize bbox = self.re_dimensions(pageelement.get('title')) px_per_point = (int(bbox.group("bottom")) - int(bbox.group("top"))) / pageheight_in_points dim = dimensions(pageelement.get('title')) page = Page(number=int(pageelement.get('id')[5:]), width=dim['right'] - dim['left'], height=dim['bottom'] - dim['top'], src=None, background=None) # we discard elements at the ocr_carea (content area?) # level, we're only concerned with paragraph-level # elements, which we use ocr_line for (to be consistent # with _parse_xml). However, if those element are wrapped # in ocr_par elements, then tesseract has indictated # paragraph-level segmentation which we make use of. 
for boxelement in pageelement.findall( ".//{http://www.w3.org/1999/xhtml}span[@class='ocr_line']"): boxdim = dimensions(boxelement.get('title')) textelements = [] par = boxelement.find("..[@class='ocr_par']") if par is not None: parid = par.get('id') else: parid = None for element in boxelement.findall( ".//{http://www.w3.org/1999/xhtml}span[@class='ocrx_word']"): dim = dimensions(element.get("title")) t = "".join(element.itertext()) + element.tail if not t.strip(): continue # strip empty things t = t.replace("\n", " ") if element.getchildren(): # probably a <em> or <strong> element tag = {'{http://www.w3.org/1999/xhtml}em': 'i', '{http://www.w3.org/1999/xhtml}strong': 'b'}[element.getchildren()[0].tag] else: tag = None text = Textelement(t, tag=tag, top=dim['top'], left=dim['left'], width=dim['right'] - dim['left'], height=dim['bottom'] - dim['top']) textelements.append(text) # try to determine footnotes by checking if first # element is numeric and way smaller than the # others. in that case, set it's tag to "sup" (for # superscript) if len(textelements) == 0: continue # the box didn't contain any real text, only lines of whitespace avgheight = sum([x.height for x in textelements]) // len(textelements) if textelements[0].strip().isdigit() and textelements[0].height <= avgheight / 2: textelements[0].tag = "sup" # Now that we know all text elements that should be in # the Textbox, we can guess the font size. fontspec = {'family': "unknown", 'size': avgheight} # find any previous definition of this fontspec fontid = None for specid, spec in self.fontspec.items(): if (fontspec['size'] == spec['size'] and fontspec['family'] == spec['family']): fontid = specid # None was found, create a new if not fontid: fontid = str(len(self.fontspec)) # start at 0 fontspec['id'] = fontid self.fontspec[fontid] = fontspec # finally create the box and add all our elements # (should not be more than one?) to it kwargs = {'top': boxdim['top'], 'left': boxdim['left'], 'width': boxdim['right'] - boxdim['left'], 'height': boxdim['bottom'] - boxdim['top'], 'fontspec': self.fontspec, 'fontid': fontid} if parid: kwargs['parid'] = parid kwargs['pdf'] = self box = Textbox(**kwargs) for e in textelements: box.append(e) page.append(box) self.append(page) self.log.debug("PDFReader initialized: %d pages" % (len(self))) def _parse_xml(self, xmlfp, dummy=None): filename = util.name_from_fp(xmlfp) # first up, try to locate a fontinfo.txt file fontinfo = {} fields = [] fonttypemap = {"Type 1": "Type1", "Type 1C": "Type1C", "Type 1C (OT)": "Type1C(OT)", "Type 3": "Type3", "TrueType (OT)": "TrueType(OT)", "CID Type 0": "CIDType0", "CID Type 0C": "CIDType0C", "CID Type 0C (OT)": "CIDType0C(OT)", "CID TrueType": "CIDTrueType", "CID TrueType (OT)": "CIDTrueType(OT)"} fontinfofile = filename.replace(".bz2", "") + ".fontinfo" # print("Looking for %s (%s)" % (fontinfofile, os.path.exists(fontinfofile))) if os.path.exists(fontinfofile): with open(fontinfofile) as fp: for line in fp: if not fields: fields = line.split() elif not line.startswith("-----"): # remove all spaces in the "type" column by # knowing possible values for k in fonttypemap: if k in line: line = line.replace(k, fonttypemap[k]) # NOW we can finally split on whitespace cols = line.split() if cols[0] not in fontinfo: # the output from # pdffonts might # include several # fonts with the # same family... 
fontinfo[cols[0]] = dict(zip(fields, cols)) if dummy: warnings.warn("filenames passed to _parse_xml are now ignored", DeprecationWarning) def txt(element_text): return re.sub(r"[\s\xa0\xc2]+", " ", str(element_text)) self.log.debug("Loading %s" % filename) if "Custom" in [f.get("encoding") for f in fontinfo.values()]: # the xmlfp might contain 0x03 (ctrl-C) for text nodes # using a custom encoding, where space is really # meant. Also a lot of other chars from 0x04 -- 0x19. It's # a bug in pdftohtml that such invalid chars are # included. Unfortunately, lxml/libxml seem to strip these # invalid chars when parsing, before Textbox.decode can # access it. So we preprocess the bytestream (in-memory -- # a custom wrapped codec would be better but more # complicated) to change these to xml numeric character # references # newfp = BytesIO() buffer = xmlfp.read() if not isinstance(buffer, bytes): self.log.warning("File %s was opened in text, not binary mode" % util.name_from_fp(xmlfp)) buffer = bytes(buffer.encode("utf-8")) else: # convert to a py3 style bytes() object (one that # returns ints, not strs, when iterating over it) buffer = bytes(buffer) # NOTE: That bug in pdftohtml has now been fixed # (https://bugs.freedesktop.org/show_bug.cgi?id=101770) in # poppler 0.57. If our poppler is that version or newer, # we're SOL. Look at the top of buffer to determine # poppler version number and crap out if not compatible. version = re.search('version="([^"]*)"', buffer[40:150].decode()).group(1) version = [int(x) for x in version.split(".")] if version >= [0, 57]: raise errors.ExternalCommandError("Your version of pdftohtml (poppler-utils) is too new " "and lacks a bug that is required to access text using " "custom encodings. We are so very sorry.") for idx, b in enumerate(buffer): # leave some control chars as-is (CR/LF but not TAB) if b < 0x20 and b not in (0xa, 0xd): # note: We don't use real xml numeric character # references as "&#3;" as this is just as invalid # as a real 0x03 byte in XML. Instead we # double-escape it. entity = "&amp;#%s;" % b newfp.write(entity.encode()) else: # newfp.write(six.int2byte(b)) newfp.write(bytes((b,))) newfp.seek(0) xmlfp = newfp try: root = etree.parse(xmlfp).getroot() except etree.XMLSyntaxError as e: self.log.debug( "pdftohtml created incorrect markup, trying to fix using BeautifulSoup: %s" % e) xmlfp.seek(0) from bs4 import BeautifulSoup soup = BeautifulSoup(xmlfp, "lxml") xmlfp = BytesIO(str(soup).encode("utf-8")) xmlfp.name = filename # now the root node hierarchy is # <html><body><pdf2xml><page>..., not # <pdf2xml><page>... So just skip the top two levels root = etree.parse(xmlfp).getroot()[0][0] self.log.debug("BeautifulSoup workaround successful") assert root.tag == "pdf2xml", "Unexpected root node from pdftohtml -xml: %s" % root.tag # We're experimenting with a auto-detecting decoder, which # needs a special API call in order to do the detection. 
If # this turns out to be a good idea we'll rework it into an # official subclass of BaseTextDecoder (maybe # AnalyzingTextDecoder) and test with isinstance if hasattr(self._textdecoder, 'analyze_font'): self._analyze_font_encodings(root, fontinfo) for pageelement in root: lastbox = None if pageelement.tag == "outline": # FIXME: We should do something with this information continue elif pageelement.tag is etree.Comment: # NOTE: Comments are never created by pdftohtml, but # might be present in testcases continue assert pageelement.tag == "page", "Got <%s>, expected <page>" % page.tag page = Page(number=int(pageelement.get('number')), # alwaysint? width=int(pageelement.get('width')), height=int(pageelement.get('height')), src=None, background=None) basename = os.path.splitext(filename)[0] if filename.endswith(".bz2"): basename = os.path.splitext(basename)[0] background = "%s%03d.png" % ( basename, page.number) # Reasons this file might not exist: it was blank and # therefore removed, or We're running under RepoTester if os.path.exists(background): page.background = background after_footnote = False peekable_pageelements = Peekable(pageelement) for element in peekable_pageelements: if element.tag is etree.Comment: continue if element.tag == 'image': # FIXME: do something clever with these continue if element.tag == 'fontspec': self._parse_xml_add_fontspec(element, fontinfo, self.fontspec) continue assert element.tag == 'text', "Got <%s>, expected <text>" % element.tag # eliminate "empty" textboxes, including "<text><i> </i></text>\n" if (((element.text and txt(element.text).strip() == "") or (element.text is None)) and not element.getchildren()): # print "Skipping empty box" continue if len(page) > 0: lastbox = page[-1] try: nextelement = peekable_pageelements.peek() except StopIteration: nextelement = None box = self._parse_xml_make_textbox(element, nextelement, after_footnote, lastbox, page) if box is None: # might consist entirely of empty, # therefore skipped, textelements. 
continue # we need to distinguish between inline footnote # markers (should go with preceeding textbox): if hasattr(box, 'merge-with-current'): delattr(box, 'merge-with-current') page[-1] = page[-1] + box after_footnote = True # and footer footnote markers (should create a new # textbox): elif len(box) and box[0].tag and (box[0].tag.endswith(("sup", "s"))): page.append(box) after_footnote = True elif (after_footnote and abs(page[-1].right - box.left) < 3): page[-1] = page[-1] + box after_footnote = False else: page.append(box) # done reading the page self.append(page) self.log.debug("PDFReader initialized: %d pages, %d fontspecs" % (len(self), len(self.fontspec))) def _parse_xml_make_textbox(self, element, nextelement, after_footnote, lastbox, page): textelements = self._parse_xml_make_textelement(element) attribs = dict(element.attrib) thisfont = self.fontspec[int(element.get('font'))] lastfont = lastbox.font if lastbox else None nextfont = self.fontspec[int(nextelement.get('font'))] if nextelement is not None and nextelement.get('font') else None if self.detect_footnotes: if (len(textelements) and (textelements[0].strip().isdigit() and # check both previous box and next (for catching footnote markers in the foooter) (lastfont and lastfont.family == thisfont['family'] and lastfont.size > thisfont['size'] and # final test that footnote marker is in the vincinity # of the text it's a footnote for (ie not a random # pagenumber): -5 < int(element.get("left")) - lastbox.right < 10 and # is really close to 0 < lastbox.bottom - (int(element.get("top")) + int(element.get("height"))) < 20) # is slightly lower than or (nextfont and thisfont['family'] == nextfont['family'] and thisfont['size'] < nextfont['size'] and -5 < int(nextelement.get("left")) - (int(element.get("left")) + int(element.get("width"))) < 10 and # is really close to 0 < (int(nextelement.get("top")) + int(nextelement.get("height"))) - (int(element.get("top")) + int(element.get("height"))) < 20) # is slightly lower than )): # this must be a footnote -- alter tag to show that it # should be rendered with superscript if textelements[0].tag is None: textelements[0].tag = "" if isinstance(textelements[0], LinkedTextelement) or textelements[0].tag: textelements[0].tag += "s" else: textelements[0].tag = "sup" # is it in the main text, ie immediately # following the last textbox? Then append it to that textbox if lastbox and abs(lastbox.right - int(attribs['left'])) < 3: # return a Box that the caller will merge with current attribs['fontid'] = attribs.pop('font') attribs['merge-with-current'] = True return Textbox(textelements, **attribs) elif min([x.left for x in page] + [0]) - int(attribs['left']) < 3: # then create a new textbox and let # the after_footnote magic append more # textboxes to it. Note: we don't use # the small font used in the footnote # marker, but rather peek at the # normal-sized font that immediately # follows. 
Also the box placement and # height is determined by the # following element if nextelement is not None: attribs['fontspec'] = self.fontspec attribs['fontid'] = int(nextelement.attrib['font']) attribs['top'] = nextelement.attrib['top'] attribs['height'] = nextelement.attrib['height'] attribs['pdf'] = self del attribs['font'] return self._textdecoder(Textbox(textelements, **attribs), self.fontspec) else: self.log.debug("Text element %s (%s) looks like a footnote " "marker, but not in main text nor footer " "area" % (serialize(textelements[0]).strip(), attribs)) elif (after_footnote and lastfont.family == thisfont['family'] and lastfont.size == thisfont['size'] and lastbox.top == int(attribs['top']) and lastbox.height == int(attribs['height']) and abs(lastbox.right - int(attribs['left'])) < 3): lastbox.append(self._parse_xml_make_textelement(element)) after_footnote = False after_footnote = False # all textboxes share the same fontspec dict attribs['fontspec'] = self.fontspec attribs['fontid'] = int(attribs['font']) attribs['pdf'] = self del attribs['font'] # merge whitespace-only-boxes, even if that changes the tag of the whitespace, eg # <textelement tag="b">something</textelement> # <textelement tag="bi"> </textelement> # # <textelement tag="b">something </textelement> merged = [] for x in textelements: # if only whitespace, merge with previous (if compatible) if (merged and x and not x.strip() and type(merged[-1]) == type(x) and getattr(merged[-1], 'uri', None) == getattr(x, 'uri', None)): merged[-1] = merged[-1] + x else: merged.append(x) return self._textdecoder(Textbox(merged, **attribs), self.fontspec) import string ws_trans = {ord("\n"): " ", ord("\t"): " ", ord("\xa0"): " "} def _parse_xml_make_textelement(self, element, keep_ws_only=False, **origkwargs): # the complication is that a hierarchical sequence of tags # should be converted to a list of # # case 1: plain text -> Textelement # case 2: tag = a -> LinkedTextelement # case 3: tab=b/i -> Textelement, tag=... # case 4: tag=b + tag=i -> Textelement, tag=bi # complicated cases: # TE = Textelement # LTE = LinkedTextelement # # 1: <text>Here <b>is <i> some <a href="...">text</a></i></b></text> # -> <b>is <i> some <a href="...">text</a></i></b> # -> <i> some <a href="...">text</a></i>, tag="b" # -> <a href="...">text</a>, tag="bi" # -> (TE("Here"), TE("is", tag="b"), TE("some", tag="bi"), LTE("text", tag="bi", uri="...")) # 2: <text><b><i><a href="...">1</a></i>/<b></text> # -> LTE("1", tag="bis", uri="..."), footnote=True (shld caller determine that?) # 3: <text><b></i>that </i> is </b> complicated</text>, after_footnote=True # -> TextElement("that", tag="bi"), TE("is", tag="i"), TE("complicated"), # 4: <text>2</text> # -> TE("2", tag="sup") # def cleantag(kwargs): # returns the "a" tag from a tag string, if present (w/o # altering the original kwargs) kwargscopy = dict(kwargs) if "a" in kwargs.get("tag", ""): kwargscopy["tag"] = kwargscopy["tag"].replace("a", "") if kwargscopy["tag"] == "": kwargscopy["tag"] = None return kwargscopy def normspace(txt): # like util.normalize_space, but preserves a single leading/trailing space if not isinstance(txt, str): # under py2, element.text can # sometimes be a bytestring? 
txt = txt.decode() txt = txt.translate(self.ws_trans) if re.match(r" +$", txt): return " " endspace = " " if txt.endswith(" ") and len(txt) > 1 else "" startspace = " " if txt.startswith(" ") else "" return startspace + util.normalize_space(txt) + endspace res = [] cls = origcls = Textelement origtag = None kwargs = dict(origkwargs) if 'tag' not in kwargs: kwargs['tag'] = "" if element.tag == "a": cls = LinkedTextelement kwargs['uri'] = element.get("href") kwargs['tag'] = (kwargs.get('tag', '') + element.tag) elif element.tag in ("b", "i"): if "a" in kwargs.get('tag', ''): cls = LinkedTextelement kwargs['tag'] += element.tag else: assert element.tag == "text", "Got <%s>, expected <{text,b,i,a}> " % element.tag if element.text and (element.text.strip() or element.tag == "a" or keep_ws_only): res.append(cls(normspace(element.text), **cleantag(kwargs))) for idx, child in enumerate(element): # special rule: if next-to-last element didn't end with # whitespace, allow for the last element to be counted # even if only whitespace (To handle the case # "<text>blahonga<i> </i></text><text>other</text>") kwso = False if idx + 1 == len(element) and res and not res[-1][-1].isspace(): kwso = True res.extend(self._parse_xml_make_textelement(child, keep_ws_only=kwso, **kwargs)) if element.tag != "text" and element.tail: if element.tail.strip(): if element.text and element.text.strip() == "": # even though we've skipped an empty tag like "<i> # </i>", we record the fact that we've done so, since # it is useful for some unreliable font family # heuristics origkwargs['skippedempty'] = element.tag res.append(origcls(normspace(element.tail), **cleantag(origkwargs))) elif (res and not isinstance(res[-1], LinkedTextelement) and not res[-1][-1] in (" ", "–", "-")): # ie the tail consists only of whitespace -- move that # ws inside of the element instead (unless we have # indications that it's not needed) res[-1] = res[-1] + " " return res def _parse_xml_add_fontspec(self, element, fontinfo, fontspec): fontid = int(element.attrib['id']) # make sure we always deal with a basic dict (not # lxml.etree._Attrib) where all keys are str # object (not bytes) fspec = dict([(k, str(v)) for k, v in element.attrib.items()]) # then make it more usable fspec['size'] = int(fspec['size']) if fontinfo.get(fspec['family']): # Commmon values: MacRoman, WinAnsi, Custom fspec['encoding'] = fontinfo[fspec['family']]['encoding'] if "+" in fspec['family']: fspec['family'] = fspec['family'].split("+", 1)[1] fontspec[fontid] = self._textdecoder.fontspec(fspec) def _analyze_font_encodings(self, root, fontinfo): encoded_fontids = {} for pageelement in root: if pageelement.tag == "outline": continue elif isinstance(pageelement, etree._Comment): continue # we need to loop through all textboxes on all pages, # because the very last one might have a new fontspec for e in pageelement: if e.tag == 'fontspec': fontid = e.attrib['id'] family = e.attrib['family'] if fontinfo.get(family) and fontinfo[family]['encoding'] == "Custom": encoded_fontids[fontid] = [] elif e.tag == 'text' and e.attrib["font"] in encoded_fontids: if len(encoded_fontids[e.attrib["font"]]) < 10: encoded_fontids[e.attrib["font"]].append(e) for fontid, samples in encoded_fontids.items(): try: offset = self._textdecoder.analyze_font(fontid, samples) if offset: self.log.debug("Font %s: Decoding with offset %02x" % (fontid, offset)) else: self.log.debug("Font %s: No offset used" % fontid) except errors.PDFDecodeError: self.log.debug("Font %s: Encoding could not be detected, assuming 
no encoding" % fontid)

    ################################################################
    # Properties and methods relating to the initialized PDFReader
    # object

    tagname = "div"
    classname = "pdfreader"
    def is_empty(self):
        return 0 == sum([len(x) for x in self])
    def textboxes(self, gluefunc=None, pageobjects=False, keepempty=False,
                  startpage=0, pagecount=None, cache=True):
        """Return an iterator of the textboxes available.

        ``gluefunc`` should be a callable that is called with (textbox,
        nextbox, prevbox), and returns True iff nextbox should be
        appended to textbox.

        If ``pageobjects``, the iterator can return Page objects to
        signal that a pagebreak has occurred (these Page objects may or
        may not have Textbox elements).

        If ``keepempty``, process and return textboxes that have no text
        content (these are filtered out by default).

        If ``cache``, store the resulting list of textboxes for each
        page and return it the next time.

        """
        textbox = None
        prevbox = None
        if gluefunc:
            glue = gluefunc
        else:
            glue = self._default_glue
        if pagecount:
            pages = self[startpage:startpage + pagecount]
        else:
            pages = self
        for page in pages:
            if pageobjects:
                yield page
            if cache:
                if page._textboxes_cache is not None:
                    # reuse the existing cache
                    # print("Reusing cache for page %s" % page.number)
                    for textbox in page._textboxes_cache:
                        yield textbox
                else:
                    # print("Setting up cache for page %s" % page.number)
                    page._textboxes_cache = []
            if not cache or not page._textboxes_cache:
                for nextbox in page:
                    if not (keepempty or str(nextbox).strip()):
                        continue
                    if not textbox:  # MUST glue
                        textbox = nextbox
                    else:
                        if glue(textbox, nextbox, prevbox):
                            # can't modify textbox in place -- this messes
                            # things up if we want/need to run textboxes()
                            # twice. Must create a new one.
                            # textbox += nextbox
                            textbox = textbox + nextbox
                        else:
                            if cache:
                                page._textboxes_cache.append(textbox)
                            yield textbox
                            textbox = nextbox
                    prevbox = nextbox
                if textbox:
                    if cache:
                        page._textboxes_cache.append(textbox)
                    yield textbox
                    textbox = None
    def median_box_width(self, threshold=0):
        """Returns the median box width of all pages."""
        boxwidths = []
        for page in self:
            for box in page:
                if box.right - box.left < threshold:
                    continue
                # print "Box width: %d" % (box.right-box.left)
                boxwidths.append(box.right - box.left)
        boxwidths.sort()
        return boxwidths[int(len(boxwidths) / 2)]
    @staticmethod
    def _default_glue(textbox, nextbox, prevbox):
        def basefamily(family):
            return family.replace("-", "").replace("Bold", "").replace("Italic", "")
        # default logic: if lines are next to each other horizontally,
        # line up vertically, and have the same font, then they should
        # be glued
        linespacing = 1.5
        # a = str(textbox)
        # b = str(nextbox)
        # c = textbox.font.family == nextbox.font.family and textbox.font.size == nextbox.font.size
        # d = textbox.top < nextbox.top
        # e1 = textbox.bottom + (prevbox.height * linespacing) - prevbox.height
        # e2 = nextbox.top
        # e = e1 >= e2
        # f = textbox.font.family
        # g = nextbox.font.family
        #
        # Accept font families that are almost equal (only differ by a
        # "Bold" or "Italic" in one but not the other). Otherwise
        # common constructs like:
        #
        #   <b>Lead text</b>: Lorem ipsum dolor sit amet, consectetur
        #   adipiscing elit. Donec suscipit nulla ut lorem dapibus.
        #
        # won't be considered the same textbox.
        if (basefamily(textbox.font.family) == basefamily(nextbox.font.family) and
                textbox.font.size == nextbox.font.size and
                textbox.left == nextbox.left and
                textbox.top < nextbox.top and
                textbox.bottom + (prevbox.height * linespacing) - prevbox.height >= nextbox.top):
            return True

    def __iadd__(self, other):
        if not hasattr(self, 'files'):
            self.files = [(0, len(self), self.filename)]
        self.files.append((len(self), len(other), other.filename))
        super(PDFReader, self).__iadd__(other)
        return self
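

# A minimal usage sketch: reading a PDF and iterating over glued textboxes
# with a custom glue function. The file paths and the looser glue rule are
# illustrative assumptions; the APIs used (PDFReader, textboxes, Textbox
# dimensions and Textbox.font) are the ones defined above.
def _example_read_with_gluefunc():
    def glue(textbox, nextbox, prevbox):
        # glue boxes that share font size and follow each other vertically,
        # regardless of font family (looser than _default_glue)
        return (textbox.font.size == nextbox.font.size and
                textbox.top < nextbox.top and
                nextbox.top - prevbox.bottom < prevbox.height)
    pdf = PDFReader(filename="example.pdf", workdir="/tmp/pdfworkdir",
                    keep_xml="bz2")
    for box in pdf.textboxes(gluefunc=glue):
        print("%s@%s: %s" % (box.font.family, box.font.size, str(box)[:60]))
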
class StreamingPDFReader(PDFReader): def __init__(self, *args, **kwargs): """Experimental API for PDFReader that separates conversion (Word etc->)PDF->intermediate format from parsing of the intermediate XML/hOCR data. """ self.log = logging.getLogger('pdfreader') self.fontspec = kwargs.get('fontspec') or {} def parse(self, filename, workdir, images=True, convert_to_pdf=False, keep_xml=True, ocr_lang=None, fontspec=None, textdecoder=None): self.read(self.convert(filename, workdir, images, convert_to_pdf, keep_xml, ocr_lang), textdecoder=textdecoder) def intermediate_filename(self, filename, ocr_lang, keep_xml): basename = os.path.basename(filename) stem = os.path.splitext(basename)[0] if ocr_lang: suffix = ".hocr.html" else: suffix = ".xml" convertedfile = os.sep.join([self.workdir, stem + suffix]) if keep_xml == "bz2": real_convertedfile = convertedfile + ".bz2" else: real_convertedfile = convertedfile return real_convertedfile def convert(self, filename, workdir=None, images=True, convert_to_pdf=False, keep_xml=True, ocr_lang=None): self.filename=filename self.workdir = workdir if self.workdir is None: self.workdir = tempfile.mkdtemp() if convert_to_pdf: newfilename = workdir + os.sep + \ os.path.splitext(os.path.basename(filename))[0] + ".pdf" if not os.path.exists(newfilename): util.ensure_dir(newfilename) cmdline = "soffice --headless --convert-to pdf --outdir '%s' %s" % ( workdir, filename) self.log.debug("%s: Converting to PDF: %s" % (filename, cmdline)) (ret, stdout, stderr) = util.runcmd( cmdline, require_success=True) filename = newfilename assert os.path.exists(filename), "PDF %s not found" % filename convertedfile = self.intermediate_filename(filename, ocr_lang, keep_xml) if ocr_lang: converter = self._tesseract converter_extra = {'lang': ocr_lang} tmpfilename = filename else: converter = self._pdftohtml converter_extra = {'images': images, 'keeppdffile': convert_to_pdf} tmpfilename = os.sep.join([workdir, os.path.basename(filename)]) # copying the filename to the workdir is only needed if we use # PDFReader._pdftohtml if not util.outfile_is_newer([filename], convertedfile): if not ocr_lang: # this is somewhat expensive and not really needed when converter is tesseract util.copy_if_different(filename, tmpfilename) # this is the expensive operation converter(tmpfilename, workdir, **converter_extra) # check if result is empty (has no content in any text node, except outline nodes) try: with open(convertedfile.replace(".bz2", "")) as fp: tree = etree.parse(fp) for bad in tree.findall("outline"): bad.getparent().remove(bad) if not etree.tostring(tree, method="text", encoding="utf-8").strip(): os.unlink(convertedfile.replace(".bz2", "")) raise errors.PDFFileIsEmpty("%s contains no text" % filename) except (etree.XMLSyntaxError, UnicodeDecodeError) as e: # this means pdftohtml created incorrect markup. This # probably means that the doc is nonempty, which is # all we care about at this point. At a later stage # (in _parse_xml), a workaround will be applied to the # document on the fly. pass if keep_xml == "bz2": with open(convertedfile.replace(".bz2", ""), mode="rb") as rfp: # BZ2File supports the with statement in py27+, # but we support py2.6 wfp = BZ2File(convertedfile, "wb") wfp.write(rfp.read()) wfp.close() os.unlink(convertedfile.replace(".bz2", "")) else: # keep_xml = True pass # it's important that we open the file as a bytestream since # we might do byte-level manipulation in _parse_xml. 
        if keep_xml == "bz2":
            fp = BZ2File(convertedfile, mode="rb")
        else:
            fp = open(convertedfile, "rb")
        return fp

    def read(self, fp, parser="xml", textdecoder=None):
        if textdecoder is None:
            self._textdecoder = BaseTextDecoder()
        else:
            self._textdecoder = textdecoder
        filename = util.name_from_fp(fp)
        self.filename = filename
        if parser == "ocr":
            parser = self._parse_hocr
        else:
            parser = self._parse_xml
        parser(fp)   # does not return anything useful
        fp.close()
        return self  # for chainability
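

# A minimal sketch of the experimental StreamingPDFReader API, which splits
# conversion (to intermediate XML/hOCR) from parsing. The file name, workdir
# and OCR language below are illustrative assumptions.
def _example_streaming_read():
    reader = StreamingPDFReader()
    fp = reader.convert("scanned.pdf", workdir="/tmp/pdfworkdir",
                        keep_xml="bz2", ocr_lang="swe")
    reader.read(fp, parser="ocr")
    return [repr(page) for page in reader]
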
class Page(CompoundElement, OrdinalElement):

    """Represents a Page in a PDF file. Has *width* and *height*
    properties."""

    tagname = "div"
    classname = "pdfpage"
    margins = None

    def __init__(self, *args, **kwargs):
        self._textboxes_cache = None
        super(Page, self).__init__(*args, **kwargs)

    @property
    def id(self):
        # FIXME: this will only work for documents consisting of a
        # single PDF file, not multiple (see
        # pdfdocumentrepository.create_external_resources to
        # understand why)
        if isinstance(self.number, str):
            # if the page number is a roman numeral, there is no
            # usable way of padding it
            return "page%s" % self.number
        else:
            return "page%03d" % self.number

    # text: can be string, re obj or callable (gets called with the box obj)
    # fontsize: can be int or callable
    # fontname: can be string or callable
    # top, left, bottom, right
    def boundingbox(self, top=0, left=0, bottom=None, right=None):
        """A generator of :py:class:`ferenda.pdfreader.Textbox` objects
        that fit into the bounding box specified by the parameters.

        """
        if not bottom:
            bottom = self.height
        if not right:
            right = self.width
        for box in self:
            if (box.top >= top and
                    box.left >= left and
                    box.bottom <= bottom and
                    box.right <= right):
                # print " SUCCESS"
                yield box
            # else:
            #     print " FAIL"
    def crop(self, top=0, left=0, bottom=None, right=None):
        """Removes any :py:class:`ferenda.pdfreader.Textbox` objects that
        do not fit within the bounding box specified by the parameters."""
        # Crop any text box that sticks out
        # Actually if top and left != 0, we need to adjust them
        newboxes = []
        for box in self.boundingbox(top, left, bottom, right):
            box.top = box.top - top
            box.left = box.left - left
            box.right = box.right - right
            box.bottom = box.bottom - bottom
            newboxes.append(box)
        self[:] = []
        self.extend(newboxes)
        self.width = right - left
        self.height = bottom - top
        # Then crop the background images... somehow
        if self.background and os.path.exists(self.background):
            cmdline = "convert %s -crop %dx%d+%d+%d +repage %s" % (
                self.background, self.width, self.height, left, top,
                self.background + ".new")
            # print "Running %s" % cmdline
            (returncode, stdout, stderr) = util.runcmd(cmdline,
                                                       require_success=True)
            util.replace_if_different(
                "%s.new" % self.background, self.background)
    def __str__(self):
        textexcerpt = " ".join([str(x) for x in self])
        return "Page %s (%d x %d): '%s...'" % (
            self.number, self.width, self.height, str(textexcerpt[:40]))

    def __repr__(self):
        return '<%s %s (%dx%d): %d textboxes>' % (self.__class__.__name__,
                                                  self.number,
                                                  self.width, self.height,
                                                  len(self))
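

# A minimal sketch of per-page operations: selecting the textboxes that fall
# inside a bounding box, then cropping the page to that area. The margin
# values are illustrative assumptions, not defaults of this module.
def _example_page_crop(page):
    # count the boxes inside a 500x700 pt area offset 50 pt from the
    # top-left corner...
    n = sum(1 for box in page.boundingbox(top=50, left=50,
                                          bottom=750, right=550))
    # ...then drop everything outside it and shift the remaining boxes so
    # that the cropped area starts at (0, 0)
    page.crop(top=50, left=50, bottom=750, right=550)
    return n
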
[docs]class Textbox(CompoundElement): """A textbox is a amount of text on a PDF page, with *top*, *left*, *width* and *height* properties that specifies the bounding box of the text. The *fontid* property specifies the id of font used (use :py:meth:`~ferenda.pdfreader.Textbox.getfont` to get a dict of all font properties). A textbox consists of a list of Textelements which may differ in basic formatting (bold and or italics), but otherwise all text in a Textbox has the same font and size. """ tagname = "p" classname = "textbox" def __init__(self, *args, **kwargs): assert 'top' in kwargs, "top attribute missing" assert 'left' in kwargs, "left attribute missing" assert 'width' in kwargs, "width attribute missing" assert 'height' in kwargs, "height attribute missing" assert 'fontid' in kwargs, "font id attribute missing" self.top = int(kwargs['top']) self.left = int(kwargs['left']) self.width = int(kwargs['width']) self.height = int(kwargs['height']) self.right = self.left + self.width self.bottom = self.top + self.height self.lines = int(kwargs.get("lines", 0)) self.lineheight = int(kwargs.get("lineheight", 0)) # a running average # self._fontspecid = kwargs['fontid'] self.fontid = kwargs['fontid'] or 0 if 'fontspec' in kwargs: self._fontspec = kwargs['fontspec'] del kwargs['fontspec'] else: self._fontspec = {} if 'pdf' in kwargs: self._pdf = kwargs['pdf'] del kwargs['pdf'] else: self._pdf = None del kwargs['top'] del kwargs['left'] del kwargs['width'] del kwargs['height'] del kwargs['fontid'] super(Textbox, self).__init__(*args, **kwargs) def __str__(self): s = "".join(self) return s def __repr__(self): # <Textbox 30x18+278+257 "5.1"> # <Textbox 430x14+287+315 "Regeringens förslag: Nä[...]g ska "> s = str(self) if len(s) > 40: s = s[:25] + "[...]" + s[-10:] #if six.PY2: # # s = repr(s) # s = s.encode('ascii', 'replace') if self.font: fontinfo = "%s@%s " % (self.font.family, self.font.size) else: fontinfo = "" return '<%s %sx%s+%s+%s %s"%s">' % (self.__class__.__name__, self.width, self.height, self.left, self.top, fontinfo, s) def __add__(self, other): # expand dimensions top = min(self.top, other.top) left = min(self.left, other.left) width = max(self.left + self.width, other.left + other.width) - left height = max(self.top + self.height, other.top + other.height) - top if self.lines + other.lines and other.lineheight: lineheight = (self.lineheight * self.lines + other.lineheight * other.lines) / self.lines + other.lines else: lineheight = self.lineheight lines = self.lines + other.lines if self.bottom > other.top + (other.height / 2) and self.lines and other.lines: # self and other is really on the same line lines -= 1 res = Textbox(top=top, left=left, width=width, height=height, fontid=self.fontid, fontspec=self._fontspec, pdf=self._pdf, lines=lines, lineheight=lineheight) # add all Textelement objects, concatenating adjacent TE:s if # their tags match. 
tag = None if len(self) == 0 else self[0].tag c = Textelement(tag=tag) # possibly add a space instead of a missing newline -- but # not before superscript elements if (self and other and not (self[-1].tag and "s" in self[-1].tag or other[0].tag and "s" in other[0].tag) and not self[-1].endswith((" ", "-", "–"))): self.append(Textelement(" ", tag=self[-1].tag)) for e in itertools.chain(self, other): if e.tag != c.tag: if c: res.append(c) res.append(e) c = Textelement(tag=e.tag) else: c = c + e # it MIGHT be the case that we need to merge c with the last # Textelement added to res iff their tags match if len(res) and c and c.tag == res[-1].tag and type(c) == type(res[-1]): res[-1] = res[-1] + c elif c: res.append(c) return res def __iadd__(self, other): self.top = min(self.top, other.top) self.left = min(self.left, other.left) self.width = max(self.left + self.width, other.left + other.width) - self.left self.height = max(self.top + self.height, other.top + other.height) - self.top self.right = self.left + self.width self.bottom = self.top + self.height if self.lines + other.lines and other.lineheight: lineheight = (self.lineheight * self.lines + other.lineheight * other.lines) / self.lines + other.lines else: lineheight = self.lineheight self.lineheight = lineheight self.lines += other.lines if self.bottom > other.top + (other.height / 2) and self.lines and other.lines: # self and other is really on the same line self.lines -= 1 if len(self): c = self.pop() else: c = Textelement(tag=None) for e in other: if e.tag != c.tag: if c: self.append(c) self.append(e) c = Textelement(tag=e.tag) # c = e else: c = c + e if c: self.append(c) return self @property def linespacing(self): if self.lines > 1: return ((self.height - self.lineheight) / (self.lines - 1)) / self.font.size # else return None (linespacing is undefined for single-line textboxes) # # def append(self, thing): # if len(self) == 0 or self[-1].tag != thing.tag: # return super(Textbox, self).append(thing) # else: # # concatenate adjacent TE:s if their tags match. # self[-1] = self[-1] + thing # return #
[docs] def as_xhtml(self, uri, parent_uri=None): children = [] first = True prevpart = None for subpart in self: if (not first and type(subpart) == type(prevpart) and getattr(subpart, 'tag', None) == getattr(prevpart, 'tag', None) and getattr(subpart, 'uri', None) == getattr(prevpart, 'uri', None)): prevpart = prevpart + subpart elif prevpart: # make sure Textelements w/o a tag doesn't render with # as_xhtml as this adds a meaningless <span> if (hasattr(prevpart, 'as_xhtml') and (not isinstance(prevpart, Textelement) or prevpart.tag or getattr(prevpart, 'uri', None))): prevpart = prevpart.as_xhtml(uri, parent_uri) if prevpart is not None: children.append(self._cleanstring(prevpart)) prevpart = subpart else: prevpart = subpart first = False if (hasattr(prevpart, 'as_xhtml') and (not isinstance(prevpart, Textelement) or prevpart.tag)): prevpart = prevpart.as_xhtml(uri, parent_uri) if prevpart is not None: children.append(self._cleanstring(prevpart)) attribs = {} if hasattr(self, 'fontid'): attribs['class'] = 'textbox fontspec%s' % self.fontid element = E("p", attribs, *children) # FIXME: we should output these positioned style attributes # only when the resulting document is being serialized in a # positioned fashion (and probably also the textbox/fontspec # attributes). if hasattr(self, 'top') and hasattr(self, 'left'): element.set( 'style', 'top: %spx; left: %spx; height: %spx; width: %spx' % (self.top, self.left, self.height, self.width)) return element
    def _cleanstring(self, thing):
        if not isinstance(thing, str):
            return thing
        newstring = ""
        for char in thing:
            if unicodedata.category(char) != "Cc":
                newstring += char
        return newstring

    @cached_property
    def font(self):
        if self.fontid is not None:
            return LayeredConfig(Defaults(self._fontspec[self.fontid]))
        else:
            return LayeredConfig(Defaults({}))
    # this doesn't work that well with the default __setattribute__
    # implementation of this class' superclass.
    #
    # @font.setter
    # def font(self, value):
    #     for fontspecid, fontspec in self._fontspec.items():
    #         if value == fontspecid:
    #             self.font = fontspecid
    #     if self.font is None:  # .font might have the valid value 0
    #         self.font = str(len(self._fontspecid))  # start at 0
    #     self._fontspec[self.font] = value
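

# A minimal sketch of using the shared fontspec information exposed through
# Textbox.font: collect the text of boxes set in a larger bold face. The
# size threshold and the "Bold" family-name convention are illustrative
# assumptions that depend on the particular PDF.
def _example_find_headings(pdf):
    headings = []
    for box in pdf.textboxes():
        if box.font.size >= 14 and "Bold" in box.font.family:
            headings.append(str(box).strip())
    return headings
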
class Textelement(UnicodeElement):

    """Represent a single part of text where each letter has the exact
    same formatting.

    The ``tag`` property specifies whether the text as a whole is bold
    (``'b'``), italic (``'i'``), bold + italic (``'bi'``) or regular
    (``None``).

    """

    def _get_tagname(self):
        if self.tag:
            return self.tag
        else:
            return "span"
    def as_xhtml(self, uri, parent_uri=None):
        if self.tag and len(self.tag) > 1 and self.tag != "sup":
            # first create a list of elements
            tagmap = {"s": "sup", "b": "b", "i": "i", "a": "a"}
            tags = [E(tagmap[x]) for x in self.tag]
            # then place the text content in the last one
            tags[-1].text = self.clean_string()
            # then nest them
            for idx, tag in enumerate(tags):
                if idx < len(tags) - 1:
                    tag.append(tags[idx + 1])
            return tags[0]
        else:
            return super(Textelement, self).as_xhtml(uri, parent_uri)
    tagname = property(_get_tagname)

    def __add__(self, other):
        # It seems like some versions of pdftohtml automatically add a
        # space at the end of lines so that they can be concatenated,
        # but some (later) versions omit this, requiring us to add an
        # extra space to avoid mashing words together.
        if len(self) and not (self.endswith(" ") or
                              self.endswith("-") or
                              other.startswith(" ")):
            extraspace = " "
        else:
            extraspace = ""
        if hasattr(self, 'top') and hasattr(other, 'top'):
            dims = {'top': min(self.top, other.top),
                    'left': min(self.left, other.left),
                    'width': max(self.left + self.width,
                                 other.left + other.width) - self.left,
                    'height': max(self.top + self.height,
                                  other.top + other.height) - self.top}
        else:
            if hasattr(self, 'top'):
                dims = {'top': self.top, 'left': self.left,
                        'width': self.width, 'height': self.height}
            elif hasattr(other, 'top'):
                dims = {'top': other.top, 'left': other.left,
                        'width': other.width, 'height': other.height}
            else:
                dims = {}
        strself = str(self)
        strother = str(other)
        # mandatory dehyphenation. FIXME: we'd like to make this
        # configurable (but where?).
        #
        # FIXME: This dehyphenates eg "EG-" + "direktiv". How many
        # other exceptions to this algorithm are needed?
        if strself and strself[-1] == '-' and strother and strother[0].islower():
            strself = strself[:-1]
        new = self.__class__(strself + extraspace + strother,
                             tag=self.tag, **dims)
        return new
class LinkedTextelement(Textelement):

    """Like Textelement, but with a uri property."""

    def __init__(self, *args, **kwargs):
        kwargs['tag'] = kwargs.get('tag')
        kwargs['uri'] = kwargs.get('uri')
        super(LinkedTextelement, self).__init__(*args, **kwargs)

    def _get_tagname(self):
        return "a"

    tagname = property(_get_tagname)

    def as_xhtml(self, uri, parent_uri=None):
        prevtag = self.tag
        if self.tag is None:
            self.tag = "a"
        else:
            self.tag = "a" + self.tag
        element = super(LinkedTextelement, self).as_xhtml(uri, parent_uri)
        self.tag = prevtag
        if element is not None:
            element.set("href", self.uri)
        return element

    def __add__(self, other):
        assert not type(other) == Textelement, \
            "Can't join a LinkedTextelement (%s) with a plain Textelement (%s)" % (self, other)
        assert self.uri == other.uri, \
            "Can't join two LinkedTextelements with different URIs (%s, %s)" % (self.uri, other.uri)
        new = super(LinkedTextelement, self).__add__(other)
        new.uri = self.uri
        return new


class BaseTextDecoder(object):

    def __init__(self, dummy=None):
        pass

    def __call__(self, textbox, fontspecs):
        return textbox

    def fontspec(self, fontspec):
        return fontspec
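

# A minimal sketch of a custom text decoder. BaseTextDecoder is a no-op; a
# subclass gets called with each Textbox (and the shared fontspecs) as boxes
# are created from the intermediate XML, and can be passed to PDFReader via
# the ``textdecoder`` parameter. The substitution table here is an
# illustrative assumption.
class ExampleSubstitutingDecoder(BaseTextDecoder):

    substitutions = {"\u00ad": "-"}  # soft hyphen -> plain hyphen

    def __call__(self, textbox, fontspecs):
        for idx, element in enumerate(textbox):
            text = str(element)
            for frm, to in self.substitutions.items():
                text = text.replace(frm, to)
            kwargs = {'tag': element.tag}
            if hasattr(element, 'uri'):
                kwargs['uri'] = element.uri
            textbox[idx] = element.__class__(text, **kwargs)
        return textbox

    def fontspec(self, fontspec):
        # fontspecs could also be normalized here; pass them through unchanged
        return fontspec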