Source code for ferenda.pdfreader

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import logging
import re
import itertools
import codecs
from glob import glob
from bz2 import BZ2File

from lxml import etree
from six import text_type as str
# from six import binary_type as bytes
import six

from ferenda import util, errors
from .elements import UnicodeElement
from .elements import CompoundElement
from .elements import OrdinalElement


class PDFReader(CompoundElement):

    """Parses PDF files and makes the content available as an object
    hierarchy. After calling :py:meth:`~ferenda.PDFReader.read`, the
    PDFReader itself is a list of :py:class:`ferenda.pdfreader.Page`
    objects, which each is a list of
    :py:class:`ferenda.pdfreader.Textbox` objects, which each is a
    list of :py:class:`ferenda.pdfreader.Textelement` objects.

    .. note::

       This class depends on the command line tool pdftohtml from
       `poppler <http://poppler.freedesktop.org/>`_.

       The class can also handle any other type of document (such as
       Word/OOXML/WordPerfect/RTF) that OpenOffice or LibreOffice
       handles by first converting it to PDF using the ``soffice``
       command line tool (which then must be in your ``$PATH``).

       If the PDF contains only scanned pages (without any OCR
       information), the pages can be run through the ``tesseract``
       command line tool (which, again, needs to be in your
       ``$PATH``). You need to provide the main language of the
       document as the ``ocr_lang`` parameter, and you need to have
       installed the tesseract language files for that language.

    """

    # element name and class used when this object is serialized
    tagname = "div"
    classname = "pdfreader"
    def __init__(self, *args, **kwargs):
        self.fontspec = {}
        self.log = logging.getLogger('pdfreader')
        if 'filename' in kwargs:
            self.filename = kwargs['filename']
        else:
            self.filename = None
        # super(PDFReader, self).__init__(*args, **kwargs)

    def read(self, pdffile, workdir, images=True, convert_to_pdf=False, keep_xml=True, ocr_lang=None):
        """Initializes a PDFReader object from an existing PDF file. After
        initialization, the PDFReader contains a list of
        :py:class:`~ferenda.pdfreader.Page` objects.

        :param pdffile: The full path to the PDF file (or, if
                        ``convert_to_pdf`` is set, any other document
                        file)
        :param workdir: A directory where intermediate files (particularly
                        background PNG files) are stored
        :param images: If true, also extract page background images.
                       Only used by the pdftohtml based conversion,
                       ie. when ``ocr_lang`` is not set.
        :type  images: bool
        :param convert_to_pdf: If pdffile is any other type of
                               document other than PDF, attempt to
                               first convert it to PDF using the
                               ``soffice`` command line tool (from
                               OpenOffice/LibreOffice).
        :type  convert_to_pdf: bool
        :param keep_xml: If False, remove the intermediate XML
                         representation of the PDF that gets created
                         in ``workdir``. If true, keep it around to
                         speed up subsequent parsing operations. If
                         set to the special value ``"bz2"``, keep it
                         but compress it with :py:mod:`bz2`.
        :type  keep_xml: bool
        :param ocr_lang: If provided, PDFReader will extract scanned
                         images from the PDF file, and run an OCR
                         program on it, using the ``ocr_lang``
                         language heuristics. (Note that this is not
                         necessarily an IETF language tag like "sv"
                         or "en-GB", but rather whatever the
                         underlying ``tesseract`` program uses).
        :type  ocr_lang: str
        :returns: Whatever the selected parser (``_parse_xml`` or
                  ``_parse_hocr``) returns; the parsed pages are
                  appended to the reader itself.

        """
        # start by removing all pages left behind by a previous read
        self[:] = []

        if convert_to_pdf:
            newpdffile = workdir + os.sep + os.path.splitext(os.path.basename(pdffile))[0] + ".pdf"
            if not os.path.exists(newpdffile):
                util.ensure_dir(newpdffile)
                cmdline = "soffice --headless -convert-to pdf -outdir '%s' %s" % (workdir, pdffile)
                self.log.debug("%s: Converting to PDF: %s" % (pdffile, cmdline))
                util.runcmd(cmdline, require_success=True)
            # Always continue with the PDF version, also when it was
            # converted by an earlier run -- otherwise the original
            # (non-PDF) document would be fed to the PDF tools below.
            pdffile = newpdffile

        self.filename = pdffile
        assert os.path.exists(pdffile), "PDF %s not found" % pdffile
        basename = os.path.basename(pdffile)
        stem = os.path.splitext(basename)[0]

        if ocr_lang:
            suffix = ".hocr.html"
            converter = self._tesseract
            converter_extra = {'lang': ocr_lang}
            parser = self._parse_hocr
        else:
            suffix = ".xml"
            converter = self._pdftohtml
            converter_extra = {'images': images}
            parser = self._parse_xml

        # convertedfile is what the converter writes,
        # real_convertedfile is what we keep around and parse
        convertedfile = os.sep.join([workdir, stem + suffix])
        if keep_xml == "bz2":
            real_convertedfile = convertedfile + ".bz2"
        else:
            real_convertedfile = convertedfile

        tmppdffile = os.sep.join([workdir, basename])
        # copying the pdffile to the workdir is only needed if we use self._pdftohtml

        if not util.outfile_is_newer([pdffile], real_convertedfile):
            util.copy_if_different(pdffile, tmppdffile)
            # this is the expensive operation
            converter(tmppdffile, workdir, **converter_extra)
            if keep_xml == "bz2":
                with open(convertedfile, mode="rb") as rfp:
                    # BZ2File supports the with statement in py27+,
                    # but we support py2.6, so close it by hand
                    wfp = BZ2File(real_convertedfile, "wb")
                    try:
                        wfp.write(rfp.read())
                    finally:
                        wfp.close()
                os.unlink(convertedfile)

        if keep_xml == "bz2":
            # FIXME: explicitly state that encoding is utf-8 (in a py26 compatible manner
            fp = BZ2File(real_convertedfile)
        else:
            fp = codecs.open(real_convertedfile, encoding="utf-8")

        # make sure the intermediate file is closed even if parsing fails
        try:
            res = parser(fp, real_convertedfile)
        finally:
            fp.close()

        # only an explicit False (not eg. None) removes the file
        if keep_xml == False:  # noqa: E712
            os.unlink(convertedfile)
        return res

    def is_empty(self):
        return 0 == sum([len(x) for x in self])

    def textboxes(self, gluefunc=None, pageobjects=False, keepempty=False):
        """Return an iterator of the textboxes available.

        ``gluefunc`` should be a callable that is called with
        (textbox, nextbox, prevbox), and returns True iff nextbox
        should be appended to textbox.

        If ``pageobjects``, the iterator can return Page objects to
        signal that pagebreak has ocurred (these Page objects may or
        may not have Textbox elements).

        If ``keepempty``, process and return textboxes that have no
        text content (these are filtered out by default)
        """
        textbox = None
        prevbox = None
        if gluefunc:
            glue = gluefunc
        else:
            glue = self._default_glue
        for page in self:
            if pageobjects:
                yield page
            for nextbox in page:
                if not (keepempty or str(nextbox).strip()):
                    continue
                if not textbox: # MUST glue
                    textbox = nextbox
                else:
                    if glue(textbox, nextbox, prevbox):
                        textbox += nextbox
                    else:
                        yield textbox
                        textbox = nextbox
                prevbox = nextbox
            if textbox:
                yield textbox
                textbox = None

    def drawboxes(self, outfile, gluefunc=None):
        """Create a copy of the parsed PDF file, but with the textboxes
        created by ``gluefunc`` clearly marked. Returns the name of
        the created pdf file.

        .. note::

          This requires PyPDF2 and reportlab, which aren't installed
          by default (and at least reportlab is not py3 compatible).

        :param outfile: Path of the PDF file to create
        :param gluefunc: Passed on to :py:meth:`textboxes`
        :returns: ``outfile``
        """
        # io.BytesIO exists on both py2 and py3, unlike the StringIO module
        from io import BytesIO
        from PyPDF2 import PdfFileWriter, PdfFileReader
        from reportlab.pdfgen import canvas

        sf = 2 / 3.0  # scaling factor
        output = PdfFileWriter()

        def merge_overlay(existing_pdf, can, packet, pageidx, label):
            # finish the overlay drawn so far and stamp it onto the
            # corresponding page of the original PDF
            can.save()
            packet.seek(0)
            new_pdf = PdfFileReader(packet)
            self.log.debug("Getting %s %s from existing pdf" % (label, pageidx))
            page = existing_pdf.getPage(pageidx)
            page.mergePage(new_pdf.getPage(0))
            output.addPage(page)

        # Keep the source open until the output has been written;
        # presumably PyPDF2 reads page data lazily -- TODO confirm
        with open(self.filename, "rb") as infp:
            existing_pdf = PdfFileReader(infp)
            can = None
            packet = None
            pageidx = 0
            for tb in self.textboxes(gluefunc, pageobjects=True):
                if isinstance(tb, Page):
                    # A new page starts: flush the previous one. This
                    # is done for every page (also those without any
                    # textboxes) so that pageidx stays in step with
                    # the pages of the original PDF.
                    if can is not None:
                        merge_overlay(existing_pdf, can, packet, pageidx, "page")
                        pageidx += 1
                    pagesize = (tb.width * sf, tb.height * sf)
                    packet = BytesIO()
                    can = canvas.Canvas(packet, pagesize=pagesize,
                                        bottomup=False)
                    can.setStrokeColorRGB(0.2, 0.5, 0.3)
                    can.translate(0, 0)
                else:
                    can.rect(tb.left * sf, tb.top * sf,
                             tb.width * sf, tb.height * sf)

            # can is None only if the reader has no pages at all
            if can is not None:
                merge_overlay(existing_pdf, can, packet, pageidx, "last page")

            with open(outfile, "wb") as outfp:
                output.write(outfp)
        self.log.debug("wrote %s" % outfile)
        return outfile

    @staticmethod
    def _default_glue(textbox, nextbox, prevbox):
        # default logic: if lines are next to each other
        # horizontally, line up vertically, and have the same
        # font, then they should be glued
        linespacing = 1
        if (textbox.getfont() == nextbox.getfont() and
            textbox.left == nextbox.left and
            textbox.top + textbox.height + linespacing >= nextbox.top):
            return True


    def _tesseract(self, tmppdffile, workdir, lang):
        """Run OCR on a PDF consisting of scanned images, creating
        ``<workdir>/<root>.hocr.html`` (where ``<root>`` is the name
        of the PDF file without suffix).

        Uses the command line tools ``pdfinfo``, ``pdfimages``,
        ``convert``, ``tiffcp`` and ``tesseract``. The combined
        ``.tif`` file is left in ``workdir``, while ``tmppdffile``
        and all other intermediate files are removed.

        :param tmppdffile: The (temporary copy of the) PDF file to
                           process. It is removed afterwards.
        :param workdir: Directory for intermediate and result files
        :param lang: Language passed to ``tesseract -l``
        :raises ferenda.errors.ExternalCommandError: if the number of
                pages could not be determined from ``pdfinfo``
        """
        root = os.path.splitext(os.path.basename(tmppdffile))[0]
        fmt = {'tmppdffile': tmppdffile,
               'workdir': workdir,
               'root': root,
               'lang': lang}

        # step 1: find the number of pages
        cmd = "pdfinfo %s" % tmppdffile
        (returncode, stdout, stderr) = util.runcmd(cmd, require_success=True)
        m = re.search(r"Pages:\s+(\d+)", stdout)
        if not m:
            raise errors.ExternalCommandError(
                "Could not find the number of pages of %s in pdfinfo output" % tmppdffile)
        number_of_pages = int(m.group(1))
        self.log.debug("%s.pdf has %s pages" % (root, number_of_pages))

        # step 2: extract the images (should be one per page), 10
        # pages at a time (pdfimages flakes out on larger loads)
        batchsize = 10
        for idx, frompage in enumerate(range(1, number_of_pages + 1, batchsize)):
            fmt['idx'] = idx
            fmt['frompage'] = frompage
            fmt['topage'] = min(frompage + batchsize - 1, number_of_pages)
            cmd = "pdfimages -p -f %(frompage)s -l %(topage)s %(tmppdffile)s %(workdir)s/%(root)s" % fmt
            self.log.debug("- running " + cmd)
            util.runcmd(cmd, require_success=True)
            # step 2.1: Combine the recently extracted images
            # into a new tif (so that we add 10
            # pages at a time to the tif, as imagemagick can
            # create a number of pretty large files for each page,
            # so converting 200 images will fill 10 G of your temp
            # space -- which we'd like to avoid)
            cmd = "convert %(workdir)s/%(root)s-*.pbm -compress Zip %(workdir)s/%(root)s-tmp%(idx)04d.tif" % fmt
            self.log.debug("- running " + cmd)
            util.runcmd(cmd, require_success=True)
            # step 2.2: Remove pbm files now that they're in the .tif
            for f in glob("%(workdir)s/%(root)s-*.pbm" % fmt):
                os.unlink(f)

        # Step 3: Combine all the 10-page tifs into a giant tif using tiffcp
        cmd = "tiffcp -c zip %(workdir)s/%(root)s-tmp*.tif %(workdir)s/%(root)s.tif" % fmt
        self.log.debug("- running " + cmd)
        util.runcmd(cmd, require_success=True)

        # Step 4: OCR the giant tif file to create a .hocr.html file
        # Note that -psm 1 (automatic page segmentation with
        # orientation and script detection) requires the installation
        # of tesseract-ocr-3.01.osd.tar.gz
        cmd = "tesseract %(workdir)s/%(root)s.tif %(workdir)s/%(root)s.hocr -l %(lang)s -psm 1 hocr" % fmt
        self.log.debug("running " + cmd)
        util.runcmd(cmd, require_success=True)

        # Step 5: Cleanup (the main .tif file can stay)
        os.unlink(tmppdffile)
        for f in glob("%(workdir)s/%(root)s-tmp*.tif" % fmt):
            os.unlink(f)

    def _pdftohtml(self, tmppdffile, workdir, images):
        """Convert a PDF file to the XML format of ``pdftohtml``.

        :param tmppdffile: The (temporary copy of the) PDF file to
                           convert. It is always removed afterwards,
                           even if the conversion fails.
        :param workdir: Directory that is scanned for the HTML and
                        PNG files created when ``images`` is set
        :param images: If true, first run ``pdftohtml -c`` to get
                       page background images, keeping only those
                       that have more than one unique color
        :raises ferenda.errors.ExternalCommandError: if pdftohtml
                wrote to stderr and did not create the XML file
        """
        root = os.path.splitext(os.path.basename(tmppdffile))[0]
        try:
            if images:
                # two pass coding: First use -c (complex) to extract
                # background pictures, then use -xml to get easy-to-parse
                # text with bounding boxes.
                cmd = "pdftohtml -nodrm -c %s" % tmppdffile
                self.log.debug("Converting: %s" % cmd)
                (returncode, stdout, stderr) = util.runcmd(
                    cmd, require_success=True)
                # we won't need the html files, or the blank PNG files
                for entry in os.listdir(workdir):
                    if not entry.startswith(root):
                        continue
                    fullpath = workdir + os.sep + entry
                    if entry.endswith(".html"):
                        os.unlink(fullpath)
                    elif entry.endswith(".png"):
                        # "%k" reports the number of unique colors in
                        # the bitmap. A single color means a blank
                        # image, which we have no use for
                        (returncode, stdout, stderr) = util.runcmd(
                            'convert %s -format "%%k" info:' % fullpath)
                        if stdout.strip() == "1":
                            os.unlink(fullpath)
                        else:
                            self.log.debug("Keeping non-blank image %s" % entry)

            # Without -fontfullname, all fonts are just reported as
            # having family="Times"...
            if images:
                imgflag = ""
            else:
                imgflag = "-i"

            cmd = "pdftohtml -nodrm -xml -fontfullname %s %s" % (imgflag, tmppdffile)
            self.log.debug("Converting: %s" % cmd)
            (returncode, stdout, stderr) = util.runcmd(
                cmd, require_success=True)
            # An old pdftohtml that doesn't support -fontfullname
            # fails, but still exits with returncode 0! The only way
            # of detecting this is to look for output on stderr in
            # combination with a missing xml file.
            xmlfile = os.path.splitext(tmppdffile)[0] + ".xml"
            if stderr and not os.path.exists(xmlfile):
                raise errors.ExternalCommandError(stderr)
        finally:
            os.unlink(tmppdffile)
            assert not os.path.exists(tmppdffile)

    # matches the bounding box part of a hOCR title attribute
    re_dimensions = re.compile(r"bbox (?P<left>\d+) (?P<top>\d+) (?P<right>\d+) (?P<bottom>\d+)").search

    def _parse_hocr(self, fp, filename):
        """Parse a hOCR file (as created by tesseract) and append one
        :py:class:`Page` object per ``ocr_page`` element to the
        reader. Each page gets a :py:class:`Textbox` for every
        ``ocr_par`` element, which in turn gets a
        :py:class:`Textelement` for every ``ocrx_word`` element.

        :param fp: An open file-like object containing the hOCR data
        :param filename: Not used
        """
        xhtml = "{http://www.w3.org/1999/xhtml}"
        # formatting elements that tesseract may wrap a word in
        formatting = {xhtml + 'em': 'i',
                      xhtml + 'strong': 'b'}

        def dimensions(s):
            # "bbox l t r b" -> top, left, width, height (as ints)
            dim = self.re_dimensions(s).groupdict()
            return {'top': int(dim['top']),
                    'left': int(dim['left']),
                    'width': int(dim['right']) - int(dim['left']),
                    'height': int(dim['bottom']) - int(dim['top'])}

        tree = etree.parse(fp)
        for pageelement in tree.findall("//%sdiv[@class='ocr_page']" % xhtml):
            dim = dimensions(pageelement.get('title'))
            # the id is expected to have a 5 character prefix
            # followed by the page number (presumably "page_")
            page = Page(number=int(pageelement.get('id')[5:]),
                        width=dim['width'],
                        height=dim['height'],
                        background=None)
            # we discard elements at the ocr_carea (content area?)
            # level, we're only concerned with paragraph-level
            # elements
            for boxelement in pageelement.findall(".//%sp[@class='ocr_par']" % xhtml):
                box = Textbox(font=None, **dimensions(boxelement.get('title')))
                for element in boxelement.findall(".//%sspan[@class='ocrx_word']" % xhtml):
                    # the tail is None when nothing follows the
                    # element (eg. the last word of a paragraph)
                    t = "".join(element.itertext()) + (element.tail or "")
                    if len(element):  # probably a <em> or <strong> element
                        # any other kind of child element is treated
                        # as unformatted text
                        tag = formatting.get(element[0].tag)
                    else:
                        tag = None
                    text = Textelement(t, tag=tag,
                                       **dimensions(element.get("title")))
                    box.append(text)
                page.append(box)
            self.append(page)
        self.log.debug("PDFReader initialized: %d pages" %
                       (len(self)))
            
            
        

    def _parse_xml(self, xmlfp, xmlfilename):
        """Parse the XML created by ``pdftohtml -xml`` and append one
        :py:class:`Page` object per ``<page>`` element to the reader.
        Font information from ``<fontspec>`` elements is collected in
        ``self.fontspec``.

        If the XML is not well-formed, an attempt to repair it using
        BeautifulSoup is made.

        :param xmlfp: An open (seekable) file-like object containing
                      the XML data
        :param xmlfilename: The name of the XML file. Background
                            images are looked for next to this file.
        """
        def txt(element_text):
            # lxml reports missing text as None; treat that as empty
            # text instead of letting str() turn it into "None"
            return re.sub(r"[\s\xa0\xc2]+", " ", str(element_text or ""))

        self.log.debug("Loading %s" % xmlfilename)

        try:
            tree = etree.parse(xmlfp)
        except etree.XMLSyntaxError as e:
            self.log.warning("pdftohtml created incorrect markup, trying to fix using BeautifulSoup: %s" % e)
            xmlfp.seek(0)
            from bs4 import BeautifulSoup
            from io import BytesIO
            soup = BeautifulSoup(xmlfp, "xml")
            xmlfp = BytesIO(str(soup).encode("utf-8"))
            tree = etree.parse(xmlfp)
            self.log.debug("BeautifulSoup workaround successful")

        # for each page element
        for pageelement in tree.getroot():
            if pageelement.tag == "outline":
                # FIXME: we want to do something with this information
                continue
            # check this before any attributes are read, so that an
            # unexpected element is reported as such
            assert pageelement.tag == "page", "Got <%s>, expected <page>" % pageelement.tag
            page = Page(number=int(pageelement.attrib['number']),  # always int?
                        width=int(pageelement.attrib['width']),
                        height=int(pageelement.attrib['height']),
                        background=None)
            background = "%s%03d.png" % (
                os.path.splitext(xmlfilename)[0], page.number)

            # Reasons this file might not exist: it was blank and therefore removed, or We're running under RepoTester
            if os.path.exists(background):
                page.background = background

            for element in pageelement:
                if element.tag == 'fontspec':
                    fontid = element.attrib['id']
                    # make sure we always deal with a basic dict (not
                    # lxml.etree._Attrib) where all keys are str
                    # object (not bytes)
                    self.fontspec[fontid] = dict([(k, str(v)) for k, v in element.attrib.items()])
                    if "+" in element.attrib['family']:
                        # only keep what follows the first "+"
                        self.fontspec[fontid]['family'] = element.attrib['family'].split("+", 1)[1]

                elif element.tag == 'text':
                    # eliminate "empty" textboxes
                    if element.text and txt(element.text).strip() == "" and len(element) == 0:
                        continue

                    attribs = dict(element.attrib)
                    attribs['fontspec'] = self.fontspec
                    b = Textbox(**attribs)

                    if element.text and element.text.strip():
                        b.append(Textelement(txt(element.text), tag=None))
                    # The below loop could be done recursively to
                    # support arbitrarily deep nesting (if we change
                    # Textelement to be a non-unicode derived type),
                    # but pdftohtml should not create such XML (there
                    # is no such data in the PDF file)
                    for child in element:
                        grandchildren = list(child)
                        # special handling of the <i><b> construct
                        if grandchildren:
                            # Handle '<text><i><b>Fordonsår</b>            <b>Faktor</b> </i></text>'
                            if child.text:
                                b.append(Textelement(txt(child.text), tag=child.tag))
                            b.append(Textelement(
                                txt(" ".join([x.text or "" for x in grandchildren])), tag="ib"))
                            if child.tail:
                                b.append(Textelement(txt(child.tail), tag=None))
                        else:
                            b.append(
                                Textelement(txt(child.text), tag=child.tag))
                            if child.tail:
                                b.append(Textelement(txt(child.tail), tag=None))
                    if element.tail and element.tail.strip():  # can this happen?
                        b.append(Textelement(txt(element.tail), tag=None))
                    page.append(b)
            # done reading the page
            self.append(page)
        self.log.debug("PDFReader initialized: %d pages, %d fontspecs" %
                       (len(self), len(self.fontspec)))

    def median_box_width(self, threshold=0):
        """Returns the median box width of all pages."""
        boxwidths = []
        for page in self:
            for box in page:
                if box.right - box.left < threshold:
                    continue
                # print "Box width: %d" % (box.right-box.left)
                boxwidths.append(box.right - box.left)
        boxwidths.sort()
        return boxwidths[int(len(boxwidths) / 2)]

class Page(CompoundElement, OrdinalElement):

    """Represents a Page in a PDF file. Has *width* and *height*
    properties."""

    tagname = "div"
    classname = "pdfpage"

    @property
    def id(self):
        """An identifier based on the page number, eg ``page003``."""
        # FIXME: this will only work for documents consisting of a
        # single PDF file, not multiple (see
        # pdfdocumentrepository.create_external_resources to
        # understand why)
        return "page%03d" % self.number

    # text: can be string, re obj or callable (gets called with the box obj)
    # fontsize: can be int or callable
    # fontname: can be string or callable
    # top,left,bottom,right
    def boundingbox(self, top=0, left=0, bottom=None, right=None):
        """A generator of :py:class:`ferenda.pdfreader.Textbox` objects that
        fit into the bounding box specified by the parameters.

        ``bottom`` and ``right`` default to the height and width of
        the page.
        """
        if not bottom:
            bottom = self.height
        if not right:
            right = self.width
        for box in self:
            if (box.top >= top and
                    box.left >= left and
                    box.bottom <= bottom and
                    box.right <= right):
                yield box

    def crop(self, top=0, left=0, bottom=None, right=None):
        """Removes any :py:class:`ferenda.pdfreader.Textbox` objects that does
        not fit within the bounding box specified by the parameters.

        The remaining boxes are moved so that their coordinates are
        relative to the top left corner of the bounding box, and the
        width and height of the page are adjusted. If the page has a
        background image that exists on disk, it is cropped in the
        same way using the ``convert`` command line tool.

        ``bottom`` and ``right`` default to the height and width of
        the page.
        """
        # same defaults as boundingbox -- needed here as well since
        # the values are used in calculations below
        if not bottom:
            bottom = self.height
        if not right:
            right = self.width
        newboxes = []
        for box in self.boundingbox(top, left, bottom, right):
            # The new origin is at (left, top), so every edge of the
            # box moves by that same offset
            box.top = box.top - top
            box.bottom = box.bottom - top
            box.left = box.left - left
            box.right = box.right - left
            newboxes.append(box)
        self[:] = newboxes
        self.width = right - left
        self.height = bottom - top
        # Then crop the background image (pages may lack one, in
        # which case background is None)
        if self.background and os.path.exists(self.background):
            cmdline = "convert %s -crop %dx%d+%d+%d +repage %s" % (self.background,
                                                                   self.width, self.height, left, top,
                                                                   self.background + ".new")
            (returncode, stdout, stderr) = util.runcmd(cmdline,
                                                       require_success=True)
            util.replace_if_different(
                "%s.new" % self.background, self.background)

    def __str__(self):
        textexcerpt = " ".join([str(x) for x in self])
        return "Page %d (%d x %d): '%s...'" % (self.number, self.width, self.height, str(textexcerpt[:40]))

    def __repr__(self):
        return '<%s %d (%dx%d): %d textboxes>' % (self.__class__.__name__,
                                                  self.number, self.width, self.height,
                                                  len(self))
class Textbox(CompoundElement):

    """A textbox is a amount of text on a PDF page, with *top*, *left*,
    *width* and *height* properties that specifies the bounding box
    of the text. The *font* property specifies the id of font used
    (use :py:meth:`~ferenda.pdfreader.Textbox.getfont` to get a dict
    of all font properties). A textbox consists of a list of
    Textelements which may differ in basic formatting (bold and or
    italics), but otherwise all text in a Textbox has the same font
    and size.

    """
    tagname = "p"
    classname = "textbox"

    def __init__(self, *args, **kwargs):
        assert 'top' in kwargs, "top attribute missing"
        assert 'left' in kwargs, "left attribute missing"
        assert 'width' in kwargs, "width attribute missing"
        assert 'height' in kwargs, "height attribute missing"
        assert 'font' in kwargs, "font id attribute missing"

        self.top = int(kwargs['top'])
        self.left = int(kwargs['left'])
        self.width = int(kwargs['width'])
        self.height = int(kwargs['height'])
        self.right = self.left + self.width
        self.bottom = self.top + self.height

        # self.__fontspecid = kwargs['font']
        self.font = kwargs['font']
        # Always set the fontspec mapping (fontspec id -> font
        # properties), so that __add__ and getfont can rely on it
        # even when no fontspec was provided
        self.__fontspec = kwargs.pop('fontspec', {})
        del kwargs['top']
        del kwargs['left']
        del kwargs['width']
        del kwargs['height']
        del kwargs['font']

        super(Textbox, self).__init__(*args, **kwargs)

    def __str__(self):
        return "".join(self)

    def __repr__(self):
        # <Textbox 30x18+278+257 "5.1">
        # <Textbox 430x14+287+315 "Regeringens förslag: Nä[...]g ska ">
        s = str(self)
        if len(s) > 40:
            s = s[:25] + "[...]" + s[-10:]

        if six.PY2:
            # s = repr(s)
            s = s.encode('ascii', 'replace')
        if self.getfont():
            fontinfo = "%s@%s " % (self.getfont()['family'],
                                   self.getfont()['size'])
        else:
            fontinfo = ""
        return '<%s %sx%s+%s+%s %s"%s">' % (self.__class__.__name__,
                                            self.width, self.height,
                                            self.left, self.top,
                                            fontinfo,
                                            s)

    def __add__(self, other):
        """Return a new Textbox that covers both boxes and contains
        the text of both, where adjacent text elements with the same
        tag have been merged. The font of the left operand is used."""
        # expand dimensions
        top = min(self.top, other.top)
        left = min(self.left, other.left)
        width = max(self.left + self.width,
                    other.left + other.width) - left
        height = max(self.top + self.height,
                     other.top + other.height) - top

        res = Textbox(top=top, left=left, width=width, height=height,
                      font=self.font,
                      fontspec=self.__fontspec)

        # add all text elements
        c = Textelement(tag=None)
        for e in itertools.chain(self, other):
            if e.tag != c.tag:
                # formatting changes: store what we have collected so
                # far (unless it's empty) and start anew
                if c:
                    res.append(c)
                c = Textelement(tag=e.tag)
            # the text of e must be added in both cases, or the first
            # element after each change of formatting is lost
            c = c + e
        res.append(c)
        return res

    def __iadd__(self, other):
        """Extend this Textbox in place so that it covers both boxes
        and contains the text of both. Text elements of ``other`` are
        merged with the preceding element if they have the same
        tag."""
        if len(self):
            c = self.pop()
        else:
            c = Textelement(tag=None)
        for e in other:
            if e.tag != c.tag:
                if c:
                    self.append(c)
                c = Textelement(tag=e.tag)
            c = c + e
        self.append(c)
        # The far edges must be calculated before top/left are
        # changed, since they're derived from the old top/left
        right = max(self.left + self.width, other.left + other.width)
        bottom = max(self.top + self.height, other.top + other.height)
        self.top = min(self.top, other.top)
        self.left = min(self.left, other.left)
        self.width = right - self.left
        self.height = bottom - self.top
        self.right = right
        self.bottom = bottom
        return self

    def as_xhtml(self, uri):
        element = super(Textbox, self).as_xhtml(uri)
        # FIXME: we should output these positioned style attributes
        # only when the resulting document is being serialized in a
        # positioned fashion. Possibly do some translation from PDF
        # points (which is what self.top, .left etc is using) and
        # pixels (which is what the CSS uses)
        #
        # CSS declarations are separated by semicolons, not commas
        element.set('style', 'top: %spx; left: %spx; height: %spx; width: %spx' %
                    (self.top, self.left, self.height, self.width))
        return element

    def getfont(self):
        """Returns a fontspec dict of all properties of the font used."""
        if self.font:
            return self.__fontspec[self.font]
        else:
            return {}
class Textelement(UnicodeElement):

    """Represent a single part of text where each letter has the exact
    same formatting. The ``tag`` property specifies whether the text
    as a whole is bold (``'b'``), italic (``'i'``), bold + italic
    (``'bi'``) or regular (``None``).
    """

    def _get_tagname(self):
        # regular (untagged) text is represented by a <span>
        return self.tag or "span"

    tagname = property(_get_tagname)

    def __add__(self, other):
        # the combined text keeps the formatting of the left operand
        combined = str(self) + str(other)
        return Textelement(combined, tag=self.tag)

# The below code fixes an error with incorrectly nested tags often
# found in pdftohtml generated xml. Main problem is that this relies
# on sgmllib which is not included in python3. This is commented out
# in the hope that more recent pdftohtml versions fix this problem in
# the right place

# import sgmllib
# from xml.sax.saxutils import escape as xml_escape
# import unicodedata
#
# class PDFXMLFix(sgmllib.SGMLParser):
#     selfclosing = ["fontspec"]
#     # preparations to remove invalid chars in handle_data
#     all_chars = (unichr(i) for i in range(0x10000))
#     control_chars = ''.join(
#         c for c in all_chars if unicodedata.category(c) == 'Cc')
#     # tab and newline are technically Control characters in
#     # unicode, but we want to keep them.
# control_chars = control_chars.replace("\t", "").replace("\n", "") # control_char_re = re.compile('[%s]' % re.escape(control_chars)) # # def __init__(self): # sgmllib.SGMLParser.__init__(self) # self.tags = [] # self.fp = None # # def fix(self, filename): # usetempfile = not self.fp # # if usetempfile: # tmpfile = mktemp() # self.fp = open(tmpfile, "w") # # self.fp.write('<?xml version="1.0" encoding="UTF-8"?>') # # f = open(filename) # while True: # s = f.read(8192) # if not s: # break # self.feed(s) # self.close() # # if usetempfile: # self.fp.close() # if util.replace_if_different(tmpfile, filename): # print(("replaced %s with %s" % (filename, tmpfile))) # else: # print(("%s was identical to %s" % (filename, tmpfile))) # # def close(self): # sgmllib.SGMLParser.close(self) # if self.tags: # sys.stderr.write( # "WARNING: opened tag(s) %s not closed" % self.tags) # self.fp.write( # "".join(["</%s>" % x for x in reversed(self.tags)])) # # def handle_decl(self, decl): # self.fp.write "Decl: ", decl # self.fp.write("<!%s>" % decl) # # def handle_data(self, data): # len_before = len(data) # data = xml_escape(self.control_char_re.sub('', data)) # len_after = len(data) # self.fp.write "Data: ", data.strip() # if len_before != len_after: # sys.stderr.write("WARNING: data changed from %s to %s chars: %r\n" % (len_before,len_after,data)) # self.fp.write(data) # # def unknown_starttag(self, start, attrs): # self.fp.write "Start: ", start, attrs # if start in self.selfclosing: # close = "/" # else: # close = "" # self.tags.append(start) # sys.stderr.write(repr(self.tags)+"\n") # if attrs: # fmt = ['%s="%s"' % (x[0], x[1]) for x in attrs] # self.fp.write("<%s %s%s>" % (start, " ".join(fmt), close)) # else: # self.fp.write("<%s>" % start) # # def unknown_endtag(self, end): # sys.stderr.write(repr(self.tags)+"\n") # start = self.tags.pop() # if end != start and end in self.tags: # sys.stderr.write("%s is not %s, working around\n" % (end, start)) # self.fp.write("</%s>" % start) 
# self.fp.write("</%s>" % end) # self.fp.write("<%s>" % start) # else: # self.fp.write("</%s>" % end)