# Source code for ferenda.elements.html

# -*- coding: utf-8 -*-
"""The purpose of this module is to provide classes corresponding to
most elements (except ``<style>``, ``<script>`` and similar
non-document content elements) and core attributes (except ``@style``
and the ``%events`` attributes) of HTML4.01 and HTML5. It is not
totally compliant with the HTML4.01 and HTML5 standards, but is enough
to model most real-world HTML. It contains no provisions to ensure
that elements of a particular kind only contain allowed
sub-elements.

"""
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

import logging

import bs4

from . import CompoundElement


def elements_from_soup(soup,
                       remove_tags=('script', 'style', 'font', 'map', 'center'),
                       keep_attributes=('class', 'id', 'dir', 'lang', 'src', 'href', 'name', 'alt')):
    """Build a tree of :py:class:`ferenda.elements.html.HTMLElement`
    objects from a BeautifulSoup tag, recursing into its children.

    Tags listed in ``remove_tags`` are dropped together with everything
    they contain. Tags that have no entry in ``_tagmap`` are dropped the
    same way, after a warning has been logged. Only the attributes named
    in ``keep_attributes`` are carried over; an attribute whose value is
    a list is flattened into a single space-separated string. Comments
    and whitespace-only strings are discarded.

    :param soup: Soup tag to convert
    :param remove_tags: Names of tags that should not be included
    :type remove_tags: list
    :param keep_attributes: Names of attributes to keep
    :type keep_attributes: list
    :returns: tree of element objects, or ``None`` if the tag itself
              was dropped
    :rtype: ferenda.elements.html.HTMLElement
    """
    log = logging.getLogger(__name__)
    tagname = soup.name
    if tagname in remove_tags:
        return None
    if tagname not in _tagmap:
        log.warning("Can't render %s" % tagname)
        return None

    # Collect the whitelisted attributes that are present on this tag.
    kept = {}
    for key in keep_attributes:
        if key not in soup.attrs:
            continue
        value = soup[key]
        kept[key] = " ".join(value) if isinstance(value, list) else value

    element = _tagmap[tagname](**kept)

    for node in soup.children:
        # Comment must be tested before NavigableString, which is why
        # the two checks are kept in this order.
        if isinstance(node, bs4.element.Comment):
            continue
        if isinstance(node, bs4.NavigableString):
            text = str(node)  # plain str instead of NavigableString
            if text.strip() != "":  # skip pure whitespace between tags
                element.append(text)
            continue
        converted = elements_from_soup(node, remove_tags, keep_attributes)
        if converted is not None:
            element.append(converted)
    return element
# abstract class
class HTMLElement(CompoundElement):
    """Common base class of every HTML element class in this module.

    Adds nothing to :py:class:`CompoundElement`; it is meant to be
    subclassed rather than used directly (this is not enforced).
    """
class HTML(HTMLElement):
    """Models the ``<html>`` element."""
# a title cannot contain subelements -- derive from UnicodeElement instead?
class Title(HTMLElement):
    """Models the ``<title>`` element."""
class Body(HTMLElement):
    """Models the ``<body>`` element."""

    def as_xhtml(self, uri, parent_uri=None):
        """Render through the parent class, then tag the result with *uri*.

        :param uri: value stored in the ``about`` attribute of the result;
                    also passed on to the parent implementation
        :param parent_uri: passed on unchanged to the parent implementation
        :returns: whatever the parent ``as_xhtml`` returns, with ``about``
                  set to ``uri``
        """
        rendered = super(Body, self).as_xhtml(uri, parent_uri)
        rendered.set('about', uri)
        return rendered
# %block
class P(HTMLElement):
    """Models the ``<p>`` (paragraph) element."""
# %heading
class H1(HTMLElement):
    """Models the ``<h1>`` (level 1 heading) element."""
class H2(HTMLElement):
    """Models the ``<h2>`` (level 2 heading) element."""
class H3(HTMLElement):
    """Models the ``<h3>`` (level 3 heading) element."""
class H4(HTMLElement):
    """Models the ``<h4>`` (level 4 heading) element."""
class H5(HTMLElement):
    """Models the ``<h5>`` (level 5 heading) element."""
class H6(HTMLElement):
    """Models the ``<h6>`` (level 6 heading) element."""
# %list
class UL(HTMLElement):
    """Models the ``<ul>`` (unordered list) element."""
class OL(HTMLElement):
    """Models the ``<ol>`` (ordered list) element."""
class LI(HTMLElement):
    """Models the ``<li>`` (list item) element."""
# %preformatted
class Pre(HTMLElement):
    """Models the ``<pre>`` (preformatted text) element."""
# other
class DL(HTMLElement):
    """Models the ``<dl>`` (definition list) element."""
class DT(HTMLElement):
    """Models the ``<dt>`` (definition term) element."""
class DD(HTMLElement):
    """Models the ``<dd>`` (definition description) element."""
class Div(HTMLElement):
    """Models the ``<div>`` (generic block container) element."""
class Blockquote(HTMLElement):
    """Models the ``<blockquote>`` (block quotation) element."""
class Form(HTMLElement):
    """Models the ``<form>`` element."""
class HR(HTMLElement):
    """Models the ``<hr>`` (horizontal rule) element."""
class Table(HTMLElement):
    """Models the ``<table>`` element."""
class Fieldset(HTMLElement):
    """Models the ``<fieldset>`` (group of form controls) element."""
class Address(HTMLElement):
    """Models the ``<address>`` (contact information) element."""
# %fontstyle
class TT(HTMLElement):
    """Models the ``<tt>`` (teletype / monospaced text) element."""
class I(HTMLElement):
    """Models the ``<i>`` (italic text) element."""
class B(HTMLElement):
    """Models the ``<b>`` (bold text) element."""
class U(HTMLElement):
    """Models the ``<u>`` (underlined text) element."""
class Big(HTMLElement):
    """Models the ``<big>`` (larger text) element."""
class Small(HTMLElement):
    """Models the ``<small>`` (smaller text) element."""
# %phrase
class Em(HTMLElement):
    """Models the ``<em>`` (emphasis) element."""
class Strong(HTMLElement):
    """Models the ``<strong>`` (strong emphasis) element."""
class Dfn(HTMLElement):
    """Models the ``<dfn>`` (defining instance of a term) element."""
class Code(HTMLElement):
    """Models the ``<code>`` (code fragment) element."""
class Samp(HTMLElement):
    """Models the ``<samp>`` (sample output) element."""
class Kbd(HTMLElement):
    """Models the ``<kbd>`` (keyboard input) element."""
class Var(HTMLElement):
    """Models the ``<var>`` (variable) element."""
class Cite(HTMLElement):
    """Models the ``<cite>`` (citation) element."""
class Abbr(HTMLElement):
    """Models the ``<abbr>`` (abbreviation) element."""
class Acronym(HTMLElement):
    """Models the ``<acronym>`` element."""
# %special
class A(HTMLElement):
    """Models the ``<a>`` (anchor / hyperlink) element."""
class Img(HTMLElement):
    """Models the ``<img>`` (image) element."""
class Object(HTMLElement):
    """Models the ``<object>`` (embedded object) element."""
class Br(HTMLElement):
    """Models the ``<br>`` (line break) element."""
class Q(HTMLElement):
    """Models the ``<q>`` (inline quotation) element."""
class Sub(HTMLElement):
    """Models the ``<sub>`` (subscript) element."""
class Sup(HTMLElement):
    """Models the ``<sup>`` (superscript) element."""
class Span(HTMLElement):
    """Models the ``<span>`` (generic inline container) element."""
class BDO(HTMLElement):
    """Models the ``<bdo>`` (bidirectional text override) element."""
# %form
class Input(HTMLElement):
    """Models the ``<input>`` (form input control) element."""
class Select(HTMLElement):
    """Models the ``<select>`` (option selector) element."""
class Textarea(HTMLElement):
    """Models the ``<textarea>`` (multi-line text input) element."""
class Label(HTMLElement):
    """Models the ``<label>`` (form control label) element."""
class Button(HTMLElement):
    """Models the ``<button>`` element."""
# table
class Caption(HTMLElement):
    """Models the ``<caption>`` (table caption) element."""
class Thead(HTMLElement):
    """Models the ``<thead>`` (table header group) element."""
class Tfoot(HTMLElement):
    """Models the ``<tfoot>`` (table footer group) element."""
class Tbody(HTMLElement):
    """Models the ``<tbody>`` (table body group) element."""
class Colgroup(HTMLElement):
    """Models the ``<colgroup>`` (table column group) element."""
class Col(HTMLElement):
    """Models the ``<col>`` (table column) element."""
class TR(HTMLElement):
    """Models the ``<tr>`` (table row) element."""
class TH(HTMLElement):
    """Models the ``<th>`` (table header cell) element."""
class TD(HTMLElement):
    """Models the ``<td>`` (table data cell) element."""
# very special?
class Ins(HTMLElement):
    """Models the ``<ins>`` (inserted text) element."""
class Del(HTMLElement):
    """Models the ``<del>`` (deleted text) element."""
# New elements in HTML5 -- these cannot be simply expressed in XHTML 1.1.
# Instead they're expressed as e.g. '<div class="section">'.
class HTML5Element(HTMLElement):
    """Base class for the HTML5-only element classes in this module.

    Every subclass shares the tag name ``"div"`` and exposes its own
    lowercased class name through the ``classname`` property.
    """

    tagname = "div"

    def _get_classname(self):
        """Return the lowercased name of the instance's class."""
        cls = self.__class__
        return cls.__name__.lower()

    # Read-only: the value is always derived from the concrete class name.
    classname = property(_get_classname)
class Article(HTML5Element):
    """Models the HTML5 ``<article>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"article"``.
    """
class Aside(HTML5Element):
    """Models the HTML5 ``<aside>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"aside"``.
    """
class Bdi(HTML5Element):
    """Models the HTML5 ``<bdi>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"bdi"``.
    """
class Details(HTML5Element):
    """Models the HTML5 ``<details>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"details"``.
    """
class Dialog(HTML5Element):
    """Models the HTML5 ``<dialog>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"dialog"``.
    """
class Summary(HTML5Element):
    """Models the HTML5 ``<summary>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"summary"``.
    """
class Figure(HTML5Element):
    """Models the HTML5 ``<figure>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"figure"``.
    """
class Figcaption(HTML5Element):
    """Models the HTML5 ``<figcaption>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"figcaption"``.
    """
class Hgroup(HTML5Element):
    """Models the HTML5 ``<hgroup>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"hgroup"``.
    """
class Mark(HTML5Element):
    """Models the HTML5 ``<mark>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"mark"``.
    """
class Meter(HTML5Element):
    """Models the HTML5 ``<meter>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"meter"``.
    """
class Progress(HTML5Element):
    """Models the HTML5 ``<progress>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"progress"``.
    """
class Ruby(HTML5Element):
    """Models the HTML5 ``<ruby>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"ruby"``.
    """
class Rt(HTML5Element):
    """Models the HTML5 ``<rt>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"rt"``.
    """
class Rp(HTML5Element):
    """Models the HTML5 ``<rp>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"rp"``.
    """
class Section(HTML5Element):
    """Models the HTML5 ``<section>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"section"``.
    """
class Time(HTML5Element):
    """Models the HTML5 ``<time>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"time"``.
    """
class Wbr(HTML5Element):
    """Models the HTML5 ``<wbr>`` element.

    Inherits ``tagname`` ``"div"``; ``classname`` is ``"wbr"``.
    """
# audio, video, embed, canvas and similar non structural/semantic
# elements not included


# The tag map below refers to Head, Footer, Header and Nav. They are
# defined here, directly ahead of the map, so that those names resolve
# when the module is imported.
# NOTE(review): if these classes are also defined elsewhere in this
# module, drop these duplicates in favour of the existing definitions.
class Head(HTMLElement):
    """Element corresponding to the ``<head>`` tag"""


class Footer(HTML5Element):
    """Element corresponding to the ``<footer>`` tag"""


class Header(HTML5Element):
    """Element corresponding to the ``<header>`` tag"""


class Nav(HTML5Element):
    """Element corresponding to the ``<nav>`` tag"""


# Maps a lowercase HTML tag name to the class that models it. For use
# by elements_from_soup, which drops any tag that has no entry here.
# FIXME: we should be able to build _tagmap dynamically.
_tagmap = {
    'html': HTML,
    'head': Head,
    'title': Title,
    'body': Body,
    'p': P,
    'h1': H1,
    'h2': H2,
    'h3': H3,
    'h4': H4,
    'h5': H5,
    'h6': H6,
    'ul': UL,
    'ol': OL,
    'li': LI,
    'pre': Pre,
    'dl': DL,
    'dt': DT,
    'dd': DD,
    'div': Div,
    'blockquote': Blockquote,
    'form': Form,
    'hr': HR,
    'table': Table,
    'fieldset': Fieldset,
    'address': Address,
    'tt': TT,
    'i': I,
    'b': B,
    'u': U,
    'big': Big,
    'small': Small,
    'em': Em,
    'strong': Strong,
    'dfn': Dfn,
    'code': Code,
    'samp': Samp,
    'kbd': Kbd,
    'var': Var,
    'cite': Cite,
    'abbr': Abbr,
    'acronym': Acronym,
    'a': A,
    'img': Img,
    'object': Object,
    'br': Br,
    'q': Q,
    'sub': Sub,
    'sup': Sup,
    'span': Span,
    'bdo': BDO,
    'input': Input,
    'select': Select,
    'textarea': Textarea,
    'label': Label,
    'button': Button,
    'caption': Caption,
    'thead': Thead,
    'tfoot': Tfoot,
    'tbody': Tbody,
    'colgroup': Colgroup,
    'col': Col,
    'tr': TR,
    'th': TH,
    'td': TD,
    'ins': Ins,
    'del': Del,
    'article': Article,
    'aside': Aside,
    'bdi': Bdi,
    'details': Details,
    'dialog': Dialog,
    'summary': Summary,
    'figure': Figure,
    'figcaption': Figcaption,
    'footer': Footer,
    'header': Header,
    'hgroup': Hgroup,
    'mark': Mark,
    'meter': Meter,
    'nav': Nav,
    'progress': Progress,
    'ruby': Ruby,
    'rt': Rt,
    'rp': Rp,
    'section': Section,
    'time': Time,
    'wbr': Wbr,
}