# -*- coding: utf-8 -*-
"""The purpose of this module is to provide classes corresponding to
most elements (except ``<style>``, ``<script>`` and similar
non-document content elements) and core attributes (except ``@style``
and the ``%events`` attributes) of HTML4.01 and HTML5. It is not
totally compliant with the HTML4.01 and HTML5 standards, but is enough
to model most real-world HTML. It contains no provisions to ensure
that elements of a particular kind only contain allowed
sub-elements.
"""
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from builtins import *
import logging
import bs4
from . import CompoundElement
def elements_from_soup(soup,
                       remove_tags=('script', 'style', 'font', 'map', 'center'),
                       keep_attributes=('class', 'id', 'dir', 'lang', 'src', 'href', 'name', 'alt')):
    """Converts a BeautifulSoup tree into a tree of
    :py:class:`ferenda.elements.html.HTMLElement` objects. Some
    non-semantic attributes and tags are removed in the process.

    Comment nodes and whitespace-only text nodes are dropped. A tag
    whose name is in ``remove_tags`` is skipped together with
    everything below it. A tag with no entry in the module-level
    ``_tagmap`` is skipped as well, after a warning has been logged.

    :param soup: Soup object to convert
    :param remove_tags: Names of tags that should not be included
    :type remove_tags: iterable of str
    :param keep_attributes: Names of attributes to keep
    :type keep_attributes: iterable of str
    :returns: tree of element objects, or ``None`` if ``soup`` itself
              was skipped
    :rtype: ferenda.elements.html.HTMLElement
    """
    log = logging.getLogger(__name__)
    if soup.name in remove_tags:
        return None
    if soup.name not in _tagmap:
        # Pass the tag name as a logger argument so the message is only
        # interpolated if the record is actually emitted.
        log.warning("Can't render %s", soup.name)
        return None

    attrs = {}
    for attr in keep_attributes:
        if attr in soup.attrs:
            value = soup[attr]
            # A list-valued attribute is flattened to a single
            # space-separated string.
            attrs[attr] = " ".join(value) if isinstance(value, list) else value
    element = _tagmap[soup.name](**attrs)

    for child in soup.children:
        if isinstance(child, bs4.element.Comment):
            # Comment is tested first so that comments never fall
            # through to the text-node branch below.
            continue
        if isinstance(child, bs4.NavigableString):
            text = str(child)  # convert NavigableString to pure str
            if text.strip():  # ignore pure whitespace between tags
                element.append(text)
            continue
        subelement = elements_from_soup(child, remove_tags, keep_attributes)
        if subelement is not None:
            element.append(subelement)
    return element
# abstract class
class HTMLElement(CompoundElement):
    """Common base class of every HTML element class in this module.

    It is treated as abstract: concrete tags are modelled by its
    subclasses.
    """
class HTML(HTMLElement):
    """Models the ``<html>`` tag (the document root)."""
class Head(HTMLElement):
    """Models the ``<head>`` tag (the document metadata container)."""
# a title cannot contain subelements -- derive from UnicodeElement instead?
class Title(HTMLElement):
    """Models the ``<title>`` tag (the document title)."""
class Body(HTMLElement):
    """Models the ``<body>`` tag (the document body)."""

    def as_xhtml(self, uri, parent_uri=None):
        """Render through the inherited ``as_xhtml`` and then set the
        ``about`` attribute of the result to ``uri``.

        :param uri: value stored in the ``about`` attribute
        :param parent_uri: forwarded unchanged to the inherited method
        :returns: the element produced by the inherited method, with
                  ``about`` set
        """
        rendered = super(Body, self).as_xhtml(uri, parent_uri)
        rendered.set('about', uri)
        return rendered
# %block
class P(HTMLElement):
    """Models the ``<p>`` tag (a paragraph)."""
# %heading
class H1(HTMLElement):
    """Models the ``<h1>`` tag (a level 1 heading)."""
class H2(HTMLElement):
    """Models the ``<h2>`` tag (a level 2 heading)."""
class H3(HTMLElement):
    """Models the ``<h3>`` tag (a level 3 heading)."""
class H4(HTMLElement):
    """Models the ``<h4>`` tag (a level 4 heading)."""
class H5(HTMLElement):
    """Models the ``<h5>`` tag (a level 5 heading)."""
class H6(HTMLElement):
    """Models the ``<h6>`` tag (a level 6 heading)."""
# %list
class UL(HTMLElement):
    """Models the ``<ul>`` tag (an unordered list)."""
class OL(HTMLElement):
    """Models the ``<ol>`` tag (an ordered list)."""
class LI(HTMLElement):
    """Models the ``<li>`` tag (a list item)."""
# %preformatted
class Pre(HTMLElement):
    """Models the ``<pre>`` tag (preformatted text)."""
# other
class DL(HTMLElement):
    """Models the ``<dl>`` tag (a description list)."""
class DT(HTMLElement):
    """Models the ``<dt>`` tag (a term in a description list)."""
class DD(HTMLElement):
    """Models the ``<dd>`` tag (a description in a description list)."""
class Div(HTMLElement):
    """Models the ``<div>`` tag (a generic block container)."""
class Blockquote(HTMLElement):
    """Models the ``<blockquote>`` tag (a block quotation)."""
class HR(HTMLElement):
    """Models the ``<hr>`` tag (a horizontal rule)."""
class Table(HTMLElement):
    """Models the ``<table>`` tag (a table)."""
class Fieldset(HTMLElement):
    """Models the ``<fieldset>`` tag (a group of form controls)."""
class Address(HTMLElement):
    """Models the ``<address>`` tag (contact information)."""
# %fontstyle
class TT(HTMLElement):
    """Models the ``<tt>`` tag (teletype text)."""
class I(HTMLElement):
    """Models the ``<i>`` tag (italic text)."""
class B(HTMLElement):
    """Models the ``<b>`` tag (bold text)."""
class U(HTMLElement):
    """Models the ``<u>`` tag (underlined text)."""
class Big(HTMLElement):
    """Models the ``<big>`` tag (larger text)."""
class Small(HTMLElement):
    """Models the ``<small>`` tag (smaller text)."""
# %phrase
class Em(HTMLElement):
    """Models the ``<em>`` tag (emphasis)."""
class Strong(HTMLElement):
    """Models the ``<strong>`` tag (strong emphasis)."""
class Dfn(HTMLElement):
    """Models the ``<dfn>`` tag (the defining instance of a term)."""
class Code(HTMLElement):
    """Models the ``<code>`` tag (a fragment of computer code)."""
class Samp(HTMLElement):
    """Models the ``<samp>`` tag (sample program output)."""
class Kbd(HTMLElement):
    """Models the ``<kbd>`` tag (keyboard input)."""
class Var(HTMLElement):
    """Models the ``<var>`` tag (a variable)."""
class Cite(HTMLElement):
    """Models the ``<cite>`` tag (a citation)."""
class Abbr(HTMLElement):
    """Models the ``<abbr>`` tag (an abbreviation)."""
class Acronym(HTMLElement):
    """Models the ``<acronym>`` tag (an acronym)."""
# %special
class A(HTMLElement):
    """Models the ``<a>`` tag (an anchor / hyperlink)."""
class Img(HTMLElement):
    """Models the ``<img>`` tag (an image)."""
class Object(HTMLElement):
    """Models the ``<object>`` tag (an embedded object)."""
class Br(HTMLElement):
    """Models the ``<br>`` tag (a line break)."""
class Q(HTMLElement):
    """Models the ``<q>`` tag (an inline quotation)."""
class Sub(HTMLElement):
    """Models the ``<sub>`` tag (subscript text)."""
class Sup(HTMLElement):
    """Models the ``<sup>`` tag (superscript text)."""
class Span(HTMLElement):
    """Models the ``<span>`` tag (a generic inline container)."""
class BDO(HTMLElement):
    """Models the ``<bdo>`` tag (a bidirectional text override)."""
# %form
class Select(HTMLElement):
    """Models the ``<select>`` tag (a selection control)."""
class Textarea(HTMLElement):
    """Models the ``<textarea>`` tag (a multi-line text control)."""
class Label(HTMLElement):
    """Models the ``<label>`` tag (a caption for a form control)."""
# table
class Caption(HTMLElement):
    """Models the ``<caption>`` tag (a table caption)."""
class Thead(HTMLElement):
    """Models the ``<thead>`` tag (the header rows of a table)."""
class Tbody(HTMLElement):
    """Models the ``<tbody>`` tag (the body rows of a table)."""
class Colgroup(HTMLElement):
    """Models the ``<colgroup>`` tag (a group of table columns)."""
class Col(HTMLElement):
    """Models the ``<col>`` tag (a table column)."""
class TR(HTMLElement):
    """Models the ``<tr>`` tag (a table row)."""
class TH(HTMLElement):
    """Models the ``<th>`` tag (a table header cell)."""
class TD(HTMLElement):
    """Models the ``<td>`` tag (a table data cell)."""
# very special?
class Ins(HTMLElement):
    """Models the ``<ins>`` tag (inserted content)."""
class Del(HTMLElement):
    """Models the ``<del>`` tag (deleted content)."""
# new elements in HTML5 -- cannot be simply expressed in XHTML
# 1.1. Instead they're expressed as eg. '<div class="section">'
class HTML5Element(HTMLElement):
    """Base class for the HTML5-only element classes in this module.

    The tag name is fixed to ``div`` and ``classname`` is derived from
    the name of the concrete class, so that e.g. a ``Section`` instance
    reports ``tagname == "div"`` and ``classname == "section"``.
    """
    # Every subclass shares the generic block container tag.
    tagname = "div"

    def _get_classname(self):
        # Lower-cased name of the concrete (sub)class of this instance.
        concrete = self.__class__
        return concrete.__name__.lower()

    # Read-only attribute computed by _get_classname.
    classname = property(_get_classname)
class Article(HTML5Element):
    """Models the HTML5 ``<article>`` tag (a self-contained composition)."""
class Aside(HTML5Element):
    """Models the HTML5 ``<aside>`` tag (tangentially related content)."""
class Bdi(HTML5Element):
    """Models the HTML5 ``<bdi>`` tag (bidirectional text isolation)."""
class Details(HTML5Element):
    """Models the HTML5 ``<details>`` tag (a disclosure widget)."""
class Dialog(HTML5Element):
    """Models the HTML5 ``<dialog>`` tag (a dialog box)."""
class Summary(HTML5Element):
    """Models the HTML5 ``<summary>`` tag (the caption of a ``<details>``)."""
class Figcaption(HTML5Element):
    """Models the HTML5 ``<figcaption>`` tag (the caption of a figure)."""
class Hgroup(HTML5Element):
    """Models the HTML5 ``<hgroup>`` tag (a group of headings)."""
class Mark(HTML5Element):
    """Models the HTML5 ``<mark>`` tag (highlighted text)."""
class Meter(HTML5Element):
    """Models the HTML5 ``<meter>`` tag (a scalar gauge)."""
class Nav(HTML5Element):
    """Models the HTML5 ``<nav>`` tag (a block of navigation links)."""
class Progress(HTML5Element):
    """Models the HTML5 ``<progress>`` tag (a progress indicator)."""
class Ruby(HTML5Element):
    """Models the HTML5 ``<ruby>`` tag (a ruby annotation)."""
class Rt(HTML5Element):
    """Models the HTML5 ``<rt>`` tag (the text of a ruby annotation)."""
class Rp(HTML5Element):
    """Models the HTML5 ``<rp>`` tag (fallback parentheses for ruby text)."""
class Section(HTML5Element):
    """Models the HTML5 ``<section>`` tag (a generic document section)."""
class Time(HTML5Element):
    """Models the HTML5 ``<time>`` tag (a date and/or time)."""
class Wbr(HTML5Element):
    """Models the HTML5 ``<wbr>`` tag (a line break opportunity)."""
# audio, video, embed, canvas and similar non structural/semantic
# elements not included
# For use by elements_from_soup. FIXME: we should be able to build
# _tagmap dynamically.
# The mapping below refers to the following element classes, which are
# not defined earlier in this module. Without them, building the mapping
# raises NameError as soon as the module is imported.
class Form(HTMLElement):
    """Element corresponding to the ``<form>`` tag"""


class Input(HTMLElement):
    """Element corresponding to the ``<input>`` tag"""


class Button(HTMLElement):
    """Element corresponding to the ``<button>`` tag"""


class Tfoot(HTMLElement):
    """Element corresponding to the ``<tfoot>`` tag"""


class Figure(HTML5Element):
    """Element corresponding to the ``<figure>`` tag"""


class Footer(HTML5Element):
    """Element corresponding to the ``<footer>`` tag"""


class Header(HTML5Element):
    """Element corresponding to the ``<header>`` tag"""


# Maps a lower-case tag name, as found in ``soup.name``, to the element
# class that elements_from_soup instantiates for it. Tags without an
# entry here are skipped by elements_from_soup with a warning.
_tagmap = {
    'html': HTML,
    'head': Head,
    'title': Title,
    'body': Body,
    'p': P,
    'h1': H1,
    'h2': H2,
    'h3': H3,
    'h4': H4,
    'h5': H5,
    'h6': H6,
    'ul': UL,
    'ol': OL,
    'li': LI,
    'pre': Pre,
    'dl': DL,
    'dt': DT,
    'dd': DD,
    'div': Div,
    'blockquote': Blockquote,
    'form': Form,
    'hr': HR,
    'table': Table,
    'fieldset': Fieldset,
    'address': Address,
    'tt': TT,
    'i': I,
    'b': B,
    'u': U,
    'big': Big,
    'small': Small,
    'em': Em,
    'strong': Strong,
    'dfn': Dfn,
    'code': Code,
    'samp': Samp,
    'kbd': Kbd,
    'var': Var,
    'cite': Cite,
    'abbr': Abbr,
    'acronym': Acronym,
    'a': A,
    'img': Img,
    'object': Object,
    'br': Br,
    'q': Q,
    'sub': Sub,
    'sup': Sup,
    'span': Span,
    'bdo': BDO,
    'input': Input,
    'select': Select,
    'textarea': Textarea,
    'label': Label,
    'button': Button,
    'caption': Caption,
    'thead': Thead,
    'tfoot': Tfoot,
    'tbody': Tbody,
    'colgroup': Colgroup,
    'col': Col,
    'tr': TR,
    'th': TH,
    'td': TD,
    'ins': Ins,
    'del': Del,
    'article': Article,
    'aside': Aside,
    'bdi': Bdi,
    'details': Details,
    'dialog': Dialog,
    'summary': Summary,
    'figure': Figure,
    'figcaption': Figcaption,
    'footer': Footer,
    'header': Header,
    'hgroup': Hgroup,
    'mark': Mark,
    'meter': Meter,
    'nav': Nav,
    'progress': Progress,
    'ruby': Ruby,
    'rt': Rt,
    'rp': Rp,
    'section': Section,
    'time': Time,
    'wbr': Wbr,
}