Here's a nested table: |
This tag contains nothing but whitespace:
This p tag is cut off by
the end of the blockquote tag
This table contains bare markup
This document contains a surprise doctype
Tag name contains Unicode characters
"""
class SoupTest(object):
    """Shared helpers for parser tests.

    Every helper obtains its tree builder through the ``default_builder``
    property, so the same checks can be run against whichever builder that
    property returns.
    """

    @property
    def default_builder(self) -> Type[TreeBuilder]:
        """Return the module-level ``default_builder`` tree builder class."""
        return default_builder

    def soup(self, markup: _IncomingMarkup, **kwargs: Any) -> BeautifulSoup:
        """Build a Beautiful Soup object from markup.

        :param markup: The markup to parse.
        :param kwargs: Passed through to the ``BeautifulSoup`` constructor.
            A ``builder`` entry, if present, replaces ``self.default_builder``.
        :return: The parsed ``BeautifulSoup`` object.
        """
        builder = kwargs.pop("builder", self.default_builder)
        return BeautifulSoup(markup, builder=builder, **kwargs)

    def document_for(self, markup: str, **kwargs: Any) -> str:
        """Turn an HTML fragment into a document.

        The details depend on the builder: a default builder is instantiated
        with ``kwargs`` and asked to convert the fragment.

        :param markup: The fragment to wrap.
        :return: Whatever the builder's ``test_fragment_to_document`` returns.
        """
        return self.default_builder(**kwargs).test_fragment_to_document(markup)

    def assert_soup(
        self, to_parse: _IncomingMarkup, compare_parsed_to: Optional[str] = None
    ) -> None:
        """Parse some markup using Beautiful Soup and verify that
        the output markup is as expected.

        :param to_parse: The markup to parse with the default builder.
        :param compare_parsed_to: The expected fragment. When omitted,
            ``to_parse`` itself is the expectation and must be a ``str``.
        :raises AssertionError: If the output differs or the parser left
            bookkeeping state behind.
        """
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            assert isinstance(to_parse, str)
            compare_parsed_to = to_parse

        # Verify that the documents come out the same.
        assert obj.decode() == self.document_for(compare_parsed_to)

        # Also run some checks on the BeautifulSoup object itself:

        # Verify that every tag that was opened was eventually closed,
        # i.e. every count in the open tag counter is back to zero.
        assert all(v == 0 for v in obj.open_tag_counter.values())

        # The only tag in the tag stack is the one for the root
        # document.
        assert [obj.ROOT_TAG_NAME] == [x.name for x in obj.tagStack]

    # camelCase alias for assert_soup.
    assertSoupEquals = assert_soup

    def assertConnectedness(self, element: Tag) -> None:
        """Ensure that next_element and previous_element are properly
        set for all descendants of the given element.

        :param element: The tag whose descendants are walked in order.
        :raises AssertionError: If two consecutive descendants are not
            linked to each other in both directions.
        """
        earlier = None
        for e in element.descendants:
            # Compare against None explicitly: a node that is falsy (such as
            # an empty string-like node) must not skip the check for the
            # node that follows it.
            if earlier is not None:
                assert e == earlier.next_element
                assert earlier == e.previous_element
            earlier = e

    def linkage_validator(
        self, el: Tag, _recursive_call: bool = False
    ) -> Optional[PageElement]:
        """Ensure proper linkage throughout the document.

        Walks ``el.contents`` recursively and checks the sibling links and
        the next/previous element links of every node.

        :param el: The tag whose subtree is validated.
        :param _recursive_call: True only when the method calls itself; the
            caller then receives the last node visited in the subtree.
        :return: ``None`` for the outermost call; for recursive calls, the
            last node visited below ``el`` (or ``el`` itself if it has no
            contents).
        :raises AssertionError: If any link is inconsistent.
        """
        descendant = None
        # Document element should have no previous element or previous sibling.
        # It also shouldn't have a next sibling.
        if el.parent is None:
            assert (
                el.previous_element is None
            ), "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                el, el.previous_element, None
            )
            assert (
                el.previous_sibling is None
            ), "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                el, el.previous_sibling, None
            )
            assert (
                el.next_sibling is None
            ), "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                el, el.next_sibling, None
            )

        # ``child`` must exist even when ``el`` has no contents, because it
        # is read after the loop.
        child = None
        last_child = None
        last_idx = len(el.contents) - 1
        for idx, child in enumerate(el.contents):
            descendant = None

            # Parent should link next element to their first child
            # That child should have no previous sibling
            if idx == 0:
                if el.parent is not None:
                    assert (
                        el.next_element is child
                    ), "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                        el, el.next_element, child
                    )
                    assert (
                        child.previous_element is el
                    ), "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                        child, child.previous_element, el
                    )
                    assert (
                        child.previous_sibling is None
                    ), "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
                        child, child.previous_sibling, None
                    )

            # If not the first child, previous index should link as sibling to this index
            # Previous element should match the last index or the last bubbled up descendant
            else:
                previous = el.contents[idx - 1]
                assert (
                    child.previous_sibling is previous
                ), "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
                    child, child.previous_sibling, previous
                )
                assert (
                    previous.next_sibling is child
                ), "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                    previous, previous.next_sibling, child
                )

                if last_child is not None:
                    assert (
                        child.previous_element is last_child
                    ), "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
                        child, child.previous_element, last_child, child.parent.contents
                    )
                    assert (
                        last_child.next_element is child
                    ), "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        last_child, last_child.next_element, child
                    )

            if isinstance(child, Tag) and child.contents:
                descendant = self.linkage_validator(child, True)
                assert descendant is not None
                # A bubbled up descendant should have no next siblings
                assert (
                    descendant.next_sibling is None
                ), "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                    descendant, descendant.next_sibling, None
                )

            # Mark last child as either the bubbled up descendant or the current child
            if descendant is not None:
                last_child = descendant
            else:
                last_child = child

            # If last child, there are no next siblings
            if idx == last_idx:
                assert (
                    child.next_sibling is None
                ), "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                    child, child.next_sibling, None
                )

        # The last node visited is the deepest trailing descendant if there
        # was one, otherwise the last direct child, otherwise ``el`` itself.
        child = descendant if descendant is not None else child
        if child is None:
            child = el

        if not _recursive_call and child is not None:
            # Climb from ``el`` towards the root until an ancestor with a
            # next sibling is found; the last visited node must point at it.
            # If none exists, the last visited node must end the chain.
            target: Optional[Tag] = el
            while True:
                if target is None:
                    assert (
                        child.next_element is None
                    ), "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        child, child.next_element, None
                    )
                    break
                elif target.next_sibling is not None:
                    assert (
                        child.next_element is target.next_sibling
                    ), "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        child, child.next_element, target.next_sibling
                    )
                    break
                target = target.parent

            # We are done, so nothing to return
            return None
        else:
            # Return the child to the recursive caller
            return child

    def assert_selects(self, tags: Iterable[Tag], should_match: Iterable[str]) -> None:
        """Make sure that the given tags have the correct text.

        This is used in tests that define a bunch of tags, each
        containing a single string, and then select certain strings by
        some mechanism.

        :param tags: The selected tags; each one's ``.string`` is compared.
        :param should_match: The expected strings, in order. Any iterable
            is accepted, not only a list.
        :raises AssertionError: If the strings differ from the expectation.
        """
        # Materialize the expectation so a tuple or generator compares equal
        # to the list built from the tags.
        assert [tag.string for tag in tags] == list(should_match)

    def assert_selects_ids(
        self, tags: Iterable[Tag], should_match: Iterable[str]
    ) -> None:
        """Make sure that the given tags have the correct IDs.

        This is used in tests that define a bunch of tags, each
        containing a single string, and then select certain strings by
        some mechanism.

        :param tags: The selected tags; each one's ``"id"`` item is compared.
        :param should_match: The expected IDs, in order. Any iterable is
            accepted, not only a list.
        :raises AssertionError: If the IDs differ from the expectation.
        """
        assert [tag["id"] for tag in tags] == list(should_match)
class TreeBuilderSmokeTest(SoupTest):
    """Tests that are common to HTML and XML tree builders."""

    @pytest.mark.parametrize(
        "multi_valued_attributes",
        [None, {}, {"b": ["class"]}, {"*": ["notclass"]}],
    )
    def test_attribute_not_multi_valued(self, multi_valued_attributes):
        """A ``class`` value stays one string under each of these settings."""
        # NOTE(review): the markup literal is empty although the assertion
        # expects an <a> tag with class "a b c" -- the markup looks lost;
        # confirm against the upstream test.
        markup = ''
        parsed = self.soup(markup, multi_valued_attributes=multi_valued_attributes)
        assert parsed.a["class"] == "a b c"

    @pytest.mark.parametrize(
        "multi_valued_attributes",
        [{"a": ["class"]}, {"*": ["class"]}],
    )
    def test_attribute_multi_valued(self, multi_valued_attributes):
        """A ``class`` value is split into a list under each of these settings."""
        # NOTE(review): empty markup literal, same concern as in
        # test_attribute_not_multi_valued -- confirm.
        markup = ''
        parsed = self.soup(markup, multi_valued_attributes=multi_valued_attributes)
        assert parsed.a["class"] == ["a", "b", "c"]

    def test_invalid_doctype(self):
        """Parsing these documents must simply not raise."""
        # We don't have an official opinion on how these are parsed,
        # but they shouldn't crash any of the parsers.
        # NOTE(review): neither literal contains a doctype as written;
        # presumably the markup was stripped -- confirm.
        for markup in ("content", ""):
            self.soup(markup)

    def test_doctype_filtered(self):
        """No Doctype node survives when parsing is restricted to ``html``."""
        # NOTE(review): the markup is only newlines, which makes the check
        # below vacuous; presumably the doctype/html markup was stripped.
        markup = "\n\n"
        strainer = SoupStrainer(name="html")
        parsed = self.soup(markup, parse_only=strainer)
        assert not any(isinstance(node, Doctype) for node in parsed.descendants)

    def test_custom_attribute_dict_class(self):
        """The builder stores attributes in the dict class it was given."""

        class MyAttributeDict(dict):
            def __setitem__(self, key: str, value: Any):
                # Ignore the provided value and substitute a
                # hard-coded one.
                super().__setitem__(key, "OVERRIDDEN")

        # NOTE(review): the markup and the expected output are both just 'f',
        # yet the assertions rely on an <a> tag with attributes -- confirm.
        markup = 'f'
        custom_builder = self.default_builder(attribute_dict_class=MyAttributeDict)
        parsed = self.soup(markup, builder=custom_builder)
        anchor = parsed.a
        assert isinstance(anchor.attrs, MyAttributeDict)
        assert anchor["attr1"] == "OVERRIDDEN"
        # The assigned value is discarded by the custom __setitem__.
        anchor["attr3"] = True
        assert anchor["attr3"] == "OVERRIDDEN"

        expect = 'f'
        assert anchor.decode() == expect

    def test_custom_attribute_value_list_class(self):
        """Multi-valued attributes use the list class the builder was given."""

        class MyCustomAttributeValueList(AttributeValueList):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)
                # Every instance announces itself with one extra value.
                self.append("extra")

        custom_builder = self.default_builder(
            multi_valued_attributes={"*": {"attr2"}},
            attribute_value_list_class=MyCustomAttributeValueList,
        )
        # NOTE(review): markup is just 'f' although the assertions expect
        # attr1/attr2 on an <a> tag -- confirm.
        markup = 'f'
        parsed = self.soup(markup, builder=custom_builder)
        anchor = parsed.a
        assert anchor["attr1"] == "val1"
        assert anchor["attr2"] == ["val2", "extra"]
        assert isinstance(anchor["attr2"], MyCustomAttributeValueList)
class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
"""A basic test of a treebuilder's competence.
Any HTML treebuilder, present or future, should be able to pass
these tests. With invalid markup, there's room for interpretation,
and different parsers can handle it differently. But with the
markup in these tests, there's not much room for interpretation.
"""
def test_empty_element_tags(self):
    """Verify that all HTML4 and HTML5 empty element (aka void element) tags
    are handled correctly.
    """
    # Every name listed here must produce a tag, via new_tag(), that
    # reports itself as an empty-element tag.
    for name in [
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "keygen",
        "link",
        "menuitem",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
        "spacer",
        "frame",
    ]:
        soup = self.soup("")
        new_tag = soup.new_tag(name)
        assert new_tag.is_empty_element is True
    # NOTE(review): the string literals in the two calls below are split
    # across lines and are not valid Python as written; presumably the
    # original markup was lost -- restore from the upstream test.
    self.assert_soup("
", "
")
    self.assert_soup("
", "
")
def test_special_string_containers(self):
    """Strings inside style and script tags get their own container classes."""
    # NOTE(review): both markup literals are empty although the assertions
    # need style/script tags; presumably the markup was stripped -- confirm.
    parsed = self.soup("")
    assert isinstance(parsed.style.string, Stylesheet)
    assert isinstance(parsed.script.string, Script)

    parsed = self.soup("")
    css = parsed.style.string
    assert isinstance(css, Stylesheet)
    # The contents of the style tag resemble an HTML comment, but
    # it's not treated as a comment.
    assert css == ""
    assert isinstance(css, Stylesheet)
def test_pickle_and_unpickle_identity(self):
    """A pickle round trip yields a tree that decodes like the original."""
    original = self.soup("foo")
    restored = pickle.loads(pickle.dumps(original, pickle.HIGHEST_PROTOCOL))
    assert restored.__class__ == BeautifulSoup
    assert restored.decode() == original.decode()
def test_pickle_and_unpickle_bad_markup(self):
    """A pickle round trip of this document keeps a non-XML builder."""
    # NOTE(review): the markup is only the text "blabla"; the "bad markup"
    # the test name refers to appears to have been stripped -- confirm.
    markup = """
blabla
"""
    serialized = pickle.dumps(self.soup(markup), pickle.HIGHEST_PROTOCOL)
    restored = pickle.loads(serialized)
    assert restored.builder.is_xml is False
def assertDoctypeHandled(self, doctype_fragment: str) -> None:
    """Assert that a given doctype string is handled correctly.

    :param doctype_fragment: The text placed after the doctype keyword in
        the generated document.
    """
    expected_prefix, parsed = self._document_with_doctype(doctype_fragment)

    # The first node of the document must be a Doctype holding the fragment.
    first_node = parsed.contents[0]
    assert first_node.__class__ == Doctype
    assert first_node == doctype_fragment

    # The encoded document must start with the doctype declaration.
    encoded = parsed.encode("utf8")
    assert encoded[: len(expected_prefix)] == expected_prefix

    # Make sure that the doctype was correctly associated with the
    # parse tree and that the rest of the document parsed.
    paragraph = parsed.p
    assert paragraph is not None
    assert paragraph.contents[0] == "foo"
def _document_with_doctype(
self, doctype_fragment: str, doctype_string: str = "DOCTYPE"
) -> Tuple[bytes, BeautifulSoup]:
"""Generate and parse a document with the given doctype."""
doctype = "" % (doctype_string, doctype_fragment)
markup = doctype + "\nfoo
"
soup = self.soup(markup)
return doctype.encode("utf8"), soup
def test_normal_doctypes(self):
    """Make sure normal, everyday HTML doctypes are handled correctly."""
    everyday_doctypes = (
        "html",
        'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"',
    )
    for fragment in everyday_doctypes:
        self.assertDoctypeHandled(fragment)
def test_empty_doctype(self):
    """The first node of the parsed document strips to an empty string."""
    # NOTE(review): the markup literal is empty, so there may be no first
    # node at all; presumably an empty doctype declaration was stripped
    # from it -- confirm.
    first_node = self.soup("").contents[0]
    assert first_node.strip() == ""
def test_mixed_case_doctype(self):
    """A lowercase or mixed-case doctype becomes a Doctype."""
    for keyword in ("doctype", "DocType"):
        declaration, parsed = self._document_with_doctype("html", keyword)

        # Make sure a Doctype object was created and that the DOCTYPE
        # is uppercase.
        first_node = parsed.contents[0]
        assert first_node.__class__ == Doctype
        assert first_node == "html"
        # NOTE(review): the expected bytes literal is empty; given the
        # comment above it presumably held an uppercase doctype
        # declaration that was stripped -- confirm.
        output_prefix = parsed.encode("utf8")[: len(declaration)]
        assert output_prefix == b""

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        assert parsed.p.contents[0] == "foo"
def test_public_doctype_with_url(self):
    """A PUBLIC doctype carrying a DTD URL is handled correctly."""
    self.assertDoctypeHandled(
        'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
    )
def test_system_doctype(self):
    """A doctype with a SYSTEM identifier is handled correctly."""
    fragment = 'foo SYSTEM "http://www.example.com/"'
    self.assertDoctypeHandled(fragment)
def test_namespaced_system_doctype(self):
    """We can handle a namespaced doctype with a system ID."""
    fragment = 'xsl:stylesheet SYSTEM "htmlent.dtd"'
    self.assertDoctypeHandled(fragment)
def test_namespaced_public_doctype(self):
    """We can handle a namespaced doctype with a public ID."""
    fragment = 'xsl:stylesheet PUBLIC "htmlent.dtd"'
    self.assertDoctypeHandled(fragment)
def test_real_xhtml_document(self):
    """A real XHTML document should come out more or less the same as it went in."""
    # NOTE(review): the literal holds only two lines of text; the XHTML
    # markup the test name refers to appears to have been stripped.
    markup = b"""
Hello.
Goodbye.
"""
    with warnings.catch_warnings(record=True) as caught:
        parsed = self.soup(markup)

    # Compare with newlines removed on both sides.
    flattened_output = parsed.encode("utf-8").replace(b"\n", b"")
    flattened_input = markup.replace(b"\n", b"")
    assert flattened_output == flattened_input

    # No warning was issued about parsing an XML document as HTML,
    # because XHTML is both.
    assert caught == []
def test_namespaced_html(self):
    """Namespaced XML parsed as HTML is treated as HTML with weird tag names."""
    # NOTE(review): the literal is just b"content" although two "ns1:foo"
    # tags are expected below; presumably the markup was stripped.
    markup = b"""content"""
    with warnings.catch_warnings(record=True) as caught:
        parsed = self.soup(markup)

    assert len(parsed.find_all("ns1:foo")) == 2

    # n.b. no "you're parsing XML as HTML" warning was given
    # because there was no XML declaration.
    assert caught == []
def test_detect_xml_parsed_as_html(self):
    """Parsing XML as HTML warns once, but basic stuff still works."""
    # NOTE(review): the literal is just b"string"; the XML declaration and
    # <tag> element the assertions rely on appear to have been stripped.
    markup = b"""string"""
    with warnings.catch_warnings(record=True) as caught:
        parsed = self.soup(markup)
        assert parsed.tag.string == "string"

    # Exactly one warning must have been recorded.
    (recorded,) = caught
    assert isinstance(recorded.message, XMLParsedAsHTMLWarning)
    assert str(recorded.message) == XMLParsedAsHTMLWarning.MESSAGE

    # NOTE: the warning is not issued if the document appears to
    # be XHTML (tested with test_real_xhtml_document in the
    # superclass) or if there is no XML declaration (tested with
    # test_namespaced_html in the superclass).
def test_processing_instruction(self):
    """A document round-trips both as Unicode and as a bytestring."""
    # We test both Unicode and bytestring to verify that
    # process_markup correctly sets processing_instruction_class
    # even when the markup is already Unicode and there is no
    # need to process anything.
    # NOTE(review): both literals are empty; presumably the processing
    # instruction markup was stripped -- confirm.
    text_markup = """"""
    assert self.soup(text_markup).decode() == text_markup

    bytes_markup = b""""""
    assert self.soup(bytes_markup).encode("utf8") == bytes_markup
def test_deepcopy(self):
    """Make sure you can copy the tree builder.

    This is important because the builder is part of a
    BeautifulSoup object, and we want to be able to copy that.
    """
    builder = self.default_builder
    # Only the absence of an exception is being checked.
    copy.deepcopy(builder)
def test_p_tag_is_never_empty_element(self):
    """A <p> tag is never designated as an empty-element tag.

    Even if the markup shows it as an empty-element tag, it
    shouldn't be presented that way.
    """
    # NOTE(review): the markup literal below is split across two lines and
    # is not valid Python as written, and the expected string is empty;
    # presumably the <p> markup was lost -- restore from the upstream test.
    soup = self.soup("
")
    assert not soup.p.is_empty_element
    assert str(soup.p) == ""
def test_unclosed_tags_get_closed(self):
    """A tag that's not closed by the end of the document should be closed.

    This applies to all tags except empty-element tags.
    """
    # NOTE(review): the literals below are empty or split across lines (not
    # valid Python as written); presumably each call originally paired an
    # unclosed tag with its expected closed form -- restore from upstream.
    self.assert_soup("", "
")
    self.assert_soup("", "")
    self.assert_soup("
", "
")
def test_br_is_always_empty_element_tag(self):
    """A <br> tag is designated as an empty-element tag.

    Some parsers treat the markup below as one tag, some parsers as
    two tags, but it should always be an empty-element tag.
    """
    # NOTE(review): both string literals below are split across lines and
    # are not valid Python as written; presumably the <br> markup was
    # lost -- restore from the upstream test.
    soup = self.soup("
")
    assert soup.br.is_empty_element
    assert str(soup.br) == "
"
def test_nested_formatting_elements(self):
    """The markup comes back out of the parser unchanged."""
    # NOTE(review): the markup literal is empty; presumably the nested
    # formatting tags the test name refers to were stripped -- confirm.
    markup = ""
    self.assert_soup(markup)
def test_double_head(self):
    """The script tag found in the document keeps its ``type`` attribute."""
    # NOTE(review): the literal holds plain text only, yet a <script> tag
    # is looked up below; presumably the markup was stripped -- confirm.
    document = """
Ordinary HEAD element test
Hello, world!
"""
    script = self.soup(document).find("script")
    assert script["type"] == "text/javascript"
def test_comment(self):
    """Comments are represented as Comment objects linked into the tree."""
    # Comments are represented as Comment objects.
    # NOTE(review): the markup literal below is split across two lines (not
    # valid Python as written) and contains no comment syntax, although a
    # comment "foobar" is searched for between "foo" and "baz"; presumably
    # the markup was lost -- restore from the upstream test.
    markup = "foobaz
"
    # The markup must survive a parse unchanged.
    self.assert_soup(markup)
    soup = self.soup(markup)
    comment = soup.find(string="foobar")
    assert comment.__class__ == Comment
    # The comment is properly integrated into the tree.
    # It must sit directly after "foo" ...
    foo = soup.find(string="foo")
    assert comment == foo.next_element
    # ... and directly before "baz".
    baz = soup.find(string="baz")
    assert comment == baz.previous_element
def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in and