# -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods.
The tree traversal methods are the main advantage of using Beautiful
Soup over just using a parser.
Different parsers will build different Beautiful Soup trees given the
same markup, but all Beautiful Soup trees can be traversed with the
methods tested here.
"""
from pdb import set_trace
import copy
import pickle
import re
import warnings
from bs4 import BeautifulSoup
from bs4.builder import (
builder_registry,
HTMLParserTreeBuilder,
)
from bs4.element import (
PY3K,
CData,
Comment,
Declaration,
Doctype,
NavigableString,
SoupStrainer,
Tag,
)
from bs4.testing import (
SoupTest,
skipIf,
)
XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
class TreeTest(SoupTest):
def assertSelects(self, tags, should_match):
"""Make sure that the given tags have the correct text.
This is used in tests that define a bunch of tags, each
containing a single string, and then select certain strings by
some mechanism.
"""
self.assertEqual([tag.string for tag in tags], should_match)
def assertSelectsIDs(self, tags, should_match):
"""Make sure that the given tags have the correct IDs.
This is used in tests that define a bunch of tags, each
containing a single string, and then select certain strings by
some mechanism.
"""
self.assertEqual([tag['id'] for tag in tags], should_match)
class TestFind(TreeTest):
"""Basic tests of the find() method.
find() just calls find_all() with limit=1, so it's not tested all
that thouroughly here.
"""
def test_find_tag(self):
soup = self.soup("1234")
self.assertEqual(soup.find("b").string, "2")
def test_unicode_text_find(self):
soup = self.soup('
')
str(soup)
self.assertEqual("here it is", soup.find(id='Räksmörgås').text)
def test_find_everything(self):
"""Test an optimization that finds all tags."""
soup = self.soup("foobar")
self.assertEqual(2, len(soup.find_all()))
def test_find_everything_with_name(self):
"""Test an optimization that finds all tags with a given name."""
soup = self.soup("foobarbaz")
self.assertEqual(2, len(soup.find_all('a')))
class TestFindAll(TreeTest):
"""Basic tests of the find_all() method."""
def test_find_all_text_nodes(self):
"""You can search the tree for text nodes."""
soup = self.soup("Foobar\xbb")
# Exact match.
self.assertEqual(soup.find_all(string="bar"), ["bar"])
self.assertEqual(soup.find_all(text="bar"), ["bar"])
# Match any of a number of strings.
self.assertEqual(
soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
# Match a regular expression.
self.assertEqual(soup.find_all(text=re.compile('.*')),
["Foo", "bar", '\xbb'])
# Match anything.
self.assertEqual(soup.find_all(text=True),
["Foo", "bar", '\xbb'])
def test_find_all_limit(self):
"""You can limit the number of items returned by find_all."""
soup = self.soup("12345")
self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
self.assertSelects(soup.find_all('a', limit=1), ["1"])
self.assertSelects(
soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"])
# A limit of 0 means no limit.
self.assertSelects(
soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"])
def test_calling_a_tag_is_calling_findall(self):
soup = self.soup("123")
self.assertSelects(soup('a', limit=1), ["1"])
self.assertSelects(soup.b(id="/code3/bs4-save/tests/foo"), ["3"])
def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
soup = self.soup("")
# Create a self-referential list.
l = []
l.append(l)
# Without special code in _normalize_search_value, this would cause infinite
# recursion.
self.assertEqual([], soup.find_all(l))
def test_find_all_resultset(self):
"""All find_all calls return a ResultSet"""
soup = self.soup("")
result = soup.find_all("a")
self.assertTrue(hasattr(result, "source"))
result = soup.find_all(True)
self.assertTrue(hasattr(result, "source"))
result = soup.find_all(text="/code3/bs4-save/tests/foo")
self.assertTrue(hasattr(result, "source"))
class TestFindAllBasicNamespaces(TreeTest):
def test_find_by_namespaced_name(self):
soup = self.soup('4')
self.assertEqual("4", soup.find("mathml:msqrt").string)
self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name)
class TestFindAllByName(TreeTest):
"""Test ways of finding tags by tag name."""
def setUp(self):
super(TreeTest, self).setUp()
self.tree = self.soup("""First tag.Second tag.Third Nested tag. tag.""")
def test_find_all_by_tag_name(self):
# Find all the tags.
self.assertSelects(
self.tree.find_all('a'), ['First tag.', 'Nested tag.'])
def test_find_all_by_name_and_text(self):
self.assertSelects(
self.tree.find_all('a', text='First tag.'), ['First tag.'])
self.assertSelects(
self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])
self.assertSelects(
self.tree.find_all('a', text=re.compile("tag")),
['First tag.', 'Nested tag.'])
def test_find_all_on_non_root_element(self):
# You can call find_all on any node, not just the root.
self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])
def test_calling_element_invokes_find_all(self):
self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])
def test_find_all_by_tag_strainer(self):
self.assertSelects(
self.tree.find_all(SoupStrainer('a')),
['First tag.', 'Nested tag.'])
def test_find_all_by_tag_names(self):
self.assertSelects(
self.tree.find_all(['a', 'b']),
['First tag.', 'Second tag.', 'Nested tag.'])
def test_find_all_by_tag_dict(self):
self.assertSelects(
self.tree.find_all({'a' : True, 'b' : True}),
['First tag.', 'Second tag.', 'Nested tag.'])
def test_find_all_by_tag_re(self):
self.assertSelects(
self.tree.find_all(re.compile('^[ab]$')),
['First tag.', 'Second tag.', 'Nested tag.'])
def test_find_all_with_tags_matching_method(self):
# You can define an oracle method that determines whether
# a tag matches the search.
def id_matches_name(tag):
return tag.name == tag.get('id')
tree = self.soup("""Match 1.Does not match.Match 2.""")
self.assertSelects(
tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
class TestFindAllByAttribute(TreeTest):
def test_find_all_by_attribute_name(self):
# You can pass in keyword arguments to find_all to search by
# attribute.
tree = self.soup("""
Matching a.
Non-matching Matching b.a.
""")
self.assertSelects(tree.find_all(id='first'),
["Matching a.", "Matching b."])
def test_find_all_by_utf8_attribute_value(self):
peace = "םולש".encode("utf8")
data = ''.encode("utf8")
soup = self.soup(data)
self.assertEqual([soup.a], soup.find_all(title=peace))
self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))
def test_find_all_by_attribute_dict(self):
# You can pass in a dictionary as the argument 'attrs'. This
# lets you search for attributes like 'name' (a fixed argument
# to find_all) and 'class' (a reserved word in Python.)
tree = self.soup("""
Name match.Class match.Non-match.A tag called 'name1'.
""")
# This doesn't do what you want.
self.assertSelects(tree.find_all(name='name1'),
["A tag called 'name1'."])
# This does what you want.
self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
["Name match."])
self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
["Class match."])
def test_find_all_by_class(self):
tree = self.soup("""
Class 1.Class 2.Class 1.Class 3 and 4.
""")
# Passing in the class_ keyword argument will search against
# the 'class' attribute.
self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])
# Passing in a string to 'attrs' will also search the CSS class.
self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])
def test_find_by_class_when_multiple_classes_present(self):
tree = self.soup("Found it")
f = tree.find_all("gar", class_=re.compile("o"))
self.assertSelects(f, ["Found it"])
f = tree.find_all("gar", class_=re.compile("a"))
self.assertSelects(f, ["Found it"])
# Since the class is not the string "foo bar", but the two
# strings "/code3/bs4-save/tests/foo" and "bar", this will not find anything.
f = tree.find_all("gar", class_=re.compile("o b"))
self.assertSelects(f, [])
def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
soup = self.soup("Found it")
self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])
def big_attribute_value(value):
return len(value) > 3
self.assertSelects(soup.find_all("a", big_attribute_value), [])
def small_attribute_value(value):
return len(value) <= 3
self.assertSelects(
soup.find_all("a", small_attribute_value), ["Found it"])
def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
soup = self.soup('')
a, a2 = soup.find_all("a")
self.assertEqual([a, a2], soup.find_all("a", "/code3/bs4-save/tests/foo"))
self.assertEqual([a], soup.find_all("a", "bar"))
# If you specify the class as a string that contains a
# space, only that specific value will be found.
self.assertEqual([a], soup.find_all("a", class_="foo bar"))
self.assertEqual([a], soup.find_all("a", "foo bar"))
self.assertEqual([], soup.find_all("a", "bar foo"))
def test_find_all_by_attribute_soupstrainer(self):
tree = self.soup("""
Match.Non-match.""")
strainer = SoupStrainer(attrs={'id' : 'first'})
self.assertSelects(tree.find_all(strainer), ['Match.'])
def test_find_all_with_missing_atribute(self):
# You can pass in None as the value of an attribute to find_all.
# This will match tags that do not have that attribute set.
tree = self.soup("""ID present.No ID present.ID is empty.""")
self.assertSelects(tree.find_all('a', id=None), ["No ID present."])
def test_find_all_with_defined_attribute(self):
# You can pass in None as the value of an attribute to find_all.
# This will match tags that have that attribute set to any value.
tree = self.soup("""ID present.No ID present.ID is empty.""")
self.assertSelects(
tree.find_all(id=True), ["ID present.", "ID is empty."])
def test_find_all_with_numeric_attribute(self):
# If you search for a number, it's treated as a string.
tree = self.soup("""Unquoted attribute.Quoted attribute.""")
expected = ["Unquoted attribute.", "Quoted attribute."]
self.assertSelects(tree.find_all(id=1), expected)
self.assertSelects(tree.find_all(id="1"), expected)
def test_find_all_with_list_attribute_values(self):
# You can pass a list of attribute values instead of just one,
# and you'll get tags that match any of the values.
tree = self.soup("""123No ID.""")
self.assertSelects(tree.find_all(id=["1", "3", "4"]),
["1", "3"])
def test_find_all_with_regular_expression_attribute_value(self):
# You can pass a regular expression as an attribute value, and
# you'll get tags whose values for that attribute match the
# regular expression.
tree = self.soup("""One a.Two as.Mixed as and bs.One b.No ID.""")
self.assertSelects(tree.find_all(id=re.compile("^a+$")),
["One a.", "Two as."])
def test_find_by_name_and_containing_string(self):
soup = self.soup("foobarfoo")
a = soup.a
self.assertEqual([a], soup.find_all("a", text="/code3/bs4-save/tests/foo"))
self.assertEqual([], soup.find_all("a", text="bar"))
self.assertEqual([], soup.find_all("a", text="bar"))
def test_find_by_name_and_containing_string_when_string_is_buried(self):
soup = self.soup("foofoo")
self.assertEqual(soup.find_all("a"), soup.find_all("a", text="/code3/bs4-save/tests/foo"))
def test_find_by_attribute_and_containing_string(self):
soup = self.soup('foofoo')
a = soup.a
self.assertEqual([a], soup.find_all(id=2, text="/code3/bs4-save/tests/foo"))
self.assertEqual([], soup.find_all(id=1, text="bar"))
class TestIndex(TreeTest):
"""Test Tag.index"""
def test_index(self):
tree = self.soup("""
IdenticalNot identicalIdenticalIdentical with childAlso not identicalIdentical with child
""")
div = tree.div
for i, element in enumerate(div.contents):
self.assertEqual(i, div.index(element))
self.assertRaises(ValueError, tree.index, 1)
class TestParentOperations(TreeTest):
"""Test navigation and searching through an element's parents."""
def setUp(self):
super(TestParentOperations, self).setUp()
self.tree = self.soup('''
Start here
''')
self.start = self.tree.b
def test_parent(self):
self.assertEqual(self.start.parent['id'], 'bottom')
self.assertEqual(self.start.parent.parent['id'], 'middle')
self.assertEqual(self.start.parent.parent.parent['id'], 'top')
def test_parent_of_top_tag_is_soup_object(self):
top_tag = self.tree.contents[0]
self.assertEqual(top_tag.parent, self.tree)
def test_soup_object_has_no_parent(self):
self.assertEqual(None, self.tree.parent)
def test_find_parents(self):
self.assertSelectsIDs(
self.start.find_parents('ul'), ['bottom', 'middle', 'top'])
self.assertSelectsIDs(
self.start.find_parents('ul', id="middle"), ['middle'])
def test_find_parent(self):
self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')
def test_parent_of_text_element(self):
text = self.tree.find(text="Start here")
self.assertEqual(text.parent.name, 'b')
def test_text_element_find_parent(self):
text = self.tree.find(text="Start here")
self.assertEqual(text.find_parent('ul')['id'], 'bottom')
def test_parent_generator(self):
parents = [parent['id'] for parent in self.start.parents
if parent is not None and 'id' in parent.attrs]
self.assertEqual(parents, ['bottom', 'middle', 'top'])
class ProximityTest(TreeTest):
def setUp(self):
super(TreeTest, self).setUp()
self.tree = self.soup(
'
OneTwoThree')
class TestNextOperations(ProximityTest):
def setUp(self):
super(TestNextOperations, self).setUp()
self.start = self.tree.b
def test_next(self):
self.assertEqual(self.start.next_element, "One")
self.assertEqual(self.start.next_element.next_element['id'], "2")
def test_next_of_last_item_is_none(self):
last = self.tree.find(text="Three")
self.assertEqual(last.next_element, None)
def test_next_of_root_is_none(self):
# The document root is outside the next/previous chain.
self.assertEqual(self.tree.next_element, None)
def test_find_all_next(self):
self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
self.start.find_all_next(id=3)
self.assertSelects(self.start.find_all_next(id=3), ["Three"])
def test_find_next(self):
self.assertEqual(self.start.find_next('b')['id'], '2')
self.assertEqual(self.start.find_next(text="Three"), "Three")
def test_find_next_for_text_element(self):
text = self.tree.find(text="One")
self.assertEqual(text.find_next("b").string, "Two")
self.assertSelects(text.find_all_next("b"), ["Two", "Three"])
def test_next_generator(self):
start = self.tree.find(text="Two")
successors = [node for node in start.next_elements]
# There are two successors: the final tag and its text contents.
tag, contents = successors
self.assertEqual(tag['id'], '3')
self.assertEqual(contents, "Three")
class TestPreviousOperations(ProximityTest):
def setUp(self):
super(TestPreviousOperations, self).setUp()
self.end = self.tree.find(text="Three")
def test_previous(self):
self.assertEqual(self.end.previous_element['id'], "3")
self.assertEqual(self.end.previous_element.previous_element, "Two")
def test_previous_of_first_item_is_none(self):
first = self.tree.find('html')
self.assertEqual(first.previous_element, None)
def test_previous_of_root_is_none(self):
# The document root is outside the next/previous chain.
# XXX This is broken!
#self.assertEqual(self.tree.previous_element, None)
pass
def test_find_all_previous(self):
# The tag containing the "Three" node is the predecessor
# of the "Three" node itself, which is why "Three" shows up
# here.
self.assertSelects(
self.end.find_all_previous('b'), ["Three", "Two", "One"])
self.assertSelects(self.end.find_all_previous(id=1), ["One"])
def test_find_previous(self):
self.assertEqual(self.end.find_previous('b')['id'], '3')
self.assertEqual(self.end.find_previous(text="One"), "One")
def test_find_previous_for_text_element(self):
text = self.tree.find(text="Three")
self.assertEqual(text.find_previous("b").string, "Three")
self.assertSelects(
text.find_all_previous("b"), ["Three", "Two", "One"])
def test_previous_generator(self):
start = self.tree.find(text="One")
predecessors = [node for node in start.previous_elements]
# There are four predecessors: the tag containing "One"
# the
Some content. More content.")
self.assertEqual(extracted.decode(), '
Nav crap
')
# The extracted tag is now an orphan.
self.assertEqual(len(soup.body.contents), 2)
self.assertEqual(extracted.parent, None)
self.assertEqual(extracted.previous_element, None)
self.assertEqual(extracted.next_element.next_element, None)
# The gap where the extracted tag used to be has been mended.
content_1 = soup.find(text="Some content. ")
content_2 = soup.find(text=" More content.")
self.assertEqual(content_1.next_element, content_2)
self.assertEqual(content_1.next_sibling, content_2)
self.assertEqual(content_2.previous_element, content_1)
self.assertEqual(content_2.previous_sibling, content_1)
def test_extract_distinguishes_between_identical_strings(self):
soup = self.soup("foobar")
foo_1 = soup.a.string
bar_1 = soup.b.string
foo_2 = soup.new_string("/code3/bs4-save/tests/foo")
bar_2 = soup.new_string("bar")
soup.a.append(foo_2)
soup.b.append(bar_2)
# Now there are two identical strings in the tag, and two
# in the tag. Let's remove the first "/code3/bs4-save/tests/foo" and the second
# "bar".
foo_1.extract()
bar_2.extract()
self.assertEqual(foo_2, soup.a.string)
self.assertEqual(bar_2, soup.b.string)
def test_extract_multiples_of_same_tag(self):
soup = self.soup("""
")
# clear using extract()
a = soup.a
soup.p.clear()
self.assertEqual(len(soup.p.contents), 0)
self.assertTrue(hasattr(a, "contents"))
# clear using decompose()
em = a.em
a.clear(decompose=True)
self.assertEqual(0, len(em.contents))
def test_string_set(self):
"""Tag.string = 'string'"""
soup = self.soup("")
soup.a.string = "/code3/bs4-save/tests/foo"
self.assertEqual(soup.a.contents, ["/code3/bs4-save/tests/foo"])
soup.b.string = "bar"
self.assertEqual(soup.b.contents, ["bar"])
def test_string_set_does_not_affect_original_string(self):
soup = self.soup("foobar")
soup.b.string = soup.c.string
self.assertEqual(soup.a.encode(), b"barbar")
def test_set_string_preserves_class_of_string(self):
soup = self.soup("")
cdata = CData("/code3/bs4-save/tests/foo")
soup.a.string = cdata
self.assertTrue(isinstance(soup.a.string, CData))
class TestElementObjects(SoupTest):
"""Test various features of element objects."""
def test_len(self):
"""The length of an element is its number of children."""
soup = self.soup("123")
# The BeautifulSoup object itself contains one element: the
# tag.
self.assertEqual(len(soup.contents), 1)
self.assertEqual(len(soup), 1)
# The tag contains three elements: the text node "1", the
# tag, and the text node "3".
self.assertEqual(len(soup.top), 3)
self.assertEqual(len(soup.top.contents), 3)
def test_member_access_invokes_find(self):
"""Accessing a Python member .foo invokes find('foo')"""
soup = self.soup('')
self.assertEqual(soup.b, soup.find('b'))
self.assertEqual(soup.b.i, soup.find('b').find('i'))
self.assertEqual(soup.a, None)
def test_deprecated_member_access(self):
soup = self.soup('')
with warnings.catch_warnings(record=True) as w:
tag = soup.bTag
self.assertEqual(soup.b, tag)
self.assertEqual(
'.bTag is deprecated, use .find("b") instead.',
str(w[0].message))
def test_has_attr(self):
"""has_attr() checks for the presence of an attribute.
Please note note: has_attr() is different from
__in__. has_attr() checks the tag's attributes and __in__
checks the tag's chidlren.
"""
soup = self.soup("")
self.assertTrue(soup.foo.has_attr('attr'))
self.assertFalse(soup.foo.has_attr('attr2'))
def test_attributes_come_out_in_alphabetical_order(self):
markup = ''
self.assertSoupEquals(markup, '')
def test_string(self):
# A tag that contains only a text node makes that node
# available as .string.
soup = self.soup("foo")
self.assertEqual(soup.b.string, 'foo')
def test_empty_tag_has_no_string(self):
# A tag with no children has no .stirng.
soup = self.soup("")
self.assertEqual(soup.b.string, None)
def test_tag_with_multiple_children_has_no_string(self):
# A tag with no children has no .string.
soup = self.soup("foo")
self.assertEqual(soup.b.string, None)
soup = self.soup("foobar