blob: 6d3e67f3110038eae32cab7f1995d275d8d057e8 [file] [log] [blame]
# -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods.

The tree traversal methods are the main advantage of using Beautiful
Soup over just using a parser.

Different parsers will build different Beautiful Soup trees given the
same markup, but all Beautiful Soup trees can be traversed with the
methods tested here.
"""
11
12from pdb import set_trace
13import copy
14import pickle
15import re
16import warnings
17from bs4 import BeautifulSoup
18from bs4.builder import (
19 builder_registry,
20 HTMLParserTreeBuilder,
21)
22from bs4.element import (
23 PY3K,
24 CData,
25 Comment,
26 Declaration,
27 Doctype,
28 NavigableString,
29 SoupStrainer,
30 Tag,
31)
32from bs4.testing import (
33 SoupTest,
34 skipIf,
35)
36
# Each flag is True when builder_registry.lookup() returns something other
# than None for the named feature.
XML_BUILDER_PRESENT = builder_registry.lookup("xml") is not None
LXML_PRESENT = builder_registry.lookup("lxml") is not None
39
class TreeTest(SoupTest):
    """Shared assertion helpers for the tree-traversal test cases."""

    def assertSelects(self, tags, should_match):
        """Assert that the tags' ``.string`` values equal ``should_match``.

        Meant for tests that build several tags, each wrapping a single
        string, and then pick some of them out by some mechanism.
        """
        selected_strings = [tag.string for tag in tags]
        self.assertEqual(selected_strings, should_match)

    def assertSelectsIDs(self, tags, should_match):
        """Assert that the tags' ``id`` attributes equal ``should_match``.

        Meant for tests that build several tags carrying an ``id`` and
        then pick some of them out by some mechanism.
        """
        selected_ids = [tag['id'] for tag in tags]
        self.assertEqual(selected_ids, should_match)
59
60
class TestFind(TreeTest):
    """Basic tests of the find() method.

    find() just calls find_all() with limit=1, so it's not tested all
    that thoroughly here.
    """

    def test_find_tag(self):
        """find() returns the first tag with the requested name."""
        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
        first_b = soup.find("b")
        self.assertEqual(first_b.string, "2")

    def test_unicode_text_find(self):
        """A non-ASCII string can be located by its text."""
        soup = self.soup('<h1>Räksmörgås</h1>')
        found = soup.find(string='Räksmörgås')
        self.assertEqual(found, 'Räksmörgås')

    def test_unicode_attribute_find(self):
        """A tag can be located by a non-ASCII attribute value."""
        soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
        # The rendered string is discarded; presumably this only checks
        # that rendering the document does not raise.
        str(soup)
        heading = soup.find(id='Räksmörgås')
        self.assertEqual("here it is", heading.text)

    def test_find_everything(self):
        """Test an optimization that finds all tags."""
        soup = self.soup("<a>foo</a><b>bar</b>")
        all_tags = soup.find_all()
        self.assertEqual(2, len(all_tags))

    def test_find_everything_with_name(self):
        """Test an optimization that finds all tags with a given name."""
        soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
        a_tags = soup.find_all('a')
        self.assertEqual(2, len(a_tags))
91
class TestFindAll(TreeTest):
    """Basic tests of the find_all() method."""

    def test_find_all_text_nodes(self):
        """You can search the tree for text nodes."""
        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        every_string = ["Foo", "bar", '\xbb']

        # Exact match, through both the 'string' and 'text' keywords.
        self.assertEqual(soup.find_all(string="bar"), ["bar"])
        self.assertEqual(soup.find_all(text="bar"), ["bar"])

        # A list matches any of the strings it holds.
        either = soup.find_all(text=["Foo", "bar"])
        self.assertEqual(either, ["Foo", "bar"])

        # A regular expression.
        by_regex = soup.find_all(text=re.compile('.*'))
        self.assertEqual(by_regex, every_string)

        # True matches anything.
        by_true = soup.find_all(text=True)
        self.assertEqual(by_true, every_string)

    def test_find_all_limit(self):
        """You can limit the number of items returned by find_all."""
        soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
        everything = ["1", "2", "3", "4", "5"]

        self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
        self.assertSelects(soup.find_all('a', limit=1), ["1"])
        self.assertSelects(soup.find_all('a', limit=10), everything)

        # A limit of 0 means no limit.
        self.assertSelects(soup.find_all('a', limit=0), everything)

    def test_calling_a_tag_is_calling_findall(self):
        """Calling the soup or a tag gives the same results as find_all()."""
        soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
        from_soup = soup('a', limit=1)
        self.assertSelects(from_soup, ["1"])
        from_b = soup.b(id="foo")
        self.assertSelects(from_b, ["3"])

    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
        """Searching with a list that contains itself finds nothing."""
        soup = self.soup("<a></a>")

        # Build a list whose only element is the list itself.
        cyclic = []
        cyclic.append(cyclic)

        # Without special code in _normalize_search_value, this would cause
        # infinite recursion.
        self.assertEqual([], soup.find_all(cyclic))

    def test_find_all_resultset(self):
        """All find_all calls return a ResultSet"""
        soup = self.soup("<a></a>")
        queries = [
            (("a",), {}),
            ((True,), {}),
            ((), {"text": "foo"}),
        ]
        # Every result is expected to expose a 'source' attribute.
        for args, kwargs in queries:
            result = soup.find_all(*args, **kwargs)
            self.assertTrue(hasattr(result, "source"))
149
150
class TestFindAllBasicNamespaces(TreeTest):
    """Searches involving prefixed tag and attribute names."""

    def test_find_by_namespaced_name(self):
        """Prefixed tag names and attribute names can both be searched."""
        soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')

        sqrt_tag = soup.find("mathml:msqrt")
        self.assertEqual("4", sqrt_tag.string)

        filled = soup.find(attrs={"svg:fill": "red"})
        self.assertEqual("a", filled.name)
157
158
class TestFindAllByName(TreeTest):
    """Test ways of finding tags by tag name."""

    def setUp(self):
        """Parse a small document shared by the tests in this class."""
        # Name this class (not TreeTest) so the full MRO is honoured.
        super(TestFindAllByName, self).setUp()
        self.tree = self.soup("""<a>First tag.</a>
 <b>Second tag.</b>
 <c>Third <a>Nested tag.</a> tag.</c>""")

    def test_find_all_by_tag_name(self):
        """A plain string selects every tag with that name."""
        # Find all the <a> tags.
        self.assertSelects(
            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_name_and_text(self):
        """A tag name can be combined with a text criterion."""
        self.assertSelects(
            self.tree.find_all('a', text='First tag.'), ['First tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=re.compile("tag")),
            ['First tag.', 'Nested tag.'])

    def test_find_all_on_non_root_element(self):
        """find_all() only looks beneath the node it is called on."""
        # You can call find_all on any node, not just the root.
        self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])

    def test_calling_element_invokes_find_all(self):
        """Calling the tree gives the same result as find_all()."""
        self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_strainer(self):
        """A SoupStrainer can be used as the name criterion."""
        self.assertSelects(
            self.tree.find_all(SoupStrainer('a')),
            ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_names(self):
        """A list of names matches tags with any of those names."""
        self.assertSelects(
            self.tree.find_all(['a', 'b']),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_dict(self):
        """A dict of names matches tags named by any of its keys."""
        self.assertSelects(
            self.tree.find_all({'a': True, 'b': True}),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_re(self):
        """A compiled regular expression is matched against tag names."""
        self.assertSelects(
            self.tree.find_all(re.compile('^[ab]$')),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_with_tags_matching_method(self):
        """A callable can decide which tags match."""
        # You can define an oracle method that determines whether
        # a tag matches the search.
        def id_matches_name(tag):
            return tag.name == tag.get('id')

        tree = self.soup("""<a id="a">Match 1.</a>
 <a id="1">Does not match.</a>
 <b id="b">Match 2.</a>""")

        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
224
225
class TestFindAllByAttribute(TreeTest):
    """Test ways of finding tags by attribute value."""

    def test_find_all_by_attribute_name(self):
        """Keyword arguments to find_all() search by attribute."""
        # You can pass in keyword arguments to find_all to search by
        # attribute.
        tree = self.soup("""
 <a id="first">Matching a.</a>
 <a id="second">
 Non-matching <b id="first">Matching b.</b>a.
 </a>""")
        self.assertSelects(tree.find_all(id='first'),
                           ["Matching a.", "Matching b."])

    def test_find_all_by_utf8_attribute_value(self):
        """An attribute value given as UTF-8 bytes or as text both match."""
        peace = "םולש".encode("utf8")
        data = '<a title="םולש"></a>'.encode("utf8")
        soup = self.soup(data)
        self.assertEqual([soup.a], soup.find_all(title=peace))
        self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
        self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))

    def test_find_all_by_attribute_dict(self):
        """The 'attrs' dict reaches attributes that keywords cannot."""
        # You can pass in a dictionary as the argument 'attrs'. This
        # lets you search for attributes like 'name' (a fixed argument
        # to find_all) and 'class' (a reserved word in Python.)
        tree = self.soup("""
 <a name="name1" class="class1">Name match.</a>
 <a name="name2" class="class2">Class match.</a>
 <a name="name3" class="class3">Non-match.</a>
 <name1>A tag called 'name1'.</name1>
 """)

        # This doesn't do what you want.
        self.assertSelects(tree.find_all(name='name1'),
                           ["A tag called 'name1'."])
        # This does what you want.
        self.assertSelects(tree.find_all(attrs={'name': 'name1'}),
                           ["Name match."])

        self.assertSelects(tree.find_all(attrs={'class': 'class2'}),
                           ["Class match."])

    def test_find_all_by_class(self):
        """CSS classes can be searched via class_ or a string 'attrs'."""
        tree = self.soup("""
 <a class="1">Class 1.</a>
 <a class="2">Class 2.</a>
 <b class="1">Class 1.</b>
 <c class="3 4">Class 3 and 4.</c>
 """)

        # Passing in the class_ keyword argument will search against
        # the 'class' attribute.
        self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
        self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])

        # Passing in a string to 'attrs' will also search the CSS class.
        self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
        self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
        self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])

    def test_find_by_class_when_multiple_classes_present(self):
        """A regex is matched against each class separately."""
        tree = self.soup("<gar class='foo bar'>Found it</gar>")

        f = tree.find_all("gar", class_=re.compile("o"))
        self.assertSelects(f, ["Found it"])

        f = tree.find_all("gar", class_=re.compile("a"))
        self.assertSelects(f, ["Found it"])

        # Since the class is not the string "foo bar", but the two
        # strings "foo" and "bar", this will not find anything.
        f = tree.find_all("gar", class_=re.compile("o b"))
        self.assertSelects(f, [])

    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
        """A regex or callable passed as 'attrs' is applied to the class."""
        soup = self.soup("<a class='bar'>Found it</a>")

        self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])

        def big_attribute_value(value):
            return len(value) > 3

        self.assertSelects(soup.find_all("a", big_attribute_value), [])

        def small_attribute_value(value):
            return len(value) <= 3

        self.assertSelects(
            soup.find_all("a", small_attribute_value), ["Found it"])

    def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
        """A string 'attrs' matches a single class or the exact value."""
        soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
        a, a2 = soup.find_all("a")
        self.assertEqual([a, a2], soup.find_all("a", "foo"))
        self.assertEqual([a], soup.find_all("a", "bar"))

        # If you specify the class as a string that contains a
        # space, only that specific value will be found.
        self.assertEqual([a], soup.find_all("a", class_="foo bar"))
        self.assertEqual([a], soup.find_all("a", "foo bar"))
        self.assertEqual([], soup.find_all("a", "bar foo"))

    def test_find_all_by_attribute_soupstrainer(self):
        """A SoupStrainer built from 'attrs' can drive find_all()."""
        tree = self.soup("""
 <a id="first">Match.</a>
 <a id="second">Non-match.</a>""")

        strainer = SoupStrainer(attrs={'id': 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])

    def test_find_all_with_missing_atribute(self):
        """None as an attribute value matches tags lacking the attribute."""
        # You can pass in None as the value of an attribute to find_all.
        # This will match tags that do not have that attribute set.
        tree = self.soup("""<a id="1">ID present.</a>
 <a>No ID present.</a>
 <a id="">ID is empty.</a>""")
        self.assertSelects(tree.find_all('a', id=None), ["No ID present."])

    def test_find_all_with_defined_attribute(self):
        """True as an attribute value matches any tag with the attribute."""
        # You can pass in True as the value of an attribute to find_all.
        # This will match tags that have that attribute set to any value,
        # including the empty string.
        tree = self.soup("""<a id="1">ID present.</a>
 <a>No ID present.</a>
 <a id="">ID is empty.</a>""")
        self.assertSelects(
            tree.find_all(id=True), ["ID present.", "ID is empty."])

    def test_find_all_with_numeric_attribute(self):
        """A numeric search value behaves like its string form."""
        # If you search for a number, it's treated as a string.
        tree = self.soup("""<a id=1>Unquoted attribute.</a>
 <a id="1">Quoted attribute.</a>""")

        expected = ["Unquoted attribute.", "Quoted attribute."]
        self.assertSelects(tree.find_all(id=1), expected)
        self.assertSelects(tree.find_all(id="1"), expected)

    def test_find_all_with_list_attribute_values(self):
        """A list of values matches tags having any one of them."""
        # You can pass a list of attribute values instead of just one,
        # and you'll get tags that match any of the values.
        tree = self.soup("""<a id="1">1</a>
 <a id="2">2</a>
 <a id="3">3</a>
 <a>No ID.</a>""")
        self.assertSelects(tree.find_all(id=["1", "3", "4"]),
                           ["1", "3"])

    def test_find_all_with_regular_expression_attribute_value(self):
        """A regex attribute value is matched against each tag's value."""
        # You can pass a regular expression as an attribute value, and
        # you'll get tags whose values for that attribute match the
        # regular expression.
        tree = self.soup("""<a id="a">One a.</a>
 <a id="aa">Two as.</a>
 <a id="ab">Mixed as and bs.</a>
 <a id="b">One b.</a>
 <a>No ID.</a>""")

        self.assertSelects(tree.find_all(id=re.compile("^a+$")),
                           ["One a.", "Two as."])

    def test_find_by_name_and_containing_string(self):
        """Name and text criteria must both be satisfied."""
        soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
        a = soup.a

        self.assertEqual([a], soup.find_all("a", text="foo"))
        # "bar" only occurs inside a <b> tag, so no <a> tag matches.
        self.assertEqual([], soup.find_all("a", text="bar"))

    def test_find_by_name_and_containing_string_when_string_is_buried(self):
        """The text criterion also matches a string nested deeper down."""
        soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
        self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))

    def test_find_by_attribute_and_containing_string(self):
        """Attribute and text criteria must both be satisfied."""
        soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
        a = soup.a

        self.assertEqual([a], soup.find_all(id=2, text="foo"))
        self.assertEqual([], soup.find_all(id=1, text="bar"))
405
406
407
408
class TestIndex(TreeTest):
    """Test Tag.index"""

    def test_index(self):
        """index() gives each child's own position, even among equal siblings."""
        tree = self.soup("""<div>
 <a>Identical</a>
 <b>Not identical</b>
 <a>Identical</a>

 <c><d>Identical with child</d></c>
 <b>Also not identical</b>
 <c><d>Identical with child</d></c>
 </div>""")
        container = tree.div
        for position, child in enumerate(container.contents):
            self.assertEqual(position, container.index(child))

        # Something that is not a child at all raises ValueError.
        self.assertRaises(ValueError, tree.index, 1)
425
426
class TestParentOperations(TreeTest):
    """Test navigation and searching through an element's parents."""

    def setUp(self):
        """Parse nested <ul> tags and start from the innermost <b> tag."""
        super(TestParentOperations, self).setUp()
        markup = '''<ul id="empty"></ul>
 <ul id="top">
 <ul id="middle">
 <ul id="bottom">
 <b>Start here</b>
 </ul>
 </ul>'''
        self.tree = self.soup(markup)
        self.start = self.tree.b

    def test_parent(self):
        """.parent climbs one level at a time."""
        bottom = self.start.parent
        self.assertEqual(bottom['id'], 'bottom')
        middle = bottom.parent
        self.assertEqual(middle['id'], 'middle')
        top = middle.parent
        self.assertEqual(top['id'], 'top')

    def test_parent_of_top_tag_is_soup_object(self):
        """The first top-level node's parent is the soup itself."""
        first_node = self.tree.contents[0]
        self.assertEqual(first_node.parent, self.tree)

    def test_soup_object_has_no_parent(self):
        """The soup object's parent is None."""
        self.assertEqual(None, self.tree.parent)

    def test_find_parents(self):
        """find_parents() lists matching ancestors, nearest first."""
        all_lists = self.start.find_parents('ul')
        self.assertSelectsIDs(all_lists, ['bottom', 'middle', 'top'])

        only_middle = self.start.find_parents('ul', id="middle")
        self.assertSelectsIDs(only_middle, ['middle'])

    def test_find_parent(self):
        """find_parent() returns the nearest matching ancestor."""
        nearest = self.start.find_parent('ul')
        self.assertEqual(nearest['id'], 'bottom')

        top = self.start.find_parent('ul', id='top')
        self.assertEqual(top['id'], 'top')

    def test_parent_of_text_element(self):
        """A text node's parent is the tag that contains it."""
        text_node = self.tree.find(text="Start here")
        self.assertEqual(text_node.parent.name, 'b')

    def test_text_element_find_parent(self):
        """find_parent() also works when called on a text node."""
        text_node = self.tree.find(text="Start here")
        nearest_list = text_node.find_parent('ul')
        self.assertEqual(nearest_list['id'], 'bottom')

    def test_parent_generator(self):
        """.parents walks upward through every ancestor."""
        ancestor_ids = []
        for ancestor in self.start.parents:
            # Only ancestors that carry an id attribute are recorded.
            if ancestor is not None and 'id' in ancestor.attrs:
                ancestor_ids.append(ancestor['id'])
        self.assertEqual(ancestor_ids, ['bottom', 'middle', 'top'])
476
477
class ProximityTest(TreeTest):
    """Base class providing a small document for next/previous tests."""

    def setUp(self):
        """Parse a document with three sibling <b> tags inside <body>."""
        # Name this class (not TreeTest) so the full MRO is honoured.
        super(ProximityTest, self).setUp()
        self.tree = self.soup(
            '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')
484
485
class TestNextOperations(ProximityTest):
    """Test forward navigation through the parsed document."""

    def setUp(self):
        """Start from the first <b> tag in the shared document."""
        super(TestNextOperations, self).setUp()
        self.start = self.tree.b

    def test_next(self):
        """.next_element steps into a tag's contents before moving on."""
        self.assertEqual(self.start.next_element, "One")
        self.assertEqual(self.start.next_element.next_element['id'], "2")

    def test_next_of_last_item_is_none(self):
        """The final text node has no next element."""
        last = self.tree.find(text="Three")
        self.assertEqual(last.next_element, None)

    def test_next_of_root_is_none(self):
        """The soup object itself has no next element."""
        # The document root is outside the next/previous chain.
        self.assertEqual(self.tree.next_element, None)

    def test_find_all_next(self):
        """find_all_next() searches everything after the starting node."""
        self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
        self.assertSelects(self.start.find_all_next(id=3), ["Three"])

    def test_find_next(self):
        """find_next() returns the first later match, tag or string."""
        self.assertEqual(self.start.find_next('b')['id'], '2')
        self.assertEqual(self.start.find_next(text="Three"), "Three")

    def test_find_next_for_text_element(self):
        """The find_next family also works from a text node."""
        text = self.tree.find(text="One")
        self.assertEqual(text.find_next("b").string, "Two")
        self.assertSelects(text.find_all_next("b"), ["Two", "Three"])

    def test_next_generator(self):
        """.next_elements yields every node after the starting one."""
        start = self.tree.find(text="Two")
        successors = list(start.next_elements)
        # There are two successors: the final <b> tag and its text contents.
        tag, contents = successors
        self.assertEqual(tag['id'], '3')
        self.assertEqual(contents, "Three")
525
class TestPreviousOperations(ProximityTest):
    """Test backward navigation through the parsed document."""

    def setUp(self):
        """Start from the final text node in the shared document."""
        super(TestPreviousOperations, self).setUp()
        self.end = self.tree.find(text="Three")

    def test_previous(self):
        """.previous_element steps back through tags and strings."""
        one_back = self.end.previous_element
        self.assertEqual(one_back['id'], "3")
        two_back = self.end.previous_element.previous_element
        self.assertEqual(two_back, "Two")

    def test_previous_of_first_item_is_none(self):
        """The <html> tag has no previous element."""
        html_tag = self.tree.find('html')
        self.assertEqual(html_tag.previous_element, None)

    def test_previous_of_root_is_none(self):
        """Placeholder: the real assertion is disabled (see below)."""
        # The document root is outside the next/previous chain.
        # XXX This is broken!
        #self.assertEqual(self.tree.previous_element, None)
        pass

    def test_find_all_previous(self):
        """find_all_previous() searches everything before the start."""
        # The <b> tag containing the "Three" node is the predecessor
        # of the "Three" node itself, which is why "Three" shows up
        # here.
        earlier_bs = self.end.find_all_previous('b')
        self.assertSelects(earlier_bs, ["Three", "Two", "One"])

        by_id = self.end.find_all_previous(id=1)
        self.assertSelects(by_id, ["One"])

    def test_find_previous(self):
        """find_previous() returns the nearest earlier match."""
        nearest_b = self.end.find_previous('b')
        self.assertEqual(nearest_b['id'], '3')

        earlier_text = self.end.find_previous(text="One")
        self.assertEqual(earlier_text, "One")

    def test_find_previous_for_text_element(self):
        """The find_previous family also works from a text node."""
        text_node = self.tree.find(text="Three")
        self.assertEqual(text_node.find_previous("b").string, "Three")

        earlier_bs = text_node.find_all_previous("b")
        self.assertSelects(earlier_bs, ["Three", "Two", "One"])

    def test_previous_generator(self):
        """.previous_elements yields every node before the starting one."""
        first_text = self.tree.find(text="One")

        # There are four predecessors: the <b> tag containing "One"
        # the <body> tag, the <head> tag, and the <html> tag.
        b_tag, body_tag, head_tag, html_tag = list(
            first_text.previous_elements)

        self.assertEqual(b_tag['id'], '1')
        self.assertEqual(body_tag.name, "body")
        self.assertEqual(head_tag.name, "head")
        self.assertEqual(html_tag.name, "html")
575
576
class SiblingTest(TreeTest):
    """Base class providing a document of nested <span> tags."""

    def setUp(self):
        """Parse four sibling <span> tags, three with a nested <span>."""
        super(SiblingTest, self).setUp()
        markup = '''<html>
 <span id="1">
 <span id="1.1"></span>
 </span>
 <span id="2">
 <span id="2.1"></span>
 </span>
 <span id="3">
 <span id="3.1"></span>
 </span>
 <span id="4"></span>
 </html>'''
        # All that whitespace looks good but makes the tests more
        # difficult. Get rid of it. The pattern is a raw string so that
        # "\s" is not an invalid string escape.
        markup = re.compile(r"\n\s*").sub("", markup)
        self.tree = self.soup(markup)
597
598
class TestNextSibling(SiblingTest):
    """Test forward navigation among siblings."""

    def setUp(self):
        """Start from the <span> whose id is "1"."""
        super(TestNextSibling, self).setUp()
        self.start = self.tree.find(id="1")

    def test_next_sibling_of_root_is_none(self):
        """The soup object has no next sibling."""
        self.assertEqual(self.tree.next_sibling, None)

    def test_next_sibling(self):
        """.next_sibling stays on the same level of the tree."""
        second = self.start.next_sibling
        self.assertEqual(second['id'], '2')
        third = self.start.next_sibling.next_sibling
        self.assertEqual(third['id'], '3')

        # Note the difference between next_sibling and next_element.
        self.assertEqual(self.start.next_element['id'], '1.1')

    def test_next_sibling_may_not_exist(self):
        """.next_sibling is None for the last node at any level."""
        self.assertEqual(self.tree.html.next_sibling, None)

        for span_id in ("1.1", "4"):
            span = self.tree.find(id=span_id)
            self.assertEqual(span.next_sibling, None)

    def test_find_next_sibling(self):
        """find_next_sibling() returns the closest matching sibling."""
        following = self.start.find_next_sibling('span')
        self.assertEqual(following['id'], '2')

    def test_next_siblings(self):
        """find_next_siblings() returns all later matching siblings."""
        later_spans = self.start.find_next_siblings("span")
        self.assertSelectsIDs(later_spans, ['2', '3', '4'])

        only_third = self.start.find_next_siblings(id='3')
        self.assertSelectsIDs(only_third, ['3'])

    def test_next_sibling_for_text_element(self):
        """Sibling navigation also works from a text node."""
        soup = self.soup("Foo<b>bar</b>baz")
        foo = soup.find(text="Foo")

        bold = foo.next_sibling
        self.assertEqual(bold.name, 'b')
        self.assertEqual(foo.next_sibling.next_sibling, 'baz')

        self.assertSelects(foo.find_next_siblings('b'), ['bar'])
        self.assertEqual(foo.find_next_sibling(text="baz"), "baz")
        self.assertEqual(foo.find_next_sibling(text="nonesuch"), None)
642
643
class TestPreviousSibling(SiblingTest):
    """Test backward navigation among siblings."""

    def setUp(self):
        """Start from the <span> whose id is "4"."""
        super(TestPreviousSibling, self).setUp()
        self.end = self.tree.find(id="4")

    def test_previous_sibling_of_root_is_none(self):
        """The soup object has no previous sibling."""
        self.assertEqual(self.tree.previous_sibling, None)

    def test_previous_sibling(self):
        """.previous_sibling stays on the same level of the tree."""
        third = self.end.previous_sibling
        self.assertEqual(third['id'], '3')
        second = self.end.previous_sibling.previous_sibling
        self.assertEqual(second['id'], '2')

        # Note the difference between previous_sibling and previous_element.
        self.assertEqual(self.end.previous_element['id'], '3.1')

    def test_previous_sibling_may_not_exist(self):
        """.previous_sibling is None for the first node at any level."""
        self.assertEqual(self.tree.html.previous_sibling, None)

        for span_id in ("1.1", "1"):
            span = self.tree.find(id=span_id)
            self.assertEqual(span.previous_sibling, None)

    def test_find_previous_sibling(self):
        """find_previous_sibling() returns the closest matching sibling."""
        preceding = self.end.find_previous_sibling('span')
        self.assertEqual(preceding['id'], '3')

    def test_previous_siblings(self):
        """find_previous_siblings() returns earlier siblings, nearest first."""
        earlier_spans = self.end.find_previous_siblings("span")
        self.assertSelectsIDs(earlier_spans, ['3', '2', '1'])

        only_first = self.end.find_previous_siblings(id='1')
        self.assertSelectsIDs(only_first, ['1'])

    def test_previous_sibling_for_text_element(self):
        """Sibling navigation also works from a text node."""
        soup = self.soup("Foo<b>bar</b>baz")
        baz = soup.find(text="baz")

        bold = baz.previous_sibling
        self.assertEqual(bold.name, 'b')
        self.assertEqual(baz.previous_sibling.previous_sibling, 'Foo')

        self.assertSelects(baz.find_previous_siblings('b'), ['bar'])
        self.assertEqual(baz.find_previous_sibling(text="Foo"), "Foo")
        self.assertEqual(baz.find_previous_sibling(text="nonesuch"), None)
687
688
class TestTagCreation(SoupTest):
    """Test the ability to create new tags."""

    def test_new_tag(self):
        """new_tag() builds a detached Tag with the given name and attrs."""
        empty_soup = self.soup("")
        created = empty_soup.new_tag("foo", bar="baz")

        self.assertTrue(isinstance(created, Tag))
        self.assertEqual("foo", created.name)
        self.assertEqual(dict(bar="baz"), created.attrs)
        self.assertEqual(None, created.parent)

    def test_tag_inherits_self_closing_rules_from_builder(self):
        """New tags are rendered according to the soup's builder."""
        if XML_BUILDER_PRESENT:
            xml_soup = BeautifulSoup("", "lxml-xml")
            br_from_xml = xml_soup.new_tag("br")
            p_from_xml = xml_soup.new_tag("p")

            # Both the <br> and <p> tag are empty-element, just because
            # they have no contents.
            self.assertEqual(b"<br/>", br_from_xml.encode())
            self.assertEqual(b"<p/>", p_from_xml.encode())

        html_soup = BeautifulSoup("", "html.parser")
        br_from_html = html_soup.new_tag("br")
        p_from_html = html_soup.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        self.assertEqual(b"<br/>", br_from_html.encode())
        self.assertEqual(b"<p></p>", p_from_html.encode())

    def test_new_string_creates_navigablestring(self):
        """new_string() returns a NavigableString by default."""
        empty_soup = self.soup("")
        created = empty_soup.new_string("foo")
        self.assertEqual("foo", created)
        self.assertTrue(isinstance(created, NavigableString))

    def test_new_string_can_create_navigablestring_subclass(self):
        """new_string() can build an instance of a given subclass."""
        empty_soup = self.soup("")
        created = empty_soup.new_string("foo", Comment)
        self.assertEqual("foo", created)
        self.assertTrue(isinstance(created, Comment))
730
731class TestTreeModification(SoupTest):
732
def test_attribute_modification(self):
    """Setting, deleting and adding attributes shows up in the output."""
    soup = self.soup('<a id="1"></a>')
    link = soup.a

    # Overwrite an existing attribute.
    link['id'] = 2
    self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>'))

    # Remove it again.
    del link['id']
    self.assertEqual(soup.decode(), self.document_for('<a></a>'))

    # Add a brand-new attribute.
    link['id2'] = 'foo'
    self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))
741
def test_new_tag_creation(self):
    """Tags built with the Tag constructor can be inserted into a tree."""
    builder = builder_registry.lookup('html')()
    soup = self.soup("<body></body>", builder=builder)

    link = Tag(soup, builder, 'a')
    ordered_list = Tag(soup, builder, 'ol')
    link['href'] = 'http://foo.com/'

    # Insert the new tags as the first and second children of <body>.
    for position, new_tag in enumerate((link, ordered_list)):
        soup.body.insert(position, new_tag)

    expected = b'<body><a href="http://foo.com/"></a><ol></ol></body>'
    self.assertEqual(soup.body.encode(), expected)
753
def test_append_to_contents_moves_tag(self):
    """Appending a tag that is already in the tree relocates it."""
    markup = """<p id="1">Don't leave me <b>here</b>.</p>
 <p id="2">Don\'t leave!</p>"""
    soup = self.soup(markup)
    target = soup.find(id='2')
    moved = soup.b

    # Move the <b> tag to the end of the second paragraph.
    target.append(moved)

    # The <b> tag is now a child of the second paragraph.
    self.assertEqual(moved.parent, target)

    expected = self.document_for(
        '<p id="1">Don\'t leave me .</p>\n'
        '<p id="2">Don\'t leave!<b>here</b></p>')
    self.assertEqual(soup.decode(), expected)
771
def test_replace_with_returns_thing_that_was_replaced(self):
    """replace_with() returns the element it was called on."""
    soup = self.soup("<a></a><b><c></c></b>")
    original = soup.a
    returned = original.replace_with(soup.c)
    self.assertEqual(original, returned)
778
def test_unwrap_returns_thing_that_was_replaced(self):
    """unwrap() returns the element it was called on."""
    soup = self.soup("<a><b></b><c></c></a>")
    original = soup.a
    returned = original.unwrap()
    self.assertEqual(original, returned)
785
def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
    """A detached tag raises ValueError from unwrap() and replace_with()."""
    soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
    detached = soup.a
    detached.extract()

    # After extract() the tag no longer has a parent.
    self.assertEqual(None, detached.parent)

    self.assertRaises(ValueError, detached.unwrap)
    self.assertRaises(ValueError, detached.replace_with, soup.c)
793
def test_replace_tag_with_itself(self):
    """Replacing a tag with itself leaves the document unchanged."""
    markup = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
    soup = self.soup(markup)
    same_tag = soup.c
    soup.c.replace_with(same_tag)
    self.assertEqual(soup.decode(), self.document_for(markup))
800
def test_replace_tag_with_its_parent_raises_exception(self):
    """Replacing a tag with its own parent raises ValueError."""
    soup = self.soup("<a><b></b></a>")
    child_replace = soup.b.replace_with
    self.assertRaises(ValueError, child_replace, soup.a)
805
def test_insert_tag_into_itself_raises_exception(self):
    """Inserting a tag into itself raises ValueError."""
    soup = self.soup("<a><b></b></a>")
    parent_insert = soup.a.insert
    self.assertRaises(ValueError, parent_insert, 0, soup.a)
810
def test_replace_with_maintains_next_element_throughout(self):
    """Replacing adjacent strings does not disconnect what follows them."""
    soup = self.soup('<p><a>one</a><b>three</b></p>')
    a = soup.a
    # Make it so the <a> tag has two text children.
    a.insert(1, "two")

    # Now replace each one with the empty string.
    left, right = a.contents
    left.replace_with('')
    right.replace_with('')

    # The <b> tag is still connected to the tree.
    self.assertEqual("three", soup.b.string)
825
def test_replace_final_node(self):
    """Replacing the document's last node keeps the links consistent."""
    soup = self.soup("<b>Argh!</b>")
    old_text = soup.find(text="Argh!")
    old_text.replace_with("Hooray!")

    replacement = soup.find(text="Hooray!")
    bold = soup.b

    # The new string hangs off <b> in both directions...
    self.assertEqual(replacement.previous_element, bold)
    self.assertEqual(replacement.parent, bold)
    self.assertEqual(replacement.previous_element.next_element, replacement)
    # ...and nothing comes after it.
    self.assertEqual(replacement.next_element, None)
835
def test_consecutive_text_nodes(self):
    """A string inserted right after another string is linked correctly."""
    # A builder should never create two consecutive text nodes,
    # but if you insert one next to another, Beautiful Soup will
    # handle it correctly.
    soup = self.soup("<a><b>Argh!</b><c></c></a>")
    soup.b.insert(1, "Hooray!")

    expected = self.document_for("<a><b>Argh!Hooray!</b><c></c></a>")
    self.assertEqual(soup.decode(), expected)

    inserted = soup.find(text="Hooray!")

    # Element chain, looking backward.
    self.assertEqual(inserted.previous_element, "Argh!")
    self.assertEqual(inserted.previous_element.next_element, inserted)

    # Sibling chain, looking backward.
    self.assertEqual(inserted.previous_sibling, "Argh!")
    self.assertEqual(inserted.previous_sibling.next_sibling, inserted)

    # Looking forward: no sibling, and the next element is <c>.
    self.assertEqual(inserted.next_sibling, None)
    self.assertEqual(inserted.next_element, soup.c)
856
def test_insert_string(self):
    """Plain strings inserted into a tag become linked children."""
    soup = self.soup("<a></a>")
    link = soup.a
    # Each insert goes to position 0, so "foo" ends up before "bar".
    for text in ("bar", "foo"):
        link.insert(0, text)

    # The strings were added to the tag.
    self.assertEqual(["foo", "bar"], soup.a.contents)
    # And they were converted to NavigableStrings.
    first_child = soup.a.contents[0]
    self.assertEqual(first_child.next_element, "bar")
865
def test_insert_tag(self):
    """A tag inserted between siblings is linked in every direction."""
    builder = self.default_builder
    markup = "<a><b>Find</b><c>lady!</c><d></d></a>"
    soup = self.soup(markup, builder=builder)

    inserted = Tag(soup, builder, 'magictag')
    inserted.insert(0, "the")
    soup.a.insert(1, inserted)

    expected = self.document_for(
        "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>")
    self.assertEqual(soup.decode(), expected)

    # Make sure all the relationships are hooked up correctly.
    before = soup.b
    self.assertEqual(before.next_sibling, inserted)
    self.assertEqual(inserted.previous_sibling, before)

    before_text = before.find(text="Find")
    self.assertEqual(before_text.next_element, inserted)
    self.assertEqual(inserted.previous_element, before_text)

    after = soup.c
    self.assertEqual(inserted.next_sibling, after)
    self.assertEqual(after.previous_sibling, inserted)

    inner_text = inserted.find(text="the")
    self.assertEqual(inner_text.parent, inserted)
    self.assertEqual(inner_text.next_element, after)
    self.assertEqual(after.previous_element, inner_text)
895
def test_append_child_thats_already_at_the_end(self):
    """Re-appending the last child leaves the document unchanged."""
    markup = "<a><b></b></a>"
    soup = self.soup(markup)
    last_child = soup.b
    soup.a.append(last_child)
    self.assertEqual(markup, soup.decode())
901
def test_move_tag_to_beginning_of_parent(self):
    """Inserting an existing child at index 0 moves it to the front."""
    soup = self.soup("<a><b></b><c></c><d></d></a>")
    last_child = soup.d
    soup.a.insert(0, last_child)
    self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())
907
def test_insert_works_on_empty_element_tag(self):
    """Content can be inserted into a tag parsed as empty-element."""
    # This is a little strange, since most HTML parsers don't allow
    # markup like this to come through. But in general, we don't
    # know what the parser would or wouldn't have allowed, so
    # I'm letting this succeed for now.
    soup = self.soup("<br/>")
    line_break = soup.br
    line_break.insert(1, "Contents")
    rendered = str(soup.br)
    self.assertEqual(rendered, "<br>Contents</br>")
916
917 def test_insert_before(self):
918 soup = self.soup("<a>foo</a><b>bar</b>")
919 soup.b.insert_before("BAZ")
920 soup.a.insert_before("QUUX")
921 self.assertEqual(
922 soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>"))
923
924 soup.a.insert_before(soup.b)
925 self.assertEqual(
926 soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
927
928 def test_insert_after(self):
929 soup = self.soup("<a>foo</a><b>bar</b>")
930 soup.b.insert_after("BAZ")
931 soup.a.insert_after("QUUX")
932 self.assertEqual(
933 soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ"))
934 soup.b.insert_after(soup.a)
935 self.assertEqual(
936 soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
937
938 def test_insert_after_raises_exception_if_after_has_no_meaning(self):
939 soup = self.soup("")
940 tag = soup.new_tag("a")
941 string = soup.new_string("")
942 self.assertRaises(ValueError, string.insert_after, tag)
943 self.assertRaises(NotImplementedError, soup.insert_after, tag)
944 self.assertRaises(ValueError, tag.insert_after, tag)
945
946 def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
947 soup = self.soup("")
948 tag = soup.new_tag("a")
949 string = soup.new_string("")
950 self.assertRaises(ValueError, string.insert_before, tag)
951 self.assertRaises(NotImplementedError, soup.insert_before, tag)
952 self.assertRaises(ValueError, tag.insert_before, tag)
953
954 def test_replace_with(self):
955 soup = self.soup(
956 "<p>There's <b>no</b> business like <b>show</b> business</p>")
957 no, show = soup.find_all('b')
958 show.replace_with(no)
959 self.assertEqual(
960 soup.decode(),
961 self.document_for(
962 "<p>There's business like <b>no</b> business</p>"))
963
964 self.assertEqual(show.parent, None)
965 self.assertEqual(no.parent, soup.p)
966 self.assertEqual(no.next_element, "no")
967 self.assertEqual(no.next_sibling, " business")
968
969 def test_replace_first_child(self):
970 data = "<a><b></b><c></c></a>"
971 soup = self.soup(data)
972 soup.b.replace_with(soup.c)
973 self.assertEqual("<a><c></c></a>", soup.decode())
974
975 def test_replace_last_child(self):
976 data = "<a><b></b><c></c></a>"
977 soup = self.soup(data)
978 soup.c.replace_with(soup.b)
979 self.assertEqual("<a><b></b></a>", soup.decode())
980
981 def test_nested_tag_replace_with(self):
982 soup = self.soup(
983 """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
984
985 # Replace the entire <b> tag and its contents ("reserve the
986 # right") with the <f> tag ("refuse").
987 remove_tag = soup.b
988 move_tag = soup.f
989 remove_tag.replace_with(move_tag)
990
991 self.assertEqual(
992 soup.decode(), self.document_for(
993 "<a>We<f>refuse</f></a><e>to<g>service</g></e>"))
994
995 # The <b> tag is now an orphan.
996 self.assertEqual(remove_tag.parent, None)
997 self.assertEqual(remove_tag.find(text="right").next_element, None)
998 self.assertEqual(remove_tag.previous_element, None)
999 self.assertEqual(remove_tag.next_sibling, None)
1000 self.assertEqual(remove_tag.previous_sibling, None)
1001
1002 # The <f> tag is now connected to the <a> tag.
1003 self.assertEqual(move_tag.parent, soup.a)
1004 self.assertEqual(move_tag.previous_element, "We")
1005 self.assertEqual(move_tag.next_element.next_element, soup.e)
1006 self.assertEqual(move_tag.next_sibling, None)
1007
1008 # The gap where the <f> tag used to be has been mended, and
1009 # the word "to" is now connected to the <g> tag.
1010 to_text = soup.find(text="to")
1011 g_tag = soup.g
1012 self.assertEqual(to_text.next_element, g_tag)
1013 self.assertEqual(to_text.next_sibling, g_tag)
1014 self.assertEqual(g_tag.previous_element, to_text)
1015 self.assertEqual(g_tag.previous_sibling, to_text)
1016
1017 def test_unwrap(self):
1018 tree = self.soup("""
1019 <p>Unneeded <em>formatting</em> is unneeded</p>
1020 """)
1021 tree.em.unwrap()
1022 self.assertEqual(tree.em, None)
1023 self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")
1024
1025 def test_wrap(self):
1026 soup = self.soup("I wish I was bold.")
1027 value = soup.string.wrap(soup.new_tag("b"))
1028 self.assertEqual(value.decode(), "<b>I wish I was bold.</b>")
1029 self.assertEqual(
1030 soup.decode(), self.document_for("<b>I wish I was bold.</b>"))
1031
1032 def test_wrap_extracts_tag_from_elsewhere(self):
1033 soup = self.soup("<b></b>I wish I was bold.")
1034 soup.b.next_sibling.wrap(soup.b)
1035 self.assertEqual(
1036 soup.decode(), self.document_for("<b>I wish I was bold.</b>"))
1037
1038 def test_wrap_puts_new_contents_at_the_end(self):
1039 soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
1040 soup.b.next_sibling.wrap(soup.b)
1041 self.assertEqual(2, len(soup.b.contents))
1042 self.assertEqual(
1043 soup.decode(), self.document_for(
1044 "<b>I like being bold.I wish I was bold.</b>"))
1045
1046 def test_extract(self):
1047 soup = self.soup(
1048 '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')
1049
1050 self.assertEqual(len(soup.body.contents), 3)
1051 extracted = soup.find(id="nav").extract()
1052
1053 self.assertEqual(
1054 soup.decode(), "<html><body>Some content. More content.</body></html>")
1055 self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')
1056
1057 # The extracted tag is now an orphan.
1058 self.assertEqual(len(soup.body.contents), 2)
1059 self.assertEqual(extracted.parent, None)
1060 self.assertEqual(extracted.previous_element, None)
1061 self.assertEqual(extracted.next_element.next_element, None)
1062
1063 # The gap where the extracted tag used to be has been mended.
1064 content_1 = soup.find(text="Some content. ")
1065 content_2 = soup.find(text=" More content.")
1066 self.assertEqual(content_1.next_element, content_2)
1067 self.assertEqual(content_1.next_sibling, content_2)
1068 self.assertEqual(content_2.previous_element, content_1)
1069 self.assertEqual(content_2.previous_sibling, content_1)
1070
1071 def test_extract_distinguishes_between_identical_strings(self):
1072 soup = self.soup("<a>foo</a><b>bar</b>")
1073 foo_1 = soup.a.string
1074 bar_1 = soup.b.string
1075 foo_2 = soup.new_string("foo")
1076 bar_2 = soup.new_string("bar")
1077 soup.a.append(foo_2)
1078 soup.b.append(bar_2)
1079
1080 # Now there are two identical strings in the <a> tag, and two
1081 # in the <b> tag. Let's remove the first "foo" and the second
1082 # "bar".
1083 foo_1.extract()
1084 bar_2.extract()
1085 self.assertEqual(foo_2, soup.a.string)
1086 self.assertEqual(bar_2, soup.b.string)
1087
1088 def test_extract_multiples_of_same_tag(self):
1089 soup = self.soup("""
1090<html>
1091<head>
1092<script>foo</script>
1093</head>
1094<body>
1095 <script>bar</script>
1096 <a></a>
1097</body>
1098<script>baz</script>
1099</html>""")
1100 [soup.script.extract() for i in soup.find_all("script")]
1101 self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))
1102
1103
1104 def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
1105 soup = self.soup(
1106 '<html>\n'
1107 '<body>hi</body>\n'
1108 '</html>')
1109 soup.find('body').extract()
1110 self.assertEqual(None, soup.find('body'))
1111
1112
1113 def test_clear(self):
1114 """Tag.clear()"""
1115 soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
1116 # clear using extract()
1117 a = soup.a
1118 soup.p.clear()
1119 self.assertEqual(len(soup.p.contents), 0)
1120 self.assertTrue(hasattr(a, "contents"))
1121
1122 # clear using decompose()
1123 em = a.em
1124 a.clear(decompose=True)
1125 self.assertEqual(0, len(em.contents))
1126
1127 def test_string_set(self):
1128 """Tag.string = 'string'"""
1129 soup = self.soup("<a></a> <b><c></c></b>")
1130 soup.a.string = "foo"
1131 self.assertEqual(soup.a.contents, ["foo"])
1132 soup.b.string = "bar"
1133 self.assertEqual(soup.b.contents, ["bar"])
1134
1135 def test_string_set_does_not_affect_original_string(self):
1136 soup = self.soup("<a><b>foo</b><c>bar</c>")
1137 soup.b.string = soup.c.string
1138 self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>")
1139
1140 def test_set_string_preserves_class_of_string(self):
1141 soup = self.soup("<a></a>")
1142 cdata = CData("foo")
1143 soup.a.string = cdata
1144 self.assertTrue(isinstance(soup.a.string, CData))
1145
class TestElementObjects(SoupTest):
    """Test various features of element objects."""

    def test_len(self):
        """The length of an element is its number of children."""
        soup = self.soup("<top>1<b>2</b>3</top>")

        # The BeautifulSoup object itself contains one element: the
        # <top> tag.
        self.assertEqual(len(soup.contents), 1)
        self.assertEqual(len(soup), 1)

        # The <top> tag contains three elements: the text node "1", the
        # <b> tag, and the text node "3".
        self.assertEqual(len(soup.top), 3)
        self.assertEqual(len(soup.top.contents), 3)

    def test_member_access_invokes_find(self):
        """Accessing a Python member .foo invokes find('foo')"""
        soup = self.soup('<b><i></i></b>')
        self.assertEqual(soup.b, soup.find('b'))
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.a, None)

    def test_deprecated_member_access(self):
        # The legacy ".fooTag" spelling still works but emits a warning.
        soup = self.soup('<b><i></i></b>')
        with warnings.catch_warnings(record=True) as w:
            # Record every warning, including one that was already
            # issued earlier in the run and would otherwise be
            # suppressed, leaving w empty.
            warnings.simplefilter("always")
            tag = soup.bTag
        self.assertEqual(soup.b, tag)
        self.assertEqual(
            '.bTag is deprecated, use .find("b") instead.',
            str(w[0].message))

    def test_has_attr(self):
        """has_attr() checks for the presence of an attribute.

        Please note: has_attr() is different from the "in" operator
        (__contains__). has_attr() checks the tag's attributes and
        "in" checks the tag's children.
        """
        soup = self.soup("<foo attr='bar'>")
        self.assertTrue(soup.foo.has_attr('attr'))
        self.assertFalse(soup.foo.has_attr('attr2'))

    def test_attributes_come_out_in_alphabetical_order(self):
        markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
        self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')

    def test_string(self):
        # A tag that contains only a text node makes that node
        # available as .string.
        soup = self.soup("<b>foo</b>")
        self.assertEqual(soup.b.string, 'foo')

    def test_empty_tag_has_no_string(self):
        # A tag with no children has no .string.
        soup = self.soup("<b></b>")
        self.assertEqual(soup.b.string, None)

    def test_tag_with_multiple_children_has_no_string(self):
        # A tag with more than one child has no .string.
        soup = self.soup("<a>foo<b></b><b></b></b>")
        self.assertEqual(soup.b.string, None)

        soup = self.soup("<a>foo<b></b>bar</b>")
        self.assertEqual(soup.b.string, None)

        # Even if all the children are strings, due to trickery,
        # it won't work--but this would be a good optimization.
        soup = self.soup("<a>foo</b>")
        soup.a.insert(1, "bar")
        self.assertEqual(soup.a.string, None)

    def test_tag_with_recursive_string_has_string(self):
        # A tag with a single child which has a .string inherits that
        # .string.
        soup = self.soup("<a><b>foo</b></a>")
        self.assertEqual(soup.a.string, "foo")
        self.assertEqual(soup.string, "foo")

    def test_lack_of_string(self):
        """Only a tag containing a single text node has a .string."""
        soup = self.soup("<b>f<i>e</i>o</b>")
        self.assertFalse(soup.b.string)

        soup = self.soup("<b></b>")
        self.assertFalse(soup.b.string)

    def test_all_text(self):
        """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
        soup = self.soup("<a>a<b>r</b> <r> t </r></a>")
        # The strings are "a", "r", " " and " t ". Joined with no
        # separator, the lone space and the leading space of " t " sit
        # next to each other, giving two spaces.
        self.assertEqual(soup.a.text, "ar  t ")
        self.assertEqual(soup.a.get_text(strip=True), "art")
        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")

    def test_get_text_ignores_comments(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(soup.get_text(), "foobar")

        # Comments are included only when asked for explicitly, or when
        # type filtering is turned off with types=None.
        self.assertEqual(
            soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
        self.assertEqual(
            soup.get_text(types=None), "fooIGNOREbar")

    def test_all_strings_ignores_comments(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))
1255
class TestCDAtaListAttributes(SoupTest):
    """Tests of cdata-list attributes such as 'class'."""

    def test_single_value_becomes_list(self):
        # Even a lone class name comes back wrapped in a list.
        anchor = self.soup("<a class='foo'>").a
        self.assertEqual(["foo"], anchor['class'])

    def test_multiple_values_becomes_list(self):
        anchor = self.soup("<a class='foo bar'>").a
        self.assertEqual(["foo", "bar"], anchor['class'])

    def test_multiple_values_separated_by_weird_whitespace(self):
        # Tabs and newlines separate values just as spaces do.
        anchor = self.soup("<a class='foo\tbar\nbaz'>").a
        self.assertEqual(["foo", "bar", "baz"], anchor['class'])

    def test_attributes_joined_into_string_on_output(self):
        # On output the list is joined back together with single spaces.
        anchor = self.soup("<a class='foo\tbar'>").a
        self.assertEqual(b'<a class="foo bar"></a>', anchor.encode())

    def test_accept_charset(self):
        form = self.soup('<form accept-charset="ISO-8859-1 UTF-8">').form
        self.assertEqual(['ISO-8859-1', 'UTF-8'], form['accept-charset'])

    def test_cdata_attribute_applying_only_to_one_tag(self):
        # test_accept_charset shows accept-charset being split into a
        # list on <form>; on any other tag it stays a plain string.
        markup = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
        anchor = self.soup(markup).a
        self.assertEqual('ISO-8859-1 UTF-8', anchor['accept-charset'])

    def test_string_has_immutable_name_property(self):
        # A string's .name is None and cannot be assigned to.
        string = self.soup("s").string
        self.assertEqual(None, string.name)
        with self.assertRaises(AttributeError):
            string.name = 'foo'
1294
class TestPersistence(SoupTest):
    "Tests of pickling, copying and deep-copying parse trees."

    def setUp(self):
        # Every test gets a freshly parsed copy of the same sample page.
        super(TestPersistence, self).setUp()
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""
        self.tree = self.soup(self.page)

    def test_pickle_and_unpickle_identity(self):
        # A pickle round trip yields a BeautifulSoup object that renders
        # exactly like the original tree.
        restored = pickle.loads(pickle.dumps(self.tree, 2))
        self.assertEqual(restored.__class__, BeautifulSoup)
        self.assertEqual(restored.decode(), self.tree.decode())

    def test_deepcopy_identity(self):
        # A deep copy renders exactly like the tree it was copied from.
        duplicate = copy.deepcopy(self.tree)
        self.assertEqual(duplicate.decode(), self.tree.decode())

    def test_unicode_pickle(self):
        # Trees containing non-ASCII characters survive pickling too.
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        payload = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
        restored = pickle.loads(payload)
        self.assertEqual(restored.decode(), soup.decode())

    def test_copy_navigablestring_is_not_attached_to_tree(self):
        soup = self.soup("<b>Foo<a></a></b><b>Bar</b>")
        original = soup.find(string="Foo")
        duplicate = copy.copy(original)

        # Equal by value...
        self.assertEqual(original, duplicate)
        # ...but the copy has none of the original's tree links.
        self.assertEqual(None, duplicate.parent)
        self.assertEqual(None, duplicate.next_element)
        self.assertNotEqual(None, original.next_sibling)
        self.assertEqual(None, duplicate.next_sibling)
        self.assertEqual(None, duplicate.previous_element)

    def test_copy_navigablestring_subclass_has_same_type(self):
        # Copying a Comment gives back a Comment.
        soup = self.soup("<b><!--Foo--></b>")
        original = soup.string
        duplicate = copy.copy(original)
        self.assertEqual(original, duplicate)
        self.assertIsInstance(duplicate, Comment)

    def test_copy_entire_soup(self):
        soup = self.soup("<div><b>Foo<a></a></b><b>Bar</b></div>end")
        duplicate = copy.copy(soup)
        self.assertEqual(soup, duplicate)

    def test_copy_tag_copies_contents(self):
        soup = self.soup("<div><b>Foo<a></a></b><b>Bar</b></div>end")
        original = soup.div
        duplicate = copy.copy(original)

        # Same rendering and equal by comparison...
        self.assertEqual(str(original), str(duplicate))
        self.assertEqual(original, duplicate)

        # ...yet a distinct object...
        self.assertIsNot(original, duplicate)

        # ...with no ties to any parse tree, whereas the original is
        # still linked to what follows it in the document.
        self.assertEqual(None, duplicate.parent)
        self.assertEqual(None, duplicate.previous_element)
        self.assertEqual(None, duplicate.find(string='Bar').next_element)
        self.assertNotEqual(None, original.find(string='Bar').next_element)
1384
class TestSubstitutions(SoupTest):
    """Tests of entity and encoding substitution performed on output."""

    def test_default_formatter_is_minimal(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        # No formatter argument is passed, so this exercises the default
        # rather than repeating test_formatter_minimal.
        decoded = soup.decode()
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_html(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="html")
        # Both the angle brackets and the e-with-acute become entities.
        self.assertEqual(
            decoded,
            self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))

    def test_formatter_minimal(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="minimal")
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_null(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter=None)
        # Neither the angle brackets nor the e-with-acute are converted.
        # This is not valid HTML, but it's what the user wanted.
        self.assertEqual(decoded,
                          self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))

    def test_formatter_custom(self):
        markup = "<b>&lt;foo&gt;</b><b>bar</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter = lambda x: x.upper())
        # Instead of normal entity conversion code, the custom
        # callable is called on every string.
        self.assertEqual(
            decoded,
            self.document_for("<b><FOO></b><b>BAR</b>"))

    def test_formatter_is_run_on_attribute_values(self):
        markup = '<a href="http://a.com?a=b&c=é">e</a>'
        soup = self.soup(markup)
        a = soup.a

        # "minimal" (also the default) escapes only the ampersand.
        expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'

        self.assertEqual(expect_minimal, a.decode())
        self.assertEqual(expect_minimal, a.decode(formatter="minimal"))

        # "html" additionally turns the e-with-acute into an entity.
        expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
        self.assertEqual(expect_html, a.decode(formatter="html"))

        # No formatter: the attribute value comes out untouched.
        self.assertEqual(markup, a.decode(formatter=None))
        # A custom callable is applied to the attribute value as well
        # as to the text.
        expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
        self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))

    def test_formatter_skips_script_tag_for_html_documents(self):
        # The contents of <script> are emitted without entity
        # substitution.
        doc = """
 <script type="text/javascript">
 console.log("< < hey > > ");
 </script>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        self.assertTrue(b"< < hey > >" in encoded)

    def test_formatter_skips_style_tag_for_html_documents(self):
        # The contents of <style> are emitted without entity
        # substitution.
        doc = """
 <style type="text/css">
 console.log("< < hey > > ");
 </style>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        self.assertTrue(b"< < hey > >" in encoded)

    def test_prettify_leaves_preformatted_text_alone(self):
        soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
        # Everything outside the <pre> tag is reformatted, but everything
        # inside is left alone.
        self.assertEqual(
            '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
            soup.div.prettify())

    def test_prettify_accepts_formatter(self):
        soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
        pretty = soup.prettify(formatter = lambda x: x.upper())
        self.assertTrue("FOO" in pretty)

    def test_prettify_outputs_unicode_by_default(self):
        soup = self.soup("<a></a>")
        self.assertEqual(str, type(soup.prettify()))

    def test_prettify_can_encode_data(self):
        soup = self.soup("<a></a>")
        self.assertEqual(bytes, type(soup.prettify("utf-8")))

    def test_html_entity_substitution_off_by_default(self):
        markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
        soup = self.soup(markup)
        encoded = soup.b.encode("utf-8")
        self.assertEqual(encoded, markup.encode('utf-8'))

    def test_encoding_substitution(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')
        soup = self.soup(meta_tag)

        # Parse the document, and the charset appears unchanged.
        self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')

        # Encode the document into some encoding, and the encoding is
        # substituted into the meta tag.
        utf_8 = soup.encode("utf-8")
        self.assertTrue(b"charset=utf-8" in utf_8)

        euc_jp = soup.encode("euc_jp")
        self.assertTrue(b"charset=euc_jp" in euc_jp)

        shift_jis = soup.encode("shift-jis")
        self.assertTrue(b"charset=shift-jis" in shift_jis)

        # UTF-16 output is decoded back to text before searching it.
        utf_16_u = soup.encode("utf-16").decode("utf-16")
        self.assertTrue("charset=utf-16" in utf_16_u)

    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
        markup = ('<head><meta content="text/html; charset=x-sjis" '
                  'http-equiv="Content-type"/></head><pre>foo</pre>')

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.contents[0].name, 'pre')
1530
class TestEncoding(SoupTest):
    """Tests of turning parse-tree objects into encoded or decoded strings."""

    def test_unicode_string_can_be_encoded(self):
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        encoded = soup.b.string.encode("utf-8")
        self.assertEqual(encoded, "\N{SNOWMAN}".encode("utf-8"))

    def test_tag_containing_unicode_string_can_be_encoded(self):
        markup = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(markup)
        encoded = soup.b.encode("utf-8")
        self.assertEqual(encoded, markup.encode("utf-8"))

    def test_encoding_substitutes_unrecognized_characters_by_default(self):
        # A character ASCII cannot represent becomes a numeric entity.
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        encoded = soup.b.encode("ascii")
        self.assertEqual(encoded, b"<b>&#9731;</b>")

    def test_encoding_can_be_made_strict(self):
        # With errors="strict" the same character raises instead.
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        with self.assertRaises(UnicodeEncodeError):
            soup.encode("ascii", errors="strict")

    def test_decode_contents(self):
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        inner = soup.b.decode_contents()
        self.assertEqual("\N{SNOWMAN}", inner)

    def test_encode_contents(self):
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        inner = soup.b.encode_contents(encoding="utf8")
        self.assertEqual("\N{SNOWMAN}".encode("utf8"), inner)

    def test_deprecated_renderContents(self):
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        inner = soup.b.renderContents()
        self.assertEqual("\N{SNOWMAN}".encode("utf8"), inner)

    def test_repr(self):
        markup = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(markup)
        # Python 3 reprs are text and show the character itself; the
        # Python 2 branch expects an escaped byte string.
        if not PY3K:
            self.assertEqual(b'<b>\\u2603</b>', repr(soup))
        else:
            self.assertEqual(markup, repr(soup))
1582
class TestNavigableStringSubclasses(SoupTest):
    # Covers CData, Doctype and Declaration output.

    def test_cdata(self):
        # No current builder produces CData objects from CDATA sections,
        # so one is built by hand and inserted.
        soup = self.soup("")
        section = CData("foo")
        soup.insert(1, section)
        self.assertEqual(str(soup), "<![CDATA[foo]]>")
        self.assertEqual(soup.find(text="foo"), "foo")
        self.assertEqual(soup.contents[0], "foo")

    def test_cdata_is_never_formatted(self):
        """Text inside a CData object is passed into the formatter.

        But the return value is ignored.
        """

        self.count = 0

        def counting_formatter(*args):
            # Count each call; the returned text must never reach the
            # output.
            self.count += 1
            return "BITTER FAILURE"

        soup = self.soup("")
        section = CData("<><><>")
        soup.insert(1, section)
        encoded = soup.encode(formatter=counting_formatter)
        self.assertEqual(b"<![CDATA[<><><>]]>", encoded)
        self.assertEqual(1, self.count)

    def test_doctype_ends_in_newline(self):
        # A DOCTYPE, unlike the other NavigableString subclasses, is
        # always followed by a newline on output.
        declaration = Doctype("foo")
        soup = self.soup("")
        soup.insert(1, declaration)
        self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")

    def test_declaration(self):
        declaration = Declaration("foo")
        self.assertEqual("<?foo?>", declaration.output_ready())
1624
1625class TestSoupSelector(TreeTest):
1626
    # Shared fixture document. setUp() parses it with html.parser before
    # every test, and the selector tests identify matched elements by the
    # id attributes defined here.
    HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
<custom-dashed-tag class="dashed" id="dash2"/>
<div data-tag="dashedvalue" id="data1"/>
</span>
</div>
<x id="xid">
<z id="zida"/>
<z id="zidab"/>
<z id="zidac"/>
</x>
<y id="yid">
<z id="zidb"/>
</y>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>

<div id="footer">
</div>
"""
1675
1676 def setUp(self):
1677 self.soup = BeautifulSoup(self.HTML, 'html.parser')
1678
1679 def assertSelects(self, selector, expected_ids):
1680 el_ids = [el['id'] for el in self.soup.select(selector)]
1681 el_ids.sort()
1682 expected_ids.sort()
1683 self.assertEqual(expected_ids, el_ids,
1684 "Selector %s, expected [%s], got [%s]" % (
1685 selector, ', '.join(expected_ids), ', '.join(el_ids)
1686 )
1687 )
1688
1689 assertSelect = assertSelects
1690
1691 def assertSelectMultiple(self, *tests):
1692 for selector, expected_ids in tests:
1693 self.assertSelect(selector, expected_ids)
1694
1695 def test_one_tag_one(self):
1696 els = self.soup.select('title')
1697 self.assertEqual(len(els), 1)
1698 self.assertEqual(els[0].name, 'title')
1699 self.assertEqual(els[0].contents, ['The title'])
1700
1701 def test_one_tag_many(self):
1702 els = self.soup.select('div')
1703 self.assertEqual(len(els), 4)
1704 for div in els:
1705 self.assertEqual(div.name, 'div')
1706
1707 el = self.soup.select_one('div')
1708 self.assertEqual('main', el['id'])
1709
1710 def test_select_one_returns_none_if_no_match(self):
1711 match = self.soup.select_one('nonexistenttag')
1712 self.assertEqual(None, match)
1713
1714
1715 def test_tag_in_tag_one(self):
1716 els = self.soup.select('div div')
1717 self.assertSelects('div div', ['inner', 'data1'])
1718
1719 def test_tag_in_tag_many(self):
1720 for selector in ('html div', 'html body div', 'body div'):
1721 self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])
1722
1723 def test_tag_no_match(self):
1724 self.assertEqual(len(self.soup.select('del')), 0)
1725
1726 def test_invalid_tag(self):
1727 self.assertRaises(ValueError, self.soup.select, 'tag%t')
1728
1729 def test_select_dashed_tag_ids(self):
1730 self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
1731
1732 def test_select_dashed_by_id(self):
1733 dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
1734 self.assertEqual(dashed[0].name, 'custom-dashed-tag')
1735 self.assertEqual(dashed[0]['id'], 'dash2')
1736
1737 def test_dashed_tag_text(self):
1738 self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')
1739
1740 def test_select_dashed_matches_find_all(self):
1741 self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
1742
1743 def test_header_tags(self):
1744 self.assertSelectMultiple(
1745 ('h1', ['header1']),
1746 ('h2', ['header2', 'header3']),
1747 )
1748
1749 def test_class_one(self):
1750 for selector in ('.onep', 'p.onep', 'html p.onep'):
1751 els = self.soup.select(selector)
1752 self.assertEqual(len(els), 1)
1753 self.assertEqual(els[0].name, 'p')
1754 self.assertEqual(els[0]['class'], ['onep'])
1755
1756 def test_class_mismatched_tag(self):
1757 els = self.soup.select('div.onep')
1758 self.assertEqual(len(els), 0)
1759
1760 def test_one_id(self):
1761 for selector in ('div#inner', '#inner', 'div div#inner'):
1762 self.assertSelects(selector, ['inner'])
1763
1764 def test_bad_id(self):
1765 els = self.soup.select('#doesnotexist')
1766 self.assertEqual(len(els), 0)
1767
1768 def test_items_in_id(self):
1769 els = self.soup.select('div#inner p')
1770 self.assertEqual(len(els), 3)
1771 for el in els:
1772 self.assertEqual(el.name, 'p')
1773 self.assertEqual(els[1]['class'], ['onep'])
1774 self.assertFalse(els[0].has_attr('class'))
1775
1776 def test_a_bunch_of_emptys(self):
1777 for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
1778 self.assertEqual(len(self.soup.select(selector)), 0)
1779
1780 def test_multi_class_support(self):
1781 for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
1782 '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
1783 self.assertSelects(selector, ['pmulti'])
1784
1785 def test_multi_class_selection(self):
1786 for selector in ('.class1.class3', '.class3.class2',
1787 '.class1.class2.class3'):
1788 self.assertSelects(selector, ['pmulti'])
1789
1790 def test_child_selector(self):
1791 self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
1792 self.assertSelects('.s1 > a span', ['s1a2s1'])
1793
def test_child_selector_id(self):
    """Check '>' when the child step also carries an id."""
    css = '.s1 > a#s1a2 span'
    expected = ['s1a2s1']
    self.assertSelects(css, expected)
1796
def test_attribute_equals(self):
    """Check the [attr="value"] selector, with and without a tag name.

    Each case is a (selector, expected results) pair handed to
    assertSelectMultiple. The selectors cover class, id, rel, type and
    href attributes, including values that should match nothing.
    """
    cases = (
        # class and id on a paragraph
        ('p[class="onep"]', ['p1']),
        ('p[id="p1"]', ['p1']),
        ('[class="onep"]', ['p1']),
        ('[id="p1"]', ['p1']),
        # attributes of the <link> tag, qualified by tag name
        ('link[rel="stylesheet"]', ['l1']),
        ('link[type="text/css"]', ['l1']),
        ('link[href="blah.css"]', ['l1']),
        ('link[href="no-blah.css"]', []),
        # the same attributes without a tag name
        ('[rel="stylesheet"]', ['l1']),
        ('[type="text/css"]', ['l1']),
        ('[href="blah.css"]', ['l1']),
        # A second, identical '[href="no-blah.css"]' case used to follow
        # the 'p[...]' case below; it added no coverage and was dropped.
        ('[href="no-blah.css"]', []),
        ('p[href="no-blah.css"]', []),
    )
    self.assertSelectMultiple(*cases)
1814
def test_attribute_tilde(self):
    """Check the [attr~="value"] selector on class and rel attributes."""
    cases = (
        # class, with and without a tag name
        ('p[class~="class1"]', ['pmulti']),
        ('p[class~="class2"]', ['pmulti']),
        ('p[class~="class3"]', ['pmulti']),
        ('[class~="class1"]', ['pmulti']),
        ('[class~="class2"]', ['pmulti']),
        ('[class~="class3"]', ['pmulti']),
        # rel, with and without a tag name
        ('a[rel~="friend"]', ['bob']),
        ('a[rel~="met"]', ['bob']),
        ('[rel~="friend"]', ['bob']),
        ('[rel~="met"]', ['bob']),
    )
    self.assertSelectMultiple(*cases)
1828
def test_attribute_startswith(self):
    """Check the [attr^="value"] (prefix) selector."""
    cases = (
        # rel
        ('[rel^="style"]', ['l1']),
        ('link[rel^="style"]', ['l1']),
        ('notlink[rel^="notstyle"]', []),
        ('[rel^="notstyle"]', []),
        ('link[rel^="notstyle"]', []),
        # href
        ('link[href^="bla"]', ['l1']),
        ('a[href^="http://"]', ['bob', 'me']),
        ('[href^="http://"]', ['bob', 'me']),
        # id
        ('[id^="p"]', ['pmulti', 'p1']),
        ('[id^="m"]', ['me', 'main']),
        ('div[id^="m"]', ['main']),
        ('a[id^="m"]', ['me']),
        # an attribute whose name contains a dash
        ('div[data-tag^="dashed"]', ['data1']),
    )
    self.assertSelectMultiple(*cases)
1845
def test_attribute_endswith(self):
    """Check the [attr$="value"] (suffix) selector."""
    ids_ending_in_1 = [
        'data1', 'l1', 'p1', 'header1',
        's1a1', 's2a1', 's1a2s1', 'dash1',
    ]
    cases = (
        ('[href$=".css"]', ['l1']),
        ('link[href$=".css"]', ['l1']),
        ('link[id$="1"]', ['l1']),
        ('[id$="1"]', ids_ending_in_1),
        ('div[id$="1"]', ['data1']),
        ('[id$="noending"]', []),
    )
    self.assertSelectMultiple(*cases)
1855
def test_attribute_contains(self):
    """Check the [attr*="value"] (substring) selector.

    The cases are passed to assertSelectMultiple in three groups: ones
    adapted from test_attribute_startswith, ones adapted from
    test_attribute_endswith, and ones specific to this test.
    """
    adapted_from_startswith = (
        ('[rel*="style"]', ['l1']),
        ('link[rel*="style"]', ['l1']),
        ('notlink[rel*="notstyle"]', []),
        ('[rel*="notstyle"]', []),
        ('link[rel*="notstyle"]', []),
        ('link[href*="bla"]', ['l1']),
        ('[href*="http://"]', ['bob', 'me']),
        ('[id*="p"]', ['pmulti', 'p1']),
        ('div[id*="m"]', ['main']),
        ('a[id*="m"]', ['me']),
    )
    adapted_from_endswith = (
        ('[href*=".css"]', ['l1']),
        ('link[href*=".css"]', ['l1']),
        ('link[id*="1"]', ['l1']),
        ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2',
                       's2a1', 's1a2s1', 'dash1']),
        ('div[id*="1"]', ['data1']),
        ('[id*="noending"]', []),
    )
    new_for_this_test = (
        ('[href*="."]', ['bob', 'me', 'l1']),
        ('a[href*="."]', ['bob', 'me']),
        ('link[href*="."]', ['l1']),
        ('div[id*="n"]', ['main', 'inner']),
        ('div[id*="nn"]', ['inner']),
        ('div[data-tag*="edval"]', ['data1']),
    )
    cases = adapted_from_startswith + adapted_from_endswith + new_for_this_test
    self.assertSelectMultiple(*cases)
1884
def test_attribute_exact_or_hypen(self):
    """Check the [attr|="value"] selector against the lang attribute."""
    english = ['lang-en', 'lang-en-gb', 'lang-en-us']
    cases = (
        ('p[lang|="en"]', english),
        # A separate list object, as each case had in the literal form.
        ('[lang|="en"]', list(english)),
        ('p[lang|="fr"]', ['lang-fr']),
        ('p[lang|="gb"]', []),
    )
    self.assertSelectMultiple(*cases)
1892
def test_attribute_exists(self):
    """Check the bare [attr] (attribute presence) selector."""
    cases = (
        ('[rel]', ['l1', 'bob', 'me']),
        ('link[rel]', ['l1']),
        ('a[rel]', ['bob', 'me']),
        ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
        ('p[class]', ['p1', 'pmulti']),
        # 'blah' is expected on no element
        ('[blah]', []),
        ('p[blah]', []),
        # an attribute whose name contains a dash
        ('div[data-tag]', ['data1']),
    )
    self.assertSelectMultiple(*cases)
1904
def test_unsupported_pseudoclass(self):
    """select() raises NotImplementedError for these pseudoclass selectors.

    The two selectors are an unknown pseudoclass name and nth-of-type
    with the argument 'a'.
    """
    rejected = ("a:no-such-pseudoclass", "a:nth-of-type(a)")
    for css in rejected:
        self.assertRaises(NotImplementedError, self.soup.select, css)
1911
1912
def test_nth_of_type(self):
    """Check :nth-of-type(n) on the <p> tags under div#inner."""
    # The first and third paragraphs are each found exactly once.
    present = (
        ('div#inner p:nth-of-type(1)', 'Some text'),
        ('div#inner p:nth-of-type(3)', 'Another'),
    )
    for css, text in present:
        found = self.soup.select(css)
        self.assertEqual(len(found), 1)
        self.assertEqual(found[0].string, text)

    # There is no fourth paragraph, so nothing is selected.
    missing = self.soup.select('div#inner p:nth-of-type(4)')
    self.assertEqual(len(missing), 0)

    # A position of zero is rejected as invalid.
    self.assertRaises(
        ValueError, self.soup.select, 'div p:nth-of-type(0)')
1931
def test_nth_of_type_direct_descendant(self):
    """Check :nth-of-type(1) after the '>' child combinator."""
    found = self.soup.select('div#inner > p:nth-of-type(1)')
    self.assertEqual(len(found), 1)
    first_paragraph = found[0]
    self.assertEqual(first_paragraph.string, 'Some text')
1936
def test_id_child_selector_nth_of_type(self):
    """Check an id, the '>' combinator and :nth-of-type(2) together."""
    css = '#inner > p:nth-of-type(2)'
    expected = ['p1']
    self.assertSelects(css, expected)
1939
def test_select_on_element(self):
    """select() can be called on a single tag, not only on the tree."""
    # Start from the <div id="main"> element instead of the whole soup.
    main_div = self.soup.find("div", id="main")
    nested_divs = main_div.select("div")
    # The divs with ids 'inner' and 'data1' are expected back. Per the
    # original note here, <div id="footer"> must not be selected.
    self.assertSelectsIDs(nested_divs, ['inner', 'data1'])
1948
def test_overspecified_child_id(self):
    """An id selector still honours the ancestor class in front of it."""
    checks = (
        (".fancy #inner", ['inner']),
        (".normal #inner", []),
    )
    for css, expected in checks:
        self.assertSelects(css, expected)
1952
def test_adjacent_sibling_selector(self):
    """Check the '+' (adjacent sibling) combinator, alone and chained."""
    checks = (
        ('#p1 + h2', ['header2']),
        ('#p1 + h2 + p', ['pmulti']),
        ('#p1 + #header2 + .class1', ['pmulti']),
    )
    for css, expected in checks:
        self.assertSelects(css, expected)
    # '#p1 + p' is expected to select nothing.
    no_match = self.soup.select('#p1 + p')
    self.assertEqual([], no_match)
1958
def test_general_sibling_selector(self):
    """Check the '~' (general sibling) combinator, alone and with '+'."""
    checks = (
        ('#p1 ~ h2', ['header2', 'header3']),
        ('#p1 ~ #header2', ['header2']),
        ('#p1 ~ h2 + a', ['me']),
        ('#p1 ~ h2 + [rel="me"]', ['me']),
    )
    for css, expected in checks:
        self.assertSelects(css, expected)
    # '#inner ~ h2' is expected to select nothing.
    no_match = self.soup.select('#inner ~ h2')
    self.assertEqual([], no_match)
1965
def test_dangling_combinator(self):
    """A selector that ends in a combinator raises ValueError."""
    dangling = 'h1 >'
    self.assertRaises(ValueError, self.soup.select, dangling)
1968
def test_sibling_combinator_wont_select_same_tag_twice(self):
    """'p[lang] ~ p' is expected to list each later sibling only once."""
    css = 'p[lang] ~ p'
    expected = ['lang-en-gb', 'lang-en-us', 'lang-fr']
    self.assertSelects(css, expected)
1971
1972 # Test the selector grouping operator (the comma)
def test_multiple_select(self):
    """Two selectors joined by ', ' select the matches of both."""
    css = 'x, y'
    expected = ['xid', 'yid']
    self.assertSelects(css, expected)
1975
def test_multiple_select_with_no_space(self):
    """A comma with no following space still separates two selectors."""
    css = 'x,y'
    expected = ['xid', 'yid']
    self.assertSelects(css, expected)
1978
def test_multiple_select_with_more_space(self):
    """Extra whitespace after the comma does not change the result.

    The selector must contain a run of several spaces after the comma.
    With a single space it is identical to the one used by
    test_multiple_select and this case would go untested.
    """
    css = 'x,    y'
    expected = ['xid', 'yid']
    self.assertSelects(css, expected)
1981
def test_multiple_select_duplicated(self):
    """The same selector listed twice is expected to yield 'xid' once."""
    css = 'x, x'
    expected = ['xid']
    self.assertSelects(css, expected)
1984
def test_multiple_select_sibling(self):
    """A group may mix a plain tag with a '~' sibling selector."""
    css = 'x, y ~ p[lang=fr]'
    expected = ['xid', 'lang-fr']
    self.assertSelects(css, expected)
1987
def test_multiple_select_tag_and_direct_descendant(self):
    """A group may mix a plain tag with a '>' child selector."""
    css = 'x, y > z'
    expected = ['xid', 'zidb']
    self.assertSelects(css, expected)
1990
def test_multiple_select_direct_descendant_and_tags(self):
    """A group may start with a '>' selector followed by plain tags."""
    css = 'div > x, y, z'
    expected = ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']
    self.assertSelects(css, expected)
1993
def test_multiple_select_indirect_descendant(self):
    """A group may start with a descendant selector followed by tags."""
    css = 'div x,y, z'
    expected = ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']
    self.assertSelects(css, expected)
1996
def test_invalid_multiple_select(self):
    """A leading comma or an empty group member raises ValueError."""
    malformed = (',x, y', 'x,,y')
    for css in malformed:
        self.assertRaises(ValueError, self.soup.select, css)
2000
def test_multiple_select_attrs(self):
    """Each member of a group may carry its own attribute selector."""
    css = 'p[lang=en], p[lang=en-gb]'
    expected = ['lang-en', 'lang-en-gb']
    self.assertSelects(css, expected)
2003
def test_multiple_select_ids(self):
    """A four-member group mixing a tag, a '>' selector and id tests."""
    css = 'x, y > z[id=zida], z[id=zidab], z[id=zidb]'
    expected = ['xid', 'zidb', 'zidab']
    self.assertSelects(css, expected)
2006
def test_multiple_select_nested(self):
    """Each member of a group may be a chain of '>' combinators."""
    css = 'body > div > x, y > z'
    expected = ['xid', 'zidb']
    self.assertSelects(css, expected)
2009
2010
2011