A"""
soup = self.soup(markup)
self.assertEqual(u"A", soup.body.decode())
def test_extraction(self):
"""
Test that extraction does not destroy the tree.
https://bugs.launchpad.net/beautifulsoup/+bug/1782928
"""
markup = """
hello
"""
soup = self.soup(markup)
[s.extract() for s in soup('script')]
[s.extract() for s in soup('style')]
self.assertEqual(len(soup.find_all("p")), 1)
def test_empty_comment(self):
"""
Test that empty comment does not break structure.
https://bugs.launchpad.net/beautifulsoup/+bug/1806598
"""
markup = """
"""
soup = self.soup(markup)
inputs = []
for form in soup.find_all('form'):
inputs.extend(form.find_all('input'))
self.assertEqual(len(inputs), 1)
def test_tracking_line_numbers(self):
# The html.parser TreeBuilder keeps track of line number and
# position of each element.
markup = "\n \n\n\ntext "
soup = self.soup(markup)
self.assertEqual(2, soup.p.sourceline)
self.assertEqual(5, soup.p.sourcepos)
self.assertEqual("sourceline", soup.p.find('sourceline').name)
# You can deactivate this behavior.
soup = self.soup(markup, store_line_numbers=False)
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
beautifulsoup4-4.8.2/bs4/tests/test_htmlparser.py 0000644 0001750 0001750 00000004462 13515137110 023565 0 ustar leonardr leonardr 0000000 0000000 """Tests to ensure that the html.parser tree builder generates good
trees."""
from pdb import set_trace
import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
default_builder = HTMLParserTreeBuilder
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_namespaced_public_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_builder_is_pickled(self):
"""Unlike most tree builders, HTMLParserTreeBuilder and will
be restored after pickling.
"""
tree = self.soup("foo")
dumped = pickle.dumps(tree, 2)
loaded = pickle.loads(dumped)
self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
def test_redundant_empty_element_closing_tags(self):
self.assertSoupEquals('
', "
")
self.assertSoupEquals('', "")
def test_empty_element(self):
# This verifies that any buffered data present when the parser
# finishes working is handled.
self.assertSoupEquals("foo bar", "foo &# bar")
def test_tracking_line_numbers(self):
# The html.parser TreeBuilder keeps track of line number and
# position of each element.
markup = "\n \n\n\ntext "
soup = self.soup(markup)
self.assertEqual(2, soup.p.sourceline)
self.assertEqual(3, soup.p.sourcepos)
self.assertEqual("sourceline", soup.p.find('sourceline').name)
# You can deactivate this behavior.
soup = self.soup(markup, store_line_numbers=False)
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way
that doesn't cause a crash.
"""
parser = BeautifulSoupHTMLParser()
parser.error("don't crash")
beautifulsoup4-4.8.2/bs4/tests/test_lxml.py 0000644 0001750 0001750 00000010014 13515137015 022352 0 ustar leonardr leonardr 0000000 0000000 """Tests to ensure that the lxml tree builder generates good trees."""
import re
import warnings
try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError, e:
LXML_PRESENT = False
LXML_VERSION = (0,)
if LXML_PRESENT:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.testing import skipIf
from bs4.tests import test_htmlparser
from bs4.testing import (
HTMLTreeBuilderSmokeTest,
XMLTreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilder
def test_out_of_range_entity(self):
self.assertSoupEquals(
"foobar ", "foobar ")
self.assertSoupEquals(
"foobar ", "foobar ")
self.assertSoupEquals(
"foobar ", "foobar ")
def test_entities_in_foreign_document_encoding(self):
# We can't implement this case correctly because by the time we
# hear about markup like "", it's been (incorrectly) converted into
# a string like u'\x93'
pass
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
@skipIf(
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
"Skipping doctype test for old version of lxml to avoid segfault.")
def test_empty_doctype(self):
soup = self.soup("")
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_beautifulstonesoup_is_xml_parser(self):
# Make sure that the deprecated BSS class uses an xml builder
# if one is installed.
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("")
self.assertEqual(u"", unicode(soup.b))
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
def test_tracking_line_numbers(self):
# The lxml TreeBuilder cannot keep track of line numbers from
# the original markup. Even if you ask for line numbers, we
# don't have 'em.
#
# This means that if you have a tag like or
# , attribute access will find it rather than
# giving you a numeric answer.
soup = self.soup(
"\n \n\n\ntext ",
store_line_numbers=True
)
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilderForXML
def test_namespace_indexing(self):
# We should not track un-prefixed namespaces as we can only hold one
# and it will be recognized as the default namespace by soupsieve,
# which may be confusing in some situations. When no namespace is provided
# for a selector, the default namespace (if defined) is assumed.
soup = self.soup(
'\n'
''
'content'
'content'
''
)
self.assertEqual(
soup._namespaces,
{'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
)
beautifulsoup4-4.8.2/bs4/tests/test_soup.py 0000644 0001750 0001750 00000066014 13546175122 022404 0 ustar leonardr leonardr 0000000 0000000 # -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
from pdb import set_trace
import logging
import unittest
import sys
import tempfile
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.builder import (
TreeBuilder,
ParserRejectedMarkup,
)
from bs4.element import (
CharsetMetaAttributeValue,
Comment,
ContentMetaAttributeValue,
SoupStrainer,
NamespacedAttribute,
Tag,
NavigableString,
)
import bs4.dammit
from bs4.dammit import (
EntitySubstitution,
UnicodeDammit,
EncodingDetector,
)
from bs4.testing import (
default_builder,
SoupTest,
skipIf,
)
import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError as e:
LXML_PRESENT = False
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest):
def test_short_unicode_input(self):
data = u"éé"
soup = self.soup(data)
self.assertEqual(u"éé", soup.h1.string)
def test_embedded_null(self):
data = u"foo\0bar"
soup = self.soup(data)
self.assertEqual(u"foo\0bar", soup.h1.string)
def test_exclude_encodings(self):
utf8_data = u"Räksmörgås".encode("utf-8")
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual("windows-1252", soup.original_encoding)
def test_custom_builder_class(self):
# Verify that you can pass in a custom Builder class and
# it'll be instantiated with the appropriate keyword arguments.
class Mock(object):
def __init__(self, **kwargs):
self.called_with = kwargs
self.is_xml = True
self.store_line_numbers = False
self.cdata_list_attributes = []
self.preserve_whitespace_tags = []
def initialize_soup(self, soup):
pass
def feed(self, markup):
self.fed = markup
def reset(self):
pass
def ignore(self, ignore):
pass
set_up_substitutions = can_be_empty_element = ignore
def prepare_markup(self, *args, **kwargs):
yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"
kwargs = dict(
var="value",
# This is a deprecated BS3-era keyword argument, which
# will be stripped out.
convertEntities=True,
)
with warnings.catch_warnings(record=True):
soup = BeautifulSoup('', builder=Mock, **kwargs)
assert isinstance(soup.builder, Mock)
self.assertEqual(dict(var="value"), soup.builder.called_with)
self.assertEqual("prepared markup", soup.builder.fed)
# You can also instantiate the TreeBuilder yourself. In this
# case, that specific object is used and any keyword arguments
# to the BeautifulSoup constructor are ignored.
builder = Mock(**kwargs)
with warnings.catch_warnings(record=True) as w:
soup = BeautifulSoup(
'', builder=builder, ignored_value=True,
)
msg = str(w[0].message)
assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
self.assertEqual(builder, soup.builder)
self.assertEqual(kwargs, builder.called_with)
def test_parser_markup_rejection(self):
# If markup is completely rejected by the parser, an
# explanatory ParserRejectedMarkup exception is raised.
class Mock(TreeBuilder):
def feed(self, *args, **kwargs):
raise ParserRejectedMarkup("Nope.")
def prepare_markup(self, *args, **kwargs):
# We're going to try two different ways of preparing this markup,
# but feed() will reject both of them.
yield markup, None, None, False
yield markup, None, None, False
import re
self.assertRaisesRegexp(
ParserRejectedMarkup,
"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.",
BeautifulSoup, '', builder=Mock,
)
def test_cdata_list_attributes(self):
# Most attribute values are represented as scalars, but the
# HTML standard says that some attributes, like 'class' have
# space-separated lists as values.
markup = ''
soup = self.soup(markup)
# Note that the spaces are stripped for 'class' but not for 'id'.
a = soup.a
self.assertEqual(" an id ", a['id'])
self.assertEqual(["a", "class"], a['class'])
# TreeBuilder takes an argument called 'mutli_valued_attributes' which lets
# you customize or disable this. As always, you can customize the TreeBuilder
# by passing in a keyword argument to the BeautifulSoup constructor.
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
self.assertEqual(" a class ", soup.a['class'])
# Here are two ways of saying that `id` is a multi-valued
# attribute in this context, but 'class' is not.
for switcheroo in ({'*': 'id'}, {'a': 'id'}):
with warnings.catch_warnings(record=True) as w:
# This will create a warning about not explicitly
# specifying a parser, but we'll ignore it.
soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
a = soup.a
self.assertEqual(["an", "id"], a['id'])
self.assertEqual(" a class ", a['class'])
def test_replacement_classes(self):
# Test the ability to pass in replacements for element classes
# which will be used when building the tree.
class TagPlus(Tag):
pass
class StringPlus(NavigableString):
pass
class CommentPlus(Comment):
pass
soup = self.soup(
"foobar",
element_classes = {
Tag: TagPlus,
NavigableString: StringPlus,
Comment: CommentPlus,
}
)
# The tree was built with TagPlus, StringPlus, and CommentPlus objects,
# rather than Tag, String, and Comment objects.
assert all(
isinstance(x, (TagPlus, StringPlus, CommentPlus))
for x in soup.recursiveChildGenerator()
)
class TestWarnings(SoupTest):
def _no_parser_specified(self, s, is_there=True):
v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
self.assertTrue(v)
def test_warning_if_no_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("")
msg = str(w[0].message)
self._assert_no_parser_specified(msg)
def test_warning_if_parser_specified_too_vague(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("", "html")
msg = str(w[0].message)
self._assert_no_parser_specified(msg)
def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("", "html.parser")
self.assertEqual([], w)
def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("", parseOnlyThese=SoupStrainer("b"))
msg = str(w[0].message)
self.assertTrue("parseOnlyThese" in msg)
self.assertTrue("parse_only" in msg)
self.assertEqual(b"", soup.encode())
def test_fromEncoding_renamed_to_from_encoding(self):
with warnings.catch_warnings(record=True) as w:
utf8 = b"\xc3\xa9"
soup = self.soup(utf8, fromEncoding="utf8")
msg = str(w[0].message)
self.assertTrue("fromEncoding" in msg)
self.assertTrue("from_encoding" in msg)
self.assertEqual("utf8", soup.original_encoding)
def test_unrecognized_keyword_argument(self):
self.assertRaises(
TypeError, self.soup, "", no_such_argument=True)
class TestWarnings(SoupTest):
def test_disk_file_warning(self):
filehandle = tempfile.NamedTemporaryFile()
filename = filehandle.name
try:
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
msg = str(w[0].message)
self.assertTrue("looks like a filename" in msg)
finally:
filehandle.close()
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
self.assertEqual(0, len(w))
def test_url_warning_with_bytes_url(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/")
# Be aware this isn't the only warning that can be raised during
# execution..
self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list))
def test_url_warning_with_unicode_url(self):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
soup = self.soup(u"http://www.crummyunicode.com/")
self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list))
def test_url_warning_with_bytes_and_space(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
for w in warning_list))
def test_url_warning_with_unicode_and_space(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(u"http://www.crummyuncode.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
for w in warning_list))
class TestSelectiveParsing(SoupTest):
def test_parse_with_soupstrainer(self):
markup = "NoYesNoYes Yes"
strainer = SoupStrainer("b")
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(soup.encode(), b"YesYes Yes")
class TestEntitySubstitution(unittest.TestCase):
"""Standalone tests of the EntitySubstitution class."""
def setUp(self):
self.sub = EntitySubstitution
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entites
# are substituted, and no others.
s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
self.assertEqual(self.sub.substitute_html(s),
u"foo∀\N{SNOWMAN}õbar")
def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we
# give them a special test.
quotes = b"\x91\x92foo\x93\x94"
dammit = UnicodeDammit(quotes)
self.assertEqual(self.sub.substitute_html(dammit.markup),
"‘’foo“”")
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, False), s)
def test_xml_attribute_quoting_normally_uses_double_quotes(self):
self.assertEqual(self.sub.substitute_xml("Welcome", True),
'"Welcome"')
self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
'"Bob\'s Bar"')
def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, True),
"'Welcome to \"my bar\"'")
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
s = 'Welcome to "Bob\'s Bar"'
self.assertEqual(
self.sub.substitute_xml(s, True),
'"Welcome to "Bob\'s Bar""')
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
quoted = 'Welcome to "Bob\'s Bar"'
self.assertEqual(self.sub.substitute_xml(quoted), quoted)
def test_xml_quoting_handles_angle_brackets(self):
self.assertEqual(
self.sub.substitute_xml("foo"),
"foo<bar>")
def test_xml_quoting_handles_ampersands(self):
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")
def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml("ÁT&T"),
"ÁT&T")
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml_containing_entities("ÁT&T"),
"ÁT&T")
def test_quotes_not_html_substituted(self):
"""There's no need to do this except inside attribute values."""
text = 'Bob\'s "bar"'
self.assertEqual(self.sub.substitute_html(text), text)
class TestEncodingConversion(SoupTest):
# Test Beautiful Soup's ability to decode and encode from various
# encodings.
def setUp(self):
super(TestEncodingConversion, self).setUp()
self.unicode_data = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!'
self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like.
self.assertEqual(
self.utf8_data,
b'Sacr\xc3\xa9 bleu!')
def test_ascii_in_unicode_out(self):
# ASCII input is converted to Unicode. The original_encoding
# attribute is set to 'utf-8', a superset of ASCII.
chardet = bs4.dammit.chardet_dammit
logging.disable(logging.WARNING)
try:
def noop(str):
return None
# Disable chardet, which will realize that the ASCII is ASCII.
bs4.dammit.chardet_dammit = noop
ascii = b"a"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
self.assertTrue(isinstance(unicode_output, unicode))
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
finally:
logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet
def test_unicode_in_unicode_out(self):
# Unicode input is left alone. The original_encoding attribute
# is not set.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self):
# UTF-8 input is converted to Unicode. The original_encoding
# attribute is set.
soup_from_utf8 = self.soup(self.utf8_data)
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
def test_utf8_out(self):
# The internal data structures can be encoded as UTF-8.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
@skipIf(
PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self):
markup = u''
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit."""
def test_unicode_input(self):
markup = u"I'm already Unicode! \N{SNOWMAN}"
dammit = UnicodeDammit(markup)
self.assertEqual(dammit.unicode_markup, markup)
def test_smart_quotes_to_unicode(self):
markup = b"\x91\x92\x93\x94"
dammit = UnicodeDammit(markup)
self.assertEqual(
dammit.unicode_markup, u"\u2018\u2019\u201c\u201d")
def test_smart_quotes_to_xml_entities(self):
markup = b"\x91\x92\x93\x94"
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
self.assertEqual(
dammit.unicode_markup, "‘’“”")
def test_smart_quotes_to_html_entities(self):
markup = b"\x91\x92\x93\x94"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
self.assertEqual(
dammit.unicode_markup, "‘’“”")
def test_smart_quotes_to_ascii(self):
markup = b"\x91\x92\x93\x94"
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
self.assertEqual(
dammit.unicode_markup, """''""""")
def test_detect_utf8(self):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
dammit = UnicodeDammit(utf_8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_exclude_encodings(self):
# This is UTF-8.
utf8_data = u"Räksmörgås".encode("utf-8")
# But if we exclude UTF-8 from consideration, the guess is
# Windows-1252.
dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
# And if we exclude that, there is no valid guess at all.
dammit = UnicodeDammit(
utf8_data, exclude_encodings=["utf-8", "windows-1252"])
self.assertEqual(dammit.original_encoding, None)
def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
detected = EncodingDetector(
b'')
encodings = list(detected.encodings)
assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
def test_detect_html5_style_meta_tag(self):
for data in (
b'',
b"",
b"",
b""):
dammit = UnicodeDammit(data, is_html=True)
self.assertEqual(
"euc-jp", dammit.original_encoding)
def test_last_ditch_entity_replacement(self):
# This is a UTF-8 document that contains bytestrings
# completely incompatible with UTF-8 (ie. encoded with some other
# encoding).
#
# Since there is no consistent encoding for the document,
# Unicode, Dammit will eventually encode the document as UTF-8
# and encode the incompatible characters as REPLACEMENT
# CHARACTER.
#
# If chardet is installed, it will detect that the document
# can be converted into ISO-8859-1 without errors. This happens
# to be the wrong encoding, but it is a consistent encoding, so the
# code we're testing here won't run.
#
# So we temporarily disable chardet if it's present.
doc = b"""\357\273\277
\330\250\330\252\330\261
\310\322\321\220\312\321\355\344"""
chardet = bs4.dammit.chardet_dammit
logging.disable(logging.WARNING)
try:
def noop(str):
return None
bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue(u"\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
finally:
logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet
def test_byte_order_mark_removed(self):
# A document written in UTF-16LE will have its byte order marker stripped.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
self.assertEqual(u"áé", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
# The document can't be turned into UTF-8:
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
# Unicode, Dammit thinks the whole document is Windows-1252,
# and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
# But if we run it through fix_embedded_windows_1252, it's fixed:
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
# in \x93. \x93 is a smart quote if interpreted as
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
output = UnicodeDammit.detwingle(input)
self.assertEqual(output, input)
def test_find_declared_encoding(self):
# Test our ability to find a declared encoding inside an
# XML or HTML document.
#
# Even if the document comes in as Unicode, it may be
# interesting to know what encoding was claimed
# originally.
html_unicode = u''
html_bytes = html_unicode.encode("ascii")
xml_unicode= u''
xml_bytes = xml_unicode.encode("ascii")
m = EncodingDetector.find_declared_encoding
self.assertEquals(None, m(html_unicode, is_html=False))
self.assertEquals("utf-8", m(html_unicode, is_html=True))
self.assertEquals("utf-8", m(html_bytes, is_html=True))
self.assertEquals("iso-8859-1", m(xml_unicode))
self.assertEquals("iso-8859-1", m(xml_bytes))
# Normally, only the first few kilobytes of a document are checked for
# an encoding.
spacer = b' ' * 5000
self.assertEquals(None, m(spacer + html_bytes))
self.assertEquals(None, m(spacer + xml_bytes))
# But you can tell find_declared_encoding to search an entire
# HTML document.
self.assertEquals(
"utf-8",
m(spacer + html_bytes, is_html=True, search_entire_document=True)
)
# The XML encoding declaration has to be the very first thing
# in the document. We'll allow whitespace before the document
# starts, but nothing else.
self.assertEquals(
"iso-8859-1",
m(xml_bytes, search_entire_document=True)
)
self.assertEquals(
None, m(b'a' + xml_bytes, search_entire_document=True)
)
class TestNamedspacedAttribute(SoupTest):
def test_name_may_be_none_or_missing(self):
a = NamespacedAttribute("xmlns", None)
self.assertEqual(a, "xmlns")
a = NamespacedAttribute("xmlns")
self.assertEqual(a, "xmlns")
def test_attribute_is_equivalent_to_colon_separated_string(self):
a = NamespacedAttribute("a", "b")
self.assertEqual("a:b", a)
def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
a = NamespacedAttribute("a", "b", "c")
b = NamespacedAttribute("a", "b", "c")
self.assertEqual(a, b)
# The actual namespace is not considered.
c = NamespacedAttribute("a", "b", None)
self.assertEqual(a, c)
# But name and prefix are important.
d = NamespacedAttribute("a", "z", "c")
self.assertNotEqual(a, d)
e = NamespacedAttribute("z", "b", "c")
self.assertNotEqual(a, e)
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
def test_content_meta_attribute_value(self):
value = CharsetMetaAttributeValue("euc-jp")
self.assertEqual("euc-jp", value)
self.assertEqual("euc-jp", value.original_value)
self.assertEqual("utf8", value.encode("utf8"))
def test_content_meta_attribute_value(self):
value = ContentMetaAttributeValue("text/html; charset=euc-jp")
self.assertEqual("text/html; charset=euc-jp", value)
self.assertEqual("text/html; charset=euc-jp", value.original_value)
self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
beautifulsoup4-4.8.2/bs4/tests/test_tree.py 0000644 0001750 0001750 00000250514 13527345071 022356 0 ustar leonardr leonardr 0000000 0000000 # -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods.
The tree traversal methods are the main advantage of using Beautiful
Soup over just using a parser.
Different parsers will build different Beautiful Soup trees given the
same markup, but all Beautiful Soup trees can be traversed with the
methods tested here.
"""
from pdb import set_trace
import copy
import pickle
import re
import warnings
from bs4 import BeautifulSoup
from bs4.builder import (
builder_registry,
HTMLParserTreeBuilder,
)
from bs4.element import (
PY3K,
CData,
Comment,
Declaration,
Doctype,
Formatter,
NavigableString,
SoupStrainer,
Tag,
)
from bs4.testing import (
SoupTest,
skipIf,
)
XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
class TreeTest(SoupTest):
def assertSelects(self, tags, should_match):
"""Make sure that the given tags have the correct text.
This is used in tests that define a bunch of tags, each
containing a single string, and then select certain strings by
some mechanism.
"""
self.assertEqual([tag.string for tag in tags], should_match)
def assertSelectsIDs(self, tags, should_match):
"""Make sure that the given tags have the correct IDs.
This is used in tests that define a bunch of tags, each
containing a single string, and then select certain strings by
some mechanism.
"""
self.assertEqual([tag['id'] for tag in tags], should_match)
class TestFind(TreeTest):
"""Basic tests of the find() method.
find() just calls find_all() with limit=1, so it's not tested all
that thouroughly here.
"""
def test_find_tag(self):
soup = self.soup("1234")
self.assertEqual(soup.find("b").string, "2")
def test_unicode_text_find(self):
soup = self.soup(u'Räksmörgås')
self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås')
def test_unicode_attribute_find(self):
soup = self.soup(u'here it is')
str(soup)
self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text)
def test_find_everything(self):
"""Test an optimization that finds all tags."""
soup = self.soup("foobar")
self.assertEqual(2, len(soup.find_all()))
def test_find_everything_with_name(self):
"""Test an optimization that finds all tags with a given name."""
soup = self.soup("foobarbaz")
self.assertEqual(2, len(soup.find_all('a')))
class TestFindAll(TreeTest):
"""Basic tests of the find_all() method."""
def test_find_all_text_nodes(self):
"""You can search the tree for text nodes."""
soup = self.soup("Foobar\xbb")
# Exact match.
self.assertEqual(soup.find_all(string="bar"), [u"bar"])
self.assertEqual(soup.find_all(text="bar"), [u"bar"])
# Match any of a number of strings.
self.assertEqual(
soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
# Match a regular expression.
self.assertEqual(soup.find_all(text=re.compile('.*')),
[u"Foo", u"bar", u'\xbb'])
# Match anything.
self.assertEqual(soup.find_all(text=True),
[u"Foo", u"bar", u'\xbb'])
def test_find_all_limit(self):
"""You can limit the number of items returned by find_all."""
soup = self.soup("12345")
self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
self.assertSelects(soup.find_all('a', limit=1), ["1"])
self.assertSelects(
soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"])
# A limit of 0 means no limit.
self.assertSelects(
soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"])
def test_calling_a_tag_is_calling_findall(self):
soup = self.soup("123")
self.assertSelects(soup('a', limit=1), ["1"])
self.assertSelects(soup.b(id="foo"), ["3"])
def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
soup = self.soup("")
# Create a self-referential list.
l = []
l.append(l)
# Without special code in _normalize_search_value, this would cause infinite
# recursion.
self.assertEqual([], soup.find_all(l))
def test_find_all_resultset(self):
"""All find_all calls return a ResultSet"""
soup = self.soup("")
result = soup.find_all("a")
self.assertTrue(hasattr(result, "source"))
result = soup.find_all(True)
self.assertTrue(hasattr(result, "source"))
result = soup.find_all(text="foo")
self.assertTrue(hasattr(result, "source"))
class TestFindAllBasicNamespaces(TreeTest):
def test_find_by_namespaced_name(self):
soup = self.soup('4')
self.assertEqual("4", soup.find("mathml:msqrt").string)
self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name)
class TestFindAllByName(TreeTest):
"""Test ways of finding tags by tag name."""
def setUp(self):
super(TreeTest, self).setUp()
self.tree = self.soup("""First tag.
Second tag.
Third Nested tag. tag.""")
def test_find_all_by_tag_name(self):
# Find all the tags.
self.assertSelects(
self.tree.find_all('a'), ['First tag.', 'Nested tag.'])
def test_find_all_by_name_and_text(self):
self.assertSelects(
self.tree.find_all('a', text='First tag.'), ['First tag.'])
self.assertSelects(
self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])
self.assertSelects(
self.tree.find_all('a', text=re.compile("tag")),
['First tag.', 'Nested tag.'])
def test_find_all_on_non_root_element(self):
# You can call find_all on any node, not just the root.
self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])
def test_calling_element_invokes_find_all(self):
self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])
def test_find_all_by_tag_strainer(self):
self.assertSelects(
self.tree.find_all(SoupStrainer('a')),
['First tag.', 'Nested tag.'])
def test_find_all_by_tag_names(self):
self.assertSelects(
self.tree.find_all(['a', 'b']),
['First tag.', 'Second tag.', 'Nested tag.'])
def test_find_all_by_tag_dict(self):
self.assertSelects(
self.tree.find_all({'a' : True, 'b' : True}),
['First tag.', 'Second tag.', 'Nested tag.'])
def test_find_all_by_tag_re(self):
self.assertSelects(
self.tree.find_all(re.compile('^[ab]$')),
['First tag.', 'Second tag.', 'Nested tag.'])
def test_find_all_with_tags_matching_method(self):
# You can define an oracle method that determines whether
# a tag matches the search.
def id_matches_name(tag):
return tag.name == tag.get('id')
tree = self.soup("""Match 1.
Does not match.
Match 2.""")
self.assertSelects(
tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
def test_find_with_multi_valued_attribute(self):
soup = self.soup(
"1 2 3 "
)
r1 = soup.find('div', 'a d');
r2 = soup.find('div', re.compile(r'a d'));
r3, r4 = soup.find_all('div', ['a b', 'a d']);
self.assertEqual('3', r1.string)
self.assertEqual('3', r2.string)
self.assertEqual('1', r3.string)
self.assertEqual('3', r4.string)
class TestFindAllByAttribute(TreeTest):
def test_find_all_by_attribute_name(self):
# You can pass in keyword arguments to find_all to search by
# attribute.
tree = self.soup("""
Matching a.
Non-matching Matching b.a.
""")
self.assertSelects(tree.find_all(id='first'),
["Matching a.", "Matching b."])
def test_find_all_by_utf8_attribute_value(self):
peace = u"םולש".encode("utf8")
data = u''.encode("utf8")
soup = self.soup(data)
self.assertEqual([soup.a], soup.find_all(title=peace))
self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))
def test_find_all_by_attribute_dict(self):
# You can pass in a dictionary as the argument 'attrs'. This
# lets you search for attributes like 'name' (a fixed argument
# to find_all) and 'class' (a reserved word in Python.)
tree = self.soup("""
Name match.
Class match.
Non-match.
A tag called 'name1'.
""")
# This doesn't do what you want.
self.assertSelects(tree.find_all(name='name1'),
["A tag called 'name1'."])
# This does what you want.
self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
["Name match."])
self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
["Class match."])
def test_find_all_by_class(self):
tree = self.soup("""
Class 1.
Class 2.
Class 1.
Class 3 and 4.
""")
# Passing in the class_ keyword argument will search against
# the 'class' attribute.
self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])
# Passing in a string to 'attrs' will also search the CSS class.
self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])
def test_find_by_class_when_multiple_classes_present(self):
tree = self.soup("Found it")
f = tree.find_all("gar", class_=re.compile("o"))
self.assertSelects(f, ["Found it"])
f = tree.find_all("gar", class_=re.compile("a"))
self.assertSelects(f, ["Found it"])
# If the search fails to match the individual strings "foo" and "bar",
# it will be tried against the combined string "foo bar".
f = tree.find_all("gar", class_=re.compile("o b"))
self.assertSelects(f, ["Found it"])
def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
soup = self.soup("Found it")
self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])
def big_attribute_value(value):
return len(value) > 3
self.assertSelects(soup.find_all("a", big_attribute_value), [])
def small_attribute_value(value):
return len(value) <= 3
self.assertSelects(
soup.find_all("a", small_attribute_value), ["Found it"])
def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
soup = self.soup('')
a, a2 = soup.find_all("a")
self.assertEqual([a, a2], soup.find_all("a", "foo"))
self.assertEqual([a], soup.find_all("a", "bar"))
# If you specify the class as a string that contains a
# space, only that specific value will be found.
self.assertEqual([a], soup.find_all("a", class_="foo bar"))
self.assertEqual([a], soup.find_all("a", "foo bar"))
self.assertEqual([], soup.find_all("a", "bar foo"))
def test_find_all_by_attribute_soupstrainer(self):
tree = self.soup("""
Match.
Non-match.""")
strainer = SoupStrainer(attrs={'id' : 'first'})
self.assertSelects(tree.find_all(strainer), ['Match.'])
def test_find_all_with_missing_attribute(self):
# You can pass in None as the value of an attribute to find_all.
# This will match tags that do not have that attribute set.
tree = self.soup("""ID present.
No ID present.
ID is empty.""")
self.assertSelects(tree.find_all('a', id=None), ["No ID present."])
def test_find_all_with_defined_attribute(self):
# You can pass in None as the value of an attribute to find_all.
# This will match tags that have that attribute set to any value.
tree = self.soup("""ID present.
No ID present.
ID is empty.""")
self.assertSelects(
tree.find_all(id=True), ["ID present.", "ID is empty."])
def test_find_all_with_numeric_attribute(self):
# If you search for a number, it's treated as a string.
tree = self.soup("""Unquoted attribute.
Quoted attribute.""")
expected = ["Unquoted attribute.", "Quoted attribute."]
self.assertSelects(tree.find_all(id=1), expected)
self.assertSelects(tree.find_all(id="1"), expected)
def test_find_all_with_list_attribute_values(self):
# You can pass a list of attribute values instead of just one,
# and you'll get tags that match any of the values.
tree = self.soup("""1
2
3
No ID.""")
self.assertSelects(tree.find_all(id=["1", "3", "4"]),
["1", "3"])
def test_find_all_with_regular_expression_attribute_value(self):
# You can pass a regular expression as an attribute value, and
# you'll get tags whose values for that attribute match the
# regular expression.
tree = self.soup("""One a.
Two as.
Mixed as and bs.
One b.
No ID.""")
self.assertSelects(tree.find_all(id=re.compile("^a+$")),
["One a.", "Two as."])
def test_find_by_name_and_containing_string(self):
soup = self.soup("foobarfoo")
a = soup.a
self.assertEqual([a], soup.find_all("a", text="foo"))
self.assertEqual([], soup.find_all("a", text="bar"))
self.assertEqual([], soup.find_all("a", text="bar"))
def test_find_by_name_and_containing_string_when_string_is_buried(self):
soup = self.soup("foofoo")
self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))
def test_find_by_attribute_and_containing_string(self):
soup = self.soup('foofoo')
a = soup.a
self.assertEqual([a], soup.find_all(id=2, text="foo"))
self.assertEqual([], soup.find_all(id=1, text="bar"))
class TestSmooth(TreeTest):
"""Test Tag.smooth."""
def test_smooth(self):
soup = self.soup("a ")
div = soup.div
div.append("b")
div.append("c")
div.append(Comment("Comment 1"))
div.append(Comment("Comment 2"))
div.append("d")
builder = self.default_builder()
span = Tag(soup, builder, 'span')
span.append('1')
span.append('2')
div.append(span)
# At this point the tree has a bunch of adjacent
# NavigableStrings. This is normal, but it has no meaning in
# terms of HTML, so we may want to smooth things out for
# output.
# Since the tag has two children, its .string is None.
self.assertEquals(None, div.span.string)
self.assertEqual(7, len(div.contents))
div.smooth()
self.assertEqual(5, len(div.contents))
# The three strings at the beginning of div.contents have been
# merged into on string.
#
self.assertEqual('abc', div.contents[0])
# The call is recursive -- the tag was also smoothed.
self.assertEqual('12', div.span.string)
# The two comments have _not_ been merged, even though
# comments are strings. Merging comments would change the
# meaning of the HTML.
self.assertEqual('Comment 1', div.contents[1])
self.assertEqual('Comment 2', div.contents[2])
class TestIndex(TreeTest):
"""Test Tag.index"""
def test_index(self):
tree = self.soup("""
Identical
Not identical
Identical
Identical with child
Also not identical
Identical with child
""")
div = tree.div
for i, element in enumerate(div.contents):
self.assertEqual(i, div.index(element))
self.assertRaises(ValueError, tree.index, 1)
class TestParentOperations(TreeTest):
"""Test navigation and searching through an element's parents."""
def setUp(self):
super(TestParentOperations, self).setUp()
self.tree = self.soup('''
''')
self.start = self.tree.b
def test_parent(self):
self.assertEqual(self.start.parent['id'], 'bottom')
self.assertEqual(self.start.parent.parent['id'], 'middle')
self.assertEqual(self.start.parent.parent.parent['id'], 'top')
def test_parent_of_top_tag_is_soup_object(self):
top_tag = self.tree.contents[0]
self.assertEqual(top_tag.parent, self.tree)
def test_soup_object_has_no_parent(self):
self.assertEqual(None, self.tree.parent)
def test_find_parents(self):
self.assertSelectsIDs(
self.start.find_parents('ul'), ['bottom', 'middle', 'top'])
self.assertSelectsIDs(
self.start.find_parents('ul', id="middle"), ['middle'])
def test_find_parent(self):
self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')
def test_parent_of_text_element(self):
text = self.tree.find(text="Start here")
self.assertEqual(text.parent.name, 'b')
def test_text_element_find_parent(self):
text = self.tree.find(text="Start here")
self.assertEqual(text.find_parent('ul')['id'], 'bottom')
def test_parent_generator(self):
parents = [parent['id'] for parent in self.start.parents
if parent is not None and 'id' in parent.attrs]
self.assertEqual(parents, ['bottom', 'middle', 'top'])
class ProximityTest(TreeTest):
def setUp(self):
super(TreeTest, self).setUp()
self.tree = self.soup(
'OneTwoThree')
class TestNextOperations(ProximityTest):
def setUp(self):
super(TestNextOperations, self).setUp()
self.start = self.tree.b
def test_next(self):
self.assertEqual(self.start.next_element, "One")
self.assertEqual(self.start.next_element.next_element['id'], "2")
def test_next_of_last_item_is_none(self):
last = self.tree.find(text="Three")
self.assertEqual(last.next_element, None)
def test_next_of_root_is_none(self):
# The document root is outside the next/previous chain.
self.assertEqual(self.tree.next_element, None)
def test_find_all_next(self):
self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
self.start.find_all_next(id=3)
self.assertSelects(self.start.find_all_next(id=3), ["Three"])
def test_find_next(self):
self.assertEqual(self.start.find_next('b')['id'], '2')
self.assertEqual(self.start.find_next(text="Three"), "Three")
def test_find_next_for_text_element(self):
text = self.tree.find(text="One")
self.assertEqual(text.find_next("b").string, "Two")
self.assertSelects(text.find_all_next("b"), ["Two", "Three"])
def test_next_generator(self):
start = self.tree.find(text="Two")
successors = [node for node in start.next_elements]
# There are two successors: the final tag and its text contents.
tag, contents = successors
self.assertEqual(tag['id'], '3')
self.assertEqual(contents, "Three")
class TestPreviousOperations(ProximityTest):
def setUp(self):
super(TestPreviousOperations, self).setUp()
self.end = self.tree.find(text="Three")
def test_previous(self):
self.assertEqual(self.end.previous_element['id'], "3")
self.assertEqual(self.end.previous_element.previous_element, "Two")
def test_previous_of_first_item_is_none(self):
first = self.tree.find('html')
self.assertEqual(first.previous_element, None)
def test_previous_of_root_is_none(self):
# The document root is outside the next/previous chain.
# XXX This is broken!
#self.assertEqual(self.tree.previous_element, None)
pass
def test_find_all_previous(self):
# The tag containing the "Three" node is the predecessor
# of the "Three" node itself, which is why "Three" shows up
# here.
self.assertSelects(
self.end.find_all_previous('b'), ["Three", "Two", "One"])
self.assertSelects(self.end.find_all_previous(id=1), ["One"])
def test_find_previous(self):
self.assertEqual(self.end.find_previous('b')['id'], '3')
self.assertEqual(self.end.find_previous(text="One"), "One")
def test_find_previous_for_text_element(self):
text = self.tree.find(text="Three")
self.assertEqual(text.find_previous("b").string, "Three")
self.assertSelects(
text.find_all_previous("b"), ["Three", "Two", "One"])
def test_previous_generator(self):
start = self.tree.find(text="One")
predecessors = [node for node in start.previous_elements]
# There are four predecessors: the tag containing "One"
# the tag, the tag, and the tag.
b, body, head, html = predecessors
self.assertEqual(b['id'], '1')
self.assertEqual(body.name, "body")
self.assertEqual(head.name, "head")
self.assertEqual(html.name, "html")
class SiblingTest(TreeTest):
def setUp(self):
super(SiblingTest, self).setUp()
markup = '''
'''
# All that whitespace looks good but makes the tests more
# difficult. Get rid of it.
markup = re.compile(r"\n\s*").sub("", markup)
self.tree = self.soup(markup)
class TestNextSibling(SiblingTest):
def setUp(self):
super(TestNextSibling, self).setUp()
self.start = self.tree.find(id="1")
def test_next_sibling_of_root_is_none(self):
self.assertEqual(self.tree.next_sibling, None)
def test_next_sibling(self):
self.assertEqual(self.start.next_sibling['id'], '2')
self.assertEqual(self.start.next_sibling.next_sibling['id'], '3')
# Note the difference between next_sibling and next_element.
self.assertEqual(self.start.next_element['id'], '1.1')
def test_next_sibling_may_not_exist(self):
self.assertEqual(self.tree.html.next_sibling, None)
nested_span = self.tree.find(id="1.1")
self.assertEqual(nested_span.next_sibling, None)
last_span = self.tree.find(id="4")
self.assertEqual(last_span.next_sibling, None)
def test_find_next_sibling(self):
self.assertEqual(self.start.find_next_sibling('span')['id'], '2')
def test_next_siblings(self):
self.assertSelectsIDs(self.start.find_next_siblings("span"),
['2', '3', '4'])
self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3'])
def test_next_sibling_for_text_element(self):
soup = self.soup("Foobarbaz")
start = soup.find(text="Foo")
self.assertEqual(start.next_sibling.name, 'b')
self.assertEqual(start.next_sibling.next_sibling, 'baz')
self.assertSelects(start.find_next_siblings('b'), ['bar'])
self.assertEqual(start.find_next_sibling(text="baz"), "baz")
self.assertEqual(start.find_next_sibling(text="nonesuch"), None)
class TestPreviousSibling(SiblingTest):
def setUp(self):
super(TestPreviousSibling, self).setUp()
self.end = self.tree.find(id="4")
def test_previous_sibling_of_root_is_none(self):
self.assertEqual(self.tree.previous_sibling, None)
def test_previous_sibling(self):
self.assertEqual(self.end.previous_sibling['id'], '3')
self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2')
# Note the difference between previous_sibling and previous_element.
self.assertEqual(self.end.previous_element['id'], '3.1')
def test_previous_sibling_may_not_exist(self):
self.assertEqual(self.tree.html.previous_sibling, None)
nested_span = self.tree.find(id="1.1")
self.assertEqual(nested_span.previous_sibling, None)
first_span = self.tree.find(id="1")
self.assertEqual(first_span.previous_sibling, None)
def test_find_previous_sibling(self):
self.assertEqual(self.end.find_previous_sibling('span')['id'], '3')
def test_previous_siblings(self):
self.assertSelectsIDs(self.end.find_previous_siblings("span"),
['3', '2', '1'])
self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1'])
def test_previous_sibling_for_text_element(self):
soup = self.soup("Foobarbaz")
start = soup.find(text="baz")
self.assertEqual(start.previous_sibling.name, 'b')
self.assertEqual(start.previous_sibling.previous_sibling, 'Foo')
self.assertSelects(start.find_previous_siblings('b'), ['bar'])
self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo")
self.assertEqual(start.find_previous_sibling(text="nonesuch"), None)
class TestTag(SoupTest):
# Test various methods of Tag.
def test__should_pretty_print(self):
# Test the rules about when a tag should be pretty-printed.
tag = self.soup("").new_tag("a_tag")
# No list of whitespace-preserving tags -> pretty-print
tag._preserve_whitespace_tags = None
self.assertEquals(True, tag._should_pretty_print(0))
# List exists but tag is not on the list -> pretty-print
tag.preserve_whitespace_tags = ["some_other_tag"]
self.assertEquals(True, tag._should_pretty_print(1))
# Indent level is None -> don't pretty-print
self.assertEquals(False, tag._should_pretty_print(None))
# Tag is on the whitespace-preserving list -> don't pretty-print
tag.preserve_whitespace_tags = ["some_other_tag", "a_tag"]
self.assertEquals(False, tag._should_pretty_print(1))
class TestTagCreation(SoupTest):
"""Test the ability to create new tags."""
def test_new_tag(self):
soup = self.soup("")
new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
self.assertTrue(isinstance(new_tag, Tag))
self.assertEqual("foo", new_tag.name)
self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
self.assertEqual(None, new_tag.parent)
def test_tag_inherits_self_closing_rules_from_builder(self):
if XML_BUILDER_PRESENT:
xml_soup = BeautifulSoup("", "lxml-xml")
xml_br = xml_soup.new_tag("br")
xml_p = xml_soup.new_tag("p")
# Both the and tag are empty-element, just because
# they have no contents.
self.assertEqual(b" ", xml_br.encode())
self.assertEqual(b" ", xml_p.encode())
html_soup = BeautifulSoup("", "html.parser")
html_br = html_soup.new_tag("br")
html_p = html_soup.new_tag("p")
# The HTML builder users HTML's rules about which tags are
# empty-element tags, and the new tags reflect these rules.
self.assertEqual(b" ", html_br.encode())
self.assertEqual(b"", html_p.encode())
def test_new_string_creates_navigablestring(self):
soup = self.soup("")
s = soup.new_string("foo")
self.assertEqual("foo", s)
self.assertTrue(isinstance(s, NavigableString))
def test_new_string_can_create_navigablestring_subclass(self):
soup = self.soup("")
s = soup.new_string("foo", Comment)
self.assertEqual("foo", s)
self.assertTrue(isinstance(s, Comment))
class TestTreeModification(SoupTest):
def test_attribute_modification(self):
soup = self.soup('')
soup.a['id'] = 2
self.assertEqual(soup.decode(), self.document_for(''))
del(soup.a['id'])
self.assertEqual(soup.decode(), self.document_for(''))
soup.a['id2'] = 'foo'
self.assertEqual(soup.decode(), self.document_for(''))
def test_new_tag_creation(self):
builder = builder_registry.lookup('html')()
soup = self.soup("", builder=builder)
a = Tag(soup, builder, 'a')
ol = Tag(soup, builder, 'ol')
a['href'] = 'http://foo.com/'
soup.body.insert(0, a)
soup.body.insert(1, ol)
self.assertEqual(
soup.body.encode(),
b' ')
def test_append_to_contents_moves_tag(self):
doc = """Don't leave me here.
Don\'t leave! """
soup = self.soup(doc)
second_para = soup.find(id='2')
bold = soup.b
# Move the tag to the end of the second paragraph.
soup.find(id='2').append(soup.b)
# The tag is now a child of the second paragraph.
self.assertEqual(bold.parent, second_para)
self.assertEqual(
soup.decode(), self.document_for(
'Don\'t leave me . \n'
'Don\'t leave!here '))
def test_replace_with_returns_thing_that_was_replaced(self):
text = ""
soup = self.soup(text)
a = soup.a
new_a = a.replace_with(soup.c)
self.assertEqual(a, new_a)
def test_unwrap_returns_thing_that_was_replaced(self):
text = ""
soup = self.soup(text)
a = soup.a
new_a = a.unwrap()
self.assertEqual(a, new_a)
def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
soup = self.soup("FooBar")
a = soup.a
a.extract()
self.assertEqual(None, a.parent)
self.assertRaises(ValueError, a.unwrap)
self.assertRaises(ValueError, a.replace_with, soup.c)
def test_replace_tag_with_itself(self):
text = "Foo"
soup = self.soup(text)
c = soup.c
soup.c.replace_with(c)
self.assertEqual(soup.decode(), self.document_for(text))
def test_replace_tag_with_its_parent_raises_exception(self):
text = ""
soup = self.soup(text)
self.assertRaises(ValueError, soup.b.replace_with, soup.a)
def test_insert_tag_into_itself_raises_exception(self):
text = ""
soup = self.soup(text)
self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
def test_insert_beautifulsoup_object_inserts_children(self):
"""Inserting one BeautifulSoup object into another actually inserts all
of its children -- you'll never combine BeautifulSoup objects.
"""
soup = self.soup("And now, a word: And we're back. ")
text = "p2 p3 "
to_insert = self.soup(text)
soup.insert(1, to_insert)
for i in soup.descendants:
assert not isinstance(i, BeautifulSoup)
p1, p2, p3, p4 = list(soup.children)
self.assertEquals("And now, a word:", p1.string)
self.assertEquals("p2", p2.string)
self.assertEquals("p3", p3.string)
self.assertEquals("And we're back.", p4.string)
def test_replace_with_maintains_next_element_throughout(self):
soup = self.soup('onethree ')
a = soup.a
b = a.contents[0]
# Make it so the tag has two text children.
a.insert(1, "two")
# Now replace each one with the empty string.
left, right = a.contents
left.replaceWith('')
right.replaceWith('')
# The tag is still connected to the tree.
self.assertEqual("three", soup.b.string)
def test_replace_final_node(self):
soup = self.soup("Argh!")
soup.find(text="Argh!").replace_with("Hooray!")
new_text = soup.find(text="Hooray!")
b = soup.b
self.assertEqual(new_text.previous_element, b)
self.assertEqual(new_text.parent, b)
self.assertEqual(new_text.previous_element.next_element, new_text)
self.assertEqual(new_text.next_element, None)
def test_consecutive_text_nodes(self):
# A builder should never create two consecutive text nodes,
# but if you insert one next to another, Beautiful Soup will
# handle it correctly.
soup = self.soup("Argh!")
soup.b.insert(1, "Hooray!")
self.assertEqual(
soup.decode(), self.document_for(
"Argh!Hooray!"))
new_text = soup.find(text="Hooray!")
self.assertEqual(new_text.previous_element, "Argh!")
self.assertEqual(new_text.previous_element.next_element, new_text)
self.assertEqual(new_text.previous_sibling, "Argh!")
self.assertEqual(new_text.previous_sibling.next_sibling, new_text)
self.assertEqual(new_text.next_sibling, None)
self.assertEqual(new_text.next_element, soup.c)
def test_insert_string(self):
soup = self.soup("")
soup.a.insert(0, "bar")
soup.a.insert(0, "foo")
# The string were added to the tag.
self.assertEqual(["foo", "bar"], soup.a.contents)
# And they were converted to NavigableStrings.
self.assertEqual(soup.a.contents[0].next_element, "bar")
def test_insert_tag(self):
builder = self.default_builder()
soup = self.soup(
"Findlady!", builder=builder)
magic_tag = Tag(soup, builder, 'magictag')
magic_tag.insert(0, "the")
soup.a.insert(1, magic_tag)
self.assertEqual(
soup.decode(), self.document_for(
"Findthelady!"))
# Make sure all the relationships are hooked up correctly.
b_tag = soup.b
self.assertEqual(b_tag.next_sibling, magic_tag)
self.assertEqual(magic_tag.previous_sibling, b_tag)
find = b_tag.find(text="Find")
self.assertEqual(find.next_element, magic_tag)
self.assertEqual(magic_tag.previous_element, find)
c_tag = soup.c
self.assertEqual(magic_tag.next_sibling, c_tag)
self.assertEqual(c_tag.previous_sibling, magic_tag)
the = magic_tag.find(text="the")
self.assertEqual(the.parent, magic_tag)
self.assertEqual(the.next_element, c_tag)
self.assertEqual(c_tag.previous_element, the)
def test_append_child_thats_already_at_the_end(self):
data = ""
soup = self.soup(data)
soup.a.append(soup.b)
self.assertEqual(data, soup.decode())
def test_extend(self):
data = ""
soup = self.soup(data)
l = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b]
soup.a.extend(l)
self.assertEqual("", soup.decode())
def test_move_tag_to_beginning_of_parent(self):
data = ""
soup = self.soup(data)
soup.a.insert(0, soup.d)
self.assertEqual("", soup.decode())
def test_insert_works_on_empty_element_tag(self):
# This is a little strange, since most HTML parsers don't allow
# markup like this to come through. But in general, we don't
# know what the parser would or wouldn't have allowed, so
# I'm letting this succeed for now.
soup = self.soup(" ")
soup.br.insert(1, "Contents")
self.assertEqual(str(soup.br), " Contents")
def test_insert_before(self):
soup = self.soup("foobar")
soup.b.insert_before("BAZ")
soup.a.insert_before("QUUX")
self.assertEqual(
soup.decode(), self.document_for("QUUXfooBAZbar"))
soup.a.insert_before(soup.b)
self.assertEqual(
soup.decode(), self.document_for("QUUXbarfooBAZ"))
# Can't insert an element before itself.
b = soup.b
self.assertRaises(ValueError, b.insert_before, b)
# Can't insert before if an element has no parent.
b.extract()
self.assertRaises(ValueError, b.insert_before, "nope")
# Can insert an identical element
soup = self.soup("")
soup.a.insert_before(soup.new_tag("a"))
def test_insert_multiple_before(self):
soup = self.soup("foobar")
soup.b.insert_before("BAZ", " ", "QUUX")
soup.a.insert_before("QUUX", " ", "BAZ")
self.assertEqual(
soup.decode(), self.document_for("QUUX BAZfooBAZ QUUXbar"))
soup.a.insert_before(soup.b, "FOO")
self.assertEqual(
soup.decode(), self.document_for("QUUX BAZbarFOOfooBAZ QUUX"))
def test_insert_after(self):
soup = self.soup("foobar")
soup.b.insert_after("BAZ")
soup.a.insert_after("QUUX")
self.assertEqual(
soup.decode(), self.document_for("fooQUUXbarBAZ"))
soup.b.insert_after(soup.a)
self.assertEqual(
soup.decode(), self.document_for("QUUXbarfooBAZ"))
# Can't insert an element after itself.
b = soup.b
self.assertRaises(ValueError, b.insert_after, b)
# Can't insert after if an element has no parent.
b.extract()
self.assertRaises(ValueError, b.insert_after, "nope")
# Can insert an identical element
soup = self.soup("")
soup.a.insert_before(soup.new_tag("a"))
def test_insert_multiple_after(self):
soup = self.soup("foobar")
soup.b.insert_after("BAZ", " ", "QUUX")
soup.a.insert_after("QUUX", " ", "BAZ")
self.assertEqual(
soup.decode(), self.document_for("fooQUUX BAZbarBAZ QUUX"))
soup.b.insert_after(soup.a, "FOO ")
self.assertEqual(
soup.decode(), self.document_for("QUUX BAZbarfooFOO BAZ QUUX"))
def test_insert_after_raises_exception_if_after_has_no_meaning(self):
soup = self.soup("")
tag = soup.new_tag("a")
string = soup.new_string("")
self.assertRaises(ValueError, string.insert_after, tag)
self.assertRaises(NotImplementedError, soup.insert_after, tag)
self.assertRaises(ValueError, tag.insert_after, tag)
def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
soup = self.soup("")
tag = soup.new_tag("a")
string = soup.new_string("")
self.assertRaises(ValueError, string.insert_before, tag)
self.assertRaises(NotImplementedError, soup.insert_before, tag)
self.assertRaises(ValueError, tag.insert_before, tag)
def test_replace_with(self):
soup = self.soup(
"There's no business like show business ")
no, show = soup.find_all('b')
show.replace_with(no)
self.assertEqual(
soup.decode(),
self.document_for(
"There's business like no business "))
self.assertEqual(show.parent, None)
self.assertEqual(no.parent, soup.p)
self.assertEqual(no.next_element, "no")
self.assertEqual(no.next_sibling, " business")
def test_replace_first_child(self):
data = ""
soup = self.soup(data)
soup.b.replace_with(soup.c)
self.assertEqual("", soup.decode())
def test_replace_last_child(self):
data = ""
soup = self.soup(data)
soup.c.replace_with(soup.b)
self.assertEqual("", soup.decode())
def test_nested_tag_replace_with(self):
soup = self.soup(
"""Wereservetherighttorefuseservice""")
# Replace the entire tag and its contents ("reserve the
# right") with the tag ("refuse").
remove_tag = soup.b
move_tag = soup.f
remove_tag.replace_with(move_tag)
self.assertEqual(
soup.decode(), self.document_for(
"Werefusetoservice"))
# The tag is now an orphan.
self.assertEqual(remove_tag.parent, None)
self.assertEqual(remove_tag.find(text="right").next_element, None)
self.assertEqual(remove_tag.previous_element, None)
self.assertEqual(remove_tag.next_sibling, None)
self.assertEqual(remove_tag.previous_sibling, None)
# The tag is now connected to the tag.
self.assertEqual(move_tag.parent, soup.a)
self.assertEqual(move_tag.previous_element, "We")
self.assertEqual(move_tag.next_element.next_element, soup.e)
self.assertEqual(move_tag.next_sibling, None)
# The gap where the tag used to be has been mended, and
# the word "to" is now connected to the tag.
to_text = soup.find(text="to")
g_tag = soup.g
self.assertEqual(to_text.next_element, g_tag)
self.assertEqual(to_text.next_sibling, g_tag)
self.assertEqual(g_tag.previous_element, to_text)
self.assertEqual(g_tag.previous_sibling, to_text)
def test_unwrap(self):
tree = self.soup("""
Unneeded formatting is unneeded
""")
tree.em.unwrap()
self.assertEqual(tree.em, None)
self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")
def test_wrap(self):
soup = self.soup("I wish I was bold.")
value = soup.string.wrap(soup.new_tag("b"))
self.assertEqual(value.decode(), "I wish I was bold.")
self.assertEqual(
soup.decode(), self.document_for("I wish I was bold."))
def test_wrap_extracts_tag_from_elsewhere(self):
soup = self.soup("I wish I was bold.")
soup.b.next_sibling.wrap(soup.b)
self.assertEqual(
soup.decode(), self.document_for("I wish I was bold."))
def test_wrap_puts_new_contents_at_the_end(self):
soup = self.soup("I like being bold.I wish I was bold.")
soup.b.next_sibling.wrap(soup.b)
self.assertEqual(2, len(soup.b.contents))
self.assertEqual(
soup.decode(), self.document_for(
"I like being bold.I wish I was bold."))
def test_extract(self):
soup = self.soup(
'Some content. Nav crap More content.')
self.assertEqual(len(soup.body.contents), 3)
extracted = soup.find(id="nav").extract()
self.assertEqual(
soup.decode(), "Some content. More content.")
self.assertEqual(extracted.decode(), 'Nav crap ')
# The extracted tag is now an orphan.
self.assertEqual(len(soup.body.contents), 2)
self.assertEqual(extracted.parent, None)
self.assertEqual(extracted.previous_element, None)
self.assertEqual(extracted.next_element.next_element, None)
# The gap where the extracted tag used to be has been mended.
content_1 = soup.find(text="Some content. ")
content_2 = soup.find(text=" More content.")
self.assertEqual(content_1.next_element, content_2)
self.assertEqual(content_1.next_sibling, content_2)
self.assertEqual(content_2.previous_element, content_1)
self.assertEqual(content_2.previous_sibling, content_1)
def test_extract_distinguishes_between_identical_strings(self):
soup = self.soup("foobar")
foo_1 = soup.a.string
bar_1 = soup.b.string
foo_2 = soup.new_string("foo")
bar_2 = soup.new_string("bar")
soup.a.append(foo_2)
soup.b.append(bar_2)
# Now there are two identical strings in the tag, and two
# in the tag. Let's remove the first "foo" and the second
# "bar".
foo_1.extract()
bar_2.extract()
self.assertEqual(foo_2, soup.a.string)
self.assertEqual(bar_2, soup.b.string)
def test_extract_multiples_of_same_tag(self):
soup = self.soup("""
""")
[soup.script.extract() for i in soup.find_all("script")]
self.assertEqual("\n\n\n", unicode(soup.body))
def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
soup = self.soup(
'\n'
'hi\n'
'')
soup.find('body').extract()
self.assertEqual(None, soup.find('body'))
def test_clear(self):
"""Tag.clear()"""
soup = self.soup("String Italicized and another ")
# clear using extract()
a = soup.a
soup.p.clear()
self.assertEqual(len(soup.p.contents), 0)
self.assertTrue(hasattr(a, "contents"))
# clear using decompose()
em = a.em
a.clear(decompose=True)
self.assertEqual(0, len(em.contents))
def test_string_set(self):
"""Tag.string = 'string'"""
soup = self.soup(" ")
soup.a.string = "foo"
self.assertEqual(soup.a.contents, ["foo"])
soup.b.string = "bar"
self.assertEqual(soup.b.contents, ["bar"])
def test_string_set_does_not_affect_original_string(self):
soup = self.soup("foobar")
soup.b.string = soup.c.string
self.assertEqual(soup.a.encode(), b"barbar")
def test_set_string_preserves_class_of_string(self):
soup = self.soup("")
cdata = CData("foo")
soup.a.string = cdata
self.assertTrue(isinstance(soup.a.string, CData))
class TestElementObjects(SoupTest):
"""Test various features of element objects."""
def test_len(self):
"""The length of an element is its number of children."""
soup = self.soup("123")
# The BeautifulSoup object itself contains one element: the
# tag.
self.assertEqual(len(soup.contents), 1)
self.assertEqual(len(soup), 1)
# The tag contains three elements: the text node "1", the
# tag, and the text node "3".
self.assertEqual(len(soup.top), 3)
self.assertEqual(len(soup.top.contents), 3)
def test_member_access_invokes_find(self):
"""Accessing a Python member .foo invokes find('foo')"""
soup = self.soup('')
self.assertEqual(soup.b, soup.find('b'))
self.assertEqual(soup.b.i, soup.find('b').find('i'))
self.assertEqual(soup.a, None)
def test_deprecated_member_access(self):
soup = self.soup('')
with warnings.catch_warnings(record=True) as w:
tag = soup.bTag
self.assertEqual(soup.b, tag)
self.assertEqual(
'.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
str(w[0].message))
def test_has_attr(self):
"""has_attr() checks for the presence of an attribute.
Please note note: has_attr() is different from
__in__. has_attr() checks the tag's attributes and __in__
checks the tag's chidlren.
"""
soup = self.soup("")
self.assertTrue(soup.foo.has_attr('attr'))
self.assertFalse(soup.foo.has_attr('attr2'))
def test_attributes_come_out_in_alphabetical_order(self):
markup = ''
self.assertSoupEquals(markup, '')
def test_string(self):
# A tag that contains only a text node makes that node
# available as .string.
soup = self.soup("foo")
self.assertEqual(soup.b.string, 'foo')
def test_empty_tag_has_no_string(self):
# A tag with no children has no .stirng.
soup = self.soup("")
self.assertEqual(soup.b.string, None)
def test_tag_with_multiple_children_has_no_string(self):
# A tag with no children has no .string.
soup = self.soup("foo")
self.assertEqual(soup.b.string, None)
soup = self.soup("foobar")
self.assertEqual(soup.b.string, None)
# Even if all the children are strings, due to trickery,
# it won't work--but this would be a good optimization.
soup = self.soup("foo")
soup.a.insert(1, "bar")
self.assertEqual(soup.a.string, None)
def test_tag_with_recursive_string_has_string(self):
# A tag with a single child which has a .string inherits that
# .string.
soup = self.soup("foo")
self.assertEqual(soup.a.string, "foo")
self.assertEqual(soup.string, "foo")
def test_lack_of_string(self):
"""Only a tag containing a single text node has a .string."""
soup = self.soup("feo")
self.assertFalse(soup.b.string)
soup = self.soup("")
self.assertFalse(soup.b.string)
def test_all_text(self):
"""Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
soup = self.soup("ar t ")
self.assertEqual(soup.a.text, "ar t ")
self.assertEqual(soup.a.get_text(strip=True), "art")
self.assertEqual(soup.a.get_text(","), "a,r, , t ")
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
def test_get_text_ignores_comments(self):
soup = self.soup("foobar")
self.assertEqual(soup.get_text(), "foobar")
self.assertEqual(
soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
self.assertEqual(
soup.get_text(types=None), "fooIGNOREbar")
def test_all_strings_ignores_comments(self):
soup = self.soup("foobar")
self.assertEqual(['foo', 'bar'], list(soup.strings))
class TestCDAtaListAttributes(SoupTest):
"""Testing cdata-list attributes like 'class'.
"""
def test_single_value_becomes_list(self):
soup = self.soup("")
self.assertEqual(["foo"],soup.a['class'])
def test_multiple_values_becomes_list(self):
soup = self.soup("")
self.assertEqual(["foo", "bar"], soup.a['class'])
def test_multiple_values_separated_by_weird_whitespace(self):
soup = self.soup("")
self.assertEqual(["foo", "bar", "baz"],soup.a['class'])
def test_attributes_joined_into_string_on_output(self):
soup = self.soup("")
self.assertEqual(b'', soup.a.encode())
def test_get_attribute_list(self):
soup = self.soup("")
self.assertEqual(['abc def'], soup.a.get_attribute_list('id'))
def test_accept_charset(self):
soup = self.soup(' |