diff options
Diffstat (limited to 'import-layers/yocto-poky/bitbake/lib/bs4/tests/test_soup.py')
-rw-r--r-- | import-layers/yocto-poky/bitbake/lib/bs4/tests/test_soup.py | 107 |
1 files changed, 78 insertions, 29 deletions
diff --git a/import-layers/yocto-poky/bitbake/lib/bs4/tests/test_soup.py b/import-layers/yocto-poky/bitbake/lib/bs4/tests/test_soup.py index 47ac245f9..f87949e3d 100644 --- a/import-layers/yocto-poky/bitbake/lib/bs4/tests/test_soup.py +++ b/import-layers/yocto-poky/bitbake/lib/bs4/tests/test_soup.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Tests of Beautiful Soup as a whole.""" +from pdb import set_trace import logging import unittest import sys @@ -20,6 +21,7 @@ import bs4.dammit from bs4.dammit import ( EntitySubstitution, UnicodeDammit, + EncodingDetector, ) from bs4.testing import ( SoupTest, @@ -30,7 +32,7 @@ import warnings try: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML LXML_PRESENT = True -except ImportError, e: +except ImportError as e: LXML_PRESENT = False PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) @@ -39,17 +41,43 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) class TestConstructor(SoupTest): def test_short_unicode_input(self): - data = u"<h1>éé</h1>" + data = "<h1>éé</h1>" soup = self.soup(data) - self.assertEqual(u"éé", soup.h1.string) + self.assertEqual("éé", soup.h1.string) def test_embedded_null(self): - data = u"<h1>foo\0bar</h1>" + data = "<h1>foo\0bar</h1>" soup = self.soup(data) - self.assertEqual(u"foo\0bar", soup.h1.string) + self.assertEqual("foo\0bar", soup.h1.string) + def test_exclude_encodings(self): + utf8_data = "Räksmörgås".encode("utf-8") + soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual("windows-1252", soup.original_encoding) -class TestDeprecatedConstructorArguments(SoupTest): + +class TestWarnings(SoupTest): + + def _no_parser_specified(self, s, is_there=True): + v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80]) + self.assertTrue(v) + + def test_warning_if_no_parser_specified(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("<a><b></b></a>") + msg = str(w[0].message) + self._assert_no_parser_specified(msg) + + def test_warning_if_parser_specified_too_vague(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("<a><b></b></a>", "html") + msg = str(w[0].message) + self._assert_no_parser_specified(msg) + + def test_no_warning_if_explicit_parser_specified(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("<a><b></b></a>", "html.parser") + self.assertEqual([], w) def test_parseOnlyThese_renamed_to_parse_only(self): with warnings.catch_warnings(record=True) as w: @@ -117,9 +145,9 @@ class TestEntitySubstitution(unittest.TestCase): def test_simple_html_substitution(self): # Unicode characters corresponding to named HTML entites # are substituted, and no others. - s = u"foo\u2200\N{SNOWMAN}\u00f5bar" + s = "foo\u2200\N{SNOWMAN}\u00f5bar" self.assertEqual(self.sub.substitute_html(s), - u"foo∀\N{SNOWMAN}õbar") + "foo∀\N{SNOWMAN}õbar") def test_smart_quote_substitution(self): # MS smart quotes are a common source of frustration, so we @@ -184,7 +212,7 @@ class TestEncodingConversion(SoupTest): def setUp(self): super(TestEncodingConversion, self).setUp() - self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' + self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' self.utf8_data = self.unicode_data.encode("utf-8") # Just so you know what it looks like. self.assertEqual( @@ -204,7 +232,7 @@ class TestEncodingConversion(SoupTest): ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() - self.assertTrue(isinstance(unicode_output, unicode)) + self.assertTrue(isinstance(unicode_output, str)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: @@ -216,7 +244,7 @@ class TestEncodingConversion(SoupTest): # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data) - self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') + self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.original_encoding, None) def test_utf8_in_unicode_out(self): @@ -224,7 +252,7 @@ class TestEncodingConversion(SoupTest): # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data) - self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') + self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!') def test_utf8_out(self): # The internal data structures can be encoded as UTF-8. @@ -235,14 +263,14 @@ class TestEncodingConversion(SoupTest): PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") def test_attribute_name_containing_unicode_characters(self): - markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>' + markup = '<div><a \N{SNOWMAN}="snowman"></a></div>' self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) class TestUnicodeDammit(unittest.TestCase): """Standalone tests of UnicodeDammit.""" def test_unicode_input(self): - markup = u"I'm already Unicode! \N{SNOWMAN}" + markup = "I'm already Unicode! \N{SNOWMAN}" dammit = UnicodeDammit(markup) self.assertEqual(dammit.unicode_markup, markup) @@ -250,7 +278,7 @@ class TestUnicodeDammit(unittest.TestCase): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) self.assertEqual( - dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") + dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>") def test_smart_quotes_to_xml_entities(self): markup = b"<foo>\x91\x92\x93\x94</foo>" @@ -271,16 +299,17 @@ class TestUnicodeDammit(unittest.TestCase): dammit.unicode_markup, """<foo>''""</foo>""") def test_detect_utf8(self): - utf8 = b"\xc3\xa9" + utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83" dammit = UnicodeDammit(utf8) - self.assertEqual(dammit.unicode_markup, u'\xe9') self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}') + def test_convert_hebrew(self): hebrew = b"\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') - self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') + self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9') def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" @@ -289,16 +318,36 @@ class TestUnicodeDammit(unittest.TestCase): self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) def test_ignore_inappropriate_codecs(self): - utf8_data = u"Räksmörgås".encode("utf-8") + utf8_data = "Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') def test_ignore_invalid_codecs(self): - utf8_data = u"Räksmörgås".encode("utf-8") + utf8_data = "Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + def test_exclude_encodings(self): + # This is UTF-8. + utf8_data = "Räksmörgås".encode("utf-8") + + # But if we exclude UTF-8 from consideration, the guess is + # Windows-1252. + dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') + + # And if we exclude that, there is no valid guess at all. + dammit = UnicodeDammit( + utf8_data, exclude_encodings=["utf-8", "windows-1252"]) + self.assertEqual(dammit.original_encoding, None) + + def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self): + detected = EncodingDetector( + b'<?xml version="1.0" encoding="UTF-\xdb" ?>') + encodings = list(detected.encodings) + assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings + def test_detect_html5_style_meta_tag(self): for data in ( @@ -337,7 +386,7 @@ class TestUnicodeDammit(unittest.TestCase): bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) - self.assertTrue(u"\ufffd" in dammit.unicode_markup) + self.assertTrue("\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) @@ -349,17 +398,17 @@ class TestUnicodeDammit(unittest.TestCase): # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) - self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) + self.assertEqual("<a>áé</a>", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding) def test_detwingle(self): # Here's a UTF8 document. - utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") + utf8 = ("\N{SNOWMAN}" * 3).encode("utf8") # Here's a Windows-1252 document. windows_1252 = ( - u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" - u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") + "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" + "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") # Through some unholy alchemy, they've been stuck together. doc = utf8 + windows_1252 + utf8 @@ -374,7 +423,7 @@ class TestUnicodeDammit(unittest.TestCase): fixed = UnicodeDammit.detwingle(doc) self.assertEqual( - u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) + "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) def test_detwingle_ignores_multibyte_characters(self): # Each of these characters has a UTF-8 representation ending @@ -382,9 +431,9 @@ class TestUnicodeDammit(unittest.TestCase): # Windows-1252. But our code knows to skip over multibyte # UTF-8 characters, so they'll survive the process unscathed. for tricky_unicode_char in ( - u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' - u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' - u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. + "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' + "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' + "\xf0\x90\x90\x93", # This is a CJK character, not sure which one. ): input = tricky_unicode_char.encode("utf8") self.assertTrue(input.endswith(b'\x93')) |