Diffstat (limited to 'import-layers/yocto-poky/bitbake/lib/bs4/tests/test_soup.py')
-rw-r--r--  import-layers/yocto-poky/bitbake/lib/bs4/tests/test_soup.py  107
1 file changed, 78 insertions(+), 29 deletions(-)
diff --git a/import-layers/yocto-poky/bitbake/lib/bs4/tests/test_soup.py b/import-layers/yocto-poky/bitbake/lib/bs4/tests/test_soup.py
index 47ac245f9..f87949e3d 100644
--- a/import-layers/yocto-poky/bitbake/lib/bs4/tests/test_soup.py
+++ b/import-layers/yocto-poky/bitbake/lib/bs4/tests/test_soup.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
+from pdb import set_trace
import logging
import unittest
import sys
@@ -20,6 +21,7 @@ import bs4.dammit
from bs4.dammit import (
EntitySubstitution,
UnicodeDammit,
+ EncodingDetector,
)
from bs4.testing import (
SoupTest,
@@ -30,7 +32,7 @@ import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
-except ImportError, e:
+except ImportError as e:
LXML_PRESENT = False
PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
@@ -39,17 +41,43 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest):
def test_short_unicode_input(self):
- data = u"<h1>éé</h1>"
+ data = "<h1>éé</h1>"
soup = self.soup(data)
- self.assertEqual(u"éé", soup.h1.string)
+ self.assertEqual("éé", soup.h1.string)
def test_embedded_null(self):
- data = u"<h1>foo\0bar</h1>"
+ data = "<h1>foo\0bar</h1>"
soup = self.soup(data)
- self.assertEqual(u"foo\0bar", soup.h1.string)
+ self.assertEqual("foo\0bar", soup.h1.string)
+ def test_exclude_encodings(self):
+ utf8_data = "Räksmörgås".encode("utf-8")
+ soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
+ self.assertEqual("windows-1252", soup.original_encoding)
-class TestDeprecatedConstructorArguments(SoupTest):
+
+class TestWarnings(SoupTest):
+
+ def _assert_no_parser_specified(self, s, is_there=True):
+ v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
+ self.assertTrue(v)
+
+ def test_warning_if_no_parser_specified(self):
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup("<a><b></b></a>")
+ msg = str(w[0].message)
+ self._assert_no_parser_specified(msg)
+
+ def test_warning_if_parser_specified_too_vague(self):
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup("<a><b></b></a>", "html")
+ msg = str(w[0].message)
+ self._assert_no_parser_specified(msg)
+
+ def test_no_warning_if_explicit_parser_specified(self):
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup("<a><b></b></a>", "html.parser")
+ self.assertEqual([], w)
def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w:
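A quick sketch of the behaviour covered by the new TestWarnings and exclude_encodings tests in the hunk above; this assumes Beautiful Soup 4.4 or later is importable as bs4, and the printed values are the expected ones rather than guaranteed output:

import warnings
from bs4 import BeautifulSoup

# Building a soup without naming a parser now emits a warning whose
# text starts with BeautifulSoup.NO_PARSER_SPECIFIED_WARNING.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    BeautifulSoup("<a><b></b></a>")
print(str(caught[0].message)[:40])

# Naming a concrete parser keeps construction silent.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    BeautifulSoup("<a><b></b></a>", "html.parser")
print(len(caught))  # expected: 0

# exclude_encodings steers byte-string decoding away from a codec.
soup = BeautifulSoup("Räksmörgås".encode("utf-8"), "html.parser",
                     exclude_encodings=["utf-8"])
print(soup.original_encoding)  # expected: windows-1252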
@@ -117,9 +145,9 @@ class TestEntitySubstitution(unittest.TestCase):
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entities
# are substituted, and no others.
- s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
+ s = "foo\u2200\N{SNOWMAN}\u00f5bar"
self.assertEqual(self.sub.substitute_html(s),
- u"foo&forall;\N{SNOWMAN}&otilde;bar")
+ "foo&forall;\N{SNOWMAN}&otilde;bar")
def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we
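For context, the substitution asserted in the hunk above can be reproduced directly with bs4.dammit.EntitySubstitution, the same class this test file already imports; a minimal sketch:

from bs4.dammit import EntitySubstitution

s = "foo\u2200\N{SNOWMAN}\u00f5bar"
# Only characters with named HTML entities are rewritten; the snowman
# has no named entity and passes through unchanged.
print(EntitySubstitution.substitute_html(s))  # expected: foo&forall;☃&otilde;bar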
@@ -184,7 +212,7 @@ class TestEncodingConversion(SoupTest):
def setUp(self):
super(TestEncodingConversion, self).setUp()
- self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
+ self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like.
self.assertEqual(
@@ -204,7 +232,7 @@ class TestEncodingConversion(SoupTest):
ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
- self.assertTrue(isinstance(unicode_output, unicode))
+ self.assertTrue(isinstance(unicode_output, str))
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
finally:
@@ -216,7 +244,7 @@ class TestEncodingConversion(SoupTest):
# is not set.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
- self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
+ self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self):
@@ -224,7 +252,7 @@ class TestEncodingConversion(SoupTest):
# attribute is set.
soup_from_utf8 = self.soup(self.utf8_data)
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
- self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
+ self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
def test_utf8_out(self):
# The internal data structures can be encoded as UTF-8.
@@ -235,14 +263,14 @@ class TestEncodingConversion(SoupTest):
PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self):
- markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
+ markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit."""
def test_unicode_input(self):
- markup = u"I'm already Unicode! \N{SNOWMAN}"
+ markup = "I'm already Unicode! \N{SNOWMAN}"
dammit = UnicodeDammit(markup)
self.assertEqual(dammit.unicode_markup, markup)
@@ -250,7 +278,7 @@ class TestUnicodeDammit(unittest.TestCase):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEqual(
- dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
+ dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
@@ -271,16 +299,17 @@ class TestUnicodeDammit(unittest.TestCase):
dammit.unicode_markup, """<foo>''""</foo>""")
def test_detect_utf8(self):
- utf8 = b"\xc3\xa9"
+ utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
- self.assertEqual(dammit.unicode_markup, u'\xe9')
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+ self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
+
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
- self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
+ self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
@@ -289,16 +318,36 @@ class TestUnicodeDammit(unittest.TestCase):
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
- utf8_data = u"Räksmörgås".encode("utf-8")
+ utf8_data = "Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self):
- utf8_data = u"Räksmörgås".encode("utf-8")
+ utf8_data = "Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+ def test_exclude_encodings(self):
+ # This is UTF-8.
+ utf8_data = "Räksmörgås".encode("utf-8")
+
+ # But if we exclude UTF-8 from consideration, the guess is
+ # Windows-1252.
+ dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
+ self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
+
+ # And if we exclude that, there is no valid guess at all.
+ dammit = UnicodeDammit(
+ utf8_data, exclude_encodings=["utf-8", "windows-1252"])
+ self.assertEqual(dammit.original_encoding, None)
+
+ def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
+ detected = EncodingDetector(
+ b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
+ encodings = list(detected.encodings)
+ assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
+
def test_detect_html5_style_meta_tag(self):
for data in (
@@ -337,7 +386,7 @@ class TestUnicodeDammit(unittest.TestCase):
bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
- self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+ self.assertTrue("\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
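The exclude_encodings argument and the EncodingDetector class added above can be exercised on their own; a rough sketch, again assuming bs4 4.4+:

from bs4.dammit import UnicodeDammit, EncodingDetector

utf8_data = "Räksmörgås".encode("utf-8")

# Left to itself, UnicodeDammit recognises the bytes as UTF-8.
print(UnicodeDammit(utf8_data).original_encoding)  # expected: utf-8

# Excluding UTF-8 pushes detection to the next plausible guess.
dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
print(dammit.original_encoding)  # expected: windows-1252

# EncodingDetector exposes the candidate encodings it will try; a junk
# byte in a declared encoding becomes U+FFFD instead of raising.
detector = EncodingDetector(b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
print(list(detector.encodings))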
@@ -349,17 +398,17 @@ class TestUnicodeDammit(unittest.TestCase):
# A document written in UTF-16LE will have its byte order marker stripped.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
- self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
+ self.assertEqual("<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
- utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
+ utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
- u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
- u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
+ "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
+ "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
@@ -374,7 +423,7 @@ class TestUnicodeDammit(unittest.TestCase):
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
- u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
+ "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
@@ -382,9 +431,9 @@ class TestUnicodeDammit(unittest.TestCase):
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
- u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
- u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
- u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
+ "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
+ "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
+ "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
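Finally, a short usage sketch of UnicodeDammit.detwingle, the class method the last two tests exercise:

from bs4.dammit import UnicodeDammit

utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
windows_1252 = ("\N{LEFT DOUBLE QUOTATION MARK}Hi!"
                "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows-1252")

# A byte string that mixes UTF-8 and Windows-1252 sections...
doc = utf8 + windows_1252 + utf8

# ...comes back from detwingle() as consistent UTF-8.
print(UnicodeDammit.detwingle(doc).decode("utf8"))  # expected: ☃☃☃“Hi!”☃☃☃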