Diffstat (limited to 'import-layers/yocto-poky/bitbake/lib/bs4/dammit.py')
-rw-r--r-- import-layers/yocto-poky/bitbake/lib/bs4/dammit.py | 31
1 file changed, 21 insertions(+), 10 deletions(-)
diff --git a/import-layers/yocto-poky/bitbake/lib/bs4/dammit.py b/import-layers/yocto-poky/bitbake/lib/bs4/dammit.py
index 59640b7ce..68d419feb 100644
--- a/import-layers/yocto-poky/bitbake/lib/bs4/dammit.py
+++ b/import-layers/yocto-poky/bitbake/lib/bs4/dammit.py
@@ -3,12 +3,14 @@
This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
-Feed Parser. It works best on XML and XML, but it does not rewrite the
+Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
+__license__ = "MIT"
+from pdb import set_trace
import codecs
-from htmlentitydefs import codepoint2name
+from html.entities import codepoint2name
import re
import logging
import string
@@ -56,7 +58,7 @@ class EntitySubstitution(object):
reverse_lookup = {}
characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
- character = unichr(codepoint)
+ character = chr(codepoint)
if codepoint != 34:
# There's no point in turning the quotation mark into
# ", unless it happens within an attribute value, which
@@ -212,8 +214,11 @@ class EncodingDetector:
5. Windows-1252.
"""
- def __init__(self, markup, override_encodings=None, is_html=False):
+ def __init__(self, markup, override_encodings=None, is_html=False,
+ exclude_encodings=None):
self.override_encodings = override_encodings or []
+ exclude_encodings = exclude_encodings or []
+ self.exclude_encodings = set([x.lower() for x in exclude_encodings])
self.chardet_encoding = None
self.is_html = is_html
self.declared_encoding = None
@@ -224,6 +229,8 @@ class EncodingDetector:
def _usable(self, encoding, tried):
if encoding is not None:
encoding = encoding.lower()
+ if encoding in self.exclude_encodings:
+ return False
if encoding not in tried:
tried.add(encoding)
return True
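A rough usage sketch of the new exclude_encodings parameter: anything listed there is rejected by _usable() even if it was sniffed or declared. The byte string and the fallback encoding in the comment are illustrative assumptions, not taken from this patch:

    from bs4.dammit import UnicodeDammit
    data = b"Sacr\xc3\xa9 bleu!"   # valid UTF-8 bytes
    dammit = UnicodeDammit(data, exclude_encodings=["utf-8"])
    dammit.original_encoding       # detection falls back to another candidate, e.g. windows-1252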
@@ -266,6 +273,9 @@ class EncodingDetector:
def strip_byte_order_mark(cls, data):
"""If a byte-order mark is present, strip it and return the encoding it implies."""
encoding = None
+ if isinstance(data, str):
+ # Unicode data cannot have a byte-order mark.
+ return data, encoding
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
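With the new isinstance check, strip_byte_order_mark() only inspects byte strings; a sketch of both paths (the UTF-8 BOM result assumes bs4's usual handling, which is not shown in this excerpt):

    from bs4.dammit import EncodingDetector
    EncodingDetector.strip_byte_order_mark(b"\xef\xbb\xbfhello")  # (b'hello', 'utf-8')
    EncodingDetector.strip_byte_order_mark("hello")               # ('hello', None), returned untouched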
@@ -306,7 +316,7 @@ class EncodingDetector:
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
if declared_encoding_match is not None:
declared_encoding = declared_encoding_match.groups()[0].decode(
- 'ascii')
+ 'ascii', 'replace')
if declared_encoding:
return declared_encoding.lower()
return None
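The 'replace' error handler keeps a stray non-ASCII byte inside a meta tag from raising during the decode. A hedged sketch of calling the surrounding classmethod; its name and signature match upstream bs4 but are not shown in this hunk:

    from bs4.dammit import EncodingDetector
    markup = b'<html><head><meta charset="utf-8"></head><body></body></html>'
    EncodingDetector.find_declared_encoding(markup, is_html=True)  # 'utf-8'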
@@ -331,18 +341,19 @@ class UnicodeDammit:
]
def __init__(self, markup, override_encodings=[],
- smart_quotes_to=None, is_html=False):
+ smart_quotes_to=None, is_html=False, exclude_encodings=[]):
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
self.is_html = is_html
- self.detector = EncodingDetector(markup, override_encodings, is_html)
+ self.detector = EncodingDetector(
+ markup, override_encodings, is_html, exclude_encodings)
# Short-circuit if the data is in Unicode to begin with.
- if isinstance(markup, unicode) or markup == '':
+ if isinstance(markup, str) or markup == '':
self.markup = markup
- self.unicode_markup = unicode(markup)
+ self.unicode_markup = str(markup)
self.original_encoding = None
return
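A short sketch of the Unicode short-circuit after the port (the input string is an illustrative assumption): markup that is already a str skips detection entirely, and original_encoding stays None:

    from bs4.dammit import UnicodeDammit
    dammit = UnicodeDammit("D\u00e9j\u00e0 vu")
    dammit.unicode_markup       # 'Déjà vu'
    dammit.original_encoding    # None; no byte decoding was attempted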
@@ -425,7 +436,7 @@ class UnicodeDammit:
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
- return unicode(data, encoding, errors)
+ return str(data, encoding, errors)
@property
def declared_html_encoding(self):