diff options
Diffstat (limited to 'import-layers/yocto-poky/bitbake/lib/bs4/builder')
4 files changed, 125 insertions, 56 deletions
diff --git a/import-layers/yocto-poky/bitbake/lib/bs4/builder/__init__.py b/import-layers/yocto-poky/bitbake/lib/bs4/builder/__init__.py index 740f5f29c..6ccd4d23d 100644 --- a/import-layers/yocto-poky/bitbake/lib/bs4/builder/__init__.py +++ b/import-layers/yocto-poky/bitbake/lib/bs4/builder/__init__.py @@ -80,9 +80,12 @@ builder_registry = TreeBuilderRegistry() class TreeBuilder(object): """Turn a document into a Beautiful Soup object tree.""" + NAME = "[Unknown tree builder]" + ALTERNATE_NAMES = [] features = [] is_xml = False + picklable = False preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. @@ -153,13 +156,13 @@ class TreeBuilder(object): universal = self.cdata_list_attributes.get('*', []) tag_specific = self.cdata_list_attributes.get( tag_name.lower(), None) - for attr in attrs.keys(): + for attr in list(attrs.keys()): if attr in universal or (tag_specific and attr in tag_specific): # We have a "class"-type attribute whose string # value is a whitespace-separated list of # values. Split it into a list. value = attrs[attr] - if isinstance(value, basestring): + if isinstance(value, str): values = whitespace_re.split(value) else: # html5lib sometimes calls setAttributes twice diff --git a/import-layers/yocto-poky/bitbake/lib/bs4/builder/_html5lib.py b/import-layers/yocto-poky/bitbake/lib/bs4/builder/_html5lib.py index 7de36ae75..f0e5924eb 100644 --- a/import-layers/yocto-poky/bitbake/lib/bs4/builder/_html5lib.py +++ b/import-layers/yocto-poky/bitbake/lib/bs4/builder/_html5lib.py @@ -2,6 +2,7 @@ __all__ = [ 'HTML5TreeBuilder', ] +from pdb import set_trace import warnings from bs4.builder import ( PERMISSIVE, @@ -9,7 +10,10 @@ from bs4.builder import ( HTML_5, HTMLTreeBuilder, ) -from bs4.element import NamespacedAttribute +from bs4.element import ( + NamespacedAttribute, + whitespace_re, +) import html5lib from html5lib.constants import namespaces from bs4.element import ( @@ -22,11 +26,20 @@ from bs4.element import ( class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" - features = ['html5lib', PERMISSIVE, HTML_5, HTML] + NAME = "html5lib" + + features = [NAME, PERMISSIVE, HTML_5, HTML] - def prepare_markup(self, markup, user_specified_encoding): + def prepare_markup(self, markup, user_specified_encoding, + document_declared_encoding=None, exclude_encodings=None): # Store the user-specified encoding for use later on. self.user_specified_encoding = user_specified_encoding + + # document_declared_encoding and exclude_encodings aren't used + # ATM because the html5lib TreeBuilder doesn't use + # UnicodeDammit. + if exclude_encodings: + warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") yield (markup, None, None, False) # These methods are defined by Beautiful Soup. @@ -37,7 +50,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): doc = parser.parse(markup, encoding=self.user_specified_encoding) # Set the character encoding detected by the tokenizer. - if isinstance(markup, unicode): + if isinstance(markup, str): # We need to special-case this because html5lib sets # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None @@ -51,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'<html><head></head><body>%s</body></html>' % fragment + return '<html><head></head><body>%s</body></html>' % fragment class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): @@ -101,7 +114,16 @@ class AttrList(object): def __iter__(self): return list(self.attrs.items()).__iter__() def __setitem__(self, name, value): - "set attr", name, value + # If this attribute is a multi-valued attribute for this element, + # turn its value into a list. + list_attr = HTML5TreeBuilder.cdata_list_attributes + if (name in list_attr['*'] + or (self.element.name in list_attr + and name in list_attr[self.element.name])): + # A node that is being cloned may have already undergone + # this procedure. + if not isinstance(value, list): + value = whitespace_re.split(value) self.element[name] = value def items(self): return list(self.attrs.items()) @@ -124,7 +146,7 @@ class Element(html5lib.treebuilders._base.Node): def appendChild(self, node): string_child = child = None - if isinstance(node, basestring): + if isinstance(node, str): # Some other piece of code decided to pass in a string # instead of creating a TextElement object to contain the # string. @@ -139,7 +161,7 @@ class Element(html5lib.treebuilders._base.Node): else: child = node.element - if not isinstance(child, basestring) and child.parent is not None: + if not isinstance(child, str) and child.parent is not None: node.element.extract() if (string_child and self.element.contents @@ -152,7 +174,7 @@ class Element(html5lib.treebuilders._base.Node): old_element.replace_with(new_element) self.soup._most_recent_element = new_element else: - if isinstance(node, basestring): + if isinstance(node, str): # Create a brand new NavigableString from this string. child = self.soup.new_string(node) @@ -161,6 +183,12 @@ class Element(html5lib.treebuilders._base.Node): # immediately after the parent, if it has no children.) if self.element.contents: most_recent_element = self.element._last_descendant(False) + elif self.element.next_element is not None: + # Something from further ahead in the parse tree is + # being inserted into this earlier element. This is + # very annoying because it means an expensive search + # for the last element in the tree. + most_recent_element = self.soup._last_descendant() else: most_recent_element = self.element @@ -172,6 +200,7 @@ class Element(html5lib.treebuilders._base.Node): return AttrList(self.element) def setAttributes(self, attributes): + if attributes is not None and len(attributes) > 0: converted_attributes = [] @@ -183,7 +212,7 @@ class Element(html5lib.treebuilders._base.Node): self.soup.builder._replace_cdata_list_attribute_values( self.name, attributes) - for name, value in attributes.items(): + for name, value in list(attributes.items()): self.element[name] = value # The attributes may contain variables that need substitution. @@ -218,6 +247,9 @@ class Element(html5lib.treebuilders._base.Node): def reparentChildren(self, new_parent): """Move all of this tag's children into another tag.""" + # print "MOVE", self.element.contents + # print "FROM", self.element + # print "TO", new_parent.element element = self.element new_parent_element = new_parent.element # Determine what this tag's next_element will be once all the children @@ -236,17 +268,28 @@ class Element(html5lib.treebuilders._base.Node): new_parents_last_descendant_next_element = new_parent_element.next_element to_append = element.contents - append_after = new_parent.element.contents + append_after = new_parent_element.contents if len(to_append) > 0: # Set the first child's previous_element and previous_sibling # to elements within the new parent first_child = to_append[0] - first_child.previous_element = new_parents_last_descendant + if new_parents_last_descendant: + first_child.previous_element = new_parents_last_descendant + else: + first_child.previous_element = new_parent_element first_child.previous_sibling = new_parents_last_child + if new_parents_last_descendant: + new_parents_last_descendant.next_element = first_child + else: + new_parent_element.next_element = first_child + if new_parents_last_child: + new_parents_last_child.next_sibling = first_child # Fix the last child's next_element and next_sibling last_child = to_append[-1] last_child.next_element = new_parents_last_descendant_next_element + if new_parents_last_descendant_next_element: + new_parents_last_descendant_next_element.previous_element = last_child last_child.next_sibling = None for child in to_append: @@ -257,6 +300,10 @@ class Element(html5lib.treebuilders._base.Node): element.contents = [] element.next_element = final_next_element + # print "DONE WITH MOVE" + # print "FROM", self.element + # print "TO", new_parent_element + def cloneNode(self): tag = self.soup.new_tag(self.element.name, self.namespace) node = Element(tag, self.soup, self.namespace) diff --git a/import-layers/yocto-poky/bitbake/lib/bs4/builder/_htmlparser.py b/import-layers/yocto-poky/bitbake/lib/bs4/builder/_htmlparser.py index ca8d8b892..bb0a63f2f 100644 --- a/import-layers/yocto-poky/bitbake/lib/bs4/builder/_htmlparser.py +++ b/import-layers/yocto-poky/bitbake/lib/bs4/builder/_htmlparser.py @@ -4,10 +4,16 @@ __all__ = [ 'HTMLParserTreeBuilder', ] -from HTMLParser import ( - HTMLParser, - HTMLParseError, - ) +from html.parser import HTMLParser + +try: + from html.parser import HTMLParseError +except ImportError as e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): + pass + import sys import warnings @@ -19,10 +25,10 @@ import warnings # At the end of this file, we monkeypatch HTMLParser so that # strict=True works well on Python 3.2.2. major, minor, release = sys.version_info[:3] -CONSTRUCTOR_TAKES_STRICT = ( - major > 3 - or (major == 3 and minor > 2) - or (major == 3 and minor == 2 and release >= 3)) +CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 +CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 +CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 + from bs4.element import ( CData, @@ -63,7 +69,8 @@ class BeautifulSoupHTMLParser(HTMLParser): def handle_charref(self, name): # XXX workaround for a bug in HTMLParser. Remove this once - # it's fixed. + # it's fixed in all supported versions. + # http://bugs.python.org/issue13633 if name.startswith('x'): real_name = int(name.lstrip('x'), 16) elif name.startswith('X'): @@ -72,9 +79,9 @@ class BeautifulSoupHTMLParser(HTMLParser): real_name = int(name) try: - data = unichr(real_name) - except (ValueError, OverflowError), e: - data = u"\N{REPLACEMENT CHARACTER}" + data = chr(real_name) + except (ValueError, OverflowError) as e: + data = "\N{REPLACEMENT CHARACTER}" self.handle_data(data) @@ -113,14 +120,6 @@ class BeautifulSoupHTMLParser(HTMLParser): def handle_pi(self, data): self.soup.endData() - if data.endswith("?") and data.lower().startswith("xml"): - # "An XHTML processing instruction using the trailing '?' - # will cause the '?' to be included in data." - HTMLParser - # docs. - # - # Strip the question mark so we don't end up with two - # question marks. - data = data[:-1] self.soup.handle_data(data) self.soup.endData(ProcessingInstruction) @@ -128,26 +127,31 @@ class BeautifulSoupHTMLParser(HTMLParser): class HTMLParserTreeBuilder(HTMLTreeBuilder): is_xml = False - features = [HTML, STRICT, HTMLPARSER] + picklable = True + NAME = HTMLPARSER + features = [NAME, HTML, STRICT] def __init__(self, *args, **kwargs): - if CONSTRUCTOR_TAKES_STRICT: + if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: kwargs['strict'] = False + if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: + kwargs['convert_charrefs'] = False self.parser_args = (args, kwargs) def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): + document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ - if isinstance(markup, unicode): + if isinstance(markup, str): yield (markup, None, None, False) return try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) + dammit = UnicodeDammit(markup, try_encodings, is_html=True, + exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters) @@ -158,7 +162,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): parser.soup = self.soup try: parser.feed(markup) - except HTMLParseError, e: + except HTMLParseError as e: warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e diff --git a/import-layers/yocto-poky/bitbake/lib/bs4/builder/_lxml.py b/import-layers/yocto-poky/bitbake/lib/bs4/builder/_lxml.py index fa5d49875..9c6c14ee6 100644 --- a/import-layers/yocto-poky/bitbake/lib/bs4/builder/_lxml.py +++ b/import-layers/yocto-poky/bitbake/lib/bs4/builder/_lxml.py @@ -4,10 +4,15 @@ __all__ = [ ] from io import BytesIO -from StringIO import StringIO +from io import StringIO import collections from lxml import etree -from bs4.element import Comment, Doctype, NamespacedAttribute +from bs4.element import ( + Comment, + Doctype, + NamespacedAttribute, + ProcessingInstruction, +) from bs4.builder import ( FAST, HTML, @@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): is_xml = True + NAME = "lxml-xml" + ALTERNATE_NAMES = ["xml"] + # Well, it's permissive by XML parser standards. - features = [LXML, XML, FAST, PERMISSIVE] + features = [NAME, LXML, XML, FAST, PERMISSIVE] CHUNK_SIZE = 512 @@ -70,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): return (None, tag) def prepare_markup(self, markup, user_specified_encoding=None, + exclude_encodings=None, document_declared_encoding=None): """ :yield: A series of 4-tuples. @@ -78,12 +87,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): Each 4-tuple represents a strategy for parsing the document. """ - if isinstance(markup, unicode): + if isinstance(markup, str): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False - if isinstance(markup, unicode): + if isinstance(markup, str): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. yield (markup.encode("utf8"), "utf8", @@ -95,14 +104,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): # the document as each one in turn. is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] - detector = EncodingDetector(markup, try_encodings, is_html) + detector = EncodingDetector( + markup, try_encodings, is_html, exclude_encodings) for encoding in detector.encodings: yield (detector.markup, encoding, document_declared_encoding, False) def feed(self, markup): if isinstance(markup, bytes): markup = BytesIO(markup) - elif isinstance(markup, unicode): + elif isinstance(markup, str): markup = StringIO(markup) # Call feed() at least once, even if the markup is empty, @@ -117,7 +127,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): if len(data) != 0: self.parser.feed(data) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(str(e)) def close(self): @@ -135,12 +145,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.nsmaps.append(None) elif len(nsmap) > 0: # A new namespace mapping has come into play. - inverted_nsmap = dict((value, key) for key, value in nsmap.items()) + inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) self.nsmaps.append(inverted_nsmap) # Also treat the namespace mapping as a set of attributes on the # tag, so we can recreate it later. attrs = attrs.copy() - for prefix, namespace in nsmap.items(): + for prefix, namespace in list(nsmap.items()): attribute = NamespacedAttribute( "xmlns", prefix, "http://www.w3.org/2000/xmlns/") attrs[attribute] = namespace @@ -149,7 +159,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): # from lxml with namespaces attached to their names, and # turn then into NamespacedAttribute objects. new_attrs = {} - for attr, value in attrs.items(): + for attr, value in list(attrs.items()): namespace, attr = self._getNsTag(attr) if namespace is None: new_attrs[attr] = value @@ -189,7 +199,9 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.nsmaps.pop() def pi(self, target, data): - pass + self.soup.endData() + self.soup.handle_data(target + ' ' + data) + self.soup.endData(ProcessingInstruction) def data(self, content): self.soup.handle_data(content) @@ -207,12 +219,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment + return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): - features = [LXML, HTML, FAST, PERMISSIVE] + NAME = LXML + ALTERNATE_NAMES = ["lxml-html"] + + features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] is_xml = False def default_parser(self, encoding): @@ -224,10 +239,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): self.parser = self.parser_for(encoding) self.parser.feed(markup) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(str(e)) def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'<html><body>%s</body></html>' % fragment + return '<html><body>%s</body></html>' % fragment |