diff options
author | Patrick Williams <patrick@stwcx.xyz> | 2016-08-17 14:31:25 -0500 |
---|---|---|
committer | Patrick Williams <patrick@stwcx.xyz> | 2016-08-22 16:43:26 +0000 |
commit | 60f9d69e016b11c468c98ea75ba0a60c44afbbc4 (patch) | |
tree | ecb49581a9e41a37943c22cd9ef3f63451b20ee7 /import-layers/yocto-poky/bitbake/lib/bs4/element.py | |
parent | e18c61205e0234b03697129c20cc69c9b3940efc (diff) | |
download | talos-openbmc-60f9d69e016b11c468c98ea75ba0a60c44afbbc4.tar.gz talos-openbmc-60f9d69e016b11c468c98ea75ba0a60c44afbbc4.zip |
yocto-poky: Move to import-layers subdir
We are going to import additional layers, so create a subdir to
hold all of the layers that we import with git-subtree.
Change-Id: I6f732153a22be8ca663035c518837e3cc5ec0799
Signed-off-by: Patrick Williams <patrick@stwcx.xyz>
Diffstat (limited to 'import-layers/yocto-poky/bitbake/lib/bs4/element.py')
-rw-r--r-- | import-layers/yocto-poky/bitbake/lib/bs4/element.py | 1611 |
1 files changed, 1611 insertions, 0 deletions
diff --git a/import-layers/yocto-poky/bitbake/lib/bs4/element.py b/import-layers/yocto-poky/bitbake/lib/bs4/element.py new file mode 100644 index 000000000..da9afdf48 --- /dev/null +++ b/import-layers/yocto-poky/bitbake/lib/bs4/element.py @@ -0,0 +1,1611 @@ +import collections +import re +import sys +import warnings +from bs4.dammit import EntitySubstitution + +DEFAULT_OUTPUT_ENCODING = "utf-8" +PY3K = (sys.version_info[0] > 2) + +whitespace_re = re.compile("\s+") + +def _alias(attr): + """Alias one attribute name to another for backward compatibility""" + @property + def alias(self): + return getattr(self, attr) + + @alias.setter + def alias(self): + return setattr(self, attr) + return alias + + +class NamespacedAttribute(unicode): + + def __new__(cls, prefix, name, namespace=None): + if name is None: + obj = unicode.__new__(cls, prefix) + elif prefix is None: + # Not really namespaced. + obj = unicode.__new__(cls, name) + else: + obj = unicode.__new__(cls, prefix + ":" + name) + obj.prefix = prefix + obj.name = name + obj.namespace = namespace + return obj + +class AttributeValueWithCharsetSubstitution(unicode): + """A stand-in object for a character encoding specified in HTML.""" + +class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'charset' attribute. + + When Beautiful Soup parses the markup '<meta charset="utf8">', the + value of the 'charset' attribute will be one of these objects. + """ + + def __new__(cls, original_value): + obj = unicode.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + return encoding + + +class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'content' attribute. + + When Beautiful Soup parses the markup: + <meta http-equiv="content-type" content="text/html; charset=utf8"> + + The value of the 'content' attribute will be one of these objects. + """ + + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + + def __new__(cls, original_value): + match = cls.CHARSET_RE.search(original_value) + if match is None: + # No substitution necessary. + return unicode.__new__(unicode, original_value) + + obj = unicode.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + def rewrite(match): + return match.group(1) + encoding + return self.CHARSET_RE.sub(rewrite, self.original_value) + +class HTMLAwareEntitySubstitution(EntitySubstitution): + + """Entity substitution rules that are aware of some HTML quirks. + + Specifically, the contents of <script> and <style> tags should not + undergo entity substitution. + + Incoming NavigableString objects are checked to see if they're the + direct children of a <script> or <style> tag. + """ + + cdata_containing_tags = set(["script", "style"]) + + preformatted_tags = set(["pre"]) + + @classmethod + def _substitute_if_appropriate(cls, ns, f): + if (isinstance(ns, NavigableString) + and ns.parent is not None + and ns.parent.name in cls.cdata_containing_tags): + # Do nothing. + return ns + # Substitute. + return f(ns) + + @classmethod + def substitute_html(cls, ns): + return cls._substitute_if_appropriate( + ns, EntitySubstitution.substitute_html) + + @classmethod + def substitute_xml(cls, ns): + return cls._substitute_if_appropriate( + ns, EntitySubstitution.substitute_xml) + +class PageElement(object): + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + # There are five possible values for the "formatter" argument passed in + # to methods like encode() and prettify(): + # + # "html" - All Unicode characters with corresponding HTML entities + # are converted to those entities on output. + # "minimal" - Bare ampersands and angle brackets are converted to + # XML entities: & < > + # None - The null formatter. Unicode characters are never + # converted to entities. This is not recommended, but it's + # faster than "minimal". + # A function - This function will be called on every string that + # needs to undergo entity substitution. + # + + # In an HTML document, the default "html" and "minimal" functions + # will leave the contents of <script> and <style> tags alone. For + # an XML document, all tags will be given the same treatment. + + HTML_FORMATTERS = { + "html" : HTMLAwareEntitySubstitution.substitute_html, + "minimal" : HTMLAwareEntitySubstitution.substitute_xml, + None : None + } + + XML_FORMATTERS = { + "html" : EntitySubstitution.substitute_html, + "minimal" : EntitySubstitution.substitute_xml, + None : None + } + + def format_string(self, s, formatter='minimal'): + """Format the given string using the given formatter.""" + if not callable(formatter): + formatter = self._formatter_for_name(formatter) + if formatter is None: + output = s + else: + output = formatter(s) + return output + + @property + def _is_xml(self): + """Is this element part of an XML tree or an HTML tree? + + This is used when mapping a formatter name ("minimal") to an + appropriate function (one that performs entity-substitution on + the contents of <script> and <style> tags, or not). It's + inefficient, but it should be called very rarely. + """ + if self.parent is None: + # This is the top-level object. It should have .is_xml set + # from tree creation. If not, take a guess--BS is usually + # used on HTML markup. + return getattr(self, 'is_xml', False) + return self.parent._is_xml + + def _formatter_for_name(self, name): + "Look up a formatter function based on its name and the tree." + if self._is_xml: + return self.XML_FORMATTERS.get( + name, EntitySubstitution.substitute_xml) + else: + return self.HTML_FORMATTERS.get( + name, HTMLAwareEntitySubstitution.substitute_xml) + + def setup(self, parent=None, previous_element=None): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent + self.previous_element = previous_element + if previous_element is not None: + self.previous_element.next_element = self + self.next_element = None + self.previous_sibling = None + self.next_sibling = None + if self.parent is not None and self.parent.contents: + self.previous_sibling = self.parent.contents[-1] + self.previous_sibling.next_sibling = self + + nextSibling = _alias("next_sibling") # BS3 + previousSibling = _alias("previous_sibling") # BS3 + + def replace_with(self, replace_with): + if replace_with is self: + return + if replace_with is self.parent: + raise ValueError("Cannot replace a Tag with its parent.") + old_parent = self.parent + my_index = self.parent.index(self) + self.extract() + old_parent.insert(my_index, replace_with) + return self + replaceWith = replace_with # BS3 + + def unwrap(self): + my_parent = self.parent + my_index = self.parent.index(self) + self.extract() + for child in reversed(self.contents[:]): + my_parent.insert(my_index, child) + return self + replace_with_children = unwrap + replaceWithChildren = unwrap # BS3 + + def wrap(self, wrap_inside): + me = self.replace_with(wrap_inside) + wrap_inside.append(me) + return wrap_inside + + def extract(self): + """Destructively rips this element out of the tree.""" + if self.parent is not None: + del self.parent.contents[self.parent.index(self)] + + #Find the two elements that would be next to each other if + #this element (and any children) hadn't been parsed. Connect + #the two. + last_child = self._last_descendant() + next_element = last_child.next_element + + if self.previous_element is not None: + self.previous_element.next_element = next_element + if next_element is not None: + next_element.previous_element = self.previous_element + self.previous_element = None + last_child.next_element = None + + self.parent = None + if self.previous_sibling is not None: + self.previous_sibling.next_sibling = self.next_sibling + if self.next_sibling is not None: + self.next_sibling.previous_sibling = self.previous_sibling + self.previous_sibling = self.next_sibling = None + return self + + def _last_descendant(self, is_initialized=True, accept_self=True): + "Finds the last element beneath this object to be parsed." + if is_initialized and self.next_sibling: + last_child = self.next_sibling.previous_element + else: + last_child = self + while isinstance(last_child, Tag) and last_child.contents: + last_child = last_child.contents[-1] + if not accept_self and last_child == self: + last_child = None + return last_child + # BS3: Not part of the API! + _lastRecursiveChild = _last_descendant + + def insert(self, position, new_child): + if new_child is self: + raise ValueError("Cannot insert a tag into itself.") + if (isinstance(new_child, basestring) + and not isinstance(new_child, NavigableString)): + new_child = NavigableString(new_child) + + position = min(position, len(self.contents)) + if hasattr(new_child, 'parent') and new_child.parent is not None: + # We're 'inserting' an element that's already one + # of this object's children. + if new_child.parent is self: + current_index = self.index(new_child) + if current_index < position: + # We're moving this element further down the list + # of this object's children. That means that when + # we extract this element, our target index will + # jump down one. + position -= 1 + new_child.extract() + + new_child.parent = self + previous_child = None + if position == 0: + new_child.previous_sibling = None + new_child.previous_element = self + else: + previous_child = self.contents[position - 1] + new_child.previous_sibling = previous_child + new_child.previous_sibling.next_sibling = new_child + new_child.previous_element = previous_child._last_descendant(False) + if new_child.previous_element is not None: + new_child.previous_element.next_element = new_child + + new_childs_last_element = new_child._last_descendant(False) + + if position >= len(self.contents): + new_child.next_sibling = None + + parent = self + parents_next_sibling = None + while parents_next_sibling is None and parent is not None: + parents_next_sibling = parent.next_sibling + parent = parent.parent + if parents_next_sibling is not None: + # We found the element that comes next in the document. + break + if parents_next_sibling is not None: + new_childs_last_element.next_element = parents_next_sibling + else: + # The last element of this tag is the last element in + # the document. + new_childs_last_element.next_element = None + else: + next_child = self.contents[position] + new_child.next_sibling = next_child + if new_child.next_sibling is not None: + new_child.next_sibling.previous_sibling = new_child + new_childs_last_element.next_element = next_child + + if new_childs_last_element.next_element is not None: + new_childs_last_element.next_element.previous_element = new_childs_last_element + self.contents.insert(position, new_child) + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.insert(len(self.contents), tag) + + def insert_before(self, predecessor): + """Makes the given element the immediate predecessor of this one. + + The two elements will have the same parent, and the given element + will be immediately before this one. + """ + if self is predecessor: + raise ValueError("Can't insert an element before itself.") + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'before' has no meaning.") + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(predecessor, PageElement): + predecessor.extract() + index = parent.index(self) + parent.insert(index, predecessor) + + def insert_after(self, successor): + """Makes the given element the immediate successor of this one. + + The two elements will have the same parent, and the given element + will be immediately after this one. + """ + if self is successor: + raise ValueError("Can't insert an element after itself.") + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'after' has no meaning.") + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(successor, PageElement): + successor.extract() + index = parent.index(self) + parent.insert(index+1, successor) + + def find_next(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._find_one(self.find_all_next, name, attrs, text, **kwargs) + findNext = find_next # BS3 + + def find_all_next(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + after this Tag in the document.""" + return self._find_all(name, attrs, text, limit, self.next_elements, + **kwargs) + findAllNext = find_all_next # BS3 + + def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears after this Tag in the document.""" + return self._find_one(self.find_next_siblings, name, attrs, text, + **kwargs) + findNextSibling = find_next_sibling # BS3 + + def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._find_all(name, attrs, text, limit, + self.next_siblings, **kwargs) + findNextSiblings = find_next_siblings # BS3 + fetchNextSiblings = find_next_siblings # BS2 + + def find_previous(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._find_one( + self.find_all_previous, name, attrs, text, **kwargs) + findPrevious = find_previous # BS3 + + def find_all_previous(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + before this Tag in the document.""" + return self._find_all(name, attrs, text, limit, self.previous_elements, + **kwargs) + findAllPrevious = find_all_previous # BS3 + fetchPrevious = find_all_previous # BS2 + + def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._find_one(self.find_previous_siblings, name, attrs, text, + **kwargs) + findPreviousSibling = find_previous_sibling # BS3 + + def find_previous_siblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._find_all(name, attrs, text, limit, + self.previous_siblings, **kwargs) + findPreviousSiblings = find_previous_siblings # BS3 + fetchPreviousSiblings = find_previous_siblings # BS2 + + def find_parent(self, name=None, attrs={}, **kwargs): + """Returns the closest parent of this Tag that matches the given + criteria.""" + # NOTE: We can't use _find_one because findParents takes a different + # set of arguments. + r = None + l = self.find_parents(name, attrs, 1, **kwargs) + if l: + r = l[0] + return r + findParent = find_parent # BS3 + + def find_parents(self, name=None, attrs={}, limit=None, **kwargs): + """Returns the parents of this Tag that match the given + criteria.""" + + return self._find_all(name, attrs, None, limit, self.parents, + **kwargs) + findParents = find_parents # BS3 + fetchParents = find_parents # BS2 + + @property + def next(self): + return self.next_element + + @property + def previous(self): + return self.previous_element + + #These methods do the real heavy lifting. + + def _find_one(self, method, name, attrs, text, **kwargs): + r = None + l = method(name, attrs, text, 1, **kwargs) + if l: + r = l[0] + return r + + def _find_all(self, name, attrs, text, limit, generator, **kwargs): + "Iterates over a generator looking for things that match." + + if isinstance(name, SoupStrainer): + strainer = name + else: + strainer = SoupStrainer(name, attrs, text, **kwargs) + + if text is None and not limit and not attrs and not kwargs: + if name is True or name is None: + # Optimization to find all tags. + result = (element for element in generator + if isinstance(element, Tag)) + return ResultSet(strainer, result) + elif isinstance(name, basestring): + # Optimization to find all tags with a given name. + result = (element for element in generator + if isinstance(element, Tag) + and element.name == name) + return ResultSet(strainer, result) + results = ResultSet(strainer) + while True: + try: + i = next(generator) + except StopIteration: + break + if i: + found = strainer.search(i) + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #These generators can be used to navigate starting from both + #NavigableStrings and Tags. + @property + def next_elements(self): + i = self.next_element + while i is not None: + yield i + i = i.next_element + + @property + def next_siblings(self): + i = self.next_sibling + while i is not None: + yield i + i = i.next_sibling + + @property + def previous_elements(self): + i = self.previous_element + while i is not None: + yield i + i = i.previous_element + + @property + def previous_siblings(self): + i = self.previous_sibling + while i is not None: + yield i + i = i.previous_sibling + + @property + def parents(self): + i = self.parent + while i is not None: + yield i + i = i.parent + + # Methods for supporting CSS selectors. + + tag_name_re = re.compile('^[a-z0-9]+$') + + # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ + # \---/ \---/\-------------/ \-------/ + # | | | | + # | | | The value + # | | ~,|,^,$,* or = + # | Attribute + # Tag + attribselect_re = re.compile( + r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' + + r'=?"?(?P<value>[^\]"]*)"?\]$' + ) + + def _attr_value_as_string(self, value, default=None): + """Force an attribute value into a string representation. + + A multi-valued attribute will be converted into a + space-separated stirng. + """ + value = self.get(value, default) + if isinstance(value, list) or isinstance(value, tuple): + value =" ".join(value) + return value + + def _tag_name_matches_and(self, function, tag_name): + if not tag_name: + return function + else: + def _match(tag): + return tag.name == tag_name and function(tag) + return _match + + def _attribute_checker(self, operator, attribute, value=''): + """Create a function that performs a CSS selector operation. + + Takes an operator, attribute and optional value. Returns a + function that will return True for elements that match that + combination. + """ + if operator == '=': + # string representation of `attribute` is equal to `value` + return lambda el: el._attr_value_as_string(attribute) == value + elif operator == '~': + # space-separated list representation of `attribute` + # contains `value` + def _includes_value(element): + attribute_value = element.get(attribute, []) + if not isinstance(attribute_value, list): + attribute_value = attribute_value.split() + return value in attribute_value + return _includes_value + elif operator == '^': + # string representation of `attribute` starts with `value` + return lambda el: el._attr_value_as_string( + attribute, '').startswith(value) + elif operator == '$': + # string represenation of `attribute` ends with `value` + return lambda el: el._attr_value_as_string( + attribute, '').endswith(value) + elif operator == '*': + # string representation of `attribute` contains `value` + return lambda el: value in el._attr_value_as_string(attribute, '') + elif operator == '|': + # string representation of `attribute` is either exactly + # `value` or starts with `value` and then a dash. + def _is_or_starts_with_dash(element): + attribute_value = element._attr_value_as_string(attribute, '') + return (attribute_value == value or attribute_value.startswith( + value + '-')) + return _is_or_starts_with_dash + else: + return lambda el: el.has_attr(attribute) + + # Old non-property versions of the generators, for backwards + # compatibility with BS3. + def nextGenerator(self): + return self.next_elements + + def nextSiblingGenerator(self): + return self.next_siblings + + def previousGenerator(self): + return self.previous_elements + + def previousSiblingGenerator(self): + return self.previous_siblings + + def parentGenerator(self): + return self.parents + + +class NavigableString(unicode, PageElement): + + PREFIX = '' + SUFFIX = '' + + def __new__(cls, value): + """Create a new NavigableString. + + When unpickling a NavigableString, this method is called with + the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be + passed in to the superclass's __new__ or the superclass won't know + how to handle non-ASCII characters. + """ + if isinstance(value, unicode): + return unicode.__new__(cls, value) + return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + + def __copy__(self): + return self + + def __getnewargs__(self): + return (unicode(self),) + + def __getattr__(self, attr): + """text.string gives you text. This is for backwards + compatibility for Navigable*String, but for CData* it lets you + get the string without the CData wrapper.""" + if attr == 'string': + return self + else: + raise AttributeError( + "'%s' object has no attribute '%s'" % ( + self.__class__.__name__, attr)) + + def output_ready(self, formatter="minimal"): + output = self.format_string(self, formatter) + return self.PREFIX + output + self.SUFFIX + + @property + def name(self): + return None + + @name.setter + def name(self, name): + raise AttributeError("A NavigableString cannot be given a name.") + +class PreformattedString(NavigableString): + """A NavigableString not subject to the normal formatting rules. + + The string will be passed into the formatter (to trigger side effects), + but the return value will be ignored. + """ + + def output_ready(self, formatter="minimal"): + """CData strings are passed into the formatter. + But the return value is ignored.""" + self.format_string(self, formatter) + return self.PREFIX + self + self.SUFFIX + +class CData(PreformattedString): + + PREFIX = u'<![CDATA[' + SUFFIX = u']]>' + +class ProcessingInstruction(PreformattedString): + + PREFIX = u'<?' + SUFFIX = u'?>' + +class Comment(PreformattedString): + + PREFIX = u'<!--' + SUFFIX = u'-->' + + +class Declaration(PreformattedString): + PREFIX = u'<!' + SUFFIX = u'!>' + + +class Doctype(PreformattedString): + + @classmethod + def for_name_and_ids(cls, name, pub_id, system_id): + value = name or '' + if pub_id is not None: + value += ' PUBLIC "%s"' % pub_id + if system_id is not None: + value += ' "%s"' % system_id + elif system_id is not None: + value += ' SYSTEM "%s"' % system_id + + return Doctype(value) + + PREFIX = u'<!DOCTYPE ' + SUFFIX = u'>\n' + + +class Tag(PageElement): + + """Represents a found HTML tag with its attributes and contents.""" + + def __init__(self, parser=None, builder=None, name=None, namespace=None, + prefix=None, attrs=None, parent=None, previous=None): + "Basic constructor." + + if parser is None: + self.parser_class = None + else: + # We don't actually store the parser object: that lets extracted + # chunks be garbage-collected. + self.parser_class = parser.__class__ + if name is None: + raise ValueError("No value provided for new tag's name.") + self.name = name + self.namespace = namespace + self.prefix = prefix + if attrs is None: + attrs = {} + elif attrs and builder.cdata_list_attributes: + attrs = builder._replace_cdata_list_attribute_values( + self.name, attrs) + else: + attrs = dict(attrs) + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + + # Set up any substitutions, such as the charset in a META tag. + if builder is not None: + builder.set_up_substitutions(self) + self.can_be_empty_element = builder.can_be_empty_element(name) + else: + self.can_be_empty_element = False + + parserClass = _alias("parser_class") # BS3 + + @property + def is_empty_element(self): + """Is this tag an empty-element tag? (aka a self-closing tag) + + A tag that has contents is never an empty-element tag. + + A tag that has no contents may or may not be an empty-element + tag. It depends on the builder used to create the tag. If the + builder has a designated list of empty-element tags, then only + a tag whose name shows up in that list is considered an + empty-element tag. + + If the builder has no designated list of empty-element tags, + then any tag with no contents is an empty-element tag. + """ + return len(self.contents) == 0 and self.can_be_empty_element + isSelfClosing = is_empty_element # BS3 + + @property + def string(self): + """Convenience property to get the single string within this tag. + + :Return: If this tag has a single string child, return value + is that string. If this tag has no children, or more than one + child, return value is None. If this tag has one child tag, + return value is the 'string' attribute of the child tag, + recursively. + """ + if len(self.contents) != 1: + return None + child = self.contents[0] + if isinstance(child, NavigableString): + return child + return child.string + + @string.setter + def string(self, string): + self.clear() + self.append(string.__class__(string)) + + def _all_strings(self, strip=False, types=(NavigableString, CData)): + """Yield all strings of certain classes, possibly stripping them. + + By default, yields only NavigableString and CData objects. So + no comments, processing instructions, etc. + """ + for descendant in self.descendants: + if ( + (types is None and not isinstance(descendant, NavigableString)) + or + (types is not None and type(descendant) not in types)): + continue + if strip: + descendant = descendant.strip() + if len(descendant) == 0: + continue + yield descendant + + strings = property(_all_strings) + + @property + def stripped_strings(self): + for string in self._all_strings(True): + yield string + + def get_text(self, separator=u"", strip=False, + types=(NavigableString, CData)): + """ + Get all child strings, concatenated using the given separator. + """ + return separator.join([s for s in self._all_strings( + strip, types=types)]) + getText = get_text + text = property(get_text) + + def decompose(self): + """Recursively destroys the contents of this tree.""" + self.extract() + i = self + while i is not None: + next = i.next_element + i.__dict__.clear() + i.contents = [] + i = next + + def clear(self, decompose=False): + """ + Extract all children. If decompose is True, decompose instead. + """ + if decompose: + for element in self.contents[:]: + if isinstance(element, Tag): + element.decompose() + else: + element.extract() + else: + for element in self.contents[:]: + element.extract() + + def index(self, element): + """ + Find the index of a child by identity, not value. Avoids issues with + tag.contents.index(element) getting the index of equal elements. + """ + for i, child in enumerate(self.contents): + if child is element: + return i + raise ValueError("Tag.index: element not in tag") + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self.attrs.get(key, default) + + def has_attr(self, key): + return key in self.attrs + + def __hash__(self): + return str(self).__hash__() + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the tag, + and throws an exception if it's not there.""" + return self.attrs[key] + + def __iter__(self): + "Iterating over a tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __nonzero__(self): + "A tag is non-None even if it has no contents." + return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self.attrs[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + self.attrs.pop(key, None) + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + find_all() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return self.find_all(*args, **kwargs) + + def __getattr__(self, tag): + #print "Getattr %s.%s" % (self.__class__, tag) + if len(tag) > 3 and tag.endswith('Tag'): + # BS3: soup.aTag -> "soup.find("a") + tag_name = tag[:-3] + warnings.warn( + '.%sTag is deprecated, use .find("%s") instead.' % ( + tag_name, tag_name)) + return self.find(tag_name) + # We special case contents to avoid recursion. + elif not tag.startswith("__") and not tag=="contents": + return self.find(tag) + raise AttributeError( + "'%s' object has no attribute '%s'" % (self.__class__, tag)) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag.""" + if self is other: + return True + if (not hasattr(other, 'name') or + not hasattr(other, 'attrs') or + not hasattr(other, 'contents') or + self.name != other.name or + self.attrs != other.attrs or + len(self) != len(other)): + return False + for i, my_child in enumerate(self.contents): + if my_child != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """Renders this tag as a string.""" + return self.encode(encoding) + + def __unicode__(self): + return self.decode() + + def __str__(self): + return self.encode() + + if PY3K: + __str__ = __repr__ = __unicode__ + + def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, + indent_level=None, formatter="minimal", + errors="xmlcharrefreplace"): + # Turn the data structure into Unicode, then encode the + # Unicode. + u = self.decode(indent_level, encoding, formatter) + return u.encode(encoding, errors) + + def _should_pretty_print(self, indent_level): + """Should this tag be pretty-printed?""" + return ( + indent_level is not None and + (self.name not in HTMLAwareEntitySubstitution.preformatted_tags + or self._is_xml)) + + def decode(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a Unicode representation of this tag and its contents. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a <META> tag that mentions the document's + encoding. + """ + + # First off, turn a string formatter into a function. This + # will stop the lookup from happening over and over again. + if not callable(formatter): + formatter = self._formatter_for_name(formatter) + + attrs = [] + if self.attrs: + for key, val in sorted(self.attrs.items()): + if val is None: + decoded = key + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, basestring): + val = unicode(val) + elif ( + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None): + val = val.encode(eventual_encoding) + + text = self.format_string(val, formatter) + decoded = ( + unicode(key) + '=' + + EntitySubstitution.quoted_attribute_value(text)) + attrs.append(decoded) + close = '' + closeTag = '' + + prefix = '' + if self.prefix: + prefix = self.prefix + ":" + + if self.is_empty_element: + close = '/' + else: + closeTag = '</%s%s>' % (prefix, self.name) + + pretty_print = self._should_pretty_print(indent_level) + space = '' + indent_space = '' + if indent_level is not None: + indent_space = (' ' * (indent_level - 1)) + if pretty_print: + space = indent_space + indent_contents = indent_level + 1 + else: + indent_contents = None + contents = self.decode_contents( + indent_contents, eventual_encoding, formatter) + + if self.hidden: + # This is the 'document root' object. + s = contents + else: + s = [] + attribute_string = '' + if attrs: + attribute_string = ' ' + ' '.join(attrs) + if indent_level is not None: + # Even if this particular tag is not pretty-printed, + # we should indent up to the start of the tag. + s.append(indent_space) + s.append('<%s%s%s%s>' % ( + prefix, self.name, attribute_string, close)) + if pretty_print: + s.append("\n") + s.append(contents) + if pretty_print and contents and contents[-1] != "\n": + s.append("\n") + if pretty_print and closeTag: + s.append(space) + s.append(closeTag) + if indent_level is not None and closeTag and self.next_sibling: + # Even if this particular tag is not pretty-printed, + # we're now done with the tag, and we should add a + # newline if appropriate. + s.append("\n") + s = ''.join(s) + return s + + def prettify(self, encoding=None, formatter="minimal"): + if encoding is None: + return self.decode(True, formatter=formatter) + else: + return self.encode(encoding, True, formatter=formatter) + + def decode_contents(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Renders the contents of this tag as a Unicode string. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a <META> tag that mentions the document's + encoding. + """ + # First off, turn a string formatter into a function. This + # will stop the lookup from happening over and over again. + if not callable(formatter): + formatter = self._formatter_for_name(formatter) + + pretty_print = (indent_level is not None) + s = [] + for c in self: + text = None + if isinstance(c, NavigableString): + text = c.output_ready(formatter) + elif isinstance(c, Tag): + s.append(c.decode(indent_level, eventual_encoding, + formatter)) + if text and indent_level and not self.name == 'pre': + text = text.strip() + if text: + if pretty_print and not self.name == 'pre': + s.append(" " * (indent_level - 1)) + s.append(text) + if pretty_print and not self.name == 'pre': + s.append("\n") + return ''.join(s) + + def encode_contents( + self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Renders the contents of this tag as a bytestring.""" + contents = self.decode_contents(indent_level, encoding, formatter) + return contents.encode(encoding) + + # Old method for BS3 compatibility + def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + if not prettyPrint: + indentLevel = None + return self.encode_contents( + indent_level=indentLevel, encoding=encoding) + + #Soup methods + + def find(self, name=None, attrs={}, recursive=True, text=None, + **kwargs): + """Return only the first child of this Tag matching the given + criteria.""" + r = None + l = self.find_all(name, attrs, recursive, text, 1, **kwargs) + if l: + r = l[0] + return r + findChild = find + + def find_all(self, name=None, attrs={}, recursive=True, text=None, + limit=None, **kwargs): + """Extracts a list of Tag objects that match the given + criteria. You can specify the name of the Tag and any + attributes you want the Tag to have. + + The value of a key-value pair in the 'attrs' map can be a + string, a list of strings, a regular expression object, or a + callable that takes a string and returns whether or not the + string matches for some custom definition of 'matches'. The + same is true of the tag name.""" + + generator = self.descendants + if not recursive: + generator = self.children + return self._find_all(name, attrs, text, limit, generator, **kwargs) + findAll = find_all # BS3 + findChildren = find_all # BS2 + + #Generator methods + @property + def children(self): + # return iter() to make the purpose of the method clear + return iter(self.contents) # XXX This seems to be untested. + + @property + def descendants(self): + if not len(self.contents): + return + stopNode = self._last_descendant().next_element + current = self.contents[0] + while current is not stopNode: + yield current + current = current.next_element + + # CSS selector code + + _selector_combinators = ['>', '+', '~'] + _select_debug = False + def select(self, selector, _candidate_generator=None): + """Perform a CSS selection operation on the current element.""" + tokens = selector.split() + current_context = [self] + + if tokens[-1] in self._selector_combinators: + raise ValueError( + 'Final combinator "%s" is missing an argument.' % tokens[-1]) + if self._select_debug: + print 'Running CSS selector "%s"' % selector + for index, token in enumerate(tokens): + if self._select_debug: + print ' Considering token "%s"' % token + recursive_candidate_generator = None + tag_name = None + if tokens[index-1] in self._selector_combinators: + # This token was consumed by the previous combinator. Skip it. + if self._select_debug: + print ' Token was consumed by the previous combinator.' + continue + # Each operation corresponds to a checker function, a rule + # for determining whether a candidate matches the + # selector. Candidates are generated by the active + # iterator. + checker = None + + m = self.attribselect_re.match(token) + if m is not None: + # Attribute selector + tag_name, attribute, operator, value = m.groups() + checker = self._attribute_checker(operator, attribute, value) + + elif '#' in token: + # ID selector + tag_name, tag_id = token.split('#', 1) + def id_matches(tag): + return tag.get('id', None) == tag_id + checker = id_matches + + elif '.' in token: + # Class selector + tag_name, klass = token.split('.', 1) + classes = set(klass.split('.')) + def classes_match(candidate): + return classes.issubset(candidate.get('class', [])) + checker = classes_match + + elif ':' in token: + # Pseudo-class + tag_name, pseudo = token.split(':', 1) + if tag_name == '': + raise ValueError( + "A pseudo-class must be prefixed with a tag name.") + pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) + found = [] + if pseudo_attributes is not None: + pseudo_type, pseudo_value = pseudo_attributes.groups() + if pseudo_type == 'nth-of-type': + try: + pseudo_value = int(pseudo_value) + except: + raise NotImplementedError( + 'Only numeric values are currently supported for the nth-of-type pseudo-class.') + if pseudo_value < 1: + raise ValueError( + 'nth-of-type pseudo-class value must be at least 1.') + class Counter(object): + def __init__(self, destination): + self.count = 0 + self.destination = destination + + def nth_child_of_type(self, tag): + self.count += 1 + if self.count == self.destination: + return True + if self.count > self.destination: + # Stop the generator that's sending us + # these things. + raise StopIteration() + return False + checker = Counter(pseudo_value).nth_child_of_type + else: + raise NotImplementedError( + 'Only the following pseudo-classes are implemented: nth-of-type.') + + elif token == '*': + # Star selector -- matches everything + pass + elif token == '>': + # Run the next token as a CSS selector against the + # direct children of each tag in the current context. + recursive_candidate_generator = lambda tag: tag.children + elif token == '~': + # Run the next token as a CSS selector against the + # siblings of each tag in the current context. + recursive_candidate_generator = lambda tag: tag.next_siblings + elif token == '+': + # For each tag in the current context, run the next + # token as a CSS selector against the tag's next + # sibling that's a tag. + def next_tag_sibling(tag): + yield tag.find_next_sibling(True) + recursive_candidate_generator = next_tag_sibling + + elif self.tag_name_re.match(token): + # Just a tag name. + tag_name = token + else: + raise ValueError( + 'Unsupported or invalid CSS selector: "%s"' % token) + + if recursive_candidate_generator: + # This happens when the selector looks like "> foo". + # + # The generator calls select() recursively on every + # member of the current context, passing in a different + # candidate generator and a different selector. + # + # In the case of "> foo", the candidate generator is + # one that yields a tag's direct children (">"), and + # the selector is "foo". + next_token = tokens[index+1] + def recursive_select(tag): + if self._select_debug: + print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) + print '-' * 40 + for i in tag.select(next_token, recursive_candidate_generator): + if self._select_debug: + print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) + yield i + if self._select_debug: + print '-' * 40 + _use_candidate_generator = recursive_select + elif _candidate_generator is None: + # By default, a tag's candidates are all of its + # children. If tag_name is defined, only yield tags + # with that name. + if self._select_debug: + if tag_name: + check = "[any]" + else: + check = tag_name + print ' Default candidate generator, tag name="%s"' % check + if self._select_debug: + # This is redundant with later code, but it stops + # a bunch of bogus tags from cluttering up the + # debug log. + def default_candidate_generator(tag): + for child in tag.descendants: + if not isinstance(child, Tag): + continue + if tag_name and not child.name == tag_name: + continue + yield child + _use_candidate_generator = default_candidate_generator + else: + _use_candidate_generator = lambda tag: tag.descendants + else: + _use_candidate_generator = _candidate_generator + + new_context = [] + new_context_ids = set([]) + for tag in current_context: + if self._select_debug: + print " Running candidate generator on %s %s" % ( + tag.name, repr(tag.attrs)) + for candidate in _use_candidate_generator(tag): + if not isinstance(candidate, Tag): + continue + if tag_name and candidate.name != tag_name: + continue + if checker is not None: + try: + result = checker(candidate) + except StopIteration: + # The checker has decided we should no longer + # run the generator. + break + if checker is None or result: + if self._select_debug: + print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) + if id(candidate) not in new_context_ids: + # If a tag matches a selector more than once, + # don't include it in the context more than once. + new_context.append(candidate) + new_context_ids.add(id(candidate)) + elif self._select_debug: + print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) + + current_context = new_context + + if self._select_debug: + print "Final verdict:" + for i in current_context: + print " %s %s" % (i.name, i.attrs) + return current_context + + # Old names for backwards compatibility + def childGenerator(self): + return self.children + + def recursiveChildGenerator(self): + return self.descendants + + def has_key(self, key): + """This was kind of misleading because has_key() (attributes) + was different from __in__ (contents). has_key() is gone in + Python 3, anyway.""" + warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( + key)) + return self.has_attr(key) + +# Next, a couple classes to represent queries and their results. +class SoupStrainer(object): + """Encapsulates a number of ways of matching a markup element (tag or + text).""" + + def __init__(self, name=None, attrs={}, text=None, **kwargs): + self.name = self._normalize_search_value(name) + if not isinstance(attrs, dict): + # Treat a non-dict value for attrs as a search for the 'class' + # attribute. + kwargs['class'] = attrs + attrs = None + + if 'class_' in kwargs: + # Treat class_="foo" as a search for the 'class' + # attribute, overriding any non-dict value for attrs. + kwargs['class'] = kwargs['class_'] + del kwargs['class_'] + + if kwargs: + if attrs: + attrs = attrs.copy() + attrs.update(kwargs) + else: + attrs = kwargs + normalized_attrs = {} + for key, value in attrs.items(): + normalized_attrs[key] = self._normalize_search_value(value) + + self.attrs = normalized_attrs + self.text = self._normalize_search_value(text) + + def _normalize_search_value(self, value): + # Leave it alone if it's a Unicode string, a callable, a + # regular expression, a boolean, or None. + if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') + or isinstance(value, bool) or value is None): + return value + + # If it's a bytestring, convert it to Unicode, treating it as UTF-8. + if isinstance(value, bytes): + return value.decode("utf8") + + # If it's listlike, convert it into a list of strings. + if hasattr(value, '__iter__'): + new_value = [] + for v in value: + if (hasattr(v, '__iter__') and not isinstance(v, bytes) + and not isinstance(v, unicode)): + # This is almost certainly the user's mistake. In the + # interests of avoiding infinite loops, we'll let + # it through as-is rather than doing a recursive call. + new_value.append(v) + else: + new_value.append(self._normalize_search_value(v)) + return new_value + + # Otherwise, convert it into a Unicode string. + # The unicode(str()) thing is so this will do the same thing on Python 2 + # and Python 3. + return unicode(str(value)) + + def __str__(self): + if self.text: + return self.text + else: + return "%s|%s" % (self.name, self.attrs) + + def search_tag(self, markup_name=None, markup_attrs={}): + found = None + markup = None + if isinstance(markup_name, Tag): + markup = markup_name + markup_attrs = markup + call_function_with_tag_data = ( + isinstance(self.name, collections.Callable) + and not isinstance(markup_name, Tag)) + + if ((not self.name) + or call_function_with_tag_data + or (markup and self._matches(markup, self.name)) + or (not markup and self._matches(markup_name, self.name))): + if call_function_with_tag_data: + match = self.name(markup_name, markup_attrs) + else: + match = True + markup_attr_map = None + for attr, match_against in list(self.attrs.items()): + if not markup_attr_map: + if hasattr(markup_attrs, 'get'): + markup_attr_map = markup_attrs + else: + markup_attr_map = {} + for k, v in markup_attrs: + markup_attr_map[k] = v + attr_value = markup_attr_map.get(attr) + if not self._matches(attr_value, match_against): + match = False + break + if match: + if markup: + found = markup + else: + found = markup_name + if found and self.text and not self._matches(found.string, self.text): + found = None + return found + searchTag = search_tag + + def search(self, markup): + # print 'looking for %s in %s' % (self, markup) + found = None + # If given a list of items, scan it for a text element that + # matches. + if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): + for element in markup: + if isinstance(element, NavigableString) \ + and self.search(element): + found = element + break + # If it's a Tag, make sure its name or attributes match. + # Don't bother with Tags if we're searching for text. + elif isinstance(markup, Tag): + if not self.text or self.name or self.attrs: + found = self.search_tag(markup) + # If it's text, make sure the text matches. + elif isinstance(markup, NavigableString) or \ + isinstance(markup, basestring): + if not self.name and not self.attrs and self._matches(markup, self.text): + found = markup + else: + raise Exception( + "I don't know how to match against a %s" % markup.__class__) + return found + + def _matches(self, markup, match_against): + # print u"Matching %s against %s" % (markup, match_against) + result = False + if isinstance(markup, list) or isinstance(markup, tuple): + # This should only happen when searching a multi-valued attribute + # like 'class'. + if (isinstance(match_against, unicode) + and ' ' in match_against): + # A bit of a special case. If they try to match "foo + # bar" on a multivalue attribute's value, only accept + # the literal value "foo bar" + # + # XXX This is going to be pretty slow because we keep + # splitting match_against. But it shouldn't come up + # too often. + return (whitespace_re.split(match_against) == markup) + else: + for item in markup: + if self._matches(item, match_against): + return True + return False + + if match_against is True: + # True matches any non-None value. + return markup is not None + + if isinstance(match_against, collections.Callable): + return match_against(markup) + + # Custom callables take the tag as an argument, but all + # other ways of matching match the tag name as a string. + if isinstance(markup, Tag): + markup = markup.name + + # Ensure that `markup` is either a Unicode string, or None. + markup = self._normalize_search_value(markup) + + if markup is None: + # None matches None, False, an empty string, an empty list, and so on. + return not match_against + + if isinstance(match_against, unicode): + # Exact string match + return markup == match_against + + if hasattr(match_against, 'match'): + # Regexp match + return match_against.search(markup) + + if hasattr(match_against, '__iter__'): + # The markup must be an exact match against something + # in the iterable. + return markup in match_against + + +class ResultSet(list): + """A ResultSet is just a list that keeps track of the SoupStrainer + that created it.""" + def __init__(self, source, result=()): + super(ResultSet, self).__init__(result) + self.source = source |