* should pop to 'tr', not the first 'td'
"""
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
isNestable = nestingResetTriggers != None
isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
popTo = None
inclusive = True
for i in range(len(self.tagStack)-1, 0, -1):
p = self.tagStack[i]
if (not p or p.name == name) and not isNestable:
#Non-nestable tags get popped to the top or to their
#last occurance.
popTo = name
break
if (nestingResetTriggers != None
and p.name in nestingResetTriggers) \
or (nestingResetTriggers == None and isResetNesting
and self.RESET_NESTING_TAGS.has_key(p.name)):
#If we encounter one of the nesting reset triggers
#peculiar to this tag, or we encounter another tag
#that causes nesting to reset, pop up to but not
#including that tag.
popTo = p.name
inclusive = False
break
p = p.parent
if popTo:
self._popToTag(popTo, inclusive)
def unknown_starttag(self, name, attrs, selfClosing=0):
#print "Start tag %s: %s" % (name, attrs)
if self.quoteStack:
#This is not a real tag.
#print " is not real!" % name
attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
self.handle_data('' % (name, attrs))
return
self.endData()
if not self.isSelfClosingTag(name) and not selfClosing:
self._smartPop(name)
if self.parseOnlyThese and len(self.tagStack) <= 1 \
and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
return
tag = Tag(self, name, attrs, self.currentTag, self.previous)
if self.previous:
self.previous.next = tag
self.previous = tag
self.pushTag(tag)
if selfClosing or self.isSelfClosingTag(name):
self.popTag()
if name in self.QUOTE_TAGS:
#print "Beginning quote (%s)" % name
self.quoteStack.append(name)
self.literal = 1
return tag
def unknown_endtag(self, name):
#print "End tag %s" % name
if self.quoteStack and self.quoteStack[-1] != name:
#This is not a real end tag.
#print "%s> is not real!" % name
self.handle_data('%s>' % name)
return
self.endData()
self._popToTag(name)
if self.quoteStack and self.quoteStack[-1] == name:
self.quoteStack.pop()
self.literal = (len(self.quoteStack) > 0)
def handle_data(self, data):
self.currentData.append(data)
def _toStringSubclass(self, text, subclass):
"""Adds a certain piece of text to the tree as a NavigableString
subclass."""
self.endData()
self.handle_data(text)
self.endData(subclass)
def handle_pi(self, text):
"""Handle a processing instruction as a ProcessingInstruction
object, possibly one with a %SOUP-ENCODING% slot into which an
encoding will be plugged later."""
if text[:3] == "xml":
text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
"Handle comments as Comment objects."
self._toStringSubclass(text, Comment)
def handle_charref(self, ref):
"Handle character references as data."
if self.convertEntities:
data = unichr(int(ref))
else:
data = '%s;' % ref
self.handle_data(data)
def handle_entityref(self, ref):
"""Handle entity references as data, possibly converting known
HTML and/or XML entity references to the corresponding Unicode
characters."""
data = None
if self.convertHTMLEntities:
try:
data = unichr(name2codepoint[ref])
except KeyError:
pass
if not data and self.convertXMLEntities:
data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
if not data and self.convertHTMLEntities and \
not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
# TODO: We've got a problem here. We're told this is
# an entity reference, but it's not an XML entity
# reference or an HTML entity reference. Nonetheless,
# the logical thing to do is to pass it through as an
# unrecognized entity reference.
#
# Except: when the input is "&carol;" this function
# will be called with input "carol". When the input is
# "AT&T", this function will be called with input
# "T". We have no way of knowing whether a semicolon
# was present originally, so we don't know whether
# this is an unknown entity or just a misplaced
# ampersand.
#
# The more common case is a misplaced ampersand, so I
# escape the ampersand and omit the trailing semicolon.
data = "&%s" % ref
if not data:
# This case is different from the one above, because we
# haven't already gone through a supposedly comprehensive
# mapping of entities to Unicode characters. We might not
# have gone through any mapping at all. So the chances are
# very high that this is a real entity, and not a
# misplaced ampersand.
data = "&%s;" % ref
self.handle_data(data)
def handle_decl(self, data):
"Handle DOCTYPEs and the like as Declaration objects."
self._toStringSubclass(data, Declaration)
def parse_declaration(self, i):
"""Treat a bogus SGML declaration as raw data. Treat a CDATA
declaration as a CData object."""
j = None
if self.rawdata[i:i+9] == '', i)
if k == -1:
k = len(self.rawdata)
data = self.rawdata[i+9:k]
j = k+3
self._toStringSubclass(data, CData)
else:
try:
j = SGMLParser.parse_declaration(self, i)
except SGMLParseError:
toHandle = self.rawdata[i:]
self.handle_data(toHandle)
j = i + len(toHandle)
return j
class BeautifulSoup(BeautifulStoneSoup):
"""This parser knows the following facts about HTML:
* Some tags have no closing tag and should be interpreted as being
closed as soon as they are encountered.
* The text inside some tags (ie. 'script') may contain tags which
are not really part of the document and which should be parsed
as text, not tags. If you want to parse the text as tags, you can
always fetch it and parse it explicitly.
* Tag nesting rules:
Most tags can't be nested at all. For instance, the occurance of
a

tag should implicitly close the previous

tag.

Para1

Para2
should be transformed into:

Para1

Para2
Some tags can be nested arbitrarily. For instance, the occurance
of a

tag should _not_ implicitly close the previous

tag.
Alice said:

Bob said:

Blah
should NOT be transformed into:
Alice said:

Bob said:

Blah
Some tags can be nested, but the nesting is reset by the
interposition of other tags. For instance, a

tag should
implicitly close the previous

tag within the same

,
but not close a

tag in another table.

Blah

Blah
should be transformed into:

Blah

Blah
but,

Blah

Blah
should NOT be transformed into

Blah

Blah
Differing assumptions about tag nesting rules are a major source
of problems with the BeautifulSoup class. If BeautifulSoup is not
treating as nestable a tag your page author treats as nestable,
try ICantBelieveItsBeautifulSoup, MinimalSoup, or
BeautifulStoneSoup before writing your own subclass."""
def __init__(self, *args, **kwargs):
if not kwargs.has_key('smartQuotesTo'):
kwargs['smartQuotesTo'] = self.HTML_ENTITIES
BeautifulStoneSoup.__init__(self, *args, **kwargs)
SELF_CLOSING_TAGS = buildTagMap(None,
['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
QUOTE_TAGS = {'script' : None, 'textarea' : None}
#According to the HTML standard, each of these inline tags can
#contain another tag of the same type. Furthermore, it's common
#to actually use these tags this way.
NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
'center']
#According to the HTML standard, these block tags can contain
#another tag of the same type. Furthermore, it's common
#to actually use these tags this way.
NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
#Lists can contain other lists, but there are restrictions.
NESTABLE_LIST_TAGS = { 'ol' : [],
'ul' : [],
'li' : ['ul', 'ol'],
'dl' : [],
'dd' : ['dl'],
'dt' : ['dl'] }
#Tables can contain other tables, but there are restrictions.
NESTABLE_TABLE_TAGS = {'table' : [],
'tr' : ['table', 'tbody', 'tfoot', 'thead'],
'td' : ['tr'],
'th' : ['tr'],
'thead' : ['table'],
'tbody' : ['table'],
'tfoot' : ['table'],
}
NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
#If one of these tags is encountered, all tags up to the next tag of
#this type are popped.
RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
NON_NESTABLE_BLOCK_TAGS,
NESTABLE_LIST_TAGS,
NESTABLE_TABLE_TAGS)
NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
# Used to detect the charset in a META tag; see start_meta
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
def start_meta(self, attrs):
"""Beautiful Soup can detect a charset included in a META tag,
try to convert the document to that charset, and re-parse the
document from the beginning."""
httpEquiv = None
contentType = None
contentTypeIndex = None
tagNeedsEncodingSubstitution = False
for i in range(0, len(attrs)):
key, value = attrs[i]
key = key.lower()
if key == 'http-equiv':
httpEquiv = value
elif key == 'content':
contentType = value
contentTypeIndex = i
if httpEquiv and contentType: # It's an interesting meta tag.
match = self.CHARSET_RE.search(contentType)
if match:
if getattr(self, 'declaredHTMLEncoding') or \
(self.originalEncoding == self.fromEncoding):
# This is our second pass through the document, or
# else an encoding was specified explicitly and it
# worked. Rewrite the meta tag.
newAttr = self.CHARSET_RE.sub\
(lambda(match):match.group(1) +
"%SOUP-ENCODING%", contentType)
attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
newAttr)
tagNeedsEncodingSubstitution = True
else:
# This is our first pass through the document.
# Go through it again with the new information.
newCharset = match.group(3)
if newCharset and newCharset != self.originalEncoding:
self.declaredHTMLEncoding = newCharset
self._feed(self.declaredHTMLEncoding)
raise StopParsing
tag = self.unknown_starttag("meta", attrs)
if tag and tagNeedsEncodingSubstitution:
tag.containsSubstitutions = True
class StopParsing(Exception):
pass
class ICantBelieveItsBeautifulSoup(BeautifulSoup):
"""The BeautifulSoup class is oriented towards skipping over
common HTML errors like unclosed tags. However, sometimes it makes
errors of its own. For instance, consider this fragment:
FooBar
This is perfectly valid (if bizarre) HTML. However, the
BeautifulSoup class will implicitly close the first b tag when it
encounters the second 'b'. It will think the author wrote
"FooBar", and didn't close the first 'b' tag, because
there's no real-world reason to bold something that's already
bold. When it encounters '' it will close two more 'b'
tags, for a grand total of three tags closed instead of two. This
can throw off the rest of your document structure. The same is
true of a number of other tags, listed below.
It's much more common for someone to forget to close a 'b' tag
than to actually use nested 'b' tags, and the BeautifulSoup class
handles the common case. This class handles the not-co-common
case: where you can't believe someone wrote what they did, but
it's valid HTML and BeautifulSoup screwed up by assuming it
wouldn't be."""
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
'big']
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
"""The MinimalSoup class is for parsing HTML that contains
pathologically bad markup. It makes no assumptions about tag
nesting, but it does know which tags are self-closing, that