1# Copyright (c) 2004 Ian Bicking. All rights reserved. 2# 3# Redistribution and use in source and binary forms, with or without 4# modification, are permitted provided that the following conditions are 5# met: 6# 7# 1. Redistributions of source code must retain the above copyright 8# notice, this list of conditions and the following disclaimer. 9# 10# 2. Redistributions in binary form must reproduce the above copyright 11# notice, this list of conditions and the following disclaimer in 12# the documentation and/or other materials provided with the 13# distribution. 14# 15# 3. Neither the name of Ian Bicking nor the names of its contributors may 16# be used to endorse or promote products derived from this software 17# without specific prior written permission. 18# 19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 23# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31"""The ``lxml.html`` tool set for HTML handling. 
32""" 33 34importsys 35importre 36try: 37fromurlparseimporturljoin 38exceptImportError: 39# Python 3 40fromurllib.parseimporturljoin 41importcopy 42fromlxmlimportetree 43fromlxml.htmlimportdefs 44fromlxml.html._setmixinimportSetMixin 45try: 46fromcollectionsimportMutableMappingasDictMixin 47exceptImportError: 48# Python < 2.6 49fromUserDictimportDictMixin 50try: 51set 52exceptNameError: 53# Python 2.3 54fromsetsimportSetasset 55try: 56bytes 57exceptNameError: 58# Python < 2.6 59bytes=str 60try: 61unicode 62exceptNameError: 63# Python 3 64unicode=str 65try: 66basestring 67exceptNameError: 68# Python 3 69basestring=(str,bytes) 70

175id=self.get('id') 176ifnotid: 177raiseTypeError( 178"You cannot set a label for an element (%r) that has no id" 179%self) 180if_nons(label.tag)!='label': 181raiseTypeError( 182"You can only assign label to a label element (not %r)" 183%label) 184label.set('for',id)

192""" 193 Removes this element from the tree, including its children and 194 text. The tail text is joined to the previous element or 195 parent. 196 """ 197parent=self.getparent() 198assertparentisnotNone 199ifself.tail: 200previous=self.getprevious() 201ifpreviousisNone: 202parent.text=(parent.textor'')+self.tail 203else: 204previous.tail=(previous.tailor'')+self.tail 205parent.remove(self)

254""" 255 Get the first element in a document with the given id. If none is 256 found, return the default argument if provided or raise KeyError 257 otherwise. 258 259 Note that there can be more than one element with the same id, 260 and this isn't uncommon in HTML documents found in the wild. 261 Browsers return only the first match, and this function does 262 the same. 263 """ 264try: 265# FIXME: should this check for multiple matches? 266# browsers just return the first one 267return_id_xpath(self,id=id)[0] 268exceptIndexError: 269ifdefault: 270returndefault[0] 271else: 272raiseKeyError(id)

281""" 282 Run the CSS expression on this element and its children, 283 returning a list of the results. 284 285 Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) 286 -- note that pre-compiling the expression can provide a substantial 287 speedup. 288 """ 289# Do the import here to make the dependency optional. 290fromlxml.cssselectimportCSSSelector 291returnCSSSelector(expr,translator=translator)(self)

298""" 299 Make all links in the document absolute, given the 300 ``base_url`` for the document (the full URL where the document 301 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document. 302 303 If ``resolve_base_href`` is true, then any ``<base href>`` 304 tags in the document are used *and* removed from the document. 305 If it is false then any such tag is ignored. 306 """ 307ifbase_urlisNone: 308base_url=self.base_url 309ifbase_urlisNone: 310raiseTypeError( 311"No base_url given, and the document has no base_url") 312ifresolve_base_href: 313self.resolve_base_href() 314deflink_repl(href): 315returnurljoin(base_url,href)

319""" 320 Find any ``<base href>`` tag in the document, and apply its 321 values to all links found in the document. Also remove the 322 tag once it has been applied. 323 """ 324base_href=None 325basetags=self.xpath('//base[@href]|//x:base[@href]',namespaces={'x':XHTML_NAMESPACE}) 326forbinbasetags: 327base_href=b.get('href') 328b.drop_tree() 329ifnotbase_href: 330return 331self.make_links_absolute(base_href,resolve_base_href=False)

334""" 335 Yield (element, attribute, link, pos), where attribute may be None 336 (indicating the link is in the text). ``pos`` is the position 337 where the link occurs; often 0, but sometimes something else in 338 the case of links in stylesheets or style tags. 339 340 Note: <base href> is *not* taken into account in any way. The 341 link you get is exactly the link in the document. 342 343 Note: multiple links inside of a single text string or 344 attribute value are returned in reversed order. This makes it 345 possible to replace or delete them from the text string value 346 based on their reported text positions. Otherwise, a 347 modification at one text position can change the positions of 348 links reported later on. 349 """ 350link_attrs=defs.link_attrs 351forelinself.iter(): 352attribs=el.attrib 353tag=_nons(el.tag) 354iftag!='object': 355forattribinlink_attrs: 356ifattribinattribs: 357yield(el,attrib,attribs[attrib],0) 358eliftag=='object': 359codebase=None 360## <object> tags have attributes that are relative to 361## codebase 362if'codebase'inattribs: 363codebase=el.get('codebase') 364yield(el,'codebase',codebase,0) 365forattribin'classid','data': 366ifattribinattribs: 367value=el.get(attrib) 368ifcodebaseisnotNone: 369value=urljoin(codebase,value) 370yield(el,attrib,value,0) 371if'archive'inattribs: 372formatchin_archive_re.finditer(el.get('archive')): 373value=match.group(0) 374ifcodebaseisnotNone: 375value=urljoin(codebase,value) 376yield(el,'archive',value,match.start()) 377iftag=='param': 378valuetype=el.get('valuetype')or'' 379ifvaluetype.lower()=='ref': 380## FIXME: while it's fine we *find* this link, 381## according to the spec we aren't supposed to 382## actually change the value, including resolving 383## it. 
It can also still be a link, even if it 384## doesn't have a valuetype="ref" (which seems to be the norm) 385## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype 386yield(el,'value',el.get('value'),0) 387iftag=='style'andel.text: 388urls=[ 389_unquote_match(match.group(1),match.start(1)) 390formatchin_css_url_re.finditer(el.text) 391]+[ 392(match.group(1),match.start(1)) 393formatchin_css_import_re.finditer(el.text) 394] 395ifurls: 396# sort by start pos to bring both match sets back into order 397urls=[(start,url)for(url,start)inurls] 398urls.sort() 399# reverse the list to report correct positions despite 400# modifications 401urls.reverse() 402forstart,urlinurls: 403yield(el,None,url,start) 404if'style'inattribs: 405urls=list(_css_url_re.finditer(attribs['style'])) 406ifurls: 407# return in reversed order to simplify in-place modifications 408formatchinurls[::-1]: 409url,start=_unquote_match(match.group(1),match.start(1)) 410yield(el,'style',url,start)

414""" 415 Rewrite all the links in the document. For each link 416 ``link_repl_func(link)`` will be called, and the return value 417 will replace the old link. 418 419 Note that links may not be absolute (unless you first called 420 ``make_links_absolute()``), and may be internal (e.g., 421 ``'#anchor'``). They can also be values like 422 ``'mailto:email'`` or ``'javascript:expr'``. 423 424 If you give ``base_href`` then all links passed to 425 ``link_repl_func()`` will take that into account. 426 427 If the ``link_repl_func`` returns None, the attribute or 428 tag text will be removed completely. 429 """ 430ifbase_hrefisnotNone: 431# FIXME: this can be done in one pass with a wrapper 432# around link_repl_func 433self.make_links_absolute(base_href,resolve_base_href=resolve_base_href) 434elifresolve_base_href: 435self.resolve_base_href() 436forel,attrib,link,posinself.iterlinks(): 437new_link=link_repl_func(link.strip()) 438ifnew_link==link: 439continue 440ifnew_linkisNone: 441# Remove the attribute or element content 442ifattribisNone: 443el.text='' 444else: 445delel.attrib[attrib] 446continue 447ifattribisNone: 448new=el.text[:pos]+new_link+el.text[pos+len(link):] 449el.text=new 450else: 451cur=el.attrib[attrib] 452ifnotposandlen(cur)==len(link): 453# Most common case 454el.attrib[attrib]=new_link 455else: 456new=cur[:pos]+new_link+cur[pos+len(link):] 457el.attrib[attrib]=new

461""" 462 An object that represents a method on an element as a function; 463 the function takes either an element or an HTML string. It 464 returns whatever the function normally returns, or if the function 465 works in-place (and so returns None) it returns a serialized form 466 of the resulting document. 467 """

516"""A lookup scheme for HTML Element classes. 517 518 To create a lookup instance with different Element classes, pass a tag 519 name mapping of Element classes in the ``classes`` keyword argument and/or 520 a tag name mapping of Mixin classes in the ``mixins`` keyword argument. 521 The special key '*' denotes a Mixin class that should be mixed into all 522 Element classes. 523 """ 524_default_element_classes={} 525

576""" 577 Parses several HTML elements, returning a list of elements. 578 579 The first item in the list may be a string (though leading 580 whitespace is removed). If no_leading_text is true, then it will 581 be an error if there is leading text, and it will always be a list 582 of only elements. 583 584 base_url will set the document's base_url attribute (and the tree's docinfo.URL) 585 """ 586ifparserisNone: 587parser=html_parser 588# FIXME: check what happens when you give html with a body, head, etc. 589ifisinstance(html,bytes): 590ifnot_looks_like_full_html_bytes(html): 591html='<html><body>%s</body></html>'.encode('ascii')%html 592else: 593ifnot_looks_like_full_html_unicode(html): 594html='<html><body>%s</body></html>'%html 595doc=document_fromstring(html,parser=parser,base_url=base_url,**kw) 596assert_nons(doc.tag)=='html' 597bodies=[eforeindocif_nons(e.tag)=='body'] 598assertlen(bodies)==1,("too many bodies: %r in %r"%(bodies,html)) 599body=bodies[0] 600elements=[] 601ifno_leading_textandbody.textandbody.text.strip(): 602raiseetree.ParserError( 603"There is leading text: %r"%body.text) 604ifbody.textandbody.text.strip(): 605elements.append(body.text) 606elements.extend(body) 607# FIXME: removing the reference to the parent artificial document 608# would be nice 609returnelements

658""" 659 Parse the html, returning a single element/document. 660 661 This tries to minimally parse the chunk of text, without knowing if it 662 is a fragment or a document. 663 664 base_url will set the document's base_url attribute (and the tree's docinfo.URL) 665 """ 666ifparserisNone: 667parser=html_parser 668ifisinstance(html,bytes): 669is_full_html=_looks_like_full_html_bytes(html) 670else: 671is_full_html=_looks_like_full_html_unicode(html) 672doc=document_fromstring(html,parser=parser,base_url=base_url,**kw) 673ifis_full_html: 674returndoc 675# otherwise, lets parse it out... 676bodies=doc.findall('body') 677ifnotbodies: 678bodies=doc.findall('{%s}body'%XHTML_NAMESPACE) 679ifbodies: 680body=bodies[0] 681iflen(bodies)>1: 682# Somehow there are multiple bodies, which is bad, but just 683# smash them into one body 684forother_bodyinbodies[1:]: 685ifother_body.text: 686iflen(body): 687body[-1].tail=(body[-1].tailor'')+other_body.text 688else: 689body.text=(body.textor'')+other_body.text 690body.extend(other_body) 691# We'll ignore tail 692# I guess we are ignoring attributes too 693other_body.drop_tree() 694else: 695body=None 696heads=doc.findall('head') 697ifnotheads: 698heads=doc.findall('{%s}head'%XHTML_NAMESPACE) 699ifheads: 700# Well, we have some sort of structure, so lets keep it all 701head=heads[0] 702iflen(heads)>1: 703forother_headinheads[1:]: 704head.extend(other_head) 705# We don't care about text or tail in a head 706other_head.drop_tree() 707returndoc 708ifbodyisNone: 709returndoc 710if(len(body)==1and(notbody.textornotbody.text.strip()) 711and(notbody[-1].tailornotbody[-1].tail.strip())): 712# The body has just one element, so it was probably a single 713# element passed in 714returnbody[0] 715# Now we have a body which represents a bunch of tags which have the 716# content that was passed in. We will create a fake container, which 717# is the body tag, except <body> implies too much structure. 
718if_contains_block_level_tag(body): 719body.tag='div' 720else: 721body.tag='span' 722returnbody

725""" 726 Parse a filename, URL, or file-like object into an HTML document 727 tree. Note: this returns a tree, not an element. Use 728 ``parse(...).getroot()`` to get the document root. 729 730 You can override the base URL with the ``base_url`` keyword. This 731 is most useful when parsing from a file-like object. 732 """ 733ifparserisNone: 734parser=html_parser 735returnetree.parse(filename_or_url,parser,base_url=base_url,**kw)

865""" 866 Helper function to submit a form. Returns a file-like object, as from 867 ``urllib.urlopen()``. This object also has a ``.geturl()`` function, 868 which shows the URL if there were any redirects. 869 870 You can use this like:: 871 872 form = doc.forms[0] 873 form.inputs['foo'].value = 'bar' # etc 874 response = form.submit() 875 doc = parse(response) 876 doc.make_links_absolute(response.geturl()) 877 878 To change the HTTP requester, pass a function as ``open_http`` keyword 879 argument that opens the URL for you. The function must have the following 880 signature:: 881 882 open_http(method, URL, values) 883 884 The action is one of 'GET' or 'POST', the URL is the target URL as a 885 string, and the values are a sequence of ``(name, value)`` tuples with the 886 form data. 887 """ 888values=form.form_values() 889ifextra_values: 890ifhasattr(extra_values,'items'): 891extra_values=extra_values.items() 892values.extend(extra_values) 893ifopen_httpisNone: 894open_http=open_http_urllib 895ifform.action: 896url=form.action 897else: 898url=form.base_url 899returnopen_http(form.method,url,values)

947 948""" 949 An accessor that represents all the input fields in a form. 950 951 You can get fields by name from this, with 952 ``form.inputs['field_name']``. If there are a set of checkboxes 953 with the same name, they are returned as a list (a `CheckboxGroup` 954 which also allows value setting). Radio inputs are handled 955 similarly. 956 957 You can also iterate over this to get all input elements. This 958 won't return the same thing as if you get all the names, as 959 checkboxes and radio elements are returned individually. 960 """ 961 962_name_xpath=etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]") 963_all_xpath=etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']") 964

1047"""1048 Get/set the value (which is the contents of this element)1049 """1050content=self.textor''1051ifself.tag.startswith("{%s}"%XHTML_NAMESPACE):1052serialisation_method='xml'1053else:1054serialisation_method='html'1055forelinself:1056# it's rare that we actually get here, so let's not use ''.join()1057content+=etree.tostring(el,method=serialisation_method,encoding=unicode)1058returncontent

1070"""1071 ``<select>`` element. You can get the name with ``.name``.10721073 ``.value`` will be the value of the selected option, unless this1074 is a multi-select element (``<select multiple>``), in which case1075 it will be a set-like object. In either case ``.value_options``1076 gives the possible values.10771078 The boolean attribute ``.multiple`` shows if this is a1079 multi-select.1080 """1081

1197foroptioninself.options:1198opt_value=option.get('value')1199ifopt_valueisNone:1200opt_value=option.textor''1201ifopt_value:1202opt_value=opt_value.strip()1203ifopt_value==item:1204option.set('selected','')1205break1206else:1207raiseValueError(1208"There is no option with the value %r"%item)

1211foroptioninself.options:1212opt_value=option.get('value')1213ifopt_valueisNone:1214opt_value=option.textor''1215ifopt_value:1216opt_value=opt_value.strip()1217ifopt_value==item:1218if'selected'inoption.attrib:1219deloption.attrib['selected']1220else:1221raiseValueError(1222"The option %r is not currently selected"%item)1223break1224else:1225raiseValueError(1226"There is not option with the value %r"%item)

1235"""1236 This object represents several ``<input type=radio>`` elements1237 that have the same name.12381239 You can use this like a list, but also use the property1240 ``.value`` to check/uncheck inputs. Also you can use1241 ``.value_options`` to get the possible values.1242 """1243

1255ifvalueisnotNone:1256forelinself:1257ifel.get('value')==value:1258checked_option=el1259break1260else:1261raiseValueError(1262"There is no radio input with the value %r"%value)1263forelinself:1264if'checked'inel.attrib:1265delel.attrib['checked']1266ifvalueisnotNone:1267checked_option.set('checked','')

1287"""1288 Represents a group of checkboxes (``<input type=checkbox>``) that1289 have the same name.12901291 In addition to using this like a list, the ``.value`` attribute1292 returns a set-like object that you can add to or remove from to1293 check and uncheck checkboxes. You can also use ``.value_options``1294 to get the possible values.1295 """1296

1369"""1370 Represents an ``<input>`` element.13711372 You can get the type with ``.type`` (which is lower-cased and1373 defaults to ``'text'``).13741375 Also you can get and set the value with ``.value``13761377 Checkboxes and radios have the attribute ``input.checkable ==1378 True`` (for all others it is false) and a boolean attribute1379 ``.checked``.13801381 """13821383## FIXME: I'm a little uncomfortable with the use of .checked

1385"""1386 Get/set the value of this element, using the ``value`` attribute.13871388 Also, if this is a checkbox and it has no value, this defaults1389 to ``'on'``. If it is a checkbox or radio that is not1390 checked, this returns None.1391 """1392ifself.checkable:1393ifself.checked:1394returnself.get('value')or'on'1395else:1396returnNone1397returnself.get('value')

1433"""1434 Boolean attribute to get/set the presence of the ``checked``1435 attribute.14361437 You can only use this on checkable input types.1438 """1439ifnotself.checkable:1440raiseAttributeError('Not a checkable input type')1441return'checked'inself.attrib

1505"""Convert all tags in an XHTML tree to HTML by removing their1506 XHTML namespace.1507 """1508try:1509xhtml=xhtml.getroot()1510exceptAttributeError:1511pass1512prefix="{%s}"%XHTML_NAMESPACE1513prefix_len=len(prefix)1514forelinxhtml.iter(prefix+"*"):1515el.tag=el.tag[prefix_len:]

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
#
# Each name below is the bound ``sub`` method of a pre-compiled pattern
# that matches a ``<meta http-equiv="Content-Type" ...>`` tag, so it is
# called as ``name(replacement, text)``.  Two variants are needed because
# a pattern compiled from text cannot be applied to bytes and vice versa.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

1600"""1601 Open the HTML document in a web browser, saving it to a temporary1602 file to open it. Note that this does not delete the file after1603 use. This is mainly meant for debugging.1604 """1605importos1606importwebbrowser1607importtempfile1608ifnotisinstance(doc,etree._ElementTree):1609doc=etree.ElementTree(doc)1610handle,fn=tempfile.mkstemp(suffix='.html')1611f=os.fdopen(handle,'wb')1612try:1613doc.write(f,method="html",encoding=encodingordoc.docinfo.encodingor"UTF-8")1614finally:1615# we leak the file itself here, but we should at least close it1616f.close()1617url='file://'+fn.replace(os.path.sep,'/')1618print(url)1619webbrowser.open(url)

1634"""An XML parser that is configured to return lxml.html Element1635 objects.16361637 Note that this parser is not really XHTML aware unless you let it1638 load a DTD that declares the HTML entities. To do this, make sure1639 you have the XHTML DTDs installed in your catalogs, and create the1640 parser like this::16411642 >>> parser = XHTMLParser(load_dtd=True)16431644 If you additionally want to validate the document, use this::16451646 >>> parser = XHTMLParser(dtd_validation=True)16471648 For catalog support, see http://www.xmlsoft.org/catalog.html.1649 """