"""Utilities for manipulating the content provided by the user."""fromlxmlimportetreeasETfromlxml.htmlimportHTMLParserfrompyqueryimportPyQueryaspqfromsemanticeditor.utils.etreeimportcleanup,flatten,get_parent,get_depth,get_index,indent,eliminate_tagfromsemanticeditor.utils.datastructuresimportstructimportre### Errors ###classInvalidHtml(ValueError):passclassIncorrectHeadings(ValueError):passclassBadStructure(ValueError):passclassTooManyColumns(BadStructure):passAllUserErrors=(InvalidHtml,IncorrectHeadings,BadStructure,TooManyColumns)### Definitions ###technical_blockdef=set(['h1','h2','h3','h4','h5','h6','p','ol','ul','blockquote'])# according to HTML4additional_blockdef=set(['li'])# li really act like block elementsblockdef=technical_blockdef|additional_blockdefblockdef_selector=",".join(blockdef)# need to sync with wymeditor.semantic.jsheadingdef=set(['h1','h2','h3','h4','h5','h6'])preview_blockdef=technical_blockdef# The number of chars we trim block level elements to.BLOCK_LEVEL_TRIM_LENGTH=200### Layout CSS class names #### This is designed to be user supply-able if necessaryclassLayoutDetailsBase(object):""" Base class for strategy object used to define the details of CSS/HTML to be used when rendering a layout """# Inherit from this class if creating your own custom class. LayoutDetails# provides a concrete implementation.def_raise_not_implemented(self):raiseNotImplementedError()max_columns=property(_raise_not_implemented,doc="""Maximum number of columns to allow""")use_inner_column_div=property(_raise_not_implemented,doc="""True to wrap all column content in a inner div""")defrow_classes(self,logical_column_count,actual_column_count):""" Returns a list of CSS classes to be used for a row containing logical_column_count 'logical' columns, actual_column_count 'actual' columns. 'actual' columns are present in the HTML structure, but some might be, for example, double width, so are counted as two logical columns. 
"""raiseNotImplementedError()defcolumn_classes(self,logical_column_num,actual_column_num,logical_column_count,actual_column_count):""" Returns a list of CSS classes to be used for a column which is number column_num out of column_count. (see above regarding logical/actual) """raiseNotImplementedError()defis_row_class(self,class_):""" Returns true if the class (a string) corresponds to a CSS class used for a row. """raiseNotImplementedErrordefis_column_class(self,class_):""" Returns true if the class (a string) corresponds to a CSS class used for a column. """raiseNotImplementedError()defrow_end_html(self):""" Returns some raw HTML to be added at the end of a row (e.g. for clearing floats) if necessary. """return""defouter_column_classes(self,presinfo):""" Given a list a PresentationInfo objects, return the ones that should be applied to the outer column div. """ifself.use_inner_column_div:returnpresinfoelse:raiseNotImplementedError()definner_column_classes(self,presinfo):""" Given a list a PresentationInfo objects, return the ones that should be applied to the inner column div. (Never called if use_inner_column_div = False) """raiseNotImplementedError()# Hacks, optionaldefformat_pre_parse_hacks(self,html,styleinfo):""" For formatting, applies hacks to unformatted HTML before parsing, returns HTML to be used. """returnhtmldefformat_post_parse_hacks(self,tree,styleinfo):""" For formatting, applies hacks to tree after parsing, returns new tree to be used. """returntreedefformat_structure_hacks(self,structure,styleinfo):""" For formatting, given a list of StructureItems and a list of PresentationInfos, applies hacks and returns new structure to be used. """returnstructuredefformat_post_layout_hacks(self,tree,structure,styleinfo):""" For formatting, given the tree after layout, the structure and style info, apply hacks and return a new tree. 
"""returntreedefextract_pre_parse_hacks(self,html):""" For extracting presentation info, applies hacks to formatted HTML before parsing, and returns HTML to be used. """returnhtmldefextract_post_parse_hacks(self,tree):""" For extracting presentation info, applies hacks to parse tree before after parsing, and returns tree. """returntreedefextract_structure_hacks(self,structure):""" For extracting presentation info, given a list of StructureItems, applies hacks and returns new structure to be used. """returnstructureclassLayoutDetails(LayoutDetailsBase):""" Strategy object used for defining the details of CSS/HTML to be used when rendering a Layout. This is a concrete implementation. """ROW_CLASS="row"COLUMN_CLASS="column"max_columns=6use_inner_column_div=Truedefrow_classes(self,logical_column_count,actual_column_count):retval=[self.ROW_CLASS]ifactual_column_count>1:retval.append("columns%d"%logical_column_count)returnretvaldefcolumn_classes(self,logical_column_num,actual_column_num,logical_column_count,actual_column_count):ifactual_column_count==1:# No classesreturn[]retval=[self.COLUMN_CLASS]ifactual_column_num==1:retval.append("firstcolumn")ifactual_column_num==actual_column_count:retval.append("lastcolumn")returnretvaldefis_row_class(self,class_):returnclass_==self.ROW_CLASSorre.match(r'^columns\d+$',class_)defis_column_class(self,class_):returnclass_==self.COLUMN_CLASSorre.match(r'^(first|last)column$',class_)defrow_end_html(self):return""defouter_column_classes(self,presinfo):return[piforpiinpresinfoifpi.column_equivisnotNone]definner_column_classes(self,presinfo):return[piforpiinpresinfoifpi.column_equivisNone]# Hacksdefformat_post_layout_hacks(self,tree,structure,styleinfo):# WYMEditor cannot insert divs. 
This is a workaroundfornintree.getiterator():ifn.tag=='p'and('div'in_get_classes_for_node(n)):n.tag='div'returntreedefextract_post_parse_hacks(self,tree):# inverse part of above workaroundfornintree.getiterator():ifn.tag=='div'and('div'in_get_classes_for_node(n)):n.tag='p'returntree### Parsing ###defparse(content,clean=False):""" Parses the HTML provided into an ElementTree. If 'clean' is True, lax parsing is done, the tree is cleaned of dirty user provided HTML """# We also use HTMLParser for 'strict', because the XML parser seems to eliminate# '\r' for some reason.tree=ET.fromstring(u'<html><body>'+content+u'</body></html>',parser=HTMLParser())ifclean:clean_tree(tree)returntree# NB: ElementTree is bizarre - after parsing some UTF-8 bytestrings, it will# then return nodes that are 'str's if the text is all ASCII, otherwise# 'unicode's (having correctly interpreted the UTF-8). When serialising to# JSON, this works out OK actually, so we leave it as is for the moment.defpretty_print(content):t=parse(content)indent(t)return_html_extract(t)### Semantic editor functionality ##### Presentation dictionary utilitiesclassPresentationInfo(object):""" Encapsulates a piece of presentation information. """def__init__(self,prestype=None,name=None,verbose_name="",description="",allowed_elements=None,column_equiv=None):self.prestype=prestypeself.name=name# verbose_name, description and allowed_elements are additional pieces# of information that are only needed when the client is requesting a# list of styles. 
In other sitations these objects may not have these# attributes filled in.self.verbose_name=verbose_nameself.description=descriptionifallowed_elementsisNone:allowed_elements=[]self.allowed_elements=allowed_elementsself.column_equiv=column_equivdef__eq__(self,other):returnself.prestype==other.prestypeandself.name==other.namedef__hash__(self):returnhash(self.prestype)^hash(self.name)def__repr__(self):return"PresentationInfo(prestype=\"%s\", name=\"%s\")"%(self.prestype,self.name)defPresentationClass(name,verbose_name="",description="",allowed_elements=None,column_equiv=None):""" Shortcut for creating CSS classes """returnPresentationInfo(prestype="class",name=name,verbose_name=verbose_name,description=description,allowed_elements=allowed_elements,column_equiv=column_equiv)defPresentationCommand(name,verbose_name="",description=""):""" Shortcut for creating commands """returnPresentationInfo(prestype="command",name=name,verbose_name=verbose_name,description=description,allowed_elements=sorted(list(technical_blockdef)))NEWROW=PresentationCommand('newrow',verbose_name="New row",description="""<p>Use this command to start a new row.</p><p>This must be used in conjunction with 'New column'to create a column layout.</p><p>Please note that new rows and columns cannot be started at anypoint in the document. Within a given row, new columns can only bestarted on section headings of the same level. 
The 'New row' commandmust be applied to the first section heading for which a column layoutis required and subsequent headings of the same level may be givena 'New column' command.</p><p>If you wish to stop an existing column layout for a section, then you willneed to apply a 'New row' command to that section, creating a row withjust one column in it.</p>""")NEWCOL=PresentationCommand('newcol',verbose_name="New column",description="""<p>Use this command to start a new column, after a 'New row'command has been used to start a set of columns.</p>""")COMMANDS=[NEWROW,NEWCOL]## General utilitiesdefany(seq):foriinseq:ifi:returnTruereturnFalsedef_invert_dict(d):returndict((v,k)for(k,v)ind.items())def_get_classes_for_node(node):returnfilter(len,node.get('class','').split(' '))def_find_next_available_name(stem,used_names):i=2whileTrue:attempt=stem+str(i)ifattemptnotinused_names:returnattemptelse:i+=1defmake_sect_id(tag,used_ids):i=1whileTrue:attempt=tag+"_"+str(i)ifattemptnotinused_ids:returnattemptelse:i+=1defget_layout_details_strategy():# TODO - make configurablereturnLayoutDetails()### Structure related ###classStructureItem(object):__metaclass__=structlevel=0# level is the 'outline level' in the document i.e. an integersect_id=''# sect_id is a unique ID used for storing presentation information againstname=''# name is a user presentable name for the sectiontag=''# tag is the HTML element e.g. 
h1node=None# node is the ElementTree nodedefget_structure(root,assert_structure=False):""" Return the structure nodes, as a list of StructureItems """retval=[]names=set()sect_ids=set()heading_names=set()cur_level=1last_heading_num=0first_heading_level=1# Pre-pass to get existing ids.forninroot.getiterator():ifn.taginblockdef:sect_id=n.get('id')ifsect_idisnotNone:ifnotsect_id.startswith(n.tag)orsect_idinsect_ids:# don't use invalid or duplicate ids.# removedeln.attrib['id']else:# reservesect_ids.add(sect_id)forninroot.getiterator():ifn.taginblockdef:text=flatten(n)sect_id=n.get('id')ifsect_idisNone:sect_id=make_sect_id(n.tag,sect_ids)sect_ids.add(sect_id)ifn.taginheadingdef:name=textlevel=int(n.tag[1])cur_level=levelifassert_structure:iflen(heading_names)==0:first_heading_level=levelelse:iflevel<first_heading_level:raiseIncorrectHeadings("No heading can be higher than the first ""heading, which was H%d."%first_heading_level)ifnameinheading_names:raiseIncorrectHeadings('There are duplicate headings with the name'' "%s".'%name)# Headings should decrease or monotonically increaseiflen(heading_names)>0andlevel>last_heading_num+1:raiseIncorrectHeadings('Heading "%(name)s" is level H%(foundnum)d, ''but it should be level H%(rightnum)d or less'%dict(name=name,foundnum=level,rightnum=last_heading_num+1))last_heading_num=levelheading_names.add(name)else:name=text[0:BLOCK_LEVEL_TRIM_LENGTH]name=name+"..."ifnameinnames:name=_find_next_available_name(name,names)names.add(name)# Paragraphs etc within a section should be indented# one further than the heading above them.iflen(heading_names)==0:level=1else:level=cur_level+1# Level is adjusted so that e.g. H3 is level 1, if it is# the first to appear in the document.# It is also adjusted so that nested items (e.g. 
p in blockquote)# appear to be nested.nesting_level=get_depth(root,n)-2retval.append(StructureItem(level=nesting_level+level-first_heading_level+1,sect_id=sect_id,name=name,tag=n.tag.lower(),node=n))returnretvaldef_get_classes_from_presinfo(presinfos):# Extract a list of classes from a list of PresentationInfo objectsreturn[pi.nameforpiinpresinfosifpi.prestype=="class"]## Main functions and sub functionsdefextract_structure(content):""" Extracts H1, H2, etc headings, and other block level elements and returns a list of tuples containing (level, name, tag) """# This function is no longer used externally, but it has tests# against it that are useful at checking the behaviour of get_structuretree=parse(content,clean=True)structure=get_structure(tree,assert_structure=True)returnstructuredefformat_html(html,styleinfo,return_tree=False,pretty_print=False):""" Formats the XHTML given using a dictionary of style information. The dictionary has keys which are the ids of sections, and values which are lists of CSS classes or special commands. """layout_strategy=get_layout_details_strategy()html=layout_strategy.format_pre_parse_hacks(html,styleinfo)root=parse(html,clean=True)root=layout_strategy.format_post_parse_hacks(root,styleinfo)structure=get_structure(root,assert_structure=True)structure=layout_strategy.format_structure_hacks(structure,styleinfo)sect_ids=[s.sect_idforsinstructure]styleinfo=_sanitise_styleinfo(styleinfo,sect_ids)# Strip existing divs, otherwise we cannot format properly. 
If# there are other block level elements that mess things up, we# raise BadStructure later, but divs have no semantics so can just# be removed._strip_presentation(root)# Apply normal CSS classes.forsiinstructure:# Apply css stylesclasses=_get_classes_from_presinfo(styleinfo[si.sect_id])classes.sort()ifclasses:si.node.set("class"," ".join(classes))# Create layout from row/column commandslayout=_create_layout(root,styleinfo,structure)_check_layout(layout,structure,layout_strategy)# Create new ET tree from layout. The individual nodes that belong to# 'root' are not altered, but just added to a new tree. This means that the# information in 'structure' does not need updating.rendered=_render_layout(layout,layout_strategy)rendered=layout_strategy.format_post_layout_hacks(rendered,structure,styleinfo)# Pretty printifpretty_print:indent(rendered)# Remove the temporary IDs we may have added when splitting the HTML# into content and presentation. We don't do this before this point,# as the IDs need to be there to identify sectionsforsiinstructure:if'id'insi.node.attrib:delsi.node.attrib['id']ifreturn_tree:return(rendered,structure)else:return_html_extract(rendered)def_html_extract(root):iflen(root)==0androot.textisNoneandroot.tailisNone:return''returnET.tostring(root).replace('<html>','').replace('</html>','').replace('<body>','').replace('</body>','').replace("<head/>","").replace("&#13;","\r")def_strip_presentation(tree):cleanup(tree,lambdat:t.tag=='div')def_sanitise_styleinfo(styleinfo,sect_ids):# Replace lists with setsout={}fork,vinstyleinfo.items():out[k]=set(v)# Ensure that all sections have an entry in styleinfoforsect_idinsect_ids:ifnotsect_idinout:out[sect_id]=set()returnout#### Layout related ##### Some dumb container 
# Some dumb container structures
Layout = struct("Layout", (object,), dict(rows=list))
LayoutRow = struct("LayoutRow", (object,), dict(columns=list, presinfo=list))
LayoutColumn = struct("LayoutColumn", (object,), dict(nodes=list, presinfo=list))

_NEWROW_PREFIX = 'newrow_'
_NEWCOL_PREFIX = 'newcol_'


def _layout_column_width(col):
    """
    Returns the logical column width of a column
    """
    column_equivs = [pi.column_equiv for pi in col.presinfo if pi.column_equiv is not None]
    if column_equivs:
        # assume user has not done something silly like put
        # *2* column_equiv classes on a column
        return column_equivs[0]
    return 1


def _layout_column_count(row):
    """
    Get the number of logical columns in a LayoutRow
    """
    return sum(_layout_column_width(c) for c in row.columns)


def is_root(node):
    """True if node is the 'html' or 'body' element."""
    return node.tag == 'html' or node.tag == 'body'


def _find_layout_commands(root, structure, styleinfo):
    """
    Returns (row_info, col_info): two dicts keyed by the sect_id of the block
    that starts a new row / new column, with the list of PresentationInfo
    stored against that command as value.

    Raises BadStructure if a command is attached to a section whose parent
    is not the 'html' or 'body' element.
    """
    # Layout commands are not stored against normal sections,
    # but have their own entry in the section list, using an id
    # of 'newrow_' or 'newcol_' + id of block they precede.
    sect_dict = dict((s.sect_id, s) for s in structure)
    row_info = {}  # key = sect_id, val = [PresentationInfo]
    col_info = {}  # key = sect_id, val = [PresentationInfo]
    # (prefix, user visible command name, destination dict)
    commands = (
        (_NEWROW_PREFIX, "New row", row_info),
        (_NEWCOL_PREFIX, "New column", col_info),
    )
    for sect_id, presinfo in styleinfo.items():
        for prefix, command_name, info in commands:
            if not sect_id.startswith(prefix):
                continue
            real_sect_id = sect_id[len(prefix):]
            sect = sect_dict.get(real_sect_id)
            if sect is None:
                # Command refers to a section that is not in the document.
                continue
            parent = get_parent(root, sect.node)
            if not is_root(parent):
                raise BadStructure("Section \"%(name)s\" is not at the top level of "
                                   "the document, and therefore cannot have a column "
                                   "structure applied to it. Please move the '%(command)s' "
                                   "command to a top level element." %
                                   dict(name=sect.name, command=command_name))
            info[real_sect_id] = presinfo

    return row_info, col_info


def _create_layout(root, styleinfo, structure):
    """
    Groups the top level content nodes of root into a Layout of rows and
    columns, according to the row/column commands found in styleinfo.
    """
    # Find the layout commands
    row_info, col_info = _find_layout_commands(root, structure, styleinfo)

    # Build a Layout structure

    # We put everything inside a Row and Column, even if there is
    # only one column.
    layout = Layout()
    row = LayoutRow()
    col = LayoutColumn()

    sect_dict = dict((si.node, si) for si in structure)

    # Build Layout
    children = root.getchildren()
    if children and children[0].tag == 'body':
        children = children[0].getchildren()
    for node in children:
        si = sect_dict.get(node)
        if si:
            row_presinfo = row_info.get(si.sect_id)
            if row_presinfo is not None:
                # We can assume row_presinfo contains NEWROW command

                # Finish current col and row, if they have anything in them
                if col.nodes:
                    row.columns.append(col)
                if row.columns:
                    layout.rows.append(row)
                # Start new row with styles
                row = LayoutRow(presinfo=row_presinfo)
                # Start new col
                col = LayoutColumn()

            col_presinfo = col_info.get(si.sect_id)
            if col_presinfo is not None:
                # Assume col_presinfo contains NEWCOL command

                # Finish current col, if it is non-empty
                if col.nodes:
                    row.columns.append(col)
                # Start new col with styles
                col = LayoutColumn(presinfo=col_presinfo)

        # Now deal with content itself
        col.nodes.append(node)

    # Close last col and row
    if col.nodes:
        row.columns.append(col)
    layout.rows.append(row)

    return layout


def _check_layout(layout, structure, layout_strategy):
    """
    Raises TooManyColumns if any row has more logical columns than
    layout_strategy.max_columns.
    """
    sect_dict = dict((si.node, si) for si in structure)
    max_cols = layout_strategy.max_columns
    for row in layout.rows:
        if _layout_column_count(row) > max_cols:
            # Because columns can be multiple width, we can't easily work out
            # which column needs to be moved, so just refer user to whole
            # section.
            node = row.columns[0].nodes[0]
            sect = sect_dict[node]
            raise TooManyColumns("The maximum number of columns is %(max)d. "
                                 "Please adjust columns in section '%(name)s'." %
                                 dict(max=max_cols, name=sect.name))


def _render_layout(layout, layout_strategy):
    """
    Builds and returns a new html/body tree containing one div per row and
    one div per column (plus an inner div if the strategy asks for it), with
    the layout's content nodes moved inside.
    """
    docroot = ET.fromstring("<html><body></body></html>")
    root = docroot.getchildren()[0]  # body
    for row in layout.rows:
        # Row
        logical_column_count = _layout_column_count(row)
        actual_column_count = len(row.columns)
        rowdiv = ET.Element('div')
        classes = layout_strategy.row_classes(logical_column_count, actual_column_count) + \
            _get_classes_from_presinfo(row.presinfo)
        if classes:
            rowdiv.set('class', ' '.join(classes))

        # Columns
        logical_column_num = 1
        for i, col in enumerate(row.columns):
            coldiv = ET.Element('div')
            classes = layout_strategy.column_classes(logical_column_num,
                                                     i + 1,
                                                     logical_column_count,
                                                     actual_column_count) + \
                _get_classes_from_presinfo(layout_strategy.outer_column_classes(col.presinfo))
            if classes:
                coldiv.set('class', ' '.join(classes))
            if layout_strategy.use_inner_column_div:
                contentdiv = ET.Element('div')
                coldiv.append(contentdiv)
                inner_classes = _get_classes_from_presinfo(layout_strategy.inner_column_classes(col.presinfo))
                if inner_classes:
                    contentdiv.set('class', ' '.join(inner_classes))
            else:
                contentdiv = coldiv
            for n in col.nodes:
                contentdiv.append(n)
            rowdiv.append(coldiv)
            # A wide column advances the logical position by its width.
            logical_column_num += _layout_column_width(col)
        root.append(rowdiv)

    return docroot


def preview_html(html, pres):
    """
    Formats html with the presentation info in pres, then replaces each block
    in preview_blockdef with a div containing just the section name, and
    returns the resulting HTML.
    """
    root, structure = format_html(html, pres, return_tree=True)
    structure2 = [si for si in structure if si.tag in preview_blockdef]
    known_nodes = dict((si.node, si) for si in structure2)
    _create_preview(root, structure2, known_nodes)
    return _html_extract(root)


def _create_preview(node, structure, known_nodes):
    """
    Recursively rewrites the children of node for preview: divs are
    descended into, known block nodes are turned into placeholder divs and
    anything else is removed.
    """
    parent = node
    children = node.getchildren()
    if children and children[0].tag == 'body':
        # The nodes we iterate over belong to <body>, so that is the element
        # they must be removed from.
        parent = children[0]
        children = parent.getchildren()
    for n in children:
        if n.tag == 'div':
            _create_preview(n, structure, known_nodes)
        else:
            sect = known_nodes.get(n)
            if sect is not None and (n.tag in blockdef):
                n.set('class', 'structural ' + "tag" + n.tag.lower())
                n.tag = "div"
                n[:] = []
                n.text = sect.name
            else:
                parent.remove(n)


def _find_row_col_divs(root, node, layout_strategy):
    """
    Finds the row and column divs that a node belongs to.
    Returns a 3 tuple (row_div, col_div, inner_col_div)

    col_div is None if the node is not the first content node within that column.
    row_div is None if the node is not the first content node within that row.
    inner_col_div is None if there is no inner column div, or if col_div is None
    """
    # Keep going up until we find a 'row' div or 'column' div
    # that are parent/child.

    p = get_parent(root, node)
    gp = None
    p_is_col, gp_is_row = False, False
    row_div, col_div, inner_col_div = None, None, None

    node_is_first_in_div = (p is not None and p.tag == 'div'
                            and get_index(p, node) == 0)

    if node_is_first_in_div:
        # We only care if node is the first child of the column div
        c_classes = _get_classes_for_node(p)
        p_is_col = any(layout_strategy.is_column_class(c) for c in c_classes)
        gp = get_parent(root, p)
        if gp is not None and gp.tag == 'div' and get_index(gp, p) == 0:
            # We only locate row divs if col is first col within row
            r_classes = _get_classes_for_node(gp)
            gp_is_row = any(layout_strategy.is_row_class(c) for c in r_classes)
            # We can't always tell if something is a col (especially for single
            # column structure), but by identifying the row we can tell we are in
            # a column structure.
            if gp_is_row:
                p_is_col = True

    if gp_is_row:
        row_div = gp
    if p_is_col:
        col_div = p

    if not p_is_col and node_is_first_in_div:
        # Try to go up one
        row_div, col_div, inner_col_div = _find_row_col_divs(root, p, layout_strategy)
        if inner_col_div is None and col_div is not None:
            # We now know that current parent 'p' is inner_col_div
            inner_col_div = p

    return (row_div, col_div, inner_col_div)


def extract_presentation(html):
    """
    Takes HTML with formatting applied and returns presentation elements (a
    dictionary with keys = section names, values = set of classes/commands) and
    the HTML without formatting (ready to be used in an editor)
    """
    # TODO: this function is not brilliantly well defined e.g.  should
    # there be an entry in the dictionary for sections with no
    # formatting?  This does not affect functionality, but it does
    # affect tests.
    layout_strategy = get_layout_details_strategy()
    html = layout_strategy.extract_pre_parse_hacks(html)
    root = parse(html, clean=False)  # it's important we don't clean.
    root = layout_strategy.extract_post_parse_hacks(root)
    structure = get_structure(root)
    structure = layout_strategy.extract_structure_hacks(structure)
    pres = {}
    for si in structure:
        pres[si.sect_id] = set()

        # Section - extract classes
        for c in _get_classes_for_node(si.node):
            pres[si.sect_id].add(PresentationClass(c))
        if 'class' in si.node.attrib:
            del si.node.attrib['class']

        # Add custom ids.  These are only for purpose of editing,
        # and will be removed again at end of format_html
        si.node.set('id', si.sect_id)

        # Try to find 'row' and 'column' divs that this node belongs to.
        # Columns can have inner divs for styling purposes.  Some CSS classes
        # will be applied to the outer column div, some to the inner column div.
        row_node, col_node, inner_col_node = _find_row_col_divs(root, si.node, layout_strategy)

        if row_node is not None:
            r_classes = _get_classes_for_node(row_node)
            row_pres = set([NEWROW] + [PresentationClass(c) for c in r_classes
                                       if not layout_strategy.is_row_class(c)])
            pres[_NEWROW_PREFIX + si.sect_id] = row_pres

        if col_node is not None:
            # list(): we need to extend the result, whatever sequence type
            # _get_classes_for_node returns.
            c_classes = list(_get_classes_for_node(col_node))
            if inner_col_node is not None:
                c_classes.extend(_get_classes_for_node(inner_col_node))
            col_pres = set([NEWCOL] + [PresentationClass(c) for c in c_classes
                                       if not layout_strategy.is_column_class(c)])
            pres[_NEWCOL_PREFIX + si.sect_id] = col_pres

    _strip_presentation(root)
    out_html = _html_extract(root)

    return (pres, out_html)


def _clean_elem(d):
    # Remove presentational attributes; d is whatever pyquery's each()
    # passes to the callback.
    for x in ['style', 'class']:
        try:
            d.removeAttr(x)
        except KeyError:
            pass


def _empty_text(x):
    """True if x is None or contains only whitespace."""
    return x is None or x.strip() == ""


def _promote_child_text(elem, tag):
    """
    Ensure any leading or trailing text directly as a child of elem is wrapped in
    a tag.
    """
    if not _empty_text(elem.text):
        newtag = ET.Element(tag)
        newtag.text = elem.text
        elem.insert(0, newtag)
        elem.text = None
    if len(elem) > 0 and not _empty_text(elem[-1].tail):
        newtag = ET.Element(tag)
        newtag.text = elem[-1].tail
        elem[-1].tail = None
        elem.append(newtag)


def _clean_nested(elem):
    """Recursively eliminates 'p' elements that are direct children of a 'p'."""
    for idx, child in reversed(list(enumerate(elem.getchildren()))):
        # (do it reversed so that indexes never change as we mutate children)
        _clean_nested(child)
        if child.tag == 'p' and elem.tag == 'p':
            eliminate_tag(elem, idx)


def _replace_block_elements(elem):
    """Recursively renames every descendant 'div' of elem to 'p'."""
    for child in elem.getchildren():
        if child.tag == 'div':
            child.tag = 'p'
        _replace_block_elements(child)


def _remove_command_divs(elem):
    """
    Recursively removes descendant 'div'/'p' elements that carry a class
    named after one of COMMANDS.
    """
    for child in reversed(elem.getchildren()):
        _remove_command_divs(child)
        if child.tag == 'div' or child.tag == 'p':
            classes = set(_get_classes_for_node(child))
            if any(c.name in classes for c in COMMANDS):
                elem.remove(child)


def clean_tree(root):
    """
    Cleans dirty HTML from an ElementTree

    The tree is modified in place.  Cleaning is repeated until a pass leaves
    the serialised HTML unchanged.
    """
    def pull_up(body, n):
        # Eliminate the tag of n within its parent.
        p = get_parent(body, n)
        i = get_index(p, n)
        eliminate_tag(p, i)

    while True:
        initial_html = _html_extract(root)
        body = root[0]  # <html><body>
        # If there is text directly in body, it needs wrapping in a block element.
        _promote_child_text(body, 'p')

        # replace 'command' divs
        _remove_command_divs(body)

        # First replace divs
        _replace_block_elements(body)

        # Deal with nested 'p's and other elements.
        _clean_nested(body)

        doc = pq(root)
        doc('*').each(_clean_elem)
        doc('style').remove()
        doc('col').remove()

        for x in ['table', 'tbody', 'thead', 'tr', 'td', 'span', 'li p:only-child']:
            for n in doc(x):
                pull_up(body, n)
        # "li p:only-child" appears to be buggy.  It works like
        # "li p:only-descendent" or something.

        for x in ['strong', 'em', 'b', 'i']:
            for n in doc(x):
                if pq(n).is_(blockdef_selector):
                    pull_up(body, n)

        # remove duplicate 'id' attributes: keep the first element carrying
        # each id.  Done by walking the elements rather than building a
        # '#<id>' selector, which would break on ids containing characters
        # that are special in CSS selectors.
        seen_ids = set()
        for node in doc('*[id]'):
            node_id = node.get('id', None)
            if node_id is None or node_id == "":
                continue
            if node_id in seen_ids:
                del node.attrib['id']
            else:
                seen_ids.add(node_id)

        for x in ['p + br', 'p:empty']:
            doc(x).remove()

        # Removed elements can give problems which need to be fixed again.  We keep
        # iterating through this until we get the same answer!
        output_html = _html_extract(root)
        if initial_html == output_html:
            return


def clean_html(html):
    """Parses html with cleaning enabled and returns the cleaned HTML."""
    tree = parse(html, clean=True)
    return _html_extract(tree)