def get_dirs(self, src, tgt, sources=False):
    """
    Return all three directions (src->tgt, tgt->src, undirected) for
    the given node pair: boolean flags from `self.dirs` by default, or
    the supporting source sets from `self.sources` if `sources` is
    True. Returns None if the pair fails `check_nodes()`.
    """
    query = (src, tgt)
    if not self.check_nodes(query):
        return None
    # same triple is read either from the source sets or the flags
    lookup = self.sources if sources else self.dirs
    return [
        lookup[query],
        lookup[(query[1], query[0])],
        lookup['undirected'],
    ]

def src(self):
    """
    Return the IDs of effector molecules in this directed interaction.
    If the interaction is bidirectional the list contains 2 IDs; if it
    is undirected, an empty list is returned.
    """
    return [
        pair[0]
        for pair, is_directed in self.dirs.items()
        if pair != 'undirected' and is_directed
    ]

def tgt(self):
    """
    Return the IDs of the target molecules in the interaction.
    Same behaviour as `Direction.src()`.
    """
    return [
        pair[1]
        for pair, is_directed in self.dirs.items()
        if pair != 'undirected' and is_directed
    ]

def majority_dir(self):
    """
    Return the direction by majority consensus.

    Returns None if the two opposite directions are supported by the
    same number of sources; 'undirected' if no directionality
    information is available; otherwise the winning direction tuple.
    """
    if not self.is_directed():
        return 'undirected'
    n_straight = len(self.sources[self.straight])
    n_reverse = len(self.sources[self.reverse])
    if n_straight == n_reverse:
        return None
    return self.straight if n_straight > n_reverse else self.reverse

def majority_sign(self):
    """
    Return signs by majority consensus, separately for both directions.

    Keys of the returned dict are the two directions. A value is None
    if that direction carries no sign; otherwise it is a pair
    [stimulation, inhibition] where each flag is True if the number of
    sources supporting that effect is greater than or equal to the
    number supporting the opposite one.
    """
    result = {self.straight: None, self.reverse: None}
    for direction in (self.straight, self.reverse):
        if self.has_sign(direction=direction):
            n_pos = len(self.positive_sources[direction])
            n_neg = len(self.negative_sources[direction])
            result[direction] = [n_pos >= n_neg, n_pos <= n_neg]
    return result

def consensus_edges(self):
    """
    Return a list of edges based on the majority consensus of
    directions and signs. Each edge is a list:
    [source, target, 'directed'|'undirected', 'positive'|'negative'|'unknown'].
    """
    result = []
    d = self.majority_dir()
    s = self.majority_sign()
    if d == 'undirected':
        result.append(
            [self.straight[0], self.straight[1], 'undirected', 'unknown'])
    # on a tie (d is None) both directions are reported
    for direction in (self.straight, self.reverse):
        if d is None or d == direction:
            sign = s[direction]
            if sign is not None:
                if sign[0]:
                    result.append(
                        [direction[0], direction[1], 'directed', 'positive'])
                if sign[1]:
                    result.append(
                        [direction[0], direction[1], 'directed', 'negative'])
            else:
                result.append(
                    [direction[0], direction[1], 'directed', 'unknown'])
    return result

classAttrHelper(object):def__init__(self,value,name=None,defaults={}):self.name=nameself.value=valueself.defaults=defaultsifisinstance(self.value,dict):self.id_type=type(self.value.keys()[0])def__call__(self,instance,thisDir=None,thisSign=None,thisDirSources=None,thisSources=None):_thisDir='directed'ifisinstance(thisDir,tuple)elsethisDir# user supplied callback function:ifhasattr(self.value,'__call__'):returnself.value(instance)# special cases #1: by direction/effectelifself.value=='DIRECTIONS'andself.defaultsisnotNoneand \
self.nameisnotNoneandself.nameinself.defaults:if_thisDirinself.defaults[self.name]:ifthisSigninself.defaults[self.name][_thisDir]:returnself.defaults[self.name][_thisDir][thisSign]# special cases #2: by source categoryelifself.value=='RESOURCE_CATEGORIES':forresource_typein['pathway','ptm','reaction','interaction']:iflen(getattr(data_formats,'%s_resources'%resource_type)&thisSources)>0:ifself.nameinself.defaultsand \
resource_typeinself.defaults[self.name]:returnself.defaults[self.name][resource_type]sys.stdout.wrtie('No category for %s\n'%thisSources)sys.stdout.flush()# if value is constant:eliftype(self.value)incommon.simpleTypes:returnself.value# if a dictionary given to map some igraph attribute to values:elifhasattr(self.value,'__call__'):returnself.value(instance)elifisinstance(self.value,dict)andself.attr_nameisnotNone:ifhasattr(instance,self.value['_name']):key_attr=getattr(instance,self.value['_name'])elifself.value['_name']ininstance.attributes():key_attr=instance[self.value['_name']]ifkey_attrinself.value:returnself.value[key_attr]# if default value has been given for this attribute:elifself.nameisnotNoneandself.defaultsisnotNoneand \
self.nameinself.defaults:returnself.defaults[self.name]# ultimately, return Noneelse:returnNoneclass_NamedVertexSeq(object):def__init__(self,_vs,_nodNam,_nodLab):self._vs=_vsself._nodNam=_nodNamself._nodLab=_nodLabdef__iter__(self):forvinself._vs:yieldvdefgenesymbol(self):forvinself._vs:yieldself._nodLab[v.index]defuniprot(self):forvinself._vs:yieldself._nodNam[v.index]defids(self):forvinself._vs:yieldv.indexgs=genesymbolup=uniprotvs=__iter__

class PyPath(object):

    ###
    # main network object
    ###

    # default identifier type per molecular entity type
    default_name_type = {'protein': 'uniprot',
                         'mirna': 'mirbase',
                         'drug': 'chembl'}

    def __init__(self, ncbi_tax_id=9606,
                 default_name_type=default_name_type,
                 copy=None, mysql=(None, 'mapping'),
                 chembl_mysql=(None, 'chembl'),
                 name='unnamed', outdir='results',
                 loglevel='INFO', loops=False):
        """
        Currently only one organism molecular interaction networks are
        supported. Some functions support multi-species networks, and
        maybe once the whole module will support that.

        @ncbi_tax_id : int
            The ID of the organism in NCBI Taxonomy. Defaults to human
            (9606).
        @mysql
            The MySQL parameter used by the mapping module to load some
            ID conversion tables from MySQL.
        @default_name_type : dict
            Dictionary of default ID types, what all identifiers of the
            given molecular species should be converted to. By default,
            for protein it is UniProt.
        @copy : BioGraph object
            In case you copy an other instance.
        @name : str
            This is a custom session/project name.
        @outdir : str
            The directory where you wish to create all the output files.
        @loglevel : str
            Passed to logging module.
        @loops : bool
            Whether to allow loop edges in the graph. Default is False.
        """
        self.__version__ = common.__version__
        for d in ['results', 'log', 'cache']:
            if not os.path.exists(d):
                os.makedirs(d)
        if copy is None:
            self.graph = igraph.Graph(0)
            g = self.graph
            g['entity_types'] = {}
            g['ncbi_tax_id'] = ncbi_tax_id
            g['name'] = name
            g['sources'] = {}
            g['references'] = {}
            g['directed'] = False
            g.vs['type'] = []
            g.vs['name'] = []
            g.vs['nameType'] = []
            g.vs['originalNames'] = [[] for _ in xrange(self.graph.vcount())]
            g.vs['ncbi_tax_id'] = []
            g.vs['exp'] = [{}]
            g.es['sources'] = [set([]) for _ in xrange(self.graph.ecount())]
            g.es['type'] = [[] for _ in xrange(self.graph.ecount())]
            g.es['references'] = [[] for _ in xrange(self.graph.ecount())]
            g.es['refs_by_source'] = [{} for _ in xrange(self.graph.ecount())]
            g.es['refs_by_dir'] = [{} for _ in xrange(self.graph.ecount())]
            g.es['refs_by_type'] = [{} for _ in xrange(self.graph.ecount())]
            g.es['sources_by_type'] = [{} for _ in xrange(self.graph.ecount())]
            g.es['negative_refs'] = [[] for _ in xrange(self.graph.ecount())]
            g.es['negative'] = [[] for _ in xrange(self.graph.ecount())]
            g.es['dirs'] = [None]
            g['layout_type'] = None
            g['layout_data'] = None
            g['only_directed'] = False
            # allow loop edges in the graph
            self.loops = loops
            self.dgraph = None
            self._undirected = self.graph
            self._directed = None
            self.failed_edges = []
            self.uniprot_mapped = []
            self.mysql_conf = mysql
            self.set_chembl_mysql(chembl_mysql[1], chembl_mysql[0])
            # self.mysql = mysql.MysqlRunner(self.mysql_conf)
            self.unmapped = []
            self.name = name
            self.outdir = outdir
            self.ncbi_tax_id = ncbi_tax_id
            self.data = {}
            self.reflists = {}
            self.negatives = {}
            self.raw_data = None
            self.lists = {}
            self.plots = {}
            self.proteomicsdb = None
            self.exp_samples = set([])
            self.sources = []
            self.has_cats = set([])
            self.db_dict = {}
            self.pathway_types = []
            self.pathways = {}
            self.vertexAttrs = {}
            self.edgeAttrs = {}
            self.u_pfam = None
            self.seq = None
            self.palette = ['#6EA945', '#007B7F', '#FCCC06',
                            '#DA0025', '#000000']
            self.session = common.gen_session_id()
            self.session_name = ''.join([self.name, '-', self.session])
            self.loglevel = loglevel
            self.ownlog = logn.logw(self.session, self.loglevel)
            self.mapper = mapping.Mapper(self.ncbi_tax_id,
                                         mysql_conf=self.mysql_conf,
                                         log=self.ownlog)
            self.disclaimer = '\n\n\t=== d i s c l a i m e r ===\n\n' \
                '\tAll data coming with this module\n' \
                '\teither as redistributed copy or downloaded using the\n' \
                '\tprogrammatic interfaces included in the present module\n' \
                '\tare available under public domain, are free to use at\n' \
                '\tleast for academic research or education purposes.\n' \
                '\tPlease be aware of the licences of all the datasets\n' \
                '\tyou use in your analysis, and please give appropriate\n' \
                '\tcredits for the original sources when you publish your\n' \
                '\tresults. To find out more about data sources please\n' \
                '\tlook at `pypath.descriptions` and\n' \
                '\t`pypath.data_formats.urls`.\n\n'
            self.licence()
            self.ownlog.msg(1, "PyPath has been initialized")
            self.ownlog.msg(1, "Beginning session '%s'" % self.session)
            sys.stdout.write(
                """\t> New session started,\n\tsession ID: '%s'\n\tlogfile:"""
                """'./%s'.\n""" % (self.session, self.ownlog.logfile))
        else:
            self.copy(copy)

def set_chembl_mysql(self, title, config_file=None):
    """
    Set the ChEMBL MySQL configuration.

    title (str): section title in the ini style config file.
    config_file (str, NoneType): config file name; if None, the
    `mysql_config/defaults.mysql` will be used.
    """
    self.chembl_mysql = (config_file, title)

def save_network(self, pfile=None):
    """Pickle the igraph network to `pfile` (default cache path)."""
    pfile = pfile if pfile is not None \
        else os.path.join('cache', 'default_network.pickle')
    pickle.dump(self.graph, open(pfile, 'wb'))

###
# functions to read networks from text files or mysql
###

def get_max(self, attrList):
    """
    Return the highest column index referenced in an attribute spec.
    Spec values are either indices or (index, separator) tuples.
    """
    highest = 0
    for val in attrList.values():
        if val.__class__ is tuple:
            val = val[0]
        if val > highest:
            highest = val
    return highest

def get_attrs(self, line, spec, lnum):
    """
    Extract extra edge/node attributes from a split input line.

    `spec` maps attribute name -> column number, or -> (column,
    separator) when the field contains multiple subvalues.
    """
    attrs = {}
    for col in spec:
        # extraEdgeAttrs and extraNodeAttrs are dicts of additional
        # parameters assigned to edges and nodes respectively;
        # key is the name of the parameter, value is the col number,
        # or a tuple of col number and the separator,
        # if the column contains additional subfields e.g. (5, ";")
        try:
            if spec[col].__class__ is tuple:
                fieldVal = line[spec[col][0]].split(spec[col][1])
            else:
                fieldVal = line[spec[col]]
        except:
            self.ownlog.msg(
                2,
                ("""Wrong column index (%s) in extra attributes? Line #%u\n"""
                 % (str(col), lnum)), 'ERROR')
            readError = 1
            break
        fieldName = col
        attrs[fieldName] = fieldVal
    return attrs

def get_taxon(self, tax_dict, fields):
    """
    Resolve NCBI Taxonomy ID(s) from an input line via a taxon spec.
    A spec with 'A' and 'B' keys yields a tuple for both interactors;
    otherwise the value of column `col` is looked up in `dict`.
    """
    if 'A' in tax_dict and 'B' in tax_dict:
        return (self.get_taxon(tax_dict['A'], fields),
                self.get_taxon(tax_dict['B'], fields))
    if fields[tax_dict['col']] in tax_dict['dict']:
        return tax_dict['dict'][fields[tax_dict['col']]]
    return None

def numof_references(self):
    """Count the unique references over all edges."""
    return len(common.uniqList(common.flatList(
        list(map(lambda e: e['references'], self.graph.es)))))

def mean_reference_per_interaction(self):
    """Average number of references per edge."""
    return np.mean(
        list(map(lambda e: len(e['references']), self.graph.es)))

def numof_reference_interaction_pairs(self):
    """Count the unique (edge, reference) pairs (curation effort)."""
    return len(common.uniqList(common.flatList(list(map(
        lambda e: list(map(lambda r: (e.index, r), e['references'])),
        self.graph.es)))))

def curators_work(self):
    """Print a rough estimate of the curation effort in work years."""
    curation_effort = self.numof_reference_interaction_pairs()
    sys.stdout.write(
        '\t:: Curators worked %.01f-%.01f years to accomplish '
        'what currently you have incorporated in this network!'
        '\n\n\tAmazing, isn\'t it?\n' %
        (curation_effort * 15 / 60.0 / 2087.0,
         curation_effort * 60 / 60.0 / 2087.0))
    sys.stdout.flush()

def reference_edge_ratio(self):
    """Number of unique references per edge."""
    return self.numof_references() / float(self.graph.ecount())

def update_vname(self):
    """
    Rebuild the name/label <-> index lookup tables for both the
    undirected and the directed graph (nodInd, nodDct, labDct, nodNam,
    nodLab and their 'd'-prefixed counterparts). Called automatically
    after every operation affecting vertex indices.
    """
    self.genesymbol_labels()
    graph = self._get_undirected()
    self._already_has_directed()
    dgraph = self._directed
    if graph is not None:
        n = graph.vcount()
        self.nodInd = set(graph.vs['name'])
        self.nodDct = dict(zip(graph.vs['name'], range(n)))
        self.labDct = dict(zip(graph.vs['label'], range(n)))
        self.nodNam = dict(zip(range(n), graph.vs['name']))
        self.nodLab = dict(zip(range(n), graph.vs['label']))
    if dgraph is not None:
        dn = dgraph.vcount()
        self.dnodInd = set(dgraph.vs['name'])
        self.dnodDct = dict(zip(dgraph.vs['name'], range(dn)))
        self.dlabDct = dict(zip(dgraph.vs['label'], range(dn)))
        self.dnodNam = dict(zip(range(dn), dgraph.vs['name']))
        self.dnodLab = dict(zip(range(dn), dgraph.vs['label']))

def filters(self, line, positiveFilters=None, negativeFilters=None):
    """
    Return True if the line should be DISCARDED.

    A line is discarded when it matches any negative filter, or when
    it fails to match any positive filter. Each filter is a tuple of
    (column index, value or list of values[, separator]); the optional
    separator splits a multi-value field before comparison.
    """
    # fixed: mutable default arguments replaced by None sentinels
    positiveFilters = [] if positiveFilters is None else positiveFilters
    negativeFilters = [] if negativeFilters is None else negativeFilters

    def _field_values(filtr):
        # optional 3rd element of the filter is a field separator
        if len(filtr) > 2:
            return set(line[filtr[0]].split(filtr[2]))
        return set([line[filtr[0]]])

    for filtr in negativeFilters:
        filtrVal = set(filtr[1]
                       if isinstance(filtr[1], list) else [filtr[1]])
        if len(_field_values(filtr) & filtrVal) > 0:
            return True
    for filtr in positiveFilters:
        filtrVal = set(filtr[1]
                       if isinstance(filtr[1], list) else [filtr[1]])
        if len(_field_values(filtr) & filtrVal) == 0:
            return True
    return False

def lookup_cache(self, name, cache_files, int_cache, edges_cache):
    """
    Check the cache files and load raw interactions or already mapped
    edge lists if they are available.

    Returns (infile, edgeListMapped); both fall back to their empty
    values when no cache is found.
    """
    infile = None
    edgeListMapped = []
    cache_file = cache_files[name] if name in cache_files else None
    if cache_file is not None and os.path.exists(cache_file):
        cache_type = cache_file.split('.')[-2]
        if cache_type == 'interactions':
            infile = self.read_from_cache(int_cache)
        elif cache_type == 'edges':
            edgeListMapped = self.read_from_cache(edges_cache)
    elif os.path.exists(edges_cache):
        edgeListMapped = self.read_from_cache(edges_cache)
    elif os.path.exists(int_cache):
        infile = self.read_from_cache(int_cache)
    return infile, edgeListMapped

def read_from_cache(self, cache_file):
    """Unpickle and return previously cached data, logging the event."""
    sys.stdout.write('\t:: Reading from cache: %s\n' % cache_file)
    sys.stdout.flush()
    self.ownlog.msg(2, 'Data have been read from cache: %s' % cache_file)
    return pickle.load(open(cache_file, 'rb'))

def process_sign(self, signData, signDef):
    """
    Process the sign of an interaction; return (stimulation,
    inhibition) booleans. `signDef` is (column, positive value(s),
    negative value(s)[, separator]).
    """
    stim = False
    inh = False
    signSep = signDef[3] if len(signDef) > 3 else None
    signData = set(str(signData).split(signSep))
    pos = set(signDef[1] if isinstance(signDef[1], list) else [signDef[1]])
    neg = set(signDef[2] if isinstance(signDef[2], list) else [signDef[2]])
    if len(signData & pos) > 0:
        stim = True
    elif len(signData & neg) > 0:
        inh = True
    return stim, inh

def process_direction(self, line, dirCol, dirVal, dirSep):
    """
    Return True if the direction column of the line matches any of the
    accepted direction values; False when no direction column is set.
    """
    if dirCol is None or dirVal is None:
        return False
    return len(set(line[dirCol].split(dirSep)) & dirVal) > 0

def read_data_file(self, settings, keep_raw=False, cache_files={},
                   reread=False, redownload=False):
    """
    Interaction data with node and edge attributes can be read from
    simple text based files. This function works not only with files,
    but with lists as well. Any other function can be written to
    download and preprocess data, and then give it to this function to
    finally attach to the network.

    @settings : ReadSettings instance
        The detailed definition of the input format. Instead of the
        file name you can give a function name, which will be
        executed, and the returned data will be used.
    @keep_raw : boolean
        To keep the raw data read by this function, in order for
        debugging purposes, or further use.
    """
    listLike = set([list, tuple])
    edgeList = []
    nodeList = []
    edgeListMapped = []
    infile = None
    _name = settings.name.lower()
    int_cache = os.path.join('cache', '%s.interactions.pickle' % _name)
    edges_cache = os.path.join('cache', '%s.edges.pickle' % _name)
    if not reread and not redownload:
        infile, edgeListMapped = self.lookup_cache(
            _name, cache_files, int_cache, edges_cache)
    if not len(edgeListMapped):
        if infile is None:
            if settings.__class__.__name__ != "ReadSettings":
                self.ownlog.msg(
                    2,
                    ("""No proper input file definition!\n\'settings\' should be a \'ReadSettings\' instance\n"""),
                    'ERROR')
                return None
            if settings.huge:
                sys.stdout.write(
                    '\n\tProcessing %s requires huge memory.\n'
                    '\tPlease hit `y` if you have at least 2G free memory,\n'
                    '\tor `n` to omit %s.\n'
                    '\tAfter processing once, it will be saved in \n'
                    '\t%s, so next time can be loaded quickly.\n\n'
                    '\tProcess %s now? [y/n]\n' %
                    (settings.name, settings.name,
                     edges_cache, settings.name))
                sys.stdout.flush()
                while True:
                    answer = raw_input().lower()
                    if answer == 'n':
                        return None
                    elif answer == 'y':
                        break
                    else:
                        sys.stdout.write('\n\tPlease answer `y` or `n`:\n\t')
                        sys.stdout.flush()
            inputFunc = self.get_function(settings.inFile)
            if inputFunc is None and hasattr(dataio, settings.inFile):
                inputFunc = getattr(dataio, settings.inFile)
            # reading from remote or local file, or executing import
            # function:
            if settings.inFile.startswith('http') or \
                    settings.inFile.startswith('ftp'):
                curl_use_cache = not redownload
                c = curl.Curl(
                    settings.inFile,
                    silent=False, large=True, cache=curl_use_cache)
                infile = c.result
                infile = [x for x in
                          infile.read().replace('\r', '').split('\n')
                          if len(x) > 0]
                self.ownlog.msg(
                    2, "Retrieving data from%s ..." % settings.inFile)
            # elif hasattr(dataio, settings.inFile):
            elif inputFunc is not None:
                self.ownlog.msg(
                    2, "Retrieving data by dataio.%s() ..." %
                    inputFunc.__name__)
                _store_cache = curl.CACHE
                curl.CACHE = not redownload
                # this try-except needs to be removed
                # once correct exception handling will
                # be implemented in every input function
                try:
                    infile = inputFunc(**settings.inputArgs)
                except Exception as e:
                    sys.stdout.write(
                        '\n\t:: Error in `pypath.dataio.%s()`. '
                        'Skipping to next resource.\n' % inputFunc.__name__)
                    sys.stdout.write('\t:: %s\n' % str(e.args))
                    sys.stdout.flush()
                    try:
                        traceback.print_tb(
                            e.__traceback__, file=sys.stdout)
                    except Exception as e:
                        sys.stdout.write(
                            '\t:: Failed handling exception.\n')
                        sys.stdout.write('\t%s\n' % str(e.args))
                        sys.stdout.flush()
                curl.CACHE = _store_cache
            elif os.path.isfile(settings.inFile):
                infile = codecs.open(
                    settings.inFile, encoding='utf-8', mode='r')
                self.ownlog.msg(2, "%s opened..." % settings.inFile)
            if infile is None:
                self.ownlog.msg(
                    2, "%s: No such file or "
                    "dataio function! :(\n" % (settings.inFile), 'ERROR')
                return None
        # finding the largest referred column number,
        # to avoid references out of range
        isDir = settings.isDirected
        sign = settings.sign
        refCol = settings.refs[0] if isinstance(settings.refs, tuple) \
            else settings.refs if isinstance(settings.refs, int) else None
        refSep = settings.refs[1] \
            if isinstance(settings.refs, tuple) else ';'
        sigCol = None if not isinstance(sign, tuple) else sign[0]
        dirCol = None
        dirVal = None
        dirSep = None
        if isinstance(isDir, tuple):
            dirCol = isDir[0]
            dirVal = isDir[1]
            dirSep = isDir[2] if len(isDir) > 2 else None
        elif isinstance(sign, tuple):
            dirCol = sign[0]
            dirVal = sign[1:3]
            dirVal = dirVal if type(dirVal[0]) in common.simpleTypes \
                else common.flatList(dirVal)
            dirSep = sign[3] if len(sign) > 3 else None
        dirVal = set(dirVal if isinstance(dirVal, list) else [dirVal])
        maxCol = max(filter(lambda i: i is not None, [
            settings.nameColA, settings.nameColB,
            self.get_max(settings.extraEdgeAttrs),
            self.get_max(settings.extraNodeAttrsA),
            self.get_max(settings.extraNodeAttrsB),
            refCol, dirCol, sigCol,
            max(itertools.chain(
                map(lambda x: x[0], settings.positiveFilters), [0])),
            max(itertools.chain(
                map(lambda x: x[0], settings.negativeFilters), [0]))]))
        # iterating lines from input file
        lnum = 0
        lFiltered = 0
        rFiltered = 0
        tFiltered = 0
        readError = 0
        for line in infile:
            lnum += 1
            if len(line) <= 1 or (lnum == 1 and settings.header):
                # empty lines
                # or header row
                continue
            if type(line) not in listLike:
                if hasattr(line, 'decode'):
                    line = line.decode('utf-8')
                line = line.replace('\n', '').replace('\r', ''). \
                    split(settings.separator)
            else:
                line = [x.replace('\n', '').replace('\r', '')
                        if hasattr(x, 'replace') else x for x in line]
            # in case line has less fields than needed
            if len(line) < maxCol:
                self.ownlog.msg(
                    2, ('Line #%u has less than %u fields,'
                        ' skipping! :(\n' % (lnum, maxCol)), 'ERROR')
                readError = 1
                continue
            else:
                # applying filters:
                if self.filters(line, settings.positiveFilters,
                                settings.negativeFilters):
                    lFiltered += 1
                    continue
                # reading names and attributes:
                if isDir and not isinstance(isDir, tuple):
                    thisEdgeDir = True
                else:
                    thisEdgeDir = self.process_direction(
                        line, dirCol, dirVal, dirSep)
                refs = []
                if refCol is not None:
                    refs = common.delEmpty(
                        list(set(line[refCol].split(refSep))))
                    refs = dataio.only_pmids([r.strip() for r in refs])
                if len(refs) == 0 and settings.must_have_references:
                    rFiltered += 1
                    continue
                # to give an easy way:
                if isinstance(settings.ncbiTaxId, int):
                    taxA = settings.ncbiTaxId
                    taxB = settings.ncbiTaxId
                # to enable more sophisticated inputs:
                elif isinstance(settings.ncbiTaxId, dict):
                    taxx = self.get_taxon(settings.ncbiTaxId, line)
                    if isinstance(taxx, tuple):
                        taxA = taxx[0]
                        taxB = taxx[1]
                    else:
                        taxA = taxB = taxx
                else:
                    taxA = taxB = self.ncbi_tax_id
                if taxA is None or taxB is None:
                    tFiltered += 1
                    continue
                stim = False
                inh = False
                if isinstance(sign, tuple):
                    stim, inh = self.process_sign(line[sign[0]], sign)
                resource = line[settings.resource] \
                    if isinstance(settings.resource, int) \
                    else settings.resource
                newEdge = {
                    "nameA": line[settings.nameColA].strip(),
                    "nameB": line[settings.nameColB].strip(),
                    "nameTypeA": settings.nameTypeA,
                    "nameTypeB": settings.nameTypeB,
                    "typeA": settings.typeA,
                    "typeB": settings.typeB,
                    "source": resource,
                    "isDirected": thisEdgeDir,
                    "references": refs,
                    "stim": stim,
                    "inh": inh,
                    "taxA": taxA,
                    "taxB": taxB,
                    "type": settings.intType,
                }
                # getting additional edge and node attributes
                attrsEdge = self.get_attrs(
                    line, settings.extraEdgeAttrs, lnum)
                attrsNodeA = self.get_attrs(
                    line, settings.extraNodeAttrsA, lnum)
                attrsNodeB = self.get_attrs(
                    line, settings.extraNodeAttrsB, lnum)
                # merging dictionaries
                nodeAttrs = {
                    "attrsNodeA": attrsNodeA,
                    "attrsNodeB": attrsNodeB,
                    "attrsEdge": attrsEdge,
                }
                newEdge.update(nodeAttrs)
            if readError != 0:
                self.ownlog.msg(
                    2, ('Errors occured, certain lines skipped.'
                        'Trying to read the remaining.\n'), 'ERROR')
                readError = 1
            edgeList.append(newEdge)
        if hasattr(infile, 'close'):
            infile.close()
        # ## !!!! ##
        edgeListMapped = self.map_list(edgeList)
        self.ownlog.msg(
            2, "%u lines have been read from %s,"
            "%u links after mapping; \n\t\t"
            "%u lines filtered by filters;\n\t\t"
            "%u lines filtered because lack of references;\n\t\t"
            "%u lines filtered by taxon filters." %
            (lnum - 1, settings.inFile, len(edgeListMapped),
             lFiltered, rFiltered, tFiltered))
        if reread or redownload:
            pickle.dump(edgeListMapped, open(edges_cache, 'wb'))
            self.ownlog.msg(
                2, 'Mapped edge list saved to %s' % edges_cache)
    if keep_raw:
        self.data[settings.name] = edgeListMapped
    self.raw_data = edgeListMapped

def load_list(self, lst, name):
    """Store `lst` in the named list registry under `name`."""
    self.lists[name] = lst

def receptors_list(self):
    """
    Load the Human Plasma Membrane Receptome as the 'rec' list.
    This resource is human only.
    """
    mapped = [
        self.mapper.map_name(rec, 'genesymbol', 'uniprot',
                             ncbi_tax_id=9606)
        for rec in dataio.get_hpmr()
    ]
    self.lists['rec'] = common.uniqList(common.flatList(mapped))

def druggability_list(self):
    """
    Load the list of druggable proteins from DgiDB as the 'dgb' list.
    This resource is human only.
    """
    mapped = [
        self.mapper.map_name(dgb, 'genesymbol', 'uniprot', 9606)
        for dgb in dataio.get_dgidb()
    ]
    self.lists['dgb'] = common.uniqList(common.flatList(mapped))

def kinases_list(self):
    """
    Load the list of all known kinases in the proteome from kinase.com
    as the 'kin' list. This resource is human only.
    """
    mapped = [
        self.mapper.map_name(kin, 'genesymbol', 'uniprot', 9606)
        for kin in dataio.get_kinases()
    ]
    self.lists['kin'] = common.uniqList(common.flatList(mapped))

def disease_genes_list(self, dataset='curated'):
    """
    Load all disease related genes from DisGeNet as the 'dis' list.
    This resource is human only.
    """
    dis = []
    for di in dataio.get_disgenet(dataset=dataset):
        dis.extend(self.mapper.map_name(
            di['entrez'], 'entrez', 'uniprot', 9606))
    self.lists['dis'] = common.uniqList(dis)

def signaling_proteins_list(self):
    """
    Compile a list of signaling proteins (as opposed to other proteins
    like metabolic enzymes or matrix proteins), by looking up a few
    simple keywords in the short descriptions of GO terms.
    """
    goq = dataio.get_go_quick()
    # GO terms whose name suggests signaling involvement
    gosig = set([])
    for term, name in goq['names'].items():
        if 'signal' in name or 'regulat' in name:
            gosig.add(term)
    upsig = set([])
    if 'proteome' not in self.lists:
        self.proteome_list()
    for up, term in goq['terms']['P'].items():
        if len(term & gosig):
            upsig.add(up)
    # translate to primary UniProt IDs
    spsig = set([])
    for u in upsig:
        spsig.update(set(self.mapper.map_name(
            u, 'uniprot', 'uniprot', ncbi_tax_id=self.ncbi_tax_id)))
    upsig = spsig & set(self.lists['proteome'])
    self.lists['sig'] = list(upsig)

def cancer_drivers_list(self, intogen_file=None):
    """
    Compile the 'cdv' cancer drivers list from the Cancer Gene Census
    and, optionally, IntOGen; also set the boolean 'cdv' vertex
    attribute on the graph.
    """
    self.cancer_gene_census_list()
    if intogen_file is not None:
        self.intogen_cancer_drivers_list(intogen_file=intogen_file)
        self.lists['cdv'] = list(
            set(self.lists['cgc']) | set(self.lists['IntOGen']))
    else:
        self.lists['cdv'] = self.lists['cgc']
    self.graph.vs['cdv'] = list(map(
        lambda v: True if v['name'] in self.lists['cdv'] else False,
        self.graph.vs))

def coverage(self, lst):
    """
    Return the fraction of `lst` present among the network's vertex
    names. `lst` may be a set, a list, or the name of a stored list;
    anything else counts as an empty set.
    """
    lst = lst if isinstance(lst, set) \
        else set(lst) if isinstance(lst, list) \
        else set(self.lists[lst]) \
        if isinstance(lst, str) and lst in self.lists \
        else set([])
    return len(set(self.graph.vs['name']) & lst) / float(len(lst))

def fisher_enrichment(self, lst, attr, ref='proteome'):
    """
    Fisher's exact test for enrichment of the stored list `lst` among
    vertices having a non-empty `attr`, against the `ref` list.
    """
    cont = np.array([
        [len(self.lists[ref]), self.graph.vcount()],
        [len(self.lists[lst]),
         len([1 for v in self.graph.vs if len(v[attr]) > 0])]])
    return stats.fisher_exact(cont)

def read_list_file(self, settings, **kwargs):
    """
    Read a molecule list defined by a `ReadList` settings object, map
    the names to the default ID type and store the result in
    `self.lists` under the resource name.
    """
    _input = None
    if settings.__class__.__name__ != "ReadList":
        self.ownlog.msg(
            2,
            ("""No proper input file definition!\n\'settings\' should be a \'readList\' instance\n"""),
            'ERROR')
        return None
    if hasattr(dataio, settings.inFile):
        toCall = getattr(dataio, settings.inFile)
        _input = toCall(**kwargs)
    elif not os.path.isfile(settings.inFile):
        self.ownlog.msg(
            2, "%s: No such file! :(\n" % (settings.inFile), 'ERROR')
        return None
    else:
        _input = settings.inFile
    originalNameType = settings.nameType
    defaultNameType = self.default_name_type[settings.typ]
    mapTbl = ''.join([originalNameType, "_", defaultNameType])
    if type(_input) in common.charTypes and os.path.isfile(_input):
        _input = codecs.open(_input, encoding='utf-8', mode='r')
    if _input is None:
        self.ownlog.msg(
            2, ("""Could not find '\ 'file or dataio function.\n"""),
            'ERROR')
        return None
    self.ownlog.msg(2, "%s opened..." % settings.inFile)
    # finding the largest referred column number,
    # to avoid references out of index
    maxCol = max([settings.nameCol, self.get_max(settings.extraAttrs)])
    # iterating lines from input file
    lnum = 1
    readError = 0
    itemList = []
    for line in _input:
        if len(line) == 0 or (lnum == 1 and settings.header):
            # empty lines
            # or header row
            lnum += 1
            continue
        if type(line) in common.charTypes:
            line = line.rstrip().split(settings.separator)
        # in case line has less fields than needed
        if len(line) < maxCol:
            self.ownlog.msg(
                2, ("Line #%u has less than %u fields! :(\n" %
                    (lnum, maxCol)), 'ERROR')
            readError = 1
            break
        else:
            # reading names and attributes
            try:
                newItem = {
                    "name": line[settings.nameCol],
                    "nameType": settings.nameType,
                    "type": settings.typ,
                    "source": settings.name,
                }
            except:
                # fixed: the message had four placeholders but only
                # three arguments, which raised TypeError when fired
                self.ownlog.msg(
                    2,
                    ("""Wrong name column index (%u), or wrong separator (%s)? Line #%u\n"""
                     % (settings.nameCol, settings.separator, lnum)),
                    'ERROR')
                readError = 1
                break
            # getting additional attributes
            attrsItem = self.get_attrs(line, settings.extraAttrs, lnum)
            # merging dictionaries
            newItem.update(attrsItem)
        if readError != 0:
            break
        itemList.append(newItem)
        lnum += 1
    if hasattr(_input, 'close'):
        _input.close()
    itemListMapped = self.map_list(itemList, singleList=True)
    itemListMapped = list(set(itemListMapped))
    self.ownlog.msg(
        2, "%u lines have been read from %s, %u '\ items after mapping" %
        (lnum, settings.inFile, len(itemListMapped)))
    self.lists[settings.name] = itemListMapped

def map_list(self, lst, singleList=False):
    """
    Map the names in a list of items (`singleList=True`) or edges;
    only a thin wrapper around `map_item()` / `map_edge()`.
    """
    translate = self.map_item if singleList else self.map_edge
    listMapped = []
    for entry in lst:
        listMapped += translate(entry)
    return listMapped

def map_item(self, item):
    """
    Translate the name in a dict representing one molecule; names that
    could not be mapped are recorded in `self.unmapped`.
    """
    # TODO: include
    defaultNames = self.mapper.map_name(
        item['name'],
        item['nameType'],
        self.default_name_type[item['type']])
    if not defaultNames:
        self.unmapped.append(item['name'])
    return defaultNames

def map_edge(self, edge):
    """
    Translate molecule names in a dict representing an edge.

    Because one name can be mapped ambiguously to multiple IDs, a
    single input edge may yield several output edges (the cartesian
    product of the two mapped name lists). For proteins this does not
    happen too often.
    """
    edgeStack = []
    defaultNameA = self.mapper.map_name(
        edge['nameA'],
        edge['nameTypeA'],
        self.default_name_type[edge['typeA']],
        ncbi_tax_id=edge['taxA'])
    defaultNameB = self.mapper.map_name(
        edge['nameB'],
        edge['nameTypeB'],
        self.default_name_type[edge['typeB']],
        ncbi_tax_id=edge['taxB'])
    for dnA in defaultNameA:
        for dnB in defaultNameB:
            # fixed: previously the SAME dict was appended on every
            # iteration, so with ambiguous mappings all stacked edges
            # ended up holding the last (dnA, dnB) pair; append a copy
            this_edge = dict(edge)
            this_edge['defaultNameA'] = dnA
            this_edge['defaultNameTypeA'] = \
                self.default_name_type[edge['typeA']]
            this_edge['defaultNameB'] = dnB
            this_edge['defaultNameTypeB'] = \
                self.default_name_type[edge['typeB']]
            edgeStack.append(this_edge)
    return edgeStack

def combine_attr(self, lst, num_method=max):
    """
    Combine multiple attribute values into one, choosing the merge
    strategy from the value types:

    * one value, or one of them None: return the available one
    * lists: concatenate unique values
    * numbers: the greater by default, or `num_method()` if given
    * sets: the union
    * dicts: `common.merge_dicts()`
    * Direction objects: their special `merge()` method

    Works on more than 2 attributes recursively.

    :param list lst: List of one or two attribute values.
    :param callable num_method: Method to merge numeric attributes.
    """
    def list_or_set(one, two):
        # harmonize a list/set pair to the same container type
        if (isinstance(one, list) and isinstance(two, set)) or \
                (isinstance(two, list) and isinstance(one, set)):
            try:
                return set(one), set(two)
            except TypeError:
                return list(one), list(two)
        return one, two

    # recursion:
    if len(lst) > 2:
        lst = [lst[0],
               self.combine_attr(lst[1:], num_method=num_method)]
    # quick and simple cases:
    if len(lst) == 0:
        return None
    if len(lst) == 1:
        return lst[0]
    if lst[0] == lst[1]:
        return lst[0]
    if lst[0] is None:
        return lst[1]
    if lst[1] is None:
        return lst[0]
    # merge numeric values:
    if type(lst[0]) in common.numTypes and \
            type(lst[1]) in common.numTypes:
        return num_method(lst)
    # in case one is a list and the other is a set:
    lst[0], lst[1] = list_or_set(lst[0], lst[1])
    # merge lists:
    if isinstance(lst[0], list) and isinstance(lst[1], list):
        try:
            # lists of hashable elements only:
            return list(set(itertools.chain(lst[0], lst[1])))
        except TypeError:
            # if they contain non-hashable elements:
            return list(itertools.chain(lst[0], lst[1]))
    # merge sets:
    if isinstance(lst[0], set):
        return common.addToSet(lst[0], lst[1])
    if isinstance(lst[1], set):
        return common.addToSet(lst[1], lst[0])
    # merge dicts:
    if isinstance(lst[0], dict) and isinstance(lst[1], dict):
        return common.merge_dicts(lst[0], lst[1])
    # 2 different strings: return a set with both of them
    if (isinstance(lst[0], str) or isinstance(lst[0], unicode)) and \
            (isinstance(lst[1], str) or isinstance(lst[1], unicode)):
        if len(lst[0]) == 0:
            return lst[1]
        if len(lst[1]) == 0:
            return lst[0]
        return set([lst[0], lst[1]])
    # one attr is a list, the other a simple value:
    if isinstance(lst[0], list) and type(lst[1]) in common.simpleTypes:
        if lst[1] in common.numTypes or len(lst[1]) > 0:
            return common.addToList(lst[0], lst[1])
        return lst[0]
    if isinstance(lst[1], list) and type(lst[0]) in common.simpleTypes:
        if lst[0] in common.numTypes or len(lst[0]) > 0:
            return common.addToList(lst[1], lst[0])
        return lst[1]
    # special: merging directions
    if lst[0].__class__.__name__ == 'Direction' and \
            lst[1].__class__.__name__ == 'Direction':
        lst[0].merge(lst[1])
        return lst[0]
    # in case the objects have an `__add__()` method:
    if hasattr(lst[0], '__add__'):
        return lst[0] + lst[1]

def collapse_by_name(self, graph=None):
    """
    Collapse vertices sharing the same name, copying and merging all
    their edges and attributes onto a single vertex.
    """
    graph = self.graph if graph is None else graph
    for name, count in Counter(graph.vs['name']).items():
        if count > 1:
            nodes = graph.vs.select(name=name)
            # the number of nodes might have changed meanwhile
            if len(nodes) > 1:
                self.merge_nodes(nodes)

def merge_nodes(self, nodes, primary=None, graph=None):
    """
    Merge all attributes and all edges of the selected vertices and
    assign them to the primary vertex (by default the one with the
    lowest ID).

    :param list nodes: List of vertex IDs (or vertex objects).
    :param int primary: ID of the primary vertex; if None, the lowest
        ID is selected.
    """
    graph = self.graph if graph is None else graph
    nodes = sorted(list(map(
        lambda n: n.index if type(n) is not int else n, nodes)))
    primary = nodes[0] if primary is None else primary
    primary = primary.index if type(primary) is not int else primary
    nonprimary = list(filter(lambda n: n != primary, nodes))
    # stable per-vertex markers surviving index shifts on deletion
    graph.vs['id_merge'] = list(range(graph.vcount()))
    # combining vertex attributes:
    vprim = graph.vs[primary]
    for attr in vprim.attributes():
        if attr != 'name':
            vprim[attr] = self.combine_attr(list(map(
                # combining from all nodes
                lambda vid: graph.vs[vid][attr],
                nodes)))
    # moving edges of non primary vertices to the primary one
    self.copy_edges(nonprimary, primary, move=True, graph=graph)
    # deleting non primary vertices:
    toDel = list(map(
        lambda i: graph.vs.select(id_merge=i)[0].index,
        nonprimary))
    graph.delete_vertices(toDel)
    del graph.vs['id_merge']

def copy_edges(self, sources, target, move=False, graph=None):
    """
    Copies edges of one or more vertices to another vertex,
    keeping attributes and directions.

    :param list sources: Vertex IDs to copy from.
    :param int target: Vertex ID to copy to.
    :param bool move: Whether to perform copy or move, i.e. remove
        or keep the source edges.
    :param igraph.Graph graph: Graph to operate on; defaults to
        ``self.graph``.
    """
    toDel = set([])
    graph = self.graph if graph is None else graph
    # permanent markers surviving reindexing after add/delete
    graph.vs['id_old'] = list(range(graph.vcount()))
    graph.es['id_old'] = list(range(graph.ecount()))
    # preserve a permanent marker of the target vertex
    ovidt = graph.vs[target]['id_old']
    # collecting the edges of all source vertices into dict
    ses = dict(
        (s, set(e.index for e in itertools.chain(
            graph.es.select(_source=s),
            graph.es.select(_target=s))))
        for s in sources
    )
    # collecting edges to be newly created
    toAdd = set([])
    for s, es in iteritems(ses):
        for eid in es:
            # the source edge:
            e = graph.es[eid]
            # looking up if the target edge already exists:
            vid1 = target if e.source == s else e.source
            vid2 = target if e.target == s else e.target
            te = graph.get_eid(vid1, vid2, error=False)
            if te == -1:
                # target edge not found, needs to be added:
                toAdd.add((vid1, vid2))
    # creating new edges
    graph.add_edges(toAdd)
    # copying attributes:
    for ovids, es in iteritems(ses):
        for oeid in es:
            # index of the current source node:
            s = graph.vs.select(id_old=ovids)[0].index
            # index of the current target node:
            t = graph.vs.select(id_old=ovidt)[0].index
            # the current source edge:
            e = graph.es.select(id_old=oeid)[0]
            # looking up target edge and peer vertex:
            vid1 = t if e.source == s else e.source
            vid2 = t if e.target == s else e.target
            vid_peer = e.source if e.target == s else e.target
            te = graph.es[graph.get_eid(vid1, vid2)]
            # old direction:
            d = e['dirs']
            # dict from old names to new ones;
            # the peer does not change, only s -> t
            ids = {
                graph.vs[s]['name']: graph.vs[t]['name'],
                graph.vs[vid_peer]['name']: graph.vs[vid_peer]['name'],
            }
            # copying directions and signs:
            te['dirs'] = (
                d.translate(ids).merge(te['dirs'])
                if isinstance(te['dirs'], Direction)
                else d.translate(ids)
            )
            # copying `refs_by_dir`
            te['refs_by_dir'] = self.translate_refsdir(e['refs_by_dir'], ids)
            # copying further attributes:
            for eattr in e.attributes():
                if eattr != 'dirs' and eattr != 'refs_by_dir':
                    te[eattr] = self.combine_attr([te[eattr], e[eattr]])
            # in case we want to delete old edges:
            toDel.add(e.index)
    if move:
        graph.delete_edges(list(toDel))
    # removing temporary attributes
    del graph.es['id_old']
    del graph.vs['id_old']

def delete_by_taxon(self, tax):
    """
    Removes the vertices of all organisms which are not listed.

    :param list tax: List of NCBI Taxonomy IDs of the organisms
        to keep. E.g. [7227, 9606]
    """
    g = self.graph
    to_delete = [v.index for v in g.vs if v['ncbi_tax_id'] not in tax]
    g.delete_vertices(to_delete)
    self.update_vname()
    self.update_db_dict()

def delete_unknown(self, tax, typ='protein', defaultNameType=None):
    '''
    Removes those molecules which are not in the reference list of
    default IDs of the organisms. By default this means removing
    all protein nodes without a SwissProt ID.

    @tax : list
        List of NCBI Taxonomy IDs of the organisms of interest.
        E.g. [7227, 9606]
    @typ : str
        Molecule type. E.g. 'protein' or 'mirna'
    @defaultNameType : str
        The default name type of the given molecular species.
        For proteins it's 'uniprot' by default.
    '''
    g = self.graph
    if not defaultNameType:
        defaultNameType = self.default_name_type[typ]
    toDel = []
    reflists = {}
    self.update_vname()
    for t in tax:
        idx = (defaultNameType, typ, t)
        if idx in self.reflists:
            reflists[t] = self.reflists[idx].lst
        else:
            msg = ('Missing reference list for %s (default name type: %s), in taxon %u') % (idx[1], idx[0], t)
            self.ownlog.msg(2, msg, 'ERROR')
            sys.stdout.write(''.join(['\t', msg, '\n']))
            return False
    sys.stdout.write(' :: Comparing with reference lists...')
    for t in tax:
        # indices of vertices matching name type, molecule type and taxon
        matching = [
            i for i, (nt, ty, tx) in enumerate(
                zip(g.vs['nameType'], g.vs['type'], g.vs['ncbi_tax_id']))
            if nt == defaultNameType and ty == typ and tx == t
        ]
        names = [g.vs[i]['name'] for i in matching]
        unknown = set(names) - set(reflists[t])
        toDel += [self.nodDct[n] for n in unknown]
    g.delete_vertices(toDel)
    sys.stdout.write(' done.\n')

def add_update_vertex(self, defAttrs, originalName, originalNameType,
                      extraAttrs=None, add=False):
    '''
    Updates the attributes of one node in the network.
    Optionally it creates a new node and sets the attributes,
    but it is not efficient as igraph needs to reindex vertices
    after this operation, so better to create new nodes and
    edges in batch.

    :param dict defAttrs: Default attributes; must contain ``name``.
    :param str originalName: The identifier in the source database.
    :param str originalNameType: The type of the original identifier.
    :param dict extraAttrs: Additional attributes to combine into
        the vertex (default: none).
    :param bool add: Whether to create the vertex if missing.
    '''
    # avoid mutable default argument
    extraAttrs = {} if extraAttrs is None else extraAttrs
    g = self.graph
    if not defAttrs["name"] in g.vs["name"]:
        if not add:
            self.ownlog.msg(2, 'Failed to add some vertices', 'ERROR')
            return False
        n = g.vcount()
        g.add_vertices(1)
        # fixed: original referenced an undefined `key` and never set the
        # vertex name, so the subsequent find() could not succeed
        g.vs[n]['name'] = defAttrs["name"]
        g.vs[n]['originalNames'] = {originalName: originalNameType}
        thisNode = g.vs.find(name=defAttrs["name"])
    else:
        thisNode = g.vs.find(name=defAttrs["name"])
        if thisNode["originalNames"] is None:
            thisNode["originalNames"] = {}
        thisNode["originalNames"][originalName] = originalNameType
    for key, value in iteritems(defAttrs):
        thisNode[key] = value
    for key, value in iteritems(extraAttrs):
        if key not in g.vs.attributes():
            # initialize the new attribute for all vertices
            g.vs[key] = [[] for _ in range(self.graph.vcount())] \
                if isinstance(value, list) else [None]
        thisNode[key] = self.combine_attr([thisNode[key], value])

def get_edge(self, nodes):
    '''
    Returns the edge id only if there is an edge from nodes[0]
    to nodes[1]; returns False if the edge exists only in the
    opposite direction, no edge exists between the two vertices,
    or any of the vertex ids doesn't exist. To find edges without
    regarding their direction, see edge_exists().
    '''
    g = self.graph
    try:
        return g.get_eid(nodes[0], nodes[1])
    # narrowed from a bare `except:` which also swallowed
    # KeyboardInterrupt/SystemExit; igraph raises on missing
    # vertices or edges
    except Exception:
        return False

def straight_between(self, nameA, nameB):
    '''
    This does actually the same as get_edge(), but by names
    instead of vertex ids.
    '''
    name_pair = sorted([nameA, nameB])
    all_names = self.graph.vs['name']
    nodes = [all_names.index(name_pair[0]), all_names.index(name_pair[1])]
    edge = self.get_edge(nodes)
    # NOTE(review): `False` is also an `int` instance, so the
    # `nodes` fallback is effectively unreachable — kept as is
    return edge if isinstance(edge, int) else nodes

def all_between(self, nameA, nameB):
    '''
    Returns all edges between two given vertex names. Similar to
    straight_between(), but checks both directions, and returns
    a dict of edge id lists in [undirected, straight, reversed]
    format, for both nameA -> nameB and nameB -> nameA edges.
    '''
    g = self.graph
    edges = {'ab': [None, None, None], 'ba': [None, None, None]}
    # fixed: original called self.edge_exists(self, nameA, nameB),
    # passing `self` twice (TypeError)
    eid = self.edge_exists(nameA, nameB)
    if isinstance(eid, int):
        if g.es[eid]['dirs'].get_dir('undirected'):
            edges['ab'][0] = eid
            edges['ba'][0] = eid
        if g.es[eid]['dirs'].get_dir((nameA, nameB)):
            edges['ab'][1] = eid
            edges['ba'][2] = eid
        if g.es[eid]['dirs'].get_dir((nameB, nameA)):
            edges['ab'][2] = eid
            edges['ba'][1] = eid
    return edges

def init_vertex_attr(self, attr):
    """
    Fills a vertex attribute with its default values, and wraps
    scalar values in lists if the attribute is registered as a
    list in `vertexAttrs`.
    """
    default = self.vertexAttrs[attr]
    for vertex in self.graph.vs:
        if vertex[attr] is None:
            vertex[attr] = default()
        if default is list and type(vertex[attr]) in common.simpleTypes:
            vertex[attr] = [vertex[attr]] if len(vertex[attr]) > 0 else []

def init_edge_attr(self, attr):
    """
    Fills an edge attribute with its default values, and wraps
    scalar values in lists if the attribute is registered as a
    list in `edgeAttrs`.
    """
    default = self.edgeAttrs[attr]
    for edge in self.graph.es:
        if edge[attr] is None:
            edge[attr] = default()
        if default is list and type(edge[attr]) in common.simpleTypes:
            edge[attr] = [edge[attr]] if len(edge[attr]) > 0 else []

def attach_network(self, edgeList=False, regulator=False):
    """
    Adds edges to the network from an edge list obtained from
    file or other input method.

    @edgeList : list or str or False
        The edge records; falls back to ``self.raw_data``, or a
        key into ``self.data`` when given as a string.
    @regulator : bool
        If True, do not create new target-only nodes.
    """
    g = self.graph
    if not edgeList:
        if self.raw_data is not None:
            edgeList = self.raw_data
        else:
            self.ownlog.msg(2, "attach_network(): No data, nothing to do.",
                            'INFO')
            return True
    if isinstance(edgeList, str):
        if edgeList in self.data:
            edgeList = self.data[edgeList]
        else:
            # NOTE(review): the two adjacent literals concatenate without
            # a space ("no dataavailable") — preserved byte-for-byte
            self.ownlog.msg(2,
                            "`%s' looks like a source name, but no data"
                            "available under this name." % (edgeList),
                            'ERROR')
            return False
    nodes = []
    edges = []
    # adding nodes and edges first in bunch,
    # to avoid multiple reindexing by igraph
    self.update_vname()
    prg = Progress(total=len(edgeList), name="Processing nodes", interval=50)
    for e in edgeList:
        aexists = self.node_exists(e["defaultNameA"])
        bexists = self.node_exists(e["defaultNameB"])
        if not aexists and (not regulator or bexists):
            nodes.append(e["defaultNameA"])
        if not bexists and not regulator:
            nodes.append(e["defaultNameB"])
        prg.step()
    prg.terminate()
    self.new_nodes(set(nodes))
    self.ownlog.msg(2, 'New nodes have been created', 'INFO')
    self.update_vname()
    prg = Progress(total=len(edgeList), name='Processing edges', interval=50)
    for e in edgeList:
        aexists = self.node_exists(e["defaultNameA"])
        bexists = self.node_exists(e["defaultNameB"])
        if aexists and bexists:
            edge = self.edge_exists(e["defaultNameA"], e["defaultNameB"])
            if isinstance(edge, list):
                edges.append(tuple(edge))
        prg.step()
    prg.terminate()
    self.new_edges(set(edges))
    self.ownlog.msg(2, "New edges have been created", 'INFO')
    self.ownlog.msg(2, ("""Introducing new node and edge attributes..."""),
                    'INFO')
    prg = Progress(total=len(edgeList), name="Processing attributes",
                   interval=30)
    nodes_updated = []
    self.update_vname()
    for e in edgeList:
        # adding new node attributes
        if e["defaultNameA"] not in nodes_updated:
            defAttrs = {
                "name": e["defaultNameA"],
                "label": e["defaultNameA"],
                "nameType": e["defaultNameTypeA"],
                "type": e["typeA"],
                "ncbi_tax_id": e["taxA"],
            }
            self.add_update_vertex(defAttrs, e["nameA"], e["nameTypeA"],
                                   e["attrsNodeA"])
            nodes_updated.append(e["defaultNameA"])
        if e["defaultNameB"] not in nodes_updated:
            defAttrs = {
                "name": e["defaultNameB"],
                "label": e["defaultNameB"],
                "nameType": e["defaultNameTypeB"],
                "type": e["typeB"],
                "ncbi_tax_id": e["taxB"],
            }
            self.add_update_vertex(defAttrs, e["nameB"], e["nameTypeB"],
                                   e["attrsNodeB"])
            nodes_updated.append(e["defaultNameB"])
        # adding new edge attributes
        self.add_update_edge(e["defaultNameA"], e["defaultNameB"],
                             e["source"], e["isDirected"], e["references"],
                             e["stim"], e["inh"], e["taxA"], e["taxB"],
                             e["type"], e["attrsEdge"])
        prg.step()
    prg.terminate()
    self.raw_data = None
    self.update_attrs()

def apply_list(self, name, node_or_edge="node"):
    """
    Creates a vertex or edge attribute based on a list or dict
    stored in ``self.lists``.

    :param str name: Key in ``self.lists``; becomes the attribute name.
    :param str node_or_edge: Either ``"node"`` or ``"edge"``.
    """
    if name not in self.lists:
        self.ownlog.msg(1, ("""No such list: %s""" % name), 'ERROR')
        return None
    g = self.graph
    if node_or_edge == "edge":
        g.es[name] = [None]
    else:
        g.vs[name] = [None]
    if isinstance(self.lists[name], dict):
        if node_or_edge == "edge":
            for e in g.es:
                # fixed: original used `v[e.source]["name"]` with `v`
                # undefined (NameError); the vertices live in g.vs
                key_st = (g.vs[e.source]["name"], g.vs[e.target]["name"])
                key_ts = (g.vs[e.target]["name"], g.vs[e.source]["name"])
                if key_st in self.lists[name]:
                    e[name] = self.lists[name][key_st]
                if key_ts in self.lists[name]:
                    e[name] = self.lists[name][key_ts]
        else:
            for v in g.vs:
                if v["name"] in self.lists[name]:
                    v[name] = self.lists[name][v["name"]]
    if isinstance(self.lists[name], list):
        if node_or_edge == "edge":
            for e in g.es:
                e[name] = (g.vs[e.source]["name"],
                           g.vs[e.target]["name"]) in self.lists[name]
        else:
            for v in g.vs:
                v[name] = v["name"] in self.lists[name]

def merge_lists(self, nameA, nameB, name=None, and_or="and", delete=False,
                func="max"):
    """
    Merges two lists stored in ``self.lists`` into a new entry.

    :param str nameA: Key of the first list.
    :param str nameB: Key of the second list.
    :param str name: Key of the result; defaults to ``nameA_nameB``.
    :param str and_or: NOTE(review): ``"and"`` produces the union and
        ``"or"`` the intersection — the naming looks inverted, but is
        preserved as callers may rely on it.
    :param bool delete: Whether to remove the two source lists.
    """
    if nameA not in self.lists:
        self.ownlog.msg(1, ("""No such list: %s""" % nameA), 'ERROR')
        return None
    if nameB not in self.lists:
        self.ownlog.msg(1, ("""No such list: %s""" % nameB), 'ERROR')
        return None
    name = '_'.join([nameA, nameB]) if name is None else name
    if isinstance(self.lists[nameA], list) and \
            isinstance(self.lists[nameB], list):
        if and_or == "and":
            self.lists[name] = list(
                set(self.lists[nameA]) | set(self.lists[nameB]))
        if and_or == "or":
            self.lists[name] = list(
                set(self.lists[nameA]) & set(self.lists[nameB]))
    if isinstance(self.lists[nameA], dict) and \
            isinstance(self.lists[nameB], dict):
        self.lists[name] = {}
        if and_or == "and":
            # fixed: `.keys` was missing the call parentheses (twice),
            # raising TypeError when building the sets
            keys = list(set(self.lists[nameA].keys()) |
                        set(self.lists[nameB].keys()))
            for k in keys:
                if k in self.lists[nameA]:
                    self.lists[name][k] = self.lists[nameA][k]
                if k in self.lists[nameB]:
                    self.lists[name][k] = self.lists[nameB][k]
                if k in self.lists[nameA] and k in self.lists[nameB]:
                    self.lists[name][k] = self.combine_attr(
                        [self.lists[nameA][k], self.lists[nameB][k]])
        if and_or == "or":
            keys = list(set(self.lists[nameA].keys()) &
                        set(self.lists[nameB].keys()))
            for k in keys:
                self.lists[name][k] = self.combine_attr(
                    [self.lists[nameA][k], self.lists[nameB][k]])
    if delete:
        del self.lists[nameA]
        del self.lists[nameB]

def save_session(self):
    """
    Saves the current state into a pickle dump named after the
    session ID.
    """
    pickle_file = "pwnet-" + self.session + ".pickle"
    self.ownlog.msg(1, ("""Saving session to %s... """ % pickle_file),
                    'INFO')
    with open(pickle_file, "wb") as fp:
        pickle.dump(self, fp)

#### functions for plotting // with custom typeface ;)
#### functions to compare networks and pathways

def databases_similarity(self, index='simpson'):
    """
    Computes pairwise similarity of the vertex and edge sets of
    all sources, using the given similarity index.
    """
    g = self.graph
    self.update_sources()
    nodes = dict((s, [v.index for v in g.vs if s in v['sources']])
                 for s in self.sources)
    edges = dict((s, [e.index for e in g.es if s in e['sources']])
                 for s in self.sources)
    sNodes = self.similarity_groups(nodes, index=index)
    sEdges = self.similarity_groups(edges, index=index)
    return {'nodes': sNodes, 'edges': sEdges}

def similarity_groups(self, groups, index='simpson'):
    """
    Computes a symmetric similarity matrix (dict of dicts) over
    `groups`, calling `<index>_index()` from the package's
    `common` module.
    """
    index_func = '%s_index' % index
    # the index functions are defined in the package's `common` module
    common_mod = sys.modules['%s.common' % self.__module__.split('.')[0]]
    if hasattr(common_mod, index_func):
        to_call = getattr(common_mod, index_func)
        grs = sorted(groups.keys())
        sor = dict((grp, {}) for grp in grs)
        for g1 in range(0, len(grs)):
            for g2 in range(g1, len(grs)):
                sor[grs[g1]][grs[g2]] = to_call(groups[grs[g1]],
                                                groups[grs[g2]])
                sor[grs[g2]][grs[g1]] = sor[grs[g1]][grs[g2]]
        return sor
    else:
        self.ownlog.msg(2, 'No such function: %s()' % index_func, 'ERROR')

def sorensen_pathways(self, pwlist=None):
    """
    Groups vertices and edges by pathway membership and computes
    Sorensen similarity between all pathway groups.
    """
    g = self.graph
    if pwlist is None:
        pwlist = self.pathway_types
    for p in pwlist:
        if p not in g.vs.attributes():
            self.ownlog.msg(2, ("No such vertex attribute: %s" % p),
                            'ERROR')
    edges = {}
    nodes = {}
    for e in g.es:
        indA = e.source
        indB = e.target
        pwsA = []
        pwsB = []
        for p in pwlist:
            if g.vs[indA][p] is not None:
                for pw in g.vs[indA][p]:
                    thisPw = p.replace("_pathways", "__") + pw
                    if thisPw not in nodes:
                        nodes[thisPw] = []
                    nodes[thisPw].append(indA)
                    pwsA.append(thisPw)
            if g.vs[indB][p] is not None:
                for pw in g.vs[indB][p]:
                    thisPw = p.replace("_pathways", "__") + pw
                    if thisPw not in nodes:
                        nodes[thisPw] = []
                    nodes[thisPw].append(indB)
                    pwsB.append(thisPw)
        # pathways shared by both endpoints own this edge
        pwsE = set(pwsA).intersection(set(pwsB))
        for pw in pwsE:
            if pw not in edges:
                edges[pw] = []
            edges[pw].append(e.index)
    sNodes = self.sorensen_groups(nodes)
    sEdges = self.sorensen_groups(edges)
    return {"nodes": sNodes, "edges": sEdges}

def write_table(self, tbl, outfile, sep="\t", cut=None, colnames=True,
                rownames=True):
    """
    Writes a dict-of-rows table into a text file under `self.outdir`.
    A "header" key, if present, supplies the column names (and is
    removed from `tbl`).
    """
    out = ''
    # fixed: materialize the keys so .remove() works on Python 3,
    # where dict.keys() is a view
    rn = list(tbl.keys())
    if "header" in rn:
        cn = tbl["header"]
        del tbl["header"]
        rn.remove("header")
    else:
        cn = [str(i) for i in range(0, len(tbl[rn[0]]))]
    if colnames:
        if rownames:
            out += sep
        out += sep.join(cn) + "\n"
    for r in rn:
        if rownames:
            out += str(r)[0:cut] + sep
        thisRow = [str(i) for i in tbl[r]]
        out += sep.join(thisRow) + "\n"
    # fixed: use a context manager instead of manual close()
    with codecs.open(self.outdir + outfile, encoding='utf-8', mode='w') as f:
        f.write(out)

def search_attr_or(self, obj, lst):
    """
    Returns True if `obj` (vertex or edge) matches ANY of the
    attribute criteria in `lst`; an empty criteria dict matches.
    """
    if len(lst) == 0:
        return True
    for a, v in iteritems(lst):
        if (isinstance(v, list) and
                len(set(obj[a]).intersection(v)) > 0) or \
                (not isinstance(v, list) and obj[a] == v):
            return True
    return False

def search_attr_and(self, obj, lst):
    """
    Returns True if `obj` (vertex or edge) matches ALL of the
    attribute criteria in `lst`.
    """
    for a, v in iteritems(lst):
        if (isinstance(v, list) and
                len(set(obj[a]).intersection(v)) == 0) or \
                (not isinstance(v, list) and obj[a] != v):
            return False
    return True

def get_sub(self, crit, andor="or", graph=None):
    """
    Selects vertices and edges by the given criteria.
    Returns a dict with "nodes" (vertex IDs to keep) and "edges"
    (edge IDs to delete — see get_network()).
    """
    g = self.graph if graph is None else graph
    keepV = []
    delE = []
    if andor == "and":
        for e in g.es:
            keepThis = self.search_attr_and(e, crit["edge"])
            if keepThis:
                keepA = self.search_attr_and(g.vs[e.source], crit["node"])
                if keepA:
                    keepV += [e.source]
                keepB = self.search_attr_and(g.vs[e.target], crit["node"])
                if keepB:
                    keepV += [e.target]
                if not keepA or not keepB:
                    delE += [e.index]
            else:
                delE += [e.index]
    else:
        for e in g.es:
            keepThis = self.search_attr_or(e, crit["edge"])
            if keepThis:
                keepV += [e.source, e.target]
                continue
            else:
                delE += [e.index]
            if len(crit["node"]) > 0:
                keepA = self.search_attr_or(g.vs[e.source], crit["node"])
                if keepA:
                    # fixed: original appended to an undefined `keep`
                    keepV += [e.source]
                keepB = self.search_attr_or(g.vs[e.target], crit["node"])
                if keepB:
                    keepV += [e.target]
    return {"nodes": list(set(keepV)), "edges": list(set(delE))}

def edgeseq_inverse(self, edges):
    """
    Returns the IDs of all edges NOT present in `edges`.
    """
    g = self.graph
    # fixed: build the exclusion set once instead of per iteration
    excluded = set(edges)
    return [e.index for e in g.es if e.index not in excluded]

def get_network(self, crit, andor="or", graph=None):
    """
    Returns a copy of the network filtered by the given criteria.
    """
    g = self.graph if graph is None else graph
    sub = self.get_sub(crit, andor=andor, graph=g)
    new = g.copy()
    new.delete_edges(sub["edges"])
    return new.induced_subgraph(sub["nodes"])

def update_sources(self):
    """
    Makes sure that the `sources` attribute is an up to date
    list of all sources in the current network.
    """
    g = self.graph
    collected = []
    for edge in g.es:
        collected.extend(edge["sources"])
    self.sources = list(set(collected))
    self.update_cats()

def update_cats(self):
    """
    Makes sure that the `has_cats` attribute is an up to date
    set of all categories in the current network.
    """
    self.has_cats = set(
        data_formats.categories[s]
        for s in self.sources
        if s in data_formats.categories
    )

def genesymbol_labels(self, graph=None, remap_all=False):
    """
    Creates the vertex attribute ``label`` and fills it with Gene
    Symbols of all proteins where the Gene Symbol can be looked up
    based on the default name of the protein vertex. If the
    attribute ``label`` had already been initialized, updates it,
    or recreates it if ``remap_all`` is ``True``.
    """
    self._already_has_directed()
    if graph is None and self.dgraph is not None:
        # also relabel the directed graph
        self.genesymbol_labels(graph=self.dgraph, remap_all=remap_all)
    g = self.graph if graph is None else graph
    defaultNameType = self.default_name_type["protein"]
    geneSymbol = "genesymbol"
    if 'label' not in g.vs.attributes():
        remap_all = True
    # None marks labels that still need to be resolved
    labels = [
        None if remap_all or v['label'] == v['name'] else v['label']
        for v in g.vs
    ]
    for i, (v, lbl) in enumerate(zip(g.vs, labels)):
        if lbl is None and v['type'] == 'protein':
            mapped = self.mapper.map_name(
                v['name'], defaultNameType, geneSymbol,
                ncbi_tax_id=v['ncbi_tax_id'])
            # fall back to the default name if no Gene Symbol found
            labels[i] = v['name'] if len(mapped) == 0 else mapped[0]
    g.vs['label'] = labels

def network_stats(self, outfile=None):
    '''
    Calculates basic statistics for the whole network and each
    of the sources. Writes the results into a tab separated file.
    '''
    if outfile is None:
        outfile = '-'.join(["pwnet", self.session, "stats"])
    stats = {}
    stats['header'] = ["vnum", "enum", "deg_avg", "diam", "trans", "adh",
                       "coh"]
    # one row per source, plus a final "All" row for the whole network
    for k in range(0, len(self.sources) + 1):
        is_all = k == len(self.sources)
        s = "All" if is_all else self.sources[k]
        g = self.graph if is_all else self.get_network(
            {"edge": {"sources": [s]}, "node": {}})
        if g.vcount() > 0:
            stats[s] = [
                g.vcount(),
                g.ecount(),
                sum(g.vs.degree()) / float(len(g.vs)),
                g.diameter(),
                g.transitivity_undirected(),
                g.adhesion(),
                g.cohesion(),
            ]
    self.write_table(stats, outfile)

def network_filter(self, p=2.0):
    '''
    This function aims to cut the number of edges in the network,
    without loosing nodes, to make the network less connected,
    less hairball-like, more usable for analysis.
    '''
    # reference frequency per source: how many edges each
    # publication supports within that source
    ref_freq = {}
    for s in self.sources:
        ref_freq[s] = {}
        for e in self.graph.es:
            if s in e['sources']:
                for r in e['refs_by_source'][s]:
                    if r not in ref_freq[s]:
                        ref_freq[s][r] = 1
                    else:
                        ref_freq[s][r] += 1
    self.graph.es['score'] = [0.0]
    deg = self.graph.vs.degree()
    avdeg = sum(deg) / len(deg)
    prg = Progress(self.graph.ecount(), 'Calculating scores', 11)
    for e in self.graph.es:
        # rarer references contribute higher scores
        score = 0.0
        for s, rs in iteritems(e['refs_by_source']):
            for r in rs:
                score += 1.0 / ref_freq[s][r]
        mindeg = min(self.graph.vs[e.source].degree(),
                     self.graph.vs[e.target].degree())
        if mindeg < avdeg:
            # penalize edges touching low degree vertices
            score *= pow((mindeg - avdeg), p)
        e['score'] = score
        prg.step()
    prg.terminate()

def shortest_path_dist(self, graph=None, subset=None, outfile=None,
                       **kwargs):
    '''
    subset is a tuple of two lists if you wish to look for paths
    between elements of two groups, or a list if you wish to look
    for shortest paths within this group
    '''
    graph = graph if graph is not None else self.graph
    shortest_paths = []
    # normalize a single list into a (list, list) pair
    subset = subset if isinstance(subset, tuple) or subset is None \
        else (subset, subset)
    prg = Progress(graph.vcount(), 'Calculating paths', 1)
    for i in xrange(0, graph.vcount() - 1):
        if subset is None or i in subset[0] or i in subset[1]:
            # paths from i to every higher-indexed vertex
            paths = graph.get_shortest_paths(
                i, xrange(i + 1, graph.vcount()), **kwargs)
            for j, path in enumerate(paths):
                if subset is None or \
                        (i in subset[0] and i + j + 1 in subset[1]) or \
                        (i in subset[1] and i + j + 1 in subset[0]):
                    shortest_paths.append(len(path))
        prg.step()
    prg.terminate()
    if outfile is not None:
        out = '\n'.join(str(i) for i in shortest_paths)
        with codecs.open(outfile, encoding='utf-8', mode='w') as f:
            f.write(out)
    return shortest_paths

def load_pdb(self, graph=None):
    """
    Fetches the UniProt-PDB mapping and stores, per vertex, a dict
    of PDB IDs to (method, resolution) tuples in ``v['pdb']``.
    """
    graph = graph if graph is not None else self.graph
    u_pdb, pdb_u = dataio.get_pdb()
    if u_pdb is None:
        self.ownlog.msg(2, 'Failed to download UniProt-PDB dictionary',
                        'ERROR')
    else:
        graph.vs['pdb'] = [None]
        for v in graph.vs:
            v['pdb'] = {}
            if v['name'] in u_pdb:
                for pdb in u_pdb[v['name']]:
                    v['pdb'][pdb[0]] = (pdb[1], pdb[2])
        self.ownlog.msg(2, 'PDB IDs for proteins has been retrieved.',
                        'INFO')

def load_pfam(self, graph=None):
    """
    Fetches Pfam domain assignments from UniProt and stores them
    per vertex as a list in ``v['pfam']``.
    """
    graph = graph if graph is not None else self.graph
    u_pfam, pfam_u = dataio.get_pfam(graph.vs['name'])
    if u_pfam is None:
        self.ownlog.msg(2, 'Failed to download Pfam data from UniProt',
                        'ERROR')
    else:
        graph.vs['pfam'] = [None]
        for v in graph.vs:
            v['pfam'] = []
            if v['name'] in u_pfam:
                v['pfam'] += u_pfam[v['name']]
        self.ownlog.msg(2, 'Pfam domains has been retrieved.', 'INFO')

def load_pfam2(self):
    """
    Stores Pfam regions (from ``self.pfam_regions()``) per vertex
    as a dict in ``v['pfam']``.
    """
    self.pfam_regions()
    if self.u_pfam is None:
        self.ownlog.msg(2, 'Failed to download data from Pfam', 'ERROR')
    else:
        self.graph.vs['pfam'] = [{} for _ in self.graph.vs]
        for v in self.graph.vs:
            if v['name'] in self.u_pfam:
                v['pfam'] = self.u_pfam[v['name']]
        self.ownlog.msg(2, 'Pfam domains has been retrieved.', 'INFO')

def load_pfam3(self):
    """
    Stores Pfam regions per vertex as a list of ``intera.Domain``
    objects in ``v['doms']``.
    """
    self.pfam_regions()
    if self.u_pfam is None:
        self.ownlog.msg(2, 'Failed to download data from Pfam', 'ERROR')
    else:
        self.graph.vs['doms'] = [[] for _ in self.graph.vs]
        for v in self.graph.vs:
            if v['name'] in self.u_pfam:
                for pfam, regions in iteritems(self.u_pfam[v['name']]):
                    for region in regions:
                        v['doms'].append(intera.Domain(
                            protein=v['name'],
                            domain=pfam,
                            start=region['start'],
                            end=region['end'],
                            isoform=region['isoform']))
        self.ownlog.msg(2, 'Pfam domains has been retrieved.', 'INFO')

def load_corum(self, graph=None):
    """
    Loads complexes from the CORUM database into the vertex
    attribute `graph.vs['complexes']['corum']`. This resource
    is human only.
    """
    graph = graph if graph is not None else self.graph
    complexes, members = dataio.get_corum()
    if complexes is None:
        self.ownlog.msg(2, 'Failed to download data from CORUM', 'ERROR')
    else:
        self.init_complex_attr(graph, 'corum')
        for u, cs in iteritems(members):
            # map the CORUM UniProt to current SwissProt IDs
            sw = self.mapper.map_name(u, 'uniprot', 'uniprot', 9606)
            for s in sw:
                if s in graph.vs['name']:
                    for c in cs:
                        others = []
                        for memb in complexes[c[0]][0]:
                            others += self.mapper.map_name(
                                memb, 'uniprot', 'uniprot', 9606)
                        graph.vs.select(
                            name=s)[0]['complexes']['corum'][c[1]] = {
                            'full_name': c[0],
                            'all_members': others,
                            'all_members_original': complexes[c[0]][0],
                            'references': c[2],
                            'functions': c[4],
                            'diseases': c[5],
                        }
        self.ownlog.msg(2, 'Complexes from CORUM have been retrieved.',
                        'INFO')

def duniprot(self, uniprot):
    '''
    Same as ``PyPath.uniprot()``, just for the directed graph.
    Returns an ``igraph.Vertex()`` object if the UniProt can be
    found in the default directed network, otherwise ``None``.

    @uniprot : str
        UniProt ID.
    '''
    dgraph = self._get_directed()
    if uniprot in self.dnodDct:
        return dgraph.vs[self.dnodDct[uniprot]]
    return None

dup = duniprot

def uniprots(self, uniprots):
    '''
    Returns a filter of ``igraph.Vertex()`` objects for a list of
    UniProt IDs, omitting those which could not be found in the
    default undirected graph.
    '''
    found = map(self.uniprot, uniprots)
    # keep the lazy filter return type for backwards compatibility
    return filter(lambda v: v is not None, found)

ups = uniprots

def duniprots(self, uniprots):
    '''
    Returns a filter of ``igraph.Vertex()`` objects for a list of
    UniProt IDs, omitting those which could not be found in the
    default directed graph.
    '''
    found = map(self.duniprot, uniprots)
    # keep the lazy filter return type for backwards compatibility
    return filter(lambda v: v is not None, found)

def dprotein(self, identifier):
    '''
    Same as ``PyPath.protein``, just for the directed graph.
    Returns an ``igraph.Vertex()`` object if the identifier is a
    valid vertex index in the default directed graph, or a UniProt
    ID or GeneSymbol which can be found in the default directed
    network, otherwise ``None``.

    @identifier : int, str
        Vertex index (int) or GeneSymbol (str) or UniProt ID (str).
    '''
    dgraph = self._get_directed()
    if isinstance(identifier, int) and identifier < dgraph.vcount():
        return dgraph.vs[identifier]
    if identifier in self.dnodDct:
        return dgraph.vs[self.dnodDct[identifier]]
    if identifier in self.dlabDct:
        return dgraph.vs[self.dlabDct[identifier]]
    return None

def edges_in_comlexes(self, csources=['corum'], graph=None):
    '''
    Creates edge attributes ``complexes`` and ``in_complex``.
    These are both dicts where the keys are complex resources.
    The values in ``complexes`` are the list of complex names
    both the source and the target vertices belong to. The values
    in ``in_complex`` are boolean values whether there is at least
    one complex in the given resource both the source and the
    target vertex of the edge belong to.

    @csources : list
        List of complex resources. Should be already loaded.
    @graph : igraph.Graph()
        The graph object to do the calculations on.
    '''
    graph = graph if graph is not None else self.graph
    if 'complexes' not in graph.es.attributes():
        graph.es['complexes'] = [{} for _ in graph.es]
    if 'in_complex' not in graph.es.attributes():
        graph.es['in_complex'] = [{} for _ in graph.es]

    def _common_complexes(e, cs):
        # fixed: the second operand read e.source twice, so the
        # intersection was always the source vertex's own complexes
        return set(graph.vs[e.source]['complexes'][cs].keys()) & \
            set(graph.vs[e.target]['complexes'][cs].keys())

    # explicit loops: the original wrapped these in `map()`, which is
    # lazy on Python 3 and would silently do nothing
    for cs in csources:
        for e in graph.es:
            e['complexes'][cs] = _common_complexes(e, cs)
            e['in_complex'][cs] = bool(len(e['complexes'][cs]))

def sum_in_complex(self, csources=['corum'], graph=None):
    '''
    Returns the total number of edges in the network falling
    between two members of the same complex, as a dict keyed by
    complex resource. Calls
    :py:func:pypath.pypath.Pypath.edges_in_comlexes()
    to do the calculations.

    @csources : list
        List of complex resources. Should be already loaded.
    @graph : igraph.Graph()
        The graph object to do the calculations on.
    '''
    graph = graph if graph is not None else self.graph
    self.edges_in_comlexes(csources=csources, graph=graph)
    return dict(
        (cs, sum(e['in_complex'][cs] for e in graph.es))
        for cs in csources
    )

def load_ddi(self, ddi):
    '''
    ddi is either a list of intera.DomainDomain objects,
    or a function resulting this list
    '''
    data = ddi if not hasattr(ddi, '__call__') else ddi()
    if data is None:
        if ddi.__module__.split('.')[1] == 'dataio':
            self.ownlog.msg(2, 'Function %s() failed' % ddi, 'ERROR')
        return None
    if 'ddi' not in self.graph.es.attributes():
        self.graph.es['ddi'] = [[] for _ in self.graph.es]
    prg = Progress(len(data), 'Loading domain-domain interactions', 99)
    in_network = 0
    for dd in data:
        prg.step()
        uniprot1 = dd.domains[0].protein
        uniprot2 = dd.domains[1].protein
        if self.node_exists(uniprot1) and self.node_exists(uniprot2):
            e = self.edge_exists(uniprot1, uniprot2)
            if isinstance(e, int):
                if not isinstance(self.graph.es[e]['ddi'], list):
                    self.graph.es[e]['ddi'] = []
                in_network += 1
                self.graph.es[e]['ddi'].append(dd)
    prg.terminate()

def load_dmi(self, dmi):
    '''
    dmi is either a list of intera.DomainMotif objects,
    or a function resulting this list
    '''
    data = dmi if not hasattr(dmi, '__call__') else dmi()
    if data is None:
        if dmi.__module__.split('.')[1] == 'dataio':
            self.ownlog.msg(2, 'Function %s() failed' % dmi, 'ERROR')
        return None
    if 'ptm' not in self.graph.es.attributes():
        self.graph.es['ptm'] = [[] for _ in self.graph.es]

[docs]defprocess_dmi(self,source,**kwargs):''' This is an universal function for loading domain-motif objects like load_phospho_dmi() for PTMs. TODO this will replace load_elm, load_ielm, etc '''functions={'LMPID':'lmpid_dmi'}motif_plus={'LMPID':[]}self.update_vname()toCall=getattr(dataio,functions[source])data=toCall(**kwargs)ifself.seqisNone:self.sequences(self.ncbi_tax_id)ifself.seqisNone:self.sequences(isoforms=True)if'ptm'notinself.graph.es.attributes():self.graph.es['ptm']=[[]for_inself.graph.es]prg=Progress(len(data),'Processing domain-motif interactions from %s'%source,7)fordindata:prg.step()domain_ups=[]motif_upd=[]ifsource=='LMPID':domain_ups=self.mapper.map_name(d['domain_protein'],'uniprot','uniprot')motif_ups=self.mapper.map_name(d['motif_protein'],'uniprot','uniprot')forduindomain_ups:dom=intera.Domain(du,domain=Noneif'domain_name'notindelsed['domain_name'],domain_id_type=Noneif'domain_name_type'notindelsed['domain_name_type'])formuinmotif_ups:ifmuinself.seqandmuinself.nodIndandduinself.nodInd:edge=self.get_edge((self.nodDct[du],self.nodDct[mu]))ifedge:mse=self.seq[mu]isos=[]ifd['instance']isNone:start,end,inst=mse.get_region(start=d['motif_start'],end=d['motif_end'],isoform=1)isos.append((start,end,1,inst))forisoinmse.isoforms():start,end,inst=mse.get_region(start=d['motif_start'],end=d['motif_end'],isoform=iso)ifinst==d['instance']:isos.append((start,