def rejectionline(n, alpha=0.5):
    '''Reference line for rejection in multiple tests.

    Not used anymore.

    from: section 3.2, page 60
    '''
    # equally spaced fractions 0, 1/n, ..., (n-1)/n
    fraction = np.arange(n) / float(n)
    # rejection reference line evaluated at each fraction
    return fraction / (fraction * (1 - alpha) + alpha)

#I don't remember what I changed or why 2 versions,#this follows german diss ??? with rline#this might be useful if the null hypothesis is not "all effects are zero"#rename to _bak and working again on fdrcorrection0deffdrcorrection_bak(pvals,alpha=0.05,method='indep'):'''Reject False discovery rate correction for pvalues Old version, to be deleted missing: methods that estimate fraction of true hypotheses '''pvals=np.asarray(pvals)pvals_sortind=np.argsort(pvals)pvals_sorted=pvals[pvals_sortind]pecdf=ecdf(pvals_sorted)ifmethodin['i','indep','p','poscorr']:rline=pvals_sorted/alphaelifmethodin['n','negcorr']:cm=np.sum(1./np.arange(1,len(pvals)))rline=pvals_sorted/alpha*cmelifmethodin['g','onegcorr']:#what's this ? german dissrline=pvals_sorted/(pvals_sorted*(1-alpha)+alpha)elifmethodin['oth','o2negcorr']:# other invalid, cut-pastecm=np.sum(np.arange(len(pvals)))rline=pvals_sorted/alpha/cmelse:raiseValueError('method not available')reject=pecdf>=rlineifreject.any():rejectmax=max(np.nonzero(reject)[0])else:rejectmax=0reject[:rejectmax]=Truereturnreject[pvals_sortind.argsort()]

def tiecorrect(xranks):
    '''Tie correction factor for ranked data.

    should be equivalent of scipy.stats.tiecorrect
    '''
    # casting to int rounds down the .5 mean-ranks of tied values, which is
    # harmless here: all members of a tie group still fall in the same bin
    counts = np.bincount(np.asarray(xranks, dtype=int))
    tied = counts[counts > 1]
    n = float(len(xranks))
    return 1 - np.sum(tied ** 3 - tied) / (n ** 3 - n)

class GroupsStats(object):
    '''statistics by groups (another version)

    groupstats as a class with lazy evaluation (not yet - decorators are
    still missing)

    written this time as equivalent of scipy.stats.rankdata
    gs = GroupsStats(X, useranks=True)
    assert_almost_equal(gs.groupmeanfilter, stats.rankdata(X[:,0]), 15)

    TODO: incomplete doc strings

    '''

    def __init__(self, x, useranks=False, uni=None, intlab=None):
        '''descriptive statistics by groups

        Parameters
        ----------
        x : array, 2d
            first column data, second column group labels
        useranks : boolean
            if true, then use ranks as data corresponding to the
            scipy.stats.rankdata definition (start at 1, ties get mean)
        uni, intlab : arrays (optional)
            to avoid call to unique, these can be given as inputs

        '''
        self.x = np.asarray(x)
        # derive the unique labels / integer label codes only when the
        # caller did not supply them
        if intlab is None:
            uni, intlab = np.unique(x[:, 1], return_inverse=True)
        elif uni is None:
            uni = np.unique(x[:, 1])

        self.useranks = useranks
        self.uni = uni
        self.intlab = intlab
        # number of observations per group, indexed by integer label
        self.groupnobs = groupnobs = np.bincount(intlab)

        #temporary until separated and made all lazy
        # NOTE(review): runbasic is defined elsewhere on this class (not in
        # this excerpt); it computes the group statistics eagerly
        self.runbasic(useranks=useranks)

def plot_simultaneous(self, comparison_name=None, ax=None, figsize=(10, 6),
                      xlabel=None, ylabel=None):
    """Plot a universal confidence interval of each group mean

    Visualize significant differences in a plot with one confidence
    interval per group instead of all pairwise confidence intervals.

    Parameters
    ----------
    comparison_name : string, optional
        if provided, plot_intervals will color code all groups that are
        significantly different from the comparison_name red, and will
        color code insignificant groups gray. Otherwise, all intervals
        will just be plotted in black.
    ax : matplotlib axis, optional
        An axis handle on which to attach the plot.
    figsize : tuple, optional
        tuple for the size of the figure generated
    xlabel : string, optional
        Name to be displayed on x axis
    ylabel : string, optional
        Name to be displayed on y axis

    Returns
    -------
    fig : Matplotlib Figure object
        handle to figure object containing interval plots

    Notes
    -----
    Multiple comparison tests are nice, but lack a good way to be
    visualized. If you have, say, 6 groups, showing a graph of the means
    between each group will require 15 confidence intervals. Instead, we
    can visualize inter-group differences with a single interval for
    each group mean. Hochberg et al. [1] first proposed this idea and
    used Tukey's Q critical value to compute the interval widths. Unlike
    plotting the differences in the means and their respective
    confidence intervals, any two pairs can be compared for significance
    by looking for overlap.

    References
    ----------
    .. [1] Hochberg, Y., and A. C. Tamhane. Multiple Comparison
           Procedures. Hoboken, NJ: John Wiley & Sons, 1987.

    Examples
    --------
    >>> from statsmodels.examples.try_tukey_hsd import cylinders, cyl_labels
    >>> from statsmodels.stats.multicomp import MultiComparison
    >>> cardata = MultiComparison(cylinders, cyl_labels)
    >>> results = cardata.tukeyhsd()
    >>> results.plot_simultaneous()
    <matplotlib.figure.Figure at 0x...>

    This example shows an example plot comparing significant differences
    in group means. Significant differences at the alpha=0.05 level can
    be identified by intervals that do not overlap (i.e. USA vs Japan,
    USA vs Germany).

    >>> results.plot_simultaneous(comparison_name="USA")
    <matplotlib.figure.Figure at 0x...>

    Optionally provide one of the group names to color code the plot to
    highlight group means different from comparison_name.
    """
    fig, ax1 = utils.create_mpl_ax(ax)
    if figsize is not None:
        fig.set_size_inches(figsize)
    # lazily compute the simultaneous confidence interval halfwidths
    if getattr(self, 'halfwidths', None) is None:
        self._simultaneous_ci()
    means = self._multicomp.groupstats.groupmean

    sigidx = []
    nsigidx = []
    # lower/upper interval bound per group mean
    minrange = [means[i] - self.halfwidths[i] for i in range(len(means))]
    maxrange = [means[i] + self.halfwidths[i] for i in range(len(means))]

    if comparison_name is None:
        # no reference group: draw every interval in black
        ax1.errorbar(means, lrange(len(means)), xerr=self.halfwidths,
                     marker='o', linestyle='None', color='k', ecolor='k')
    else:
        if comparison_name not in self.groupsunique:
            raise ValueError('comparison_name not found in group names.')
        midx = np.where(self.groupsunique == comparison_name)[0][0]
        for i in range(len(means)):
            if self.groupsunique[i] == comparison_name:
                continue
            # no overlap of the two intervals => significant difference
            if (min(maxrange[i], maxrange[midx]) -
                    max(minrange[i], minrange[midx]) < 0):
                sigidx.append(i)
            else:
                nsigidx.append(i)
        #Plot the master comparison
        ax1.errorbar(means[midx], midx, xerr=self.halfwidths[midx],
                     marker='o', linestyle='None', color='b', ecolor='b')
        # dashed guides at the reference group's interval bounds
        ax1.plot([minrange[midx]] * 2, [-1, self._multicomp.ngroups],
                 linestyle='--', color='0.7')
        ax1.plot([maxrange[midx]] * 2, [-1, self._multicomp.ngroups],
                 linestyle='--', color='0.7')
        #Plot those that are significantly different
        if len(sigidx) > 0:
            ax1.errorbar(means[sigidx], sigidx,
                         xerr=self.halfwidths[sigidx], marker='o',
                         linestyle='None', color='r', ecolor='r')
        #Plot those that are not significantly different
        if len(nsigidx) > 0:
            ax1.errorbar(means[nsigidx], nsigidx,
                         xerr=self.halfwidths[nsigidx], marker='o',
                         linestyle='None', color='0.5', ecolor='0.5')

    ax1.set_title('Multiple Comparisons Between All Pairs (Tukey)')
    # pad the x-range by 10% of the total interval spread
    r = np.max(maxrange) - np.min(minrange)
    ax1.set_ylim([-1, self._multicomp.ngroups])
    ax1.set_xlim([np.min(minrange) - r / 10., np.max(maxrange) + r / 10.])
    # blank label for the y tick at -1
    ax1.set_yticklabels(np.insert(self.groupsunique.astype(str), 0, ''))
    ax1.set_yticks(np.arange(-1, len(means) + 1))
    ax1.set_xlabel(xlabel if xlabel is not None else '')
    ax1.set_ylabel(ylabel if ylabel is not None else '')
    return fig

class MultiComparison(object):
    '''Tests for multiple comparisons

    Parameters
    ----------
    data : array
        independent data samples
    groups : array
        group labels corresponding to each data point
    group_order : list of strings, optional
        the desired order for the group mean results to be reported in. If
        not specified, results are reported in increasing order. If
        group_order does not contain all labels that are in groups, then
        only those observations are kept that have a label in group_order.

    '''

    def __init__(self, data, groups, group_order=None):
        if len(data) != len(groups):
            raise ValueError('data has %d elements and groups has %d' %
                             (len(data), len(groups)))
        self.data = np.asarray(data)
        self.groups = groups = np.asarray(groups)

        # Allow for user-provided sorting of groups
        if group_order is None:
            self.groupsunique, self.groupintlab = np.unique(
                groups, return_inverse=True)
        else:
            #check if group_order has any names not in groups
            for grp in group_order:
                if grp not in groups:
                    raise ValueError(
                        "group_order value '%s' not found in groups" % grp)
            self.groupsunique = np.array(group_order)
            # integer label per observation, position in group_order
            self.groupintlab = np.empty(len(data), int)
            self.groupintlab.fill(-999)  # instead of a nan
            count = 0
            for name in self.groupsunique:
                idx = np.where(self.groups == name)[0]
                count += len(idx)
                self.groupintlab[idx] = np.where(
                    self.groupsunique == name)[0]
            if count != data.shape[0]:
                #raise ValueError('group_order does not contain all groups')
                # warn and keep only observations with label in group_order
                import warnings
                warnings.warn('group_order does not contain all groups:' +
                              ' dropping observations', ValueWarning)

                # observations still at the -999 sentinel have no label in
                # group_order and are dropped
                mask_keep = self.groupintlab != -999
                self.groupintlab = self.groupintlab[mask_keep]
                self.data = self.data[mask_keep]
                self.groups = self.groups[mask_keep]

        if len(self.groupsunique) < 2:
            raise ValueError(
                '2 or more groups required for multiple comparisons')

        # data split into one array per group, in groupsunique order
        self.datali = [self.data[self.groups == k]
                       for k in self.groupsunique]
        self.pairindices = np.triu_indices(len(self.groupsunique), 1)  #tuple
        self.nobs = self.data.shape[0]
        self.ngroups = len(self.groupsunique)

def getranks(self):
    '''convert data to rankdata and attach

    This creates rankdata as it is used for non-parametric tests, where
    in the case of ties the average rank is assigned.
    '''
    # pair each observation with its integer group label and let
    # GroupsStats compute the (tie-averaged) ranks
    stacked = np.column_stack([self.data, self.groupintlab])
    self.ranks = GroupsStats(stacked, useranks=True)
    self.rankdata = self.ranks.groupmeanfilter

def kruskal(self, pairs=None, multimethod='T'):
    '''
    pairwise comparison for kruskal-wallis test

    This is just a reimplementation of scipy.stats.kruskal and does not
    yet use a multiple comparison correction.

    NOTE(review): the ``pairs`` and ``multimethod`` arguments are not
    used anywhere in the body, and the ``return`` sits inside the pair
    loop, so only the p-value of the first pair is returned. Left as-is
    pending the TODOs below.
    '''
    self.getranks()
    tot = self.nobs
    meanranks = self.ranks.groupmean
    groupnobs = self.ranks.groupnobs

    # simultaneous/separate treatment of multiple tests
    f = (tot * (tot + 1.) / 12.) / stats.tiecorrect(self.rankdata)  #(xranks)
    print('MultiComparison.kruskal')
    for i, j in zip(*self.pairindices):
        #pdiff = np.abs(mrs[i] - mrs[j])
        pdiff = np.abs(meanranks[i] - meanranks[j])
        se = np.sqrt(f * np.sum(1. / groupnobs[[i, j]]))  #np.array([8,8]))) #Fixme groupnobs[[i,j]] ))
        Q = pdiff / se

        # TODO : print(statments, fix
        print(i, j, pdiff, se, pdiff / se, pdiff / se > 2.6310)
        print(stats.norm.sf(Q) * 2)
        # returns on the first iteration (see NOTE in the docstring)
        return stats.norm.sf(Q) * 2

def allpairtest(self, testfunc, alpha=0.05, method='bonf', pvalidx=1):
    '''run a pairwise test on all pairs with multiple test correction

    The statistical test given in testfunc is calculated for all pairs
    and the p-values are adjusted by methods in multipletests. The
    p-value correction is generic and based only on the p-values, and
    does not take any special structure of the hypotheses into account.

    Parameters
    ----------
    testfunc : function
        A test function for two (independent) samples. It is assumed
        that the return value on position pvalidx is the p-value.
    alpha : float
        familywise error rate
    method : string
        This specifies the method for the p-value correction. Any
        method of multipletests is possible.
    pvalidx : int (default: 1)
        position of the p-value in the return of testfunc

    Returns
    -------
    sumtab : SimpleTable instance
        summary table for printing

    errors:  TODO: check if this is still wrong, I think it's fixed.
    results from multipletests are in different order
    pval_corrected can be larger than 1 ???
    '''
    # raw test results (statistic, pvalue, ...) for every group pair
    res = []
    for i, j in zip(*self.pairindices):
        res.append(testfunc(self.datali[i], self.datali[j]))
    res = np.array(res)
    # BUG FIX: alpha was hard-coded to 0.05 here, silently ignoring the
    # ``alpha`` argument (while the title below still printed it as the
    # FWER); pass the caller's alpha through to multipletests.
    reject, pvals_corrected, alphacSidak, alphacBonf = \
        multipletests(res[:, pvalidx], alpha=alpha, method=method)
    #print(np.column_stack([res[:,0],res[:,1], reject, pvals_corrected])
    i1, i2 = self.pairindices
    # np.bool_ replaces the np.bool8 alias removed in NumPy 2.0
    if pvals_corrected is None:
        resarr = np.array(lzip(self.groupsunique[i1],
                               self.groupsunique[i2],
                               np.round(res[:, 0], 4),
                               np.round(res[:, 1], 4),
                               reject),
                          dtype=[('group1', object),
                                 ('group2', object),
                                 ('stat', float),
                                 ('pval', float),
                                 ('reject', np.bool_)])
    else:
        resarr = np.array(lzip(self.groupsunique[i1],
                               self.groupsunique[i2],
                               np.round(res[:, 0], 4),
                               np.round(res[:, 1], 4),
                               np.round(pvals_corrected, 4),
                               reject),
                          dtype=[('group1', object),
                                 ('group2', object),
                                 ('stat', float),
                                 ('pval', float),
                                 ('pval_corr', float),
                                 ('reject', np.bool_)])
    from statsmodels.iolib.table import SimpleTable
    results_table = SimpleTable(resarr, headers=resarr.dtype.names)
    results_table.title = (
        'Test Multiple Comparison %s\n%s%4.2f method=%s' %
        (testfunc.__name__, 'FWER=', alpha, method) +
        '\nalphacSidak=%4.2f, alphacBonf=%5.3f' %
        (alphacSidak, alphacBonf))
    return results_table, (res, reject, pvals_corrected, alphacSidak,
                           alphacBonf), resarr

def rankdata(x):
    '''rankdata, equivalent to scipy.stats.rankdata

    just a different implementation, I have not yet compared speed

    Parameters
    ----------
    x : array, 2d
        only the first column ``x[:, 0]`` is ranked

    Returns
    -------
    ranks : ndarray, 1d
        1-based ranks of ``x[:, 0]``; tied values receive their mean rank
    '''
    # BUG FIX: the original computed groupxsum via
    # ``np.bincount(intlab, weights=X[:, 0])`` where ``X`` (capital) is
    # undefined -> NameError on every call; groupxsum/groupxmean were
    # never used afterwards, so the dead computation is removed.
    uni, intlab = np.unique(x[:, 0], return_inverse=True)
    groupnobs = np.bincount(intlab)

    # raw 0-based ranks; the double argsort gives distinct ranks to ties
    rankraw = x[:, 0].argsort().argsort()
    # sum of raw ranks within each tie group
    groupranksum = np.bincount(intlab, weights=rankraw)
    # start at 1 for stats.rankdata; averaging over the tie group yields
    # the mean rank for tied values
    grouprankmean = groupranksum * 1.0 / groupnobs + 1
    return grouprankmean[intlab]

def varcorrection_unbalanced(nobs_all, srange=False):
    '''correction factor for variance with unequal sample sizes

    this is just a harmonic mean

    Parameters
    ----------
    nobs_all : array_like
        The number of observations for each sample
    srange : bool
        if true, then the correction is divided by the number of samples
        for the variance of the studentized range statistic

    Returns
    -------
    correction : float
        Correction factor for variance.

    Notes
    -----
    variance correction factor is

    1/k * sum_i 1/n_i

    where k is the number of samples and summation is over i=0,...,k-1.
    If all n_i are the same, then the correction factor is 1.

    This needs to be multiplied by the joint variance estimate, means
    square error, MSE. To obtain the correction factor for the standard
    deviation, square root needs to be taken.

    '''
    nobs_all = np.asarray(nobs_all)
    # sum of reciprocal sample sizes
    correction = (1. / nobs_all).sum()
    if srange:
        # studentized-range variant: average instead of sum
        correction /= len(nobs_all)
    return correction

def varcorrection_pairs_unbalanced(nobs_all, srange=False):
    '''correction factor for variance with unequal sample sizes for all pairs

    this is just a harmonic mean

    Parameters
    ----------
    nobs_all : array_like
        The number of observations for each sample
    srange : bool
        if true, then the correction is divided by 2 for the variance of
        the studentized range statistic

    Returns
    -------
    correction : array
        Correction factor for variance.

    Notes
    -----
    variance correction factor is

    1/k * sum_i 1/n_i

    where k is the number of samples and summation is over i=0,...,k-1.
    If all n_i are the same, then the correction factor is 1.

    This needs to be multiplied by the joint variance estimate, means
    square error, MSE. To obtain the correction factor for the standard
    deviation, square root needs to be taken.

    For the studentized range statistic, the resulting factor has to be
    divided by 2.

    '''
    # resolves the old TODO: broadcasting replaces the np.meshgrid pair;
    # entry (i, j) is 1/n_i + 1/n_j (symmetric), identical to the former
    # (1./n1 + 1./n2) built from meshgrid
    nobs_inv = 1. / np.asarray(nobs_all, dtype=float)
    correction = nobs_inv[:, None] + nobs_inv[None, :]
    if srange:
        correction /= 2.
    return correction

def varcorrection_unequal(var_all, nobs_all, df_all):
    '''return joint variance from samples with unequal variances and
    unequal sample sizes

    something is wrong

    Parameters
    ----------
    var_all : array_like
        The variance for each sample
    nobs_all : array_like
        The number of observations for each sample
    df_all : array_like
        degrees of freedom for each sample

    Returns
    -------
    varjoint : float
        joint variance.
    dfjoint : float
        joint Satterthwait's degrees of freedom

    Notes
    -----
    (copy, paste not correct)
    This is for variance of mean difference not of studentized range.
    '''
    # per-sample contribution var_i / n_i; float conversion guards
    # against integer division with integer nobs
    contrib = np.asarray(var_all) * 1. / np.asarray(nobs_all)
    varjoint = contrib.sum()
    # Satterthwaite approximation for the joint degrees of freedom
    dfjoint = varjoint ** 2 / np.sum(contrib ** 2 * df_all)
    return varjoint, dfjoint

def varcorrection_pairs_unequal(var_all, nobs_all, df_all):
    '''return joint variance from samples with unequal variances and
    unequal sample sizes for all pairs

    something is wrong

    Parameters
    ----------
    var_all : array_like
        The variance for each sample
    nobs_all : array_like
        The number of observations for each sample
    df_all : array_like
        degrees of freedom for each sample

    Returns
    -------
    varjoint : array
        joint variance.
    dfjoint : array
        joint Satterthwait's degrees of freedom

    Notes
    -----
    (copy, paste not correct)

    TODO: something looks wrong with dfjoint, is formula from SPSS
    '''
    # resolves the old TODO: broadcasting replaces the three np.meshgrid
    # calls; results are identical since both matrices are symmetric in
    # the per-sample terms
    v_over_n = np.asarray(var_all) * 1. / np.asarray(nobs_all)
    # df-weighted squared contributions used by the Satterthwaite formula
    weighted = np.asarray(df_all) * v_over_n ** 2
    # entry (i, j): var_i/n_i + var_j/n_j
    varjoint = v_over_n[:, None] + v_over_n[None, :]
    dfjoint = varjoint ** 2 / (weighted[:, None] + weighted[None, :])
    return varjoint, dfjoint

def tukeyhsd(mean_all, nobs_all, var_all, df=None, alpha=0.05, q_crit=None):
    '''simultaneous Tukey HSD

    check: instead of sorting, I use absolute value of pairwise
    differences in means. That's irrelevant for the test, but maybe
    reporting actual differences would be better.
    CHANGED: meandiffs are with sign, studentized range uses abs

    q_crit added for testing

    TODO: error in variance calculation when nobs_all is scalar, missing 1/n

    '''
    mean_all = np.asarray(mean_all)
    #check if or when other ones need to be arrays

    n_means = len(mean_all)

    if df is None:
        df = nobs_all - 1

    if np.size(df) == 1:   # assumes balanced samples with df = n - 1, n_i = n
        df_total = n_means * df
        df = np.ones(n_means) * df
    else:
        df_total = np.sum(df)

    if (np.size(nobs_all) == 1) and (np.size(var_all) == 1):
        #balanced sample sizes and homogenous variance
        var_pairs = 1. * var_all / nobs_all * np.ones((n_means, n_means))

    elif np.size(var_all) == 1:
        #unequal sample sizes and homogenous variance
        var_pairs = var_all * varcorrection_pairs_unbalanced(nobs_all,
                                                             srange=True)

    elif np.size(var_all) > 1:
        # NOTE(review): varcorrection_pairs_unequal is defined with
        # signature (var_all, nobs_all, df_all) but is called here with
        # (nobs_all, var_all, df) -- the first two arguments appear
        # swapped; confirm intended order before relying on this branch
        var_pairs, df_sum = varcorrection_pairs_unequal(nobs_all, var_all,
                                                        df)
        var_pairs /= 2.
        #check division by two for studentized range

    else:
        raise ValueError('not supposed to be here')

    #meandiffs_ = mean_all[:,None] - mean_all
    meandiffs_ = mean_all - mean_all[:, None]  #reverse sign, check with R example
    std_pairs_ = np.sqrt(var_pairs)

    #select all pairs from upper triangle of matrix
    idx1, idx2 = np.triu_indices(n_means, 1)
    meandiffs = meandiffs_[idx1, idx2]
    std_pairs = std_pairs_[idx1, idx2]

    st_range = np.abs(meandiffs) / std_pairs  #studentized range statistic

    # NOTE(review): df_total_ is computed but unused; get_tukeyQcrit2
    # below receives the unclipped df_total
    df_total_ = max(df_total, 5)  #TODO: smallest df in table
    if q_crit is None:
        q_crit = get_tukeyQcrit2(n_means, df_total, alpha=alpha)

    reject = st_range > q_crit
    crit_int = std_pairs * q_crit
    reject2 = np.abs(meandiffs) > crit_int

    confint = np.column_stack((meandiffs - crit_int, meandiffs + crit_int))

    return (idx1, idx2), reject, meandiffs, std_pairs, confint, q_crit, \
           df_total, reject2


def simultaneous_ci(q_crit, var, groupnobs, pairindices=None):
    """Compute simultaneous confidence intervals for comparison of means.

    q_crit value is generated from tukey hsd test. Variance is considered
    across all groups. Returned halfwidths can be thought of as
    uncertainty intervals around each group mean. They allow for
    simultaneous comparison of pairwise significance among any pairs (by
    checking for overlap)

    Parameters
    ----------
    q_crit : float
        The Q critical value studentized range statistic from Tukey's HSD
    var : float
        The group variance
    groupnobs : array-like object
        Number of observations contained in each group.
    pairindices : tuple of lists, optional
        Indices corresponding to the upper triangle of matrix. Computed
        here if not supplied

    Returns
    -------
    halfwidths : ndarray
        Half the width of each confidence interval for each group given
        in groupnobs

    See Also
    --------
    MultiComparison : statistics class providing significance tests
    tukeyhsd : among other things, computes q_crit value

    References
    ----------
    .. [1] Hochberg, Y., and A. C. Tamhane. Multiple Comparison
           Procedures. Hoboken, NJ: John Wiley & Sons, 1987.
    """
    # Set initial variables
    ng = len(groupnobs)
    if pairindices is None:
        pairindices = np.triu_indices(ng, 1)

    # Compute dij for all pairwise comparisons ala hochberg p. 95
    gvar = var / groupnobs

    d12 = np.sqrt(gvar[pairindices[0]] + gvar[pairindices[1]])

    # Create the full d matrix given all known dij vals
    d = np.zeros((ng, ng))
    d[pairindices] = d12
    d = d + d.conj().T

    # Compute the two global sums from hochberg eq 3.32
    sum1 = np.sum(d12)
    sum2 = np.sum(d, axis=0)

    if (ng > 2):
        w = ((ng - 1.) * sum2 - sum1) / ((ng - 1.) * (ng - 2.))
    else:
        w = sum1 * np.ones((2, 1)) / 2.

    return (q_crit / np.sqrt(2)) * w

defcontrast_allpairs(nm):'''contrast or restriction matrix for all pairs of nm variables Parameters ---------- nm : int Returns ------- contr : ndarray, 2d, (nm*(nm-1)/2, nm) contrast matrix for all pairwise comparisons '''contr=[]foriinrange(nm):forjinrange(i+1,nm):contr_row=np.zeros(nm)contr_row[i]=1contr_row[j]=-1contr.append(contr_row)returnnp.array(contr)defcontrast_all_one(nm):'''contrast or restriction matrix for all against first comparison Parameters ---------- nm : int Returns ------- contr : ndarray, 2d, (nm-1, nm) contrast matrix for all against first comparisons '''contr=np.column_stack((np.ones(nm-1),-np.eye(nm-1)))returncontrdefcontrast_diff_mean(nm):'''contrast or restriction matrix for all against mean comparison Parameters ---------- nm : int Returns ------- contr : ndarray, 2d, (nm-1, nm) contrast matrix for all against mean comparisons '''returnnp.eye(nm)-np.ones((nm,nm))/nmdeftukey_pvalues(std_range,nm,df):#corrected but very slow with warnings about integrationfromstatsmodels.sandbox.distributions.multivariateimportmvstdtprob#nm = len(std_range)contr=contrast_allpairs(nm)corr=np.dot(contr,contr.T)/2.tstat=std_range/np.sqrt(2)*np.ones(corr.shape[0])#need len of all pairsreturnmulticontrast_pvalues(tstat,corr,df=df)deftest_tukey_pvalues():#testcase with 3 is not good because all pairs has also 3*(3-1)/2=3 elementsres=tukey_pvalues(3.649,3,16)#3.649*np.ones(3), 16)assert_almost_equal(0.05,res[0],3)assert_almost_equal(0.05*np.ones(3),res[1],3)defmulticontrast_pvalues(tstat,tcorr,df=None,dist='t',alternative='two-sided'):'''pvalues for simultaneous tests '''fromstatsmodels.sandbox.distributions.multivariateimportmvstdtprobif(dfisNone)and(dist=='t'):raiseValueError('df has to be specified for the t-distribution')tstat=np.asarray(tstat)ntests=len(tstat)cc=np.abs(tstat)pval_global=1-mvstdtprob(-cc,cc,tcorr,df)pvals=[]fortiincc:limits=ti*np.ones(ntests)pvals.append(1-mvstdtprob(-cc,cc,tcorr,df))returnpval_global,np.asarray(pvals)

class StepDown(object):
    '''a class for step down methods

    This is currently for simple tree subset descend, similar to
    homogeneous_subsets, but checks all leave-one-out subsets instead of
    assuming an ordered set. Comment in SAS manual: SAS only uses
    interval subsets of the sorted list, which is sufficient for range
    tests (maybe also equal variance and balanced sample sizes are
    required). For F-test based critical distances, the restriction to
    intervals is not sufficient.

    This version uses a single critical value of the studentized range
    distribution for all comparisons, and is therefore a step-down
    version of Tukey HSD. The class is written so it can be subclassed,
    where the get_distance_matrix and get_crit are overwritten to obtain
    other step-down procedures such as REGW.

    iter_subsets can be overwritten, to get a recursion as in the many
    to one comparison with a control such as in Dunnet's test.

    A one-sided right tail test is not covered because the direction of
    the inequality is hard coded in check_set. Also Peritz's check of
    partitions is not possible, but I have not seen it mentioned in any
    more recent references. I have only partially read the step-down
    procedure for closed tests by Westfall.

    One change to make it more flexible, is to separate out the decision
    on a subset, also because the F-based tests, FREGW in SPSS, take
    information from all elements of a set and not just pairwise
    comparisons. I haven't looked at the details of the F-based tests
    such as Sheffe yet. It looks like running an F-test on equality of
    means in each subset. This would also outsource how pairwise
    conditions are combined, any larger or max. This would also imply
    that the distance matrix cannot be calculated in advance for tests
    like the F-based ones.

    '''

    def __init__(self, vals, nobs_all, var_all, df=None):
        # group values (e.g. means) and their per-group sample info;
        # all test state (crit, accepted, rejected) is initialized in run
        self.vals = vals
        self.n_vals = len(vals)
        self.nobs_all = nobs_all
        self.var_all = var_all
        self.df = df
        # the following has been moved to run
        #self.cache_result = {}
        #self.crit = self.getcrit(0.5)  #decide where to set alpha, moved to run
        #self.accepted = []  #store accepted sets, not unique

def stepdown(self, indices):
    '''recursively step down into subsets of ``indices``

    Rejects a pair directly, descends into leave-one-out subsets for
    larger sets whose distance exceeds the critical value, and records
    accepted sets. Relies on check_set and iter_subsets, which are
    defined elsewhere (and meant to be overridden by subclasses).
    '''
    print(indices)
    if self.check_set(indices):  # larger than critical distance
        if (len(indices) > 2):  # step down into subsets if more than 2 elements
            for subs in self.iter_subsets(indices):
                self.stepdown(subs)
        else:
            self.rejected.append(tuple(indices))
    else:
        self.accepted.append(tuple(indices))
        return indices

def run(self, alpha):
    '''main function to run the test,

    could be done in __call__ instead
    this could have all the initialization code

    Parameters
    ----------
    alpha : float
        significance level used to obtain the critical value

    Returns
    -------
    accepted, rejected : list of tuples
        unique accepted and rejected index sets collected by stepdown
    '''
    self.cache_result = {}
    self.crit = self.get_crit(alpha)  #decide where to set alpha, moved to run
    self.accepted = []  #store accepted sets, not unique
    self.rejected = []
    self.get_distance_matrix()
    self.stepdown(lrange(self.n_vals))
    # BUG FIX: the original returned ``list(set(sd.rejected))`` where
    # ``sd`` is an undefined name (NameError on every call); the
    # rejected sets accumulated by stepdown live on self.
    return list(set(self.accepted)), list(set(self.rejected))

def set_partition(ssli):
    '''extract a partition from a list of tuples

    this should be correctly called select largest disjoint sets.
    Begun and Gabriel 1981 don't seem to be bothered by sets of accepted
    hypothesis with joint elements, e.g.
    maximal_accepted_sets = { {1,2,3}, {2,3,4} }

    This creates a set partition from a list of sets given as tuples.
    It tries to find the partition with the largest sets. That is, sets
    are included after being sorted by length.

    If the list doesn't include the singletons, then it will be only a
    partial partition. Missing items are singletons (I think).

    Examples
    --------
    >>> li
    [(5, 6, 7, 8), (1, 2, 3), (4, 5), (0, 1)]
    >>> set_partition(li)
    ([(5, 6, 7, 8), (1, 2, 3)], [0, 4])

    '''
    # greedily keep the longest tuples whose elements are disjoint from
    # everything already kept ([::-1] keeps the original tie ordering)
    part = []
    for candidate in sorted(set(ssli), key=len)[::-1]:
        cand_set = set(candidate)
        if all(cand_set.isdisjoint(kept) for kept in part):
            part.append(candidate)
    # elements that appear in the input but in no kept tuple
    everything = set(i for tup in ssli for i in tup)
    covered = set(i for tup in part for i in tup)
    missing = list(everything - covered)
    return part, missing

def set_remove_subs(ssli):
    '''remove sets that are subsets of another set from a list of tuples

    Parameters
    ----------
    ssli : list of tuples
        each tuple is considered as a set

    Returns
    -------
    part : list of tuples
        new list with subset tuples removed, it is sorted by set-length
        of tuples. The list contains original tuples, duplicate elements
        are not removed.

    Examples
    --------
    >>> set_remove_subs([(0, 1), (1, 2), (1, 2, 3), (0,)])
    [(1, 2, 3), (0, 1)]
    >>> set_remove_subs([(0, 1), (1, 2), (1,1, 1, 2, 3), (0,)])
    [(1, 1, 1, 2, 3), (0, 1)]

    '''
    #TODO: maybe convert all tuples to sets immediately, but I don't need the extra efficiency
    # scan tuples from largest set-size downwards ([::-1] keeps the
    # original tie ordering) and keep only those not contained in an
    # already-kept tuple
    kept = []
    for tup in sorted(set(ssli), key=lambda t: len(set(t)))[::-1]:
        if not any(set(tup) <= set(other) for other in kept):
            kept.append(tup)
    return kept