'''Sanity check for the Dict of TaxID:Samples'''
# Debug aid: report how many TaxIDs survived filtering and list them sorted.
# Iterating a dict yields its keys, so sorted() over the dict gives the
# sorted key list directly -- no manual counter/append loop needed.
taxids_of_interest = sorted(samples_of_interest)
nr_keys = len(taxids_of_interest)
print("DEBUGGING:\nNr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
print(taxids_of_interest)

sample_names.append(header_cov[sample_count-1])# starting at 0 in arrayifsample_count==len(header_cov)andlen(sample_names)>=args.m:samples_of_interest[cov_taxID]=sample_namesreturn{'SoI':samples_of_interest,'h':header_cov}#return dict()COV.close()PER.close()################################## FILTER I: END 'returns(<'samples_of_interest', type=dict>)'#################################defheader_comparison(header_cov):'''Comparing the sample order in the coverage and all_samples file'''# sort snp_indices by sample_of_interest names

#if header == snp_header_joined:# print("Headers match nicely\n")#else:# print("CAUTION: Header in COV_FILE does not match the order of samples in the SNP_FILES,\n\t no problem, we took care of it!\n")

withopen(best_split_x,'r')asfile:forsnp_lineinfile:#position wise loopsnp_taxID=snp_line.split()[0].split('.')[0]#Name of Genome change from . to ]## SPECIES FILTER:ifsnp_taxIDnotinsamples_of_interest.keys():#Check if Genome is of interest

def genetic_distance(sample1, sample2):
    '''Pairwise genetic distance between two samples.

    Parameters
    ----------
    sample1, sample2 : pd.Series
        Allele frequencies indexed by SNP position.  Duplicated index labels
        carry the multi-allelic positions.
        NOTE(review): the original ``def`` line is not visible in this
        (whitespace-mangled) chunk; name/signature reconstructed from the
        body -- confirm against the full file.

    Returns
    -------
    float
        Sum over positions of the expected probability that the two samples
        carry different alleles.
    '''
    # The expression used to compute dist_nd would compute all the necessary
    # values if applied to sample1/sample2 as-is.  However, the case where
    # there are no duplicates is the majority (often >90% of cases) and can
    # be done much faster, so it is special-cased here:
    sample1nd = sample1.reset_index().drop_duplicates(subset='index', keep=False).set_index('index')
    sample2nd = sample2.reset_index().drop_duplicates(subset='index', keep=False).set_index('index')
    sample2nd = sample2nd.reindex(index=sample1nd.index)
    s1 = sample1nd.values
    s2 = sample2nd.values
    # Only keep positions observed in both samples.
    valid = ~(np.isnan(s1) | np.isnan(s2))
    s1 = s1[valid]
    s2 = s2[valid]
    # Stack each frequency with its complement: row 0 is p, row 1 is (1 - p).
    s1 = np.vstack([s1, 1 - s1])
    s2 = np.vstack([s2, 1 - s2])
    # P(different allele) = p1*(1-p2) + (1-p1)*p2, summed over positions.
    dist_nd = (s1[0] * s2[1] + s1[1] * s2[0]).sum()

    def compute_diversity(x):
        # Full cross-product of the two allele-frequency vectors, minus the
        # matching-allele diagonal, for one (duplicated) position.
        out = np.outer(x.s1.values, x.s2.values)
        return np.nansum(out) - np.nansum(out.diagonal())

    # BUGFIX: ``.ix`` was deprecated in pandas 0.20 and removed in pandas 1.0;
    # ``.loc`` is the label-based equivalent for these label selections.
    sample1d = sample1.loc[sample1.index[sample1.index.duplicated()]]
    sample2d = sample2.loc[sample2.index[sample2.index.duplicated()]]
    if not len(sample1d) or not len(sample2d):
        # No duplicates: the fast path above already covered everything.
        return dist_nd
    both = pd.DataFrame({'s1': sample1d, 's2': sample2d})
    both = both.reset_index()
    # Append the complementary frequencies (1 - sum per position) so the
    # cross-product sees the full per-position allele distribution.
    both = pd.concat([both, (1. - both.groupby('index').sum()).reset_index()])
    dist_d = both.groupby('index', group_keys=False).apply(compute_diversity).sum()
    return dist_d + dist_nd