# In this example the k-means clustering method is used to cluster a given toy
# data set. In k-means clustering one tries to partition n observations into k
# clusters in which each observation belongs to the cluster with the nearest mean.
# The algorithm class constructor takes the number of clusters and a distance to
# be used as input. The distance used in this example is Euclidean distance.
# After training, one can fetch the result of clustering by obtaining the
# cluster centers and their radii.
from tools.load import LoadMatrix
from sg import sg
# Toy training matrix, loaded once; it doubles as the default argument of
# the example function below.
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')

# Argument sets for clustering_kmeans: [fm_train, size_cache, k, iter].
parameter_list = [
    [traindat, 10, 3, 1000],
    [traindat, 11, 4, 1500],
]


def clustering_kmeans(fm_train=traindat, size_cache=10, k=3, iter=1000):
    """Run k-means on ``fm_train`` through the static ``sg`` interface.

    The features are registered as 'TRAIN', the distance is set to
    'EUCLIDEAN' on 'REAL' features, and a 'KMEANS' clustering is created
    and trained with ``k`` and ``iter``.

    ``size_cache`` is accepted but not used by this function.

    Returns the list ``[radi, centers]`` unpacked from
    ``sg('get_clustering')``.
    """
    sg('set_features', 'TRAIN', fm_train)
    sg('set_distance', 'EUCLIDEAN', 'REAL')
    sg('new_clustering', 'KMEANS')
    sg('train_clustering', k, iter)

    radi, centers = sg('get_clustering')
    return [radi, centers]


if __name__ == '__main__':
    print('KMeans')
    clustering_kmeans(*parameter_list[0])

Distance

../examples/documented/python_static/distance_braycurtis.py

# An approach as applied below, which shows the processing of input data
# from a file becomes a crucial factor for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'BRAYCURTIS'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two matrices is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceased to exist.
#
# For more details see doc/classshogun_1_1CBrayCurtisDistance.html.
#
# Obviously, using the Bray Curtis distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
from sg import sg
# Real-valued train/test matrices, loaded once and reused as default
# arguments of the example function below.
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

# Argument sets for distance_braycurtis: [fm_train_real, fm_test_real].
parameter_list = [
    [traindat, testdat],
    [traindat, testdat],
]


def distance_braycurtis(fm_train_real=traindat, fm_test_real=testdat):
    """Return the 'BRAYCURTIS' distance matrix obtained for target 'TEST'.

    The distance is set to 'BRAYCURTIS' on 'REAL' features. The 'TRAIN'
    matrix is requested first and its result discarded; only the matrix
    requested after setting the 'TEST' features is returned.
    """
    sg('set_distance', 'BRAYCURTIS', 'REAL')

    # The TRAIN matrix is requested as in the showcase, but not kept.
    sg('set_features', 'TRAIN', fm_train_real)
    sg('get_distance_matrix', 'TRAIN')

    sg('set_features', 'TEST', fm_test_real)
    return sg('get_distance_matrix', 'TEST')


if __name__ == '__main__':
    print('BrayCurtisDistance')
    distance_braycurtis(*parameter_list[0])

../examples/documented/python_static/distance_canberra.py

# An approach as applied below, which shows the processing of input data
# from a file becomes a crucial factor for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'CANBERRA'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (dissimilarity ratio) matrix is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (dissimilarity ratio)
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TEST'. The 'TRAIN' distance matrix ceased to exist.
#
# For more details see doc/classshogun_1_1CCanberraMetric.html.
#
# Obviously, using the Canberra distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
from sg import sg
# Real-valued train/test matrices, loaded once and reused as default
# arguments of the example function below.
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

# Argument sets for distance_canberra: [fm_train_real, fm_test_real].
parameter_list = [
    [traindat, testdat],
    [traindat, testdat],
]


def distance_canberra(fm_train_real=traindat, fm_test_real=testdat):
    """Return the 'CANBERRA' distance matrix obtained for target 'TEST'.

    The distance is set to 'CANBERRA' on 'REAL' features. The 'TRAIN'
    matrix is requested first and its result discarded; only the matrix
    requested after setting the 'TEST' features is returned.
    """
    sg('set_distance', 'CANBERRA', 'REAL')

    # The TRAIN matrix is requested as in the showcase, but not kept.
    sg('set_features', 'TRAIN', fm_train_real)
    sg('get_distance_matrix', 'TRAIN')

    sg('set_features', 'TEST', fm_test_real)
    return sg('get_distance_matrix', 'TEST')


if __name__ == '__main__':
    print('CanberraMetric')
    distance_canberra(*parameter_list[0])

../examples/documented/python_static/distance_canberraword.py

# An approach as applied below, which shows the processing of input data
# from a file becomes a crucial factor for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored data sets in 'STRING' representation
# (feature type 'CHAR' with alphabet 'DNA') from different files and
# initializes the distance to 'CANBERRA' with feature type 'WORD'.
#
# Data points in this example are defined by the transformation function
# 'convert' and the preprocessing step applied afterwards (defined by
# 'add_preproc' and preprocessor 'SORTWORDSTRING').
#
# The target 'TRAIN' for 'set_features' controls the binding of the given
# data points. In order to compute a pairwise distance matrix by
# 'get_distance_matrix', we have to perform two preprocessing steps for
# input data 'TRAIN'. The method 'convert' transforms the input data to
# a string representation suitable for the selected distance. The individual
# strings are sorted in ascending order after the execution of 'attach_preproc'.
# A pairwise distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the binding of the given
# data points 'TRAIN' and 'TEST'. In order to compute a pairwise distance
# matrix between these two data sets by 'get_distance_matrix', we have to
# perform two preprocessing steps for input data 'TEST'. The method 'convert'
# transforms the input data 'TEST' to a string representation suitable for
# the selected distance. The individual strings are sorted in ascending order
# after the execution of 'attach_preproc'. A pairwise distance matrix between
# the data sets 'TRAIN' and 'TEST' is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceased to exist.
#
# For more details see
# doc/classshogun_1_1CSortWordString.html,
# doc/classshogun_1_1CPreprocessor.html,
# doc/classshogun_1_1CStringFeatures.html (method obtain_from_char_features) and
# doc/classshogun_1_1CCanberraWordDistance.html.
#
# Obviously, using the Canberra word distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
from sg import sg
# DNA train/test string sets, loaded once and reused as default arguments
# of the example function below.
lm = LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')

# Argument sets for distance_canberraword:
# [fm_train_dna, fm_test_dna, order, gap, reverse].
parameter_list = [
    [traindna, testdna, 3, 0, 'n'],
    [traindna, testdna, 4, 0, 'n'],
]


def distance_canberraword(fm_train_dna=traindna, fm_test_dna=testdna, order=3,
                          gap=0, reverse='n'):
    """Return the 'CANBERRA' word distance matrix obtained for target 'TEST'.

    The distance is set to 'CANBERRA' on 'WORD' features and the
    'SORTWORDSTRING' preprocessor is added. For 'TRAIN' and then 'TEST',
    the DNA strings are set as features, converted from 'STRING'/'CHAR' to
    'STRING'/'WORD' with ``order``, ``order-1``, ``gap`` and ``reverse``,
    the preprocessor is attached and the distance matrix is requested.

    Only the matrix of the last target ('TEST') is returned.
    """
    sg('set_distance', 'CANBERRA', 'WORD')
    sg('add_preproc', 'SORTWORDSTRING')

    # Same call sequence for both targets; TRAIN must come before TEST.
    for target, dna in (('TRAIN', fm_train_dna), ('TEST', fm_test_dna)):
        sg('set_features', target, dna, 'DNA')
        sg('convert', target, 'STRING', 'CHAR', 'STRING', 'WORD',
           order, order - 1, gap, reverse)
        sg('attach_preproc', target)
        dm = sg('get_distance_matrix', target)
    return dm


if __name__ == '__main__':
    print('CanberraWordDistance')
    distance_canberraword(*parameter_list[0])

../examples/documented/python_static/distance_chebyshew.py

# An approach as applied below, which shows the processing of input data
# from a file becomes a crucial factor for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'CHEBYSHEW'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix (maximum of absolute feature
# dimension differences) is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix (maximum
# of absolute feature dimension differences) between these two data sets is
# computed.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceased to exist.
#
# For more details see doc/classshogun_1_1CChebyshewMetric.html.
#
# Obviously, using the Chebyshew distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
from sg import sg
# Real-valued train/test matrices, loaded once and reused as default
# arguments of the example function below.
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

# Argument sets for distance_chebyshew: [fm_train_real, fm_test_real].
parameter_list = [
    [traindat, testdat],
    [traindat, testdat],
]


def distance_chebyshew(fm_train_real=traindat, fm_test_real=testdat):
    """Return the 'CHEBYSHEW' distance matrix obtained for target 'TEST'.

    The distance is set to 'CHEBYSHEW' on 'REAL' features. The 'TRAIN'
    matrix is requested first and its result discarded; only the matrix
    requested after setting the 'TEST' features is returned.
    """
    sg('set_distance', 'CHEBYSHEW', 'REAL')

    # The TRAIN matrix is requested as in the showcase, but not kept.
    sg('set_features', 'TRAIN', fm_train_real)
    sg('get_distance_matrix', 'TRAIN')

    sg('set_features', 'TEST', fm_test_real)
    return sg('get_distance_matrix', 'TEST')


if __name__ == '__main__':
    print('ChebyshewMetric')
    distance_chebyshew(*parameter_list[0])

../examples/documented/python_static/distance_chisquare.py

# An approach as applied below, which shows the processing of input data
# from a file becomes a crucial factor for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'CHISQUARE'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two matrices is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceased to exist.
#
# For more details see doc/classshogun_1_1CChiSquareDistance.html.
#
# Obviously, using the ChiSquare distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
from sg import sg
# Real-valued train/test matrices, loaded once and reused as default
# arguments of the example function below.
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

# Argument sets for distance_chisquare: [fm_train_real, fm_test_real].
parameter_list = [
    [traindat, testdat],
    [traindat, testdat],
]


def distance_chisquare(fm_train_real=traindat, fm_test_real=testdat):
    """Return the 'CHISQUARE' distance matrix obtained for target 'TEST'.

    The distance is set to 'CHISQUARE' on 'REAL' features. The 'TRAIN'
    matrix is requested first and its result discarded; only the matrix
    requested after setting the 'TEST' features is returned.
    """
    sg('set_distance', 'CHISQUARE', 'REAL')

    # The TRAIN matrix is requested as in the showcase, but not kept.
    sg('set_features', 'TRAIN', fm_train_real)
    sg('get_distance_matrix', 'TRAIN')

    sg('set_features', 'TEST', fm_test_real)
    return sg('get_distance_matrix', 'TEST')


if __name__ == '__main__':
    print('ChiSquareDistance')
    distance_chisquare(*parameter_list[0])

../examples/documented/python_static/distance_cosine.py

# An approach as applied below, which shows the processing of input data
# from a file becomes a crucial factor for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'COSINE'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceased to exist.
#
# For more details see doc/classshogun_1_1CCosineDistance.html.
#
# Obviously, using the Cosine distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
from sg import sg
# Real-valued train/test matrices, loaded once and reused as default
# arguments of the example function below.
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

# Argument sets for distance_cosine: [fm_train_real, fm_test_real].
parameter_list = [
    [traindat, testdat],
    [traindat, testdat],
]


def distance_cosine(fm_train_real=traindat, fm_test_real=testdat):
    """Return the 'COSINE' distance matrix obtained for target 'TEST'.

    The distance is set to 'COSINE' on 'REAL' features. The 'TRAIN'
    matrix is requested first and its result discarded; only the matrix
    requested after setting the 'TEST' features is returned.
    """
    sg('set_distance', 'COSINE', 'REAL')

    # The TRAIN matrix is requested as in the showcase, but not kept.
    sg('set_features', 'TRAIN', fm_train_real)
    sg('get_distance_matrix', 'TRAIN')

    sg('set_features', 'TEST', fm_test_real)
    return sg('get_distance_matrix', 'TEST')


if __name__ == '__main__':
    print('CosineDistance')
    distance_cosine(*parameter_list[0])

../examples/documented/python_static/distance_euclidian.py

# An approach as applied below, which shows the processing of input data
# from a file becomes a crucial factor for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'EUCLIDEAN'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceased to exist.
#
# For more details see doc/classshogun_1_1CEuclidianDistance.html.
#
# Obviously, using the Euclidean distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
from sg import sg
# Real-valued train/test matrices, loaded once and reused as default
# arguments of the example function below.
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

# Argument sets for distance_euclidean: [fm_train_real, fm_test_real].
parameter_list = [
    [traindat, testdat],
    [traindat, testdat],
]


def distance_euclidean(fm_train_real=traindat, fm_test_real=testdat):
    """Return the 'EUCLIDEAN' distance matrix obtained for target 'TEST'.

    The distance is set to 'EUCLIDEAN' on 'REAL' features. The 'TRAIN'
    matrix is requested first and its result discarded; only the matrix
    requested after setting the 'TEST' features is returned.
    """
    sg('set_distance', 'EUCLIDEAN', 'REAL')

    # The TRAIN matrix is requested as in the showcase, but not kept.
    sg('set_features', 'TRAIN', fm_train_real)
    sg('get_distance_matrix', 'TRAIN')

    sg('set_features', 'TEST', fm_test_real)
    return sg('get_distance_matrix', 'TEST')


if __name__ == '__main__':
    print('EuclideanDistance')
    distance_euclidean(*parameter_list[0])

../examples/documented/python_static/distance_geodesic.py

# An approach as applied below, which shows the processing of input data
# from a file becomes a crucial factor for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'GEODESIC'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (shortest path on a sphere) matrix is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (shortest path on
# a sphere) matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceased to exist.
#
# For more details see doc/classshogun_1_1CGeodesicMetric.html.
#
# Obviously, using the Geodesic distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
from sg import sg
# Real-valued train/test matrices, loaded once and reused as default
# arguments of the example function below.
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

# Argument sets for distance_geodesic: [fm_train_real, fm_test_real].
parameter_list = [
    [traindat, testdat],
    [traindat, testdat],
]


def distance_geodesic(fm_train_real=traindat, fm_test_real=testdat):
    """Return the 'GEODESIC' distance matrix obtained for target 'TEST'.

    The distance is set to 'GEODESIC' on 'REAL' features. The 'TRAIN'
    matrix is requested first and its result discarded; only the matrix
    requested after setting the 'TEST' features is returned.
    """
    sg('set_distance', 'GEODESIC', 'REAL')

    # The TRAIN matrix is requested as in the showcase, but not kept.
    sg('set_features', 'TRAIN', fm_train_real)
    sg('get_distance_matrix', 'TRAIN')

    sg('set_features', 'TEST', fm_test_real)
    return sg('get_distance_matrix', 'TEST')


if __name__ == '__main__':
    print('GeodesicMetric')
    distance_geodesic(*parameter_list[0])

../examples/documented/python_static/distance_hammingword.py

# An approach as applied below, which shows the processing of input data
# from a file becomes a crucial factor for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored data sets in 'STRING' representation
# (feature type 'CHAR' with alphabet 'DNA') from different files and
# initializes the distance to 'HAMMING' with feature type 'WORD'.
#
# Data points in this example are defined by the transformation function
# 'convert' and the preprocessing step applied afterwards (defined by
# 'add_preproc' and preprocessor 'SORTWORDSTRING').
#
# The target 'TRAIN' for 'set_features' controls the binding of the given
# data points. In order to compute a pairwise distance matrix by
# 'get_distance_matrix', we have to perform two preprocessing steps for
# input data 'TRAIN'. The method 'convert' transforms the input data to
# a string representation suitable for the selected distance. The individual
# strings are sorted in ascending order after the execution of 'attach_preproc'.
# A pairwise distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the binding of the given
# data points 'TRAIN' and 'TEST'. In order to compute a pairwise distance
# matrix between these two data sets by 'get_distance_matrix', we have to
# perform two preprocessing steps for input data 'TEST'. The method 'convert'
# transforms the input data 'TEST' to a string representation suitable for
# the selected distance. The individual strings are sorted in ascending order
# after the execution of 'attach_preproc'. A pairwise distance matrix between
# the data sets 'TRAIN' and 'TEST' is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceased to exist.
#
# For more details see
# doc/classshogun_1_1CSortWordString.html,
# doc/classshogun_1_1CPreprocessor.html,
# doc/classshogun_1_1CStringFeatures.html (method obtain_from_char_features) and
# doc/classshogun_1_1CHammingWordDistance.html.
#
# Obviously, using the Hamming word distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
from sg import sg
# DNA train/test string sets, loaded once and reused as default arguments
# of the example function below.
lm = LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')

# Argument sets for distance_hammingword:
# [fm_train_dna, fm_test_dna, order, gap, reverse].
parameter_list = [
    [traindna, testdna, 3, 0, 'n'],
    [traindna, testdna, 4, 0, 'n'],
]


def distance_hammingword(fm_train_dna=traindna, fm_test_dna=testdna, order=3,
                         gap=0, reverse='n'):
    """Return the 'HAMMING' word distance matrix obtained for target 'TEST'.

    The distance is set to 'HAMMING' on 'WORD' features and the
    'SORTWORDSTRING' preprocessor is added. For 'TRAIN' and then 'TEST',
    the DNA strings are set as features, converted from 'STRING'/'CHAR' to
    'STRING'/'WORD' with ``order``, ``order-1``, ``gap`` and ``reverse``,
    the preprocessor is attached and the distance matrix is requested.

    Only the matrix of the last target ('TEST') is returned.
    """
    sg('set_distance', 'HAMMING', 'WORD')
    sg('add_preproc', 'SORTWORDSTRING')

    # Same call sequence for both targets; TRAIN must come before TEST.
    for target, dna in (('TRAIN', fm_train_dna), ('TEST', fm_test_dna)):
        sg('set_features', target, dna, 'DNA')
        sg('convert', target, 'STRING', 'CHAR', 'STRING', 'WORD',
           order, order - 1, gap, reverse)
        sg('attach_preproc', target)
        dm = sg('get_distance_matrix', target)
    return dm


if __name__ == '__main__':
    print('HammingWordDistance')
    distance_hammingword(*parameter_list[0])

../examples/documented/python_static/distance_jensen.py

# An approach as applied below, which shows the processing of input data
# from a file becomes a crucial factor for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'JENSEN'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (divergence measure based on the
# Kullback-Leibler divergence) matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (divergence measure
# based on the Kullback-Leibler divergence) matrix between these two data sets
# is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceased to exist.
#
# For more details see doc/classshogun_1_1CJensenMetric.html.
#
# Obviously, using the Jensen-Shannon distance/divergence is not limited to
# this showcase example.
from tools.load import LoadMatrix
from sg import sg
# Real-valued train/test matrices, loaded once and reused as default
# arguments of the example function below.
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

# Argument sets for distance_jensen: [fm_train_real, fm_test_real].
parameter_list = [
    [traindat, testdat],
    [traindat, testdat],
]


def distance_jensen(fm_train_real=traindat, fm_test_real=testdat):
    """Return the 'JENSEN' distance matrix obtained for target 'TEST'.

    The distance is set to 'JENSEN' on 'REAL' features. The 'TRAIN'
    matrix is requested first and its result discarded; only the matrix
    requested after setting the 'TEST' features is returned.
    """
    sg('set_distance', 'JENSEN', 'REAL')

    # The TRAIN matrix is requested as in the showcase, but not kept.
    sg('set_features', 'TRAIN', fm_train_real)
    sg('get_distance_matrix', 'TRAIN')

    sg('set_features', 'TEST', fm_test_real)
    return sg('get_distance_matrix', 'TEST')


if __name__ == '__main__':
    print('JensenMetric')
    distance_jensen(*parameter_list[0])

../examples/documented/python_static/distance_manhatten.py

# An approach as applied below, which shows the processing of input data
# from a file becomes a crucial factor for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'MANHATTAN'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (sum of absolute feature
# dimension differences) matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (sum of absolute
# feature dimension differences) matrix between these two data sets is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceased to exist.
#
# For more details see doc/classshogun_1_1CManhattanMetric.html.
#
# Obviously, using the Manhattan distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
from sg import sg
# Real-valued train/test matrices, loaded once and reused as default
# arguments of the example function below.
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

# Argument sets for distance_manhatten: [fm_train_real, fm_test_real].
parameter_list = [
    [traindat, testdat],
    [traindat, testdat],
]


def distance_manhatten(fm_train_real=traindat, fm_test_real=testdat):
    """Return the 'MANHATTAN' distance matrix obtained for target 'TEST'.

    The distance is set to 'MANHATTAN' on 'REAL' features. The 'TRAIN'
    matrix is requested first and its result discarded; only the matrix
    requested after setting the 'TEST' features is returned.
    """
    sg('set_distance', 'MANHATTAN', 'REAL')

    # The TRAIN matrix is requested as in the showcase, but not kept.
    sg('set_features', 'TRAIN', fm_train_real)
    sg('get_distance_matrix', 'TRAIN')

    sg('set_features', 'TEST', fm_test_real)
    return sg('get_distance_matrix', 'TEST')


if __name__ == '__main__':
    print('ManhattanMetric')
    distance_manhatten(*parameter_list[0])

../examples/documented/python_static/distance_manhattenword.py

# An approach as applied below, which shows the processing of input data
# from a file becomes a crucial factor for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored data sets in 'STRING' representation
# (feature type 'CHAR' with alphabet 'DNA') from different files and
# initializes the distance to 'MANHATTAN' with feature type 'WORD'.
#
# Data points in this example are defined by the transformation function
# 'convert' and the preprocessing step applied afterwards (defined by
# 'add_preproc' and preprocessor 'SORTWORDSTRING').
#
# The target 'TRAIN' for 'set_features' controls the binding of the given
# data points. In order to compute a pairwise distance matrix by
# 'get_distance_matrix', we have to perform two preprocessing steps for
# input data 'TRAIN'. The method 'convert' transforms the input data to
# a string representation suitable for the selected distance. The individual
# strings are sorted in ascending order after the execution of 'attach_preproc'.
# A pairwise distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the binding of the given
# data points 'TRAIN' and 'TEST'. In order to compute a pairwise distance
# matrix between these two data sets by 'get_distance_matrix', we have to
# perform two preprocessing steps for input data 'TEST'. The method 'convert'
# transforms the input data 'TEST' to a string representation suitable for
# the selected distance. The individual strings are sorted in ascending order
# after the execution of 'attach_preproc'. A pairwise distance matrix between
# the data sets 'TRAIN' and 'TEST' is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceased to exist.
#
# For more details see
# doc/classshogun_1_1CSortWordString.html,
# doc/classshogun_1_1CPreprocessor.html,
# doc/classshogun_1_1CStringFeatures.html (method obtain_from_char_features) and
# doc/classshogun_1_1CManhattanWordDistance.html.
#
# Obviously, using the Manhattan word distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
from sg import sg
# DNA train/test string sets, loaded once and reused as default arguments
# of the example function below.
lm = LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')

# Argument sets for distance_manhattenword:
# [fm_train_dna, fm_test_dna, order, gap, reverse].
parameter_list = [
    [traindna, testdna, 3, 0, 'n'],
    [traindna, testdna, 4, 0, 'n'],
]


def distance_manhattenword(fm_train_dna=traindna, fm_test_dna=testdna, order=3,
                           gap=0, reverse='n'):
    """Return the 'MANHATTAN' word distance matrix obtained for target 'TEST'.

    The distance is set to 'MANHATTAN' on 'WORD' features and the
    'SORTWORDSTRING' preprocessor is added. For 'TRAIN' and then 'TEST',
    the DNA strings are set as features, converted from 'STRING'/'CHAR' to
    'STRING'/'WORD' with ``order``, ``order-1``, ``gap`` and ``reverse``,
    the preprocessor is attached and the distance matrix is requested.

    Only the matrix of the last target ('TEST') is returned.
    """
    sg('set_distance', 'MANHATTAN', 'WORD')
    sg('add_preproc', 'SORTWORDSTRING')

    # Same call sequence for both targets; TRAIN must come before TEST.
    for target, dna in (('TRAIN', fm_train_dna), ('TEST', fm_test_dna)):
        sg('set_features', target, dna, 'DNA')
        sg('convert', target, 'STRING', 'CHAR', 'STRING', 'WORD',
           order, order - 1, gap, reverse)
        sg('attach_preproc', target)
        dm = sg('get_distance_matrix', target)
    return dm


if __name__ == '__main__':
    print('ManhattanWordDistance')
    distance_manhattenword(*parameter_list[0])

../examples/documented/python_static/distance_minkowski.py

# An approach as applied below, which shows the processing of input data
# from a file becomes a crucial factor for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'MINKOWSKI' with
# norm 'k'. Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceased to exist.
#
# For more details see doc/classshogun_1_1CMinkowskiMetric.html.
#
# Obviously, using the Minkowski metric is not limited to this showcase
# example.
from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()
traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat,3.],[traindat,testdat,4.]]
def distance_minkowski(fm_train_real=traindat, fm_test_real=testdat, k=3.):
    """Request Minkowski distance matrices through the static `sg` interface.

    The distance is set to 'MINKOWSKI' on 'REAL' features with norm `k`.
    Features are then registered for 'TRAIN' and 'TEST' in turn, and the
    distance matrix is requested for each target right after registration.

    Returns the result of the final `sg('get_distance_matrix', 'TEST')`
    call; the 'TRAIN' result is overwritten.
    """
    sg('set_distance', 'MINKOWSKI', 'REAL', k)

    # Same order as a hand-written sequence: TRAIN first, then TEST.
    for target, features in (('TRAIN', fm_train_real), ('TEST', fm_test_real)):
        sg('set_features', target, features)
        dm = sg('get_distance_matrix', target)

    return dm
if __name__ == '__main__':
    # Script entry point: run the example once with the first parameter set.
    print('MinkowskiMetric')
    first_args = parameter_list[0]
    distance_minkowski(*first_args)

../examples/documented/python_static/distance_tanimoto.py

# The approach applied below shows how input data is processed from a file,
# which is a crucial step when writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'TANIMOTO'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (extended Jaccard coefficient)
# matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (extended
# Jaccard coefficient) matrix between these two data sets is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix then ceases to exist.
#
# For more details see doc/classshogun_1_1CTanimotoDistance.html.
#
# Obviously, using the Tanimoto distance/coefficient is not limited to
# this showcase example.
from tools.load import LoadMatrix
from sg import sg
# Load the real-valued train/test feature matrices used by this example.
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

# Each entry is one positional argument list: [train data, test data].
parameter_list = [
    [traindat, testdat],
    [traindat, testdat],
]
def distance_tanimoto(fm_train_real=traindat, fm_test_real=testdat):
    """Request Tanimoto distance matrices through the static `sg` interface.

    The distance is set to 'TANIMOTO' on 'REAL' features. Features are then
    registered for 'TRAIN' and 'TEST' in turn, and the distance matrix is
    requested for each target right after registration.

    Returns the result of the final `sg('get_distance_matrix', 'TEST')`
    call; the 'TRAIN' result is overwritten.
    """
    sg('set_distance', 'TANIMOTO', 'REAL')

    # Same order as a hand-written sequence: TRAIN first, then TEST.
    for target, features in (('TRAIN', fm_train_real), ('TEST', fm_test_real)):
        sg('set_features', target, features)
        dm = sg('get_distance_matrix', target)

    return dm
if __name__ == '__main__':
    # Script entry point: run the example once with the first parameter set.
    print('TanimotoDistance')
    first_args = parameter_list[0]
    distance_tanimoto(*first_args)

# This example initializes the locality improved string kernel. The locality improved string
# kernel is defined on sequences of the same length and inspects letters matching at
# corresponding positions in both sequences. The kernel sums over all matches in windows of
# length l and takes this sum to the power of 'inner_degree'. The sum over all these
# terms along the sequence is taken to the power of 'outer_degree'.
from tools.load import LoadMatrix
from sg import sg
# Load the DNA train/test sequences and the training labels for this example.
lm = LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
trainlabel = lm.load_labels('../data/label_train_dna.dat')

# Each entry is one positional argument list:
# [train DNA, test DNA, train labels, size_cache, length, inner_degree, outer_degree].
parameter_list = [
    [traindna, testdna, trainlabel, 10, 5, 5, 7],
    [traindna, testdna, trainlabel, 11, 6, 6, 8],
]
def kernel_localityimprovedstring(fm_train_dna=traindna, fm_test_dna=testdna,
                                  label_train_dna=trainlabel, size_cache=10,
                                  length=5, inner_degree=5, outer_degree=7):
    """Request locality improved string kernel matrices via the `sg` interface.

    Registers the 'TRAIN' and 'TEST' features as 'DNA', configures the 'LIK'
    kernel on 'CHAR' data with `size_cache`, `length`, `inner_degree` and
    `outer_degree`, then requests the 'TRAIN' and 'TEST' kernel matrices.

    `label_train_dna` is accepted for interface compatibility but is not
    used inside this function.

    Returns the result of the final `sg('get_kernel_matrix', 'TEST')` call;
    the 'TRAIN' result is overwritten.
    """
    # Both feature sets are registered before the kernel is configured.
    for target, sequences in (('TRAIN', fm_train_dna), ('TEST', fm_test_dna)):
        sg('set_features', target, sequences, 'DNA')

    sg('set_kernel', 'LIK', 'CHAR', size_cache, length, inner_degree, outer_degree)

    for target in ('TRAIN', 'TEST'):
        km = sg('get_kernel_matrix', target)

    return km
if __name__ == '__main__':
    # Script entry point: run the example once with the first parameter set.
    print('LocalityImprovedString')
    first_args = parameter_list[0]
    kernel_localityimprovedstring(*first_args)

# This is an example for the initialization of the PolyMatchString kernel on string data.
# The PolyMatchString kernel sums over the matches of two strings of the same length and
# takes the sum to the power of 'degree'. The strings consist of the characters 'ACGT' corresponding
# to the DNA-alphabet. Each column of the matrices of type char corresponds to
# one training/test example.
from tools.load import LoadMatrix
from sg import sg
# Load the DNA train/test sequences used by this example.
lm = LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')

# Each entry is one positional argument list:
# [train DNA, test DNA, size_cache, degree, inhomogene].
parameter_list = [
    [traindna, testdna, 10, 3, False],
    [traindna, testdna, 11, 4, False],
]
def kernel_polymatchstring(fm_train_dna=traindna, fm_test_dna=testdna,
                           size_cache=10, degree=3, inhomogene=False):
    """Request PolyMatchString kernel matrices through the `sg` interface.

    Registers the 'TRAIN' and 'TEST' features as 'DNA', configures the
    'POLYMATCH' kernel on 'CHAR' data with `size_cache`, `degree` and
    `inhomogene`, then requests the 'TRAIN' and 'TEST' kernel matrices.

    Returns the result of the final `sg('get_kernel_matrix', 'TEST')` call;
    the 'TRAIN' result is overwritten.
    """
    # Both feature sets are registered before the kernel is configured.
    for target, sequences in (('TRAIN', fm_train_dna), ('TEST', fm_test_dna)):
        sg('set_features', target, sequences, 'DNA')

    sg('set_kernel', 'POLYMATCH', 'CHAR', size_cache, degree, inhomogene)

    for target in ('TRAIN', 'TEST'):
        km = sg('get_kernel_matrix', target)

    return km
if __name__ == '__main__':
    # Script entry point: run the example once with the first parameter set.
    print('PolyMatchString')
    first_args = parameter_list[0]
    kernel_polymatchstring(*first_args)

Preproc

../examples/documented/python_static/preproc_logplusone.py

# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# LogPlusOne adds one to a dense real-valued vector and takes the logarithm of
# each component of it. It is most useful in situations where the inputs are
# counts: When one compares differences of small counts any difference may matter
# a lot, while small differences in large counts don't. This is what this log
# transformation controls for.
from tools.load import LoadMatrix
from sg import sg
# Load the real-valued train/test feature matrices used by this example.
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

# Each entry is one positional argument list:
# [train data, test data, width, size_cache].
parameter_list = [
    [traindat, testdat, 1.4, 10],
    [traindat, testdat, 1.5, 11],
]
def preproc_logplusone(fm_train_real=traindat, fm_test_real=testdat,
                       width=1.4, size_cache=10):
    """Request Chi2 kernel matrices on LogPlusOne-preprocessed features.

    Adds the 'LOGPLUSONE' preprocessor and configures the 'CHI2' kernel on
    'REAL' data with `size_cache` and `width`. For 'TRAIN' and then 'TEST',
    the features are registered, the preprocessor is attached to that target
    and the kernel matrix is requested.

    Returns the result of the final `sg('get_kernel_matrix', 'TEST')` call;
    the 'TRAIN' result is overwritten.
    """
    sg('add_preproc', 'LOGPLUSONE')
    sg('set_kernel', 'CHI2', 'REAL', size_cache, width)

    # Same order as a hand-written sequence: TRAIN first, then TEST.
    for target, features in (('TRAIN', fm_train_real), ('TEST', fm_test_real)):
        sg('set_features', target, features)
        sg('attach_preproc', target)
        km = sg('get_kernel_matrix', target)

    return km
if __name__ == '__main__':
    # Script entry point: run the example once with the first parameter set.
    print('LogPlusOne')
    first_args = parameter_list[0]
    preproc_logplusone(*first_args)