"""classifier evaluation using scikit-learn
more details at:
http://scikit-learn.org/stable/modules/cross_validation.html
http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html
"""

import numpy as np
from sklearn import cross_validation
from sklearn import metrics
import perceptron2
# Walk-through of several ways to evaluate a classifier with scikit-learn:
# a single train/test split, k-fold cross-validation, and explicit
# stratified folds.
#
# NOTE: sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20. This script keeps the old API it was written against; on
# newer releases the equivalents live in sklearn.model_selection (where
# StratifiedKFold takes n_splits instead of the labels).

# Load the dataset: column 0 is used as the labels, the remaining columns
# as the features.
data = np.genfromtxt("../data/heart_scale.data", delimiter=",")
X = data[:, 1:]
y = data[:, 0]

# let's train/test a perceptron on the heart dataset:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.4, random_state=0)
classifier = perceptron2.Perceptron()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# let's compute the accuracy of the classifier:
# (dividing by a float keeps this a true division even on Python 2, where
# int / int would truncate the accuracy to 0 or 1)
num_correct = np.count_nonzero(np.equal(y_pred, y_test))
print(num_correct / float(len(y_test)))

# you can get the same result using scikit-learn:
print(metrics.accuracy_score(y_test, y_pred))

# now let's use cross-validation instead:
print(cross_validation.cross_val_score(
    classifier, X, y, cv=5, scoring='accuracy'))

# you can also score with other metrics, such as area under the roc curve:
print(cross_validation.cross_val_score(
    classifier, X, y, cv=5, scoring='roc_auc'))

# you can also obtain the predictions by cross-validation and then compute
# the accuracy:
y_predict = cross_validation.cross_val_predict(classifier, X, y, cv=5)
print(metrics.accuracy_score(y, y_predict))

# here's an alternative way of doing cross-validation.
# first divide the data into folds:
cv = cross_validation.StratifiedKFold(y, 5)

# now use these folds:
print(cross_validation.cross_val_score(
    classifier, X, y, cv=cv, scoring='roc_auc'))

# you can see how examples were divided into folds by looking at the
# test_folds attribute:
print(cv.test_folds)

# hmm... perhaps we should shuffle things a bit...
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
print(cv.test_folds)

# if you run division into folds multiple times you will get a different
# answer:
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True)
print(cv.test_folds)

# if you want to consistently get the same division into folds:
# (random_state sets the seed for the random number generator)
cv = cross_validation.StratifiedKFold(y, 5, shuffle=True, random_state=0)