Sex

# Encode the Sex feature as an integer label (e.g. male -> 1).
sex_encoder = preprocessing.LabelEncoder()

# Learn the category-to-integer mapping from the training data only,
# so train and test share the same encoding.
sex_encoder.fit(train['Sex'])

# Apply the fitted encoder to both the training and test data.
train['male'] = sex_encoder.transform(train['Sex'])
test['male'] = sex_encoder.transform(test['Sex'])

Embarked

# One-hot encode the Embarked feature for train and test, dropping the
# first category to prevent perfect collinearity among the dummies.
train_embarked_dummied = pd.get_dummies(
    train["Embarked"], prefix='embarked', drop_first=True)
test_embarked_dummied = pd.get_dummies(
    test["Embarked"], prefix='embarked', drop_first=True)

# NOTE(review): dummies are built independently per frame; if train and
# test contain different Embarked categories the columns will not line
# up -- confirm both frames share the same category set.
train = pd.concat([train, train_embarked_dummied], axis=1)
test = pd.concat([test, test_embarked_dummied], axis=1)

Social Class

# One-hot encode the Pclass feature for train and test, dropping the
# first category to prevent perfect collinearity among the dummies.
train_Pclass_dummied = pd.get_dummies(
    train["Pclass"], prefix='Pclass', drop_first=True)
test_Pclass_dummied = pd.get_dummies(
    test["Pclass"], prefix='Pclass', drop_first=True)

# NOTE(review): as with Embarked, train/test dummies are built
# independently -- confirm both frames share the same Pclass values.
train = pd.concat([train, train_Pclass_dummied], axis=1)
test = pd.concat([test, test_Pclass_dummied], axis=1)

Impute Missing Values

A number of values of the Age feature are missing and will prevent the random forest from training. To get around this, we will fill in the missing values with the mean age (a useful fiction).

Age

# Fill missing Age values with the mean age learned from the training
# data (a useful fiction that lets the random forest train).
# NOTE(review): preprocessing.Imputer was removed in scikit-learn 0.22;
# on modern versions use sklearn.impute.SimpleImputer instead.
age_imputer = preprocessing.Imputer(
    missing_values='NaN', strategy='mean', axis=0)

# Fix: Series.reshape was deprecated and removed from pandas -- reshape
# the underlying NumPy array via .values instead.
age_imputer.fit(train['Age'].values.reshape(-1, 1))

# Apply the training-data mean to both the training and test data so no
# information leaks from the test set.
train['Age'] = age_imputer.transform(train['Age'].values.reshape(-1, 1))
test['Age'] = age_imputer.transform(test['Age'].values.reshape(-1, 1))

Fare

# Fill missing Fare values with the mean fare learned from the training
# data, mirroring the Age imputation.
# NOTE(review): preprocessing.Imputer was removed in scikit-learn 0.22;
# on modern versions use sklearn.impute.SimpleImputer instead.
fare_imputer = preprocessing.Imputer(
    missing_values='NaN', strategy='mean', axis=0)

# Fix: Series.reshape was deprecated and removed from pandas -- reshape
# the underlying NumPy array via .values instead.
fare_imputer.fit(train['Fare'].values.reshape(-1, 1))

# Apply the training-data mean to both the training and test data.
train['Fare'] = fare_imputer.transform(train['Fare'].values.reshape(-1, 1))
test['Fare'] = fare_imputer.transform(test['Fare'].values.reshape(-1, 1))

Search For Optimum Parameters

# Candidate hyperparameter values for the grid search.
parameter_grid = {
    'n_estimators': list(range(1, 5001, 1000)),
    'criterion': ['gini', 'entropy'],
    'max_features': list(range(1, len(features), 2)),
    'max_depth': [None] + list(range(5, 25, 1)),
}

# Random forest with a fixed seed for reproducibility, using all cores.
random_forest = RandomForestClassifier(random_state=0, n_jobs=-1)

# Grid search over the parameter grid with 5-fold cross-validation,
# also parallelized across all cores (n_jobs=-1).
clf = GridSearchCV(
    estimator=random_forest,
    param_grid=parameter_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# Nest the GridSearchCV inside an outer 3-fold CV for model evaluation.
# Fix: the default cv of cross_val_score changed from 3 to 5 in
# scikit-learn 0.22, so pass cv=3 explicitly to match the stated intent
# regardless of the installed version.
cv_scores = cross_val_score(clf, train[features], train['Survived'], cv=3)

# Report per-fold accuracy plus its mean and variance.
print('Accuracy scores:', cv_scores)
print('Mean of score:', np.mean(cv_scores))
print('Variance of scores:', np.var(cv_scores))

Retrain The Random Forest With The Optimum Parameters

# Refit the grid-searched model on the entire training set, then
# predict survival for every passenger in the test set.
clf.fit(train[features], train['Survived'])
predictions = clf.predict(test[features])

Create The Kaggle Submission

# Passenger IDs for the submission rows.
ids = test['PassengerId'].values

# Write the Kaggle submission file.
# Fix: use a context manager so the file is closed even if a write
# fails, and open with newline='' as the csv module requires (otherwise
# extra blank rows appear on Windows).
with open("submission.csv", "w", newline="") as submission_file:
    open_file_object = csv.writer(submission_file)
    # Header row, then one (id, prediction) row per passenger.
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(ids, predictions))