A Genetic Programming-based Imputation Method for Classification with Missing Data

@InProceedings{Tran:2016:EuroGP,
  author       = "Cao Truong Tran and Mengjie Zhang and Peter Andreae",
  title        = "A Genetic Programming-based Imputation Method for
                  Classification with Missing Data",
  booktitle    = "EuroGP 2016: Proceedings of the 19th European
                  Conference on Genetic Programming",
  year         = "2016",
  month        = "30 " # mar # "--1 " # apr,
  editor       = "Malcolm I. Heywood and James McDermott and
                  Mauro Castelli and Ernesto Costa and Kevin Sim",
  series       = "LNCS",
  volume       = "9594",
  publisher    = "Springer Verlag",
  address      = "Porto, Portugal",
  pages        = "149--163",
  organisation = "EvoStar",
  keywords     = "genetic algorithms, genetic programming",
  isbn13       = "978-3-319-30668-1",
  doi          = "10.1007/978-3-319-30668-1_10",
  abstract     = "Many industrial and real-world datasets suffer from an
                  unavoidable problem of missing values. The ability to
                  deal with missing values is an essential requirement
                  for classification because inadequate treatment of
                  missing values may lead to large errors on
                  classification. The problem of missing data has been
                  addressed extensively in the statistics literature, and
                  also, but to a lesser extent in the classification
                  literature. One of the most popular approaches to deal
                  with missing data is to use imputation methods to fill
                  missing values with plausible values. Some powerful
                  imputation methods such as regression-based imputations
                  in MICE \cite{van1999flexible} are often suitable for
                  batch imputation tasks. However, they are often
                  expensive to impute missing values for every single
                  incomplete instance in the unseen set for
                  classification. This paper proposes a genetic
                  programming-based imputation (GPI) method for
                  classification with missing data that uses genetic
                  programming as a regression method to impute missing
                  values. The experiments on six benchmark datasets and
                  five popular classifiers compare GPI with five other
                  popular and advanced regression-based imputation
                  methods in MICE on two measures: classification
                  accuracy and computation time. The results showed that,
                  in most cases, GPI achieves classification accuracy at
                  least as good as the other imputation methods, and
                  sometimes significantly better. However, using GPI to
                  impute missing values for every single incomplete
                  instance is dramatically faster than the other
                  imputation methods.",
  notes        = "Part of \cite{Heywood:2016:GP}EuroGP'2016 held in
                  conjunction with EvoCOP2016, EvoMusArt2016 and
                  EvoApplications2016",
}