This paper proposes a generic methodology for the semi-automatic generation of reliable position annotations for evaluating multi-camera people-trackers on large video data sets. Most of the annotation data are computed automatically by estimating a consensus tracking result from multiple existing trackers and people detectors and classifying it as either reliable or not. A small subset of the data, composed of tracks with insufficient reliability, is verified by a human using a simple binary decision task, a process faster than marking the correct person position. The proposed framework is generic and can handle additional trackers. We present results on a data set of ~6 h captured by 4 cameras, featuring a person in a holiday flat performing activities such as walking, cooking, eating, cleaning, and watching TV. When aiming for a tracking accuracy of 60 cm, 80% of all video frames are annotated automatically. The annotations for the remaining 20% of the frames were added after human verification of an automatically selected subset of the data, which involved ~2.4 h of manual labor. In a subsequent comprehensive visual inspection of the annotation procedure, we found 99% of the automatically annotated frames to be correct. We provide guidelines on how to apply the proposed methodology to new data sets. We also provide an exploratory study of the multi-target case, applied to existing and new benchmark video sequences.
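As a rough illustration of the idea described in the abstract (not the authors' actual implementation), a minimal per-frame sketch of the consensus-and-reliability step might look like the following. It assumes ground-plane (x, y) position estimates from several trackers, uses a coordinate-wise median as the consensus, and flags a frame for human verification when any tracker disagrees with the consensus by more than the 60 cm target accuracy; the function name and the agreement rule are hypothetical.

    # Minimal sketch, not the paper's code: consensus from several trackers
    # plus a simple reliability test for one frame.
    from statistics import median

    def consensus_annotation(tracker_positions, max_spread_cm=60.0):
        """tracker_positions: list of (x, y) ground-plane estimates in cm,
        one per tracker, for a single frame. Returns (position, is_reliable)."""
        xs = [p[0] for p in tracker_positions]
        ys = [p[1] for p in tracker_positions]
        # Robust consensus: coordinate-wise median of the tracker outputs.
        cx, cy = median(xs), median(ys)
        # Reliability (assumed rule): every tracker must agree with the
        # consensus within the target accuracy; otherwise the frame would be
        # queued for human verification instead of being auto-annotated.
        spread = max(((x - cx) ** 2 + (y - cy) ** 2) ** 0.5
                     for x, y in tracker_positions)
        return (cx, cy), spread <= max_spread_cm

    # Example: three trackers agree, a fourth drifts -> frame flagged as unreliable.
    pos, reliable = consensus_annotation([(100, 200), (105, 198), (98, 203), (250, 400)])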

@article{7184372,
author = {Ni{\~n}o Casta{\~n}eda, Jorge and Frias Velazquez, Andres and Nyan, Bo Bo and Slembrouck, Maarten and Guan, Junzhi and Debard, Glen and Vanrumste, Bart and Tuytelaars, Tinne and Philips, Wilfried},
issn = {1057-7149},
journal = {IEEE Transactions on Image Processing},
keywords = {multi-camera tracking, semi-automatic annotation, people tracking, performance evaluation, visual tracking, tools},
language = {eng},
number = {5},
pages = {2259--2274},
publisher = {IEEE},
title = {Scalable semi-automatic annotation for multi-camera person tracking},
url = {http://dx.doi.org/10.1109/TIP.2016.2542021},
volume = {25},
year = {2016},
}