In this paper we address the problem of dimensioning infrastructure, comprising both network and server resources, for large-scale decentralized distributed systems such as grids or clouds. We will provide an overview of our work in this area, and in particular focus on how to design the resulting grid/cloud to be resilient against network link and/or server site failures. To this end, we will exploit relocation: under failure conditions, a request may be sent to an alternate destination than the one under failure-free conditions. We will provide a comprehensive overview of related work in this area, and focus in some detail on our own most recent work. The latter comprises a case study where traffic has a known origin, but we assume a degree of freedom as to where its end up being processed, which is typically the case for e. g., grid applications of the bag-of-tasks (BoT) type or for providing cloud services. In particular, we will provide in this paper a new integer linear programming (ILP) formulation to solve the resilient grid/cloud dimensioning problem using failure-dependent backup routes. Our algorithm will simultaneously decide on server and network capacity. We find that in the anycast routing problem we address, the benefit of using failure-dependent (FD) rerouting is limited compared to failure-independent (FID) backup routing. We confirm our earlier findings in terms of network capacity savings achieved by relocation compared to not exploiting relocation (order of 6-10% in the current case studies).

@inproceedings{2983200,
abstract = {In this paper we address the problem of dimensioning infrastructure, comprising both network and server resources, for large-scale decentralized distributed systems such as grids or clouds. We will provide an overview of our work in this area, and in particular focus on how to design the resulting grid/cloud to be resilient against network link and/or server site failures. To this end, we will exploit relocation: under failure conditions, a request may be sent to an alternate destination than the one under failure-free conditions. We will provide a comprehensive overview of related work in this area, and focus in some detail on our own most recent work. The latter comprises a case study where traffic has a known origin, but we assume a degree of freedom as to where its end up being processed, which is typically the case for e. g., grid applications of the bag-of-tasks (BoT) type or for providing cloud services. In particular, we will provide in this paper a new integer linear programming (ILP) formulation to solve the resilient grid/cloud dimensioning problem using failure-dependent backup routes. Our algorithm will simultaneously decide on server and network capacity. We find that in the anycast routing problem we address, the benefit of using failure-dependent (FD) rerouting is limited compared to failure-independent (FID) backup routing. We confirm our earlier findings in terms of network capacity savings achieved by relocation compared to not exploiting relocation (order of 6-10\% in the current case studies).},
author = {Develder, Chris and Buysse, Jens and De Leenheer, Marc and Jaumard, Brigitte and Dhoedt, Bart},
booktitle = {IEEE International Conference on Communications},
isbn = {9781457720536},
issn = {1550-3607},
language = {eng},
location = {Ottawa, Canada},
pages = {6262--6267},
publisher = {IEEE},
title = {Resilient network dimensioning for optical grid/clouds using relocation},
year = {2012},
}