@inproceedings{8366937,
  title     = {{Proctor}: Detecting and Investigating Interference in Shared Datacenters},
  author    = {Kannan, R. S. and Jain, A. and Laurenzano, M. A. and Tang, L. and Mars, J.},
  doi       = {10.1109/ISPASS.2018.00016},
  year      = {2018},
  date      = {2018-04-01},
  booktitle = {2018 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
  pages     = {76--86},
  abstract  = {Cloud-scale datacenter management systems utilize virtualization to provide performance isolation while maximizing the utilization of the underlying hardware infrastructure. However, virtualization does not provide complete performance isolation as Virtual Machines (VMs) still compete for nonreservable shared resources (like caches, network, I/O bandwidth etc.) This becomes highly challenging to address in datacenter environments housing tens of thousands of VMs, causing degradation in application performance. Addressing this problem for production datacenters requires a non-intrusive scalable solution that 1) detects performance intrusion and 2) investigates both the intrusive VMs causing interference, as well as the resource(s) for which the VMs are competing for. To address this problem, this paper introduces Proctor, a real time, lightweight and scalable analytics fabric that detects performance intrusive VMs and identifies its root causes from among the arbitrary VMs running in shared datacenters across 4 key hardware resources - network, I/O, cache, and CPU. Proctor is based on a robust statistical approach that requires no special profiling phases, standing in stark contrast to a wide body of prior work that assumes pre-acquisition of application level information prior to its execution. By detecting performance degradation and identifying the root cause VMs and their metrics, Proctor can be utilized to dramatically improve the performance outcomes of applications executing in large-scale datacenters. From our experiments, we are able to show that when we deploy Proctor in a datacenter housing a mix of I/O, network, compute and cache-sensitive applications, it is able to effectively pinpoint performance intrusive VMs. 
Further, we observe that when Proctor is applied with migration, the application-level Quality-of-Service improves by an average of 2.2× as compared to systems which are unable to detect, identify and pinpoint performance intrusion and their root causes.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Cloud-scale datacenter management systems utilize virtualization to provide performance isolation while maximizing the utilization of the underlying hardware infrastructure. However, virtualization does not provide complete performance isolation as Virtual Machines (VMs) still compete for nonreservable shared resources (like caches, network, I/O bandwidth etc.) This becomes highly challenging to address in datacenter environments housing tens of thousands of VMs, causing degradation in application performance. Addressing this problem for production datacenters requires a non-intrusive scalable solution that 1) detects performance intrusion and 2) investigates both the intrusive VMs causing interference, as well as the resource(s) for which the VMs are competing for. To address this problem, this paper introduces Proctor, a real time, lightweight and scalable analytics fabric that detects performance intrusive VMs and identifies its root causes from among the arbitrary VMs running in shared datacenters across 4 key hardware resources - network, I/O, cache, and CPU. Proctor is based on a robust statistical approach that requires no special profiling phases, standing in stark contrast to a wide body of prior work that assumes pre-acquisition of application level information prior to its execution. By detecting performance degradation and identifying the root cause VMs and their metrics, Proctor can be utilized to dramatically improve the performance outcomes of applications executing in large-scale datacenters. From our experiments, we are able to show that when we deploy Proctor in a datacenter housing a mix of I/O, network, compute and cache-sensitive applications, it is able to effectively pinpoint performance intrusive VMs. 
Further, we observe that when Proctor is applied with migration, the application-level Quality-of-Service improves by an average of 2.2× as compared to systems which are unable to detect, identify and pinpoint performance intrusion and their root causes.

As data centers increase in size and computational capacity, their growth comes at a cost: an increasing thermal load that must be removed to prevent overheating. Here, the authors propose using phase-change materials (PCMs) to shape a data center's thermal load, absorbing and releasing heat when it's advantageous. They evaluate three important opportunities for cost savings. They find that in a data center, PCM can reduce the necessary cooling system size by up to 12 percent without impacting peak throughput, or increase the number of servers by up to 14.6 percent without increasing the cooling load. In a thermally constrained setting, PCM can increase peak throughput up to 69 percent while delaying the onset of thermal limits by over 3 hours, and a wax-aware scheduler enables up to an 11 percent reduction in peak cooling load when batch jobs are added, increasing average daily throughput by 36-52 percent.

@inproceedings{7783727,
  title     = {{CrystalBall}: Statically analyzing runtime behavior via deep sequence learning},
  author    = {Zekany, S. and Rings, D. and Harada, N. and Laurenzano, M. A. and Tang, L. and Mars, J.},
  doi       = {10.1109/MICRO.2016.7783727},
  year      = {2016},
  date      = {2016-10-01},
  booktitle = {2016 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
  pages     = {1--12},
  abstract  = {Understanding dynamic program behavior is critical in many stages of the software development lifecycle, for purposes as diverse as optimization, debugging, testing, and security. This paper focuses on the problem of predicting dynamic program behavior statically. We introduce a novel technique to statically identify hot paths that leverages emerging deep learning techniques to take advantage of their ability to learn subtle, complex relationships between sequences of inputs. This approach maps well to the problem of identifying the behavior of sequences of basic blocks in program execution. Our technique is also designed to operate on the compiler's intermediate representation (IR), as opposed to the approaches taken by prior techniques that have focused primarily on source code, giving our approach language-independence. We describe the pitfalls of conventional metrics used for hot path prediction such as accuracy, and motivate the use of Area Under the Receiver Operating Characteristic curve (AUROC). Through a thorough evaluation of our technique on complex applications that include the SPEC CPU2006 benchmarks, we show that our approach achieves an AUROC of 0.85.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Understanding dynamic program behavior is critical in many stages of the software development lifecycle, for purposes as diverse as optimization, debugging, testing, and security. This paper focuses on the problem of predicting dynamic program behavior statically. We introduce a novel technique to statically identify hot paths that leverages emerging deep learning techniques to take advantage of their ability to learn subtle, complex relationships between sequences of inputs. This approach maps well to the problem of identifying the behavior of sequences of basic blocks in program execution. Our technique is also designed to operate on the compiler's intermediate representation (IR), as opposed to the approaches taken by prior techniques that have focused primarily on source code, giving our approach language-independence. We describe the pitfalls of conventional metrics used for hot path prediction such as accuracy, and motivate the use of Area Under the Receiver Operating Characteristic curve (AUROC). Through a thorough evaluation of our technique on complex applications that include the SPEC CPU2006 benchmarks, we show that our approach achieves an AUROC of 0.85.

@inproceedings{7783744,
  title     = {Concise loads and stores: The case for an asymmetric compute-memory architecture for approximation},
  author    = {Jain, A. and Hill, P. and Lin, S. C. and Khan, M. and Haque, M. E. and Laurenzano, M. A. and Mahlke, S. and Tang, L. and Mars, J.},
  doi       = {10.1109/MICRO.2016.7783744},
  year      = {2016},
  date      = {2016-10-01},
  booktitle = {2016 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
  pages     = {1--13},
  abstract  = {Cache capacity and memory bandwidth play critical roles in application performance, particularly for data-intensive applications from domains that include machine learning, numerical analysis, and data mining. Many of these applications are also tolerant to imprecise inputs and have loose constraints on the quality of output, making them ideal candidates for approximate computing. This paper introduces a novel approximate computing technique that decouples the format of data in the memory hierarchy from the format of data in the compute subsystem to significantly reduce the cost of storing and moving bits throughout the memory hierarchy and improve application performance. This asymmetric compute-memory extension to conventional architectures, ACME, adds two new instruction classes to the ISA - load-concise and store-concise - along with three small functional units to the micro-architecture to support these instructions. ACME does not affect exact execution of applications and comes into play only when concise memory operations are used. Through detailed experimentation we find that ACME is very effective at trading result accuracy for improved application performance. Our results show that ACME achieves a 1.3x speedup (up to 1.8x) while maintaining 99% accuracy, or a 1.1x speedup while maintaining 99.999% accuracy. Moreover, our approach incurs negligible area and power overheads, adding just 0.005% area and 0.1% power to a conventional modern architecture.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Cache capacity and memory bandwidth play critical roles in application performance, particularly for data-intensive applications from domains that include machine learning, numerical analysis, and data mining. Many of these applications are also tolerant to imprecise inputs and have loose constraints on the quality of output, making them ideal candidates for approximate computing. This paper introduces a novel approximate computing technique that decouples the format of data in the memory hierarchy from the format of data in the compute subsystem to significantly reduce the cost of storing and moving bits throughout the memory hierarchy and improve application performance. This asymmetric compute-memory extension to conventional architectures, ACME, adds two new instruction classes to the ISA - load-concise and store-concise - along with three small functional units to the micro-architecture to support these instructions. ACME does not affect exact execution of applications and comes into play only when concise memory operations are used. Through detailed experimentation we find that ACME is very effective at trading result accuracy for improved application performance. Our results show that ACME achieves a 1.3x speedup (up to 1.8x) while maintaining 99% accuracy, or a 1.1x speedup while maintaining 99.999% accuracy. Moreover, our approach incurs negligible area and power overheads, adding just 0.005% area and 0.1% power to a conventional modern architecture.

The class of optimizations characterized by manipulating a loop's interaction space for improved cache locality and reuse (i.e., cache tiling/blocking/strip mine and interchange) are static optimizations requiring a priori information about the microarchitectural and runtime environment of an application binary. However, particularly in datacenter environments, deployed applications face numerous dynamic environments over their lifetimes. As a result, this class of optimizations can result in sub-optimal performance due to the inability to flexibly adapt iteration spaces as cache conditions change at runtime. This paper introduces continuous shape shifting, a compilation approach that removes the risks of cache tiling optimizations by dynamically rewriting (and reshaping) deployed, running application code. To realize continuous shape shifting, we present ShapeShifter, a framework for continuous monitoring of co-running applications and their runtime environments to reshape loop iteration spaces and pinpoint near-optimal loop tile configurations. Upon identifying a need for reshaping, a new tiling approach is quickly constructed for the application, new code is dynamically generated and is then seamlessly stitched into the running application with near-zero overhead. Our evaluation on a wide spectrum of runtime scenarios demonstrates that ShapeShifter achieves an average of 10-40% performance improvement (up to 2.4×) on real systems depending on the runtime environment compared to an oracle static loop tiling baseline.

@article{7478443,
  title     = {{Sirius} Implications for Future Warehouse-Scale Computers},
  author    = {Hauswald, J. and Laurenzano, M. A. and Zhang, Y. and Li, C. and Rovinski, A. and Khurana, A. and Dreslinski, R. G. and Mudge, T. and Petrucci, V. and Tang, L. and Mars, J.},
  doi       = {10.1109/MM.2016.37},
  issn      = {0272-1732},
  year      = {2016},
  date      = {2016-05-01},
  journal   = {IEEE Micro},
  volume    = {36},
  number    = {3},
  pages     = {42--53},
  abstract  = {Demand is expected to grow significantly for cloud services that deliver sophisticated artificial intelligence on the critical path of user queries, as is the case with intelligent personal assistants such as Apple's Siri. If the prediction of the trend is correct, these types of applications will likely consume most of the world's computing cycles. The Sirius project was motivated to investigate what this future might look like and how cloud architectures should evolve to achieve it.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

Demand is expected to grow significantly for cloud services that deliver sophisticated artificial intelligence on the critical path of user queries, as is the case with intelligent personal assistants such as Apple's Siri. If the prediction of the trend is correct, these types of applications will likely consume most of the world's computing cycles. The Sirius project was motivated to investigate what this future might look like and how cloud architectures should evolve to achieve it.

Modern processors widely use hardware prefetching to hide memory latency. While aggressive hardware prefetchers can improve performance significantly for some applications, they can limit the overall performance in highly-utilized multicore processors by saturating the offchip bandwidth and wasting last-level cache capacity. Co-executing applications can slowdown due to contention over these shared resources. This work introduces Adaptive Resource Efficient Prefetching (AREP) -- a runtime framework that dynamically combines software prefetching and hardware prefetching to maximize throughput in highly utilized multicore processors. AREP achieves better performance by prefetching data in a resource efficient way -- conserving offchip bandwidth and last-level cache capacity with accurate prefetching and by applying cache-bypassing when possible. AREP dynamically explores a mix of hardware/software prefetching policies, then selects and applies the best performing policy. AREP is phase-aware and re-explores (at runtime) for the best prefetching policy at phase boundaries. A multitude of experiments with workload mixes and parallel applications on a modern high performance multicore show that AREP can increase throughput by up to 49% (8.1% on average). This is complemented by improved fairness, resulting in average quality of service above 94%.

@inproceedings{7056037,
  title     = {{Octopus-Man}: {QoS}-driven task management for heterogeneous multicores in warehouse-scale computers},
  author    = {Petrucci, V. and Laurenzano, M. A. and Doherty, J. and Zhang, Y. and Mossé, D. and Mars, J. and Tang, L.},
  doi       = {10.1109/HPCA.2015.7056037},
  issn      = {1530-0897},
  year      = {2015},
  date      = {2015-02-01},
  booktitle = {2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA)},
  pages     = {246--258},
  abstract  = {Heterogeneous multicore architectures have the potential to improve energy efficiency by integrating power-efficient wimpy cores with high-performing brawny cores. However, it is an open question as how to deliver energy reduction while ensuring the quality of service (QoS) of latency-sensitive web-services running on such heterogeneous multicores in warehouse-scale computers (WSCs). In this work, we first investigate the implications of heterogeneous multicores in WSCs and show that directly adopting heterogeneous multicores without re-designing the software stack to provide QoS management leads to significant QoS violations. We then present Octopus-Man, a novel QoS-aware task management solution that dynamically maps latency-sensitive tasks to the least power-hungry processing resources that are sufficient to meet the QoS requirements. Using carefully-designed feedback-control mechanisms, Octopus-Man addresses critical challenges that emerge due to uncertainties in workload fluctuations and adaptation dynamics in a real system. Our evaluation using web-search and memcached running on a real-system Intel heterogeneous prototype demonstrates that Octopus-Man improves energy efficiency by up to 41% (CPU power) and up to 15% (system power) over an all-brawny WSC design while adhering to specified QoS targets.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Heterogeneous multicore architectures have the potential to improve energy efficiency by integrating power-efficient wimpy cores with high-performing brawny cores. However, it is an open question as how to deliver energy reduction while ensuring the quality of service (QoS) of latency-sensitive web-services running on such heterogeneous multicores in warehouse-scale computers (WSCs). In this work, we first investigate the implications of heterogeneous multicores in WSCs and show that directly adopting heterogeneous multicores without re-designing the software stack to provide QoS management leads to significant QoS violations. We then present Octopus-Man, a novel QoS-aware task management solution that dynamically maps latency-sensitive tasks to the least power-hungry processing resources that are sufficient to meet the QoS requirements. Using carefully-designed feedback-control mechanisms, Octopus-Man addresses critical challenges that emerge due to uncertainties in workload fluctuations and adaptation dynamics in a real system. Our evaluation using web-search and memcached running on a real-system Intel heterogeneous prototype demonstrates that Octopus-Man improves energy efficiency by up to 41% (CPU power) and up to 15% (system power) over an all-brawny WSC design while adhering to specified QoS targets.

@inproceedings{7056039,
  title     = {{Adrenaline}: Pinpointing and reining in tail queries with quick voltage boosting},
  author    = {Hsu, C. H. and Zhang, Y. and Laurenzano, M. A. and Meisner, D. and Wenisch, T. and Mars, J. and Tang, L. and Dreslinski, R. G.},
  doi       = {10.1109/HPCA.2015.7056039},
  issn      = {1530-0897},
  year      = {2015},
  date      = {2015-02-01},
  booktitle = {2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA)},
  pages     = {271--282},
  abstract  = {Reducing the long tail of the query latency distribution in modern warehouse scale computers is critical for improving performance and quality of service of workloads such as Web Search and Memcached. Traditional turbo boost increases a processor's voltage and frequency during a coarse-grain sliding window, boosting all queries that are processed during that window. However, the inability of such a technique to pinpoint tail queries for boosting limits its tail reduction benefit. In this work, we propose Adrenaline, an approach to leverage finer granularity, 10's of nanoseconds, voltage boosting to effectively rein in the tail latency with query-level precision. Two key insights underlie this work. First, emerging finer granularity voltage/frequency boosting is an enabling mechanism for intelligent allocation of the power budget to precisely boost only the queries that contribute to the tail latency; and second, per-query characteristics can be used to design indicators for proactively pinpointing these queries, triggering boosting accordingly. Based on these insights, Adrenaline effectively pinpoints and boosts queries that are likely to increase the tail distribution and can reap more benefit from the voltage/frequency boost. By evaluating under various workload configurations, we demonstrate the effectiveness of our methodology. We achieve up to a 2.50x tail latency improvement for Memcached and up to a 3.03x for Web Search over coarse-grained DVFS given a fixed boosting power budget. When optimizing for energy reduction, Adrenaline achieves up to a 1.81x improvement for Memcached and up to a 1.99x for Web Search over coarse-grained DVFS.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Reducing the long tail of the query latency distribution in modern warehouse scale computers is critical for improving performance and quality of service of workloads such as Web Search and Memcached. Traditional turbo boost increases a processor's voltage and frequency during a coarse-grain sliding window, boosting all queries that are processed during that window. However, the inability of such a technique to pinpoint tail queries for boosting limits its tail reduction benefit. In this work, we propose Adrenaline, an approach to leverage finer granularity, 10's of nanoseconds, voltage boosting to effectively rein in the tail latency with query-level precision. Two key insights underlie this work. First, emerging finer granularity voltage/frequency boosting is an enabling mechanism for intelligent allocation of the power budget to precisely boost only the queries that contribute to the tail latency; and second, per-query characteristics can be used to design indicators for proactively pinpointing these queries, triggering boosting accordingly. Based on these insights, Adrenaline effectively pinpoints and boosts queries that are likely to increase the tail distribution and can reap more benefit from the voltage/frequency boost. By evaluating under various workload configurations, we demonstrate the effectiveness of our methodology. We achieve up to a 2.50x tail latency improvement for Memcached and up to a 3.03x for Web Search over coarse-grained DVFS given a fixed boosting power budget. When optimizing for energy reduction, Adrenaline achieves up to a 1.81x improvement for Memcached and up to a 1.99x for Web Search over coarse-grained DVFS.

Co-location, where multiple jobs share compute nodes in large-scale HPC systems, has been shown to increase aggregate throughput and energy efficiency by 10 to 20%. However, system operators disallow co-location due to fair-pricing concerns, i.e., a pricing mechanism that considers performance interference from co-running jobs. In the current pricing model, application execution time determines the price, which results in unfair prices paid by the minority of users whose jobs suffer from co-location. This paper presents POPPA, a runtime system that enables fair pricing by delivering precise online interference detection and facilitates the adoption of supercomputers with co-locations. POPPA leverages a novel shutter mechanism - a cyclic, fine-grained interference sampling mechanism to accurately deduce the interference between co-runners - to provide unbiased pricing of jobs that share nodes. POPPA is able to quantify inter-application interference within 4% mean absolute error on a variety of co-located benchmark and real scientific workloads.