@inproceedings{AbelBDDHHJMRSW:2013:ResSharing,
  author    = {Andreas Abel and Florian Benz and Johannes Doerfert and Barbara D{\"o}rr and Sebastian Hahn and Florian Haupenthal and Michael Jacobs and Amir H. Moin and Jan Reineke and Bernhard Schommer and Reinhard Wilhelm},
  title     = {Impact of Resource Sharing on Performance and Performance Prediction: A Survey},
  series    = {CONCUR},
  year      = {2013},
  pages     = {25--43},
  crossref  = {DBLP:conf/concur/2013},
  doi       = {10.1007/978-3-642-40184-8_3},
  bibsource = {DBLP, http://dblp.uni-trier.de},
  webpdf    = {http://embedded.cs.uni-saarland.de/publications/ResourceSharingSurvey.pdf},
  abstract  = {
Multi-core processors are increasingly considered as execution platforms for
embedded systems because of their good performance/ energy ratio. However,
the interference on shared resources poses several problems. It may severely
reduce the performance of tasks executed on the cores, and it increases the
complexity of timing analysis and/or decreases the precision of its results.
In this paper, we survey recent work on the impact of shared buses, caches,
and other resources on performance and performance prediction.
},
}

@inproceedings{KH:2011:cgo,
  author    = {Ralf Karrenberg and Sebastian Hack},
  title     = {{Whole} {Function} {Vectorization}},
  booktitle = {International Symposium on Code Generation and Optimization},
  series    = {CGO},
  year      = {2011},
  doi       = {10.1109/CGO.2011.5764682},
  abstract  = {
Data-parallel programming languages are an important component
in today's parallel computing landscape. Among those are domain-
specific languages like shading languages in graphics (HLSL, GLSL,
RenderMan, etc.) and "general-purpose" languages like CUDA or OpenCL.
Current implementations of those languages on CPUs solely rely on multi-
threading to implement parallelism and ignore the additional intra-core
parallelism provided by the SIMD instruction set of those processors
(like Intel's SSE and the upcoming AVX or Larrabee instruction sets).
In this paper, we discuss several aspects of implementing data-parallel
languages on machines with SIMD instruction sets. Our main contribution
is a language- and platform-independent code transformation that
performs whole-function vectorization on low-level intermediate code
given by a control flow graph in SSA form.
We evaluate our technique in two scenarios: First, incorporated in a
compiler for a domain-specific language used in real-time ray tracing.
Second, in a stand-alone OpenCL driver. We observe average speedup
factors of 3.9 for the ray tracer and factors between 0.6 and 5.2 for
different OpenCL kernels.
},
  webslides = {http://www.cdl.uni-saarland.de/projects/wfv/wfv_cgo11_slides.pdf},
  url       = {http://www.cdl.uni-saarland.de/papers/karrenberg_wfv.pdf},
  acc_rate  = {26.7},
  accepted  = {28},
  submitted = {105},
}

@inproceedings{KRSH:2010:hpg,
  author    = {Ralf Karrenberg and Dmitri Rubinstein and Philipp Slusallek and Sebastian Hack},
  title     = {{AnySL}: Efficient and Portable Shading for Ray Tracing},
  booktitle = {Proceedings of the Conference on High Performance Graphics},
  series    = {HPG '10},
  year      = {2010},
  location  = {Saarbr{\"u}cken, Germany},
  pages     = {97--105},
  numpages  = {9},
  url       = {http://portal.acm.org/citation.cfm?id=1921479.1921495},
  acmid     = {1921495},
  publisher = {Eurographics Association},
  address   = {Aire-la-Ville, Switzerland},
  booktitle_short = {HPG},
  abstract  = {
While a number of different shading languages have been developed,
their efficient integration into an existing renderer is notoriously
difficult, often boiling down to implementing an entire compiler
toolchain for each language. Furthermore, no shading language is
broadly supported across the variety of rendering systems.
AnySL attacks this issue from multiple directions: We compile shaders
from different languages into a common, portable representation, which
uses subroutine threaded code: Every language operator is translated to
a function call. Thus, the compiled shader is generic with respect to
the used types and operators.
The key component of our system is an embedded compiler that
instantiates this generic code in terms of the renderer's native types
and operations. It allows for flexible code transformations to match
the internal structure of the renderer and eliminates all overhead due
to the subroutine threaded code. For SIMD architectures we
automatically perform vectorization of scalar shaders which speeds up
rendering by a factor of 3.9 on average on SSE. The results are highly
optimized, parallel shaders that operate directly on the internal data
structures of a renderer. We show that both traditional shading
languages such as RenderMan, but also C/C++-based shading languages,
can be fully supported and deliver high performance across different
CPU renderers.
},
  webslides = {http://www.cdl.uni-saarland.de/projects/anysl/anysl_hpg10_slides.pdf},
}

@inproceedings{Doerfert:2015:PollyRed,
  title     = {Polly's Polyhedral Scheduling in the Presence of Reductions},
  author    = {Johannes Doerfert and Kevin Streit and Sebastian Hack and Zino Benaissa},
  year      = {2015},
  month     = jan,
  address   = {Amsterdam, Netherlands},
  booktitle = {{International} {Workshop} on {Polyhedral} {Compilation} {Techniques}},
  booktitle_short = {IMPACT},
  webpdf    = {http://impact.gforge.inria.fr/impact2015/papers/impact2015-doerfert.pdf},
  abstract  = {
The polyhedral model provides a powerful mathematical abstraction to
enable effective optimization of loop nests with respect to a given
optimization goal, e.g., exploiting parallelism. Unexploited
reduction properties are a frequent reason for polyhedral optimizers
to assume parallelism prohibiting dependences. To our knowledge, no
polyhedral loop optimizer available in any production compiler
provides support for reductions. In this paper, we show that
leveraging the parallelism of reductions can lead to a significant
performance increase. We give a precise, dependence based, definition
of reductions and discuss ways to extend polyhedral optimization to
exploit the associativity and commutativity of reduction
computations. We have implemented a reduction-enabled scheduling
approach in the Polly polyhedral optimizer and evaluate it on the
standard Polybench 3.2 benchmark suite. We were able to detect and
model all 52 arithmetic reductions and achieve speedups up to 2.21$\times$
on a quad core machine by exploiting the multidimensional reduction
in the BiCG benchmark.
},
}

@inproceedings{JohnJacobs:2014:WCETFramework,
  author    = {John, M. and Jacobs, M.},
  title     = {A Framework for the Optimization of the {WCET} of Programs on Multi-Core Processors},
  booktitle = {Proceedings of the 8th Junior Researcher Workshop on Real-Time Computing},
  pages     = {1--4},
  year      = {2014},
}

@inproceedings{Doerfert:2013:SPolly,
  title     = {{SPolly}: {Speculative} {Optimizations} in the {Polyhedral} {Model}},
  author    = {Johannes Doerfert and Clemens Hammacher and Kevin Streit and Sebastian Hack},
  year      = {2013},
  month     = jan,
  address   = {Berlin, Germany},
  booktitle = {{International} {Workshop} on {Polyhedral} {Compilation} {Techniques}},
  booktitle_short = {IMPACT},
  editor    = {Armin Gr{\"o}{\ss}linger and Louis-No{\"e}l Pouchet},
  pages     = {55--61},
  webpdf    = {http://www.st.cs.uni-saarland.de/publications/files/doerfert-impact-2013.pdf},
  abstract  = {
The polyhedral model is only applicable to code regions that form static
control parts (SCoPs) or slight extensions thereof. To apply polyhedral
techniques to a piece of code, the compiler usually checks, by static
analysis, whether all SCoP conditions are fulfilled. However, in many
codes, the compiler fails to verify that this is the case. In this paper
we investigate the rejection causes as reported by Polly, the polyhedral
optimizer of a state-of-the-art compiler. We show that many rejections
follow from the conservative overapproximation of the employed static
analyses. In SPolly, a speculative extension of Polly, we employ the
knowledge of runtime features to supersede this overapproximation. All
speculatively generated variants form valid SCoPs and are optimizable by
the facilities of Polly. Our evaluation shows that SPolly is able to
effectively widen the applicability of polyhedral optimization. On the
SPEC 2000 suite, the number of optimizable code regions is increased by
131 percent. In 10 out of the 31 benchmarks of the PolyBench suite,
SPolly achieves speedups of up to 11-fold as compared to plain Polly.
},
}

@mastersthesis{Karrenberg:2009:MSc,
  author  = {Ralf Karrenberg},
  title   = {Automatic Packetization},
  school  = {Saarland University},
  year    = {2009},
  month   = jul,
  webpdf  = {http://www.cdl.uni-saarland.de/publications/theses/karrenberg_msc.pdf},
  abstract = {
Modern processor architectures provide the possibility to execute an
instruction on multiple values at once. So-called SIMD (Single
Instruction, Multiple Data) instructions work on packets (or vectors)
of data instead of scalar values. They offer a significant performance
boost for data-parallel algorithms that perform the same operations on
large amounts of data, e.g. data encoding and decoding, image
processing, or ray tracing.
However, the performance gain comes at a price: programming languages
provide no elegant means to exploit SIMD instruction sets. Packet
operations have to be coded by hand, which is complicated, unintuitive,
and error prone. Thus, packetization - the transformation of scalar
code to packet form - is mostly applied automatically by local compiler
optimizations (e.g. during loop vectorization) or with a lot of manual
effort at performance-critical parts of a system.
This thesis describes an algorithm for automatic packetization that
allows a programmer to write scalar functions but use them on packets
of data. A compiler pass automatically transforms those functions to
work on packets of the target-architecture's SIMD width. The resulting
packetized function computes the same results as multiple executions of
the scalar code.
The algorithm is implemented in a source-language and target-
architecture independent intermediate representation (the Low Level
Virtual Machine (LLVM)), which enables its use in many different
environments. The performance of the generated code is shown in a real-
world case study in the context of real-time ray tracing: serial shader
code written in C++ is automatically specialized, optimized, and
packetized at runtime. The packetized shaders outperform their scalar
counterparts by an average factor of 3.6 on a standard SSE architecture
of SIMD width 4.
},
}

@mastersthesis{Karrenberg:2007:BSc,
  author  = {Ralf Karrenberg},
  title   = {Memory Aware Realtime Ray Tracing: The {Bounding} {Plane} {Hierarchy}},
  type    = {Bachelor's thesis},
  school  = {Saarland University},
  year    = {2007},
  month   = aug,
  webpdf  = {http://www.cdl.uni-saarland.de/publications/theses/karrenberg_bsc.pdf},
  abstract = {
Realtime Ray Tracing has become available on single desktop environments
over the last few years, recently moving on to supporting dynamic
scenes. Approaches either use general-purpose CPUs, high-end
programmable graphics cards or specialised custom hardware. One of the
most important factors influencing the performance of all
implementations is the algorithm's dependence on a spatial index
structure. Its random memory access is very slow compared to the
computational power of current computer hardware, often being the
bottleneck of the system.
The goal of this thesis was the design and implementation of a spatial
index structure that focuses entirely on improving memory efficiency in
trade for computational complexity. Two main starting points were
followed: The memory requirement of the whole structure and of each of
its parts on one hand and the caching efficiency on the other. The
resulting contributions are the following:
- The Bounding Plane Hierarchy (BPH) is a complete acceleration
structure equivalent to a Bounding Volume Hierarchy (BVH) with axis-
aligned bounding boxes (AABBs) that needs less than half the size of
current BVH-implementations.
- Treelets are groups of interconnected nodes that are traversed
independently from the rest of the acceleration structure by using a new
two-stage algorithm. They are employed to enhance cache efficiency -
especially on parallel and/or multi-threaded systems like the CELL -
and can be applied to any spatial index structure.
A first implementation of the BPH with Treelets using Packet Tracing
and SIMD-instructions renders animations in real-time while still
offering many possibilities for optimization.
},
}