% Journal article: Index Set Splitting (International Journal of Parallel Programming, 2000).
% Field values are brace-delimited; the month uses the predefined macro and the page range uses a double hyphen.
@article{Griebl2000,
  author   = {Griebl, Martin and Feautrier, Paul and Lengauer, Christian},
  title    = {Index Set Splitting},
  journal  = {International Journal of Parallel Programming},
  year     = {2000},
  month    = dec,
  day      = {01},
  volume   = {28},
  number   = {6},
  pages    = {607--631},
  abstract = {There are many algorithms for the space-time mapping of nested loops. Some of them even make the optimal choices within their framework. We propose a preprocessing phase for algorithms in the polytope model, which extends the model and yields space-time mappings whose schedule is, in some cases, orders of magnitude faster. These are cases in which the dependence graph has small irregularities. The basic idea is to split the index set of the loop nests into parts with a regular dependence structure and apply the existing space-time mapping algorithms to these parts individually. This work is based on a seminal idea in the more limited context of loop parallelization at the code level. We elevate the idea to the model level (our model is the polytope model), which increases its applicability by providing a clearer and wider range of choices at an acceptable analysis cost. Index set splitting is one facet in the effort to extend the power of the polytope model and to enable the generation of competitive target code.},
  issn     = {1573-7640},
  doi      = {10.1023/A:1007516818651},
  url      = {https://doi.org/10.1023/A:1007516818651},
}

We describe the design of a convolutional neural network accelerator running on a Stratix V FPGA. The design runs at three times the throughput of previous FPGA CNN accelerator designs. We show that the throughput/watt is significantly higher than for a GPU, and project the performance when ported to an Arria 10 FPGA.