% IPDPS 2014 conference paper by Yilmazer, Chen and Kaeli on scalar waving for GPU SIMD execution.
% Acronyms in the title are brace-protected so sentence-casing styles keep their capitals;
% the percent sign in the abstract is escaped so it cannot act as a TeX comment if the field is typeset.
@inproceedings{70d2263f54874e1bb435e04c49369ebf,
  author    = {Yilmazer, Ayse and Chen, Zhongliang and Kaeli, David},
  title     = {Scalar waving: Improving the efficiency of {SIMD} execution on {GPUs}},
  booktitle = {Proceedings - {IEEE} 28th International Parallel and Distributed Processing Symposium, {IPDPS} 2014},
  series    = {Proceedings of the International Parallel and Distributed Processing Symposium, {IPDPS}},
  pages     = {103--112},
  publisher = {IEEE Computer Society},
  address   = {United States},
  year      = {2014},
  doi       = {10.1109/IPDPS.2014.22},
  isbn      = {9780769552071},
  language  = {English},
  keywords  = {GPU, Redundant Computation, Scalar Waving, SIMD Efficiency},
  abstract  = {GPUs take advantage of uniformity in program control flow and utilize SIMD execution to obtain execution efficiency. In SIMD execution, threads are batched into SIMD groups to share a common program counter and execute identical instructions on SIMD pipelines. Previous research has shown that there is a significant number of scalar instructions - instructions where different threads in a SIMD group execute using the same input operands and generate the exact same output - present in a range of applications. GPUs eliminate redundant fetches and decodes by utilizing a shared common pipeline front-end. However, most GPUs do not handle scalar instruction efficiently, allowing these instructions to be redundantly executed by the threads in a SIMD group. In this paper, we propose to use scalar execution to eliminate redundant execution of scalar instructions. We introduce scalar waving as a mechanism to batch scalar operations possessing the same PC and execute them as a group on SIMD lanes for efficiency. We also propose simultaneous execution of dynamically-formed scalar waves with SIMD groups to overcome the under-utilization of SIMD lanes when encountering divergence. We evaluate our work using 22 different GPU benchmarks taken from 4 different benchmark suites. We evaluate a range of configurations using timing simulation. Our results show that scalar waving can obtain up to a 25\% improvement in performance on average. Our experiments also provide insight into the amount of performance gain that we can expect with scalar waving as a function of the scalar content, occupancy, and memory characteristics of the target application.},
  note      = {28th IEEE International Parallel and Distributed Processing Symposium, IPDPS 2014 ; Conference date: 19-05-2014 Through 23-05-2014},
}