summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Artem Belevich <tra@google.com> 2019-09-03 17:31:58 +0000
committer Artem Belevich <tra@google.com> 2019-09-03 17:31:58 +0000
commit ce94ec661f53892e2af3c40a3800e5b9258618cd (patch)
tree dc67782f2bf50235ba104289631ddf78f3475fbf
parent 37e2f5f125e1e9553d8a55261fae40e0bde1ada2 (diff)
download bcm5719-llvm-ce94ec661f53892e2af3c40a3800e5b9258618cd.tar.gz
bcm5719-llvm-ce94ec661f53892e2af3c40a3800e5b9258618cd.zip
[CUDA] Use activemask.b32 instruction to implement __activemask w/ CUDA-9.2+
The vote.ballot instruction is gone in recent CUDA versions, and vote.sync.ballot cannot be used because it needs a thread mask parameter. Fortunately, PTX 6.2 (introduced with CUDA-9.2) provides the activemask.b32 instruction for this. Differential Revision: https://reviews.llvm.org/D66665 llvm-svn: 370792
-rw-r--r-- clang/lib/Headers/__clang_cuda_intrinsics.h 10
1 file changed, 9 insertions, 1 deletion
diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h
index 2970d17f89e..b67461a146f 100644
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -211,7 +211,15 @@ inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
return __nvvm_vote_ballot_sync(mask, pred);
}
-inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
+inline __device__ unsigned int __activemask() {
+#if CUDA_VERSION < 9020
+ return __nvvm_vote_ballot(1);
+#else
+ unsigned int mask;
+ asm volatile("activemask.b32 %0;" : "=r"(mask));
+ return mask;
+#endif
+}
inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
return __nvvm_fns(mask, base, offset);
OpenPOWER on IntegriCloud