author | Justin Lebar <jlebar@google.com> | 2016-05-19 22:49:13 +0000
---|---|---
committer | Justin Lebar <jlebar@google.com> | 2016-05-19 22:49:13 +0000
commit | 2e4ecfdebe8fa73ab4ed6f738307339ee9586418 (patch) |
tree | a8eb6c205f01da79d919bb3f573cb92f11100a40 /clang/lib |
parent | b926bdac4c18e0f31d827dec482f207856e88e1e (diff) |
[CUDA] Implement __ldg using intrinsics.
Summary:
Previously, __ldg was implemented as inline asm in the CUDA headers.
This change lets us use the [addr+imm] addressing mode when emitting
ld.global.nc instructions, which translates into a 1.3x speedup on
some benchmarks that execute this instruction from within an unrolled
loop.
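
For context, the access pattern that benefits looks roughly like the sketch
below. The kernel is hypothetical and not part of this patch; only __ldg and
the ld.global.nc / [addr+imm] lowering come from the change.

```cuda
// Hypothetical device code, for illustration only.  With __ldg lowered through
// a builtin, the backend can fold the constant element offsets produced by the
// unrolled loop into the ld.global.nc address operand ([ptr+imm]) instead of
// computing a separate address for every load, as the old inline-asm
// definition forced it to.
__global__ void sum4(const float *in, float *out) {
  int base = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
  float acc = 0.0f;
#pragma unroll
  for (int i = 0; i < 4; ++i)
    acc += __ldg(&in[base + i]); // offsets 0, 4, 8, 12 bytes from in + base
  out[blockIdx.x * blockDim.x + threadIdx.x] = acc;
}
```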
Reviewers: tra, rsmith
Subscribers: jhen, cfe-commits, jholewinski
Differential Revision: http://reviews.llvm.org/D19990
llvm-svn: 270150
Diffstat (limited to 'clang/lib')
-rw-r--r-- | clang/lib/CodeGen/CGBuiltin.cpp                  |  45 |
-rw-r--r-- | clang/lib/Headers/CMakeLists.txt                 |   1 |
-rw-r--r-- | clang/lib/Headers/__clang_cuda_intrinsics.h      | 256 |
-rw-r--r-- | clang/lib/Headers/__clang_cuda_runtime_wrapper.h |   6 |
4 files changed, 307 insertions(+), 1 deletion(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a68394bfc71..afc308d7f7a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -7349,6 +7349,17 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID,
 
 Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E) {
+  auto MakeLdg = [&](unsigned IntrinsicID) {
+    Value *Ptr = EmitScalarExpr(E->getArg(0));
+    AlignmentSource AlignSource;
+    clang::CharUnits Align =
+        getNaturalPointeeTypeAlignment(E->getArg(0)->getType(), &AlignSource);
+    return Builder.CreateCall(
+        CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(),
+                                       Ptr->getType()}),
+        {Ptr, ConstantInt::get(Builder.getInt32Ty(), Align.getQuantity())});
+  };
+
   switch (BuiltinID) {
   case NVPTX::BI__nvvm_atom_add_gen_i:
   case NVPTX::BI__nvvm_atom_add_gen_l:
@@ -7433,6 +7444,40 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
     return Builder.CreateCall(FnALD32, {Ptr, Val});
   }
 
+  case NVPTX::BI__nvvm_ldg_c:
+  case NVPTX::BI__nvvm_ldg_c2:
+  case NVPTX::BI__nvvm_ldg_c4:
+  case NVPTX::BI__nvvm_ldg_s:
+  case NVPTX::BI__nvvm_ldg_s2:
+  case NVPTX::BI__nvvm_ldg_s4:
+  case NVPTX::BI__nvvm_ldg_i:
+  case NVPTX::BI__nvvm_ldg_i2:
+  case NVPTX::BI__nvvm_ldg_i4:
+  case NVPTX::BI__nvvm_ldg_l:
+  case NVPTX::BI__nvvm_ldg_ll:
+  case NVPTX::BI__nvvm_ldg_ll2:
+  case NVPTX::BI__nvvm_ldg_uc:
+  case NVPTX::BI__nvvm_ldg_uc2:
+  case NVPTX::BI__nvvm_ldg_uc4:
+  case NVPTX::BI__nvvm_ldg_us:
+  case NVPTX::BI__nvvm_ldg_us2:
+  case NVPTX::BI__nvvm_ldg_us4:
+  case NVPTX::BI__nvvm_ldg_ui:
+  case NVPTX::BI__nvvm_ldg_ui2:
+  case NVPTX::BI__nvvm_ldg_ui4:
+  case NVPTX::BI__nvvm_ldg_ul:
+  case NVPTX::BI__nvvm_ldg_ull:
+  case NVPTX::BI__nvvm_ldg_ull2:
+    // PTX Interoperability section 2.2: "For a vector with an even number of
+    // elements, its alignment is set to number of elements times the alignment
+    // of its member: n*alignof(t)."
+    return MakeLdg(Intrinsic::nvvm_ldg_global_i);
+  case NVPTX::BI__nvvm_ldg_f:
+  case NVPTX::BI__nvvm_ldg_f2:
+  case NVPTX::BI__nvvm_ldg_f4:
+  case NVPTX::BI__nvvm_ldg_d:
+  case NVPTX::BI__nvvm_ldg_d2:
+    return MakeLdg(Intrinsic::nvvm_ldg_global_f);
   default:
     return nullptr;
   }
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 46e574e47c3..e4beccc6af3 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -21,6 +21,7 @@ set(files
   bmi2intrin.h
   bmiintrin.h
   __clang_cuda_cmath.h
+  __clang_cuda_intrinsics.h
   __clang_cuda_math_forward_declares.h
   __clang_cuda_runtime_wrapper.h
   cpuid.h
diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h
new file mode 100644
index 00000000000..f58a03ad2c4
--- /dev/null
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -0,0 +1,256 @@
+/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_INTRINSICS_H__
+#define __CLANG_CUDA_INTRINSICS_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
+
+// Prevent the vanilla sm_32 intrinsics header from being included.
+#define __SM_32_INTRINSICS_H__
+#define __SM_32_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
+inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
+inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
+inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
+inline __device__ long long __ldg(const long long *ptr) {
+  return __nvvm_ldg_ll(ptr);
+}
+inline __device__ unsigned char __ldg(const unsigned char *ptr) {
+  return __nvvm_ldg_uc(ptr);
+}
+inline __device__ unsigned short __ldg(const unsigned short *ptr) {
+  return __nvvm_ldg_us(ptr);
+}
+inline __device__ unsigned int __ldg(const unsigned int *ptr) {
+  return __nvvm_ldg_ui(ptr);
+}
+inline __device__ unsigned long __ldg(const unsigned long *ptr) {
+  return __nvvm_ldg_ul(ptr);
+}
+inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
+  return __nvvm_ldg_ull(ptr);
+}
+inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
+inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
+
+inline __device__ char2 __ldg(const char2 *ptr) {
+  typedef char c2 __attribute__((ext_vector_type(2)));
+  // We can assume that ptr is aligned at least to char2's alignment, but the
+  // load will assume that ptr is aligned to char2's alignment. This is only
+  // safe if alignof(c2) <= alignof(char2).
+  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
+  char2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ char4 __ldg(const char4 *ptr) {
+  typedef char c4 __attribute__((ext_vector_type(4)));
+  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
+  char4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ short2 __ldg(const short2 *ptr) {
+  typedef short s2 __attribute__((ext_vector_type(2)));
+  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
+  short2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ short4 __ldg(const short4 *ptr) {
+  typedef short s4 __attribute__((ext_vector_type(4)));
+  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
+  short4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ int2 __ldg(const int2 *ptr) {
+  typedef int i2 __attribute__((ext_vector_type(2)));
+  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
+  int2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ int4 __ldg(const int4 *ptr) {
+  typedef int i4 __attribute__((ext_vector_type(4)));
+  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
+  int4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ longlong2 __ldg(const longlong2 *ptr) {
+  typedef long long ll2 __attribute__((ext_vector_type(2)));
+  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
+  longlong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ uchar2 __ldg(const uchar2 *ptr) {
+  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
+  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
+  uchar2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uchar4 __ldg(const uchar4 *ptr) {
+  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
+  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
+  uchar4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ ushort2 __ldg(const ushort2 *ptr) {
+  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
+  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
+  ushort2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ ushort4 __ldg(const ushort4 *ptr) {
+  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
+  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
+  ushort4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ uint2 __ldg(const uint2 *ptr) {
+  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
+  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
+  uint2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uint4 __ldg(const uint4 *ptr) {
+  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
+  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
+  uint4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
+  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
+  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
+  ulonglong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ float2 __ldg(const float2 *ptr) {
+  typedef float f2 __attribute__((ext_vector_type(2)));
+  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
+  float2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ float4 __ldg(const float4 *ptr) {
+  typedef float f4 __attribute__((ext_vector_type(4)));
+  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
+  float4 ret;
+  ret.w = rv[0];
+  ret.x = rv[1];
+  ret.y = rv[2];
+  ret.z = rv[3];
+  return ret;
+}
+inline __device__ double2 __ldg(const double2 *ptr) {
+  typedef double d2 __attribute__((ext_vector_type(2)));
+  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
+  double2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+// TODO: Implement these as intrinsics, so the backend can work its magic on
+// these. Alternatively, we could implement these as plain C and try to get
+// llvm to recognize the relevant patterns.
+inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned ret;
+  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(ret)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return ret;
+}
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+#endif // defined(__CLANG_CUDA_INTRINSICS_H__)
diff --git a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
index 4ad240ff1d6..3e41eabac03 100644
--- a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -188,7 +188,10 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
 // sm_30_intrinsics.h has declarations that use default argument, so
 // we have to include it and it will in turn include .hpp
 #include "sm_30_intrinsics.h"
-#include "sm_32_intrinsics.hpp"
+
+// Don't include sm_32_intrinsics.h. That header defines __ldg using inline
+// asm, but we want to define it using builtins, because we can't use the
+// [addr+imm] addressing mode if we use the inline asm in the header.
 
 #undef __MATH_FUNCTIONS_HPP__
 
@@ -278,6 +281,7 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
 }
 
 #include <__clang_cuda_cmath.h>
+#include <__clang_cuda_intrinsics.h>
 
 // curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
 // mode, giving them their "proper" types of dim3 and uint3. This is
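
Usage is unchanged for CUDA sources compiled with clang: the runtime wrapper
now pulls in __clang_cuda_intrinsics.h automatically, so the overloads above
resolve to the __nvvm_ldg_* builtins instead of inline asm. A minimal,
hypothetical sketch (not part of the patch):

```cuda
// Hypothetical kernel using one of the vector overloads defined above; the
// __ldg call compiles to an nvvm ldg intrinsic rather than inline asm, and
// from there to a read-only cached ld.global.nc load.
__global__ void copy4(const float4 *in, float4 *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = __ldg(&in[i]); // 16-byte vector load through the float4 overload
}
```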