diff options
author | Justin Lebar <jlebar@google.com> | 2016-05-19 22:49:13 +0000 |
---|---|---|
committer | Justin Lebar <jlebar@google.com> | 2016-05-19 22:49:13 +0000 |
commit | 2e4ecfdebe8fa73ab4ed6f738307339ee9586418 (patch) | |
tree | a8eb6c205f01da79d919bb3f573cb92f11100a40 /clang/lib/CodeGen/CGBuiltin.cpp | |
parent | b926bdac4c18e0f31d827dec482f207856e88e1e (diff) | |
download | bcm5719-llvm-2e4ecfdebe8fa73ab4ed6f738307339ee9586418.tar.gz bcm5719-llvm-2e4ecfdebe8fa73ab4ed6f738307339ee9586418.zip |
[CUDA] Implement __ldg using intrinsics.
Summary:
Previously it was implemented as inline asm in the CUDA headers.
This change allows us to use the [addr+imm] addressing mode when
executing ld.global.nc instructions. This translates into a 1.3x
speedup on some benchmarks that call this instruction from within an
unrolled loop.
Reviewers: tra, rsmith
Subscribers: jhen, cfe-commits, jholewinski
Differential Revision: http://reviews.llvm.org/D19990
llvm-svn: 270150
Diffstat (limited to 'clang/lib/CodeGen/CGBuiltin.cpp')
-rw-r--r-- | clang/lib/CodeGen/CGBuiltin.cpp | 45 |
1 files changed, 45 insertions, 0 deletions
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a68394bfc71..afc308d7f7a 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -7349,6 +7349,17 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { + auto MakeLdg = [&](unsigned IntrinsicID) { + Value *Ptr = EmitScalarExpr(E->getArg(0)); + AlignmentSource AlignSource; + clang::CharUnits Align = + getNaturalPointeeTypeAlignment(E->getArg(0)->getType(), &AlignSource); + return Builder.CreateCall( + CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(), + Ptr->getType()}), + {Ptr, ConstantInt::get(Builder.getInt32Ty(), Align.getQuantity())}); + }; + switch (BuiltinID) { case NVPTX::BI__nvvm_atom_add_gen_i: case NVPTX::BI__nvvm_atom_add_gen_l: @@ -7433,6 +7444,40 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(FnALD32, {Ptr, Val}); } + case NVPTX::BI__nvvm_ldg_c: + case NVPTX::BI__nvvm_ldg_c2: + case NVPTX::BI__nvvm_ldg_c4: + case NVPTX::BI__nvvm_ldg_s: + case NVPTX::BI__nvvm_ldg_s2: + case NVPTX::BI__nvvm_ldg_s4: + case NVPTX::BI__nvvm_ldg_i: + case NVPTX::BI__nvvm_ldg_i2: + case NVPTX::BI__nvvm_ldg_i4: + case NVPTX::BI__nvvm_ldg_l: + case NVPTX::BI__nvvm_ldg_ll: + case NVPTX::BI__nvvm_ldg_ll2: + case NVPTX::BI__nvvm_ldg_uc: + case NVPTX::BI__nvvm_ldg_uc2: + case NVPTX::BI__nvvm_ldg_uc4: + case NVPTX::BI__nvvm_ldg_us: + case NVPTX::BI__nvvm_ldg_us2: + case NVPTX::BI__nvvm_ldg_us4: + case NVPTX::BI__nvvm_ldg_ui: + case NVPTX::BI__nvvm_ldg_ui2: + case NVPTX::BI__nvvm_ldg_ui4: + case NVPTX::BI__nvvm_ldg_ul: + case NVPTX::BI__nvvm_ldg_ull: + case NVPTX::BI__nvvm_ldg_ull2: + // PTX Interoperability section 2.2: "For a vector with an even number of + // elements, its alignment is set to number of elements times the alignment + // of its member: n*alignof(t)." + return MakeLdg(Intrinsic::nvvm_ldg_global_i); + case NVPTX::BI__nvvm_ldg_f: + case NVPTX::BI__nvvm_ldg_f2: + case NVPTX::BI__nvvm_ldg_f4: + case NVPTX::BI__nvvm_ldg_d: + case NVPTX::BI__nvvm_ldg_d2: + return MakeLdg(Intrinsic::nvvm_ldg_global_f); default: return nullptr; } |