summaryrefslogtreecommitdiffstats
path: root/clang/lib/CodeGen/CGBuiltin.cpp
diff options
context:
space:
mode:
author: Justin Lebar <jlebar@google.com> 2016-05-19 22:49:13 +0000
committer: Justin Lebar <jlebar@google.com> 2016-05-19 22:49:13 +0000
commit: 2e4ecfdebe8fa73ab4ed6f738307339ee9586418 (patch)
tree: a8eb6c205f01da79d919bb3f573cb92f11100a40 /clang/lib/CodeGen/CGBuiltin.cpp
parent: b926bdac4c18e0f31d827dec482f207856e88e1e (diff)
download: bcm5719-llvm-2e4ecfdebe8fa73ab4ed6f738307339ee9586418.tar.gz
bcm5719-llvm-2e4ecfdebe8fa73ab4ed6f738307339ee9586418.zip
[CUDA] Implement __ldg using intrinsics.
Summary: Previously it was implemented as inline asm in the CUDA headers. This change allows us to use the [addr+imm] addressing mode when executing ld.global.nc instructions. This translates into a 1.3x speedup on some benchmarks that call this instruction from within an unrolled loop.

Reviewers: tra, rsmith

Subscribers: jhen, cfe-commits, jholewinski

Differential Revision: http://reviews.llvm.org/D19990

llvm-svn: 270150
Diffstat (limited to 'clang/lib/CodeGen/CGBuiltin.cpp')
-rw-r--r-- clang/lib/CodeGen/CGBuiltin.cpp | 45
1 files changed, 45 insertions, 0 deletions
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a68394bfc71..afc308d7f7a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -7349,6 +7349,17 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID,
Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
+ auto MakeLdg = [&](unsigned IntrinsicID) { // Helper: emits a call to the intrinsic IntrinsicID for the builtin call E.
+ Value *Ptr = EmitScalarExpr(E->getArg(0)); // Pointer operand: the builtin's first argument, emitted as a scalar.
+ AlignmentSource AlignSource; // Only passed as an out-parameter below; not read again in this lambda.
+ clang::CharUnits Align =
+ getNaturalPointeeTypeAlignment(E->getArg(0)->getType(), &AlignSource); // Natural alignment of the pointee type of argument 0.
+ return Builder.CreateCall(
+ CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(),
+ Ptr->getType()}), // Type arguments given to getIntrinsic: the pointee type, then the pointer type.
+ {Ptr, ConstantInt::get(Builder.getInt32Ty(), Align.getQuantity())}); // Call operands: the pointer and Align's quantity as an i32 constant.
+ };
+
switch (BuiltinID) {
case NVPTX::BI__nvvm_atom_add_gen_i:
case NVPTX::BI__nvvm_atom_add_gen_l:
@@ -7433,6 +7444,40 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
return Builder.CreateCall(FnALD32, {Ptr, Val});
}
+ case NVPTX::BI__nvvm_ldg_c:
+ case NVPTX::BI__nvvm_ldg_c2:
+ case NVPTX::BI__nvvm_ldg_c4:
+ case NVPTX::BI__nvvm_ldg_s:
+ case NVPTX::BI__nvvm_ldg_s2:
+ case NVPTX::BI__nvvm_ldg_s4:
+ case NVPTX::BI__nvvm_ldg_i:
+ case NVPTX::BI__nvvm_ldg_i2:
+ case NVPTX::BI__nvvm_ldg_i4:
+ case NVPTX::BI__nvvm_ldg_l:
+ case NVPTX::BI__nvvm_ldg_ll:
+ case NVPTX::BI__nvvm_ldg_ll2:
+ case NVPTX::BI__nvvm_ldg_uc:
+ case NVPTX::BI__nvvm_ldg_uc2:
+ case NVPTX::BI__nvvm_ldg_uc4:
+ case NVPTX::BI__nvvm_ldg_us:
+ case NVPTX::BI__nvvm_ldg_us2:
+ case NVPTX::BI__nvvm_ldg_us4:
+ case NVPTX::BI__nvvm_ldg_ui:
+ case NVPTX::BI__nvvm_ldg_ui2:
+ case NVPTX::BI__nvvm_ldg_ui4:
+ case NVPTX::BI__nvvm_ldg_ul:
+ case NVPTX::BI__nvvm_ldg_ull:
+ case NVPTX::BI__nvvm_ldg_ull2:
+ // PTX Interoperability section 2.2: "For a vector with an even number of
+ // elements, its alignment is set to number of elements times the alignment
+ // of its member: n*alignof(t)."
+ return MakeLdg(Intrinsic::nvvm_ldg_global_i);
+ case NVPTX::BI__nvvm_ldg_f:
+ case NVPTX::BI__nvvm_ldg_f2:
+ case NVPTX::BI__nvvm_ldg_f4:
+ case NVPTX::BI__nvvm_ldg_d:
+ case NVPTX::BI__nvvm_ldg_d2:
+ return MakeLdg(Intrinsic::nvvm_ldg_global_f);
default:
return nullptr;
}
OpenPOWER on IntegriCloud