[CUDA] Implement __ldg using intrinsics.

Summary: Previously it was implemented as inline asm in the CUDA headers. This change allows us to use the [addr+imm] addressing mode when executing ld.global.nc instructions. This translates into a 1.3x speedup on some benchmarks that call this instruction from within an unrolled loop. Reviewers: tra, rsmith Subscribers: jhen, cfe-commits, jholewinski Differential Revision: http://reviews.llvm.org/D19990 llvm-svn: 270150
author: Justin Lebar <jlebar@google.com> 2016-05-19 22:49:13 +0000
committer: Justin Lebar <jlebar@google.com> 2016-05-19 22:49:13 +0000
commit: 2e4ecfdebe8fa73ab4ed6f738307339ee9586418 (patch)
tree: a8eb6c205f01da79d919bb3f573cb92f11100a40 /clang/lib/CodeGen/CGBuiltin.cpp
parent: b926bdac4c18e0f31d827dec482f207856e88e1e (diff)
download: bcm5719-llvm-2e4ecfdebe8fa73ab4ed6f738307339ee9586418.tar.gz
bcm5719-llvm-2e4ecfdebe8fa73ab4ed6f738307339ee9586418.zip
1 files changed, 45 insertions, 0 deletions
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a68394bfc71..afc308d7f7a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -7349,6 +7349,17 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID,
 
 Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E) {
+  auto MakeLdg = [&](unsigned IntrinsicID) {
+    Value *Ptr = EmitScalarExpr(E->getArg(0));
+    AlignmentSource AlignSource;
+    clang::CharUnits Align =
+        getNaturalPointeeTypeAlignment(E->getArg(0)->getType(), &AlignSource);
+    return Builder.CreateCall(
+        CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(),
+                                       Ptr->getType()}),
+        {Ptr, ConstantInt::get(Builder.getInt32Ty(), Align.getQuantity())});
+  };
+
   switch (BuiltinID) {
   case NVPTX::BI__nvvm_atom_add_gen_i:
   case NVPTX::BI__nvvm_atom_add_gen_l:
@@ -7433,6 +7444,40 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
     return Builder.CreateCall(FnALD32, {Ptr, Val});
   }
 
+  case NVPTX::BI__nvvm_ldg_c:
+  case NVPTX::BI__nvvm_ldg_c2:
+  case NVPTX::BI__nvvm_ldg_c4:
+  case NVPTX::BI__nvvm_ldg_s:
+  case NVPTX::BI__nvvm_ldg_s2:
+  case NVPTX::BI__nvvm_ldg_s4:
+  case NVPTX::BI__nvvm_ldg_i:
+  case NVPTX::BI__nvvm_ldg_i2:
+  case NVPTX::BI__nvvm_ldg_i4:
+  case NVPTX::BI__nvvm_ldg_l:
+  case NVPTX::BI__nvvm_ldg_ll:
+  case NVPTX::BI__nvvm_ldg_ll2:
+  case NVPTX::BI__nvvm_ldg_uc:
+  case NVPTX::BI__nvvm_ldg_uc2:
+  case NVPTX::BI__nvvm_ldg_uc4:
+  case NVPTX::BI__nvvm_ldg_us:
+  case NVPTX::BI__nvvm_ldg_us2:
+  case NVPTX::BI__nvvm_ldg_us4:
+  case NVPTX::BI__nvvm_ldg_ui:
+  case NVPTX::BI__nvvm_ldg_ui2:
+  case NVPTX::BI__nvvm_ldg_ui4:
+  case NVPTX::BI__nvvm_ldg_ul:
+  case NVPTX::BI__nvvm_ldg_ull:
+  case NVPTX::BI__nvvm_ldg_ull2:
+    // PTX Interoperability section 2.2: "For a vector with an even number of
+    // elements, its alignment is set to number of elements times the alignment
+    // of its member: n*alignof(t)."
+    return MakeLdg(Intrinsic::nvvm_ldg_global_i);
+  case NVPTX::BI__nvvm_ldg_f:
+  case NVPTX::BI__nvvm_ldg_f2:
+  case NVPTX::BI__nvvm_ldg_f4:
+  case NVPTX::BI__nvvm_ldg_d:
+  case NVPTX::BI__nvvm_ldg_d2:
+    return MakeLdg(Intrinsic::nvvm_ldg_global_f);
   default:
     return nullptr;
   }
author	Justin Lebar <jlebar@google.com>	2016-05-19 22:49:13 +0000
committer	Justin Lebar <jlebar@google.com>	2016-05-19 22:49:13 +0000
commit	2e4ecfdebe8fa73ab4ed6f738307339ee9586418 (patch)
tree	a8eb6c205f01da79d919bb3f573cb92f11100a40 /clang/lib/CodeGen/CGBuiltin.cpp
parent	b926bdac4c18e0f31d827dec482f207856e88e1e (diff)
download	bcm5719-llvm-2e4ecfdebe8fa73ab4ed6f738307339ee9586418.tar.gz bcm5719-llvm-2e4ecfdebe8fa73ab4ed6f738307339ee9586418.zip