From 5da831cc3109ff1fc5c8f43be2cf1a1f52e030da Mon Sep 17 00:00:00 2001
From: Jingyue Wu <jingyue@google.com>
Date: Fri, 10 Apr 2015 05:03:50 +0000
Subject: Divergence analysis for GPU programs

Summary:
Some optimizations such as jump threading and loop unswitching can negatively
affect performance when applied to divergent branches. The divergence analysis
added in this patch conservatively estimates which branches in a GPU program
can diverge. This information can then help LLVM to run certain optimizations
selectively.

Test Plan: test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll

Reviewers: resistor, hfinkel, eliben, meheff, jholewinski

Subscribers: broune, bjarke.roune, madhur13490, tstellarAMD, dberlin, echristo, jholewinski, llvm-commits

Differential Revision: http://reviews.llvm.org/D8576

llvm-svn: 234567
---
 llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 70 ++++++++++++++++++++++
 1 file changed, 70 insertions(+)

(limited to 'llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp')
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index b8af04de24a..dc81802f4b5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "NVPTXTargetTransformInfo.h"
+#include "NVPTXUtilities.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -19,6 +20,75 @@ using namespace llvm;
 
 #define DEBUG_TYPE "NVPTXtti"
 
+// Whether the given intrinsic reads threadIdx.x/y/z.
+static bool readsThreadIndex(const IntrinsicInst *II) {
+  switch (II->getIntrinsicID()) {
+    default: return false;
+    case Intrinsic::nvvm_read_ptx_sreg_tid_x:
+    case Intrinsic::nvvm_read_ptx_sreg_tid_y:
+    case Intrinsic::nvvm_read_ptx_sreg_tid_z:
+      return true;
+  }
+}
+
+static bool readsLaneId(const IntrinsicInst *II) {
+  return II->getIntrinsicID() == Intrinsic::ptx_read_laneid;
+}
+
+// Whether the given intrinsic is an atomic instruction in PTX.
+static bool isNVVMAtomic(const IntrinsicInst *II) {
+  switch (II->getIntrinsicID()) {
+    default: return false;
+    case Intrinsic::nvvm_atomic_load_add_f32:
+    case Intrinsic::nvvm_atomic_load_inc_32:
+    case Intrinsic::nvvm_atomic_load_dec_32:
+      return true;
+  }
+}
+
+bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
+  // Without inter-procedural analysis, we conservatively assume that arguments
+  // to __device__ functions are divergent.
+  if (const Argument *Arg = dyn_cast<Argument>(V))
+    return !isKernelFunction(*Arg->getParent());
+
+  if (const Instruction *I = dyn_cast<Instruction>(V)) {
+    // Without pointer analysis, we conservatively assume values loaded from
+    // generic or local address space are divergent.
+    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      unsigned AS = LI->getPointerAddressSpace();
+      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
+    }
+    // Atomic instructions may cause divergence. Atomic instructions are
+    // executed sequentially across all threads in a warp. Therefore, an earlier
+    // executed thread may see different memory inputs than a later executed
+    // thread. For example, suppose *a = 0 initially.
+    //
+    //   atom.global.add.s32 d, [a], 1
+    //
+    // returns 0 for the first thread that enters the critical region, and 1 for
+    // the second thread.
+    if (I->isAtomic())
+      return true;
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      // Instructions that read threadIdx are obviously divergent.
+      if (readsThreadIndex(II) || readsLaneId(II))
+        return true;
+      // Handle the NVPTX atomic instrinsics that cannot be represented as an
+      // atomic IR instruction.
+      if (isNVVMAtomic(II))
+        return true;
+    }
+    // Conservatively consider the return value of function calls as divergent.
+    // We could analyze callees with bodies more precisely using
+    // inter-procedural analysis.
+    if (isa<CallInst>(I))
+      return true;
+  }
+
+  return false;
+}
+
 unsigned NVPTXTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
-- 
cgit v1.2.3