[AMDGPU] gfx1010 wave32 icmp/fcmp intrinsic changes for wave32

Differential Revision: https://reviews.llvm.org/D63301 llvm-svn: 363339
author: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> 2019-06-13 23:47:36 +0000
committer: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> 2019-06-13 23:47:36 +0000
commit: 68a2fef9ae5beadd0ca7974d936e98caa04aa085 (patch)
tree: ea66f916f5ab7b128571ba39eb8227afc23e83b8 /llvm/lib
parent: 347ec0faa79a26dd1ba2e896b9acb18d8d05fdfc (diff)
download: bcm5719-llvm-68a2fef9ae5beadd0ca7974d936e98caa04aa085.tar.gz
bcm5719-llvm-68a2fef9ae5beadd0ca7974d936e98caa04aa085.zip
6 files changed, 69 insertions, 25 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 4756a77a7ec..1c503c29d55 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -249,7 +249,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   // We need to know how many lanes are active within the wavefront, and we do
   // this by doing a ballot of active lanes.
   CallInst *const Ballot =
-      B.CreateIntrinsic(Intrinsic::amdgcn_icmp, {B.getInt32Ty()},
+      B.CreateIntrinsic(Intrinsic::amdgcn_icmp,
+                        {B.getInt64Ty(), B.getInt32Ty()},
                         {B.getInt32(1), B.getInt32(0), B.getInt32(33)});
 
   // We need to know how many lanes are active within the wavefront that are
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index bccbcd4da16..ca8dc8c07c6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -1028,6 +1028,10 @@ public:
       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
       const override;
 
+  bool isWave32() const {
+    return WavefrontSize == 32;
+  }
+
   /// \returns Maximum number of work groups per compute unit supported by the
   /// subtarget and limited by given \p FlatWorkGroupSize.
   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 15ab80b756e..b764ca7d706 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -12,11 +12,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constant.h"
@@ -55,13 +57,13 @@ class SIAnnotateControlFlow : public FunctionPass {
 
   Type *Boolean;
   Type *Void;
-  Type *Int64;
+  Type *IntMask;
   Type *ReturnStruct;
 
   ConstantInt *BoolTrue;
   ConstantInt *BoolFalse;
   UndefValue *BoolUndef;
-  Constant *Int64Zero;
+  Constant *IntMaskZero;
 
   Function *If;
   Function *Else;
@@ -74,6 +76,8 @@ class SIAnnotateControlFlow : public FunctionPass {
 
   LoopInfo *LI;
 
+  void initialize(Module &M, const GCNSubtarget &ST);
+
   bool isUniform(BranchInst *T);
 
   bool isTopOfStack(BasicBlock *BB);
@@ -103,8 +107,6 @@ public:
 
   SIAnnotateControlFlow() : FunctionPass(ID) {}
 
-  bool doInitialization(Module &M) override;
-
   bool runOnFunction(Function &F) override;
 
   StringRef getPassName() const override { return "SI annotate control flow"; }
@@ -114,6 +116,7 @@ public:
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<LegacyDivergenceAnalysis>();
     AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetPassConfig>();
     FunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -124,31 +127,34 @@ INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
                       "Annotate SI Control Flow", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
 INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
                     "Annotate SI Control Flow", false, false)
 
 char SIAnnotateControlFlow::ID = 0;
 
 /// Initialize all the types and constants used in the pass
-bool SIAnnotateControlFlow::doInitialization(Module &M) {
+void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
   LLVMContext &Context = M.getContext();
 
   Void = Type::getVoidTy(Context);
   Boolean = Type::getInt1Ty(Context);
-  Int64 = Type::getInt64Ty(Context);
-  ReturnStruct = StructType::get(Boolean, Int64);
+  IntMask = ST.isWave32() ? Type::getInt32Ty(Context)
+                           : Type::getInt64Ty(Context);
+  ReturnStruct = StructType::get(Boolean, IntMask);
 
   BoolTrue = ConstantInt::getTrue(Context);
   BoolFalse = ConstantInt::getFalse(Context);
   BoolUndef = UndefValue::get(Boolean);
-  Int64Zero = ConstantInt::get(Int64, 0);
-
-  If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if);
-  Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else);
-  IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break);
-  Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop);
-  EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf);
-  return false;
+  IntMaskZero = ConstantInt::get(IntMask, 0);
+
+  If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if, { IntMask });
+  Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else,
+                                   { IntMask, IntMask });
+  IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
+                                      { IntMask, IntMask });
+  Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
+  EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
 }
 
 /// Is the branch condition uniform or did the StructurizeCFG pass
@@ -258,14 +264,14 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
     return;
 
   BasicBlock *Target = Term->getSuccessor(1);
-  PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front());
+  PHINode *Broken = PHINode::Create(IntMask, 0, "phi.broken", &Target->front());
 
   Value *Cond = Term->getCondition();
   Term->setCondition(BoolTrue);
   Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
 
   for (BasicBlock *Pred : predecessors(Target)) {
-    Value *PHIValue = Int64Zero;
+    Value *PHIValue = IntMaskZero;
     if (Pred == BB) // Remember the value of the previous iteration.
       PHIValue = Arg;
     // If the backedge from Pred to Target could be executed before the exit
@@ -316,6 +322,10 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   DA = &getAnalysis<LegacyDivergenceAnalysis>();
+  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+  const TargetMachine &TM = TPC.getTM<TargetMachine>();
+
+  initialize(*F.getParent(), TM.getSubtarget<GCNSubtarget>(F));
 
   for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
        E = df_end(&F.getEntryBlock()); I != E; ++I) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3bb1ddc6703..92ca105af25 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3839,7 +3839,6 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
 
   ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
 
-
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
 
@@ -3855,8 +3854,14 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
 
   ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
 
-  return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
-                     DAG.getCondCode(CCOpcode));
+  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
+  EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
+
+  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
+                              DAG.getCondCode(CCOpcode));
+  if (VT.bitsEq(CCVT))
+    return SetCC;
+  return DAG.getZExtOrTrunc(SetCC, DL, VT);
 }
 
 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
@@ -3882,8 +3887,13 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
 
   FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
   ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
-  return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
-                     Src1, DAG.getCondCode(CCOpcode));
+  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
+  EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
+  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
+                              Src1, DAG.getCondCode(CCOpcode));
+  if (VT.bitsEq(CCVT))
+    return SetCC;
+  return DAG.getZExtOrTrunc(SetCC, SL, VT);
 }
 
 void SITargetLowering::ReplaceNodeResults(SDNode *N,
@@ -5394,6 +5404,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                           SDLoc(DAG.getEntryNode()),
                           MFI->getArgInfo().WorkItemIDZ);
+  case Intrinsic::amdgcn_wavefrontsize:
+    return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
+                           SDLoc(Op), MVT::i32);
   case Intrinsic::amdgcn_s_buffer_load: {
     unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
     return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
@@ -5598,6 +5611,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_fmad_ftz:
     return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
+
+  case Intrinsic::amdgcn_if_break:
+    return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
+                                      Op->getOperand(1), Op->getOperand(2)), 0);
+
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -6495,6 +6513,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                    M->getMemoryVT(), M->getMemOperand());
   }
 
+  case Intrinsic::amdgcn_end_cf:
+    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
+                                      Op->getOperand(2), Chain), 0);
+
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 4a3e8b3e36b..e6b64ecbfce 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -604,7 +604,12 @@ def : Pat <
   // TODO: we could add more variants for other types of conditionals
 
 def : Pat <
-  (int_amdgcn_icmp i1:$src, (i1 0), (i32 33)),
+  (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
+  (COPY $src) // Return the SGPRs representing i1 src
+>;
+
+def : Pat <
+  (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
   (COPY $src) // Return the SGPRs representing i1 src
 >;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 2c8fa20b259..f167762b602 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3733,7 +3733,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
         break;
 
       Function *NewF =
-          Intrinsic::getDeclaration(II->getModule(), NewIID, SrcLHS->getType());
+          Intrinsic::getDeclaration(II->getModule(), NewIID,
+                                    { II->getType(),
+                                      SrcLHS->getType() });
       Value *Args[] = { SrcLHS, SrcRHS,
                         ConstantInt::get(CC->getType(), SrcPred) };
       CallInst *NewCall = Builder.CreateCall(NewF, Args);
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	2019-06-13 23:47:36 +0000
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	2019-06-13 23:47:36 +0000
commit	68a2fef9ae5beadd0ca7974d936e98caa04aa085 (patch)
tree	ea66f916f5ab7b128571ba39eb8227afc23e83b8 /llvm/lib
parent	347ec0faa79a26dd1ba2e896b9acb18d8d05fdfc (diff)
download	bcm5719-llvm-68a2fef9ae5beadd0ca7974d936e98caa04aa085.tar.gz bcm5719-llvm-68a2fef9ae5beadd0ca7974d936e98caa04aa085.zip