AMDGPU: Add DS append/consume intrinsics

Since these pass the pointer in m0, unlike other DS instructions, they need to worry about whether the address is uniform or not. This assumes the address is dynamically uniform, and just uses readfirstlane to get a copy into an SGPR. I don't know whether these have the same problem on SI with the 16-bit add used for the addressing-mode offset, but I've just assumed they do. Also includes some miscellaneous changes to avoid test differences between the LDS and GDS versions. llvm-svn: 352422
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2019-01-28 20:14:49 +0000
committer: Matt Arsenault <Matthew.Arsenault@amd.com> 2019-01-28 20:14:49 +0000
commit: cdd191d9db6a17b75b5f6d6f3d1d2691ac198153 (patch)
tree: 3ade69542218a8b35a4658c843389b8a1fd01274 /llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
parent: e4e9ba2bea60c73a19d8a673e947f0d9b258a777 (diff)
download: bcm5719-llvm-cdd191d9db6a17b75b5f6d6f3d1d2691ac198153.tar.gz
bcm5719-llvm-cdd191d9db6a17b75b5f6d6f3d1d2691ac198153.zip
1 files changed, 72 insertions, 15 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 1bc6be45056..2cdd691fc10 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -106,12 +106,13 @@ private:
 
   MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
 
-  SDNode *glueCopyToM0(SDNode *N) const;
+  SDNode *glueCopyToM0LDSInit(SDNode *N) const;
+  SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
 
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
   virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
   virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
-  bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+  bool isDSOffsetLegal(SDValue Base, unsigned Offset,
                        unsigned OffsetBits) const;
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
@@ -209,6 +210,7 @@ private:
   void SelectBRCOND(SDNode *N);
   void SelectFMAD_FMA(SDNode *N);
   void SelectATOMIC_CMP_SWAP(SDNode *N);
+  void SelectINTRINSIC_W_CHAIN(SDNode *N);
 
 protected:
   // Include the pieces autogenerated from the target description.
@@ -339,29 +341,32 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
   }
 }
 
-SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
-  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
-      !Subtarget->ldsRequiresM0Init())
-    return N;
-
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
   const SITargetLowering& Lowering =
-      *static_cast<const SITargetLowering*>(getTargetLowering());
+    *static_cast<const SITargetLowering*>(getTargetLowering());
 
   // Write max value to m0 before each load operation
 
   SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
-                                 CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+                                 Val);
 
   SDValue Glue = M0.getValue(1);
 
   SmallVector <SDValue, 8> Ops;
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-     Ops.push_back(N->getOperand(i));
-  }
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    Ops.push_back(N->getOperand(i));
+
   Ops.push_back(Glue);
   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
 }
 
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
+  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
+      !Subtarget->ldsRequiresM0Init())
+    return N;
+  return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+}
+
 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                   EVT VT) const {
   SDNode *Lo = CurDAG->getMachineNode(
@@ -472,7 +477,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
        Opc == ISD::ATOMIC_LOAD_FADD ||
        Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
        Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
-    N = glueCopyToM0(N);
+    N = glueCopyToM0LDSInit(N);
 
   switch (Opc) {
   default:
@@ -570,7 +575,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case ISD::STORE:
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE: {
-    N = glueCopyToM0(N);
+    N = glueCopyToM0LDSInit(N);
     break;
   }
 
@@ -648,6 +653,12 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
       SelectCode(N);
       return;
     }
+
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    SelectINTRINSIC_W_CHAIN(N);
+    return;
   }
   }
 
@@ -828,7 +839,7 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
 }
 
-bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
                                          unsigned OffsetBits) const {
   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
       (OffsetBits == 8 && !isUInt<8>(Offset)))
@@ -1760,6 +1771,52 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
   CurDAG->RemoveDeadNode(N);
 }
 
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
+  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  if ((IntrID != Intrinsic::amdgcn_ds_append &&
+       IntrID != Intrinsic::amdgcn_ds_consume) ||
+      N->getValueType(0) != MVT::i32) {
+    SelectCode(N);
+    return;
+  }
+
+  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
+  // be copied to an SGPR with readfirstlane.
+  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
+    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Ptr = N->getOperand(2);
+  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+
+  SDValue Offset;
+  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
+    SDValue PtrBase = Ptr.getOperand(0);
+    SDValue PtrOffset = Ptr.getOperand(1);
+
+    const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
+    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
+      N = glueCopyToM0(N, PtrBase);
+      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
+    }
+  }
+
+  if (!Offset) {
+    N = glueCopyToM0(N, Ptr);
+    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+  }
+
+  SDValue Ops[] = {
+    Offset,
+    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
+    Chain,
+    N->getOperand(N->getNumOperands() - 1) // New glue
+  };
+
+  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                             unsigned &Mods) const {
   Mods = 0;
author	Matt Arsenault <Matthew.Arsenault@amd.com>	2019-01-28 20:14:49 +0000
committer	Matt Arsenault <Matthew.Arsenault@amd.com>	2019-01-28 20:14:49 +0000
commit	cdd191d9db6a17b75b5f6d6f3d1d2691ac198153 (patch)
tree	3ade69542218a8b35a4658c843389b8a1fd01274 /llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
parent	e4e9ba2bea60c73a19d8a673e947f0d9b258a777 (diff)
download	bcm5719-llvm-cdd191d9db6a17b75b5f6d6f3d1d2691ac198153.tar.gz bcm5719-llvm-cdd191d9db6a17b75b5f6d6f3d1d2691ac198153.zip