summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp 30
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h 1
-rw-r--r-- llvm/test/CodeGen/AMDGPU/mad_int24.ll 105
3 files changed, 129 insertions, 7 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index fcf17407b9e..0d82de55665 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -506,6 +506,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FABS);
setTargetDAGCombine(ISD::AssertZext);
setTargetDAGCombine(ISD::AssertSext);
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
}
//===----------------------------------------------------------------------===//
@@ -2771,8 +2772,16 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
static SDValue simplifyI24(SDNode *Node24,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
- SDValue LHS = Node24->getOperand(0);
- SDValue RHS = Node24->getOperand(1);
+ bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
+
+ SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
+ SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
+ unsigned NewOpcode = Node24->getOpcode();
+ if (IsIntrin) {
+ unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
+ NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
+ AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+ }
APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
@@ -2782,7 +2791,7 @@ static SDValue simplifyI24(SDNode *Node24,
SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
if (DemandedLHS || DemandedRHS)
- return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(),
+ return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
DemandedLHS ? DemandedLHS : LHS,
DemandedRHS ? DemandedRHS : RHS);
@@ -3020,6 +3029,19 @@ SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
return SDValue();
}
+
+SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( // Combine hook reached from PerformDAGCombine for ISD::INTRINSIC_WO_CHAIN nodes.
+ SDNode *N, DAGCombinerInfo &DCI) const { // N: the intrinsic node; DCI: combiner state forwarded unchanged to simplifyI24.
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); // Operand 0 is read as a constant holding the intrinsic ID.
+ switch (IID) {
+ case Intrinsic::amdgcn_mul_i24:
+ case Intrinsic::amdgcn_mul_u24:
+ return simplifyI24(N, DCI); // Both 24-bit multiply intrinsics share the simplifyI24 combine.
+ default:
+ return SDValue(); // Any other intrinsic: default-constructed SDValue, the same value PerformDAGCombine returns when nothing matches.
+ }
+}
+
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
@@ -4108,6 +4130,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AssertZext:
case ISD::AssertSext:
return performAssertSZExtCombine(N, DCI);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return performIntrinsicWOChainCombine(N, DCI);
}
return SDValue();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 421054ef783..ab18bbe822a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -78,6 +78,7 @@ protected:
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
unsigned Opc, SDValue LHS,
diff --git a/llvm/test/CodeGen/AMDGPU/mad_int24.ll b/llvm/test/CodeGen/AMDGPU/mad_int24.ll
index af0159aa9b1..7ac2ff4be08 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_int24.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
@@ -9,8 +9,8 @@
; Make sure we aren't masking the inputs.
; CM-NOT: AND
; CM: MULADD_INT24
-; SI-NOT: and
-; SI: v_mad_i32_i24
+; GCN-NOT: and
+; GCN: v_mad_i32_i24
define amdgpu_kernel void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
entry:
%0 = shl i32 %a, 8
@@ -22,3 +22,100 @@ entry:
store i32 %3, i32 addrspace(1)* %out
ret void
}
+
+; GCN-LABEL: {{^}}mad24_known_bits_destroyed:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mad_i32_i24
+; GCN-NEXT: v_mul_i32_i24
+; GCN-NEXT: s_setpc_b64
+define i32 @mad24_known_bits_destroyed(i32 %a, i32 %b, i32 %c) { ; the checks above allow only one mad and one mul between the waitcnt and the return
+
+ %shl.0 = shl i32 %a, 8
+ %sra.0 = ashr i32 %shl.0, 8 ; shl 8 + ashr 8 sign-extends the low 24 bits of %a
+ %shl.1 = shl i32 %b, 8
+ %sra.1 = ashr i32 %shl.1, 8 ; same 24-bit sign-extension for %b
+
+ %mul0 = mul nsw i32 %sra.0, %sra.1 ; multiply of two 24-bit sign-extended values
+ %add0 = add nsw i32 %mul0, %c ; mul + add pair (multiply-add shape)
+
+ %shl.2 = shl i32 %add0, 8
+ %sra.2 = ashr i32 %shl.2, 8 ; sign-extend the low 24 bits of the multiply-add result
+
+ %shl.3 = shl i32 %sra.0, 8
+ %sra.3 = ashr i32 %shl.3, 8 ; repeats the extension on %sra.0, which is already sign-extended from 24 bits
+
+ %mul1 = mul nsw i32 %sra.2, %sra.3 ; second multiply, both inputs sign-extended from 24 bits
+ ret i32 %mul1
+}
+
+; GCN-LABEL: {{^}}mad24_intrin_known_bits_destroyed:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mad_i32_i24
+; GCN-NEXT: v_mul_i32_i24
+; GCN-NEXT: s_setpc_b64
+define i32 @mad24_intrin_known_bits_destroyed(i32 %a, i32 %b, i32 %c) { ; same shape as mad24_known_bits_destroyed, but the first multiply is the intrinsic call
+ %shl.0 = shl i32 %a, 8
+ %sra.0 = ashr i32 %shl.0, 8 ; shl 8 + ashr 8 sign-extends the low 24 bits of %a
+ %shl.1 = shl i32 %b, 8
+ %sra.1 = ashr i32 %shl.1, 8 ; same 24-bit sign-extension for %b
+
+ %mul0 = call i32 @llvm.amdgcn.mul.i24(i32 %sra.0, i32 %sra.1) ; first multiply goes through the mul.i24 intrinsic instead of a plain mul
+ %add0 = add nsw i32 %mul0, %c ; add fed by the intrinsic result
+
+ %shl.2 = shl i32 %add0, 8
+ %sra.2 = ashr i32 %shl.2, 8 ; sign-extend the low 24 bits of the sum
+
+ %shl.3 = shl i32 %sra.0, 8
+ %sra.3 = ashr i32 %shl.3, 8 ; repeats the extension on %sra.0, which is already sign-extended from 24 bits
+
+ %mul1 = mul nsw i32 %sra.2, %sra.3 ; second multiply, written as a plain mul
+ ret i32 %mul1
+}
+
+; Make sure no unnecessary BFEs are emitted in the loop.
+; GCN-LABEL: {{^}}mad24_destroyed_knownbits_2:
+; GCN-NOT: v_bfe
+; GCN: v_mad_i32_i24
+; GCN-NOT: v_bfe
+; GCN: v_mad_i32_i24
+; GCN-NOT: v_bfe
+; GCN: v_mad_i32_i24
+; GCN-NOT: v_bfe
+; GCN: v_mad_i32_i24
+; GCN-NOT: v_bfe
+define void @mad24_destroyed_knownbits_2(i32 %arg, i32 %arg1, i32 %arg2, i32 addrspace(1)* %arg3) { ; loop body chains four mul + add pairs, one per mad expected by the checks above
+bb:
+ br label %bb6
+
+bb5: ; preds = %bb6
+ ret void
+
+bb6: ; preds = %bb6, %bb
+ %tmp = phi i32 [ %tmp27, %bb6 ], [ 0, %bb ] ; iteration counter, 0 on entry
+ %tmp7 = phi i32 [ %arg2, %bb6 ], [ 1, %bb ] ; 1 on entry, %arg2 on every later iteration
+ %tmp8 = phi i32 [ %tmp26, %bb6 ], [ %arg, %bb ] ; value carried around the loop, seeded with %arg
+ %tmp9 = shl i32 %tmp7, 8
+ %tmp10 = ashr exact i32 %tmp9, 8 ; sign-extend the low 24 bits of %tmp7
+ %tmp11 = shl i32 %tmp8, 8
+ %tmp12 = ashr exact i32 %tmp11, 8 ; sign-extend the low 24 bits of %tmp8
+ %tmp13 = mul nsw i32 %tmp12, %tmp10
+ %tmp14 = add nsw i32 %tmp13, %tmp7 ; first mul + add pair
+ %tmp15 = shl i32 %tmp14, 8
+ %tmp16 = ashr exact i32 %tmp15, 8 ; sign-extend the low 24 bits of the first result
+ %tmp17 = mul nsw i32 %tmp16, %tmp10
+ %tmp18 = add nsw i32 %tmp17, %tmp14 ; second mul + add pair
+ %tmp19 = shl i32 %tmp18, 8
+ %tmp20 = ashr exact i32 %tmp19, 8 ; sign-extend the low 24 bits of the second result
+ %tmp21 = mul nsw i32 %tmp20, %tmp16
+ %tmp22 = add nsw i32 %tmp21, %tmp18 ; third mul + add pair
+ %tmp23 = shl i32 %tmp22, 8
+ %tmp24 = ashr exact i32 %tmp23, 8 ; sign-extend the low 24 bits of the third result
+ %tmp25 = mul nsw i32 %tmp24, %tmp20
+ %tmp26 = add nsw i32 %tmp25, %tmp22 ; fourth mul + add pair; feeds %tmp8 on the next iteration
+ store i32 %tmp26, i32 addrspace(1)* %arg3 ; written to the same address every iteration
+ %tmp27 = add nuw i32 %tmp, 1
+ %tmp28 = icmp eq i32 %tmp27, %arg1 ; leave the loop once the counter equals %arg1
+ br i1 %tmp28, label %bb5, label %bb6
+}
+
+declare i32 @llvm.amdgcn.mul.i24(i32, i32)
OpenPOWER on IntegriCloud