author    Matt Arsenault <Matthew.Arsenault@amd.com> 2015-07-14 18:20:33 +0000
committer Matt Arsenault <Matthew.Arsenault@amd.com> 2015-07-14 18:20:33 +0000
commit    24692118bad46f907f80086f0e83c19f3905f5c3 (patch)
tree      3b27c6aab3027b6172c5f35f25bfe9340c558025 /llvm/lib
parent    5c6efed3f5cc85b55ea87e774b6a7701fe1d13df (diff)
AMDGPU: Avoid using 64-bit shift for i64 (shl x, 32)
This can be done only with moves, which theoretically will optimize better later.

Although this transform increases the instruction count, it should be code size / cycle count neutral in the worst VALU case. It also seems to slightly improve a couple of testcases due to other DAG combines this exposes.

This is probably slightly worse for the SALU case, so it might be better to handle this during moveToVALU, although then you lose some simplifications like the load width reduction in the simple testcase.

llvm-svn: 242177
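For reference, a minimal standalone C++ sketch (not part of the commit) of the value identity the combine relies on: an i64 shifted left by exactly 32 is the same value as a pair whose low 32 bits are zero and whose high 32 bits are the truncated source. Variable names here are illustrative only.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t x = 0x0123456789abcdefULL;

  // The 64-bit shift the combine avoids emitting.
  uint64_t shifted = x << 32;

  // The same value assembled from two 32-bit halves ("moves"):
  // zero in the low half, trunc(x) in the high half.
  uint32_t hi = static_cast<uint32_t>(x); // i64 -> i32 truncate
  uint64_t paired = (static_cast<uint64_t>(hi) << 32) | UINT64_C(0);

  assert(shifted == paired);
  return 0;
}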
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 34
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |  1
2 files changed, 35 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3b68a92cd5f..3a65f3b5614 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -406,6 +406,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
 
+  setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::SELECT_CC);
@@ -2415,6 +2416,33 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                       SN->getBasePtr(), SN->getMemOperand());
 }
 
+SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  if (N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  // i64 (shl x, 32) -> (build_pair 0, x)
+
+  // Doing this with moves theoretically helps MI optimizations that understand
+  // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
+  // v_lshl_b64. In the SALU case, I think this is slightly worse since it
+  // doubles the code size and I'm unsure about cycle count.
+  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!RHS || RHS->getZExtValue() != 32)
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+
+  // Extract low 32-bits.
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo);
+}
+
 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
@@ -2454,6 +2482,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
   switch(N->getOpcode()) {
   default:
     break;
+  case ISD::SHL: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+      break;
+
+    return performShlCombine(N, DCI);
+  }
   case ISD::MUL:
     return performMulCombine(N, DCI);
   case AMDGPUISD::MUL_I24:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index bba303d3e4b..478b2035fd7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -65,6 +65,7 @@ private:
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
 protected: