-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp    |  30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll | 102
2 files changed, 132 insertions(+), 0 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index bfd28b93569..32dc2a7afce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3144,6 +3144,36 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
}
}
+  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
+  //
+  // i16 (trunc (srl i64:x, K)), K <= 16 ->
+  //     i16 (trunc (srl (i32 (trunc x), K)))
+  if (VT.getScalarSizeInBits() < 32) {
+    EVT SrcVT = Src.getValueType();
+    if (SrcVT.getScalarSizeInBits() > 32 &&
+        (Src.getOpcode() == ISD::SRL ||
+         Src.getOpcode() == ISD::SRA ||
+         Src.getOpcode() == ISD::SHL)) {
+      if (auto *ShiftAmount = isConstOrConstSplat(Src.getOperand(1))) {
+        if (ShiftAmount->getZExtValue() <= VT.getScalarSizeInBits()) {
+          EVT MidVT = VT.isVector() ?
+            EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                             VT.getVectorNumElements()) : MVT::i32;
+
+          EVT ShiftTy = getShiftAmountTy(MidVT, DAG.getDataLayout());
+          SDValue NewShiftAmt = DAG.getConstant(ShiftAmount->getZExtValue(),
+                                                SL, ShiftTy);
+          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
+                                      Src.getOperand(0));
+          DCI.AddToWorklist(Trunc.getNode());
+          SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
+                                            Trunc, NewShiftAmt);
+          return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
+        }
+      }
+    }
+  }
+
return SDValue();
}
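The bound checked above (shift amount <= truncated width) is what makes the shrink safe: with K <= 16, bits [K, K+15] of the source feed the i16 result for srl/sra, and bits [0, 15-K] feed it for shl, all of which lie within the low 32 bits of the 64-bit source. A standalone C++ sanity check of the srl case (a sketch, not part of the patch; the helper names are illustrative only):

#include <cassert>
#include <cstdint>

// Original pattern: i16 (trunc (srl i64:x, K)).
static uint16_t trunc_srl_i64(uint64_t X, unsigned K) {
  return static_cast<uint16_t>(X >> K);
}

// Shrunk pattern: i16 (trunc (srl (i32 (trunc x)), K)).
static uint16_t trunc_srl_i32(uint64_t X, unsigned K) {
  return static_cast<uint16_t>(static_cast<uint32_t>(X) >> K);
}

int main() {
  const uint64_t Inputs[] = {0, ~0ULL, 0x8000000000000000ULL,
                             0x0123456789abcdefULL};
  for (unsigned K = 0; K <= 16; ++K) // K <= 16 is the combine's bound
    for (uint64_t X : Inputs)
      assert(trunc_srl_i64(X, K) == trunc_srl_i32(X, K));
  return 0;
}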
diff --git a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
new file mode 100644
index 00000000000..65307ca6fa9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
@@ -0,0 +1,102 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+
+; Test combine to reduce the width of a 64-bit shift to 32-bit if
+; truncated to 16-bit.
+
+; GCN-LABEL: {{^}}trunc_srl_i64_16_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_srl_i64_16_to_i16(i64 %x) {
+ %shift = lshr i64 %x, 16
+ %trunc = trunc i64 %shift to i16
+ ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_srl_i64_17_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b64 v[0:1], 17, v[0:1]
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_srl_i64_17_to_i16(i64 %x) {
+ %shift = lshr i64 %x, 17
+ %trunc = trunc i64 %shift to i16
+ ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_srl_i55_16_to_i15:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 15, v0
+; GCN-NEXT: v_add_u16_e32 v0, 4, v0
+; GCN-NEXT: s_setpc_b64
+define i15 @trunc_srl_i55_16_to_i15(i55 %x) {
+ %shift = lshr i55 %x, 15
+ %trunc = trunc i55 %shift to i15
+ %add = add i15 %trunc, 4
+ ret i15 %add
+}
+
+; GCN-LABEL: {{^}}trunc_sra_i64_16_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_sra_i64_16_to_i16(i64 %x) {
+ %shift = ashr i64 %x, 16
+ %trunc = trunc i64 %shift to i16
+ ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_sra_i64_17_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b64 v[0:1], 17, v[0:1]
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_sra_i64_17_to_i16(i64 %x) {
+ %shift = ashr i64 %x, 17
+ %trunc = trunc i64 %shift to i16
+ ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_shl_i64_16_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_shl_i64_16_to_i16(i64 %x) {
+ %shift = shl i64 %x, 16
+ %trunc = trunc i64 %shift to i16
+ ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_shl_i64_17_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_shl_i64_17_to_i16(i64 %x) {
+ %shift = shl i64 %x, 17
+ %trunc = trunc i64 %shift to i16
+ ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_srl_v2i64_16_to_v2i16:
+; GCN: s_waitcnt
+; GCN-DAG: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0000
+; GCN: v_and_or_b32 v0, v2, [[MASK]], v0
+; GCN-NEXT: s_setpc_b64
+define <2 x i16> @trunc_srl_v2i64_16_to_v2i16(<2 x i64> %x) {
+ %shift = lshr <2 x i64> %x, <i64 16, i64 16>
+ %trunc = trunc <2 x i64> %shift to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+; GCN-LABEL: {{^}}s_trunc_srl_i64_16_to_i16:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_lshr_b32 [[VAL_SHIFT:s[0-9]+]], [[VAL]], 16
+; GCN: s_or_b32 [[RESULT:s[0-9]+]], [[VAL_SHIFT]], 4
+; GCN: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
+; GCN: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
+define amdgpu_kernel void @s_trunc_srl_i64_16_to_i16(i64 %x) {
+ %shift = lshr i64 %x, 16
+ %trunc = trunc i64 %shift to i16
+ %add = or i16 %trunc, 4
+ store i16 %add, i16 addrspace(1)* undef
+ ret void
+}
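A note on the two shl tests above: they expect a plain v_mov_b32_e32 v0, 0 rather than a shrunken shift, because a left shift by 16 or more leaves no source bits in a 16-bit truncation, so the whole expression folds to the constant 0 (presumably via generic demanded-bits simplification rather than this combine). A minimal check of that arithmetic (a sketch, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Inputs[] = {0, ~0ULL, 0x0123456789abcdefULL};
  for (uint64_t X : Inputs) {
    // i16 (trunc (shl i64:x, K)) is identically 0 once K >= 16.
    assert(static_cast<uint16_t>(X << 16) == 0);
    assert(static_cast<uint16_t>(X << 17) == 0);
  }
  return 0;
}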