-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp    |  30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll | 102
2 files changed, 132 insertions(+), 0 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index bfd28b93569..32dc2a7afce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3144,6 +3144,36 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
}
}
+  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
+  //
+  // i16 (trunc (srl i64:x, K)), K <= 16 ->
+  //     i16 (trunc (srl (i32 (trunc x), K)))
+  if (VT.getScalarSizeInBits() < 32) {
+    EVT SrcVT = Src.getValueType();
+    if (SrcVT.getScalarSizeInBits() > 32 &&
+        (Src.getOpcode() == ISD::SRL ||
+         Src.getOpcode() == ISD::SRA ||
+         Src.getOpcode() == ISD::SHL)) {
+      if (auto *ShiftAmount = isConstOrConstSplat(Src.getOperand(1))) {
+        if (ShiftAmount->getZExtValue() <= VT.getScalarSizeInBits()) {
+          EVT MidVT = VT.isVector() ?
+            EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                             VT.getVectorNumElements()) : MVT::i32;
+
+          EVT ShiftTy = getShiftAmountTy(MidVT, DAG.getDataLayout());
+          SDValue NewShiftAmt = DAG.getConstant(ShiftAmount->getZExtValue(),
+                                                SL, ShiftTy);
+          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
+                                      Src.getOperand(0));
+          DCI.AddToWorklist(Trunc.getNode());
+          SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
+                                            Trunc, NewShiftAmt);
+          return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
+        }
+      }
+    }
+  }
+
return SDValue();
}
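The bound checked above (shift amount <= truncated width) is what makes the shrink safe: with K <= 16, bits [K, K+15] of the source feed the i16 result for srl/sra, and bits [0, 15-K] feed it for shl, all of which lie within the low 32 bits of the 64-bit source. A standalone C++ sanity check of the srl case (a sketch, not part of the patch; the helper names are illustrative only):

#include <cassert>
#include <cstdint>

// Original pattern: i16 (trunc (srl i64:x, K)).
static uint16_t trunc_srl_i64(uint64_t X, unsigned K) {
  return static_cast<uint16_t>(X >> K);
}

// Shrunk pattern: i16 (trunc (srl (i32 (trunc x)), K)).
static uint16_t trunc_srl_i32(uint64_t X, unsigned K) {
  return static_cast<uint16_t>(static_cast<uint32_t>(X) >> K);
}

int main() {
  const uint64_t Inputs[] = {0, ~0ULL, 0x8000000000000000ULL,
                             0x0123456789abcdefULL};
  for (unsigned K = 0; K <= 16; ++K) // K <= 16 is the combine's bound
    for (uint64_t X : Inputs)
      assert(trunc_srl_i64(X, K) == trunc_srl_i32(X, K));
  return 0;
}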
diff --git a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
new file mode 100644
index 00000000000..65307ca6fa9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
@@ -0,0 +1,102 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+
+; Test combine to reduce the width of a 64-bit shift to 32-bit if
+; truncated to 16-bit.
+
+; GCN-LABEL: {{^}}trunc_srl_i64_16_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_srl_i64_16_to_i16(i64 %x) {
+ %shift = lshr i64 %x, 16
+ %trunc = trunc i64 %shift to i16
+ ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_srl_i64_17_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b64 v[0:1], 17, v[0:1]
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_srl_i64_17_to_i16(i64 %x) {
+ %shift = lshr i64 %x, 17
+ %trunc = trunc i64 %shift to i16
+ ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_srl_i55_16_to_i15:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 15, v0
+; GCN-NEXT: v_add_u16_e32 v0, 4, v0
+; GCN-NEXT: s_setpc_b64
+define i15 @trunc_srl_i55_16_to_i15(i55 %x) {
+ %shift = lshr i55 %x, 15
+ %trunc = trunc i55 %shift to i15
+ %add = add i15 %trunc, 4
+ ret i15 %add
+}
+
+; GCN-LABEL: {{^}}trunc_sra_i64_16_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_sra_i64_16_to_i16(i64 %x) {
+ %shift = ashr i64 %x, 16
+ %trunc = trunc i64 %shift to i16
+ ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_sra_i64_17_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_lshrrev_b64 v[0:1], 17, v[0:1]
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_sra_i64_17_to_i16(i64 %x) {
+ %shift = ashr i64 %x, 17
+ %trunc = trunc i64 %shift to i16
+ ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_shl_i64_16_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_shl_i64_16_to_i16(i64 %x) {
+ %shift = shl i64 %x, 16
+ %trunc = trunc i64 %shift to i16
+ ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_shl_i64_17_to_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_shl_i64_17_to_i16(i64 %x) {
+ %shift = shl i64 %x, 17
+ %trunc = trunc i64 %shift to i16
+ ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_srl_v2i64_16_to_v2i16:
+; GCN: s_waitcnt
+; GCN-DAG: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0000
+; GCN: v_and_or_b32 v0, v2, [[MASK]], v0
+; GCN-NEXT: s_setpc_b64
+define <2 x i16> @trunc_srl_v2i64_16_to_v2i16(<2 x i64> %x) {
+ %shift = lshr <2 x i64> %x, <i64 16, i64 16>
+ %trunc = trunc <2 x i64> %shift to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+; GCN-LABEL: {{^}}s_trunc_srl_i64_16_to_i16:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_lshr_b32 [[VAL_SHIFT:s[0-9]+]], [[VAL]], 16
+; GCN: s_or_b32 [[RESULT:s[0-9]+]], [[VAL_SHIFT]], 4
+; GCN: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
+; GCN: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
+define amdgpu_kernel void @s_trunc_srl_i64_16_to_i16(i64 %x) {
+ %shift = lshr i64 %x, 16
+ %trunc = trunc i64 %shift to i16
+ %add = or i16 %trunc, 4
+ store i16 %add, i16 addrspace(1)* undef
+ ret void
+}
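A note on the two shl tests above: they expect a plain v_mov_b32_e32 v0, 0 rather than a shrunken shift, because a left shift by 16 or more leaves no source bits in a 16-bit truncation, so the whole expression folds to the constant 0 (presumably via generic demanded-bits simplification rather than this combine). A minimal check of that arithmetic (a sketch, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Inputs[] = {0, ~0ULL, 0x0123456789abcdefULL};
  for (uint64_t X : Inputs) {
    // i16 (trunc (shl i64:x, K)) is identically 0 once K >= 16.
    assert(static_cast<uint16_t>(X << 16) == 0);
    assert(static_cast<uint16_t>(X << 17) == 0);
  }
  return 0;
}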