diff options
| author | Sanjay Patel <spatel@rotateright.com> | 2019-12-13 09:40:33 -0500 |
|---|---|---|
| committer | Sanjay Patel <spatel@rotateright.com> | 2019-12-13 14:03:54 -0500 |
| commit | 2f0c7fd2dbd06ae5f25b0c72b2b8f2a1c5baeb72 (patch) | |
| tree | 05694975281fbc7378ff5b8849ef90a8cd0db81d | |
| parent | ed50e6060b1c51ec4a5dad6c01a64a5f1526cdb5 (diff) | |
| download | bcm5719-llvm-2f0c7fd2dbd06ae5f25b0c72b2b8f2a1c5baeb72.tar.gz bcm5719-llvm-2f0c7fd2dbd06ae5f25b0c72b2b8f2a1c5baeb72.zip | |
[DAGCombiner] fold shift-trunc-shift to shift-mask-trunc (2nd try)
The initial attempt (rG89633320) botched the logic by reversing
the source/dest types. Added x86 tests for additional coverage.
The vector tests show a potential improvement (fold vector load
instead of broadcasting), but that's a known/existing problem.
This fold is done in IR by instcombine, and we already have a special
form of it here in DAGCombiner, but we want the more general
transform too:
https://rise4fun.com/Alive/3jZm
Name: general
Pre: (C1 + zext(C2) < 64)
%s = lshr i64 %x, C1
%t = trunc i64 %s to i16
%r = lshr i16 %t, C2
=>
%s2 = lshr i64 %x, C1 + zext(C2)
%a = and i64 %s2, zext((1 << (16 - C2)) - 1)
%r = trunc i64 %a to i16
Name: special
Pre: C1 == 48
%s = lshr i64 %x, C1
%t = trunc i64 %s to i16
%r = lshr i16 %t, C2
=>
%s2 = lshr i64 %x, C1 + zext(C2)
%r = trunc i64 %s2 to i16
...because D58017 exposes a regression without this fold.
| -rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AArch64/shift-amount-mod.ll | 3 | ||||
| -rw-r--r-- | llvm/test/CodeGen/PowerPC/trunc-srl-load.ll | 3 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/shift-amount-mod.ll | 8 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shift-lshr-256.ll | 42 |
5 files changed, 44 insertions, 26 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 15c2be1740a..95127d75665 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7943,6 +7943,20 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { InnerShift.getOperand(0), NewShiftAmt); return DAG.getNode(ISD::TRUNCATE, DL, VT, NewShift); } + // In the more general case, we can clear the high bits after the shift: + // srl (trunc (srl x, c1)), c2 --> trunc (and (srl x, (c1+c2)), Mask) + if (N0.hasOneUse() && InnerShift.hasOneUse() && + c1 + c2 < InnerShiftSize) { + SDLoc DL(N); + SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT); + SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT, + InnerShift.getOperand(0), NewShiftAmt); + SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(InnerShiftSize, + OpSizeInBits - c2), + DL, InnerShiftVT); + SDValue And = DAG.getNode(ISD::AND, DL, InnerShiftVT, NewShift, Mask); + return DAG.getNode(ISD::TRUNCATE, DL, VT, And); + } } } diff --git a/llvm/test/CodeGen/AArch64/shift-amount-mod.ll b/llvm/test/CodeGen/AArch64/shift-amount-mod.ll index 4f6051e2a6c..403839044cd 100644 --- a/llvm/test/CodeGen/AArch64/shift-amount-mod.ll +++ b/llvm/test/CodeGen/AArch64/shift-amount-mod.ll @@ -670,8 +670,7 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b define i32 @t(i64 %x) { ; CHECK-LABEL: t: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #13 -; CHECK-NEXT: ubfx x0, x8, #4, #28 +; CHECK-NEXT: ubfx x0, x0, #17, #28 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %s = lshr i64 %x, 13 diff --git a/llvm/test/CodeGen/PowerPC/trunc-srl-load.ll b/llvm/test/CodeGen/PowerPC/trunc-srl-load.ll index a1af256eccb..5dc0534ef44 100644 --- a/llvm/test/CodeGen/PowerPC/trunc-srl-load.ll +++ b/llvm/test/CodeGen/PowerPC/trunc-srl-load.ll @@ -25,8 +25,7 @@ cond.false: ; preds = %entry define i32 @sh_trunc_sh(i64 %x) { ; 
CHECK-LABEL: sh_trunc_sh: ; CHECK: # %bb.0: -; CHECK-NEXT: rldicl 3, 3, 51, 13 -; CHECK-NEXT: srwi 3, 3, 4 +; CHECK-NEXT: rldicl 3, 3, 47, 36 ; CHECK-NEXT: blr %s = lshr i64 %x, 13 %t = trunc i64 %s to i32 diff --git a/llvm/test/CodeGen/X86/shift-amount-mod.ll b/llvm/test/CodeGen/X86/shift-amount-mod.ll index bccb3607c98..6c0527c91ff 100644 --- a/llvm/test/CodeGen/X86/shift-amount-mod.ll +++ b/llvm/test/CodeGen/X86/shift-amount-mod.ll @@ -1564,10 +1564,10 @@ define i16 @sh_trunc_sh(i64 %x) { ; ; X64-LABEL: sh_trunc_sh: ; X64: # %bb.0: -; X64-NEXT: shrq $24, %rdi -; X64-NEXT: movzwl %di, %eax -; X64-NEXT: shrl $12, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq $36, %rax +; X64-NEXT: andl $15, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %s = lshr i64 %x, 24 %t = trunc i64 %s to i16 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll index c448921db7d..24395c9169f 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -1399,71 +1399,77 @@ define <4 x i32> @sh_trunc_sh_vec(<4 x i64> %x) { ; AVX1-LABEL: sh_trunc_sh_vec: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsrlq $24, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $24, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $36, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $36, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpsrld $12, %xmm0, %xmm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: sh_trunc_sh_vec: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlq $24, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $36, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-NEXT: vpsrld $12, %xmm0, %xmm0 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575] +; AVX2-NEXT: 
vandps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: sh_trunc_sh_vec: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[3,4,5,6,11,12,13,14],xmm1[3,4,5,6,11,12,13,14] -; XOPAVX1-NEXT: vpsrld $12, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpsrlq $36, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsrlq $36, %xmm0, %xmm0 +; XOPAVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; XOPAVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: vzeroupper ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: sh_trunc_sh_vec: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsrlq $24, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpsrlq $36, %ymm0, %ymm0 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; XOPAVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; XOPAVX2-NEXT: vpsrld $12, %xmm0, %xmm0 +; XOPAVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575] +; XOPAVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vzeroupper ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: sh_trunc_sh_vec: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlq $24, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlq $36, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpsrld $12, %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: sh_trunc_sh_vec: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlq $24, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlq $36, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512VL-NEXT: vpsrld $12, %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; X32-AVX1-LABEL: sh_trunc_sh_vec: ; X32-AVX1: # %bb.0: ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X32-AVX1-NEXT: vpsrlq $24, %xmm1, %xmm1 -; X32-AVX1-NEXT: vpsrlq $24, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlq $36, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlq $36, %xmm0, %xmm0 
; X32-AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; X32-AVX1-NEXT: vpsrld $12, %xmm0, %xmm0 +; X32-AVX1-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 ; X32-AVX1-NEXT: vzeroupper ; X32-AVX1-NEXT: retl ; ; X32-AVX2-LABEL: sh_trunc_sh_vec: ; X32-AVX2: # %bb.0: -; X32-AVX2-NEXT: vpsrlq $24, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlq $36, %ymm0, %ymm0 ; X32-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X32-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; X32-AVX2-NEXT: vpsrld $12, %xmm0, %xmm0 +; X32-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575] +; X32-AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 ; X32-AVX2-NEXT: vzeroupper ; X32-AVX2-NEXT: retl %s = lshr <4 x i64> %x, <i64 24, i64 24, i64 24, i64 24> |

