| author | Eli Friedman <efriedma@codeaurora.org> | 2018-12-20 23:39:54 +0000 |
| --- | --- | --- |
| committer | Eli Friedman <efriedma@codeaurora.org> | 2018-12-20 23:39:54 +0000 |
| commit | b1bbd5dca3d5ee3c6155232a2d506bb804b1461b (patch) | |
| tree | 5090297413d2bbbc8d09af96967ffe08598ed511 | |
| parent | 4c7f5d5c5aeeb4336da0f044ab543d76971068f4 (diff) | |
[ARM] Complete the Thumb1 shift+and->shift+shift transforms.
This saves materializing the immediate. The additional forms are less
common (they don't usually show up for bitfield insert/extract), but
they're still relevant.
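For reference, the newly handled forms both reduce an and-plus-shift to two immediate shifts. Below is a minimal standalone check of the underlying bit identities; it is not part of the patch, and the sample constants are taken from the new tests further down:

```cpp
// Sanity check (not from the patch) for the two new "reversed" rewrites:
//   left shift, then mask off trailing bits:
//     (x << c2) & c1, where ~c1 is a mask          ==> (x >> (c3 - c2)) << c3
//   right shift, then mask off trailing bits:
//     (x >> c2) & c1, c1 a shifted mask, clz(c1) == c2
//                                                  ==> (x >> (c2 + c3)) << c3
// with c3 = number of trailing zero bits of c1.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t i = 0; i < 100000; ++i) {
    uint32_t x = i * 0x9E3779B9u; // arbitrary sample values
    // test14 shape: (x << 4) & -1024  ->  lsrs #6; lsls #10
    assert(((x << 4) & ~1023u) == ((x >> 6) << 10));
    // test11 shape: (x >> 22) & 1020  ->  lsrs #24; lsls #2
    assert(((x >> 22) & 1020u) == ((x >> 24) << 2));
  }
  return 0;
}
```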
I had to add a new target hook to prevent DAGCombine from reversing the
transform. That isn't the only possible way to solve the conflict, but
it seems straightforward enough.
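The conflict in question: `DAGCombiner::visitSHL` folds `(shl (srl x, c1), c2)` back into a shift-plus-AND, which would undo the ARM combine below and let the two combines fight each other. The new hook gives the target a veto. The ARM override added by this patch condenses to the following (an equivalent short form of the long-form code in the diff):

```cpp
// Equivalent condensed form of the override added in ARMISelLowering.cpp:
// allow DAGCombine's shift-pair -> mask fold everywhere except on Thumb1
// after type legalization, where the AND mask would have to be materialized
// in a register.
bool ARMTargetLowering::shouldFoldShiftPairToMask(const SDNode *N,
                                                  CombineLevel Level) const {
  return !Subtarget->isThumb1Only() || Level == BeforeLegalizeTypes;
}
```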
Differential Revision: https://reviews.llvm.org/D55630
llvm-svn: 349857
-rw-r--r-- | llvm/include/llvm/CodeGen/TargetLowering.h | 9
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3
-rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 53
-rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.h | 2
-rw-r--r-- | llvm/test/CodeGen/Thumb/shift-and.ll | 75
5 files changed, 125 insertions, 17 deletions
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index d4e6da80a4c..9cd4af9b22e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3090,6 +3090,15 @@ public:
     return true;
   }
 
+  /// Return true if it is profitable to fold a pair of shifts into a mask.
+  /// This is usually true on most targets. But some targets, like Thumb1,
+  /// have immediate shift instructions, but no immediate "and" instruction;
+  /// this makes the fold unprofitable.
+  virtual bool shouldFoldShiftPairToMask(const SDNode *N,
+                                         CombineLevel Level) const {
+    return true;
+  }
+
   // Return true if it is profitable to combine a BUILD_VECTOR with a stride-pattern
   // to a shuffle and a truncate.
   // Example of such a combine:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a027d2906c8..272630e32dd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6518,7 +6518,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   //                               (and (srl x, (sub c1, c2), MASK)
   // Only fold this if the inner shift has no other uses -- if it does, folding
   // this will increase the total number of instructions.
-  if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
+  if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&
+      TLI.shouldFoldShiftPairToMask(N, Level)) {
     if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
       uint64_t c1 = N0C1->getZExtValue();
       if (c1 < OpSizeInBits) {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 8d5cf0a2c1c..22f8f7865fb 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -10432,6 +10432,18 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
   return false;
 }
 
+bool
+ARMTargetLowering::shouldFoldShiftPairToMask(const SDNode *N,
+                                             CombineLevel Level) const {
+  if (!Subtarget->isThumb1Only())
+    return true;
+
+  if (Level == BeforeLegalizeTypes)
+    return true;
+
+  return false;
+}
+
 static SDValue PerformSHLSimplify(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *ST) {
@@ -10735,6 +10747,12 @@ static SDValue CombineANDShift(SDNode *N,
   if (!C2 || C2 >= 32)
     return SDValue();
 
+  // Clear irrelevant bits in the mask.
+  if (LeftShift)
+    C1 &= (-1U << C2);
+  else
+    C1 &= (-1U >> C2);
+
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
 
@@ -10742,9 +10760,7 @@ static SDValue CombineANDShift(SDNode *N,
   // We have a pattern of the form "(and (shl x, c2) c1)" or
   // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
   // transform to a pair of shifts, to save materializing c1.
-  // First pattern: right shift, and c1+1 is a power of two.
-  // FIXME: Also check reversed pattern (left shift, and ~c1+1 is a power
-  // of two).
+  // First pattern: right shift, then mask off leading bits.
   // FIXME: Use demanded bits?
   if (!LeftShift && isMask_32(C1)) {
     uint32_t C3 = countLeadingZeros(C1);
@@ -10756,13 +10772,23 @@ static SDValue CombineANDShift(SDNode *N,
     }
   }
 
-  // Second pattern: left shift, and (c1>>c2)+1 is a power of two.
-  // FIXME: Also check reversed pattern (right shift, and ~(c1<<c2)+1
-  // is a power of two).
+  // First pattern, reversed: left shift, then mask off trailing bits.
+  if (LeftShift && isMask_32(~C1)) {
+    uint32_t C3 = countTrailingZeros(C1);
+    if (C2 < C3) {
+      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
+                                DAG.getConstant(C3 - C2, DL, MVT::i32));
+      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
+                         DAG.getConstant(C3, DL, MVT::i32));
+    }
+  }
+
+  // Second pattern: left shift, then mask off leading bits.
   // FIXME: Use demanded bits?
   if (LeftShift && isShiftedMask_32(C1)) {
+    uint32_t Trailing = countTrailingZeros(C1);
     uint32_t C3 = countLeadingZeros(C1);
-    if (C2 + C3 < 32 && C1 == ((-1U << (C2 + C3)) >> C3)) {
+    if (Trailing == C2 && C2 + C3 < 32) {
       SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                 DAG.getConstant(C2 + C3, DL, MVT::i32));
       return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
@@ -10770,6 +10796,19 @@ static SDValue CombineANDShift(SDNode *N,
     }
   }
 
+  // Second pattern, reversed: right shift, then mask off trailing bits.
+  // FIXME: Handle other patterns of known/demanded bits.
+  if (!LeftShift && isShiftedMask_32(C1)) {
+    uint32_t Leading = countLeadingZeros(C1);
+    uint32_t C3 = countTrailingZeros(C1);
+    if (Leading == C2 && C2 + C3 < 32) {
+      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
+                                DAG.getConstant(C2 + C3, DL, MVT::i32));
+      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
+                         DAG.getConstant(C3, DL, MVT::i32));
+    }
+  }
+
   // FIXME: Transform "(and (shl x, c2) c1)" ->
   //        "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
   //        c1.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index ccee7fa258a..7a9fc739fc1 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -593,6 +593,8 @@ class VectorType;
     bool isDesirableToCommuteWithShift(const SDNode *N,
                                        CombineLevel Level) const override;
 
+    bool shouldFoldShiftPairToMask(const SDNode *N,
+                                   CombineLevel Level) const override;
   protected:
     std::pair<const TargetRegisterClass *, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI,
diff --git a/llvm/test/CodeGen/Thumb/shift-and.ll b/llvm/test/CodeGen/Thumb/shift-and.ll
index 01f6b84d0c3..3981d2c9702 100644
--- a/llvm/test/CodeGen/Thumb/shift-and.ll
+++ b/llvm/test/CodeGen/Thumb/shift-and.ll
@@ -45,9 +45,8 @@
 define i32 @test4(i32 %x) {
 ; CHECK-LABEL: test4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: lsls r0, r0, #4
-; CHECK-NEXT: movs r1, #112
-; CHECK-NEXT: bics r0, r1
+; CHECK-NEXT: lsrs r0, r0, #3
+; CHECK-NEXT: lsls r0, r0, #7
 ; CHECK-NEXT: bx lr
 entry:
   %0 = shl i32 %x, 4
@@ -84,9 +83,8 @@
 define i32 @test7(i32 %x) {
 ; CHECK-LABEL: test7:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: lsrs r1, r0, #29
-; CHECK-NEXT: movs r0, #4
-; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: lsrs r0, r0, #31
+; CHECK-NEXT: lsls r0, r0, #2
 ; CHECK-NEXT: bx lr
 entry:
   %0 = lshr i32 %x, 29
@@ -110,9 +108,8 @@
 define i32 @test9(i32 %x) {
 ; CHECK-LABEL: test9:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: lsrs r0, r0, #2
-; CHECK-NEXT: movs r1, #1
-; CHECK-NEXT: bics r0, r1
+; CHECK-NEXT: lsrs r0, r0, #3
+; CHECK-NEXT: lsls r0, r0, #1
 ; CHECK-NEXT: bx lr
 entry:
   %and = lshr i32 %x, 2
@@ -131,3 +128,63 @@ entry:
   %shr = and i32 %0, 255
   ret i32 %shr
 }
+
+define i32 @test11(i32 %x) {
+; CHECK-LABEL: test11:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsrs r0, r0, #24
+; CHECK-NEXT: lsls r0, r0, #2
+; CHECK-NEXT: bx lr
+entry:
+  %shl = lshr i32 %x, 22
+  %and = and i32 %shl, 1020
+  ret i32 %and
+}
+
+define i32 @test12(i32 %x) {
+; CHECK-LABEL: test12:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsrs r0, r0, #3
+; CHECK-NEXT: lsls r0, r0, #4
+; CHECK-NEXT: bx lr
+entry:
+  %0 = shl i32 %x, 1
+  %shr = and i32 %0, -16
+  ret i32 %shr
+}
+
+define i32 @test13(i32 %x) {
+; CHECK-LABEL: test13:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsrs r0, r0, #3
+; CHECK-NEXT: lsls r0, r0, #4
+; CHECK-NEXT: bx lr
+entry:
+  %shr = lshr i32 %x, 3
+  %shl = shl i32 %shr, 4
+  ret i32 %shl
+}
+
+define i32 @test14(i32 %x) {
+; CHECK-LABEL: test14:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsrs r0, r0, #6
+; CHECK-NEXT: lsls r0, r0, #10
+; CHECK-NEXT: bx lr
+entry:
+  %shl = shl i32 %x, 4
+  %and = and i32 %shl, -1024
+  ret i32 %and
+}
+
+define i32 @test15(i32 %x) {
+; CHECK-LABEL: test15:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsrs r0, r0, #4
+; CHECK-NEXT: lsls r0, r0, #3
+; CHECK-NEXT: bx lr
+entry:
+  %shr = lshr i32 %x, 4
+  %shl = shl i32 %shr, 3
+  ret i32 %shl
+}
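A detail worth noting in `CombineANDShift` above is the new "clear irrelevant bits" normalization: mask bits that the shift has already zeroed are dropped before pattern matching, which is what lets test4's mask `~112` match the trailing-bits form as `-128`. A quick standalone check of that step (plain C++, not part of the patch):

```cpp
// For a left shift by C2 the low C2 bits of the result are already zero,
// so clearing them in the mask (C1 &= -1u << C2) cannot change the result;
// symmetrically for a logical right shift and the high C2 bits.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 0xFFFFFF8Fu; // ~112, the mask from test4
  for (uint32_t x = 0; x < 4096; ++x) {
    for (uint32_t C2 = 1; C2 < 32; ++C2) {
      assert(((x << C2) & C1) == ((x << C2) & (C1 & (-1u << C2))));
      assert(((x >> C2) & C1) == ((x >> C2) & (C1 & (-1u >> C2))));
    }
  }
  return 0;
}
```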