author     Sanjay Patel <spatel@rotateright.com>  2019-11-07 12:08:10 -0500
committer  Sanjay Patel <spatel@rotateright.com>  2019-11-07 12:09:45 -0500
commit     d9ccb6367a1089fd61bd85be6b0fbb0d6a590e05 (patch)
tree       b5eee1f06d4537a10c2b19c987a7de5a930c698e
parent     05299c7d98ab5562ebf927847126621826358907 (diff)
download   bcm5719-llvm-d9ccb6367a1089fd61bd85be6b0fbb0d6a590e05.tar.gz
           bcm5719-llvm-d9ccb6367a1089fd61bd85be6b0fbb0d6a590e05.zip
[InstCombine] canonicalize shift+logic+shift to reduce dependency chain
shift (logic (shift X, C0), Y), C1 --> logic (shift X, C0+C1), (shift Y, C1)

This is an IR translation of an existing SDAG transform added here:
rL370617

So we again have 9 possible patterns with a commuted IR variant of each pattern:
https://rise4fun.com/Alive/VlI
https://rise4fun.com/Alive/n1m
https://rise4fun.com/Alive/1Vn

Part of the motivation is to allow easier recognition and subsequent
canonicalization of bswap patterns as discussed in PR43146:
https://bugs.llvm.org/show_bug.cgi?id=43146

We had to delay this transform because it used to allow the SLP vectorizer
to create awful reductions out of simple load-combines. That problem was
fixed with rL375025 (we'll bring back load combining in IR someday...).

The backend is also better equipped to deal with these patterns now using
hooks like TLI.getShiftAmountThreshold().

The only remaining potential controversy is that the -reassociate pass tends
to reverse this kind of pattern (to help GVN?). But since -reassociate doesn't
do anything with these specific patterns, there is no conflict currently.

Finally, there's a new pass proposal at D67383 for general tree-height-reduction
reassociation, and it could use a cost model to decide how to optimally
rearrange these kinds of ops for a target. That patch appears to be stalled.

Differential Revision: https://reviews.llvm.org/D69842
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp  46
-rw-r--r--  llvm/test/Transforms/InstCombine/bswap.ll                8
-rw-r--r--  llvm/test/Transforms/InstCombine/shift-logic.ll         54
3 files changed, 77 insertions, 31 deletions
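As a concrete illustration of the new canonical form, the IR below mirrors the
@shl_and test updated in this patch (the function and value names are only for
this sketch). The rewrite is valid because the shift amounts sum to 5, which is
less than the 8-bit width:

; Before: the inner shl has one use and feeds the logic op,
; so the three instructions form a serial dependency chain.
define i8 @shl_and_example(i8 %x, i8 %y) {
  %sh0 = shl i8 %x, 3
  %r = and i8 %sh0, %y
  %sh1 = shl i8 %r, 2
  ret i8 %sh1
}

; After instcombine: the two shifts are independent and the logic op
; combines them, shortening the dependency chain.
define i8 @shl_and_example_canonical(i8 %x, i8 %y) {
  %t1 = shl i8 %x, 5
  %t2 = shl i8 %y, 2
  %r = and i8 %t1, %t2
  ret i8 %r
}

Running the first function through opt -instcombine -S should reproduce the
shape of the second.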
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index eea890c815c..f35a6ce2d93 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -296,6 +296,49 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
return BinaryOperator::Create(Instruction::And, NewShift, NewMask);
}
+/// If we have a shift-by-constant of a bitwise logic op that itself has a
+/// shift-by-constant operand with identical opcode, we may be able to convert
+/// that into 2 independent shifts followed by the logic op. This eliminates a
+/// use of an intermediate value (reduces dependency chain).
+static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I,
+                                            InstCombiner::BuilderTy &Builder) {
+  assert(I.isShift() && "Expected a shift as input");
+  auto *LogicInst = dyn_cast<BinaryOperator>(I.getOperand(0));
+  if (!LogicInst || !LogicInst->isBitwiseLogicOp() || !LogicInst->hasOneUse())
+    return nullptr;
+
+  const APInt *C0, *C1;
+  if (!match(I.getOperand(1), m_APInt(C1)))
+    return nullptr;
+
+  Instruction::BinaryOps ShiftOpcode = I.getOpcode();
+  Type *Ty = I.getType();
+
+  // Find a matching one-use shift by constant. The fold is not valid if the sum
+  // of the shift values equals or exceeds bitwidth.
+  // TODO: Remove the one-use check if the other logic operand (Y) is constant.
+  Value *X, *Y;
+  auto matchFirstShift = [&](Value *V) {
+    return match(V, m_OneUse(m_Shift(m_Value(X), m_APInt(C0)))) &&
+           cast<BinaryOperator>(V)->getOpcode() == ShiftOpcode &&
+           (*C0 + *C1).ult(Ty->getScalarSizeInBits());
+  };
+
+  // Logic ops are commutative, so check each operand for a match.
+  if (matchFirstShift(LogicInst->getOperand(0)))
+    Y = LogicInst->getOperand(1);
+  else if (matchFirstShift(LogicInst->getOperand(1)))
+    Y = LogicInst->getOperand(0);
+  else
+    return nullptr;
+
+  // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
+  Constant *ShiftSumC = ConstantInt::get(Ty, *C0 + *C1);
+  Value *NewShift1 = Builder.CreateBinOp(ShiftOpcode, X, ShiftSumC);
+  Value *NewShift2 = Builder.CreateBinOp(ShiftOpcode, Y, I.getOperand(1));
+  return BinaryOperator::Create(LogicInst->getOpcode(), NewShift1, NewShift2);
+}
+
Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
  assert(Op0->getType() == Op1->getType());
@@ -348,6 +391,9 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
    return &I;
  }
+  if (Instruction *Logic = foldShiftOfShiftedLogic(I, Builder))
+    return Logic;
+
  return nullptr;
}
diff --git a/llvm/test/Transforms/InstCombine/bswap.ll b/llvm/test/Transforms/InstCombine/bswap.ll
index 2d8a69b86a5..540b1752c26 100644
--- a/llvm/test/Transforms/InstCombine/bswap.ll
+++ b/llvm/test/Transforms/InstCombine/bswap.ll
@@ -169,10 +169,10 @@ define i32 @bswap32_shl_first(i32 %x) {
define i32 @bswap32_shl_first_extra_use(i32 %x) {
; CHECK-LABEL: @bswap32_shl_first_extra_use(
-; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 16
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
-; CHECK-NEXT: [[SWAPHALF:%.*]] = or i32 [[SHL]], [[SHR]]
-; CHECK-NEXT: [[T:%.*]] = shl i32 [[SWAPHALF]], 8
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 16
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X]], 24
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[SHR]], 8
+; CHECK-NEXT: [[T:%.*]] = or i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[BSWAP:%.*]] = call i32 @llvm.bswap.i32(i32 [[X]])
; CHECK-NEXT: call void @extra_use(i32 [[T]])
; CHECK-NEXT: ret i32 [[BSWAP]]
diff --git a/llvm/test/Transforms/InstCombine/shift-logic.ll b/llvm/test/Transforms/InstCombine/shift-logic.ll
index f387d1f738b..453463f46fc 100644
--- a/llvm/test/Transforms/InstCombine/shift-logic.ll
+++ b/llvm/test/Transforms/InstCombine/shift-logic.ll
@@ -3,9 +3,9 @@
define i8 @shl_and(i8 %x, i8 %y) {
; CHECK-LABEL: @shl_and(
-; CHECK-NEXT: [[SH0:%.*]] = shl i8 [[X:%.*]], 3
-; CHECK-NEXT: [[R:%.*]] = and i8 [[SH0]], [[Y:%.*]]
-; CHECK-NEXT: [[SH1:%.*]] = shl i8 [[R]], 2
+; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 5
+; CHECK-NEXT: [[TMP2:%.*]] = shl i8 [[Y:%.*]], 2
+; CHECK-NEXT: [[SH1:%.*]] = and i8 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i8 [[SH1]]
;
%sh0 = shl i8 %x, 3
@@ -17,9 +17,9 @@ define i8 @shl_and(i8 %x, i8 %y) {
define i16 @shl_or(i16 %x, i16 %py) {
; CHECK-LABEL: @shl_or(
; CHECK-NEXT: [[Y:%.*]] = srem i16 [[PY:%.*]], 42
-; CHECK-NEXT: [[SH0:%.*]] = shl i16 [[X:%.*]], 5
-; CHECK-NEXT: [[R:%.*]] = or i16 [[Y]], [[SH0]]
-; CHECK-NEXT: [[SH1:%.*]] = shl i16 [[R]], 7
+; CHECK-NEXT: [[TMP1:%.*]] = shl i16 [[X:%.*]], 12
+; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i16 [[Y]], 7
+; CHECK-NEXT: [[SH1:%.*]] = or i16 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i16 [[SH1]]
;
%y = srem i16 %py, 42 ; thwart complexity-based canonicalization
@@ -31,9 +31,9 @@ define i16 @shl_or(i16 %x, i16 %py) {
define i32 @shl_xor(i32 %x, i32 %y) {
; CHECK-LABEL: @shl_xor(
-; CHECK-NEXT: [[SH0:%.*]] = shl i32 [[X:%.*]], 5
-; CHECK-NEXT: [[R:%.*]] = xor i32 [[SH0]], [[Y:%.*]]
-; CHECK-NEXT: [[SH1:%.*]] = shl i32 [[R]], 7
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 12
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[Y:%.*]], 7
+; CHECK-NEXT: [[SH1:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i32 [[SH1]]
;
%sh0 = shl i32 %x, 5
@@ -45,9 +45,9 @@ define i32 @shl_xor(i32 %x, i32 %y) {
define i64 @lshr_and(i64 %x, i64 %py) {
; CHECK-LABEL: @lshr_and(
; CHECK-NEXT: [[Y:%.*]] = srem i64 [[PY:%.*]], 42
-; CHECK-NEXT: [[SH0:%.*]] = lshr i64 [[X:%.*]], 5
-; CHECK-NEXT: [[R:%.*]] = and i64 [[Y]], [[SH0]]
-; CHECK-NEXT: [[SH1:%.*]] = lshr i64 [[R]], 7
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[X:%.*]], 12
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[Y]], 7
+; CHECK-NEXT: [[SH1:%.*]] = and i64 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i64 [[SH1]]
;
%y = srem i64 %py, 42 ; thwart complexity-based canonicalization
@@ -59,9 +59,9 @@ define i64 @lshr_and(i64 %x, i64 %py) {
define <4 x i32> @lshr_or(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @lshr_or(
-; CHECK-NEXT: [[SH0:%.*]] = lshr <4 x i32> [[X:%.*]], <i32 5, i32 5, i32 5, i32 5>
-; CHECK-NEXT: [[R:%.*]] = or <4 x i32> [[SH0]], [[Y:%.*]]
-; CHECK-NEXT: [[SH1:%.*]] = lshr <4 x i32> [[R]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> [[X:%.*]], <i32 12, i32 12, i32 12, i32 12>
+; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[Y:%.*]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT: [[SH1:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <4 x i32> [[SH1]]
;
%sh0 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
@@ -73,9 +73,9 @@ define <4 x i32> @lshr_or(<4 x i32> %x, <4 x i32> %y) {
define <8 x i16> @lshr_xor(<8 x i16> %x, <8 x i16> %py) {
; CHECK-LABEL: @lshr_xor(
; CHECK-NEXT: [[Y:%.*]] = srem <8 x i16> [[PY:%.*]], <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
-; CHECK-NEXT: [[SH0:%.*]] = lshr <8 x i16> [[X:%.*]], <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
-; CHECK-NEXT: [[R:%.*]] = xor <8 x i16> [[Y]], [[SH0]]
-; CHECK-NEXT: [[SH1:%.*]] = lshr <8 x i16> [[R]], <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <8 x i16> [[X:%.*]], <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
+; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i16> [[Y]], <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT: [[SH1:%.*]] = xor <8 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <8 x i16> [[SH1]]
;
%y = srem <8 x i16> %py, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 -42> ; thwart complexity-based canonicalization
@@ -89,9 +89,9 @@ define <8 x i16> @lshr_xor(<8 x i16> %x, <8 x i16> %py) {
define <16 x i8> @ashr_and(<16 x i8> %x, <16 x i8> %py, <16 x i8> %pz) {
; CHECK-LABEL: @ashr_and(
; CHECK-NEXT: [[Y:%.*]] = srem <16 x i8> [[PY:%.*]], [[PZ:%.*]]
-; CHECK-NEXT: [[SH0:%.*]] = ashr <16 x i8> [[X:%.*]], <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; CHECK-NEXT: [[R:%.*]] = and <16 x i8> [[Y]], [[SH0]]
-; CHECK-NEXT: [[SH1:%.*]] = ashr <16 x i8> [[R]], <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+; CHECK-NEXT: [[TMP1:%.*]] = ashr <16 x i8> [[X:%.*]], <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
+; CHECK-NEXT: [[TMP2:%.*]] = ashr <16 x i8> [[Y]], <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+; CHECK-NEXT: [[SH1:%.*]] = and <16 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <16 x i8> [[SH1]]
;
%y = srem <16 x i8> %py, %pz ; thwart complexity-based canonicalization
@@ -103,9 +103,9 @@ define <16 x i8> @ashr_and(<16 x i8> %x, <16 x i8> %py, <16 x i8> %pz) {
define <2 x i64> @ashr_or(<2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: @ashr_or(
-; CHECK-NEXT: [[SH0:%.*]] = ashr <2 x i64> [[X:%.*]], <i64 5, i64 5>
-; CHECK-NEXT: [[R:%.*]] = or <2 x i64> [[SH0]], [[Y:%.*]]
-; CHECK-NEXT: [[SH1:%.*]] = ashr <2 x i64> [[R]], <i64 7, i64 7>
+; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i64> [[X:%.*]], <i64 12, i64 12>
+; CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i64> [[Y:%.*]], <i64 7, i64 7>
+; CHECK-NEXT: [[SH1:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret <2 x i64> [[SH1]]
;
%sh0 = ashr <2 x i64> %x, <i64 5, i64 5>
@@ -117,9 +117,9 @@ define <2 x i64> @ashr_or(<2 x i64> %x, <2 x i64> %y) {
define i32 @ashr_xor(i32 %x, i32 %py) {
; CHECK-LABEL: @ashr_xor(
; CHECK-NEXT: [[Y:%.*]] = srem i32 [[PY:%.*]], 42
-; CHECK-NEXT: [[SH0:%.*]] = ashr i32 [[X:%.*]], 5
-; CHECK-NEXT: [[R:%.*]] = xor i32 [[Y]], [[SH0]]
-; CHECK-NEXT: [[SH1:%.*]] = ashr i32 [[R]], 7
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 12
+; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y]], 7
+; CHECK-NEXT: [[SH1:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i32 [[SH1]]
;
%y = srem i32 %py, 42 ; thwart complexity-based canonicalization