From 216d8cf7201857ba08e50c3a9e8ac452c3f565b9 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 17 Jun 2016 16:46:50 +0000
Subject: [InstCombine] allow more than one use for vector bitcast folding with
 selects

The motivating example for this transform is similar to D20774 where bitcasts interfere
with a single cmp/select sequence, but in this case we have 2 uses of each bitcast to
produce min and max ops:

define void @minmax_bc_store(<4 x float> %a, <4 x float> %b, <4 x float>* %ptr1, <4 x float>* %ptr2) {
  %cmp = fcmp olt <4 x float> %a, %b
  %bc1 = bitcast <4 x float> %a to <4 x i32>
  %bc2 = bitcast <4 x float> %b to <4 x i32>
  %sel1 = select <4 x i1> %cmp, <4 x i32> %bc1, <4 x i32> %bc2
  %sel2 = select <4 x i1> %cmp, <4 x i32> %bc2, <4 x i32> %bc1
  %bc3 = bitcast <4 x float>* %ptr1 to <4 x i32>*
  store <4 x i32> %sel1, <4 x i32>* %bc3
  %bc4 = bitcast <4 x float>* %ptr2 to <4 x i32>*
  store <4 x i32> %sel2, <4 x i32>* %bc4
  ret void
}

With this patch, we move the selects up to use the input args which allows getting rid of
all of the bitcasts:

define void @minmax_bc_store(<4 x float> %a, <4 x float> %b, <4 x float>* %ptr1, <4 x float>* %ptr2) {
  %cmp = fcmp olt <4 x float> %a, %b
  %sel1.v = select <4 x i1> %cmp, <4 x float> %a, <4 x float> %b
  %sel2.v = select <4 x i1> %cmp, <4 x float> %b, <4 x float> %a
  store <4 x float> %sel1.v, <4 x float>* %ptr1, align 16
  store <4 x float> %sel2.v, <4 x float>* %ptr2, align 16
  ret void
}

The asm for x86 SSE then improves from:

movaps  %xmm0, %xmm2
cmpltps %xmm1, %xmm2
movaps  %xmm2, %xmm3
andnps  %xmm1, %xmm3
movaps  %xmm2, %xmm4
andnps  %xmm0, %xmm4
andps %xmm2, %xmm0
orps  %xmm3, %xmm0
andps %xmm1, %xmm2
orps  %xmm4, %xmm2
movaps  %xmm0, (%rdi)
movaps  %xmm2, (%rsi)

To:

movaps  %xmm0, %xmm2
minps %xmm1, %xmm2
maxps %xmm0, %xmm1
movaps  %xmm2, (%rdi)
movaps  %xmm1, (%rsi)

The TODO comments show that we're limiting this transform only to vectors and only to bitcasts
because we need to improve other transforms or risk creating worse codegen.

Differential Revision: http://reviews.llvm.org/D21190

llvm-svn: 273011
---
 .../Transforms/InstCombine/InstCombineSelect.cpp   | 48 ++++++++++++++++------
 1 file changed, 35 insertions(+), 13 deletions(-)

(limited to 'llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp')

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 6b56bc6ff04..7c3ef99d227 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -116,8 +116,7 @@ static Constant *GetSelectFoldableConstant(Instruction *I) {
   }
 }
 
-/// Here we have (select c, TI, FI), and we know that TI and FI
-/// have the same opcode and only one use each.  Try to simplify this.
+/// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
 Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
                                           Instruction *FI) {
   // If this is a cast from the same type, merge.
@@ -129,10 +128,30 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
     // The select condition may be a vector. We may only change the operand
     // type if the vector width remains the same (and matches the condition).
     Type *CondTy = SI.getCondition()->getType();
-    if (CondTy->isVectorTy() &&
-        (!FIOpndTy->isVectorTy() ||
-         CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements()))
+    if (CondTy->isVectorTy()) {
+      if (!FIOpndTy->isVectorTy())
+        return nullptr;
+      if (CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements())
+        return nullptr;
+
+      // TODO: If the backend knew how to deal with casts better, we could
+      // remove this limitation. For now, there's too much potential to create
+      // worse codegen by promoting the select ahead of size-altering casts
+      // (PR28160).
+      //
+      // Note that ValueTracking's matchSelectPattern() looks through casts
+      // without checking 'hasOneUse' when it matches min/max patterns, so this
+      // transform may end up happening anyway.
+      if (TI->getOpcode() != Instruction::BitCast &&
+          (!TI->hasOneUse() || !FI->hasOneUse()))
+        return nullptr;
+
+    } else if (!TI->hasOneUse() || !FI->hasOneUse()) {
+      // TODO: The one-use restrictions for a scalar select could be eased if
+      // the fold of a select in visitLoadInst() was enhanced to match a pattern
+      // that includes a cast.
       return nullptr;
+    }
 
     // Fold this by inserting a select from the input values.
     Value *NewSI = Builder->CreateSelect(SI.getCondition(), TI->getOperand(0),
@@ -141,8 +160,13 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
                             TI->getType());
   }
 
-  // Only handle binary operators here.
-  if (!isa<BinaryOperator>(TI))
+  // TODO: This function ends awkwardly in unreachable - fix to be more normal.
+
+  // Only handle binary operators with one-use here. As with the cast case
+  // above, it may be possible to relax the one-use constraint, but that needs
+  // be examined carefully since it may not reduce the total number of
+  // instructions.
+  if (!isa<BinaryOperator>(TI) || !TI->hasOneUse() || !FI->hasOneUse())
     return nullptr;
 
   // Figure out if the operations have any operands in common.
@@ -1056,14 +1080,12 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
   if (Instruction *Add = foldAddSubSelect(SI, *Builder))
     return Add;
 
+  // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
   auto *TI = dyn_cast<Instruction>(TrueVal);
   auto *FI = dyn_cast<Instruction>(FalseVal);
-  if (TI && FI && TI->hasOneUse() && FI->hasOneUse()) {
-    // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
-    if (TI->getOpcode() == FI->getOpcode())
-      if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))
-        return IV;
-  }
+  if (TI && FI && TI->getOpcode() == FI->getOpcode())
+    if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))
+      return IV;
 
   // See if we can fold the select into one of our operands.
   if (SI.getType()->isIntOrIntVectorTy() || SI.getType()->isFPOrFPVectorTy()) {
-- 
cgit v1.2.3