From 216d8cf7201857ba08e50c3a9e8ac452c3f565b9 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 17 Jun 2016 16:46:50 +0000 Subject: [InstCombine] allow more than one use for vector bitcast folding with selects The motivating example for this transform is similar to D20774 where bitcasts interfere with a single cmp/select sequence, but in this case we have 2 uses of each bitcast to produce min and max ops: define void @minmax_bc_store(<4 x float> %a, <4 x float> %b, <4 x float>* %ptr1, <4 x float>* %ptr2) { %cmp = fcmp olt <4 x float> %a, %b %bc1 = bitcast <4 x float> %a to <4 x i32> %bc2 = bitcast <4 x float> %b to <4 x i32> %sel1 = select <4 x i1> %cmp, <4 x i32> %bc1, <4 x i32> %bc2 %sel2 = select <4 x i1> %cmp, <4 x i32> %bc2, <4 x i32> %bc1 %bc3 = bitcast <4 x float>* %ptr1 to <4 x i32>* store <4 x i32> %sel1, <4 x i32>* %bc3 %bc4 = bitcast <4 x float>* %ptr2 to <4 x i32>* store <4 x i32> %sel2, <4 x i32>* %bc4 ret void } With this patch, we move the selects up to use the input args which allows getting rid of all of the bitcasts: define void @minmax_bc_store(<4 x float> %a, <4 x float> %b, <4 x float>* %ptr1, <4 x float>* %ptr2) { %cmp = fcmp olt <4 x float> %a, %b %sel1.v = select <4 x i1> %cmp, <4 x float> %a, <4 x float> %b %sel2.v = select <4 x i1> %cmp, <4 x float> %b, <4 x float> %a store <4 x float> %sel1.v, <4 x float>* %ptr1, align 16 store <4 x float> %sel2.v, <4 x float>* %ptr2, align 16 ret void } The asm for x86 SSE then improves from: movaps %xmm0, %xmm2 cmpltps %xmm1, %xmm2 movaps %xmm2, %xmm3 andnps %xmm1, %xmm3 movaps %xmm2, %xmm4 andnps %xmm0, %xmm4 andps %xmm2, %xmm0 orps %xmm3, %xmm0 andps %xmm1, %xmm2 orps %xmm4, %xmm2 movaps %xmm0, (%rdi) movaps %xmm2, (%rsi) To: movaps %xmm0, %xmm2 minps %xmm1, %xmm2 maxps %xmm0, %xmm1 movaps %xmm2, (%rdi) movaps %xmm1, (%rsi) The TODO comments show that we're limiting this transform only to vectors and only to bitcasts because we need to improve other transforms or risk creating worse codegen. Differential Revision: http://reviews.llvm.org/D21190 llvm-svn: 273011 --- .../Transforms/InstCombine/InstCombineSelect.cpp | 48 ++++++++++++++++------ 1 file changed, 35 insertions(+), 13 deletions(-) (limited to 'llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp') diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 6b56bc6ff04..7c3ef99d227 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -116,8 +116,7 @@ static Constant *GetSelectFoldableConstant(Instruction *I) { } } -/// Here we have (select c, TI, FI), and we know that TI and FI -/// have the same opcode and only one use each. Try to simplify this. +/// We have (select c, TI, FI), and we know that TI and FI have the same opcode. Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI) { // If this is a cast from the same type, merge. @@ -129,10 +128,30 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, // The select condition may be a vector. We may only change the operand // type if the vector width remains the same (and matches the condition). Type *CondTy = SI.getCondition()->getType(); - if (CondTy->isVectorTy() && - (!FIOpndTy->isVectorTy() || - CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements())) + if (CondTy->isVectorTy()) { + if (!FIOpndTy->isVectorTy()) + return nullptr; + if (CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements()) + return nullptr; + + // TODO: If the backend knew how to deal with casts better, we could + // remove this limitation. For now, there's too much potential to create + // worse codegen by promoting the select ahead of size-altering casts + // (PR28160). + // + // Note that ValueTracking's matchSelectPattern() looks through casts + // without checking 'hasOneUse' when it matches min/max patterns, so this + // transform may end up happening anyway. + if (TI->getOpcode() != Instruction::BitCast && + (!TI->hasOneUse() || !FI->hasOneUse())) + return nullptr; + + } else if (!TI->hasOneUse() || !FI->hasOneUse()) { + // TODO: The one-use restrictions for a scalar select could be eased if + // the fold of a select in visitLoadInst() was enhanced to match a pattern + // that includes a cast. return nullptr; + } // Fold this by inserting a select from the input values. Value *NewSI = Builder->CreateSelect(SI.getCondition(), TI->getOperand(0), @@ -141,8 +160,13 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, TI->getType()); } - // Only handle binary operators here. - if (!isa(TI)) + // TODO: This function ends awkwardly in unreachable - fix to be more normal. + + // Only handle binary operators with one-use here. As with the cast case + // above, it may be possible to relax the one-use constraint, but that needs + // be examined carefully since it may not reduce the total number of + // instructions. + if (!isa(TI) || !TI->hasOneUse() || !FI->hasOneUse()) return nullptr; // Figure out if the operations have any operands in common. @@ -1056,14 +1080,12 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *Add = foldAddSubSelect(SI, *Builder)) return Add; + // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z)) auto *TI = dyn_cast(TrueVal); auto *FI = dyn_cast(FalseVal); - if (TI && FI && TI->hasOneUse() && FI->hasOneUse()) { - // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z)) - if (TI->getOpcode() == FI->getOpcode()) - if (Instruction *IV = FoldSelectOpOp(SI, TI, FI)) - return IV; - } + if (TI && FI && TI->getOpcode() == FI->getOpcode()) + if (Instruction *IV = FoldSelectOpOp(SI, TI, FI)) + return IV; // See if we can fold the select into one of our operands. if (SI.getType()->isIntOrIntVectorTy() || SI.getType()->isFPOrFPVectorTy()) { -- cgit v1.2.3