author     Sanjay Patel <spatel@rotateright.com>    2018-10-09 21:26:01 +0000
committer  Sanjay Patel <spatel@rotateright.com>    2018-10-09 21:26:01 +0000
commit     e9ca7ea3e5c0f3e155fc7b47cb9068a2f12cae6a (patch)
tree       d9811d19d68449dc71aa1e0cb83a0d45d3b73610 /llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
parent     5989281cf3af52fc07ad458297e70f559db02de7 (diff)
[InstCombine] reverse 'trunc X to <N x i1>' canonicalization
icmp ne (and X, 1), 0 --> trunc X to <N x i1>
Ideally, we'd do the same for scalars, but there will likely be
regressions unless we add more trunc folds as we're doing here
for vectors.
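For illustration, here is a minimal sketch of the two canonical forms (the function name and the <4 x i32> element type are assumptions for the example, not taken from the patch):

; old canonical form: mask the low bit, then compare against zero
define <4 x i1> @low_bit(<4 x i32> %x) {
  %a = and <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  %r = icmp ne <4 x i32> %a, zeroinitializer
  ret <4 x i1> %r
}

; new canonical form: a plain trunc of the low bit
define <4 x i1> @low_bit(<4 x i32> %x) {
  %r = trunc <4 x i32> %x to <4 x i1>
  ret <4 x i1> %r
}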
The motivating vector case is from PR37549:
https://bugs.llvm.org/show_bug.cgi?id=37549
define <4 x float> @bitwise_select(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) {
%c = fcmp ole <4 x float> %x, %y
%s = sext <4 x i1> %c to <4 x i32>
%s1 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
%s2 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
%cond = or <4 x i32> %s1, %s2
%condtr = trunc <4 x i32> %cond to <4 x i1>
%r = select <4 x i1> %condtr, <4 x float> %z, <4 x float> %w
ret <4 x float> %r
}
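Before this patch, InstCombine would have expanded that final trunc to mask+compare, so the backend saw this instead (reconstructed for illustration; not part of the commit):

  %m = and <4 x i32> %cond, <i32 1, i32 1, i32 1, i32 1>
  %condtr = icmp ne <4 x i32> %m, zeroinitializer
  %r = select <4 x i1> %condtr, <4 x float> %z, <4 x float> %w

That expansion is what shows up in the 'before' listings as an extra mask-and-compare pair (e.g. vandps/vpcmpeqd on AVX).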
Here's a sampling of the vector codegen for that case using
mask+icmp (current behavior) vs. trunc (with this patch):
AVX before:
vcmpleps %xmm1, %xmm0, %xmm0
vpermilps $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps %xmm0, %xmm1, %xmm0
vandps LCPI0_0(%rip), %xmm0, %xmm0
vxorps %xmm1, %xmm1, %xmm1
vpcmpeqd %xmm1, %xmm0, %xmm0
vblendvps %xmm0, %xmm3, %xmm2, %xmm0
AVX after:
vcmpleps %xmm1, %xmm0, %xmm0
vpermilps $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps %xmm0, %xmm1, %xmm0
vblendvps %xmm0, %xmm2, %xmm3, %xmm0
AVX512f before:
vcmpleps %xmm1, %xmm0, %xmm0
vpermilps $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps %xmm0, %xmm1, %xmm0
vpbroadcastd LCPI0_0(%rip), %xmm1 ## xmm1 = [1,1,1,1]
vptestnmd %zmm1, %zmm0, %k1
vblendmps %zmm3, %zmm2, %zmm0 {%k1}
AVX512f after:
vcmpleps %xmm1, %xmm0, %xmm0
vpermilps $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps %xmm0, %xmm1, %xmm0
vpslld $31, %xmm0, %xmm0
vptestmd %zmm0, %zmm0, %k1
vblendmps %zmm2, %zmm3, %zmm0 {%k1}
AArch64 before:
fcmge v0.4s, v1.4s, v0.4s
zip1 v1.4s, v0.4s, v0.4s
zip2 v0.4s, v0.4s, v0.4s
orr v0.16b, v1.16b, v0.16b
movi v1.4s, #1
and v0.16b, v0.16b, v1.16b
cmeq v0.4s, v0.4s, #0
bsl v0.16b, v3.16b, v2.16b
AArch64 after:
fcmge v0.4s, v1.4s, v0.4s
zip1 v1.4s, v0.4s, v0.4s
zip2 v0.4s, v0.4s, v0.4s
orr v0.16b, v1.16b, v0.16b
bsl v0.16b, v2.16b, v3.16b
PowerPC-le before:
xvcmpgesp 34, 35, 34
vspltisw 0, 1
vmrglw 3, 2, 2
vmrghw 2, 2, 2
xxlor 0, 35, 34
xxlxor 35, 35, 35
xxland 34, 0, 32
vcmpequw 2, 2, 3
xxsel 34, 36, 37, 34
PowerPC-le after:
xvcmpgesp 34, 35, 34
vmrglw 3, 2, 2
vmrghw 2, 2, 2
xxlor 0, 35, 34
xxsel 34, 37, 36, 0
Differential Revision: https://reviews.llvm.org/D52747
llvm-svn: 344082
Diffstat (limited to 'llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp')
 llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 31
 1 file changed, 27 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index fd59c3a7c0c..74f1e695ff6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -706,12 +706,35 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   if (SimplifyDemandedInstructionBits(CI))
     return &CI;
 
-  // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
   if (DestTy->getScalarSizeInBits() == 1) {
-    Constant *One = ConstantInt::get(SrcTy, 1);
-    Src = Builder.CreateAnd(Src, One);
     Value *Zero = Constant::getNullValue(Src->getType());
-    return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
+    if (DestTy->isIntegerTy()) {
+      // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
+      // TODO: We canonicalize to more instructions here because we are probably
+      // lacking equivalent analysis for trunc relative to icmp. There may also
+      // be codegen concerns. If those trunc limitations were removed, we could
+      // remove this transform.
+      Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
+      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+    }
+
+    // For vectors, we do not canonicalize all truncs to icmp, so optimize
+    // patterns that would be covered within visitICmpInst.
+    Value *X;
+    const APInt *C;
+    if (match(Src, m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) {
+      // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
+      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C);
+      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
+      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+    }
+    if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_APInt(C)),
+                                   m_Deferred(X))))) {
+      // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
+      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C) | 1;
+      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
+      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+    }
   }
 
   // FIXME: Maybe combine the next two transforms to handle the no cast case
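To make the two new vector folds concrete, here is an illustrative IR sketch (the function names and the shift amount of 3 are assumptions, chosen only to show the patterns):

define <4 x i1> @fold_lshr(<4 x i32> %x) {
  ; trunc (lshr X, 3) to <4 x i1> --> icmp ne (and X, 8), 0
  %s = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %t = trunc <4 x i32> %s to <4 x i1>
  ret <4 x i1> %t
}

define <4 x i1> @fold_or_lshr(<4 x i32> %x) {
  ; trunc (or (lshr X, 3), X) to <4 x i1> --> icmp ne (and X, 9), 0
  %s = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %o = or <4 x i32> %s, %x
  %t = trunc <4 x i32> %o to <4 x i1>
  ret <4 x i1> %t
}

Both folds require the matched source to have a single use (the m_OneUse guards), and both produce the icmp-ne-of-and form that visitICmpInst already knows how to optimize.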