diff options
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 35 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/avx-blend.ll | 23 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/avx2-blend.ll | 11 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/sse41-blend.ll | 32 | ||||
-rw-r--r-- | llvm/test/Transforms/InstCombine/blend_x86.ll | 56 |
5 files changed, 157 insertions, 0 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index be1b5aa50b1..a0819fdfc82 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -718,6 +718,41 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::x86_sse41_pblendvb: + case Intrinsic::x86_sse41_blendvps: + case Intrinsic::x86_sse41_blendvpd: + case Intrinsic::x86_avx_blendv_ps_256: + case Intrinsic::x86_avx_blendv_pd_256: + case Intrinsic::x86_avx2_pblendvb: { + // Convert blendv* to vector selects if the mask is constant. blendv yields + // operand 1 where a mask element's top bit is set and operand 0 otherwise. + // This is convoluted because the ps and pd versions take a float/double mask. + // FIXME: That should be changed. + Value *Mask = II->getArgOperand(2); + if (auto C = dyn_cast<ConstantDataVector>(Mask)) { + auto Tyi1 = Builder->getInt1Ty(); + auto SelectorType = cast<VectorType>(Mask->getType()); + auto EltTy = SelectorType->getElementType(); + unsigned Size = SelectorType->getNumElements(); + unsigned BitWidth = EltTy->isFloatTy() ? 32 : (EltTy->isDoubleTy() ? 
64 : EltTy->getIntegerBitWidth()); + assert((BitWidth == 64 || BitWidth == 32 || BitWidth == 8) && "Wrong arguments for variable blend intrinsic"); + SmallVector<Constant*, 32> Selectors; + for (unsigned I = 0; I < Size; ++I) { + // The intrinsics only read the top bit of each mask element + uint64_t Selector; + if (BitWidth == 8) + Selector = C->getElementAsInteger(I); + else + Selector = C->getElementAsAPFloat(I).bitcastToAPInt().getZExtValue(); + Selectors.push_back(ConstantInt::get(Tyi1, Selector >> (BitWidth - 1))); + } + auto NewSelector = ConstantVector::get(Selectors); + return SelectInst::Create(NewSelector, II->getArgOperand(1), II->getArgOperand(0), "blendv"); + } else { + break; + } + } + case Intrinsic::x86_avx_vpermilvar_ps: case Intrinsic::x86_avx_vpermilvar_ps_256: case Intrinsic::x86_avx_vpermilvar_pd: diff --git a/llvm/test/CodeGen/X86/avx-blend.ll b/llvm/test/CodeGen/X86/avx-blend.ll index 4d4f6c1a03a..e21c7a07e8b 100644 --- a/llvm/test/CodeGen/X86/avx-blend.ll +++ b/llvm/test/CodeGen/X86/avx-blend.ll @@ -135,3 +135,26 @@ define <2 x double> @testb(<2 x double> %x, <2 x double> %y) { %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y ret <2 x double> %min } + +; If we can figure out a blend has a constant mask, we should emit the +; blend instruction with an immediate mask +define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) { +; CHECK-LABEL: constant_blendvpd_avx: +; CHECK-NOT: mov +; CHECK: vblendpd +; CHECK: ret + %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab + ret <4 x double> %1 +} + +define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) { +; CHECK-LABEL: constant_blendvps_avx: +; CHECK-NOT: mov +; CHECK: vblendps +; CHECK: ret + %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd + ret <8 x float> %1 +} + +declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x 
float>, <8 x float>, <8 x float>) +declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) diff --git a/llvm/test/CodeGen/X86/avx2-blend.ll b/llvm/test/CodeGen/X86/avx2-blend.ll new file mode 100644 index 00000000000..b02442b6fad --- /dev/null +++ b/llvm/test/CodeGen/X86/avx2-blend.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s + +define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) { +; CHECK-LABEL: constant_pblendvb_avx2: +; CHECK: vmovdqa +; CHECK: vpblendvb + %1 = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd + ret <32 x i8> %1 +} + +declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) diff --git a/llvm/test/CodeGen/X86/sse41-blend.ll b/llvm/test/CodeGen/X86/sse41-blend.ll index 951bb7dc854..8ad79877c8e 100644 --- a/llvm/test/CodeGen/X86/sse41-blend.ll +++ b/llvm/test/CodeGen/X86/sse41-blend.ll @@ -88,3 +88,35 @@ entry: store double %extract214vector_func.i, double addrspace(1)* undef, align 8 ret void } + +; If we can figure out a blend has a constant mask, we should emit the +; blend instruction with an immediate mask +define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) { +; In this case, we emit a simple movsd +; CHECK-LABEL: constant_blendvpd +; CHECK: movsd +; CHECK: ret + %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %xy, <2 x double> %ab + ret <2 x double> %1 +} + +define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) { +; CHECK-LABEL: constant_blendvps +; CHECK-NOT: mov +; CHECK: blendps $7 +; CHECK: ret + %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 
x float> %xyzw, <4 x float> %abcd + ret <4 x float> %1 +} + +define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) { +; CHECK-LABEL: constant_pblendvb: +; CHECK: movaps +; CHECK: pblendvb +; CHECK: ret + %1 = select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x i8> %xyzw, <16 x i8> %abcd + ret <16 x i8> %1 +} +declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) +declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) +declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) diff --git a/llvm/test/Transforms/InstCombine/blend_x86.ll b/llvm/test/Transforms/InstCombine/blend_x86.ll new file mode 100644 index 00000000000..6dbacf963ce --- /dev/null +++ b/llvm/test/Transforms/InstCombine/blend_x86.ll @@ -0,0 +1,56 @@ +; RUN: opt < %s -instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -S | FileCheck %s + +define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) { +; CHECK-LABEL: @constant_blendvpd +; CHECK: select <2 x i1> <i1 true, i1 false> + %1 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %xy, <2 x double> %ab, <2 x double> <double 0xFFFFFFFFE0000000, double 0.000000e+00>) + ret <2 x double> %1 +} + +define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) { +; CHECK-LABEL: @constant_blendvps +; CHECK: select <4 x i1> <i1 false, i1 false, i1 false, i1 true> + %1 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %xyzw, <4 x float> %abcd, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000>) + ret <4 x float> %1 +} + +define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) { +; CHECK-LABEL: @constant_pblendvb +; CHECK: select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, 
i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false> + %1 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd, <16 x i8> <i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0>) + ret <16 x i8> %1 +} + +define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) { +; CHECK-LABEL: @constant_blendvpd_avx +; CHECK: select <4 x i1> <i1 true, i1 false, i1 true, i1 false> + %1 = tail call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %xy, <4 x double> %ab, <4 x double> <double 0xFFFFFFFFE0000000, double 0.000000e+00, double 0xFFFFFFFFE0000000, double 0.000000e+00>) + ret <4 x double> %1 +} + +define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) { +; CHECK-LABEL: @constant_blendvps_avx +; CHECK: select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true> + %1 = tail call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %xyzw, <8 x float> %abcd, <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000>) + ret <8 x float> %1 +} + +define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) { +; CHECK-LABEL: @constant_pblendvb_avx2 +; CHECK: select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false> + %1 = tail call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %xyzw, <32 x i8> %abcd, + <32 x i8> <i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, + i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, + i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, + i8 0, i8 0, i8 255, i8 0, 
i8 255, i8 255, i8 255, i8 0>) + ret <32 x i8> %1 +} + +declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) +declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) +declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) + +declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) +declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) +declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) + |