author     Sanjay Patel <spatel@rotateright.com>    2016-02-26 21:04:14 +0000
committer  Sanjay Patel <spatel@rotateright.com>    2016-02-26 21:04:14 +0000
commit     1ace99351fdf034af94bf6a67aee8b3bb5d96083 (patch)
tree       ad21397b20f5e2752d1450e9fcb994f56358298a /llvm/lib
parent     2eff7f788a9e4d71044fcd73c6b391ca31689361 (diff)
[x86, InstCombine] transform x86 AVX masked stores to LLVM intrinsics
The intended effect of this patch in conjunction with:
http://reviews.llvm.org/rL259392
http://reviews.llvm.org/rL260145
is that customers using the AVX intrinsics in C will benefit from combines when
the store mask is constant:
// Mask is all zeros: no lanes are stored.
void mstore_zero_mask(float *f, __m128 v) {
  _mm_maskstore_ps(f, _mm_set1_epi32(0), v);
}

// Mask elements are 1, but the sign bit of each element is clear, so no lanes
// are stored here either.
void mstore_fake_ones_mask(float *f, __m128 v) {
  _mm_maskstore_ps(f, _mm_set1_epi32(1), v);
}

// The sign bit is set in every element: every lane is stored.
void mstore_ones_mask(float *f, __m128 v) {
  _mm_maskstore_ps(f, _mm_set1_epi32(0x80000000), v);
}

// The sign bit is set in only the highest element: a single lane is stored.
void mstore_one_set_elt_mask(float *f, __m128 v) {
  _mm_maskstore_ps(f, _mm_set_epi32(0x80000000, 0, 0, 0), v);
}
...so none of the above will actually generate a masked store for optimized code.
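For reference, the constant-mask cases above disappear because, once this patch rewrites the x86 intrinsic to llvm.masked.store, the pre-existing target-independent combine can finish the job for the all-zeros and all-ones boolean masks. A rough sketch of that generic fold (not part of this diff; it assumes the usual InstCombineCalls.cpp context, and the exact code may differ):

// Sketch of the pre-existing generic masked-store combine (approximate).
// Arg operands of llvm.masked.store: 0 = value, 1 = pointer, 2 = alignment, 3 = mask.
static Instruction *simplifyMaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
  if (!ConstMask)
    return nullptr;

  // A mask of all zeros stores nothing: erase the call.
  if (ConstMask->isNullValue())
    return IC.eraseInstFromFunction(II);

  // A mask of all ones is a plain unmasked vector store of the first operand.
  if (ConstMask->isAllOnesValue()) {
    Value *StorePtr = II.getArgOperand(1);
    unsigned Alignment = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
    return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
  }

  return nullptr;
}

The one-set-element example is not handled by this particular fold; it relies on later handling of the constant-mask llvm.masked.store rather than on the all-zeros/all-ones cases shown here.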
Differential Revision: http://reviews.llvm.org/D17485
llvm-svn: 262064
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 49
1 file changed, 49 insertions, 0 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index a7c660468c1..7cf990363f7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -828,6 +828,46 @@ static Instruction *simplifyMaskedScatter(IntrinsicInst &II, InstCombiner &IC) {
   return nullptr;
 }
 
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
+static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
+  Value *Ptr = II.getOperand(0);
+  Value *Mask = II.getOperand(1);
+  Value *Vec = II.getOperand(2);
+
+  // Special case a zero mask since that's not a ConstantDataVector:
+  // this masked store instruction does nothing.
+  if (isa<ConstantAggregateZero>(Mask)) {
+    IC.eraseInstFromFunction(II);
+    return true;
+  }
+
+  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
+  if (!ConstMask)
+    return false;
+
+  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
+  // to allow target-independent optimizations.
+
+  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
+  // the LLVM intrinsic definition for the pointer argument.
+  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+  PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
+
+  Value *PtrCast = IC.Builder->CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+  // Second, convert the x86 XMM integer vector mask to a vector of bools based
+  // on each element's most significant bit (the sign bit).
+  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
+
+  IC.Builder->CreateMaskedStore(Vec, PtrCast, 1, BoolMask);
+
+  // 'Replace uses' doesn't work for stores. Erase the original masked store.
+  IC.eraseInstFromFunction(II);
+  return true;
+}
+
 /// CallInst simplification. This mostly only handles folding of intrinsic
 /// instructions. For normal calls, it allows visitCallSite to do the heavy
 /// lifting.
@@ -1590,6 +1630,15 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       return replaceInstUsesWith(*II, V);
     break;
 
+  case Intrinsic::x86_avx_maskstore_ps:
+  case Intrinsic::x86_avx_maskstore_pd:
+  case Intrinsic::x86_avx_maskstore_ps_256:
+  case Intrinsic::x86_avx_maskstore_pd_256:
+    // TODO: The AVX2 integer variants can go here too.
+    if (simplifyX86MaskedStore(*II, *this))
+      return nullptr;
+    break;
+
   case Intrinsic::x86_xop_vpcomb:
   case Intrinsic::x86_xop_vpcomd:
   case Intrinsic::x86_xop_vpcomq:
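The helper getNegativeIsTrueBoolVec used in the new function is defined earlier in InstCombineCalls.cpp and is outside this hunk. For readers following along, a sketch of its shape (approximate, not part of this diff; see the actual file for the real definition): it builds a vector-of-i1 constant where each element is true exactly when the corresponding mask element is negative, i.e. has its sign bit set:

// Approximate shape of the pre-existing helper (not part of this diff).
static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
  SmallVector<Constant *, 32> BoolVec;
  IntegerType *BoolTy = Type::getInt1Ty(V->getContext());
  for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) {
    Constant *Elt = V->getElementAsConstant(I);
    assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) &&
           "Unexpected constant data vector element type");
    // The sign bit of an integer element (or the sign of an FP element)
    // selects the lane, matching the AVX maskmov semantics.
    bool Sign = V->getElementType()->isIntegerTy()
                    ? cast<ConstantInt>(Elt)->isNegative()
                    : cast<ConstantFP>(Elt)->isNegative();
    BoolVec.push_back(ConstantInt::get(BoolTy, Sign));
  }
  return ConstantVector::get(BoolVec);
}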