author     Simon Pilgrim <llvm-dev@redking.me.uk>  2015-07-12 11:15:19 +0000
committer  Simon Pilgrim <llvm-dev@redking.me.uk>  2015-07-12 11:15:19 +0000
commit     64cc4ad0a273fec56debf406c8524d1122d249b9 (patch)
tree       35e69efafa6d2c3ac1d74bdfa9898949f5fcba34 /llvm/lib/Target/X86/X86TargetTransformInfo.cpp
parent     d08eca0181f0d1d21fd7f35fde62eccb509cf5c5 (diff)
[X86][SSE] Vectorized v4i32 non-uniform shifts.
While the v4i32 shl operation is already vectorized using a cvttps2dq/pmulld pattern, the lshr/ashr operations are still scalarized.
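For reference, that existing shl pattern can be sketched with intrinsics: shift the amount into the float exponent field, add the bit pattern of 1.0f to form 2^amt, convert with cvttps2dq, and multiply with pmulld. A minimal sketch only, assuming SSE4.1 for pmulld; the helper name is made up here and this is not the backend's actual lowering code:

#include <emmintrin.h>   // SSE2
#include <smmintrin.h>   // SSE4.1: _mm_mullo_epi32 (pmulld)

// Illustration only: non-uniform v4i32 shl as a multiply by 2^amt.
// amt shifted into the exponent field plus the bit pattern of 1.0f is the
// float 2^amt; cvttps2dq makes it an integer power of two.
static __m128i shl_v4i32_nonuniform(__m128i v, __m128i amt) {
  const __m128i one_bits = _mm_set1_epi32(0x3f800000);            // bit pattern of 1.0f
  __m128i exp   = _mm_slli_epi32(amt, 23);                        // amt -> exponent field
  __m128  pow2f = _mm_castsi128_ps(_mm_add_epi32(exp, one_bits)); // 2^amt as float
  __m128i pow2  = _mm_cvttps_epi32(pow2f);                        // cvttps2dq: 2^amt as i32
  return _mm_mullo_epi32(v, pow2);                                // v << amt == v * 2^amt
}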
This patch adds vectorization support for non-uniform v4i32 shift operations: it splats constant shift amounts so they can use the immediate SSE shift instructions, and extracts/zero-extends non-constant shift amounts. The individual results are then blended together.
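The non-constant case that the new "shift each lane + blend" cost entries model can be sketched with SSE4.1 intrinsics as below. This is only an illustration of the idea (the helper name is invented), not the backend's actual lowering; a constant amount would instead be splatted and handled by a single immediate psrld/psrad.

#include <emmintrin.h>   // SSE2: _mm_srl_epi32 (psrld), _mm_srli_si128
#include <smmintrin.h>   // SSE4.1: _mm_cvtepu32_epi64 (pmovzxdq), _mm_blend_epi16 (pblendw)

// Illustration only: non-uniform v4i32 lshr. Each lane's shift amount is
// zero-extended into the low 64 bits (psrld takes its count from there),
// the whole vector is shifted once per amount, and the per-lane results
// are blended back together.
static __m128i srl_v4i32_nonuniform(__m128i v, __m128i amt) {
  __m128i lo = _mm_cvtepu32_epi64(amt);                     // [zext(a0), zext(a1)]
  __m128i hi = _mm_cvtepu32_epi64(_mm_srli_si128(amt, 8));  // [zext(a2), zext(a3)]
  __m128i r0 = _mm_srl_epi32(v, lo);                        // every lane >> a0
  __m128i r1 = _mm_srl_epi32(v, _mm_srli_si128(lo, 8));     // every lane >> a1
  __m128i r2 = _mm_srl_epi32(v, hi);                        // every lane >> a2
  __m128i r3 = _mm_srl_epi32(v, _mm_srli_si128(hi, 8));     // every lane >> a3
  // pblendw selects 16-bit elements, so each 32-bit lane is two mask bits.
  __m128i r01 = _mm_blend_epi16(r0, r1, 0x0C);              // lane 1 from r1
  __m128i r23 = _mm_blend_epi16(r2, r3, 0xC0);              // lane 3 from r3
  return _mm_blend_epi16(r01, r23, 0xF0);                   // lanes 2,3 from r23
}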
Differential Revision: http://reviews.llvm.org/D11063
llvm-svn: 241989
Diffstat (limited to 'llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r--   llvm/lib/Target/X86/X86TargetTransformInfo.cpp   46
1 file changed, 23 insertions, 23 deletions
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index ee8f8c656a8..a7164ec8ba5 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -261,18 +261,18 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
  { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
  { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized.
  { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized.
-
- { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v4i32, 4*10 }, // Scalarized.
- { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized.
-
- { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
- { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized.
- { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized.
-
- // It is not a good idea to vectorize division. We have to scalarize it and
+
+ { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized.
+
+ { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized.
+
+ // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up having to spilling regular
  // registers. The overhead of division is going to dominate most kernels
  // anyways so try hard to prevent vectorization of division - it is
@@ -1117,17 +1117,17 @@ unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
  }
  return X86TTIImpl::getIntImmCost(Imm, Ty);
 }
-
-bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) {
-  int DataWidth = DataTy->getPrimitiveSizeInBits();
-
-  // Todo: AVX512 allows gather/scatter, works with strided and random as well
-  if ((DataWidth < 32) || (Consecutive == 0))
-    return false;
-  if (ST->hasAVX512() || ST->hasAVX2())
-    return true;
-  return false;
-}
+
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) {
+ int DataWidth = DataTy->getPrimitiveSizeInBits();
+
+ // Todo: AVX512 allows gather/scatter, works with strided and random as well
+ if ((DataWidth < 32) || (Consecutive == 0))
+ return false;
+ if (ST->hasAVX512() || ST->hasAVX2())
+ return true;
+ return false;
+}
 bool X86TTIImpl::isLegalMaskedStore(Type *DataType, int Consecutive) {
   return isLegalMaskedLoad(DataType, Consecutive);