summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
diff options
context:
space:
mode:
authorAndrea Di Biagio <Andrea_DiBiagio@sn.scee.net>2014-02-12 23:43:47 +0000
committerAndrea Di Biagio <Andrea_DiBiagio@sn.scee.net>2014-02-12 23:43:47 +0000
commitb7882b3bd11b46cca650e318bb7b921a1bfc8e21 (patch)
tree97a7fdb6b7e8ffe0337e7c9260cc03cb39f3c1b9 /llvm/lib/Target/X86/X86TargetTransformInfo.cpp
parent386d566395d90b01ddddf3d0ab219097cca8c37c (diff)
downloadbcm5719-llvm-b7882b3bd11b46cca650e318bb7b921a1bfc8e21.tar.gz
bcm5719-llvm-b7882b3bd11b46cca650e318bb7b921a1bfc8e21.zip
[Vectorizer] Add a new 'OperandValueKind' in TargetTransformInfo called
'OK_NonUniformConstValue' to identify operands which are constants but not constant splats. The cost model now allows returning 'OK_NonUniformConstValue' for non splat operands that are instances of ConstantVector or ConstantDataVector. With this change, targets are now able to compute different costs for instructions with non-uniform constant operands. For example, On X86 the cost of a vector shift may vary depending on whether the second operand is a uniform or non-uniform constant. This patch applies the following changes: - The cost model computation now takes into account non-uniform constants; - The cost of vector shift instructions has been improved in X86TargetTransformInfo analysis pass; - BBVectorize, SLPVectorizer and LoopVectorize now know how to distinguish between non-uniform and uniform constant operands. Added a new test to verify that the output of opt '-cost-model -analyze' is valid in the following configurations: SSE2, SSE4.1, AVX, AVX2. llvm-svn: 201272
Diffstat (limited to 'llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp35
1 files changed, 33 insertions, 2 deletions
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 207a7685c59..d50bab99ff3 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -225,6 +225,13 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// Look for AVX2 lowering tricks.
if (ST->hasAVX2()) {
+ if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ // On AVX2, a packed v16i16 shift left by a constant build_vector
+ // is lowered into a vector multiply (vpmullw).
+ return LT.first;
+
int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
if (Idx != -1)
return LT.first * AVX2CostTable[Idx].Cost;
@@ -257,6 +264,20 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * SSE2UniformConstCostTable[Idx].Cost;
}
+ if (ISD == ISD::SHL &&
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
+ EVT VT = LT.second;
+ if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
+ (VT == MVT::v4i32 && ST->hasSSE41()))
+ // Vector shift left by non uniform constant can be lowered
+ // into vector multiply (pmullw/pmulld).
+ return LT.first;
+ if (VT == MVT::v4i32 && ST->hasSSE2())
+ // A vector shift left by non uniform constant is converted
+ // into a vector multiply; the new multiply is eventually
+ // lowered into a sequence of shuffles and 2 x pmuludq.
+ ISD = ISD::MUL;
+ }
static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
@@ -271,6 +292,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized.
{ ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
{ ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized.
+ { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized.
{ ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized.
{ ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized.
@@ -308,6 +330,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// We don't have to scalarize unsupported ops. We can issue two half-sized
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
+ { ISD::MUL, MVT::v16i16, 4 },
{ ISD::MUL, MVT::v8i32, 4 },
{ ISD::SUB, MVT::v8i32, 4 },
{ ISD::ADD, MVT::v8i32, 4 },
@@ -323,7 +346,15 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// Look for AVX1 lowering tricks.
if (ST->hasAVX() && !ST->hasAVX2()) {
- int Idx = CostTableLookup(AVX1CostTable, ISD, LT.second);
+ EVT VT = LT.second;
+
+ // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
+ // sequence of extract + two vector multiply + insert.
+ if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) &&
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)
+ ISD = ISD::MUL;
+
+ int Idx = CostTableLookup(AVX1CostTable, ISD, VT);
if (Idx != -1)
return LT.first * AVX1CostTable[Idx].Cost;
}
@@ -343,7 +374,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// 2x pmuludq, 2x shuffle.
if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
!ST->hasSSE41())
- return 6;
+ return LT.first * 6;
// Fallback to the default implementation.
return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
OpenPOWER on IntegriCloud