| author | Hal Finkel <hfinkel@anl.gov> | 2014-04-04 23:51:18 +0000 | 
|---|---|---|
| committer | Hal Finkel <hfinkel@anl.gov> | 2014-04-04 23:51:18 +0000 | 
| commit | de0b413ec03d7df83cb2e0896b4ce2c19c6373cf (patch) | |
| tree | 6ec2df6c43a2a90c10c55b8150304263bcc1ba49 /llvm | |
| parent | b1308d525c133f256ac8e2f13ec2d3b589a2f3fc (diff) | |
[PowerPC] Adjust load/store costs in PPCTTI
This provides more realistic costs for the insert/extractelement instructions
(which are implemented as load/store pairs), accounts for the cheap unaligned
Altivec load sequence, and accounts for unaligned VSX loads/stores (which
carry no extra penalty).
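For illustration, the three cases named above can be exercised with IR like the following. This is a minimal sketch in the style of the cost-model tests touched below; the RUN line's -mcpu/-mattr flags and the function names are illustrative, not taken from this commit:

```llvm
; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck %s

define i32 @extract_elt(<4 x i32> %v) {
  ; Without a direct vector-to-GPR move, this is a store/reload through
  ; the stack, which is what the adjusted insert/extract costs model.
  %e = extractelement <4 x i32> %v, i32 0
  ret i32 %e
}

define <4 x i32> @unaligned_altivec_load(<4 x i32>* %p) {
  ; A misaligned 128-bit Altivec load expands to two aligned loads plus a
  ; permute, so it no longer gets the full decomposition penalty.
  %v = load <4 x i32>* %p, align 4
  ret <4 x i32> %v
}

define <2 x double> @unaligned_vsx_load(<2 x double>* %p) {
  ; VSX v2f64/v2i64 loads and stores handle unaligned access directly,
  ; so no extra cost is added.
  %v = load <2 x double>* %p, align 8
  ret <2 x double> %v
}
```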
Bad news:
MultiSource/Applications/sgefa/sgefa - 35% slowdown (this will require more investigation)
SingleSource/Benchmarks/McGill/queens - 20% slowdown (we no longer vectorize this, but what we had been vectorizing was a constant store that was then scalarized)
MultiSource/Benchmarks/FreeBench/pcompress2/pcompress2 - 2% slowdown
Good news:
SingleSource/Benchmarks/Shootout/ary3 - 54% speedup
SingleSource/Benchmarks/Shootout-C++/ary - 40% speedup
MultiSource/Benchmarks/Ptrdist/ks/ks - 35% speedup
MultiSource/Benchmarks/FreeBench/neural/neural - 30% speedup
MultiSource/Benchmarks/TSVC/Symbolics-flt/Symbolics-flt - 20% speedup
Unfortunately, estimating the costs of the stack-based scalarization sequences
is hard, and adjusting these costs is like a game of whac-a-mole :( I'll
revisit this after we have better codegen for vector extloads, truncstores,
and unaligned loads/stores.
llvm-svn: 205658
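To make the stack-based scalarization sequence concrete, here is a hand-written sketch (not compiler output) of roughly what a single insertelement amounts to on this subtarget, which is why it is costed as a load/store pair plus a penalty:

```llvm
define <4 x i32> @insert_via_stack(<4 x i32> %v, i32 %x) {
  ; Spill the vector, overwrite one lane in memory, reload the vector.
  %slot = alloca <4 x i32>, align 16
  store <4 x i32> %v, <4 x i32>* %slot, align 16
  %lane0 = bitcast <4 x i32>* %slot to i32*
  store i32 %x, i32* %lane0, align 4
  %r = load <4 x i32>* %slot, align 16
  ret <4 x i32> %r
}
```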
Diffstat (limited to 'llvm')
4 files changed, 30 insertions, 7 deletions
```diff
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 53b2dd65d0f..ed849b5bc85 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -216,7 +216,9 @@ unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
   // experimentally as a minimum needed to prevent unprofitable
   // vectorization for the paq8p benchmark.  It may need to be
   // raised further if other unprofitable cases remain.
-  unsigned LHSPenalty = 12;
+  unsigned LHSPenalty = 2;
+  if (ISD == ISD::INSERT_VECTOR_ELT)
+    LHSPenalty += 7;
 
   // Vector element insert/extract with Altivec is very expensive,
   // because they require store and reload with the attendant
@@ -240,14 +242,32 @@ unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   unsigned Cost =
     TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
 
-  // FIXME: Update this for VSX loads/stores that support unaligned access.
+  // VSX loads/stores support unaligned access.
+  if (ST->hasVSX()) {
+    if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
+      return Cost;
+  }
+
+  bool UnalignedAltivec =
+    Src->isVectorTy() &&
+    Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
+    LT.second.getSizeInBits() == 128 &&
+    Opcode == Instruction::Load;
 
   // PPC in general does not support unaligned loads and stores. They'll need
   // to be decomposed based on the alignment factor.
   unsigned SrcBytes = LT.second.getStoreSize();
-  if (SrcBytes && Alignment && Alignment < SrcBytes)
+  if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
     Cost += LT.first*(SrcBytes/Alignment-1);
 
+    // For a vector type, there is also scalarization overhead (only for
+    // stores, loads are expanded using the vector-load + permutation sequence,
+    // which is much less expensive).
+    if (Src->isVectorTy() && Opcode == Instruction::Store)
+      for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
+        Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
+  }
+
   return Cost;
 }
diff --git a/llvm/test/Analysis/CostModel/PowerPC/ext.ll b/llvm/test/Analysis/CostModel/PowerPC/ext.ll
index daaa8f5bac3..7d6a14e93cd 100644
--- a/llvm/test/Analysis/CostModel/PowerPC/ext.ll
+++ b/llvm/test/Analysis/CostModel/PowerPC/ext.ll
@@ -13,7 +13,7 @@ define void @exts() {
   ; CHECK: cost of 1 {{.*}} sext
   %v3 = sext <4 x i16> undef to <4 x i32>
 
-  ; CHECK: cost of 216 {{.*}} sext
+  ; CHECK: cost of 112 {{.*}} sext
   %v4 = sext <8 x i16> undef to <8 x i32>
 
   ret void
diff --git a/llvm/test/Analysis/CostModel/PowerPC/insert_extract.ll b/llvm/test/Analysis/CostModel/PowerPC/insert_extract.ll
index f51963d56fd..8dc003153a2 100644
--- a/llvm/test/Analysis/CostModel/PowerPC/insert_extract.ll
+++ b/llvm/test/Analysis/CostModel/PowerPC/insert_extract.ll
@@ -3,13 +3,13 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "powerpc64-unknown-linux-gnu"
 
 define i32 @insert(i32 %arg) {
-  ; CHECK: cost of 13 {{.*}} insertelement
+  ; CHECK: cost of 10 {{.*}} insertelement
   %x = insertelement <4 x i32> undef, i32 %arg, i32 0
   ret i32 undef
 }
 
 define i32 @extract(<4 x i32> %arg) {
-  ; CHECK: cost of 13 {{.*}} extractelement
+  ; CHECK: cost of 3 {{.*}} extractelement
   %x = extractelement <4 x i32> %arg, i32 0
   ret i32 %x
 }
diff --git a/llvm/test/Analysis/CostModel/PowerPC/load_store.ll b/llvm/test/Analysis/CostModel/PowerPC/load_store.ll
index 8145a1dc715..40862780faf 100644
--- a/llvm/test/Analysis/CostModel/PowerPC/load_store.ll
+++ b/llvm/test/Analysis/CostModel/PowerPC/load_store.ll
@@ -31,9 +31,12 @@ define i32 @loads(i32 %arg) {
 
   ; FIXME: There actually are sub-vector Altivec loads, and so we could handle
   ; this with a small expense, but we don't currently.
 
-  ; CHECK: cost of 60 {{.*}} load
+  ; CHECK: cost of 48 {{.*}} load
   load <4 x i16>* undef, align 2
 
+  ; CHECK: cost of 1 {{.*}} load
+  load <4 x i32>* undef, align 4
+
   ret i32 undef
 }
```
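As a worked example of the penalty formula above, assuming a 16-byte legalized vector type (SrcBytes = 16, LT.first = 1) on a non-VSX subtarget:

```llvm
; Cost += LT.first * (SrcBytes/Alignment - 1) = 1 * (16/4 - 1) = 3
; extra operations for the misaligned decomposition, and because this is
; a store, one extractelement cost per lane is added on top.
define void @unaligned_store(<4 x i32> %v, <4 x i32>* %p) {
  store <4 x i32> %v, <4 x i32>* %p, align 4
  ret void
}
```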

