| author | Mohammed Agabaria <mohammed.agabaria@intel.com> | 2017-01-05 14:03:41 +0000 | 
|---|---|---|
| committer | Mohammed Agabaria <mohammed.agabaria@intel.com> | 2017-01-05 14:03:41 +0000 | 
| commit | 23599ba7940d9891ed5eb982aaaed116f97aea74 (patch) | |
| tree | 910698abcc007239ab93b2f167fcd7ff96a75603 /llvm/test/Transforms/LoopVectorize | |
| parent | a983e7c4a415bc28b8bc6218f4881d11b3a2d995 (diff) | |
| download | bcm5719-llvm-23599ba7940d9891ed5eb982aaaed116f97aea74.tar.gz bcm5719-llvm-23599ba7940d9891ed5eb982aaaed116f97aea74.zip | |
Currently isLikelyComplexAddressComputation tries to figure out if the given stride seems to be 'complex' and needs some extra cost for address computation handling.
This heuristic is target dependent, and the answer may not be the same for all targets.
Pass the decision on whether the given stride is complex or not to the target by sending the stride information via SCEV to getAddressComputationCost instead of the 'IsComplex' flag.
Specifically, on X86 targets we don't see any significant address computation cost for strided accesses in general.
Differential Revision: https://reviews.llvm.org/D27518
llvm-svn: 291106
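
To make the interface change concrete, here is a minimal, self-contained C++ sketch of the idea rather than the actual LLVM TTI code: instead of the vectorizer passing a precomputed 'IsComplex' flag, it hands the stride description (in LLVM, the pointer's SCEV plus a ScalarEvolution pointer) to getAddressComputationCost and lets each target price the address computation itself. All names below (StrideInfo, GenericCostModel, X86CostModel) are hypothetical stand-ins, not LLVM classes.

```cpp
// Hypothetical stand-ins for illustration only; these are not LLVM's classes.
#include <cstdio>
#include <optional>

struct StrideInfo {
  // Plays the role of the SCEV handed to getAddressComputationCost: it lets
  // the target see whether the access is strided and what the stride is.
  bool IsStrided = false;
  std::optional<long> ConstantStride; // stride known at compile time, if any
};

struct GenericCostModel {
  virtual ~GenericCostModel() = default;
  // Generic assumption: non-strided (gather-like) vector address computation
  // is expensive; a strided access gets a modest cost.
  virtual int getAddressComputationCost(const StrideInfo &SI) const {
    return SI.IsStrided ? 1 : 10;
  }
};

struct X86CostModel : GenericCostModel {
  // X86 override (per the commit message): strided address computation is
  // hidden by indexed addressing modes, so it is priced as essentially free
  // when the stride is known at compile time.
  int getAddressComputationCost(const StrideInfo &SI) const override {
    if (SI.IsStrided)
      return SI.ConstantStride ? 0 : 1;
    return GenericCostModel::getAddressComputationCost(SI);
  }
};

int main() {
  StrideInfo Strided{true, 100}; // e.g. a column walk through [100 x i32]
  GenericCostModel Generic;
  X86CostModel X86;
  std::printf("generic strided cost: %d\n",
              Generic.getAddressComputationCost(Strided));
  std::printf("x86 strided cost:     %d\n",
              X86.getAddressComputationCost(Strided));
}
```

The design point mirrors the patch description: the generic model keeps treating gather-like vector address computation as expensive, while the X86 override reports strided address computation as cheap because the indexing modes hide it.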
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize')
| -rw-r--r-- | llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll | 54 | 
1 file changed, 54 insertions, 0 deletions
```diff
diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
new file mode 100644
index 00000000000..645f3360543
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -0,0 +1,54 @@
+; This test checks that the given loop is still beneficial for vectorization
+; even if it contains a scalarized load (gather on AVX2)
+;RUN: opt < %s -loop-vectorize -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 {
+entry:
+  %idxprom = sext i32 %i to i64
+  %idxprom5 = sext i32 %j to i64
+  br label %for.body
+
+  for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add7
+
+  for.body:                                         ; preds = %for.body, %entry
+  ; the loop gets vectorized
+  ; first consecutive load as vector load
+  ; CHECK: %wide.load = load <8 x i32>
+  ; second strided load scalarized
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.015 = phi i32 [ 0, %entry ], [ %add7, %for.body ]
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %idxprom, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx2, align 4, !tbaa !1
+  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %indvars.iv, i64 %idxprom5
+  %1 = load i32, i32* %arrayidx6, align 4, !tbaa !1
+  %mul = mul nsw i32 %1, %0
+  %add = add i32 %sum.015, 4
+  %add7 = add i32 %add, %mul
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { "target-cpu"="core-avx2" "target-features"="+avx,+avx2,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 4.0.0 (cfe/trunk 284570)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
```
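
For readers who do not read LLVM IR fluently, @matrix_row_col above corresponds roughly to the following C/C++ source (a reconstruction for illustration; no C source is part of this commit):

```cpp
// Approximate source of @matrix_row_col (reconstructed; not from the commit).
// data[i][k] is consecutive across iterations and becomes the wide vector
// load; data[k][j] strides by 100 ints (400 bytes) per iteration and is
// scalarized (the gather case the new X86 cost model has to price).
int matrix_row_col(const int (*data)[100], int i, int j) {
  int sum = 0;
  for (int k = 0; k < 100; ++k)
    sum += 4 + data[i][k] * data[k][j];
  return sum;
}
```

The CHECK lines then verify that, with the stride information visible to the X86 cost model, the loop is still considered profitable to vectorize even though the strided load has to be scalarized (a gather on AVX2).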

