author    Karthik Bhat <kv.bhat@samsung.com>  2015-01-20 06:11:00 +0000
committer Karthik Bhat <kv.bhat@samsung.com>  2015-01-20 06:11:00 +0000
commit    0b0f4660faa5072aa5f75560722f224b9712629c (patch)
tree      9379a83302baaaebfb17fd5025c54119e295f9e8 /llvm/test
parent    3087b22e1ae7bf00ec922a4bb9a950b27d85dd72 (diff)
Fix operand-reordering logic in SLPVectorizer to generate longer vectorizable chains.

This patch fixes two issues in reorderInputsAccordingToOpcode:
1) AllSameOpcodeLeft and AllSameOpcodeRight were being calculated incorrectly, which prevented vectorization in a few cases.
2) Adds logic to reorder operands when doing so yields a longer chain of consecutive loads, enabling vectorization. The same is handled for cases where we have an AltOpcode.
Thanks Michael for the inputs and review.
Review: http://reviews.llvm.org/D6677
llvm-svn: 226547
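The heuristic this change targets can be pictured with a small, self-contained C++ sketch. This is not the LLVM implementation (reorderInputsAccordingToOpcode works on IR values inside the SLPVectorizer); the types, the helper names, and the scoring rule below are illustrative assumptions, meant only to show why swapping one lane's operands can turn scattered loads into consecutive-load chains that become wide vector loads.

// Minimal sketch (assumed model, not LLVM code): for each lane's commutative
// binary op, decide which operand goes in the "left" column and which in the
// "right" column so that each column keeps one opcode and consecutive loads
// stay in the same column, forming a longer vectorizable chain.
#include <cstdio>
#include <utility>
#include <vector>

enum class Kind { Load, Add, Sub, Other };

struct Value {
  Kind kind;      // What the operand is.
  int loadIndex;  // For loads: position in memory (base + offset); unused otherwise.
};

struct BinOp {
  Value lhs, rhs; // Operands in their original scalar order.
};

// True if B is the load immediately after A in memory, i.e. the pair could be
// covered by one wide vector load.
static bool consecutiveLoads(const Value &A, const Value &B) {
  return A.kind == Kind::Load && B.kind == Kind::Load &&
         B.loadIndex == A.loadIndex + 1;
}

// Split each lane's operands into a left and a right column. Lane 0 is kept
// as-is; every later lane is swapped if the swap extends a consecutive-load
// chain in a column, or (failing that) if it keeps the opcode per column
// uniform while the original order does not.
static void reorderOperands(const std::vector<BinOp> &Bundle,
                            std::vector<Value> &Left,
                            std::vector<Value> &Right) {
  Left.push_back(Bundle[0].lhs);
  Right.push_back(Bundle[0].rhs);
  for (size_t i = 1; i < Bundle.size(); ++i) {
    Value L = Bundle[i].lhs, R = Bundle[i].rhs;
    bool keepExtendsChain = consecutiveLoads(Left.back(), L) ||
                            consecutiveLoads(Right.back(), R);
    bool swapExtendsChain = consecutiveLoads(Left.back(), R) ||
                            consecutiveLoads(Right.back(), L);
    bool keepSameKind = Left.back().kind == L.kind &&
                        Right.back().kind == R.kind;
    bool swapSameKind = Left.back().kind == R.kind &&
                        Right.back().kind == L.kind;
    bool doSwap = (swapExtendsChain && !keepExtendsChain) ||
                  (!swapExtendsChain && !keepExtendsChain &&
                   swapSameKind && !keepSameKind);
    if (doSwap)
      std::swap(L, R);
    Left.push_back(L);
    Right.push_back(R);
  }
}

int main() {
  // Models load_reorder_double below: c[0] = a[0]+b[0]; c[1] = b[1]+a[1];
  // (lane 1 has its operands swapped relative to lane 0). Load indices 0,1
  // stand for a[0],a[1] and 100,101 for b[0],b[1].
  std::vector<BinOp> bundle = {
      {{Kind::Load, 0}, {Kind::Load, 100}},   // a[0] + b[0]
      {{Kind::Load, 101}, {Kind::Load, 1}},   // b[1] + a[1]
  };
  std::vector<Value> left, right;
  reorderOperands(bundle, left, right);
  // Prints "left: 0 1" and "right: 100 101": both columns are now
  // consecutive-load chains, so each can become one wide vector load
  // feeding a single vector fadd.
  std::printf("left: %d %d\nright: %d %d\n", left[0].loadIndex,
              left[1].loadIndex, right[0].loadIndex, right[1].loadIndex);
  return 0;
}

Run on the two lanes of the load_reorder_double test added below, the sketch swaps lane 1's operands so that one column holds a[0],a[1] and the other holds b[0],b[1], which is the shape the new CHECK lines expect: one <2 x double> load per column feeding a single vector fadd.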
Diffstat (limited to 'llvm/test')
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/addsub.ll       | 133
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll | 110
2 files changed, 243 insertions, 0 deletions
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
index 174d4004684..d082b07b1d0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -10,6 +10,7 @@ target triple = "x86_64-unknown-linux-gnu"
@fb = common global [4 x float] zeroinitializer, align 16
@fc = common global [4 x float] zeroinitializer, align 16
@fa = common global [4 x float] zeroinitializer, align 16
+@fd = common global [4 x float] zeroinitializer, align 16
; CHECK-LABEL: @addsub
; CHECK: %5 = add nsw <4 x i32> %3, %4
@@ -177,5 +178,137 @@ entry:
ret void
}
+; Check vectorization of the following code for the float data type:
+; fc[0] = fb[0]+fa[0]; //swapped fb and fa
+; fc[1] = fa[1]-fb[1];
+; fc[2] = fa[2]+fb[2];
+; fc[3] = fa[3]-fb[3];
+
+; CHECK-LABEL: @reorder_alt
+; CHECK: %3 = fadd <4 x float> %1, %2
+; CHECK: %4 = fsub <4 x float> %1, %2
+; CHECK: %5 = shufflevector <4 x float> %3, <4 x float> %4, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+define void @reorder_alt() #0 {
+ %1 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+ %2 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+ %3 = fadd float %1, %2
+ store float %3, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+ %4 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+ %5 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+ %6 = fsub float %4, %5
+ store float %6, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+ %7 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+ %8 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+ %9 = fadd float %7, %8
+ store float %9, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+ %10 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+ %11 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+ %12 = fsub float %10, %11
+ store float %12, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+ ret void
+}
+
+; Check vectorization of the following code for the float data type:
+; fc[0] = fa[0]+(fb[0]-fd[0]);
+; fc[1] = fa[1]-(fb[1]+fd[1]);
+; fc[2] = fa[2]+(fb[2]-fd[2]);
+; fc[3] = fa[3]-(fd[3]+fb[3]); //swapped fd and fb
+
+; CHECK-LABEL: @reorder_alt_subTree
+; CHECK: %4 = fsub <4 x float> %3, %2
+; CHECK: %5 = fadd <4 x float> %3, %2
+; CHECK: %6 = shufflevector <4 x float> %4, <4 x float> %5, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK: %7 = fadd <4 x float> %1, %6
+; CHECK: %8 = fsub <4 x float> %1, %6
+; CHECK: %9 = shufflevector <4 x float> %7, <4 x float> %8, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+define void @reorder_alt_subTree() #0 {
+ %1 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+ %2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+ %3 = load float* getelementptr inbounds ([4 x float]* @fd, i32 0, i64 0), align 4
+ %4 = fsub float %2, %3
+ %5 = fadd float %1, %4
+ store float %5, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+ %6 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+ %7 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+ %8 = load float* getelementptr inbounds ([4 x float]* @fd, i32 0, i64 1), align 4
+ %9 = fadd float %7, %8
+ %10 = fsub float %6, %9
+ store float %10, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+ %11 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+ %12 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+ %13 = load float* getelementptr inbounds ([4 x float]* @fd, i32 0, i64 2), align 4
+ %14 = fsub float %12, %13
+ %15 = fadd float %11, %14
+ store float %15, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+ %16 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+ %17 = load float* getelementptr inbounds ([4 x float]* @fd, i32 0, i64 3), align 4
+ %18 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+ %19 = fadd float %17, %18
+ %20 = fsub float %16, %19
+ store float %20, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+ ret void
+}
+
+; Check vectorization of the following code for the double data type:
+; c[0] = (a[0]+b[0])-d[0];
+; c[1] = d[1]+(a[1]+b[1]); //swapped d[1] and (a[1]+b[1])
+
+; CHECK-LABEL: @reorder_alt_rightsubTree
+; CHECK: fadd <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: shufflevector <2 x double>
+define void @reorder_alt_rightsubTree(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %d) {
+ %1 = load double* %a
+ %2 = load double* %b
+ %3 = fadd double %1, %2
+ %4 = load double* %d
+ %5 = fsub double %3, %4
+ store double %5, double* %c
+ %6 = getelementptr inbounds double* %d, i64 1
+ %7 = load double* %6
+ %8 = getelementptr inbounds double* %a, i64 1
+ %9 = load double* %8
+ %10 = getelementptr inbounds double* %b, i64 1
+ %11 = load double* %10
+ %12 = fadd double %9, %11
+ %13 = fadd double %7, %12
+ %14 = getelementptr inbounds double* %c, i64 1
+ store double %13, double* %14
+ ret void
+}
+
+; Don't vectorize the following code for the float data type, as fsub is not commutative:
+; fc[0] = fb[0]+fa[0];
+; fc[1] = fa[1]-fb[1];
+; fc[2] = fa[2]+fb[2];
+; fc[3] = fb[3]-fa[3];
+; In the above code we can swap the operands of the 1st and 3rd operations, as fadd is
+; commutative, but not those of the 2nd or 4th, as fsub is not commutative.
+
+; CHECK-LABEL: @no_vec_shuff_reorder
+; CHECK-NOT: fadd <4 x float>
+; CHECK-NOT: fsub <4 x float>
+; CHECK-NOT: shufflevector
+define void @no_vec_shuff_reorder() #0 {
+ %1 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+ %2 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+ %3 = fadd float %1, %2
+ store float %3, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+ %4 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+ %5 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+ %6 = fsub float %4, %5
+ store float %6, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+ %7 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+ %8 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+ %9 = fadd float %7, %8
+ store float %9, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+ %10 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+ %11 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+ %12 = fsub float %10, %11
+ store float %12, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+ ret void
+}
+
+
attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
index c5322a839ed..cd446f0335b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -232,3 +232,113 @@ for.body3:
for.end:
ret void
}
+
+; Check vectorization of the following code for the double data type:
+; c[0] = a[0]+b[0];
+; c[1] = b[1]+a[1]; // swapped b[1] and a[1]
+
+; CHECK-LABEL: load_reorder_double
+; CHECK: load <2 x double>*
+; CHECK: fadd <2 x double>
+define void @load_reorder_double(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b){
+ %1 = load double* %a
+ %2 = load double* %b
+ %3 = fadd double %1, %2
+ store double %3, double* %c
+ %4 = getelementptr inbounds double* %b, i64 1
+ %5 = load double* %4
+ %6 = getelementptr inbounds double* %a, i64 1
+ %7 = load double* %6
+ %8 = fadd double %5, %7
+ %9 = getelementptr inbounds double* %c, i64 1
+ store double %8, double* %9
+ ret void
+}
+
+; Check vectorization of the following code for the float data type:
+; c[0] = a[0]+b[0];
+; c[1] = b[1]+a[1]; // swapped b[1] and a[1]
+; c[2] = a[2]+b[2];
+; c[3] = a[3]+b[3];
+
+; CHECK-LABEL: load_reorder_float
+; CHECK: load <4 x float>*
+; CHECK: fadd <4 x float>
+define void @load_reorder_float(float* nocapture %c, float* noalias nocapture readonly %a, float* noalias nocapture readonly %b){
+ %1 = load float* %a
+ %2 = load float* %b
+ %3 = fadd float %1, %2
+ store float %3, float* %c
+ %4 = getelementptr inbounds float* %b, i64 1
+ %5 = load float* %4
+ %6 = getelementptr inbounds float* %a, i64 1
+ %7 = load float* %6
+ %8 = fadd float %5, %7
+ %9 = getelementptr inbounds float* %c, i64 1
+ store float %8, float* %9
+ %10 = getelementptr inbounds float* %a, i64 2
+ %11 = load float* %10
+ %12 = getelementptr inbounds float* %b, i64 2
+ %13 = load float* %12
+ %14 = fadd float %11, %13
+ %15 = getelementptr inbounds float* %c, i64 2
+ store float %14, float* %15
+ %16 = getelementptr inbounds float* %a, i64 3
+ %17 = load float* %16
+ %18 = getelementptr inbounds float* %b, i64 3
+ %19 = load float* %18
+ %20 = fadd float %17, %19
+ %21 = getelementptr inbounds float* %c, i64 3
+ store float %20, float* %21
+ ret void
+}
+
+; Check that we properly reorder the code below so that it gets vectorized optimally:
+; a[0] = (b[0]+c[0])+d[0];
+; a[1] = d[1]+(b[1]+c[1]);
+; a[2] = (b[2]+c[2])+d[2];
+; a[3] = (b[3]+c[3])+d[3];
+
+; CHECK-LABEL: opcode_reorder
+; CHECK: load <4 x float>*
+; CHECK: fadd <4 x float>
+define void @opcode_reorder(float* noalias nocapture %a, float* noalias nocapture readonly %b,
+ float* noalias nocapture readonly %c,float* noalias nocapture readonly %d){
+ %1 = load float* %b
+ %2 = load float* %c
+ %3 = fadd float %1, %2
+ %4 = load float* %d
+ %5 = fadd float %3, %4
+ store float %5, float* %a
+ %6 = getelementptr inbounds float* %d, i64 1
+ %7 = load float* %6
+ %8 = getelementptr inbounds float* %b, i64 1
+ %9 = load float* %8
+ %10 = getelementptr inbounds float* %c, i64 1
+ %11 = load float* %10
+ %12 = fadd float %9, %11
+ %13 = fadd float %7, %12
+ %14 = getelementptr inbounds float* %a, i64 1
+ store float %13, float* %14
+ %15 = getelementptr inbounds float* %b, i64 2
+ %16 = load float* %15
+ %17 = getelementptr inbounds float* %c, i64 2
+ %18 = load float* %17
+ %19 = fadd float %16, %18
+ %20 = getelementptr inbounds float* %d, i64 2
+ %21 = load float* %20
+ %22 = fadd float %19, %21
+ %23 = getelementptr inbounds float* %a, i64 2
+ store float %22, float* %23
+ %24 = getelementptr inbounds float* %b, i64 3
+ %25 = load float* %24
+ %26 = getelementptr inbounds float* %c, i64 3
+ %27 = load float* %26
+ %28 = fadd float %25, %27
+ %29 = getelementptr inbounds float* %d, i64 3
+ %30 = load float* %29
+ %31 = fadd float %28, %30
+ %32 = getelementptr inbounds float* %a, i64 3
+ store float %31, float* %32
+ ret void
+}