author     Mohammed Agabaria <mohammed.agabaria@intel.com>   2017-07-02 12:16:15 +0000
committer  Mohammed Agabaria <mohammed.agabaria@intel.com>   2017-07-02 12:16:15 +0000
commit     eb09a810e6a59c436bef6ac392273b8f764dcd29 (patch)
tree       9ef6b18944bf1f8bfb53579024f66491215f3037 /llvm/test
parent     dc25c2b08b834ca40ebabf90c132abd4c128f7f2 (diff)
download   bcm5719-llvm-eb09a810e6a59c436bef6ac392273b8f764dcd29.tar.gz
           bcm5719-llvm-eb09a810e6a59c436bef6ac392273b8f764dcd29.zip
[X86][CM] Update add/sub costs of vectors of 64-bit elements on the X86 SLM arch
This patch updates the cost of addq/subq (add/subtract of vectors of 64-bit elements)
based on measured performance numbers for the SLM (Silvermont) architecture.
Differential Revision: https://reviews.llvm.org/D33983
llvm-svn: 306974
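For readers outside LLVM, a word on how these numbers are used: the X86 target keeps per-subtarget cost tables mapping an arithmetic opcode and vector type to a relative cost, wider vectors are costed by splitting them into legal pieces, and the loop vectorizer compares the summed vector cost against the scalar cost before picking a vectorization factor. The sketch below is a minimal standalone model of that lookup-and-split idea, not the actual patch or LLVM's API: the struct, function names, and string keys are invented for illustration, and the costs are copied from the updated test expectations in this diff.

// Standalone illustration of a per-subtarget cost-table lookup; hypothetical
// types and names, with costs mirroring the SLM test expectations below.
#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct CostEntry {
  std::string Op;  // arithmetic opcode, e.g. "add", "sub", "mul"
  std::string Ty;  // legal vector type, e.g. "v2i64"
  unsigned Cost;   // relative cost compared against the scalar form
};

// Costs implied by the revised slm-arith-costs.ll checks.
static const std::vector<CostEntry> SLMCosts = {
    {"add", "v2i64", 4},   // 64-bit vector add is expensive on Silvermont
    {"sub", "v2i64", 4},   // 64-bit vector subtract likewise
    {"mul", "v2i64", 17},  // 64-bit vector multiply is an emulated sequence
};

// Return the table cost for (Op, Ty); an empty result means "fall back to
// the generic cost model".
std::optional<unsigned> lookupSLMCost(const std::string &Op,
                                      const std::string &Ty) {
  for (const CostEntry &E : SLMCosts)
    if (E.Op == Op && E.Ty == Ty)
      return E.Cost;
  return std::nullopt;
}

int main() {
  // Wider vectors are costed by splitting into legal <2 x i64> pieces, which
  // reproduces the expectations in the test diff: v4i64 mul = 2*17 = 34,
  // v8i64 = 68, v16i64 = 136.
  for (unsigned Pieces : {1u, 2u, 4u, 8u})
    if (auto C = lookupSLMCost("mul", "v2i64"))
      std::cout << "v" << 2 * Pieces << "i64 mul cost = " << Pieces * *C
                << "\n";
  return 0;
}

(Compiles as ordinary C++17; in the real tree the equivalent data lives in the X86 TTI implementation rather than in a free-standing table like this.)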
Diffstat (limited to 'llvm/test')
-rw-r--r--   llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll        | 28
-rw-r--r--   llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll | 48
2 files changed, 69 insertions, 7 deletions
diff --git a/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll b/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll
index 3673a5d9e06..a767aa30b8e 100644
--- a/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll
+++ b/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll
@@ -3,6 +3,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+define <2 x i64> @slm-costs_64_vector_add(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM: cost of 4 {{.*}} add <2 x i64>
+  %res = add <2 x i64> %a, %b
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @slm-costs_64_vector_sub(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM: cost of 4 {{.*}} sub <2 x i64>
+  %res = sub <2 x i64> %a, %b
+  ret <2 x i64> %res
+}
+
 ; 8bit mul
 define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b) {
 entry:
@@ -13,7 +27,7 @@ entry:
 
 define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i8>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i8>
   %res = mul nsw <2 x i8> %a, %b
   ret <2 x i8> %res
 }
@@ -97,7 +111,7 @@ entry:
 
 define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i16>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i16>
   %res = mul nsw <2 x i16> %a, %b
   ret <2 x i16> %res
 }
@@ -181,7 +195,7 @@ entry:
 
 define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i32>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i32>
   %res = mul nsw <2 x i32> %a, %b
   ret <2 x i32> %res
 }
@@ -217,28 +231,28 @@ entry:
 
 define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i64>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i64>
   %res = mul nsw <2 x i64> %a, %b
   ret <2 x i64> %res
 }
 
 define <4 x i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b) {
 entry:
-; SLM: cost of 22 {{.*}} mul nsw <4 x i64>
+; SLM: cost of 34 {{.*}} mul nsw <4 x i64>
   %res = mul nsw <4 x i64> %a, %b
   ret <4 x i64> %res
 }
 
 define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x i64> %b) {
 entry:
-; SLM: cost of 44 {{.*}} mul nsw <8 x i64>
+; SLM: cost of 68 {{.*}} mul nsw <8 x i64>
   %res = mul nsw <8 x i64> %a, %b
   ret <8 x i64> %res
 }
 
 define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b) {
 entry:
-; SLM: cost of 88 {{.*}} mul nsw <16 x i64>
+; SLM: cost of 136 {{.*}} mul nsw <16 x i64>
   %res = mul nsw <16 x i64> %a, %b
   ret <16 x i64> %res
 }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll b/llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
new file mode 100644
index 00000000000..8be9f1d0799
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
@@ -0,0 +1,48 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -mcpu=slm -debug 2>&1 | FileCheck -check-prefix=MSG %s
+; This test should not be vectorized in X86\SLM arch
+; Vectorizing the 64bit multiply in this case is wrong since
+; it can be done with a lower bit mode (notice that the sources is 16bit)
+; Also addq\subq (quad word) has a high cost on SLM arch.
+; this test has a bad performance (regression of -70%) if vectorized on SLM arch
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @no_vec(i32 %LastIndex, i16* nocapture readonly %InputData, i16 signext %lag, i16 signext %Scale) {
+entry:
+; MSG: LV: Selecting VF: 1.
+  %cmp17 = icmp sgt i32 %LastIndex, 0
+  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv5 = sext i16 %Scale to i64
+  %sh_prom = and i64 %conv5, 4294967295
+  %0 = sext i16 %lag to i64
+  %wide.trip.count = zext i32 %LastIndex to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %conv8 = trunc i64 %add7 to i32
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %Accumulator.0.lcssa = phi i32 [ 0, %entry ], [ %conv8, %for.cond.cleanup.loopexit ]
+  ret i32 %Accumulator.0.lcssa
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %Accumulator.018 = phi i64 [ 0, %for.body.lr.ph ], [ %add7, %for.body ]
+  %arrayidx = getelementptr inbounds i16, i16* %InputData, i64 %indvars.iv
+  %1 = load i16, i16* %arrayidx, align 2
+  %conv = sext i16 %1 to i64
+  %2 = add nsw i64 %indvars.iv, %0
+  %arrayidx3 = getelementptr inbounds i16, i16* %InputData, i64 %2
+  %3 = load i16, i16* %arrayidx3, align 2
+  %conv4 = sext i16 %3 to i64
+  %mul = mul nsw i64 %conv4, %conv
+  %shr = ashr i64 %mul, %sh_prom
+  %add7 = add i64 %shr, %Accumulator.018
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+