summaryrefslogtreecommitdiffstats
path: root/llvm/test
diff options
context:
space:
mode:
authorAmjad Aboud <amjad.aboud@intel.com>2017-08-08 12:17:56 +0000
committerAmjad Aboud <amjad.aboud@intel.com>2017-08-08 12:17:56 +0000
commit6fa6813aeccadb8f0780aa0ee4468ccc443122d8 (patch)
tree33fe6c66d7447c0288ad611bf712d5187ae23db6 /llvm/test
parent70a3511bd5a83d85ef277bb7b088e15917ba7915 (diff)
downloadbcm5719-llvm-6fa6813aeccadb8f0780aa0ee4468ccc443122d8.tar.gz
bcm5719-llvm-6fa6813aeccadb8f0780aa0ee4468ccc443122d8.zip
[X86] Improved X86::CMOV to Branch heuristic.
Resolved PR33954. This patch contains two more constraints that aim to reduce the noise cases where we convert CMOV into branch for small gain, and end up spending more cycles due to overhead. Differential Revision: https://reviews.llvm.org/D36081 llvm-svn: 310352
Diffstat (limited to 'llvm/test')
-rw-r--r--llvm/test/CodeGen/X86/pr33954.ll91
-rw-r--r--llvm/test/CodeGen/X86/x86-cmov-converter.ll58
2 files changed, 143 insertions, 6 deletions
diff --git a/llvm/test/CodeGen/X86/pr33954.ll b/llvm/test/CodeGen/X86/pr33954.ll
new file mode 100644
index 00000000000..084873a7dc5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr33954.ll
@@ -0,0 +1,91 @@
+; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; This test checks that the x86-cmov-converter optimization does not transform a
+;; CMOV instruction when the gain (in cycles) of converting it to a branch is
+;; less than a fixed threshold (measured for "-x86-cmov-converter-threshold=4").
+;;
+;; Test was created using the following command line:
+;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
+;; Where foo.c is:
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;int bar(int *a, int *b, int n) {
+;; int sum = 0;
+;; for (int i = 0; i < n; ++i) {
+;; int x = a[i] * a[i+1] * a[i+2];
+;; int y = b[i] * b[i+1];
+;; sum += y > x ? x : 0;
+;; }
+;; return sum;
+;;}
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Adding a test to the above function shows code with CMOV is 25% faster than
+;; the code with branch.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;#define N 10000
+;;int A[N];
+;;int B[N];
+;;
+;;
+;;
+;;int main () {
+;; for (int i=0; i< N; ++i) {
+;; A[i] = i%4;
+;; B[i] = i%5;
+;; }
+;; int sum = 0;
+;; for (int i=0; i< N*10; ++i)
+;; sum += bar(A, B, N);
+;; return sum;
+;;}
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CHECK-NOT: jg
+; CHECK: cmovle
+define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+ %cmp30 = icmp sgt i32 %n, 0
+ br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %.pre = load i32, i32* %a, align 4
+ %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 1
+ %.pre34 = load i32, i32* %arrayidx2.phi.trans.insert, align 4
+ %.pre35 = load i32, i32* %b, align 4
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]
+ ret i32 %sum.0.lcssa
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]
+ %1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]
+ %2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %mul = mul nsw i32 %1, %2
+ %3 = add nuw nsw i64 %indvars.iv, 2
+ %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %3
+ %4 = load i32, i32* %arrayidx5, align 4
+ %mul6 = mul nsw i32 %mul, %4
+ %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+ %5 = load i32, i32* %arrayidx11, align 4
+ %mul12 = mul nsw i32 %5, %0
+ %cmp13 = icmp sgt i32 %mul12, %mul6
+ %cond = select i1 %cmp13, i32 %mul6, i32 0
+ %add14 = add nsw i32 %cond, %sum.032
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = {"target-cpu"="skylake"}
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 2}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{!"clang version 5.0.0 (trunk)"}
diff --git a/llvm/test/CodeGen/X86/x86-cmov-converter.ll b/llvm/test/CodeGen/X86/x86-cmov-converter.ll
index 39877c14429..1837a8d4985 100644
--- a/llvm/test/CodeGen/X86/x86-cmov-converter.ll
+++ b/llvm/test/CodeGen/X86/x86-cmov-converter.ll
@@ -3,13 +3,13 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This test checks that the x86-cmov-converter optimization transforms CMOV
;; instructions into branches when it is profitable.
-;; There are 5 cases below:
+;; There are 6 cases below:
;; 1. CmovInCriticalPath:
;; CMOV depends on the condition and it is in the hot path.
;; Thus, it is worth transforming.
;;
;; 2. CmovNotInCriticalPath:
-;; similar test like in (1), just that CMOV is not in the hot path.
+;; Similar to test (1), except that the CMOV is not in the hot path.
;; Thus, it is not worth transforming.
;;
;; 3. MaxIndex:
@@ -26,16 +26,21 @@
;; Usually, binary search CMOV is not predicted.
;; Thus, it is not worth transforming.
;;
+;; 6. SmallGainPerLoop:
+;; The gain percentage from converting CMOV into branch is acceptable,
+;; however, the absolute gain is smaller than a threshold.
+;; Thus, it is not worth transforming.
+;;
;; Test was created using the following command line:
;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
;; Where foo.c is:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;void CmovInHotPath(int n, int a, int b, int *c, int *d) {
;; for (int i = 0; i < n; i++) {
-;; int t = c[i];
+;; int t = c[i] + 1;
;; if (c[i] * a > b)
;; t = 10;
-;; c[i] = t;
+;; c[i] = (c[i] + 1) * t;
;; }
;;}
;;
@@ -87,6 +92,16 @@
;; }
;; return Curr->Val;
;;}
+;;
+;;
+;;void SmallGainPerLoop(int n, int a, int b, int *c, int *d) {
+;; for (int i = 0; i < n; i++) {
+;; int t = c[i];
+;; if (c[i] * a > b)
+;; t = 10;
+;; c[i] = t;
+;; }
+;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%struct.Node = type { i32, %struct.Node*, %struct.Node* }
@@ -111,10 +126,12 @@ for.body: ; preds = %for.body.preheader,
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, 1
%mul = mul nsw i32 %0, %a
%cmp3 = icmp sgt i32 %mul, %b
- %. = select i1 %cmp3, i32 10, i32 %0
- store i32 %., i32* %arrayidx, align 4
+ %. = select i1 %cmp3, i32 10, i32 %add
+ %mul7 = mul nsw i32 %., %add
+ store i32 %mul7, i32* %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond.cleanup, label %for.body
@@ -253,6 +270,35 @@ while.end: ; preds = %while.body, %entry
ret i32 %.lcssa
}
+; CHECK-LABEL: SmallGainPerLoop
+; CHECK-NOT: jg
+; CHECK: cmovg
+
+define void @SmallGainPerLoop(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 {
+entry:
+ %cmp14 = icmp sgt i32 %n, 0
+ br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %mul = mul nsw i32 %0, %a
+ %cmp3 = icmp sgt i32 %mul, %b
+ %. = select i1 %cmp3, i32 10, i32 %0
+ store i32 %., i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The following test checks that x86-cmov-converter optimization transforms
;; CMOV instructions into branch correctly.
OpenPOWER on IntegriCloud