[CGP] despeculate expensive cttz/ctlz intrinsics

This is another step towards allowing SimplifyCFG to speculate harder, but then have CGP clean things up if the target doesn't like it.

Previous patches in this series:
http://reviews.llvm.org/D12882
http://reviews.llvm.org/D13297

D13297 should catch most expensive ops, but speculation of cttz/ctlz requires special handling because of weirdness in the intrinsic definition for handling a zero input (that definition can probably be blamed on x86).

For example, if we have the usual speculated-by-select expensive op pattern like this:

    %tobool = icmp eq i64 %A, 0
    %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)   ; is_zero_undef == true
    %cond = select i1 %tobool, i64 64, i64 %0
    ret i64 %cond

There's an instcombine that will turn it into:

    %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 false)   ; is_zero_undef == false

This CGP patch is looking for that case and despeculating it back into:

    entry:
      %tobool = icmp eq i64 %A, 0
      br i1 %tobool, label %cond.end, label %cond.true

    cond.true:
      %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)   ; is_zero_undef == true
      br label %cond.end

    cond.end:
      %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
      ret i64 %cond

This unfortunately may lead to poorer codegen (see the changes in the existing x86 test), but if we increase speculation in SimplifyCFG (the next step in this patch series), then we should avoid those kinds of cases in the first place.

The need for this patch was originally mentioned here:
http://reviews.llvm.org/D7506
with follow-up here:
http://reviews.llvm.org/D7554

Differential Revision: http://reviews.llvm.org/D14630

llvm-svn: 253573
author: Sanjay Patel <spatel@rotateright.com> 2015-11-19 16:37:10 +0000
committer: Sanjay Patel <spatel@rotateright.com> 2015-11-19 16:37:10 +0000
commit: 4699b8ab6acf76e595bfb4d525484df7a113f63f (patch)
tree: 3fda9e24dfe1a0212dbab8d1c4341d2f6dbae348 /llvm/test/CodeGen/X86/clz.ll
parent: dcc2500452746939988c613e2b6d00513dc2ab3e (diff)
download: bcm5719-llvm-4699b8ab6acf76e595bfb4d525484df7a113f63f.tar.gz
bcm5719-llvm-4699b8ab6acf76e595bfb4d525484df7a113f63f.zip
1 files changed, 37 insertions, 18 deletions
diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll
index e50d7ec437c..4a094480c93 100644
--- a/llvm/test/CodeGen/X86/clz.ll
+++ b/llvm/test/CodeGen/X86/clz.ll
@@ -87,55 +87,74 @@ define i64 @ctlz_i64(i64 %x) {
   ret i64 %tmp
 }
 
-define i32 @ctlz_i32_cmov(i32 %n) {
-; CHECK-LABEL: ctlz_i32_cmov:
+define i32 @ctlz_i32_zero_test(i32 %n) {
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+
+; CHECK-LABEL: ctlz_i32_zero_test:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    bsrl %edi, %ecx
-; CHECK-NEXT:    movl $63, %eax
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    movl $32, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB8_2
+; CHECK-NEXT:  # BB#1: # %cond.false
+; CHECK-NEXT:    bsrl %edi, %eax
 ; CHECK-NEXT:    xorl $31, %eax
+; CHECK-NEXT:  .LBB8_2: # %cond.end
 ; CHECK-NEXT:    retq
-; Generate a cmov to handle zero inputs when necessary.
   %tmp1 = call i32 @llvm.ctlz.i32(i32 %n, i1 false)
   ret i32 %tmp1
 }
 
 define i32 @ctlz_i32_fold_cmov(i32 %n) {
+; Don't generate the cmovne when the source is known non-zero (and bsr would
+; not set ZF).
+; rdar://9490949
+; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
+;        codegen doesn't know how to delete the movl and je.
+
 ; CHECK-LABEL: ctlz_i32_fold_cmov:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    orl $1, %edi
+; CHECK-NEXT:    movl $32, %eax
+; CHECK-NEXT:    je .LBB9_2
+; CHECK-NEXT:  # BB#1: # %cond.false
 ; CHECK-NEXT:    bsrl %edi, %eax
 ; CHECK-NEXT:    xorl $31, %eax
+; CHECK-NEXT:  .LBB9_2: # %cond.end
 ; CHECK-NEXT:    retq
-; Don't generate the cmovne when the source is known non-zero (and bsr would
-; not set ZF).
-; rdar://9490949
   %or = or i32 %n, 1
   %tmp1 = call i32 @llvm.ctlz.i32(i32 %or, i1 false)
   ret i32 %tmp1
 }
 
 define i32 @ctlz_bsr(i32 %n) {
+; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute
+; the most significant bit, which is what 'bsr' does natively.
+
 ; CHECK-LABEL: ctlz_bsr:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    bsrl %edi, %eax
 ; CHECK-NEXT:    retq
-; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute
-; the most significant bit, which is what 'bsr' does natively.
   %ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 true)
   %bsr = xor i32 %ctlz, 31
   ret i32 %bsr
 }
 
-define i32 @ctlz_bsr_cmov(i32 %n) {
-; CHECK-LABEL: ctlz_bsr_cmov:
+define i32 @ctlz_bsr_zero_test(i32 %n) {
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
+;        codegen doesn't know how to combine the $32 and $31 into $63.
+
+; CHECK-LABEL: ctlz_bsr_zero_test:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    bsrl %edi, %ecx
-; CHECK-NEXT:    movl $63, %eax
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    movl $32, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB11_2
+; CHECK-NEXT:  # BB#1: # %cond.false
+; CHECK-NEXT:    bsrl %edi, %eax
+; CHECK-NEXT:    xorl $31, %eax
+; CHECK-NEXT:  .LBB11_2: # %cond.end
+; CHECK-NEXT:    xorl $31, %eax
 ; CHECK-NEXT:    retq
-; Same as ctlz_bsr, but ensure this happens even when there is a potential
-; zero.
   %ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 false)
   %bsr = xor i32 %ctlz, 31
   ret i32 %bsr
author	Sanjay Patel <spatel@rotateright.com>	2015-11-19 16:37:10 +0000
committer	Sanjay Patel <spatel@rotateright.com>	2015-11-19 16:37:10 +0000
commit	4699b8ab6acf76e595bfb4d525484df7a113f63f (patch)
tree	3fda9e24dfe1a0212dbab8d1c4341d2f6dbae348 /llvm/test/CodeGen/X86/clz.ll
parent	dcc2500452746939988c613e2b6d00513dc2ab3e (diff)
download	bcm5719-llvm-4699b8ab6acf76e595bfb4d525484df7a113f63f.tar.gz bcm5719-llvm-4699b8ab6acf76e595bfb4d525484df7a113f63f.zip