[Loop Vectorizer] Support predication of div/rem

div/rem instructions in basic blocks that require predication currently prevent vectorization. This patch extends the existing mechanism for predicating stores to handle other instructions and leverages it to predicate divs and rems. Differential Revision: https://reviews.llvm.org/D22918 llvm-svn: 279620
author: Gil Rapaport <gil.rapaport@intel.com> 2016-08-24 11:37:57 +0000
committer: Gil Rapaport <gil.rapaport@intel.com> 2016-08-24 11:37:57 +0000
commit: 550148b2f662504f77e0b91fb134162c5a176a61 (patch)
tree: fe9d6ff971b9c7c900b592c6cec2aa60a9855f6d /llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
parent: 6392b8d4ce410eaaba08cdbf6c4e261ba1375c7a (diff)
download: bcm5719-llvm-550148b2f662504f77e0b91fb134162c5a176a61.tar.gz
bcm5719-llvm-550148b2f662504f77e0b91fb134162c5a176a61.zip
1 files changed, 173 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
new file mode 100644
index 00000000000..881eb51f9bc
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -0,0 +1,173 @@
+; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Test predication of non-void instructions, specifically (i) that these
+; instructions permit vectorization and (ii) the creation of an insertelement
+; and a Phi node. We check the full 2-element sequence for the first
+; instruction; for the rest we'll just make sure they get predicated based
+; on the code generated for the first element.
+define void @test(i32* nocapture %asd, i32* nocapture %aud,
+                  i32* nocapture %asr, i32* nocapture %aur) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %if.end
+  ret void
+
+; CHECK-LABEL: test
+; CHECK: vector.body:
+; CHECK:   %[[SDEE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK:   %[[SDCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[SDEE]], true
+; CHECK:   br i1 %[[SDCC]], label %[[CSD:[a-zA-Z0-9.]+]], label %[[ESD:[a-zA-Z0-9.]+]]
+; CHECK: [[CSD]]:
+; CHECK:   %[[SDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[SDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[SD0:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0]], %[[SDA1]]
+; CHECK:   %[[SD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SD0]], i32 0
+; CHECK:   br label %[[ESD]]
+; CHECK: [[ESD]]:
+; CHECK:   %[[SDR:[a-zA-Z0-9]+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[SD1]], %[[CSD]] ]
+; CHECK:   %[[SDEEH:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 1
+; CHECK:   %[[SDCCH:[a-zA-Z0-9]+]] = icmp eq i1 %[[SDEEH]], true
+; CHECK:   br i1 %[[SDCCH]], label %[[CSDH:[a-zA-Z0-9.]+]], label %[[ESDH:[a-zA-Z0-9.]+]]
+; CHECK: [[CSDH]]:
+; CHECK:   %[[SDA0H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1
+; CHECK:   %[[SDA1H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1
+; CHECK:   %[[SD0H:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0H]], %[[SDA1H]]
+; CHECK:   %[[SD1H:[a-zA-Z0-9]+]] = insertelement <2 x i32> %[[SDR]], i32 %[[SD0H]], i32 1
+; CHECK:   br label %[[ESDH]]
+; CHECK: [[ESDH]]:
+; CHECK:   %{{.*}} = phi <2 x i32> [ %[[SDR]], %[[ESD]] ], [ %[[SD1H]], %[[CSDH]] ]
+
+; CHECK:   %[[UDEE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK:   %[[UDCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[UDEE]], true
+; CHECK:   br i1 %[[UDCC]], label %[[CUD:[a-zA-Z0-9.]+]], label %[[EUD:[a-zA-Z0-9.]+]]
+; CHECK: [[CUD]]:
+; CHECK:   %[[UDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[UDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[UD0:[a-zA-Z0-9]+]] = udiv i32 %[[UDA0]], %[[UDA1]]
+; CHECK:   %[[UD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UD0]], i32 0
+; CHECK:   br label %[[EUD]]
+; CHECK: [[EUD]]:
+; CHECK:   %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UD1]], %[[CUD]] ]
+
+; CHECK:   %[[SREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK:   %[[SRCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[SREE]], true
+; CHECK:   br i1 %[[SRCC]], label %[[CSR:[a-zA-Z0-9.]+]], label %[[ESR:[a-zA-Z0-9.]+]]
+; CHECK: [[CSR]]:
+; CHECK:   %[[SRA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[SRA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[SR0:[a-zA-Z0-9]+]] = srem i32 %[[SRA0]], %[[SRA1]]
+; CHECK:   %[[SR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SR0]], i32 0
+; CHECK:   br label %[[ESR]]
+; CHECK: [[ESR]]:
+; CHECK:   %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[SR1]], %[[CSR]] ]
+
+; CHECK:   %[[UREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK:   %[[URCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[UREE]], true
+; CHECK:   br i1 %[[URCC]], label %[[CUR:[a-zA-Z0-9.]+]], label %[[EUR:[a-zA-Z0-9.]+]]
+; CHECK: [[CUR]]:
+; CHECK:   %[[URA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[URA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK:   %[[UR0:[a-zA-Z0-9]+]] = urem i32 %[[URA0]], %[[URA1]]
+; CHECK:   %[[UR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UR0]], i32 0
+; CHECK:   br label %[[EUR]]
+; CHECK: [[EUR]]:
+; CHECK:   %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UR1]], %[[CUR]] ]
+
+for.body:                                         ; preds = %if.end, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+  %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
+  %iud = getelementptr inbounds i32, i32* %aud, i64 %indvars.iv
+  %isr = getelementptr inbounds i32, i32* %asr, i64 %indvars.iv
+  %iur = getelementptr inbounds i32, i32* %aur, i64 %indvars.iv
+  %lsd = load i32, i32* %isd, align 4  ; divisor of the sdiv; also the value tested by the guard
+  %lud = load i32, i32* %iud, align 4  ; divisor of the udiv
+  %lsr = load i32, i32* %isr, align 4  ; divisor of the srem
+  %lur = load i32, i32* %iur, align 4  ; divisor of the urem
+  %psd = add nsw i32 %lsd, 23  ; sdiv dividend; stored as-is when %if.then is skipped
+  %pud = add nsw i32 %lud, 24  ; udiv dividend; stored as-is when %if.then is skipped
+  %psr = add nsw i32 %lsr, 25  ; srem dividend; stored as-is when %if.then is skipped
+  %pur = add nsw i32 %lur, 26  ; urem dividend; stored as-is when %if.then is skipped
+  %cmp1 = icmp slt i32 %lsd, 100  ; guard: %if.then runs only when %lsd < 100 (signed)
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %rsd = sdiv i32 %psd, %lsd  ; conditionally executed, so it must be predicated when vectorized
+  %rud = udiv i32 %pud, %lud  ; likewise, unsigned division
+  %rsr = srem i32 %psr, %lsr  ; likewise, signed remainder
+  %rur = urem i32 %pur, %lur  ; likewise, unsigned remainder
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]  ; div/rem result if %if.then ran, else the plain sum
+  %yud.0 = phi i32 [ %rud, %if.then ], [ %pud, %for.body ]
+  %ysr.0 = phi i32 [ %rsr, %if.then ], [ %psr, %for.body ]
+  %yur.0 = phi i32 [ %rur, %if.then ], [ %pur, %for.body ]
+  store i32 %ysd.0, i32* %isd, align 4  ; each result goes back to the address its input was loaded from
+  store i32 %yud.0, i32* %iud, align 4
+  store i32 %ysr.0, i32* %isr, align 4
+  store i32 %yur.0, i32* %iur, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 128  ; the loop body runs for indices 0..127
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Future-use test for predication under smarter scalar-to-scalar handling: this
+; test will fail once the vectorizer starts feeding scalarized values directly
+; to their scalar users, i.e. without generating redundant insertelement /
+; extractelement instructions. The predication code already supports that case
+; (it should then generate a phi for the scalar predicated value rather than
+; for the insertelement), but it cannot be exercised yet.
+; The FFU lines below are not matched by the RUN line above, which uses only
+; the default prefix. If this test starts failing, switch it to the FFU
+; sequence so that it verifies the new behavior from then on.
+define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %if.end
+  ret void
+
+; CHECK-LABEL: test_scalar2scalar
+; CHECK: vector.body:
+; CHECK:   br i1 %{{.*}}, label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]]
+; CHECK: [[THEN]]:
+; CHECK:   %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}}
+; CHECK:   %[[PDV:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[PD]], i32 0
+; CHECK:   br label %[[FI]]
+; CHECK: [[FI]]:
+; CHECK:   %[[PH:[a-zA-Z0-9]+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[PDV]], %[[THEN]] ]
+; FFU-LABEL: test_scalar2scalar
+; FFU:   vector.body:
+; FFU:     br i1 %{{.*}}, label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]]
+; FFU:   [[THEN]]:
+; FFU:     %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}}
+; FFU:     br label %[[FI]]
+; FFU:   [[FI]]:
+; FFU:     %{{.*}} = phi i32 [ undef, %vector.body ], [ %[[PD]], %[[THEN]] ]
+
+for.body:                                         ; preds = %if.end, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+  %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
+  %lsd = load i32, i32* %isd, align 4  ; divisor of the first sdiv; also the value tested by the guard
+  %isd.b = getelementptr inbounds i32, i32* %bsd, i64 %indvars.iv
+  %lsd.b = load i32, i32* %isd.b, align 4  ; dividend of the second sdiv
+  %psd = add nsw i32 %lsd, 23  ; first sdiv's dividend; stored as-is when %if.then is skipped
+  %cmp1 = icmp slt i32 %lsd, 100  ; guard: %if.then runs only when %lsd < 100 (signed)
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %sd1 = sdiv i32 %psd, %lsd  ; predicated value whose only user is the sdiv on the next line
+  %rsd = sdiv i32 %lsd.b, %sd1  ; consumes %sd1 as its divisor: the scalar-to-scalar dependency under test
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]  ; second quotient if %if.then ran, else the plain sum
+  store i32 %ysd.0, i32* %isd, align 4  ; written back to the address %lsd was loaded from
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 128  ; the loop body runs for indices 0..127
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
author	Gil Rapaport <gil.rapaport@intel.com>	2016-08-24 11:37:57 +0000
committer	Gil Rapaport <gil.rapaport@intel.com>	2016-08-24 11:37:57 +0000
commit	550148b2f662504f77e0b91fb134162c5a176a61 (patch)
tree	fe9d6ff971b9c7c900b592c6cec2aa60a9855f6d /llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
parent	6392b8d4ce410eaaba08cdbf6c4e261ba1375c7a (diff)
download	bcm5719-llvm-550148b2f662504f77e0b91fb134162c5a176a61.tar.gz bcm5719-llvm-550148b2f662504f77e0b91fb134162c5a176a61.zip