summaryrefslogtreecommitdiffstats
path: root/llvm/test/Transforms/LoopVectorize/X86
diff options
context:
space:
mode:
authorPhilip Reames <listmail@philipreames.com>2019-09-09 20:54:13 +0000
committerPhilip Reames <listmail@philipreames.com>2019-09-09 20:54:13 +0000
commit7403569be75176c9bf76d65a4e97a66ebe01165a (patch)
treea5ae1ac94fc4105ea47c1f64524558b1d218a8ff /llvm/test/Transforms/LoopVectorize/X86
parent589273bebd44c3fcc09d2ad87b259dcaff30d299 (diff)
downloadbcm5719-llvm-7403569be75176c9bf76d65a4e97a66ebe01165a.tar.gz
bcm5719-llvm-7403569be75176c9bf76d65a4e97a66ebe01165a.zip
[LoopVectorize] Leverage speculation safety to avoid masked.loads
If we're vectorizing a load in a predicated block, check to see if the load can be speculated rather than predicated. This allows us to generate a normal vector load instead of a masked.load. To do so, we must prove that all bytes accessed on any iteration of the original loop are dereferenceable, and that all loads (across all iterations) are properly aligned. This is equivelent to proving that hoisting the load into the loop header in the original scalar loop is safe. Note: There are a couple of code motion todos in the code. My intention is to wait about a day - to be sure this sticks - and then perform the NFC motion without furthe review. Differential Revision: https://reviews.llvm.org/D66688 llvm-svn: 371452
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize/X86')
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll32
1 files changed, 16 insertions, 16 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index 53fc12b4f57..ea5ce9c9e29 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -67,24 +67,24 @@ define i32 @test_explicit_pred(i64 %len) {
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[TMP16]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP25]], align 4
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4
; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[TMP17]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[TMP18]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12
; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[TMP19]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP31]], align 4
; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[TMP16]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP16]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI16:%.*]] = select <4 x i1> [[TMP17]], <4 x i32> [[WIDE_MASKED_LOAD13]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI17:%.*]] = select <4 x i1> [[TMP18]], <4 x i32> [[WIDE_MASKED_LOAD14]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI18:%.*]] = select <4 x i1> [[TMP19]], <4 x i32> [[WIDE_MASKED_LOAD15]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP16]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI16:%.*]] = select <4 x i1> [[TMP17]], <4 x i32> [[WIDE_LOAD13]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI17:%.*]] = select <4 x i1> [[TMP18]], <4 x i32> [[WIDE_LOAD14]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI18:%.*]] = select <4 x i1> [[TMP19]], <4 x i32> [[WIDE_LOAD15]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI16]]
; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI17]]
@@ -244,24 +244,24 @@ define i32 @test_explicit_pred_generic(i64 %len, i1* %test_base) {
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP71]], align 4
; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP73]], align 4
; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP75]], align 4
; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD7]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD8]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD9]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_LOAD8]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_LOAD9]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]]
; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]]
OpenPOWER on IntegriCloud