Diffstat (limited to 'llvm/test/CodeGen/Thumb2/LowOverheadLoops')
5 files changed, 15 insertions, 63 deletions
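The patch below renames the MVE tail-predication intrinsics from @llvm.arm.vctp* to @llvm.arm.mve.vctp* throughout these tests, and deletes copy_v2i64, the one test here that expected an @llvm.arm.vctp64. As a minimal sketch of the renamed intrinsic in use: the declaration's name and signature are taken from the CHECK lines below, while the wrapper function itself is hypothetical.

; Illustrative only: a standalone function returning the lane mask.
; vctp32 is understood to set the low min(%elems, 4) lanes of the result.
declare <4 x i1> @llvm.arm.mve.vctp32(i32)

define <4 x i1> @vctp32_example(i32 %elems) {
entry:
  %mask = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elems)
  ret <4 x i1> %mask
}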
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
index 79c81ca7a44..257d950c60f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -4,7 +4,7 @@
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
 ; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
@@ -57,7 +57,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
 ; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
@@ -109,7 +109,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; CHECK-LABEL: mul_v4i32
 ; CHECK: vector.body:
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
@@ -158,59 +158,11 @@ for.cond.cleanup: ; preds = %vector.body, %entry
   ret void
 }
 
-; CHECK-LABEL: copy_v2i64
-; CHECK: vector.body:
-; CHECK: %index = phi i32
-; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[ELEMS]])
-; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 2
-; CHECK: [[LD0:%[^ ]+]] = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]], <2 x i64> undef)
-; CHECK: tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[LD0]], <2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]])
-define void @copy_v2i64(i64* %a, i64* %b, i32 %N) {
-entry:
-  %cmp8 = icmp eq i32 %N, 0
-  %tmp8 = add i32 %N, 1
-  %tmp9 = lshr i32 %tmp8, 1
-  %tmp10 = shl nuw i32 %tmp9, 1
-  %tmp11 = add i32 %tmp10, -2
-  %tmp12 = lshr i32 %tmp11, 1
-  %tmp13 = add nuw nsw i32 %tmp12, 1
-  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
-
-vector.ph: ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <2 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <2 x i32> %broadcast.splatinsert10, <2 x i32> undef, <2 x i32> zeroinitializer
-  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
-  br label %vector.body
-
-vector.body: ; preds = %vector.body, %vector.ph
-  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
-  %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
-  %tmp1 = icmp ule <2 x i32> %induction, %broadcast.splat11
-  %tmp = getelementptr inbounds i64, i64* %a, i32 %index
-  %tmp2 = bitcast i64* %tmp to <2 x i64>*
-  %wide.masked.load = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %tmp2, i32 4, <2 x i1> %tmp1, <2 x i64> undef)
-  %tmp3 = getelementptr inbounds i64, i64* %b, i32 %index
-  %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
-  tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %wide.masked.load, <2 x i64>* %tmp7, i32 4, <2 x i1> %tmp1)
-  %index.next = add i32 %index, 2
-  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
-  %tmp16 = icmp ne i32 %tmp15, 0
-  br i1 %tmp16, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %vector.body, %entry
-  ret void
-}
-
 ; CHECK-LABEL: split_vector
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
@@ -268,7 +220,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; One of the loads now uses ult predicate.
 ; CHECK-LABEL: mismatch_load_pred
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
@@ -322,7 +274,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; CHECK-LABEL: mismatch_store_pred
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
index 2f9d301e808..f67a59f74fb 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
@@ -28,7 +28,7 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
 ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
 ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP8]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef)
@@ -140,7 +140,7 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
 ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP8]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
index 70e272ffc0d..330c6db24a7 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s
 
 ; CHECK-LABEL: expand_v8i16_v8i32
-; CHECK-NOT: call i32 @llvm.arm.vctp
+; CHECK-NOT: call i32 @llvm.arm.mve.vctp
 define void @expand_v8i16_v8i32(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
@@ -50,7 +50,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 
 ; CHECK-LABEL: expand_v8i16_v4i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
 ; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8
 ; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: %store.pred = icmp ule <4 x i32> %induction.store
@@ -117,7 +117,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 }
 
 ; CHECK-LABEL: expand_v4i32_v4i64
-; CHECK-NOT: call i32 @llvm.arm.vctp
+; CHECK-NOT: call i32 @llvm.arm.mve.vctp
 define void @expand_v4i32_v4i64(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i64* noalias nocapture %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
index 7cdd28fd0f3..c7ed9ce674d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
@@ -5,7 +5,7 @@
 ; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
 ; CHECK: phi i32
 ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]])
 ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
@@ -63,7 +63,7 @@ middle.block: ; preds = %vector.body
 ; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
 ; CHECK: phi i32
 ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]])
 ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 define i16 @reduction_i32_with_scalar(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
index dbf40f60cbd..38dc5ce54bc 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -6,13 +6,13 @@
 ; CHECK: vector.body:
 ; CHECK-NOT: phi i32 [ 0, %vector.ph ]
 ; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]])
 ; CHECK: [[SUB]] = sub i32 [[ELTS]], 4
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]]
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
 
 ; CHECK: middle.block:
-; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
+; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]])
 ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
 ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
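For reference, the load/store shape these CHECK lines match, pulled out into a standalone sketch: a single vctp mask predicates both memory operations. The function name and pointer parameters are hypothetical; the intrinsic signatures follow the (typed-pointer) masked load/store calls in the tests above.

; Hypothetical example of the predicated copy pattern the tests check for.
define void @predicated_copy_v4i32(<4 x i32>* %src, <4 x i32>* %dst, i32 %elems) {
entry:
  ; Build the tail predicate for the remaining %elems elements.
  %mask = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elems)
  ; Load and store only the active lanes.
  %ld = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %src, i32 4, <4 x i1> %mask, <4 x i32> undef)
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %ld, <4 x i32>* %dst, i32 4, <4 x i1> %mask)
  ret void
}

declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)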