-rw-r--r--  llvm/include/llvm/IR/IntrinsicsARM.td | 9
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrMVE.td | 6
-rw-r--r--  llvm/lib/Target/ARM/MVETailPredication.cpp | 13
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll | 60
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll | 4
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll | 6
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll | 4
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll | 4
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vctp.ll | 12
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll | 4

10 files changed, 40 insertions, 82 deletions
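
This commit renames the MVE VCTP (Vector Create Tail Predicate) intrinsics from the generic llvm.arm.vctp* names to the MVE-specific llvm.arm.mve.vctp* namespace, and drops tail-predication support for the 2-lane (64-bit element) case. A minimal before/after sketch of the rename at the IR level (the %elems value name is illustrative):

    ; before this change
    %pred = call <4 x i1> @llvm.arm.vctp32(i32 %elems)
    ; after this change
    %pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elems)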
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 337110b4917..894a4be2947 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -773,10 +773,11 @@ class Neon_Dot_Intrinsic
def int_arm_neon_udot : Neon_Dot_Intrinsic;
def int_arm_neon_sdot : Neon_Dot_Intrinsic;
-def int_arm_vctp8 : Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_arm_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_arm_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_arm_vctp64 : Intrinsic<[llvm_v2i1_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_arm_mve_vctp8 : Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_arm_mve_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_arm_mve_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>;
+// vctp64 takes v4i1, to work around v2i1 not being a legal MVE type
+def int_arm_mve_vctp64 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>;
// v8.3-A Floating-point complex add
def int_arm_neon_vcadd_rot90 : Neon_2Arg_Intrinsic;
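
Note the changed return type on the 64-bit variant: because v2i1 is not a legal MVE type, int_arm_mve_vctp64 is declared to return v4i1 instead. A sketch of the resulting IR declaration and call after this change (the %elems value name is illustrative):

    declare <4 x i1> @llvm.arm.mve.vctp64(i32)

    %pred = call <4 x i1> @llvm.arm.mve.vctp64(i32 %elems)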
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index e64ab9b7370..6cd20309126 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4291,11 +4291,11 @@ def MVE_VCTP32 : MVE_VCTP<"32", 0b10>;
def MVE_VCTP64 : MVE_VCTP<"64", 0b11>;
let Predicates = [HasMVEInt] in {
- def : Pat<(int_arm_vctp8 rGPR:$Rn),
+ def : Pat<(int_arm_mve_vctp8 rGPR:$Rn),
(v16i1 (MVE_VCTP8 rGPR:$Rn))>;
- def : Pat<(int_arm_vctp16 rGPR:$Rn),
+ def : Pat<(int_arm_mve_vctp16 rGPR:$Rn),
(v8i1 (MVE_VCTP16 rGPR:$Rn))>;
- def : Pat<(int_arm_vctp32 rGPR:$Rn),
+ def : Pat<(int_arm_mve_vctp32 rGPR:$Rn),
(v4i1 (MVE_VCTP32 rGPR:$Rn))>;
}
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 397f9004477..e8bc43dbe2d 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -485,10 +485,15 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
switch (VecTy->getNumElements()) {
default:
llvm_unreachable("unexpected number of lanes");
- case 2: VCTPID = Intrinsic::arm_vctp64; break;
- case 4: VCTPID = Intrinsic::arm_vctp32; break;
- case 8: VCTPID = Intrinsic::arm_vctp16; break;
- case 16: VCTPID = Intrinsic::arm_vctp8; break;
+ case 4: VCTPID = Intrinsic::arm_mve_vctp32; break;
+ case 8: VCTPID = Intrinsic::arm_mve_vctp16; break;
+ case 16: VCTPID = Intrinsic::arm_mve_vctp8; break;
+
+ // FIXME: vctp64 currently not supported because the predicate
+ // vector wants to be <2 x i1>, but v2i1 is not a legal MVE
+ // type, so problems happen at isel time.
+ // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics
+ // purposes, but takes a v4i1 instead of a v2i1.
}
Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
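
The switch above maps the vector width to the matching intrinsic ID and the builder then emits the call; the former 2-lane case is deliberately dropped per the FIXME rather than mapped to arm_mve_vctp64. The IR shape the pass now produces matches the updated CHECK lines in the tests below; a sketch for the 4-lane case, following those test patterns (value names are illustrative):

    %elems = phi i32 [ %N, %vector.ph ], [ %remaining, %vector.body ]
    %pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elems)
    %remaining = sub i32 %elems, 4
    %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %pred, <4 x i32> undef)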
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
index 79c81ca7a44..257d950c60f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -4,7 +4,7 @@
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
@@ -57,7 +57,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
@@ -109,7 +109,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
; CHECK-LABEL: mul_v4i32
; CHECK: vector.body:
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
@@ -158,59 +158,11 @@ for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
-; CHECK-LABEL: copy_v2i64
-; CHECK: vector.body:
-; CHECK: %index = phi i32
-; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[ELEMS]])
-; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 2
-; CHECK: [[LD0:%[^ ]+]] = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]], <2 x i64> undef)
-; CHECK: tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[LD0]], <2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]])
-define void @copy_v2i64(i64* %a, i64* %b, i32 %N) {
-entry:
- %cmp8 = icmp eq i32 %N, 0
- %tmp8 = add i32 %N, 1
- %tmp9 = lshr i32 %tmp8, 1
- %tmp10 = shl nuw i32 %tmp9, 1
- %tmp11 = add i32 %tmp10, -2
- %tmp12 = lshr i32 %tmp11, 1
- %tmp13 = add nuw nsw i32 %tmp12, 1
- br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
-
-vector.ph: ; preds = %entry
- %trip.count.minus.1 = add i32 %N, -1
- %broadcast.splatinsert10 = insertelement <2 x i32> undef, i32 %trip.count.minus.1, i32 0
- %broadcast.splat11 = shufflevector <2 x i32> %broadcast.splatinsert10, <2 x i32> undef, <2 x i32> zeroinitializer
- call void @llvm.set.loop.iterations.i32(i32 %tmp13)
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %vector.ph
- %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
- %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
- %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
- %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
- %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
- %tmp1 = icmp ule <2 x i32> %induction, %broadcast.splat11
- %tmp = getelementptr inbounds i64, i64* %a, i32 %index
- %tmp2 = bitcast i64* %tmp to <2 x i64>*
- %wide.masked.load = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %tmp2, i32 4, <2 x i1> %tmp1, <2 x i64> undef)
- %tmp3 = getelementptr inbounds i64, i64* %b, i32 %index
- %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
- tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %wide.masked.load, <2 x i64>* %tmp7, i32 4, <2 x i1> %tmp1)
- %index.next = add i32 %index, 2
- %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
- %tmp16 = icmp ne i32 %tmp15, 0
- br i1 %tmp16, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %vector.body, %entry
- ret void
-}
-
; CHECK-LABEL: split_vector
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
@@ -268,7 +220,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
; One of the loads now uses ult predicate.
; CHECK-LABEL: mismatch_load_pred
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
@@ -322,7 +274,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
; CHECK-LABEL: mismatch_store_pred
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
index 2f9d301e808..f67a59f74fb 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
@@ -28,7 +28,7 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP8]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef)
@@ -140,7 +140,7 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP8]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
index 70e272ffc0d..330c6db24a7 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
@@ -1,7 +1,7 @@
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s
; CHECK-LABEL: expand_v8i16_v8i32
-; CHECK-NOT: call i32 @llvm.arm.vctp
+; CHECK-NOT: call i32 @llvm.arm.mve.vctp
define void @expand_v8i16_v8i32(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
%cmp8 = icmp eq i32 %N, 0
@@ -50,7 +50,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
; CHECK-LABEL: expand_v8i16_v4i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8
; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: %store.pred = icmp ule <4 x i32> %induction.store
@@ -117,7 +117,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
}
; CHECK-LABEL: expand_v4i32_v4i64
-; CHECK-NOT: call i32 @llvm.arm.vctp
+; CHECK-NOT: call i32 @llvm.arm.mve.vctp
define void @expand_v4i32_v4i64(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i64* noalias nocapture %c, i32 %N) {
entry:
%cmp8 = icmp eq i32 %N, 0
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
index 7cdd28fd0f3..c7ed9ce674d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
@@ -5,7 +5,7 @@
; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
; CHECK: phi i32
; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]])
; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
@@ -63,7 +63,7 @@ middle.block: ; preds = %vector.body
; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
; CHECK: phi i32
; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]])
; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
define i16 @reduction_i32_with_scalar(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
index dbf40f60cbd..38dc5ce54bc 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -6,13 +6,13 @@
; CHECK: vector.body:
; CHECK-NOT: phi i32 [ 0, %vector.ph ]
; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]])
; CHECK: [[SUB]] = sub i32 [[ELTS]], 4
; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]]
; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
; CHECK: middle.block:
-; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
+; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]])
; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
diff --git a/llvm/test/CodeGen/Thumb2/mve-vctp.ll b/llvm/test/CodeGen/Thumb2/mve-vctp.ll
index 8f7e1696e67..d6e4d492f53 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vctp.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vctp.ll
@@ -10,7 +10,7 @@ define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) {
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vstrw.32 q0, [r2]
; CHECK-NEXT: bx lr
- %pred = call <16 x i1> @llvm.arm.vctp8(i32 %arg)
+ %pred = call <16 x i1> @llvm.arm.mve.vctp8(i32 %arg)
%ld = load <16 x i8>, <16 x i8>* %in
%res = select <16 x i1> %pred, <16 x i8> %ld, <16 x i8> zeroinitializer
store <16 x i8> %res, <16 x i8>* %out
@@ -26,7 +26,7 @@ define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) {
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vstrw.32 q0, [r2]
; CHECK-NEXT: bx lr
- %pred = call <8 x i1> @llvm.arm.vctp16(i32 %arg)
+ %pred = call <8 x i1> @llvm.arm.mve.vctp16(i32 %arg)
%ld = load <8 x i16>, <8 x i16>* %in
%res = select <8 x i1> %pred, <8 x i16> %ld, <8 x i16> zeroinitializer
store <8 x i16> %res, <8 x i16>* %out
@@ -42,13 +42,13 @@ define void @vctp32(i32 %arg, <4 x i32> *%in, <4 x i32>* %out) {
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vstrw.32 q0, [r2]
; CHECK-NEXT: bx lr
- %pred = call <4 x i1> @llvm.arm.vctp32(i32 %arg)
+ %pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %arg)
%ld = load <4 x i32>, <4 x i32>* %in
%res = select <4 x i1> %pred, <4 x i32> %ld, <4 x i32> zeroinitializer
store <4 x i32> %res, <4 x i32>* %out
ret void
}
-declare <16 x i1> @llvm.arm.vctp8(i32)
-declare <8 x i1> @llvm.arm.vctp16(i32)
-declare <4 x i1> @llvm.arm.vctp32(i32)
+declare <16 x i1> @llvm.arm.mve.vctp8(i32)
+declare <8 x i1> @llvm.arm.mve.vctp16(i32)
+declare <4 x i1> @llvm.arm.mve.vctp32(i32)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll b/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll
index f46eb77e755..e6e7de61094 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll
@@ -27,7 +27,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_vpnot(<8 x i16> %v, <8 x i16> %w, <8 x i1
; CHECK-NEXT: vaddt.i16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
- %0 = call <8 x i1> @llvm.arm.vctp16(i32 %n)
+ %0 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
%1 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %0)
%2 = trunc i32 %1 to i16
%3 = xor i16 %2, -1
@@ -40,5 +40,5 @@ entry:
declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>)
-declare <8 x i1> @llvm.arm.vctp16(i32)
+declare <8 x i1> @llvm.arm.mve.vctp16(i32)