summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--clang/include/clang/Basic/arm_mve.td75
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vabdq.c18
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c18
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c12
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c8
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c18
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c8
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vminq.c14
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c18
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c24
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c18
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c6
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c6
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c20
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c18
-rw-r--r--llvm/include/llvm/IR/IntrinsicsARM.td61
-rw-r--r--llvm/lib/Target/ARM/ARMInstrMVE.td77
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll42
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll40
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll30
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll12
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll26
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll12
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll22
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll40
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll40
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll40
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll14
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll14
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll42
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll40
31 files changed, 432 insertions, 401 deletions
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 061759ad91d..e81b75dfe31 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -63,9 +63,9 @@ def vqsubq_u: Intrinsic<Vector, (args Vector:$a, Vector:$b),
let params = T.Int in {
def vaddq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (add $a, $b)>;
def vhaddq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
- (IRInt<"vhadd", [Vector]> $a, $b)>;
+ (IRInt<"vhadd", [Vector]> $a, $b, (unsignedflag Scalar))>;
def vrhaddq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
- (IRInt<"vrhadd", [Vector]> $a, $b)>;
+ (IRInt<"vrhadd", [Vector]> $a, $b, (unsignedflag Scalar))>;
def vandq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (and $a, $b)>;
def vbicq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (and $a, (not $b))>;
def veorq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (xor $a, $b)>;
@@ -73,18 +73,18 @@ def vornq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (or $a, (not $b))>;
def vorrq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (or $a, $b)>;
def vsubq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (sub $a, $b)>;
def vhsubq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
- (IRInt<"vhsub", [Vector]> $a, $b)>;
+ (IRInt<"vhsub", [Vector]> $a, $b, (unsignedflag Scalar))>;
def vmulq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (mul $a, $b)>;
def vmulhq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
- (IRInt<"vmulh", [Vector]> $a, $b)>;
+ (IRInt<"vmulh", [Vector]> $a, $b, (unsignedflag Scalar))>;
def vrmulhq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
- (IRInt<"vrmulh", [Vector]> $a, $b)>;
+ (IRInt<"vrmulh", [Vector]> $a, $b, (unsignedflag Scalar))>;
def vmullbq_int: Intrinsic<DblVector, (args Vector:$a, Vector:$b),
(IRInt<"vmull", [DblVector, Vector]>
- $a, $b, 0)>;
+ $a, $b, (unsignedflag Scalar), 0)>;
def vmulltq_int: Intrinsic<DblVector, (args Vector:$a, Vector:$b),
(IRInt<"vmull", [DblVector, Vector]>
- $a, $b, 1)>;
+ $a, $b, (unsignedflag Scalar), 1)>;
}
let params = T.Signed in {
def vqdmulhq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
@@ -134,15 +134,15 @@ def "": Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b,
// Plain intrinsics
let params = T.Usual in {
def vabdq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
- (IRInt<"vabd", [Vector]> $a, $b)>;
+ (IRInt<"vabd", [Vector]> $a, $b, (unsignedflag Scalar))>;
}
-multiclass VectorVectorArithmetic<string operation, int wantXVariant = 1> {
- defm "" : IntrinsicMX<Vector, (args Vector:$a, Vector:$b,
- Predicate:$pred),
- (IRInt<operation, [Vector, Predicate]> $a, $b,
- $pred, $inactive),
- wantXVariant>;
+multiclass VectorVectorArithmetic<string operation, dag extraArgs = (?),
+ int wantXVariant = 1> {
+ defm "" : IntrinsicMX<
+ Vector, (args Vector:$a, Vector:$b, Predicate:$pred),
+ !con((IRInt<operation, [Vector, Predicate]> $a, $b),
+ extraArgs, (? $pred, $inactive)), wantXVariant>;
}
multiclass VectorVectorArithmeticBitcast<string operation> {
@@ -157,7 +157,7 @@ multiclass VectorVectorArithmeticBitcast<string operation> {
// Predicated intrinsics
let params = T.Usual in {
- defm vabdq : VectorVectorArithmetic<"abd_predicated">;
+ defm vabdq : VectorVectorArithmetic<"abd_predicated", (? (unsignedflag Scalar))>;
defm vaddq : VectorVectorArithmetic<"add_predicated">;
defm vsubq : VectorVectorArithmetic<"sub_predicated">;
defm vmulq : VectorVectorArithmetic<"mul_predicated">;
@@ -168,42 +168,41 @@ let params = T.Usual in {
defm vorrq : VectorVectorArithmeticBitcast<"orr_predicated">;
}
-multiclass DblVectorVectorArithmetic<string operation, dag top> {
- defm "" : IntrinsicMX<DblVector, (args Vector:$a, Vector:$b,
- Predicate:$pred),
- (IRInt<operation,
- [DblVector, Vector, Predicate]>
- $a, $b, top, $pred, $inactive)>;
+multiclass DblVectorVectorArithmetic<string operation, dag extraArgs = (?)> {
+ defm "" : IntrinsicMX<
+ DblVector, (args Vector:$a, Vector:$b, Predicate:$pred),
+ !con((IRInt<operation, [DblVector, Vector, Predicate]> $a, $b),
+ extraArgs, (? $pred, $inactive))>;
}
// Predicated intrinsics - Int types only
let params = T.Int in {
- defm vminq : VectorVectorArithmetic<"min_predicated">;
- defm vmaxq : VectorVectorArithmetic<"max_predicated">;
- defm vmulhq : VectorVectorArithmetic<"mulh_predicated">;
- defm vrmulhq : VectorVectorArithmetic<"rmulh_predicated">;
- defm vqaddq : VectorVectorArithmetic<"qadd_predicated", 0>;
- defm vhaddq : VectorVectorArithmetic<"hadd_predicated">;
- defm vrhaddq : VectorVectorArithmetic<"rhadd_predicated">;
- defm vqsubq : VectorVectorArithmetic<"qsub_predicated", 0>;
- defm vhsubq : VectorVectorArithmetic<"hsub_predicated">;
- defm vmullbq_int : DblVectorVectorArithmetic<"mull_int_predicated", (u32 0)>;
- defm vmulltq_int : DblVectorVectorArithmetic<"mull_int_predicated", (u32 1)>;
+ defm vminq : VectorVectorArithmetic<"min_predicated", (? (unsignedflag Scalar))>;
+ defm vmaxq : VectorVectorArithmetic<"max_predicated", (? (unsignedflag Scalar))>;
+ defm vmulhq : VectorVectorArithmetic<"mulh_predicated", (? (unsignedflag Scalar))>;
+ defm vrmulhq : VectorVectorArithmetic<"rmulh_predicated", (? (unsignedflag Scalar))>;
+ defm vqaddq : VectorVectorArithmetic<"qadd_predicated", (? (unsignedflag Scalar)), 0>;
+ defm vhaddq : VectorVectorArithmetic<"hadd_predicated", (? (unsignedflag Scalar))>;
+ defm vrhaddq : VectorVectorArithmetic<"rhadd_predicated", (? (unsignedflag Scalar))>;
+ defm vqsubq : VectorVectorArithmetic<"qsub_predicated", (? (unsignedflag Scalar)), 0>;
+ defm vhsubq : VectorVectorArithmetic<"hsub_predicated", (? (unsignedflag Scalar))>;
+ defm vmullbq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 0))>;
+ defm vmulltq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 1))>;
}
let params = T.Signed in {
- defm vqdmulhq : VectorVectorArithmetic<"qdmulh_predicated", 0>;
- defm vqrdmulhq : VectorVectorArithmetic<"qrdmulh_predicated", 0>;
+ defm vqdmulhq : VectorVectorArithmetic<"qdmulh_predicated", (?), 0>;
+ defm vqrdmulhq : VectorVectorArithmetic<"qrdmulh_predicated", (?), 0>;
}
let params = T.Poly, overrideKindLetter = "p" in {
- defm vmullbq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (u32 0)>;
- defm vmulltq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (u32 1)>;
+ defm vmullbq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (? (u32 0))>;
+ defm vmulltq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (? (u32 1))>;
}
// Predicated intrinsics - Float types only
let params = T.Float in {
- defm vminnmq : VectorVectorArithmetic<"min_predicated">;
- defm vmaxnmq : VectorVectorArithmetic<"max_predicated">;
+ defm vminnmq : VectorVectorArithmetic<"min_predicated", (? (u32 0))>;
+ defm vmaxnmq : VectorVectorArithmetic<"max_predicated", (? (u32 0))>;
}
let params = T.Int in {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c b/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c
index fb8a5fb5e56..d972fb78b50 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c
@@ -6,7 +6,7 @@
// CHECK-LABEL: @test_vabdq_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0)
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b)
@@ -20,7 +20,7 @@ int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b)
// CHECK-LABEL: @test_vabdq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b)
@@ -34,7 +34,7 @@ uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b)
// CHECK-LABEL: @test_vabdq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0)
// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vabdq_f32(float16x8_t a, float16x8_t b)
@@ -50,7 +50,7 @@ float16x8_t test_vabdq_f32(float16x8_t a, float16x8_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vabdq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ uint16x8_t test_vabdq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
int8x16_t test_vabdq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ int8x16_t test_vabdq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x float> [[TMP2]]
//
float32x4_t test_vabdq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p)
@@ -98,7 +98,7 @@ float32x4_t test_vabdq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b,
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> undef)
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vabdq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -114,7 +114,7 @@ uint16x8_t test_vabdq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
uint32x4_t test_vabdq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
@@ -130,7 +130,7 @@ uint32x4_t test_vabdq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> undef)
// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vabdq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
index a31e6ae4ffa..1d97ea80520 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
@@ -6,7 +6,7 @@
// CHECK-LABEL: @test_vhaddq_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1)
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b)
// CHECK-LABEL: @test_vhaddq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b)
// CHECK-LABEL: @test_vhaddq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b)
@@ -50,7 +50,7 @@ uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
int8x16_t test_vhaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vhaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pre
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vhaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vhaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mv
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vhaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -98,7 +98,7 @@ int32x4_t test_vhaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pr
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
uint8x16_t test_vhaddq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -114,7 +114,7 @@ uint8x16_t test_vhaddq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef)
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vhaddq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -130,7 +130,7 @@ int16x8_t test_vhaddq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
uint32x4_t test_vhaddq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c
index 34fd8c93a9a..633fd32d947 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c
@@ -6,7 +6,7 @@
// CHECK-LABEL: @test_vhsubq_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1)
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b)
// CHECK-LABEL: @test_vhsubq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b)
// CHECK-LABEL: @test_vhsubq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b)
@@ -50,7 +50,7 @@ uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
int8x16_t test_vhsubq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vhsubq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pre
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vhsubq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vhsubq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mv
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vhsubq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c
index 3f44b654c48..539508127ca 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c
@@ -36,7 +36,7 @@ float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vmaxnmq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p)
@@ -52,7 +52,7 @@ float16x8_t test_vmaxnmq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x float> [[TMP2]]
//
float32x4_t test_vmaxnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p)
@@ -68,7 +68,7 @@ float32x4_t test_vmaxnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> undef)
// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vmaxnmq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
@@ -84,7 +84,7 @@ float16x8_t test_vmaxnmq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> undef)
// CHECK-NEXT: ret <4 x float> [[TMP2]]
//
float32x4_t test_vmaxnmq_x_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
index 7b766fb0d25..93fb24d339b 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
@@ -53,7 +53,7 @@ int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
uint8x16_t test_vmaxq_m_u8(uint8x16_t inactive, uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -69,7 +69,7 @@ uint8x16_t test_vmaxq_m_u8(uint8x16_t inactive, uint8x16_t a, uint8x16_t b, mve_
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vmaxq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -85,7 +85,7 @@ int16x8_t test_vmaxq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pre
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
uint32x4_t test_vmaxq_m_u32(uint32x4_t inactive, uint32x4_t a, uint32x4_t b, mve_pred16_t p)
@@ -101,15 +101,15 @@ uint32x4_t test_vmaxq_m_u32(uint32x4_t inactive, uint32x4_t a, uint32x4_t b, mve
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
-int8x16_t test_vmaxq_x_u8(int8x16_t a, int8x16_t b, mve_pred16_t p)
+uint8x16_t test_vmaxq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
{
#ifdef POLYMORPHIC
return vmaxq_x(a, b, p);
#else /* POLYMORPHIC */
- return vmaxq_x_s8(a, b, p);
+ return vmaxq_x_u8(a, b, p);
#endif /* POLYMORPHIC */
}
@@ -117,7 +117,7 @@ int8x16_t test_vmaxq_x_u8(int8x16_t a, int8x16_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> undef)
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vmaxq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -133,7 +133,7 @@ uint16x8_t test_vmaxq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> undef)
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vmaxq_x_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -141,6 +141,6 @@ int32x4_t test_vmaxq_x_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
#ifdef POLYMORPHIC
return vmaxq_x(a, b, p);
#else /* POLYMORPHIC */
- return vmaxq_x_u32(a, b, p);
+ return vmaxq_x_s32(a, b, p);
#endif /* POLYMORPHIC */
}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c
index ed6b31dfb0f..f951afe61b8 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c
@@ -36,7 +36,7 @@ float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vminnmq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p)
@@ -52,7 +52,7 @@ float16x8_t test_vminnmq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x float> [[TMP2]]
//
float32x4_t test_vminnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p)
@@ -68,7 +68,7 @@ float32x4_t test_vminnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> undef)
// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vminnmq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
@@ -84,7 +84,7 @@ float16x8_t test_vminnmq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> undef)
// CHECK-NEXT: ret <4 x float> [[TMP2]]
//
float32x4_t test_vminnmq_x_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c
index b6981104d8b..105c5d6a361 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vminq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c
@@ -53,7 +53,7 @@ uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
int8x16_t test_vminq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -69,7 +69,7 @@ int8x16_t test_vminq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vminq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -85,7 +85,7 @@ uint16x8_t test_vminq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vminq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -101,7 +101,7 @@ int32x4_t test_vminq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pre
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
uint8x16_t test_vminq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -117,7 +117,7 @@ uint8x16_t test_vminq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef)
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vminq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -125,7 +125,7 @@ int16x8_t test_vminq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
#ifdef POLYMORPHIC
return vminq_x(a, b, p);
#else /* POLYMORPHIC */
- return vminq_x_u16(a, b, p);
+ return vminq_x_s16(a, b, p);
#endif /* POLYMORPHIC */
}
@@ -133,7 +133,7 @@ int16x8_t test_vminq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
uint32x4_t test_vminq_x_s32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c
index 3a6b89504dd..816a0df0111 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c
@@ -6,7 +6,7 @@
// CHECK-LABEL: @test_vmulhq_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1)
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
uint8x16_t test_vmulhq_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ uint8x16_t test_vmulhq_u8(uint8x16_t a, uint8x16_t b)
// CHECK-LABEL: @test_vmulhq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
int16x8_t test_vmulhq_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int16x8_t test_vmulhq_s16(int16x8_t a, int16x8_t b)
// CHECK-LABEL: @test_vmulhq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
uint32x4_t test_vmulhq_u32(uint32x4_t a, uint32x4_t b)
@@ -50,7 +50,7 @@ uint32x4_t test_vmulhq_u32(uint32x4_t a, uint32x4_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
int8x16_t test_vmulhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vmulhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pre
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vmulhq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vmulhq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mv
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -98,7 +98,7 @@ int32x4_t test_vmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pr
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
uint8x16_t test_vmulhq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -114,7 +114,7 @@ uint8x16_t test_vmulhq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef)
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vmulhq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -130,7 +130,7 @@ int16x8_t test_vmulhq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
uint32x4_t test_vmulhq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c
index 008a368912c..757fb03b0bc 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c
@@ -6,7 +6,7 @@
// CHECK-LABEL: @test_vmullbq_int_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0)
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 0)
// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
int16x8_t test_vmullbq_int_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ int16x8_t test_vmullbq_int_u8(uint8x16_t a, uint8x16_t b)
// CHECK-LABEL: @test_vmullbq_int_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0)
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 0)
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
int32x4_t test_vmullbq_int_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int32x4_t test_vmullbq_int_s16(int16x8_t a, int16x8_t b)
// CHECK-LABEL: @test_vmullbq_int_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 0)
// CHECK-NEXT: ret <2 x i64> [[TMP0]]
//
uint64x2_t test_vmullbq_int_u32(uint32x4_t a, uint32x4_t b)
@@ -64,7 +64,7 @@ uint32x4_t test_vmullbq_poly_p16(uint16x8_t a, uint16x8_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 0, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vmullbq_int_m_s8(int16x8_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -80,7 +80,7 @@ int16x8_t test_vmullbq_int_m_s8(int16x8_t inactive, int8x16_t a, int8x16_t b, mv
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 0, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
uint32x4_t test_vmullbq_int_m_u16(uint32x4_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -96,7 +96,7 @@ uint32x4_t test_vmullbq_int_m_u16(uint32x4_t inactive, uint16x8_t a, uint16x8_t
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 0, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <2 x i64> [[TMP2]]
//
int64x2_t test_vmullbq_int_m_s32(int64x2_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -128,7 +128,7 @@ uint16x8_t test_vmullbq_poly_m_p8(uint16x8_t inactive, uint8x16_t a, uint8x16_t
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 0, <16 x i1> [[TMP1]], <8 x i16> undef)
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vmullbq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -136,7 +136,7 @@ uint16x8_t test_vmullbq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
#ifdef POLYMORPHIC
return vmullbq_int_x(a, b, p);
#else /* POLYMORPHIC */
- return vmullbq_int_x_s8(a, b, p);
+ return vmullbq_int_x_u8(a, b, p);
#endif /* POLYMORPHIC */
}
@@ -144,7 +144,7 @@ uint16x8_t test_vmullbq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 0, <8 x i1> [[TMP1]], <4 x i32> undef)
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vmullbq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -152,7 +152,7 @@ int32x4_t test_vmullbq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
#ifdef POLYMORPHIC
return vmullbq_int_x(a, b, p);
#else /* POLYMORPHIC */
- return vmullbq_int_x_u16(a, b, p);
+ return vmullbq_int_x_s16(a, b, p);
#endif /* POLYMORPHIC */
}
@@ -160,7 +160,7 @@ int32x4_t test_vmullbq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <2 x i64> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 0, <4 x i1> [[TMP1]], <2 x i64> undef)
// CHECK-NEXT: ret <2 x i64> [[TMP2]]
//
uint64x2_t test_vmullbq_int_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
@@ -168,7 +168,7 @@ uint64x2_t test_vmullbq_int_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
#ifdef POLYMORPHIC
return vmullbq_int_x(a, b, p);
#else /* POLYMORPHIC */
- return vmullbq_int_x_s32(a, b, p);
+ return vmullbq_int_x_u32(a, b, p);
#endif /* POLYMORPHIC */
}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c
index 7baa9944ba1..1cd982e31a4 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c
@@ -6,7 +6,7 @@
// CHECK-LABEL: @test_vmulltq_int_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 1)
// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
int16x8_t test_vmulltq_int_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ int16x8_t test_vmulltq_int_u8(uint8x16_t a, uint8x16_t b)
// CHECK-LABEL: @test_vmulltq_int_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 1)
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
int32x4_t test_vmulltq_int_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int32x4_t test_vmulltq_int_s16(int16x8_t a, int16x8_t b)
// CHECK-LABEL: @test_vmulltq_int_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 1)
// CHECK-NEXT: ret <2 x i64> [[TMP0]]
//
uint64x2_t test_vmulltq_int_u32(uint32x4_t a, uint32x4_t b)
@@ -64,7 +64,7 @@ uint32x4_t test_vmulltq_poly_p16(uint16x8_t a, uint16x8_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 1, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vmulltq_int_m_s8(int16x8_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -80,7 +80,7 @@ int16x8_t test_vmulltq_int_m_s8(int16x8_t inactive, int8x16_t a, int8x16_t b, mv
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 1, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
uint32x4_t test_vmulltq_int_m_u16(uint32x4_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -96,7 +96,7 @@ uint32x4_t test_vmulltq_int_m_u16(uint32x4_t inactive, uint16x8_t a, uint16x8_t
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 1, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <2 x i64> [[TMP2]]
//
int64x2_t test_vmulltq_int_m_s32(int64x2_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -128,7 +128,7 @@ uint16x8_t test_vmulltq_poly_m_p8(uint16x8_t inactive, uint8x16_t a, uint8x16_t
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 1, <16 x i1> [[TMP1]], <8 x i16> undef)
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vmulltq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -144,7 +144,7 @@ uint16x8_t test_vmulltq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 1, <8 x i1> [[TMP1]], <4 x i32> undef)
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vmulltq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -160,7 +160,7 @@ int32x4_t test_vmulltq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <2 x i64> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 1, <4 x i1> [[TMP1]], <2 x i64> undef)
// CHECK-NEXT: ret <2 x i64> [[TMP2]]
//
uint64x2_t test_vmulltq_int_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c
index 6d5df3d4d6e..8361ab3618e 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c
@@ -50,7 +50,7 @@ uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
int8x16_t test_vqaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vqaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pre
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vqaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vqaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mv
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vqaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c
index b51f01041af..c8924bf9128 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c
@@ -50,7 +50,7 @@ uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
int8x16_t test_vqsubq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vqsubq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pre
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vqsubq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vqsubq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mv
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vqsubq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c
index eb34fec9c13..eb28d239741 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c
@@ -6,7 +6,7 @@
// CHECK-LABEL: @test_vrhaddq_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1)
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b)
// CHECK-LABEL: @test_vrhaddq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b)
// CHECK-LABEL: @test_vrhaddq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b)
@@ -50,7 +50,7 @@ uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
int8x16_t test_vrhaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vrhaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pr
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vrhaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vrhaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, m
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vrhaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -98,7 +98,7 @@ int32x4_t test_vrhaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_p
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
uint8x16_t test_vrhaddq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -114,10 +114,10 @@ uint8x16_t test_vrhaddq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> undef)
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
-int16x8_t test_vrhaddq_x_u16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+uint16x8_t test_vrhaddq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
{
#ifdef POLYMORPHIC
return vrhaddq_x(a, b, p);
@@ -130,7 +130,7 @@ int16x8_t test_vrhaddq_x_u16(int16x8_t a, int16x8_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
uint32x4_t test_vrhaddq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c
index 3bf77f0d16c..4f65f002b01 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c
@@ -6,7 +6,7 @@
// CHECK-LABEL: @test_vrmulhq_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1)
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
uint8x16_t test_vrmulhq_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ uint8x16_t test_vrmulhq_u8(uint8x16_t a, uint8x16_t b)
// CHECK-LABEL: @test_vrmulhq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
int16x8_t test_vrmulhq_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int16x8_t test_vrmulhq_s16(int16x8_t a, int16x8_t b)
// CHECK-LABEL: @test_vrmulhq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
uint32x4_t test_vrmulhq_u32(uint32x4_t a, uint32x4_t b)
@@ -50,7 +50,7 @@ uint32x4_t test_vrmulhq_u32(uint32x4_t a, uint32x4_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
int8x16_t test_vrmulhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vrmulhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pr
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vrmulhq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vrmulhq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, m
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vrmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -98,7 +98,7 @@ int32x4_t test_vrmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_p
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
uint8x16_t test_vrmulhq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -114,7 +114,7 @@ uint8x16_t test_vrmulhq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef)
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vrmulhq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -130,7 +130,7 @@ int16x8_t test_vrmulhq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
uint32x4_t test_vrmulhq_m_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index f76ba563512..50329ecc0e6 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -802,14 +802,16 @@ multiclass IntrinsicSignSuffix<list<LLVMType> rets, list<LLVMType> params = [],
}
def int_arm_mve_min_predicated: Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+ llvm_anyvector_ty, LLVMMatchType<0>],
[IntrNoMem]>;
def int_arm_mve_max_predicated: Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+ llvm_anyvector_ty, LLVMMatchType<0>],
[IntrNoMem]>;
def int_arm_mve_abd_predicated: Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
- [IntrNoMem]>;
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+ llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
def int_arm_mve_add_predicated: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
[IntrNoMem]>;
@@ -835,40 +837,42 @@ def int_arm_mve_mul_predicated: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
[IntrNoMem]>;
def int_arm_mve_mulh_predicated: Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+ llvm_anyvector_ty, LLVMMatchType<0>],
[IntrNoMem]>;
def int_arm_mve_qdmulh_predicated: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
[IntrNoMem]>;
def int_arm_mve_rmulh_predicated: Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+ llvm_anyvector_ty, LLVMMatchType<0>],
[IntrNoMem]>;
def int_arm_mve_qrdmulh_predicated: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
[IntrNoMem]>;
def int_arm_mve_mull_int_predicated: Intrinsic<[llvm_anyvector_ty],
- [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty, llvm_anyvector_ty,
- LLVMMatchType<0>],
+ [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty /* unsigned */,
+ llvm_i32_ty /* top */, llvm_anyvector_ty, LLVMMatchType<0>],
[IntrNoMem]>;
def int_arm_mve_mull_poly_predicated: Intrinsic<[llvm_anyvector_ty],
[llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty, llvm_anyvector_ty,
LLVMMatchType<0>],
[IntrNoMem]>;
def int_arm_mve_qadd_predicated: Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
- [IntrNoMem]>;
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+ llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
def int_arm_mve_hadd_predicated: Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
- [IntrNoMem]>;
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+ llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
def int_arm_mve_rhadd_predicated: Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
- [IntrNoMem]>;
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+ llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
def int_arm_mve_qsub_predicated: Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
- [IntrNoMem]>;
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+ llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
def int_arm_mve_hsub_predicated: Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
- [IntrNoMem]>;
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+ llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty],
[llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
@@ -962,7 +966,8 @@ def int_arm_mve_asrl: ARM_MVE_qrshift_single<[llvm_i32_ty, llvm_i32_ty]>;
def int_arm_mve_vabd: Intrinsic<
[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
+ [IntrNoMem]>;
def int_arm_mve_vadc: Intrinsic<
[llvm_anyvector_ty, llvm_i32_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>;
@@ -972,28 +977,34 @@ def int_arm_mve_vadc_predicated: Intrinsic<
llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
def int_arm_mve_vmulh: Intrinsic<
[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
+ [IntrNoMem]>;
def int_arm_mve_vqdmulh: Intrinsic<
[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
def int_arm_mve_vhadd: Intrinsic<
[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
+ [IntrNoMem]>;
def int_arm_mve_vrhadd: Intrinsic<
[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
+ [IntrNoMem]>;
def int_arm_mve_vhsub: Intrinsic<
[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
+ [IntrNoMem]>;
def int_arm_mve_vrmulh: Intrinsic<
[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
+ [IntrNoMem]>;
def int_arm_mve_vqrdmulh: Intrinsic<
[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
def int_arm_mve_vmull: Intrinsic<
[llvm_anyvector_ty],
- [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrNoMem]>;
+ [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty /* unsigned */,
+ llvm_i32_ty /* top */], [IntrNoMem]>;
def int_arm_mve_vmull_poly: Intrinsic<
[llvm_anyvector_ty],
[llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrNoMem]>;
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 3189caf626d..e25af565503 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -320,6 +320,9 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred,
// The suffix used on an instruction that mentions the whole type.
string Suffix = suffixletter ## BitsSuffix;
+
+ // The letter part of the suffix only.
+ string SuffixLetter = suffixletter;
}
// Integer vector types that don't treat signed and unsigned differently.
@@ -1090,12 +1093,12 @@ let Predicates = [HasMVEFloat] in {
(v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
def : Pat<(v8f16 (fmaxnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
(v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
- def : Pat<(v4f32 (int_arm_mve_max_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
+ def : Pat<(v4f32 (int_arm_mve_max_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), (i32 0),
(v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
(v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
ARMVCCThen, (v4i1 VCCR:$mask),
(v4f32 MQPR:$inactive)))>;
- def : Pat<(v8f16 (int_arm_mve_max_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
+ def : Pat<(v8f16 (int_arm_mve_max_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), (i32 0),
(v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
(v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
ARMVCCThen, (v8i1 VCCR:$mask),
@@ -1111,12 +1114,12 @@ let Predicates = [HasMVEFloat] in {
def : Pat<(v8f16 (fminnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
(v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
def : Pat<(v4f32 (int_arm_mve_min_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
- (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
+ (i32 0), (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
(v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
ARMVCCThen, (v4i1 VCCR:$mask),
(v4f32 MQPR:$inactive)))>;
def : Pat<(v8f16 (int_arm_mve_min_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
- (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
+ (i32 0), (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
(v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
ARMVCCThen, (v8i1 VCCR:$mask),
(v8f16 MQPR:$inactive)))>;
@@ -1149,7 +1152,8 @@ multiclass MVE_VMINMAX_m<string iname, bit bit_4, MVEVectorVTInfo VTI,
// Predicated min/max
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -1771,14 +1775,15 @@ multiclass MVE_VQADD_m<MVEVectorVTInfo VTI,
def "" : MVE_VQADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
let Predicates = [HasMVEInt] in {
- // Unpredicated add-and-divide-by-two
+ // Unpredicated saturating add
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
- // Predicated add-and-divide-by-two
+ // Predicated saturating add
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -1801,14 +1806,15 @@ multiclass MVE_VQSUB_m<MVEVectorVTInfo VTI,
def "" : MVE_VQSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
let Predicates = [HasMVEInt] in {
- // Unpredicated subtract-and-divide-by-two
+ // Unpredicated saturating subtract
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
- // Predicated subtract-and-divide-by-two
+ // Predicated saturating subtract
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -1845,13 +1851,15 @@ multiclass MVE_VABD_m<MVEVectorVTInfo VTI,
let Predicates = [HasMVEInt] in {
// Unpredicated absolute difference
- def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated absolute difference
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -1887,13 +1895,15 @@ multiclass MVE_VRHADD_m<MVEVectorVTInfo VTI,
let Predicates = [HasMVEInt] in {
// Unpredicated rounding add-with-divide-by-two
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated add-with-divide-by-two
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -1939,12 +1949,12 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
let Predicates = [HasMVEInt] in {
// Unpredicated add-and-divide-by-two
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated add-and-divide-by-two
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
@@ -1969,13 +1979,15 @@ multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
let Predicates = [HasMVEInt] in {
// Unpredicated subtract-and-divide-by-two
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated subtract-and-divide-by-two
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -3318,11 +3330,13 @@ multiclass MVE_VABDT_fp_m<MVEVectorVTInfo VTI,
def "" : MVE_VABD_fp<VTI.Suffix, VTI.Size{0}>;
let Predicates = [HasMVEFloat] in {
- def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 0))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (i32 0), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -3976,17 +3990,22 @@ multiclass MVE_VMULL_m<MVEVectorVTInfo VTI,
def "" : MVE_VMULL<"vmull" # !if(Top, "t", "b"), VTI.Suffix, VTI.Unsigned,
VTI.Size, Top, cstr>;
+ foreach uflag = [!if(!eq(VTI.SuffixLetter, "p"),
+ (?), (? (i32 VTI.Unsigned)))] in
let Predicates = [HasMVEInt] in {
+
// Unpredicated multiply
- def : Pat<(VTI.DblVec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (i32 Top))),
+ def : Pat<(VTI.DblVec !con((unpred_op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec MQPR:$Qn)),
+ uflag, (? (i32 Top)))),
(VTI.DblVec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated multiply
- def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (i32 Top),
- (VTI.Pred VCCR:$mask), (VTI.DblVec MQPR:$inactive))),
+ def : Pat<(VTI.DblVec !con((pred_int (VTI.Vec MQPR:$Qm),
+ (VTI.Vec MQPR:$Qn)),
+ uflag, (? (i32 Top), (VTI.Pred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive)))),
(VTI.DblVec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -4059,13 +4078,15 @@ multiclass MVE_VxMULH_m<string iname, MVEVectorVTInfo VTI, SDNode unpred_op,
let Predicates = [HasMVEInt] in {
// Unpredicated multiply returning high bits
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated multiply returning high bits
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
index fcd57664a16..961489613b6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
@@ -7,23 +7,23 @@ define arm_aapcs_vfpcc <16 x i8> @test_vabdq_s8(<16 x i8> %a, <16 x i8> %b) loca
; CHECK-NEXT: vabd.s8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> %a, <16 x i8> %b)
+ %0 = tail call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> %a, <16 x i8> %b, i32 0)
ret <16 x i8> %0
}
-declare <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8>, <16 x i8>, i32) #1
define arm_aapcs_vfpcc <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vabdq_u32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vabd.s32 q0, q0, q1
+; CHECK-NEXT: vabd.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> %a, <4 x i32> %b)
+ %0 = tail call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
ret <4 x i32> %0
}
-declare <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32>, <4 x i32>, i32) #1
define arm_aapcs_vfpcc <8 x half> @test_vabdq_f32(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vabdq_f32:
@@ -31,29 +31,29 @@ define arm_aapcs_vfpcc <8 x half> @test_vabdq_f32(<8 x half> %a, <8 x half> %b)
; CHECK-NEXT: vabd.f16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> %a, <8 x half> %b)
+ %0 = tail call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> %a, <8 x half> %b, i32 0)
ret <8 x half> %0
}
-declare <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half>, <8 x half>) #1
+declare <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half>, <8 x half>, i32) #1
define arm_aapcs_vfpcc <8 x i16> @test_vabdq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vabdq_m_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vabdt.s16 q0, q1, q2
+; CHECK-NEXT: vabdt.u16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+ %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
-declare <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
define arm_aapcs_vfpcc <16 x i8> @test_vabdq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vabdq_m_s8:
@@ -65,13 +65,13 @@ define arm_aapcs_vfpcc <16 x i8> @test_vabdq_m_s8(<16 x i8> %inactive, <16 x i8>
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+ %2 = tail call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
-declare <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
define arm_aapcs_vfpcc <4 x float> @test_vabdq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vabdq_m_f32:
@@ -83,25 +83,25 @@ define arm_aapcs_vfpcc <4 x float> @test_vabdq_m_f32(<4 x float> %inactive, <4 x
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive)
+ %2 = tail call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> %inactive)
ret <4 x float> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
-declare <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
+declare <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float>, <4 x float>, i32, <4 x i1>, <4 x float>) #1
define arm_aapcs_vfpcc <8 x i16> @test_vabdq_x_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vabdq_x_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vabdt.s16 q0, q0, q1
+; CHECK-NEXT: vabdt.u16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+ %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
@@ -110,16 +110,16 @@ define arm_aapcs_vfpcc <4 x i32> @test_vabdq_x_u32(<4 x i32> %a, <4 x i32> %b, i
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vabdt.s32 q0, q0, q1
+; CHECK-NEXT: vabdt.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+ %2 = tail call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
-declare <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
define arm_aapcs_vfpcc <8 x half> @test_vabdq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vabdq_x_f16:
@@ -131,9 +131,9 @@ define arm_aapcs_vfpcc <8 x half> @test_vabdq_x_f16(<8 x half> %a, <8 x half> %b
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef)
+ %2 = tail call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> undef)
ret <8 x half> %2
}
-declare <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #1
+declare <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half>, <8 x half>, i32, <8 x i1>, <8 x half>) #1
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
index 82ecc062c08..9319dd1c4c4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
@@ -4,14 +4,14 @@
define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vhaddq_u8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vhadd.s8 q0, q0, q1
+; CHECK-NEXT: vhadd.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> %a, <16 x i8> %b)
+ %0 = tail call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
ret <16 x i8> %0
}
-declare <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8>, <16 x i8>, i32) #1
define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vhaddq_s16:
@@ -19,23 +19,23 @@ define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_s16(<8 x i16> %a, <8 x i16> %b) lo
; CHECK-NEXT: vhadd.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> %a, <8 x i16> %b)
+ %0 = tail call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0)
ret <8 x i16> %0
}
-declare <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16>, <8 x i16>, i32) #1
define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vhaddq_u32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vhadd.s32 q0, q0, q1
+; CHECK-NEXT: vhadd.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %a, <4 x i32> %b)
+ %0 = tail call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
ret <4 x i32> %0
}
-declare <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32>, <4 x i32>, i32) #1
define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vhaddq_m_s8:
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_m_s8(<16 x i8> %inactive, <16 x i8
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+ %2 = tail call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
-declare <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vhaddq_m_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vhaddt.s16 q0, q1, q2
+; CHECK-NEXT: vhaddt.u16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+ %2 = tail call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
-declare <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vhaddq_m_s32:
@@ -83,25 +83,25 @@ define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_m_s32(<4 x i32> %inactive, <4 x i3
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+ %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
-declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vhaddq_x_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vhaddt.s8 q0, q0, q1
+; CHECK-NEXT: vhaddt.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+ %2 = tail call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef)
ret <16 x i8> %2
}
@@ -115,7 +115,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_x_s16(<8 x i16> %a, <8 x i16> %b,
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+ %2 = tail call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
@@ -124,12 +124,12 @@ define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_x_u32(<4 x i32> %a, <4 x i32> %b,
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vhaddt.s32 q0, q0, q1
+; CHECK-NEXT: vhaddt.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+ %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll
index 43ec0cfed4d..5bc8a22810a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll
@@ -4,14 +4,14 @@
define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vhsubq_u8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vhsub.s8 q0, q0, q1
+; CHECK-NEXT: vhsub.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> %a, <16 x i8> %b)
+ %0 = tail call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
ret <16 x i8> %0
}
-declare <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8>, <16 x i8>, i32) #1
define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vhsubq_s16:
@@ -19,23 +19,23 @@ define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_s16(<8 x i16> %a, <8 x i16> %b) lo
; CHECK-NEXT: vhsub.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> %a, <8 x i16> %b)
+ %0 = tail call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0)
ret <8 x i16> %0
}
-declare <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16>, <8 x i16>, i32) #1
define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vhsubq_u32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vhsub.s32 q0, q0, q1
+; CHECK-NEXT: vhsub.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %a, <4 x i32> %b)
+ %0 = tail call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
ret <4 x i32> %0
}
-declare <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32>, <4 x i32>, i32) #1
define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vhsubq_m_s8:
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_m_s8(<16 x i8> %inactive, <16 x i8
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+ %2 = tail call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
-declare <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vhsubq_m_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vhsubt.s16 q0, q1, q2
+; CHECK-NEXT: vhsubt.u16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+ %2 = tail call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
-declare <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vhsubq_m_s32:
@@ -83,10 +83,10 @@ define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_m_s32(<4 x i32> %inactive, <4 x i3
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+ %2 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
-declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll
index c1e45b87b45..a8dca7c77d1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll
@@ -35,13 +35,13 @@ define arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_m_f16(<8 x half> %inactive, <8 x
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
+ %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> %inactive)
ret <8 x half> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
-declare <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
+declare <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half>, <8 x half>, i32, <8 x i1>, <8 x half>) #2
define arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmaxnmq_m_f32:
@@ -53,13 +53,13 @@ define arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_m_f32(<4 x float> %inactive, <4
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive)
+ %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> %inactive)
ret <4 x float> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
-declare <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2
+declare <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float>, <4 x float>, i32, <4 x i1>, <4 x float>) #2
define arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmaxnmq_x_f16:
@@ -71,7 +71,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_x_f16(<8 x half> %a, <8 x half>
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef)
+ %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> undef)
ret <8 x half> %2
}
@@ -85,7 +85,7 @@ define arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_x_f32(<4 x float> %a, <4 x floa
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef)
+ %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> undef)
ret <4 x float> %2
}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll
index 622aa1ccd33..2036378b4b9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll
@@ -39,18 +39,18 @@ define arm_aapcs_vfpcc <16 x i8> @test_vmaxq_m_u8(<16 x i8> %inactive, <16 x i8>
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmaxt.s8 q0, q1, q2
+; CHECK-NEXT: vmaxt.u8 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+ %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
-declare <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
+declare <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #2
define arm_aapcs_vfpcc <8 x i16> @test_vmaxq_m_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmaxq_m_s16:
@@ -62,43 +62,43 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmaxq_m_s16(<8 x i16> %inactive, <8 x i16
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+ %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
-declare <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
+declare <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #2
define arm_aapcs_vfpcc <4 x i32> @test_vmaxq_m_u32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmaxq_m_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmaxt.s32 q0, q1, q2
+; CHECK-NEXT: vmaxt.u32 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+ %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
-declare <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+declare <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2
define arm_aapcs_vfpcc <16 x i8> @test_vmaxq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmaxq_x_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmaxt.s8 q0, q0, q1
+; CHECK-NEXT: vmaxt.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+ %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef)
ret <16 x i8> %2
}
@@ -112,7 +112,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmaxq_x_u16(<8 x i16> %a, <8 x i16> %b, i
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+ %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
@@ -121,12 +121,12 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmaxq_x_s32(<4 x i32> %a, <4 x i32> %b, i
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmaxt.s32 q0, q0, q1
+; CHECK-NEXT: vmaxt.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+ %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll
index 9bbc2a9c936..d4853f7a2be 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll
@@ -35,13 +35,13 @@ define arm_aapcs_vfpcc <8 x half> @test_vminnmq_m_f16(<8 x half> %inactive, <8 x
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
+ %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> %inactive)
ret <8 x half> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
-declare <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
+declare <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half>, <8 x half>, i32, <8 x i1>, <8 x half>) #2
define arm_aapcs_vfpcc <4 x float> @test_vminnmq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vminnmq_m_f32:
@@ -53,13 +53,13 @@ define arm_aapcs_vfpcc <4 x float> @test_vminnmq_m_f32(<4 x float> %inactive, <4
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive)
+ %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> %inactive)
ret <4 x float> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
-declare <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2
+declare <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float>, <4 x float>, i32, <4 x i1>, <4 x float>) #2
define arm_aapcs_vfpcc <8 x half> @test_vminnmq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vminnmq_x_f16:
@@ -71,7 +71,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vminnmq_x_f16(<8 x half> %a, <8 x half>
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef)
+ %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> undef)
ret <8 x half> %2
}
@@ -85,7 +85,7 @@ define arm_aapcs_vfpcc <4 x float> @test_vminnmq_x_f32(<4 x float> %a, <4 x floa
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef)
+ %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> undef)
ret <4 x float> %2
}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll
index 96bc2233f22..9c93a7d238d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll
@@ -44,31 +44,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vminq_m_s8(<16 x i8> %inactive, <16 x i8>
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+ %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
-declare <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
+declare <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #2
define arm_aapcs_vfpcc <8 x i16> @test_vminq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vminq_m_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmint.s16 q0, q1, q2
+; CHECK-NEXT: vmint.u16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+ %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
-declare <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
+declare <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #2
define arm_aapcs_vfpcc <4 x i32> @test_vminq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vminq_m_s32:
@@ -80,25 +80,25 @@ define arm_aapcs_vfpcc <4 x i32> @test_vminq_m_s32(<4 x i32> %inactive, <4 x i32
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+ %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
-declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2
define arm_aapcs_vfpcc <16 x i8> @test_vminq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vminq_x_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmint.s8 q0, q0, q1
+; CHECK-NEXT: vmint.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+ %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef)
ret <16 x i8> %2
}
@@ -112,7 +112,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_vminq_x_s16(<8 x i16> %a, <8 x i16> %b, i
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+ %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
@@ -126,7 +126,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vminq_x_s32(<4 x i32> %a, <4 x i32> %b, i
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+ %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll
index ba15af6c549..aae9a23978e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll
@@ -4,14 +4,14 @@
define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulhq_u8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmulh.s8 q0, q0, q1
+; CHECK-NEXT: vmulh.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> %a, <16 x i8> %b)
+ %0 = tail call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
ret <16 x i8> %0
}
-declare <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8>, <16 x i8>, i32) #1
define arm_aapcs_vfpcc <8 x i16> @test_vmulhq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulhq_s16:
@@ -19,23 +19,23 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmulhq_s16(<8 x i16> %a, <8 x i16> %b) lo
; CHECK-NEXT: vmulh.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
+ %0 = tail call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0)
ret <8 x i16> %0
}
-declare <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16>, <8 x i16>, i32) #1
define arm_aapcs_vfpcc <4 x i32> @test_vmulhq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulhq_u32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmulh.s32 q0, q0, q1
+; CHECK-NEXT: vmulh.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
+ %0 = tail call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
ret <4 x i32> %0
}
-declare <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32>, <4 x i32>, i32) #1
define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulhq_m_s8:
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_m_s8(<16 x i8> %inactive, <16 x i8
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+ %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
-declare <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
define arm_aapcs_vfpcc <8 x i16> @test_vmulhq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulhq_m_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmulht.s16 q0, q1, q2
+; CHECK-NEXT: vmulht.u16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+ %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
-declare <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
define arm_aapcs_vfpcc <4 x i32> @test_vmulhq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulhq_m_s32:
@@ -83,25 +83,25 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmulhq_m_s32(<4 x i32> %inactive, <4 x i3
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+ %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
-declare <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulhq_x_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmulht.s8 q0, q0, q1
+; CHECK-NEXT: vmulht.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+ %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef)
ret <16 x i8> %2
}
@@ -115,7 +115,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmulhq_x_s16(<8 x i16> %a, <8 x i16> %b,
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+ %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
@@ -124,12 +124,12 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmulhq_x_u32(<4 x i32> %a, <4 x i32> %b,
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmulht.s32 q0, q0, q1
+; CHECK-NEXT: vmulht.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+ %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll
index e443d54f020..43914690549 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll
@@ -4,14 +4,14 @@
define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_int_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmullbq_int_u8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmullb.s8 q0, q0, q1
+; CHECK-NEXT: vmullb.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> %a, <16 x i8> %b, i32 0)
+ %0 = tail call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1, i32 0)
ret <8 x i16> %0
}
-declare <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8>, <16 x i8>, i32) #1
+declare <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8>, <16 x i8>, i32, i32) #1
define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmullbq_int_s16:
@@ -19,24 +19,24 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_s16(<8 x i16> %a, <8 x i16> %
; CHECK-NEXT: vmullb.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0)
+ %0 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0, i32 0)
ret <4 x i32> %0
}
-declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32) #1
+declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32, i32) #1
define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmullbq_int_u32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmullb.s32 q2, q0, q1
+; CHECK-NEXT: vmullb.u32 q2, q0, q1
; CHECK-NEXT: vmov q0, q2
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
+ %0 = tail call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1, i32 0)
ret <2 x i64> %0
}
-declare <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32) #1
+declare <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32, i32) #1
define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_poly_p16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmullbq_poly_p16:
@@ -60,31 +60,31 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_int_m_s8(<8 x i16> %inactive, <16
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <8 x i16> %inactive)
+ %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, i32 0, <16 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
-declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, i32, <16 x i1>, <8 x i16>) #1
define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_m_u16(<4 x i32> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmullbq_int_m_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmullbt.s16 q0, q1, q2
+; CHECK-NEXT: vmullbt.u16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <4 x i32> %inactive)
+ %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, i32 0, <8 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
-declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, i32, <8 x i1>, <4 x i32>) #1
define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_m_s32(<2 x i64> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmullbq_int_m_s32:
@@ -96,13 +96,13 @@ define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_m_s32(<2 x i64> %inactive, <4
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <2 x i64> %inactive)
+ %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, i32 0, <4 x i1> %1, <2 x i64> %inactive)
ret <2 x i64> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
-declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <2 x i64>) #1
+declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, i32, <4 x i1>, <2 x i64>) #1
define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_poly_m_p8(<8 x i16> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmullbq_poly_m_p8:
@@ -125,12 +125,12 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_int_x_u8(<16 x i8> %a, <16 x i8>
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmullbt.s8 q0, q0, q1
+; CHECK-NEXT: vmullbt.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <8 x i16> undef)
+ %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, i32 0, <16 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
@@ -144,7 +144,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_x_s16(<8 x i16> %a, <8 x i16>
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <4 x i32> undef)
+ %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, i32 0, <8 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
@@ -153,13 +153,13 @@ define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_x_u32(<4 x i32> %a, <4 x i32>
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmullbt.s32 q2, q0, q1
+; CHECK-NEXT: vmullbt.u32 q2, q0, q1
; CHECK-NEXT: vmov q0, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <2 x i64> undef)
+ %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, i32 0, <4 x i1> %1, <2 x i64> undef)
ret <2 x i64> %2
}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll
index ca2aae6034a..39c1105f95f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll
@@ -4,14 +4,14 @@
define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_int_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulltq_int_u8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmullt.s8 q0, q0, q1
+; CHECK-NEXT: vmullt.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
+ %0 = tail call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1, i32 1)
ret <8 x i16> %0
}
-declare <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8>, <16 x i8>, i32) #1
+declare <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8>, <16 x i8>, i32, i32) #1
define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulltq_int_s16:
@@ -19,24 +19,24 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_s16(<8 x i16> %a, <8 x i16> %
; CHECK-NEXT: vmullt.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 1)
+ %0 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0, i32 1)
ret <4 x i32> %0
}
-declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32) #1
+declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32, i32) #1
define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulltq_int_u32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmullt.s32 q2, q0, q1
+; CHECK-NEXT: vmullt.u32 q2, q0, q1
; CHECK-NEXT: vmov q0, q2
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
+ %0 = tail call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1, i32 1)
ret <2 x i64> %0
}
-declare <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32) #1
+declare <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32, i32) #1
define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_poly_p16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulltq_poly_p16:
@@ -60,31 +60,31 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_int_m_s8(<8 x i16> %inactive, <16
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <8 x i16> %inactive)
+ %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, i32 1, <16 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
-declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, i32, <16 x i1>, <8 x i16>) #1
define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_m_u16(<4 x i32> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulltq_int_m_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmulltt.s16 q0, q1, q2
+; CHECK-NEXT: vmulltt.u16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <4 x i32> %inactive)
+ %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, i32 1, <8 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
-declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, i32, <8 x i1>, <4 x i32>) #1
define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_m_s32(<2 x i64> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulltq_int_m_s32:
@@ -96,13 +96,13 @@ define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_m_s32(<2 x i64> %inactive, <4
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <2 x i64> %inactive)
+ %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, i32 1, <4 x i1> %1, <2 x i64> %inactive)
ret <2 x i64> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
-declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <2 x i64>) #1
+declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, i32, <4 x i1>, <2 x i64>) #1
define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_poly_m_p8(<8 x i16> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulltq_poly_m_p8:
@@ -125,12 +125,12 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_int_x_u8(<16 x i8> %a, <16 x i8>
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmulltt.s8 q0, q0, q1
+; CHECK-NEXT: vmulltt.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <8 x i16> undef)
+ %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, i32 1, <16 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
@@ -144,7 +144,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_x_s16(<8 x i16> %a, <8 x i16>
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <4 x i32> undef)
+ %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, i32 1, <8 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
@@ -153,13 +153,13 @@ define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_x_u32(<4 x i32> %a, <4 x i32>
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmulltt.s32 q2, q0, q1
+; CHECK-NEXT: vmulltt.u32 q2, q0, q1
; CHECK-NEXT: vmov q0, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <2 x i64> undef)
+ %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, i32 1, <4 x i1> %1, <2 x i64> undef)
ret <2 x i64> %2
}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll
index 304a6e83aa2..b05fded2b30 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vqaddq_m_s8(<16 x i8> %inactive, <16 x i8
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+ %2 = tail call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
-declare <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
+declare <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #2
define arm_aapcs_vfpcc <8 x i16> @test_vqaddq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vqaddq_m_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vqaddt.s16 q0, q1, q2
+; CHECK-NEXT: vqaddt.u16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+ %2 = tail call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
-declare <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
+declare <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #2
define arm_aapcs_vfpcc <4 x i32> @test_vqaddq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vqaddq_m_s32:
@@ -83,10 +83,10 @@ define arm_aapcs_vfpcc <4 x i32> @test_vqaddq_m_s32(<4 x i32> %inactive, <4 x i3
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+ %2 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
-declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll
index 81b3c1c73e9..37f251c83a3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vqsubq_m_s8(<16 x i8> %inactive, <16 x i8
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+ %2 = tail call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
-declare <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
+declare <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #2
define arm_aapcs_vfpcc <8 x i16> @test_vqsubq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vqsubq_m_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vqsubt.s16 q0, q1, q2
+; CHECK-NEXT: vqsubt.u16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+ %2 = tail call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
-declare <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
+declare <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #2
define arm_aapcs_vfpcc <4 x i32> @test_vqsubq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vqsubq_m_s32:
@@ -83,10 +83,10 @@ define arm_aapcs_vfpcc <4 x i32> @test_vqsubq_m_s32(<4 x i32> %inactive, <4 x i3
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+ %2 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
-declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll
index 15b23d5ed27..d7e39548e79 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll
@@ -4,14 +4,14 @@
define arm_aapcs_vfpcc <16 x i8> @test_vrhaddq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrhaddq_u8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrhadd.s8 q0, q0, q1
+; CHECK-NEXT: vrhadd.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8> %a, <16 x i8> %b)
+ %0 = tail call <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
ret <16 x i8> %0
}
-declare <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8>, <16 x i8>, i32) #1
define arm_aapcs_vfpcc <8 x i16> @test_vrhaddq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrhaddq_s16:
@@ -19,23 +19,23 @@ define arm_aapcs_vfpcc <8 x i16> @test_vrhaddq_s16(<8 x i16> %a, <8 x i16> %b) l
; CHECK-NEXT: vrhadd.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16> %a, <8 x i16> %b)
+ %0 = tail call <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0)
ret <8 x i16> %0
}
-declare <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16>, <8 x i16>, i32) #1
define arm_aapcs_vfpcc <4 x i32> @test_vrhaddq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrhaddq_u32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrhadd.s32 q0, q0, q1
+; CHECK-NEXT: vrhadd.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32> %a, <4 x i32> %b)
+ %0 = tail call <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
ret <4 x i32> %0
}
-declare <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32>, <4 x i32>, i32) #1
define arm_aapcs_vfpcc <16 x i8> @test_vrhaddq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrhaddq_m_s8:
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vrhaddq_m_s8(<16 x i8> %inactive, <16 x i
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+ %2 = tail call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
-declare <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
define arm_aapcs_vfpcc <8 x i16> @test_vrhaddq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrhaddq_m_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vrhaddt.s16 q0, q1, q2
+; CHECK-NEXT: vrhaddt.u16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+ %2 = tail call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
-declare <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
define arm_aapcs_vfpcc <4 x i32> @test_vrhaddq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrhaddq_m_s32:
@@ -83,25 +83,25 @@ define arm_aapcs_vfpcc <4 x i32> @test_vrhaddq_m_s32(<4 x i32> %inactive, <4 x i
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+ %2 = tail call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
-declare <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
define arm_aapcs_vfpcc <16 x i8> @test_vrhaddq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrhaddq_x_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vrhaddt.s8 q0, q0, q1
+; CHECK-NEXT: vrhaddt.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+ %2 = tail call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef)
ret <16 x i8> %2
}
@@ -110,12 +110,12 @@ define arm_aapcs_vfpcc <8 x i16> @test_vrhaddq_x_u16(<8 x i16> %a, <8 x i16> %b,
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vrhaddt.s16 q0, q0, q1
+; CHECK-NEXT: vrhaddt.u16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+ %2 = tail call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
@@ -124,12 +124,12 @@ define arm_aapcs_vfpcc <4 x i32> @test_vrhaddq_x_u32(<4 x i32> %a, <4 x i32> %b,
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vrhaddt.s32 q0, q0, q1
+; CHECK-NEXT: vrhaddt.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+ %2 = tail call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll
index 9418f581eb4..e8ff21452f3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll
@@ -4,14 +4,14 @@
define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrmulhq_u8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrmulh.s8 q0, q0, q1
+; CHECK-NEXT: vrmulh.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> %a, <16 x i8> %b)
+ %0 = tail call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
ret <16 x i8> %0
}
-declare <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8>, <16 x i8>, i32) #1
define arm_aapcs_vfpcc <8 x i16> @test_vrmulhq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrmulhq_s16:
@@ -19,23 +19,23 @@ define arm_aapcs_vfpcc <8 x i16> @test_vrmulhq_s16(<8 x i16> %a, <8 x i16> %b) l
; CHECK-NEXT: vrmulh.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
+ %0 = tail call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0)
ret <8 x i16> %0
}
-declare <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16>, <8 x i16>, i32) #1
define arm_aapcs_vfpcc <4 x i32> @test_vrmulhq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrmulhq_u32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrmulh.s32 q0, q0, q1
+; CHECK-NEXT: vrmulh.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
- %0 = tail call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
+ %0 = tail call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
ret <4 x i32> %0
}
-declare <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32>, <4 x i32>, i32) #1
define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrmulhq_m_s8:
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_m_s8(<16 x i8> %inactive, <16 x i
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+ %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
-declare <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
define arm_aapcs_vfpcc <8 x i16> @test_vrmulhq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrmulhq_m_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vrmulht.s16 q0, q1, q2
+; CHECK-NEXT: vrmulht.u16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+ %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
-declare <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
define arm_aapcs_vfpcc <4 x i32> @test_vrmulhq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrmulhq_m_s32:
@@ -83,25 +83,25 @@ define arm_aapcs_vfpcc <4 x i32> @test_vrmulhq_m_s32(<4 x i32> %inactive, <4 x i
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+ %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
-declare <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
; CHECK-LABEL: test_vrmulhq_x_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vrmulht.s8 q0, q0, q1
+; CHECK-NEXT: vrmulht.u8 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
- %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+ %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef)
ret <16 x i8> %2
}
@@ -115,7 +115,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_vrmulhq_x_s16(<8 x i16> %a, <8 x i16> %b,
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
- %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+ %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
@@ -124,12 +124,12 @@ define arm_aapcs_vfpcc <4 x i32> @test_vrmulhq_m_u32(<4 x i32> %a, <4 x i32> %b,
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vrmulht.s32 q0, q0, q1
+; CHECK-NEXT: vrmulht.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+ %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
OpenPOWER on IntegriCloud