31 files changed, 432 insertions, 401 deletions
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 061759ad91d..e81b75dfe31 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -63,9 +63,9 @@ def vqsubq_u: Intrinsic<Vector, (args Vector:$a, Vector:$b),
 let params = T.Int in {
 def vaddq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (add $a, $b)>;
 def vhaddq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
-                              (IRInt<"vhadd", [Vector]> $a, $b)>;
+                              (IRInt<"vhadd", [Vector]> $a, $b, (unsignedflag Scalar))>;
 def vrhaddq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
-                               (IRInt<"vrhadd", [Vector]> $a, $b)>;
+                               (IRInt<"vrhadd", [Vector]> $a, $b, (unsignedflag Scalar))>;
 def vandq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (and $a, $b)>;
 def vbicq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (and $a, (not $b))>;
 def veorq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (xor $a, $b)>;
@@ -73,18 +73,18 @@ def vornq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (or $a, (not $b))>;
 def vorrq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (or $a, $b)>;
 def vsubq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (sub $a, $b)>;
 def vhsubq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
-                              (IRInt<"vhsub", [Vector]> $a, $b)>;
+                              (IRInt<"vhsub", [Vector]> $a, $b, (unsignedflag Scalar))>;
 def vmulq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (mul $a, $b)>;
 def vmulhq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
-                              (IRInt<"vmulh", [Vector]> $a, $b)>;
+                              (IRInt<"vmulh", [Vector]> $a, $b, (unsignedflag Scalar))>;
 def vrmulhq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
-                               (IRInt<"vrmulh", [Vector]> $a, $b)>;
+                               (IRInt<"vrmulh", [Vector]> $a, $b, (unsignedflag Scalar))>;
 def vmullbq_int: Intrinsic<DblVector, (args Vector:$a, Vector:$b),
                                       (IRInt<"vmull", [DblVector, Vector]>
-                                       $a, $b, 0)>;
+                                       $a, $b, (unsignedflag Scalar), 0)>;
 def vmulltq_int: Intrinsic<DblVector, (args Vector:$a, Vector:$b),
                                       (IRInt<"vmull", [DblVector, Vector]>
-                                       $a, $b, 1)>;
+                                       $a, $b, (unsignedflag Scalar), 1)>;
 }
 let params = T.Signed in {
 def vqdmulhq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
@@ -134,15 +134,15 @@ def "": Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b,
 // Plain intrinsics
 let params = T.Usual in {
 def vabdq: Intrinsic<Vector, (args Vector:$a, Vector:$b),
-                             (IRInt<"vabd", [Vector]> $a, $b)>;
+                     (IRInt<"vabd", [Vector]> $a, $b, (unsignedflag Scalar))>;
 }
 
-multiclass VectorVectorArithmetic<string operation, int wantXVariant = 1> {
-  defm "" : IntrinsicMX<Vector, (args Vector:$a, Vector:$b,
-                                 Predicate:$pred),
-                                (IRInt<operation, [Vector, Predicate]> $a, $b,
-                                 $pred, $inactive),
-                                wantXVariant>;
+multiclass VectorVectorArithmetic<string operation, dag extraArgs = (?),
+                                  int wantXVariant = 1> {
+  defm "" : IntrinsicMX<
+      Vector, (args Vector:$a, Vector:$b, Predicate:$pred),
+      !con((IRInt<operation, [Vector, Predicate]> $a, $b),
+           extraArgs, (? $pred, $inactive)), wantXVariant>;
 }
 
 multiclass VectorVectorArithmeticBitcast<string operation> {
@@ -157,7 +157,7 @@ multiclass VectorVectorArithmeticBitcast<string operation> {
 
 // Predicated intrinsics
 let params = T.Usual in {
-  defm vabdq : VectorVectorArithmetic<"abd_predicated">;
+  defm vabdq : VectorVectorArithmetic<"abd_predicated", (? (unsignedflag Scalar))>;
   defm vaddq : VectorVectorArithmetic<"add_predicated">;
   defm vsubq : VectorVectorArithmetic<"sub_predicated">;
   defm vmulq : VectorVectorArithmetic<"mul_predicated">;
@@ -168,42 +168,41 @@ let params = T.Usual in {
   defm vorrq : VectorVectorArithmeticBitcast<"orr_predicated">;
 }
 
-multiclass DblVectorVectorArithmetic<string operation, dag top> {
-  defm "" : IntrinsicMX<DblVector, (args Vector:$a, Vector:$b,
-                                    Predicate:$pred),
-                                   (IRInt<operation,
-                                          [DblVector, Vector, Predicate]>
-                                    $a, $b, top, $pred, $inactive)>;
+multiclass DblVectorVectorArithmetic<string operation, dag extraArgs = (?)> {
+  defm "" : IntrinsicMX<
+      DblVector, (args Vector:$a, Vector:$b, Predicate:$pred),
+      !con((IRInt<operation, [DblVector, Vector, Predicate]> $a, $b),
+           extraArgs, (? $pred, $inactive))>;
 }
 
 // Predicated intrinsics - Int types only
 let params = T.Int in {
-  defm vminq : VectorVectorArithmetic<"min_predicated">;
-  defm vmaxq : VectorVectorArithmetic<"max_predicated">;
-  defm vmulhq : VectorVectorArithmetic<"mulh_predicated">;
-  defm vrmulhq : VectorVectorArithmetic<"rmulh_predicated">;
-  defm vqaddq : VectorVectorArithmetic<"qadd_predicated", 0>;
-  defm vhaddq : VectorVectorArithmetic<"hadd_predicated">;
-  defm vrhaddq : VectorVectorArithmetic<"rhadd_predicated">;
-  defm vqsubq : VectorVectorArithmetic<"qsub_predicated", 0>;
-  defm vhsubq : VectorVectorArithmetic<"hsub_predicated">;
-  defm vmullbq_int : DblVectorVectorArithmetic<"mull_int_predicated", (u32 0)>;
-  defm vmulltq_int : DblVectorVectorArithmetic<"mull_int_predicated", (u32 1)>;
+  defm vminq : VectorVectorArithmetic<"min_predicated", (? (unsignedflag Scalar))>;
+  defm vmaxq : VectorVectorArithmetic<"max_predicated", (? (unsignedflag Scalar))>;
+  defm vmulhq : VectorVectorArithmetic<"mulh_predicated", (? (unsignedflag Scalar))>;
+  defm vrmulhq : VectorVectorArithmetic<"rmulh_predicated", (? (unsignedflag Scalar))>;
+  defm vqaddq : VectorVectorArithmetic<"qadd_predicated", (? (unsignedflag Scalar)), 0>;
+  defm vhaddq : VectorVectorArithmetic<"hadd_predicated", (? (unsignedflag Scalar))>;
+  defm vrhaddq : VectorVectorArithmetic<"rhadd_predicated", (? (unsignedflag Scalar))>;
+  defm vqsubq : VectorVectorArithmetic<"qsub_predicated", (? (unsignedflag Scalar)), 0>;
+  defm vhsubq : VectorVectorArithmetic<"hsub_predicated", (? (unsignedflag Scalar))>;
+  defm vmullbq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 0))>;
+  defm vmulltq_int : DblVectorVectorArithmetic<"mull_int_predicated", (? (unsignedflag Scalar), (u32 1))>;
 }
 let params = T.Signed in {
-  defm vqdmulhq : VectorVectorArithmetic<"qdmulh_predicated", 0>;
-  defm vqrdmulhq : VectorVectorArithmetic<"qrdmulh_predicated", 0>;
+  defm vqdmulhq : VectorVectorArithmetic<"qdmulh_predicated", (?), 0>;
+  defm vqrdmulhq : VectorVectorArithmetic<"qrdmulh_predicated", (?), 0>;
 }
 
 let params = T.Poly, overrideKindLetter = "p" in {
-  defm vmullbq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (u32 0)>;
-  defm vmulltq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (u32 1)>;
+  defm vmullbq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (? (u32 0))>;
+  defm vmulltq_poly : DblVectorVectorArithmetic<"mull_poly_predicated", (? (u32 1))>;
 }
 
 // Predicated intrinsics - Float types only
 let params = T.Float in {
-  defm vminnmq : VectorVectorArithmetic<"min_predicated">;
-  defm vmaxnmq : VectorVectorArithmetic<"max_predicated">;
+  defm vminnmq : VectorVectorArithmetic<"min_predicated", (? (u32 0))>;
+  defm vmaxnmq : VectorVectorArithmetic<"max_predicated", (? (u32 0))>;
 }
 
 let params = T.Int in {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c b/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c
index fb8a5fb5e56..d972fb78b50 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c
@@ -6,7 +6,7 @@
 
 // CHECK-LABEL: @test_vabdq_s8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0)
 // CHECK-NEXT:    ret <16 x i8> [[TMP0]]
 //
 int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b)
@@ -20,7 +20,7 @@ int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b)
 
 // CHECK-LABEL: @test_vabdq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
 // CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 //
 uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b)
@@ -34,7 +34,7 @@ uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b)
 
 // CHECK-LABEL: @test_vabdq_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0)
 // CHECK-NEXT:    ret <8 x half> [[TMP0]]
 //
 float16x8_t test_vabdq_f32(float16x8_t a, float16x8_t b)
@@ -50,7 +50,7 @@ float16x8_t test_vabdq_f32(float16x8_t a, float16x8_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vabdq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ uint16x8_t test_vabdq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vabdq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ int8x16_t test_vabdq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x float> [[TMP2]]
 //
 float32x4_t test_vabdq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p)
@@ -98,7 +98,7 @@ float32x4_t test_vabdq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b,
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> undef)
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vabdq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -114,7 +114,7 @@ uint16x8_t test_vabdq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 uint32x4_t test_vabdq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
@@ -130,7 +130,7 @@ uint32x4_t test_vabdq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> undef)
 // CHECK-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vabdq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
index a31e6ae4ffa..1d97ea80520 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
@@ -6,7 +6,7 @@
 
 // CHECK-LABEL: @test_vhaddq_u8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1)
 // CHECK-NEXT:    ret <16 x i8> [[TMP0]]
 //
 uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b)
 
 // CHECK-LABEL: @test_vhaddq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0)
 // CHECK-NEXT:    ret <8 x i16> [[TMP0]]
 //
 int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b)
 
 // CHECK-LABEL: @test_vhaddq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
 // CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 //
 uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b)
@@ -50,7 +50,7 @@ uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vhaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vhaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pre
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vhaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vhaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mv
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vhaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -98,7 +98,7 @@ int32x4_t test_vhaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pr
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 uint8x16_t test_vhaddq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -114,7 +114,7 @@ uint8x16_t test_vhaddq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef)
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vhaddq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -130,7 +130,7 @@ int16x8_t test_vhaddq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 uint32x4_t test_vhaddq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c
index 34fd8c93a9a..633fd32d947 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c
@@ -6,7 +6,7 @@
 
 // CHECK-LABEL: @test_vhsubq_u8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1)
 // CHECK-NEXT:    ret <16 x i8> [[TMP0]]
 //
 uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b)
 
 // CHECK-LABEL: @test_vhsubq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0)
 // CHECK-NEXT:    ret <8 x i16> [[TMP0]]
 //
 int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b)
 
 // CHECK-LABEL: @test_vhsubq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
 // CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 //
 uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b)
@@ -50,7 +50,7 @@ uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vhsubq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vhsubq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pre
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vhsubq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vhsubq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mv
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vhsubq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c
index 3f44b654c48..539508127ca 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c
@@ -36,7 +36,7 @@ float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vmaxnmq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p)
@@ -52,7 +52,7 @@ float16x8_t test_vmaxnmq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x float> [[TMP2]]
 //
 float32x4_t test_vmaxnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p)
@@ -68,7 +68,7 @@ float32x4_t test_vmaxnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> undef)
 // CHECK-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vmaxnmq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
@@ -84,7 +84,7 @@ float16x8_t test_vmaxnmq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> undef)
 // CHECK-NEXT:    ret <4 x float> [[TMP2]]
 //
 float32x4_t test_vmaxnmq_x_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
index 7b766fb0d25..93fb24d339b 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
@@ -53,7 +53,7 @@ int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 uint8x16_t test_vmaxq_m_u8(uint8x16_t inactive, uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -69,7 +69,7 @@ uint8x16_t test_vmaxq_m_u8(uint8x16_t inactive, uint8x16_t a, uint8x16_t b, mve_
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vmaxq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -85,7 +85,7 @@ int16x8_t test_vmaxq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pre
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 uint32x4_t test_vmaxq_m_u32(uint32x4_t inactive, uint32x4_t a, uint32x4_t b, mve_pred16_t p)
@@ -101,15 +101,15 @@ uint32x4_t test_vmaxq_m_u32(uint32x4_t inactive, uint32x4_t a, uint32x4_t b, mve
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
-int8x16_t test_vmaxq_x_u8(int8x16_t a, int8x16_t b, mve_pred16_t p)
+uint8x16_t test_vmaxq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
 {
 #ifdef POLYMORPHIC
     return vmaxq_x(a, b, p);
 #else /* POLYMORPHIC */
-    return vmaxq_x_s8(a, b, p);
+    return vmaxq_x_u8(a, b, p);
 #endif /* POLYMORPHIC */
 }
 
@@ -117,7 +117,7 @@ int8x16_t test_vmaxq_x_u8(int8x16_t a, int8x16_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> undef)
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vmaxq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -133,7 +133,7 @@ uint16x8_t test_vmaxq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> undef)
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vmaxq_x_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -141,6 +141,6 @@ int32x4_t test_vmaxq_x_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
 #ifdef POLYMORPHIC
     return vmaxq_x(a, b, p);
 #else /* POLYMORPHIC */
-    return vmaxq_x_u32(a, b, p);
+    return vmaxq_x_s32(a, b, p);
 #endif /* POLYMORPHIC */
 }
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c
index ed6b31dfb0f..f951afe61b8 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c
@@ -36,7 +36,7 @@ float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vminnmq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p)
@@ -52,7 +52,7 @@ float16x8_t test_vminnmq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x float> [[TMP2]]
 //
 float32x4_t test_vminnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p)
@@ -68,7 +68,7 @@ float32x4_t test_vminnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x half> undef)
 // CHECK-NEXT:    ret <8 x half> [[TMP2]]
 //
 float16x8_t test_vminnmq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
@@ -84,7 +84,7 @@ float16x8_t test_vminnmq_x_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x float> undef)
 // CHECK-NEXT:    ret <4 x float> [[TMP2]]
 //
 float32x4_t test_vminnmq_x_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c
index b6981104d8b..105c5d6a361 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vminq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c
@@ -53,7 +53,7 @@ uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vminq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -69,7 +69,7 @@ int8x16_t test_vminq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vminq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -85,7 +85,7 @@ uint16x8_t test_vminq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vminq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -101,7 +101,7 @@ int32x4_t test_vminq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pre
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 uint8x16_t test_vminq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -117,7 +117,7 @@ uint8x16_t test_vminq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef)
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vminq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -125,7 +125,7 @@ int16x8_t test_vminq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
 #ifdef POLYMORPHIC
     return vminq_x(a, b, p);
 #else /* POLYMORPHIC */
-    return vminq_x_u16(a, b, p);
+    return vminq_x_s16(a, b, p);
 #endif /* POLYMORPHIC */
 }
 
@@ -133,7 +133,7 @@ int16x8_t test_vminq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 uint32x4_t test_vminq_x_s32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c
index 3a6b89504dd..816a0df0111 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c
@@ -6,7 +6,7 @@
 
 // CHECK-LABEL: @test_vmulhq_u8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1)
 // CHECK-NEXT:    ret <16 x i8> [[TMP0]]
 //
 uint8x16_t test_vmulhq_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ uint8x16_t test_vmulhq_u8(uint8x16_t a, uint8x16_t b)
 
 // CHECK-LABEL: @test_vmulhq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0)
 // CHECK-NEXT:    ret <8 x i16> [[TMP0]]
 //
 int16x8_t test_vmulhq_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int16x8_t test_vmulhq_s16(int16x8_t a, int16x8_t b)
 
 // CHECK-LABEL: @test_vmulhq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
 // CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 //
 uint32x4_t test_vmulhq_u32(uint32x4_t a, uint32x4_t b)
@@ -50,7 +50,7 @@ uint32x4_t test_vmulhq_u32(uint32x4_t a, uint32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vmulhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vmulhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pre
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vmulhq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vmulhq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mv
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -98,7 +98,7 @@ int32x4_t test_vmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pr
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 uint8x16_t test_vmulhq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -114,7 +114,7 @@ uint8x16_t test_vmulhq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef)
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vmulhq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -130,7 +130,7 @@ int16x8_t test_vmulhq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 uint32x4_t test_vmulhq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c
index 008a368912c..757fb03b0bc 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c
@@ -6,7 +6,7 @@
 
 // CHECK-LABEL: @test_vmullbq_int_u8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 0)
 // CHECK-NEXT:    ret <8 x i16> [[TMP0]]
 //
 int16x8_t test_vmullbq_int_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ int16x8_t test_vmullbq_int_u8(uint8x16_t a, uint8x16_t b)
 
 // CHECK-LABEL: @test_vmullbq_int_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 0)
 // CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 //
 int32x4_t test_vmullbq_int_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int32x4_t test_vmullbq_int_s16(int16x8_t a, int16x8_t b)
 
 // CHECK-LABEL: @test_vmullbq_int_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 0)
 // CHECK-NEXT:    ret <2 x i64> [[TMP0]]
 //
 uint64x2_t test_vmullbq_int_u32(uint32x4_t a, uint32x4_t b)
@@ -64,7 +64,7 @@ uint32x4_t test_vmullbq_poly_p16(uint16x8_t a, uint16x8_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 0, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vmullbq_int_m_s8(int16x8_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -80,7 +80,7 @@ int16x8_t test_vmullbq_int_m_s8(int16x8_t inactive, int8x16_t a, int8x16_t b, mv
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 0, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 uint32x4_t test_vmullbq_int_m_u16(uint32x4_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -96,7 +96,7 @@ uint32x4_t test_vmullbq_int_m_u16(uint32x4_t inactive, uint16x8_t a, uint16x8_t
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 0, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <2 x i64> [[TMP2]]
 //
 int64x2_t test_vmullbq_int_m_s32(int64x2_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -128,7 +128,7 @@ uint16x8_t test_vmullbq_poly_m_p8(uint16x8_t inactive, uint8x16_t a, uint8x16_t
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 0, <16 x i1> [[TMP1]], <8 x i16> undef)
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vmullbq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -136,7 +136,7 @@ uint16x8_t test_vmullbq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
 #ifdef POLYMORPHIC
     return vmullbq_int_x(a, b, p);
 #else /* POLYMORPHIC */
-    return vmullbq_int_x_s8(a, b, p);
+    return vmullbq_int_x_u8(a, b, p);
 #endif /* POLYMORPHIC */
 }
 
@@ -144,7 +144,7 @@ uint16x8_t test_vmullbq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 0, <8 x i1> [[TMP1]], <4 x i32> undef)
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vmullbq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -152,7 +152,7 @@ int32x4_t test_vmullbq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
 #ifdef POLYMORPHIC
     return vmullbq_int_x(a, b, p);
 #else /* POLYMORPHIC */
-    return vmullbq_int_x_u16(a, b, p);
+    return vmullbq_int_x_s16(a, b, p);
 #endif /* POLYMORPHIC */
 }
 
@@ -160,7 +160,7 @@ int32x4_t test_vmullbq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <2 x i64> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 0, <4 x i1> [[TMP1]], <2 x i64> undef)
 // CHECK-NEXT:    ret <2 x i64> [[TMP2]]
 //
 uint64x2_t test_vmullbq_int_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
@@ -168,7 +168,7 @@ uint64x2_t test_vmullbq_int_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
 #ifdef POLYMORPHIC
     return vmullbq_int_x(a, b, p);
 #else /* POLYMORPHIC */
-    return vmullbq_int_x_s32(a, b, p);
+    return vmullbq_int_x_u32(a, b, p);
 #endif /* POLYMORPHIC */
 }
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c
index 7baa9944ba1..1cd982e31a4 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c
@@ -6,7 +6,7 @@
 
 // CHECK-LABEL: @test_vmulltq_int_u8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1)
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 1)
 // CHECK-NEXT:    ret <8 x i16> [[TMP0]]
 //
 int16x8_t test_vmulltq_int_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ int16x8_t test_vmulltq_int_u8(uint8x16_t a, uint8x16_t b)
 
 // CHECK-LABEL: @test_vmulltq_int_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1)
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 1)
 // CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 //
 int32x4_t test_vmulltq_int_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int32x4_t test_vmulltq_int_s16(int16x8_t a, int16x8_t b)
 
 // CHECK-LABEL: @test_vmulltq_int_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
+// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 1)
 // CHECK-NEXT:    ret <2 x i64> [[TMP0]]
 //
 uint64x2_t test_vmulltq_int_u32(uint32x4_t a, uint32x4_t b)
@@ -64,7 +64,7 @@ uint32x4_t test_vmulltq_poly_p16(uint16x8_t a, uint16x8_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 1, <16 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vmulltq_int_m_s8(int16x8_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -80,7 +80,7 @@ int16x8_t test_vmulltq_int_m_s8(int16x8_t inactive, int8x16_t a, int8x16_t b, mv
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 1, <8 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 uint32x4_t test_vmulltq_int_m_u16(uint32x4_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -96,7 +96,7 @@ uint32x4_t test_vmulltq_int_m_u16(uint32x4_t inactive, uint16x8_t a, uint16x8_t
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 1, <4 x i1> [[TMP1]], <2 x i64> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <2 x i64> [[TMP2]]
 //
 int64x2_t test_vmulltq_int_m_s32(int64x2_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -128,7 +128,7 @@ uint16x8_t test_vmulltq_poly_m_p8(uint16x8_t inactive, uint8x16_t a, uint8x16_t
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 1, <16 x i1> [[TMP1]], <8 x i16> undef)
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vmulltq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -144,7 +144,7 @@ uint16x8_t test_vmulltq_int_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 1, <8 x i1> [[TMP1]], <4 x i32> undef)
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vmulltq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -160,7 +160,7 @@ int32x4_t test_vmulltq_int_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <2 x i64> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 1, <4 x i1> [[TMP1]], <2 x i64> undef)
 // CHECK-NEXT:    ret <2 x i64> [[TMP2]]
 //
 uint64x2_t test_vmulltq_int_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c
index 6d5df3d4d6e..8361ab3618e 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c
@@ -50,7 +50,7 @@ uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vqaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vqaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pre
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vqaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vqaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mv
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vqaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c
index b51f01041af..c8924bf9128 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c
@@ -50,7 +50,7 @@ uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vqsubq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vqsubq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pre
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vqsubq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vqsubq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mv
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vqsubq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c
index eb34fec9c13..eb28d239741 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c
@@ -6,7 +6,7 @@
 
 // CHECK-LABEL: @test_vrhaddq_u8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1)
 // CHECK-NEXT:    ret <16 x i8> [[TMP0]]
 //
 uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b)
 
 // CHECK-LABEL: @test_vrhaddq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0)
 // CHECK-NEXT:    ret <8 x i16> [[TMP0]]
 //
 int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b)
 
 // CHECK-LABEL: @test_vrhaddq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
 // CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 //
 uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b)
@@ -50,7 +50,7 @@ uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vrhaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vrhaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pr
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vrhaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vrhaddq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, m
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vrhaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -98,7 +98,7 @@ int32x4_t test_vrhaddq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_p
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 uint8x16_t test_vrhaddq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -114,10 +114,10 @@ uint8x16_t test_vrhaddq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> undef)
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
-int16x8_t test_vrhaddq_x_u16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+uint16x8_t test_vrhaddq_x_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
 {
 #ifdef POLYMORPHIC
     return vrhaddq_x(a, b, p);
@@ -130,7 +130,7 @@ int16x8_t test_vrhaddq_x_u16(int16x8_t a, int16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 uint32x4_t test_vrhaddq_x_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c
index 3bf77f0d16c..4f65f002b01 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c
@@ -6,7 +6,7 @@
 
 // CHECK-LABEL: @test_vrmulhq_u8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1)
 // CHECK-NEXT:    ret <16 x i8> [[TMP0]]
 //
 uint8x16_t test_vrmulhq_u8(uint8x16_t a, uint8x16_t b)
@@ -20,7 +20,7 @@ uint8x16_t test_vrmulhq_u8(uint8x16_t a, uint8x16_t b)
 
 // CHECK-LABEL: @test_vrmulhq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0)
 // CHECK-NEXT:    ret <8 x i16> [[TMP0]]
 //
 int16x8_t test_vrmulhq_s16(int16x8_t a, int16x8_t b)
@@ -34,7 +34,7 @@ int16x8_t test_vrmulhq_s16(int16x8_t a, int16x8_t b)
 
 // CHECK-LABEL: @test_vrmulhq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
 // CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 //
 uint32x4_t test_vrmulhq_u32(uint32x4_t a, uint32x4_t b)
@@ -50,7 +50,7 @@ uint32x4_t test_vrmulhq_u32(uint32x4_t a, uint32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 int8x16_t test_vrmulhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
@@ -66,7 +66,7 @@ int8x16_t test_vrmulhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pr
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 uint16x8_t test_vrmulhq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
@@ -82,7 +82,7 @@ uint16x8_t test_vrmulhq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, m
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vrmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
@@ -98,7 +98,7 @@ int32x4_t test_vrmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_p
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> undef)
 // CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 //
 uint8x16_t test_vrmulhq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
@@ -114,7 +114,7 @@ uint8x16_t test_vrmulhq_x_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]], <8 x i16> undef)
 // CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vrmulhq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
@@ -130,7 +130,7 @@ int16x8_t test_vrmulhq_x_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> undef)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i32> undef)
 // CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 uint32x4_t test_vrmulhq_m_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index f76ba563512..50329ecc0e6 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -802,14 +802,16 @@ multiclass IntrinsicSignSuffix<list<LLVMType> rets, list<LLVMType> params = [],
 }
 
 def int_arm_mve_min_predicated: Intrinsic<[llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+    llvm_anyvector_ty, LLVMMatchType<0>],
    [IntrNoMem]>;
 def int_arm_mve_max_predicated: Intrinsic<[llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+    llvm_anyvector_ty, LLVMMatchType<0>],
    [IntrNoMem]>;
 def int_arm_mve_abd_predicated: Intrinsic<[llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
-   [IntrNoMem]>;
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+    llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
 def int_arm_mve_add_predicated: Intrinsic<[llvm_anyvector_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
    [IntrNoMem]>;
@@ -835,40 +837,42 @@ def int_arm_mve_mul_predicated: Intrinsic<[llvm_anyvector_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
    [IntrNoMem]>;
 def int_arm_mve_mulh_predicated: Intrinsic<[llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+    llvm_anyvector_ty, LLVMMatchType<0>],
    [IntrNoMem]>;
 def int_arm_mve_qdmulh_predicated: Intrinsic<[llvm_anyvector_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
    [IntrNoMem]>;
 def int_arm_mve_rmulh_predicated: Intrinsic<[llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+    llvm_anyvector_ty, LLVMMatchType<0>],
    [IntrNoMem]>;
 def int_arm_mve_qrdmulh_predicated: Intrinsic<[llvm_anyvector_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
    [IntrNoMem]>;
 def int_arm_mve_mull_int_predicated: Intrinsic<[llvm_anyvector_ty],
-   [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty, llvm_anyvector_ty,
-    LLVMMatchType<0>],
+   [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty /* unsigned */,
+    llvm_i32_ty /* top */, llvm_anyvector_ty, LLVMMatchType<0>],
    [IntrNoMem]>;
 def int_arm_mve_mull_poly_predicated: Intrinsic<[llvm_anyvector_ty],
    [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty, llvm_anyvector_ty,
     LLVMMatchType<0>],
    [IntrNoMem]>;
 def int_arm_mve_qadd_predicated: Intrinsic<[llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
-   [IntrNoMem]>;
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+    llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
 def int_arm_mve_hadd_predicated: Intrinsic<[llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
-   [IntrNoMem]>;
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+    llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
 def int_arm_mve_rhadd_predicated: Intrinsic<[llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
-   [IntrNoMem]>;
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+    llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
 def int_arm_mve_qsub_predicated: Intrinsic<[llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
-   [IntrNoMem]>;
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+    llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
 def int_arm_mve_hsub_predicated: Intrinsic<[llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
-   [IntrNoMem]>;
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
+    llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
 
 defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty],
    [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
@@ -962,7 +966,8 @@ def int_arm_mve_asrl: ARM_MVE_qrshift_single<[llvm_i32_ty, llvm_i32_ty]>;
 
 def int_arm_mve_vabd: Intrinsic<
    [llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
+   [IntrNoMem]>;
 def int_arm_mve_vadc: Intrinsic<
    [llvm_anyvector_ty, llvm_i32_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>;
@@ -972,28 +977,34 @@ def int_arm_mve_vadc_predicated: Intrinsic<
     llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
 def int_arm_mve_vmulh: Intrinsic<
    [llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
+   [IntrNoMem]>;
 def int_arm_mve_vqdmulh: Intrinsic<
    [llvm_anyvector_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
 def int_arm_mve_vhadd: Intrinsic<
    [llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
+   [IntrNoMem]>;
 def int_arm_mve_vrhadd: Intrinsic<
    [llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
+   [IntrNoMem]>;
 def int_arm_mve_vhsub: Intrinsic<
    [llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
+   [IntrNoMem]>;
 def int_arm_mve_vrmulh: Intrinsic<
    [llvm_anyvector_ty],
-   [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
+   [IntrNoMem]>;
 def int_arm_mve_vqrdmulh: Intrinsic<
    [llvm_anyvector_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
 def int_arm_mve_vmull: Intrinsic<
    [llvm_anyvector_ty],
-   [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrNoMem]>;
+   [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty /* unsigned */,
+    llvm_i32_ty /* top */], [IntrNoMem]>;
 def int_arm_mve_vmull_poly: Intrinsic<
    [llvm_anyvector_ty],
    [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrNoMem]>;
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 3189caf626d..e25af565503 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -320,6 +320,9 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred,
 
   // The suffix used on an instruction that mentions the whole type.
   string Suffix = suffixletter ## BitsSuffix;
+
+  // The letter part of the suffix only.
+  string SuffixLetter = suffixletter;
 }
 
 // Integer vector types that don't treat signed and unsigned differently.
@@ -1090,12 +1093,12 @@ let Predicates = [HasMVEFloat] in {
             (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
   def : Pat<(v8f16 (fmaxnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
             (v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
-  def : Pat<(v4f32 (int_arm_mve_max_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
+  def : Pat<(v4f32 (int_arm_mve_max_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), (i32 0),
                           (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
             (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
                           ARMVCCThen, (v4i1 VCCR:$mask),
                           (v4f32 MQPR:$inactive)))>;
-  def : Pat<(v8f16 (int_arm_mve_max_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
+  def : Pat<(v8f16 (int_arm_mve_max_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), (i32 0),
                           (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
             (v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
                           ARMVCCThen, (v8i1 VCCR:$mask),
@@ -1111,12 +1114,12 @@ let Predicates = [HasMVEFloat] in {
   def : Pat<(v8f16 (fminnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
             (v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
   def : Pat<(v4f32 (int_arm_mve_min_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
-                          (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
+                          (i32 0), (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
             (v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
                           ARMVCCThen, (v4i1 VCCR:$mask),
                           (v4f32 MQPR:$inactive)))>;
   def : Pat<(v8f16 (int_arm_mve_min_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
-                          (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
+                          (i32 0), (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
             (v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
                           ARMVCCThen, (v8i1 VCCR:$mask),
                           (v8f16 MQPR:$inactive)))>;
@@ -1149,7 +1152,8 @@ multiclass MVE_VMINMAX_m<string iname, bit bit_4, MVEVectorVTInfo VTI,
 
     // Predicated min/max
     def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                            (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+                            (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+                            (VTI.Vec MQPR:$inactive))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                             ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -1771,14 +1775,15 @@ multiclass MVE_VQADD_m<MVEVectorVTInfo VTI,
   def "" : MVE_VQADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
 
   let Predicates = [HasMVEInt] in {
-    // Unpredicated add-and-divide-by-two
+    // Unpredicated saturating add
     def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
 
-    // Predicated add-and-divide-by-two
+    // Predicated saturating add
     def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                            (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+                            (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+                            (VTI.Vec MQPR:$inactive))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                             ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -1801,14 +1806,15 @@ multiclass MVE_VQSUB_m<MVEVectorVTInfo VTI,
   def "" : MVE_VQSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
 
   let Predicates = [HasMVEInt] in {
-    // Unpredicated subtract-and-divide-by-two
+    // Unpredicated saturating subtract
     def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
 
-    // Predicated subtract-and-divide-by-two
+    // Predicated saturating subtract
     def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                            (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+                            (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+                            (VTI.Vec MQPR:$inactive))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                             ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -1845,13 +1851,15 @@ multiclass MVE_VABD_m<MVEVectorVTInfo VTI,
 
   let Predicates = [HasMVEInt] in {
     // Unpredicated absolute difference
-    def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+    def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                            (i32 VTI.Unsigned))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
 
     // Predicated absolute difference
     def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                            (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+                            (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+                            (VTI.Vec MQPR:$inactive))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                             ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -1887,13 +1895,15 @@ multiclass MVE_VRHADD_m<MVEVectorVTInfo VTI,
 
   let Predicates = [HasMVEInt] in {
     // Unpredicated rounding add-with-divide-by-two
-    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                            (i32 VTI.Unsigned))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
 
     // Predicated add-with-divide-by-two
     def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                            (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+                            (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+                            (VTI.Vec MQPR:$inactive))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                             ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -1939,12 +1949,12 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
 
   let Predicates = [HasMVEInt] in {
     // Unpredicated add-and-divide-by-two
-    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
 
     // Predicated add-and-divide-by-two
-    def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+    def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned),
                             (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
@@ -1969,13 +1979,15 @@ multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
 
   let Predicates = [HasMVEInt] in {
     // Unpredicated subtract-and-divide-by-two
-    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                            (i32 VTI.Unsigned))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
 
     // Predicated subtract-and-divide-by-two
     def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                            (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+                            (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+                            (VTI.Vec MQPR:$inactive))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                             ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -3318,11 +3330,13 @@ multiclass MVE_VABDT_fp_m<MVEVectorVTInfo VTI,
   def "" : MVE_VABD_fp<VTI.Suffix, VTI.Size{0}>;
 
   let Predicates = [HasMVEFloat] in {
-    def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+    def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                            (i32 0))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
     def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                            (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+                            (i32 0), (VTI.Pred VCCR:$mask),
+                            (VTI.Vec MQPR:$inactive))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                             ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -3976,17 +3990,22 @@ multiclass MVE_VMULL_m<MVEVectorVTInfo VTI,
   def "" : MVE_VMULL<"vmull" # !if(Top, "t", "b"), VTI.Suffix, VTI.Unsigned,
                      VTI.Size, Top, cstr>;
 
+  foreach uflag = [!if(!eq(VTI.SuffixLetter, "p"),
+                       (?), (? (i32 VTI.Unsigned)))] in
   let Predicates = [HasMVEInt] in {
+
     // Unpredicated multiply
-    def : Pat<(VTI.DblVec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                            (i32 Top))),
+    def : Pat<(VTI.DblVec !con((unpred_op (VTI.Vec MQPR:$Qm),
+                                          (VTI.Vec MQPR:$Qn)),
+                               uflag, (? (i32 Top)))),
               (VTI.DblVec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
 
     // Predicated multiply
-    def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                            (i32 Top),
-                            (VTI.Pred VCCR:$mask), (VTI.DblVec MQPR:$inactive))),
+    def : Pat<(VTI.DblVec !con((pred_int (VTI.Vec MQPR:$Qm),
+                                         (VTI.Vec MQPR:$Qn)),
+                               uflag, (? (i32 Top), (VTI.Pred VCCR:$mask),
+                                         (VTI.DblVec MQPR:$inactive)))),
               (VTI.DblVec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                             ARMVCCThen, (VTI.Pred VCCR:$mask),
@@ -4059,13 +4078,15 @@ multiclass MVE_VxMULH_m<string iname, MVEVectorVTInfo VTI, SDNode unpred_op,
 
   let Predicates = [HasMVEInt] in {
     // Unpredicated multiply returning high bits
-    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                            (i32 VTI.Unsigned))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
 
     // Predicated multiply returning high bits
     def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                            (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+                            (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+                            (VTI.Vec MQPR:$inactive))),
               (VTI.Vec (!cast<Instruction>(NAME)
                             (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                             ARMVCCThen, (VTI.Pred VCCR:$mask),
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
index fcd57664a16..961489613b6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll
@@ -7,23 +7,23 @@ define arm_aapcs_vfpcc <16 x i8> @test_vabdq_s8(<16 x i8> %a, <16 x i8> %b) loca
 ; CHECK-NEXT:    vabd.s8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> %a, <16 x i8> %b)
+  %0 = tail call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> %a, <16 x i8> %b, i32 0)
   ret <16 x i8> %0
 }
 
-declare <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8>, <16 x i8>, i32) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vabdq_u32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vabd.s32 q0, q0, q1
+; CHECK-NEXT:    vabd.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> %a, <4 x i32> %b)
+  %0 = tail call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
   ret <4 x i32> %0
 }
 
-declare <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32>, <4 x i32>, i32) #1
 
 define arm_aapcs_vfpcc <8 x half> @test_vabdq_f32(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vabdq_f32:
@@ -31,29 +31,29 @@ define arm_aapcs_vfpcc <8 x half> @test_vabdq_f32(<8 x half> %a, <8 x half> %b)
 ; CHECK-NEXT:    vabd.f16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> %a, <8 x half> %b)
+  %0 = tail call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> %a, <8 x half> %b, i32 0)
   ret <8 x half> %0
 }
 
-declare <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half>, <8 x half>) #1
+declare <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half>, <8 x half>, i32) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vabdq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vabdq_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vabdt.s16 q0, q1, q2
+; CHECK-NEXT:    vabdt.u16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
 
-declare <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
 
 define arm_aapcs_vfpcc <16 x i8> @test_vabdq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vabdq_m_s8:
@@ -65,13 +65,13 @@ define arm_aapcs_vfpcc <16 x i8> @test_vabdq_m_s8(<16 x i8> %inactive, <16 x i8>
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
   ret <16 x i8> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
 
-declare <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
 
 define arm_aapcs_vfpcc <4 x float> @test_vabdq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vabdq_m_f32:
@@ -83,25 +83,25 @@ define arm_aapcs_vfpcc <4 x float> @test_vabdq_m_f32(<4 x float> %inactive, <4 x
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive)
+  %2 = tail call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> %inactive)
   ret <4 x float> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
-declare <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
+declare <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float>, <4 x float>, i32, <4 x i1>, <4 x float>) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vabdq_x_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vabdq_x_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vabdt.s16 q0, q0, q1
+; CHECK-NEXT:    vabdt.u16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  %2 = tail call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> undef)
   ret <8 x i16> %2
 }
 
@@ -110,16 +110,16 @@ define arm_aapcs_vfpcc <4 x i32> @test_vabdq_x_u32(<4 x i32> %a, <4 x i32> %b, i
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vabdt.s32 q0, q0, q1
+; CHECK-NEXT:    vabdt.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  %2 = tail call <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef)
   ret <4 x i32> %2
 }
 
-declare <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.abd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
 
 define arm_aapcs_vfpcc <8 x half> @test_vabdq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vabdq_x_f16:
@@ -131,9 +131,9 @@ define arm_aapcs_vfpcc <8 x half> @test_vabdq_x_f16(<8 x half> %a, <8 x half> %b
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef)
+  %2 = tail call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> undef)
   ret <8 x half> %2
 }
 
-declare <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #1
+declare <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half>, <8 x half>, i32, <8 x i1>, <8 x half>) #1
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
index 82ecc062c08..9319dd1c4c4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhaddq.ll
@@ -4,14 +4,14 @@
 define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vhaddq_u8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vhadd.s8 q0, q0, q1
+; CHECK-NEXT:    vhadd.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> %a, <16 x i8> %b)
+  %0 = tail call <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
   ret <16 x i8> %0
 }
 
-declare <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.vhadd.v16i8(<16 x i8>, <16 x i8>, i32) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vhaddq_s16:
@@ -19,23 +19,23 @@ define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_s16(<8 x i16> %a, <8 x i16> %b) lo
 ; CHECK-NEXT:    vhadd.s16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> %a, <8 x i16> %b)
+  %0 = tail call <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0)
   ret <8 x i16> %0
 }
 
-declare <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.vhadd.v8i16(<8 x i16>, <8 x i16>, i32) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vhaddq_u32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vhadd.s32 q0, q0, q1
+; CHECK-NEXT:    vhadd.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %a, <4 x i32> %b)
+  %0 = tail call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
   ret <4 x i32> %0
 }
 
-declare <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32>, <4 x i32>, i32) #1
 
 define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vhaddq_m_s8:
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_m_s8(<16 x i8> %inactive, <16 x i8
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
   ret <16 x i8> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
 
-declare <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vhaddq_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vhaddt.s16 q0, q1, q2
+; CHECK-NEXT:    vhaddt.u16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
 
-declare <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vhaddq_m_s32:
@@ -83,25 +83,25 @@ define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_m_s32(<4 x i32> %inactive, <4 x i3
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
-declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
 
 define arm_aapcs_vfpcc <16 x i8> @test_vhaddq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vhaddq_x_u8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vhaddt.s8 q0, q0, q1
+; CHECK-NEXT:    vhaddt.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  %2 = tail call <16 x i8> @llvm.arm.mve.hadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef)
   ret <16 x i8> %2
 }
 
@@ -115,7 +115,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_vhaddq_x_s16(<8 x i16> %a, <8 x i16> %b,
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  %2 = tail call <8 x i16> @llvm.arm.mve.hadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef)
   ret <8 x i16> %2
 }
 
@@ -124,12 +124,12 @@ define arm_aapcs_vfpcc <4 x i32> @test_vhaddq_x_u32(<4 x i32> %a, <4 x i32> %b,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vhaddt.s32 q0, q0, q1
+; CHECK-NEXT:    vhaddt.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef)
   ret <4 x i32> %2
 }
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll
index 43ec0cfed4d..5bc8a22810a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vhsubq.ll
@@ -4,14 +4,14 @@
 define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vhsubq_u8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vhsub.s8 q0, q0, q1
+; CHECK-NEXT:    vhsub.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> %a, <16 x i8> %b)
+  %0 = tail call <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
   ret <16 x i8> %0
 }
 
-declare <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.vhsub.v16i8(<16 x i8>, <16 x i8>, i32) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vhsubq_s16:
@@ -19,23 +19,23 @@ define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_s16(<8 x i16> %a, <8 x i16> %b) lo
 ; CHECK-NEXT:    vhsub.s16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> %a, <8 x i16> %b)
+  %0 = tail call <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0)
   ret <8 x i16> %0
 }
 
-declare <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.vhsub.v8i16(<8 x i16>, <8 x i16>, i32) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vhsubq_u32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vhsub.s32 q0, q0, q1
+; CHECK-NEXT:    vhsub.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %a, <4 x i32> %b)
+  %0 = tail call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
   ret <4 x i32> %0
 }
 
-declare <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32>, <4 x i32>, i32) #1
 
 define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vhsubq_m_s8:
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vhsubq_m_s8(<16 x i8> %inactive, <16 x i8
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
   ret <16 x i8> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
 
-declare <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.hsub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vhsubq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vhsubq_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vhsubt.s16 q0, q1, q2
+; CHECK-NEXT:    vhsubt.u16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
 
-declare <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.hsub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vhsubq_m_s32:
@@ -83,10 +83,10 @@ define arm_aapcs_vfpcc <4 x i32> @test_vhsubq_m_s32(<4 x i32> %inactive, <4 x i3
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
-declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll
index c1e45b87b45..a8dca7c77d1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll
@@ -35,13 +35,13 @@ define arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_m_f16(<8 x half> %inactive, <8 x
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
+  %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> %inactive)
   ret <8 x half> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
-declare <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
+declare <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half>, <8 x half>, i32, <8 x i1>, <8 x half>) #2
 
 define arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmaxnmq_m_f32:
@@ -53,13 +53,13 @@ define arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_m_f32(<4 x float> %inactive, <4
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive)
+  %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> %inactive)
   ret <4 x float> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
 
-declare <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2
+declare <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float>, <4 x float>, i32, <4 x i1>, <4 x float>) #2
 
 define arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmaxnmq_x_f16:
@@ -71,7 +71,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_x_f16(<8 x half> %a, <8 x half>
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef)
+  %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> undef)
   ret <8 x half> %2
 }
 
@@ -85,7 +85,7 @@ define arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_x_f32(<4 x float> %a, <4 x floa
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef)
+  %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> undef)
   ret <4 x float> %2
 }
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll
index 622aa1ccd33..2036378b4b9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll
@@ -39,18 +39,18 @@ define arm_aapcs_vfpcc <16 x i8> @test_vmaxq_m_u8(<16 x i8> %inactive, <16 x i8>
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmaxt.s8 q0, q1, q2
+; CHECK-NEXT:    vmaxt.u8 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> %inactive)
   ret <16 x i8> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
 
-declare <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
+declare <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #2
 
 define arm_aapcs_vfpcc <8 x i16> @test_vmaxq_m_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
 ; CHECK-LABEL: test_vmaxq_m_s16:
@@ -62,43 +62,43 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmaxq_m_s16(<8 x i16> %inactive, <8 x i16
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
-declare <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
+declare <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #2
 
 define arm_aapcs_vfpcc <4 x i32> @test_vmaxq_m_u32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
 ; CHECK-LABEL: test_vmaxq_m_u32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmaxt.s32 q0, q1, q2
+; CHECK-NEXT:    vmaxt.u32 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
 
-declare <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+declare <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2
 
 define arm_aapcs_vfpcc <16 x i8> @test_vmaxq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
 ; CHECK-LABEL: test_vmaxq_x_u8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmaxt.s8 q0, q0, q1
+; CHECK-NEXT:    vmaxt.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  %2 = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef)
   ret <16 x i8> %2
 }
 
@@ -112,7 +112,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmaxq_x_u16(<8 x i16> %a, <8 x i16> %b, i
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  %2 = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef)
   ret <8 x i16> %2
 }
 
@@ -121,12 +121,12 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmaxq_x_s32(<4 x i32> %a, <4 x i32> %b, i
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmaxt.s32 q0, q0, q1
+; CHECK-NEXT:    vmaxt.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  %2 = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef)
   ret <4 x i32> %2
 }
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll
index 9bbc2a9c936..d4853f7a2be 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll
@@ -35,13 +35,13 @@ define arm_aapcs_vfpcc <8 x half> @test_vminnmq_m_f16(<8 x half> %inactive, <8 x
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
+  %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> %inactive)
   ret <8 x half> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
-declare <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
+declare <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half>, <8 x half>, i32, <8 x i1>, <8 x half>) #2
 
 define arm_aapcs_vfpcc <4 x float> @test_vminnmq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vminnmq_m_f32:
@@ -53,13 +53,13 @@ define arm_aapcs_vfpcc <4 x float> @test_vminnmq_m_f32(<4 x float> %inactive, <4
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive)
+  %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> %inactive)
   ret <4 x float> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
 
-declare <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2
+declare <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float>, <4 x float>, i32, <4 x i1>, <4 x float>) #2
 
 define arm_aapcs_vfpcc <8 x half> @test_vminnmq_x_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vminnmq_x_f16:
@@ -71,7 +71,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vminnmq_x_f16(<8 x half> %a, <8 x half>
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> undef)
+  %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, i32 0, <8 x i1> %1, <8 x half> undef)
   ret <8 x half> %2
 }
 
@@ -85,7 +85,7 @@ define arm_aapcs_vfpcc <4 x float> @test_vminnmq_x_f32(<4 x float> %a, <4 x floa
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef)
+  %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, i32 0, <4 x i1> %1, <4 x float> undef)
   ret <4 x float> %2
 }
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll
index 96bc2233f22..9c93a7d238d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll
@@ -44,31 +44,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vminq_m_s8(<16 x i8> %inactive, <16 x i8>
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
   ret <16 x i8> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
 
-declare <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
+declare <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #2
 
 define arm_aapcs_vfpcc <8 x i16> @test_vminq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
 ; CHECK-LABEL: test_vminq_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmint.s16 q0, q1, q2
+; CHECK-NEXT:    vmint.u16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
-declare <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
+declare <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #2
 
 define arm_aapcs_vfpcc <4 x i32> @test_vminq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
 ; CHECK-LABEL: test_vminq_m_s32:
@@ -80,25 +80,25 @@ define arm_aapcs_vfpcc <4 x i32> @test_vminq_m_s32(<4 x i32> %inactive, <4 x i32
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
 
-declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2
 
 define arm_aapcs_vfpcc <16 x i8> @test_vminq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
 ; CHECK-LABEL: test_vminq_x_u8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmint.s8 q0, q0, q1
+; CHECK-NEXT:    vmint.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef)
   ret <16 x i8> %2
 }
 
@@ -112,7 +112,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_vminq_x_s16(<8 x i16> %a, <8 x i16> %b, i
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef)
   ret <8 x i16> %2
 }
 
@@ -126,7 +126,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vminq_x_s32(<4 x i32> %a, <4 x i32> %b, i
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> undef)
   ret <4 x i32> %2
 }
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll
index ba15af6c549..aae9a23978e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll
@@ -4,14 +4,14 @@
 define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulhq_u8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmulh.s8 q0, q0, q1
+; CHECK-NEXT:    vmulh.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> %a, <16 x i8> %b)
+  %0 = tail call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
   ret <16 x i8> %0
 }
 
-declare <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8>, <16 x i8>, i32) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vmulhq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulhq_s16:
@@ -19,23 +19,23 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmulhq_s16(<8 x i16> %a, <8 x i16> %b) lo
 ; CHECK-NEXT:    vmulh.s16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
+  %0 = tail call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0)
   ret <8 x i16> %0
 }
 
-declare <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16>, <8 x i16>, i32) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vmulhq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulhq_u32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmulh.s32 q0, q0, q1
+; CHECK-NEXT:    vmulh.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
+  %0 = tail call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
   ret <4 x i32> %0
 }
 
-declare <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32>, <4 x i32>, i32) #1
 
 define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulhq_m_s8:
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_m_s8(<16 x i8> %inactive, <16 x i8
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
   ret <16 x i8> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
 
-declare <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vmulhq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulhq_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmulht.s16 q0, q1, q2
+; CHECK-NEXT:    vmulht.u16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
 
-declare <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vmulhq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulhq_m_s32:
@@ -83,25 +83,25 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmulhq_m_s32(<4 x i32> %inactive, <4 x i3
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
-declare <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
 
 define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulhq_x_u8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmulht.s8 q0, q0, q1
+; CHECK-NEXT:    vmulht.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef)
   ret <16 x i8> %2
 }
 
@@ -115,7 +115,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmulhq_x_s16(<8 x i16> %a, <8 x i16> %b,
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef)
   ret <8 x i16> %2
 }
 
@@ -124,12 +124,12 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmulhq_x_u32(<4 x i32> %a, <4 x i32> %b,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmulht.s32 q0, q0, q1
+; CHECK-NEXT:    vmulht.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef)
   ret <4 x i32> %2
 }
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll
index e443d54f020..43914690549 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmullbq.ll
@@ -4,14 +4,14 @@
 define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_int_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmullbq_int_u8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmullb.s8 q0, q0, q1
+; CHECK-NEXT:    vmullb.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> %a, <16 x i8> %b, i32 0)
+  %0 = tail call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1, i32 0)
   ret <8 x i16> %0
 }
 
-declare <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8>, <16 x i8>, i32) #1
+declare <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8>, <16 x i8>, i32, i32) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmullbq_int_s16:
@@ -19,24 +19,24 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_s16(<8 x i16> %a, <8 x i16> %
 ; CHECK-NEXT:    vmullb.s16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0)
+  %0 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0, i32 0)
   ret <4 x i32> %0
 }
 
-declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32) #1
+declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32, i32) #1
 
 define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmullbq_int_u32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmullb.s32 q2, q0, q1
+; CHECK-NEXT:    vmullb.u32 q2, q0, q1
 ; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
+  %0 = tail call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1, i32 0)
   ret <2 x i64> %0
 }
 
-declare <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32) #1
+declare <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32, i32) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_poly_p16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmullbq_poly_p16:
@@ -60,31 +60,31 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_int_m_s8(<8 x i16> %inactive, <16
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, i32 0, <16 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
 
-declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, i32, <16 x i1>, <8 x i16>) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_m_u16(<4 x i32> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmullbq_int_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmullbt.s16 q0, q1, q2
+; CHECK-NEXT:    vmullbt.u16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, i32 0, <8 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
 
-declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, i32, <8 x i1>, <4 x i32>) #1
 
 define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_m_s32(<2 x i64> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmullbq_int_m_s32:
@@ -96,13 +96,13 @@ define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_m_s32(<2 x i64> %inactive, <4
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <2 x i64> %inactive)
+  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, i32 0, <4 x i1> %1, <2 x i64> %inactive)
   ret <2 x i64> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
-declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <2 x i64>) #1
+declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, i32, <4 x i1>, <2 x i64>) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_poly_m_p8(<8 x i16> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmullbq_poly_m_p8:
@@ -125,12 +125,12 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmullbq_int_x_u8(<16 x i8> %a, <16 x i8>
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmullbt.s8 q0, q0, q1
+; CHECK-NEXT:    vmullbt.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <8 x i16> undef)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, i32 0, <16 x i1> %1, <8 x i16> undef)
   ret <8 x i16> %2
 }
 
@@ -144,7 +144,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmullbq_int_x_s16(<8 x i16> %a, <8 x i16>
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <4 x i32> undef)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, i32 0, <8 x i1> %1, <4 x i32> undef)
   ret <4 x i32> %2
 }
 
@@ -153,13 +153,13 @@ define arm_aapcs_vfpcc <2 x i64> @test_vmullbq_int_x_u32(<4 x i32> %a, <4 x i32>
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmullbt.s32 q2, q0, q1
+; CHECK-NEXT:    vmullbt.u32 q2, q0, q1
 ; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <2 x i64> undef)
+  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, i32 0, <4 x i1> %1, <2 x i64> undef)
   ret <2 x i64> %2
 }
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll
index ca2aae6034a..39c1105f95f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulltq.ll
@@ -4,14 +4,14 @@
 define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_int_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulltq_int_u8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmullt.s8 q0, q0, q1
+; CHECK-NEXT:    vmullt.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
+  %0 = tail call <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1, i32 1)
   ret <8 x i16> %0
 }
 
-declare <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8>, <16 x i8>, i32) #1
+declare <8 x i16> @llvm.arm.mve.vmull.v8i16.v16i8(<16 x i8>, <16 x i8>, i32, i32) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulltq_int_s16:
@@ -19,24 +19,24 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_s16(<8 x i16> %a, <8 x i16> %
 ; CHECK-NEXT:    vmullt.s16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 1)
+  %0 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0, i32 1)
   ret <4 x i32> %0
 }
 
-declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32) #1
+declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32, i32) #1
 
 define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulltq_int_u32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmullt.s32 q2, q0, q1
+; CHECK-NEXT:    vmullt.u32 q2, q0, q1
 ; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
+  %0 = tail call <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1, i32 1)
   ret <2 x i64> %0
 }
 
-declare <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32) #1
+declare <2 x i64> @llvm.arm.mve.vmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32, i32) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_poly_p16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulltq_poly_p16:
@@ -60,31 +60,31 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_int_m_s8(<8 x i16> %inactive, <16
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, i32 1, <16 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
 
-declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, i32, <16 x i1>, <8 x i16>) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_m_u16(<4 x i32> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulltq_int_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmulltt.s16 q0, q1, q2
+; CHECK-NEXT:    vmulltt.u16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, i32 1, <8 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
 
-declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, i32, <8 x i1>, <4 x i32>) #1
 
 define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_m_s32(<2 x i64> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulltq_int_m_s32:
@@ -96,13 +96,13 @@ define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_m_s32(<2 x i64> %inactive, <4
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <2 x i64> %inactive)
+  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, i32 1, <4 x i1> %1, <2 x i64> %inactive)
   ret <2 x i64> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
-declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <2 x i64>) #1
+declare <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, i32, <4 x i1>, <2 x i64>) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_poly_m_p8(<8 x i16> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vmulltq_poly_m_p8:
@@ -125,12 +125,12 @@ define arm_aapcs_vfpcc <8 x i16> @test_vmulltq_int_x_u8(<16 x i8> %a, <16 x i8>
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmulltt.s8 q0, q0, q1
+; CHECK-NEXT:    vmulltt.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <8 x i16> undef)
+  %2 = tail call <8 x i16> @llvm.arm.mve.mull.int.predicated.v8i16.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, i32 1, <16 x i1> %1, <8 x i16> undef)
   ret <8 x i16> %2
 }
 
@@ -144,7 +144,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmulltq_int_x_s16(<8 x i16> %a, <8 x i16>
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <4 x i32> undef)
+  %2 = tail call <4 x i32> @llvm.arm.mve.mull.int.predicated.v4i32.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, i32 1, <8 x i1> %1, <4 x i32> undef)
   ret <4 x i32> %2
 }
 
@@ -153,13 +153,13 @@ define arm_aapcs_vfpcc <2 x i64> @test_vmulltq_int_x_u32(<4 x i32> %a, <4 x i32>
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmulltt.s32 q2, q0, q1
+; CHECK-NEXT:    vmulltt.u32 q2, q0, q1
 ; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <2 x i64> undef)
+  %2 = tail call <2 x i64> @llvm.arm.mve.mull.int.predicated.v2i64.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, i32 1, <4 x i1> %1, <2 x i64> undef)
   ret <2 x i64> %2
 }
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll
index 304a6e83aa2..b05fded2b30 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqaddq.ll
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vqaddq_m_s8(<16 x i8> %inactive, <16 x i8
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
   ret <16 x i8> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
 
-declare <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
+declare <16 x i8> @llvm.arm.mve.qadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #2
 
 define arm_aapcs_vfpcc <8 x i16> @test_vqaddq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vqaddq_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vqaddt.s16 q0, q1, q2
+; CHECK-NEXT:    vqaddt.u16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
-declare <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
+declare <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #2
 
 define arm_aapcs_vfpcc <4 x i32> @test_vqaddq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vqaddq_m_s32:
@@ -83,10 +83,10 @@ define arm_aapcs_vfpcc <4 x i32> @test_vqaddq_m_s32(<4 x i32> %inactive, <4 x i3
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
 
-declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll
index 81b3c1c73e9..37f251c83a3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqsubq.ll
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vqsubq_m_s8(<16 x i8> %inactive, <16 x i8
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
   ret <16 x i8> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
 
-declare <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
+declare <16 x i8> @llvm.arm.mve.qsub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #2
 
 define arm_aapcs_vfpcc <8 x i16> @test_vqsubq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vqsubq_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vqsubt.s16 q0, q1, q2
+; CHECK-NEXT:    vqsubt.u16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
 
-declare <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
+declare <8 x i16> @llvm.arm.mve.qsub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #2
 
 define arm_aapcs_vfpcc <4 x i32> @test_vqsubq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vqsubq_m_s32:
@@ -83,10 +83,10 @@ define arm_aapcs_vfpcc <4 x i32> @test_vqsubq_m_s32(<4 x i32> %inactive, <4 x i3
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
 
-declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
+declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #2
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll
index 15b23d5ed27..d7e39548e79 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrhaddq.ll
@@ -4,14 +4,14 @@
 define arm_aapcs_vfpcc <16 x i8> @test_vrhaddq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrhaddq_u8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vrhadd.s8 q0, q0, q1
+; CHECK-NEXT:    vrhadd.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8> %a, <16 x i8> %b)
+  %0 = tail call <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
   ret <16 x i8> %0
 }
 
-declare <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.vrhadd.v16i8(<16 x i8>, <16 x i8>, i32) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vrhaddq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrhaddq_s16:
@@ -19,23 +19,23 @@ define arm_aapcs_vfpcc <8 x i16> @test_vrhaddq_s16(<8 x i16> %a, <8 x i16> %b) l
 ; CHECK-NEXT:    vrhadd.s16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16> %a, <8 x i16> %b)
+  %0 = tail call <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0)
   ret <8 x i16> %0
 }
 
-declare <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.vrhadd.v8i16(<8 x i16>, <8 x i16>, i32) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vrhaddq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrhaddq_u32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vrhadd.s32 q0, q0, q1
+; CHECK-NEXT:    vrhadd.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32> %a, <4 x i32> %b)
+  %0 = tail call <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
   ret <4 x i32> %0
 }
 
-declare <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.vrhadd.v4i32(<4 x i32>, <4 x i32>, i32) #1
 
 define arm_aapcs_vfpcc <16 x i8> @test_vrhaddq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrhaddq_m_s8:
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vrhaddq_m_s8(<16 x i8> %inactive, <16 x i
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
   ret <16 x i8> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
 
-declare <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vrhaddq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrhaddq_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vrhaddt.s16 q0, q1, q2
+; CHECK-NEXT:    vrhaddt.u16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
 
-declare <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vrhaddq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrhaddq_m_s32:
@@ -83,25 +83,25 @@ define arm_aapcs_vfpcc <4 x i32> @test_vrhaddq_m_s32(<4 x i32> %inactive, <4 x i
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
-declare <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
 
 define arm_aapcs_vfpcc <16 x i8> @test_vrhaddq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrhaddq_x_u8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vrhaddt.s8 q0, q0, q1
+; CHECK-NEXT:    vrhaddt.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  %2 = tail call <16 x i8> @llvm.arm.mve.rhadd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef)
   ret <16 x i8> %2
 }
 
@@ -110,12 +110,12 @@ define arm_aapcs_vfpcc <8 x i16> @test_vrhaddq_x_u16(<8 x i16> %a, <8 x i16> %b,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vrhaddt.s16 q0, q0, q1
+; CHECK-NEXT:    vrhaddt.u16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  %2 = tail call <8 x i16> @llvm.arm.mve.rhadd.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> undef)
   ret <8 x i16> %2
 }
 
@@ -124,12 +124,12 @@ define arm_aapcs_vfpcc <4 x i32> @test_vrhaddq_x_u32(<4 x i32> %a, <4 x i32> %b,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vrhaddt.s32 q0, q0, q1
+; CHECK-NEXT:    vrhaddt.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  %2 = tail call <4 x i32> @llvm.arm.mve.rhadd.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef)
   ret <4 x i32> %2
 }
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll
index 9418f581eb4..e8ff21452f3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll
@@ -4,14 +4,14 @@
 define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrmulhq_u8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vrmulh.s8 q0, q0, q1
+; CHECK-NEXT:    vrmulh.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> %a, <16 x i8> %b)
+  %0 = tail call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
   ret <16 x i8> %0
 }
 
-declare <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8>, <16 x i8>, i32) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vrmulhq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrmulhq_s16:
@@ -19,23 +19,23 @@ define arm_aapcs_vfpcc <8 x i16> @test_vrmulhq_s16(<8 x i16> %a, <8 x i16> %b) l
 ; CHECK-NEXT:    vrmulh.s16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
+  %0 = tail call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> %a, <8 x i16> %b, i32 0)
   ret <8 x i16> %0
 }
 
-declare <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16>, <8 x i16>, i32) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vrmulhq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrmulhq_u32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vrmulh.s32 q0, q0, q1
+; CHECK-NEXT:    vrmulh.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
+  %0 = tail call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
   ret <4 x i32> %0
 }
 
-declare <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32>, <4 x i32>, i32) #1
 
 define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrmulhq_m_s8:
@@ -47,31 +47,31 @@ define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_m_s8(<16 x i8> %inactive, <16 x i
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
+  %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 0, <16 x i1> %1, <16 x i8> %inactive)
   ret <16 x i8> %2
 }
 
 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1
 
-declare <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>, <16 x i8>) #1
 
 define arm_aapcs_vfpcc <8 x i16> @test_vrmulhq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrmulhq_m_u16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vrmulht.s16 q0, q1, q2
+; CHECK-NEXT:    vrmulht.u16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
+  %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1, <8 x i16> %inactive)
   ret <8 x i16> %2
 }
 
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1
 
-declare <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1
+declare <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>) #1
 
 define arm_aapcs_vfpcc <4 x i32> @test_vrmulhq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrmulhq_m_s32:
@@ -83,25 +83,25 @@ define arm_aapcs_vfpcc <4 x i32> @test_vrmulhq_m_s32(<4 x i32> %inactive, <4 x i
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
+  %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1, <4 x i32> %inactive)
   ret <4 x i32> %2
 }
 
 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1
 
-declare <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
 
 define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_vrmulhq_x_u8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vrmulht.s8 q0, q0, q1
+; CHECK-NEXT:    vrmulht.u8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
-  %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
+  %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 1, <16 x i1> %1, <16 x i8> undef)
   ret <16 x i8> %2
 }
 
@@ -115,7 +115,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_vrmulhq_x_s16(<8 x i16> %a, <8 x i16> %b,
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
-  %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
+  %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 0, <8 x i1> %1, <8 x i16> undef)
   ret <8 x i16> %2
 }
 
@@ -124,12 +124,12 @@ define arm_aapcs_vfpcc <4 x i32> @test_vrmulhq_m_u32(<4 x i32> %a, <4 x i32> %b,
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmsr p0, r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vrmulht.s32 q0, q0, q1
+; CHECK-NEXT:    vrmulht.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext i16 %p to i32
   %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
-  %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
+  %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1, <4 x i32> undef)
   ret <4 x i32> %2
 }