author     Simon Tatham <simon.tatham@arm.com>    2019-12-02 16:17:59 +0000
committer  Simon Tatham <simon.tatham@arm.com>    2019-12-02 16:20:30 +0000
commit     d173fb5d2854a1ce42bcc34832db5039b2c60e69 (patch)
tree       68655e854c5774fcad96180adac1007a95064f19
parent     48cce077efcc3c3637aac0143b3c2c9d1cf7ab8b (diff)
download   bcm5719-llvm-d173fb5d2854a1ce42bcc34832db5039b2c60e69.tar.gz
           bcm5719-llvm-d173fb5d2854a1ce42bcc34832db5039b2c60e69.zip
[ARM,MVE] Add intrinsics to deal with predicates.
Summary:
This commit adds the `vpselq` intrinsics, which take an MVE predicate
word and select lanes from two vectors; the `vctp` intrinsics, which
create a tail predicate word suitable for processing the first m
elements of a vector (e.g. in the last iteration of a loop); and
`vpnot`, which simply complements a predicate word and is just
syntactic sugar for the `~` operator.

The `vctp` ACLE intrinsics are lowered to the IR intrinsics we've
already added (and which D70592 just reorganized). I've filled in the
missing isel rule for VCTP64, and added another set of rules to
generate the predicated forms.

I needed one small tweak in MveEmitter to allow the `unpromoted` type
modifier to apply to predicates as well as integers, so that `vpnot`
doesn't pointlessly convert its input integer to an `<n x i1>` before
complementing it.

Reviewers: ostannard, MarkMurrayARM, dmgreen

Reviewed By: dmgreen

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D70485
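As a quick illustration of how the new intrinsics fit together, here is a
minimal usage sketch (not part of the patch; the function name is invented,
and it assumes only the intrinsics added here plus the usual <arm_mve.h>
vector types):

    #include <arm_mve.h>

    /* Build a tail predicate covering the first m lanes of a 4 x 32-bit
     * vector (all four lanes if m >= 4), then select those lanes from a
     * and the remaining lanes from b. */
    uint32x4_t first_m_lanes_from_a(uint32x4_t a, uint32x4_t b, uint32_t m)
    {
        mve_pred16_t p = vctp32q(m);  /* lanes 0 .. min(m,4)-1 enabled */
        return vpselq(a, b, p);       /* polymorphic form of vpselq_u32 */
    }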
-rw-r--r--clang/include/clang/Basic/arm_mve.td26
-rw-r--r--clang/test/CodeGen/arm-mve-intrinsics/predicates.c290
-rw-r--r--clang/utils/TableGen/MveEmitter.cpp16
-rw-r--r--llvm/lib/Target/ARM/ARMInstrMVE.td26
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll219
5 files changed, 558 insertions, 19 deletions
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 90cccb12472..ed925a20072 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -214,6 +214,32 @@ let params = T.Float in {
(IRIntBase<"maxnum", [Vector]> $a, $b)>;
}
+def vpselq: Intrinsic<Vector, (args Vector:$t, Vector:$f, Predicate:$pred),
+ (select $pred, $t, $f)> { let params = T.Usual; }
+def vpselq_64: Intrinsic<
+ Vector, (args Vector:$t, Vector:$f, PredOf<u32>:$pred),
+ (bitcast (select $pred, (bitcast $t, VecOf<u32>),
+ (bitcast $f, VecOf<u32>)), Vector)>,
+ NameOverride<"vpselq"> { let params = T.All64; }
+
+let params = [Void], pnt = PNT_None in {
+
+ multiclass vctp<Type pred, string intname> {
+ def "": Intrinsic<pred, (args u32:$val),
+ (u16 (IRInt<"pred_v2i", [pred]> (IRIntBase<intname> $val)))>;
+ def _m: Intrinsic<pred, (args u32:$val, pred:$inpred),
+ (u16 (IRInt<"pred_v2i", [pred]> (and $inpred,
+ (IRIntBase<intname> $val))))>;
+ }
+ defm vctp8q: vctp<PredOf<u8>, "arm_mve_vctp8">;
+ defm vctp16q: vctp<PredOf<u16>, "arm_mve_vctp16">;
+ defm vctp32q: vctp<PredOf<u32>, "arm_mve_vctp32">;
+ defm vctp64q: vctp<PredOf<u64>, "arm_mve_vctp64">;
+
+ def vpnot: Intrinsic<PredOf<u8>, (args unpromoted<PredOf<u8>>:$pred),
+ (xor $pred, (u16 65535))>;
+
+}
multiclass contiguous_load<string mnemonic, PrimitiveType memtype,
list<Type> same_size, list<Type> wider> {
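The semantics these TableGen definitions give the predicated `vctp` forms
and `vpnot` can be written out as a plain C sketch (assuming, as the
<n x i1> AND above implies, that predicate words combine bitwise; the
`_equiv` function names are invented for illustration):

    #include <arm_mve.h>

    /* vctp32q_m ANDs the fresh tail predicate with an existing predicate,
     * so lanes already disabled in p stay disabled. */
    mve_pred16_t vctp32q_m_equiv(uint32_t n, mve_pred16_t p)
    {
        return (mve_pred16_t)(vctp32q(n) & p);  /* == vctp32q_m(n, p) */
    }

    /* vpnot just complements the 16-bit predicate word. */
    mve_pred16_t vpnot_equiv(mve_pred16_t p)
    {
        return (mve_pred16_t)~p;                /* == vpnot(p) */
    }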
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/predicates.c b/clang/test/CodeGen/arm-mve-intrinsics/predicates.c
new file mode 100644
index 00000000000..5761849d094
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/predicates.c
@@ -0,0 +1,290 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa -early-cse | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa -early-cse | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vctp16q(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[A:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
+mve_pred16_t test_vctp16q(uint32_t a)
+{
+ return vctp16q(a);
+}
+
+// CHECK-LABEL: @test_vctp16q_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[A:%.*]])
+// CHECK-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT: ret i16 [[TMP5]]
+//
+mve_pred16_t test_vctp16q_m(uint32_t a, mve_pred16_t p)
+{
+ return vctp16q_m(a, p);
+}
+
+// CHECK-LABEL: @test_vctp32q(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[A:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
+mve_pred16_t test_vctp32q(uint32_t a)
+{
+ return vctp32q(a);
+}
+
+// CHECK-LABEL: @test_vctp32q_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[A:%.*]])
+// CHECK-NEXT: [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT: ret i16 [[TMP5]]
+//
+mve_pred16_t test_vctp32q_m(uint32_t a, mve_pred16_t p)
+{
+ return vctp32q_m(a, p);
+}
+
+// CHECK-LABEL: @test_vctp64q(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
+mve_pred16_t test_vctp64q(uint32_t a)
+{
+ return vctp64q(a);
+}
+
+// CHECK-LABEL: @test_vctp64q_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]])
+// CHECK-NEXT: [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT: ret i16 [[TMP5]]
+//
+mve_pred16_t test_vctp64q_m(uint32_t a, mve_pred16_t p)
+{
+ return vctp64q_m(a, p);
+}
+
+// CHECK-LABEL: @test_vctp8q(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[A:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
+mve_pred16_t test_vctp8q(uint32_t a)
+{
+ return vctp8q(a);
+}
+
+// CHECK-LABEL: @test_vctp8q_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[A:%.*]])
+// CHECK-NEXT: [[TMP3:%.*]] = and <16 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT: ret i16 [[TMP5]]
+//
+mve_pred16_t test_vctp8q_m(uint32_t a, mve_pred16_t p)
+{
+ return vctp8q_m(a, p);
+}
+
+// CHECK-LABEL: @test_vpnot(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = xor i16 [[A:%.*]], -1
+// CHECK-NEXT: ret i16 [[TMP0]]
+//
+mve_pred16_t test_vpnot(mve_pred16_t a)
+{
+ return vpnot(a);
+}
+
+// CHECK-LABEL: @test_vpselq_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x half> [[A:%.*]], <8 x half> [[B:%.*]]
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vpselq_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+ return vpselq_f16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+// CHECK-NEXT: ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vpselq_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+ return vpselq_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vpselq_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+ return vpselq_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vpselq_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+ return vpselq_s32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_s64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]]
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP5]]
+//
+int64x2_t test_vpselq_s64(int64x2_t a, int64x2_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+ return vpselq_s64(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vpselq_s8(int8x16_t a, int8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+ return vpselq_s8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vpselq_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+ return vpselq_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vpselq_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+ return vpselq_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_u64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]]
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP5]]
+//
+uint64x2_t test_vpselq_u64(uint64x2_t a, uint64x2_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+ return vpselq_u64(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vpselq_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+ return vpselq_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
index 1ca3b5a3f22..422188a5f3d 100644
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -1208,14 +1208,16 @@ Result::Ptr MveEmitter::getCodeForArg(unsigned ArgNum, const Type *ArgType,
Result::Ptr V =
std::make_shared<BuiltinArgResult>(ArgNum, isa<PointerType>(ArgType));
- if (const auto *ST = dyn_cast<ScalarType>(ArgType)) {
- if (Promote && ST->isInteger() && ST->sizeInBits() < 32)
+ if (Promote) {
+ if (const auto *ST = dyn_cast<ScalarType>(ArgType)) {
+ if (ST->isInteger() && ST->sizeInBits() < 32)
+ V = std::make_shared<IntCastResult>(getScalarType("u32"), V);
+ } else if (const auto *PT = dyn_cast<PredicateType>(ArgType)) {
V = std::make_shared<IntCastResult>(getScalarType("u32"), V);
- } else if (const auto *PT = dyn_cast<PredicateType>(ArgType)) {
- V = std::make_shared<IntCastResult>(getScalarType("u32"), V);
- V = std::make_shared<IRIntrinsicResult>("arm_mve_pred_i2v",
- std::vector<const Type *>{PT},
- std::vector<Result::Ptr>{V});
+ V = std::make_shared<IRIntrinsicResult>("arm_mve_pred_i2v",
+ std::vector<const Type *>{PT},
+ std::vector<Result::Ptr>{V});
+ }
}
return V;
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 6cd20309126..b209711e17b 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4267,7 +4267,7 @@ def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>;
def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>;
let hasSideEffects = 1 in
-class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]>
+class MVE_VCTPInst<string suffix, bits<2> size, list<dag> pattern=[]>
: MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix,
"$Rn", vpred_n, "", pattern> {
bits<4> Rn;
@@ -4285,20 +4285,22 @@ class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]>
let validForTailPredication = 1;
}
-def MVE_VCTP8 : MVE_VCTP<"8", 0b00>;
-def MVE_VCTP16 : MVE_VCTP<"16", 0b01>;
-def MVE_VCTP32 : MVE_VCTP<"32", 0b10>;
-def MVE_VCTP64 : MVE_VCTP<"64", 0b11>;
+multiclass MVE_VCTP<MVEVectorVTInfo VTI, Intrinsic intr> {
+ def "": MVE_VCTPInst<VTI.BitsSuffix, VTI.Size>;
-let Predicates = [HasMVEInt] in {
- def : Pat<(int_arm_mve_vctp8 rGPR:$Rn),
- (v16i1 (MVE_VCTP8 rGPR:$Rn))>;
- def : Pat<(int_arm_mve_vctp16 rGPR:$Rn),
- (v8i1 (MVE_VCTP16 rGPR:$Rn))>;
- def : Pat<(int_arm_mve_vctp32 rGPR:$Rn),
- (v4i1 (MVE_VCTP32 rGPR:$Rn))>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(intr rGPR:$Rn),
+ (VTI.Pred (!cast<Instruction>(NAME) rGPR:$Rn))>;
+ def : Pat<(and (intr rGPR:$Rn), (VTI.Pred VCCR:$mask)),
+ (VTI.Pred (!cast<Instruction>(NAME) rGPR:$Rn, 1, VCCR:$mask))>;
+ }
}
+defm MVE_VCTP8 : MVE_VCTP<MVE_v16i8, int_arm_mve_vctp8>;
+defm MVE_VCTP16 : MVE_VCTP<MVE_v8i16, int_arm_mve_vctp16>;
+defm MVE_VCTP32 : MVE_VCTP<MVE_v4i32, int_arm_mve_vctp32>;
+defm MVE_VCTP64 : MVE_VCTP<MVE_v2i64, int_arm_mve_vctp64>;
+
// end of mve_qDest_rSrc
// start of coproc mov
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll
new file mode 100644
index 00000000000..f5b541203f6
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll
@@ -0,0 +1,219 @@
+; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - | FileCheck %s
+
+declare <16 x i1> @llvm.arm.mve.vctp8(i32)
+declare <8 x i1> @llvm.arm.mve.vctp16(i32)
+declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+declare <4 x i1> @llvm.arm.mve.vctp64(i32)
+
+declare i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1>)
+declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>)
+declare i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1>)
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp8q(i32 %a) {
+; CHECK-LABEL: test_vctp8q:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vctp.8 r0
+; CHECK-NEXT: vmrs r0, p0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %a)
+ %1 = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> %0)
+ %2 = trunc i32 %1 to i16
+ ret i16 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp8q_m(i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vctp8q_m:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vctpt.8 r0
+; CHECK-NEXT: vmrs r0, p0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %a)
+ %3 = and <16 x i1> %1, %2
+ %4 = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> %3)
+ %5 = trunc i32 %4 to i16
+ ret i16 %5
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp16q(i32 %a) {
+; CHECK-LABEL: test_vctp16q:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vctp.16 r0
+; CHECK-NEXT: vmrs r0, p0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %a)
+ %1 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %0)
+ %2 = trunc i32 %1 to i16
+ ret i16 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp16q_m(i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vctp16q_m:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vctpt.16 r0
+; CHECK-NEXT: vmrs r0, p0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %a)
+ %3 = and <8 x i1> %1, %2
+ %4 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %3)
+ %5 = trunc i32 %4 to i16
+ ret i16 %5
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp32q(i32 %a) {
+; CHECK-LABEL: test_vctp32q:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vctp.32 r0
+; CHECK-NEXT: vmrs r0, p0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %a)
+ %1 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0)
+ %2 = trunc i32 %1 to i16
+ ret i16 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp32q_m(i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vctp32q_m:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vctpt.32 r0
+; CHECK-NEXT: vmrs r0, p0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %a)
+ %3 = and <4 x i1> %1, %2
+ %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3)
+ %5 = trunc i32 %4 to i16
+ ret i16 %5
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp64q(i32 %a) {
+; CHECK-LABEL: test_vctp64q:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vctp.64 r0
+; CHECK-NEXT: vmrs r0, p0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a)
+ %1 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0)
+ %2 = trunc i32 %1 to i16
+ ret i16 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp64q_m(i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vctp64q_m:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vctpt.64 r0
+; CHECK-NEXT: vmrs r0, p0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a)
+ %3 = and <4 x i1> %1, %2
+ %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3)
+ %5 = trunc i32 %4 to i16
+ ret i16 %5
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vpselq_i8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = select <16 x i1> %1, <16 x i8> %a, <16 x i8> %b
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vpselq_i16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = select <8 x i1> %1, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vpselq_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = select <8 x i1> %1, <8 x half> %a, <8 x half> %b
+ ret <8 x half> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vpselq_i32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = select <4 x i1> %1, <4 x i32> %a, <4 x i32> %b
+ ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vpselq_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = select <4 x i1> %1, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %2
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vpselq_i64(<2 x i64> %a, <2 x i64> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_i64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = bitcast <2 x i64> %a to <4 x i32>
+ %3 = bitcast <2 x i64> %b to <4 x i32>
+ %4 = select <4 x i1> %1, <4 x i32> %2, <4 x i32> %3
+ %5 = bitcast <4 x i32> %4 to <2 x i64>
+ ret <2 x i64> %5
+}