author    Simon Tatham <simon.tatham@arm.com>    2020-01-08 13:36:25 +0000
committer Simon Tatham <simon.tatham@arm.com>    2020-01-08 14:42:24 +0000
commit    3100480925df10960c1e0a077dd9875037d3fe29 (patch)
tree      7dfc09b50812bfbcba88d5bda8487722db2c30bc /llvm/test
parent    ba129c7d0f5c7c32398ad708c88e14cb06a339ad (diff)
[ARM,MVE] Intrinsics for partial-overwrite imm shifts.
This batch of intrinsics covers two sets of immediate shift
instructions, which have in common that they only overwrite part of
their output register and so they need an extra input giving its
previous value.
The VSLI and VSRI instructions shift each lane of the input vector
left or right just as if they were normal immediate VSHL/VSHR, but
then they only overwrite the output bits that correspond to actual
shifted bits of the input. So VSLI leaves the low n bits of each
output lane unchanged, and VSRI likewise leaves the top n bits unchanged.
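In C, the per-lane behaviour looks like this (a minimal sketch of a
single 16-bit lane, written from the description above rather than
from the Arm architecture manual; the function names are mine):

  #include <stdint.h>

  /* VSLI #n: shift left, but keep the low n bits of the previous
     output value (they receive no shifted-in data). */
  static uint16_t vsli_lane(uint16_t prev, uint16_t in, unsigned n) {
      uint16_t low_mask = (uint16_t)((1u << n) - 1);       /* n in 0..15 */
      return (uint16_t)((uint16_t)(in << n) | (prev & low_mask));
  }

  /* VSRI #n: shift right, but keep the top n bits of the previous
     output value. */
  static uint16_t vsri_lane(uint16_t prev, uint16_t in, unsigned n) {
      uint16_t top_mask = (uint16_t)(0xFFFFu << (16 - n)); /* n in 1..15 */
      return (uint16_t)((in >> n) | (prev & top_mask));
  }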
The V[Q][R]SHR[U]N family are all narrowing shifts: they take an input
vector of 2n-bit integers, shift each lane right by a constant, and
then narrow the shifted result to only n bits. So they only
overwrite half of the n-bit lanes in the output register, and the B/T
suffix indicates whether it's the bottom or top half of each 2n-bit
lane.
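Sketched the same way (assuming 16-bit input lanes narrowing to bytes,
plain truncation with no saturation or rounding, and little-endian
byte order within each lane):

  #include <stdint.h>

  /* VSHRNB: each narrowed result is written to the bottom byte of its
     16-bit output lane; the top byte keeps its previous value. */
  static void vshrnb_lanes(uint8_t out[16], const uint16_t in[8], unsigned sh) {
      for (int i = 0; i < 8; i++)
          out[2 * i] = (uint8_t)(in[i] >> sh);     /* out[2*i+1] untouched */
  }

  /* VSHRNT: identical, but targeting the top byte of each lane. */
  static void vshrnt_lanes(uint8_t out[16], const uint16_t in[8], unsigned sh) {
      for (int i = 0; i < 8; i++)
          out[2 * i + 1] = (uint8_t)(in[i] >> sh); /* out[2*i] untouched */
  }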
I've implemented the whole of the latter family using a single IR
intrinsic `vshrn`, which takes a lot of i32 parameters indicating
which instruction it expands to (by specifying signedness of the input
and output types, whether it saturates and/or rounds, etc).
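Reading the tests below, the six i32 operands of `vshrn` appear to
mean (shift count, saturating?, rounding?, unsigned result?, unsigned
input?, write top half?); that layout is inferred from the test inputs
rather than documented anywhere, so treat it as an assumption. A small
illustrative C helper (hypothetical, not part of this patch)
reconstructs the mnemonic from the flag operands:

  #include <stdio.h>

  /* Print the instruction selected by the five flag operands, for
     16-bit input lanes; derived from the FileCheck lines below. */
  static void vshrn_mnemonic(int sat, int rnd, int u_out, int u_in, int top) {
      printf("v%s%sshr%sn%s.%c16\n",
             sat ? "q" : "", rnd ? "r" : "",
             (sat && u_out && !u_in) ? "u" : "",  /* signed in, unsigned out */
             top ? "t" : "b",
             sat ? (u_in ? 'u' : 's') : 'i');     /* non-saturating: .i16 */
  }

For example, vshrn_mnemonic(1, 1, 1, 1, 1) prints "vqrshrnt.u16",
matching test_vqrshrntq_n_u16 below.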
Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard
Reviewed By: dmgreen
Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits
Tags: #clang, #llvm
Differential Revision: https://reviews.llvm.org/D72328
Diffstat (limited to 'llvm/test')
-rw-r--r-- llvm/test/CodeGen/Thumb2/mve-intrinsics/vector-shift-imm-dyadic.ll | 1270
1 file changed, 1270 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vector-shift-imm-dyadic.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vector-shift-imm-dyadic.ll
new file mode 100644
index 00000000000..6173cb5520b
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vector-shift-imm-dyadic.ll
@@ -0,0 +1,1270 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <16 x i8> @test_vshrnbq_n_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vshrnbq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vshrnb.i16 q0, q1, #3
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshrnbq_n_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vshrnbq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vshrnb.i32 q0, q1, #9
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vshrnbq_n_u16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vshrnbq_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vshrnb.i16 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 1, i32 0, i32 0, i32 1, i32 1, i32 0)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshrnbq_n_u32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vshrnbq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vshrnb.i32 q0, q1, #3
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 3, i32 0, i32 0, i32 1, i32 1, i32 0)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vshrntq_n_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vshrntq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vshrnt.i16 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 1, i32 0, i32 0, i32 0, i32 0, i32 1)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshrntq_n_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vshrntq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vshrnt.i32 q0, q1, #10
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 10, i32 0, i32 0, i32 0, i32 0, i32 1)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vshrntq_n_u16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vshrntq_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vshrnt.i16 q0, q1, #6
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 6, i32 0, i32 0, i32 1, i32 1, i32 1)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshrntq_n_u32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vshrntq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vshrnt.i32 q0, q1, #10
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 10, i32 0, i32 0, i32 1, i32 1, i32 1)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vshrnbq_m_n_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshrnbq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vshrnbt.i16 q0, q1, #4
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshrnbq_m_n_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshrnbq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vshrnbt.i32 q0, q1, #13
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vshrnbq_m_n_u16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshrnbq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vshrnbt.i16 q0, q1, #7
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 7, i32 0, i32 0, i32 1, i32 1, i32 0, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshrnbq_m_n_u32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshrnbq_m_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vshrnbt.i32 q0, q1, #15
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 15, i32 0, i32 0, i32 1, i32 1, i32 0, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vshrntq_m_n_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshrntq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vshrntt.i16 q0, q1, #6
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 6, i32 0, i32 0, i32 0, i32 0, i32 1, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshrntq_m_n_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshrntq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vshrntt.i32 q0, q1, #13
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 13, i32 0, i32 0, i32 0, i32 0, i32 1, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vshrntq_m_n_u16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshrntq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vshrntt.i16 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 1, i32 0, i32 0, i32 1, i32 1, i32 1, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshrntq_m_n_u32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshrntq_m_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vshrntt.i32 q0, q1, #10
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 10, i32 0, i32 0, i32 1, i32 1, i32 1, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrnbq_n_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vrshrnbq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrshrnb.i16 q0, q1, #5
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 5, i32 0, i32 1, i32 0, i32 0, i32 0)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrnbq_n_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrshrnbq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrshrnb.i32 q0, q1, #10
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 10, i32 0, i32 1, i32 0, i32 0, i32 0)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrnbq_n_u16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vrshrnbq_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrshrnb.i16 q0, q1, #2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 2, i32 0, i32 1, i32 1, i32 1, i32 0)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrnbq_n_u32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrshrnbq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrshrnb.i32 q0, q1, #12
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 12, i32 0, i32 1, i32 1, i32 1, i32 0)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrntq_n_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vrshrntq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrshrnt.i16 q0, q1, #4
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 4, i32 0, i32 1, i32 0, i32 0, i32 1)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrntq_n_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrshrntq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrshrnt.i32 q0, q1, #11
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 11, i32 0, i32 1, i32 0, i32 0, i32 1)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrntq_n_u16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vrshrntq_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrshrnt.i16 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 1, i32 0, i32 1, i32 1, i32 1, i32 1)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrntq_n_u32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vrshrntq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrshrnt.i32 q0, q1, #6
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 6, i32 0, i32 1, i32 1, i32 1, i32 1)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrnbq_m_n_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrnbq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrshrnbt.i16 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrnbq_m_n_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrnbq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrshrnbt.i32 q0, q1, #14
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 14, i32 0, i32 1, i32 0, i32 0, i32 0, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrnbq_m_n_u16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrnbq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrshrnbt.i16 q0, q1, #2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 2, i32 0, i32 1, i32 1, i32 1, i32 0, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrnbq_m_n_u32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrnbq_m_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrshrnbt.i32 q0, q1, #12
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 12, i32 0, i32 1, i32 1, i32 1, i32 0, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrntq_m_n_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrntq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrshrntt.i16 q0, q1, #4
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 4, i32 0, i32 1, i32 0, i32 0, i32 1, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrntq_m_n_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrntq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrshrntt.i32 q0, q1, #6
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 6, i32 0, i32 1, i32 0, i32 0, i32 1, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vrshrntq_m_n_u16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrntq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrshrntt.i16 q0, q1, #6
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 6, i32 0, i32 1, i32 1, i32 1, i32 1, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vrshrntq_m_n_u32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vrshrntq_m_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vrshrntt.i32 q0, q1, #10
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 10, i32 0, i32 1, i32 1, i32 1, i32 1, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshrnbq_n_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqshrnbq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrnb.s16 q0, q1, #7
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 7, i32 1, i32 0, i32 0, i32 0, i32 0)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshrnbq_n_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqshrnbq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrnb.s32 q0, q1, #15
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 15, i32 1, i32 0, i32 0, i32 0, i32 0)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshrnbq_n_u16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqshrnbq_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrnb.u16 q0, q1, #3
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 3, i32 1, i32 0, i32 1, i32 1, i32 0)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshrnbq_n_u32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqshrnbq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrnb.u32 q0, q1, #3
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 3, i32 1, i32 0, i32 1, i32 1, i32 0)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshrntq_n_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqshrntq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrnt.s16 q0, q1, #5
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 5, i32 1, i32 0, i32 0, i32 0, i32 1)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshrntq_n_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqshrntq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrnt.s32 q0, q1, #6
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 6, i32 1, i32 0, i32 0, i32 0, i32 1)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshrntq_n_u16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqshrntq_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrnt.u16 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 1, i32 1, i32 0, i32 1, i32 1, i32 1)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshrntq_n_u32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqshrntq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrnt.u32 q0, q1, #15
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 15, i32 1, i32 0, i32 1, i32 1, i32 1)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshrnbq_m_n_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshrnbq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshrnbt.s16 q0, q1, #7
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 7, i32 1, i32 0, i32 0, i32 0, i32 0, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshrnbq_m_n_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshrnbq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshrnbt.s32 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 1, i32 1, i32 0, i32 0, i32 0, i32 0, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshrnbq_m_n_u16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshrnbq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshrnbt.u16 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 1, i32 1, i32 0, i32 1, i32 1, i32 0, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshrnbq_m_n_u32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshrnbq_m_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshrnbt.u32 q0, q1, #8
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 8, i32 1, i32 0, i32 1, i32 1, i32 0, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshrntq_m_n_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshrntq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshrntt.s16 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 1, i32 1, i32 0, i32 0, i32 0, i32 1, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshrntq_m_n_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshrntq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshrntt.s32 q0, q1, #11
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 11, i32 1, i32 0, i32 0, i32 0, i32 1, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshrntq_m_n_u16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshrntq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshrntt.u16 q0, q1, #3
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 3, i32 1, i32 0, i32 1, i32 1, i32 1, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshrntq_m_n_u32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshrntq_m_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshrntt.u32 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 1, i32 1, i32 0, i32 1, i32 1, i32 1, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshrunbq_n_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqshrunbq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrunb.s16 q0, q1, #5
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 5, i32 1, i32 0, i32 1, i32 0, i32 0)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshrunbq_n_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqshrunbq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrunb.s32 q0, q1, #13
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 13, i32 1, i32 0, i32 1, i32 0, i32 0)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshruntq_n_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqshruntq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrunt.s16 q0, q1, #2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 2, i32 1, i32 0, i32 1, i32 0, i32 1)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshruntq_n_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqshruntq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrunt.s32 q0, q1, #7
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 7, i32 1, i32 0, i32 1, i32 0, i32 1)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshrunbq_m_n_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshrunbq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshrunbt.s16 q0, q1, #7
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 7, i32 1, i32 0, i32 1, i32 0, i32 0, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshrunbq_m_n_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshrunbq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshrunbt.s32 q0, q1, #7
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 7, i32 1, i32 0, i32 1, i32 0, i32 0, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqshruntq_m_n_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshruntq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshruntt.s16 q0, q1, #7
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 7, i32 1, i32 0, i32 1, i32 0, i32 1, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqshruntq_m_n_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqshruntq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshruntt.s32 q0, q1, #7
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 7, i32 1, i32 0, i32 1, i32 0, i32 1, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrshrnbq_n_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqrshrnbq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrshrnb.s16 q0, q1, #5
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 5, i32 1, i32 1, i32 0, i32 0, i32 0)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrshrnbq_n_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqrshrnbq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrshrnb.s32 q0, q1, #13
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 13, i32 1, i32 1, i32 0, i32 0, i32 0)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrshrnbq_n_u16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqrshrnbq_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrshrnb.u16 q0, q1, #7
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 7, i32 1, i32 1, i32 1, i32 1, i32 0)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrshrnbq_n_u32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqrshrnbq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrshrnb.u32 q0, q1, #8
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 8, i32 1, i32 1, i32 1, i32 1, i32 0)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrshrntq_n_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqrshrntq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrshrnt.s16 q0, q1, #7
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 7, i32 1, i32 1, i32 0, i32 0, i32 1)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrshrntq_n_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqrshrntq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrshrnt.s32 q0, q1, #2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 2, i32 1, i32 1, i32 0, i32 0, i32 1)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrshrntq_n_u16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqrshrntq_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrshrnt.u16 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrshrntq_n_u32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqrshrntq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrshrnt.u32 q0, q1, #11
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 11, i32 1, i32 1, i32 1, i32 1, i32 1)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrshrnbq_m_n_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrshrnbq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqrshrnbt.s16 q0, q1, #2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 2, i32 1, i32 1, i32 0, i32 0, i32 0, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrshrnbq_m_n_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrshrnbq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqrshrnbt.s32 q0, q1, #12
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 12, i32 1, i32 1, i32 0, i32 0, i32 0, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrshrnbq_m_n_u16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrshrnbq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqrshrnbt.u16 q0, q1, #5
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 5, i32 1, i32 1, i32 1, i32 1, i32 0, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrshrnbq_m_n_u32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrshrnbq_m_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqrshrnbt.u32 q0, q1, #11
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 11, i32 1, i32 1, i32 1, i32 1, i32 0, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrshrntq_m_n_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrshrntq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqrshrntt.s16 q0, q1, #4
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 4, i32 1, i32 1, i32 0, i32 0, i32 1, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrshrntq_m_n_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrshrntq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqrshrntt.s32 q0, q1, #6
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 6, i32 1, i32 1, i32 0, i32 0, i32 1, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrshrntq_m_n_u16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrshrntq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqrshrntt.u16 q0, q1, #7
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 7, i32 1, i32 1, i32 1, i32 1, i32 1, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrshrntq_m_n_u32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrshrntq_m_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqrshrntt.u32 q0, q1, #15
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 15, i32 1, i32 1, i32 1, i32 1, i32 1, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrshrunbq_n_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqrshrunbq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrunb.s16 q0, q1, #7
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 7, i32 1, i32 0, i32 1, i32 0, i32 0)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrshrunbq_n_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqrshrunbq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrunb.s32 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 1, i32 1, i32 0, i32 1, i32 0, i32 0)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrshruntq_n_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vqrshruntq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrunt.s16 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %a, <8 x i16> %b, i32 1, i32 1, i32 0, i32 1, i32 0, i32 1)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrshruntq_n_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vqrshruntq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqshrunt.s32 q0, q1, #3
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %a, <4 x i32> %b, i32 3, i32 1, i32 0, i32 1, i32 0, i32 1)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrshrunbq_m_n_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrshrunbq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshrunbt.s16 q0, q1, #4
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 4, i32 1, i32 0, i32 1, i32 0, i32 0, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrshrunbq_m_n_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrshrunbq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshrunbt.s32 q0, q1, #10
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 10, i32 1, i32 0, i32 1, i32 0, i32 0, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vqrshruntq_m_n_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrshruntq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshruntt.s16 q0, q1, #3
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 3, i32 1, i32 0, i32 1, i32 0, i32 1, <8 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqrshruntq_m_n_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vqrshruntq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vqshruntt.s32 q0, q1, #13
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 13, i32 1, i32 0, i32 1, i32 0, i32 1, <4 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vsliq_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vsli.8 q0, q1, #2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 2)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vsliq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vsli.16 q0, q1, #10
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 10)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vsliq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vsli.32 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <4 x i32> @llvm.arm.mve.vsli.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vsliq_n_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vsliq_n_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vsli.8 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vsliq_n_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vsliq_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vsli.16 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 1)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsliq_n_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vsliq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vsli.32 q0, q1, #28
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <4 x i32> @llvm.arm.mve.vsli.v4i32(<4 x i32> %a, <4 x i32> %b, i32 28)
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vsliq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsliq_m_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vslit.8 q0, q1, #4
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vsli.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 4, <16 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vsliq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsliq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vslit.16 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vsli.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsliq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsliq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vslit.32 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.vsli.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 1, <4 x i1> %1)
+ ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vsliq_m_n_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsliq_m_n_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vslit.8 q0, q1, #5
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vsli.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 5, <16 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vsliq_m_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsliq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vslit.16 q0, q1, #3
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vsli.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 3, <8 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsliq_m_n_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsliq_m_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vslit.32 q0, q1, #9
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.vsli.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 9, <4 x i1> %1)
+ ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vsriq_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vsri.8 q0, q1, #3
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vsriq_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vsri.16 q0, q1, #2
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 2)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vsriq_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vsri.32 q0, q1, #28
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <4 x i32> @llvm.arm.mve.vsri.v4i32(<4 x i32> %a, <4 x i32> %b, i32 28)
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vsriq_n_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vsriq_n_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vsri.8 q0, q1, #3
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.mve.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vsriq_n_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vsriq_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vsri.16 q0, q1, #3
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <8 x i16> @llvm.arm.mve.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 3)
+ ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsriq_n_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vsriq_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vsri.32 q0, q1, #26
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <4 x i32> @llvm.arm.mve.vsri.v4i32(<4 x i32> %a, <4 x i32> %b, i32 26)
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vsriq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsriq_m_n_s8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsrit.8 q0, q1, #4
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vsri.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 4, <16 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vsriq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsriq_m_n_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsrit.16 q0, q1, #1
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vsri.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 1, <8 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsriq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsriq_m_n_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsrit.32 q0, q1, #27
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.vsri.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 27, <4 x i1> %1)
+ ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vsriq_m_n_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsriq_m_n_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsrit.8 q0, q1, #7
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+ %2 = call <16 x i8> @llvm.arm.mve.vsri.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 7, <16 x i1> %1)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vsriq_m_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsriq_m_n_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsrit.16 q0, q1, #9
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ %2 = call <8 x i16> @llvm.arm.mve.vsri.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 9, <8 x i1> %1)
+ ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsriq_m_n_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vsriq_m_n_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsrit.32 q0, q1, #13
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = call <4 x i32> @llvm.arm.mve.vsri.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 13, <4 x i1> %1)
+ ret <4 x i32> %2
+}
+
+declare <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8>, <8 x i16>, i32, i32, i32, i32, i32, i32)
+declare <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16>, <4 x i32>, i32, i32, i32, i32, i32, i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <16 x i8> @llvm.arm.mve.vshrn.predicated.v16i8.v8i16.v8i1(<16 x i8>, <8 x i16>, i32, i32, i32, i32, i32, i32, <8 x i1>)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+declare <8 x i16> @llvm.arm.mve.vshrn.predicated.v8i16.v4i32.v4i1(<8 x i16>, <4 x i32>, i32, i32, i32, i32, i32, i32, <4 x i1>)
+declare <16 x i8> @llvm.arm.mve.vsli.v16i8(<16 x i8>, <16 x i8>, i32)
+declare <8 x i16> @llvm.arm.mve.vsli.v8i16(<8 x i16>, <8 x i16>, i32)
+declare <4 x i32> @llvm.arm.mve.vsli.v4i32(<4 x i32>, <4 x i32>, i32)
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare <16 x i8> @llvm.arm.mve.vsli.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
+declare <8 x i16> @llvm.arm.mve.vsli.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
+declare <4 x i32> @llvm.arm.mve.vsli.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
+declare <16 x i8> @llvm.arm.mve.vsri.v16i8(<16 x i8>, <16 x i8>, i32)
+declare <8 x i16> @llvm.arm.mve.vsri.v8i16(<8 x i16>, <8 x i16>, i32)
+declare <4 x i32> @llvm.arm.mve.vsri.v4i32(<4 x i32>, <4 x i32>, i32)
+declare <16 x i8> @llvm.arm.mve.vsri.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
+declare <8 x i16> @llvm.arm.mve.vsri.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
+declare <4 x i32> @llvm.arm.mve.vsri.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)