diff options
Diffstat (limited to 'llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll')
| -rw-r--r-- | llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll | 2018 |
1 files changed, 2018 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll new file mode 100644 index 00000000000..5e19f81cbf5 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll @@ -0,0 +1,2018 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_s16(i8* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrbq_gather_offset_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 0) + ret <8 x i16> %0 +} + +declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8*, <8 x i16>, i32, i32, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_s32(i8* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrbq_gather_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8* %base, <4 x i32> %offset, i32 8, i32 0, i32 0) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8*, <4 x i32>, i32, i32, i32) + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_s8(i8* %base, <16 x i8> %offset) { +; CHECK-LABEL: test_vldrbq_gather_offset_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0i8.v16i8(i8* %base, <16 x i8> %offset, i32 8, i32 0, i32 0) + ret <16 x i8> %0 +} + +declare <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0i8.v16i8(i8*, <16 x i8>, i32, i32, i32) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_u16(i8* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrbq_gather_offset_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 1) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_u32(i8* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrbq_gather_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8* %base, <4 x i32> %offset, i32 8, i32 0, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_u8(i8* %base, <16 x i8> %offset) { +; CHECK-LABEL: test_vldrbq_gather_offset_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0i8.v16i8(i8* %base, <16 x i8> %offset, i32 8, i32 0, i32 1) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_z_s16(i8* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.s16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; 
CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i8.v8i16.v8i1(i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 0, <8 x i1> %1) + ret <8 x i16> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) + +declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i8.v8i16.v8i1(i8*, <8 x i16>, i32, i32, i32, <8 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_z_s32(i8* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.s32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8* %base, <4 x i32> %offset, i32 8, i32 0, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8*, <4 x i32>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_z_s8(i8* %base, <16 x i8> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u8 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0i8.v16i8.v16i1(i8* %base, <16 x i8> %offset, i32 8, i32 0, i32 0, <16 x i1> %1) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) + +declare <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0i8.v16i8.v16i1(i8*, <16 x i8>, i32, i32, i32, <16 x i1>) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_z_u16(i8* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i8.v8i16.v8i1(i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 1, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_z_u32(i8* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8* %base, <4 x i32> %offset, i32 8, i32 0, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_z_u8(i8* %base, <16 x i8> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u8 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p 
to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0i8.v16i8.v16i1(i8* %base, <16 x i8> %offset, i32 8, i32 0, i32 1, <16 x i1> %1) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_s64(<2 x i64> %addr) { +; CHECK-LABEL: test_vldrdq_gather_base_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [q0, #616] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64> %addr, i32 616) + ret <2 x i64> %0 +} + +declare <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64>, i32) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_u64(<2 x i64> %addr) { +; CHECK-LABEL: test_vldrdq_gather_base_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [q0, #336] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64> %addr, i32 336) + ret <2 x i64> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_s64(<2 x i64>* %addr) { +; CHECK-LABEL: test_vldrdq_gather_base_wb_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrd.u64 q1, [q0, #576]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64> %0, i32 576) + %2 = extractvalue { <2 x i64>, <2 x i64> } %1, 1 + store <2 x i64> %2, <2 x i64>* %addr, align 8 + %3 = extractvalue { <2 x i64>, <2 x i64> } %1, 0 + ret <2 x i64> %3 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64>, i32) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_u64(<2 x i64>* %addr) { +; CHECK-LABEL: test_vldrdq_gather_base_wb_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrd.u64 q1, [q0, #328]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64> %0, i32 328) + %2 = extractvalue { <2 x i64>, <2 x i64> } %1, 1 + store <2 x i64> %2, <2 x i64>* %addr, align 8 + %3 = extractvalue { <2 x i64>, <2 x i64> } %1, 0 + ret <2 x i64> %3 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_s64(<2 x i64>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_base_wb_z_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [q0, #664]! 
+; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 664, <4 x i1> %2) + %4 = extractvalue { <2 x i64>, <2 x i64> } %3, 1 + store <2 x i64> %4, <2 x i64>* %addr, align 8 + %5 = extractvalue { <2 x i64>, <2 x i64> } %3, 0 + ret <2 x i64> %5 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_u64(<2 x i64>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_base_wb_z_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [q0, #656]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 656, <4 x i1> %2) + %4 = extractvalue { <2 x i64>, <2 x i64> } %3, 1 + store <2 x i64> %4, <2 x i64>* %addr, align 8 + %5 = extractvalue { <2 x i64>, <2 x i64> } %3, 0 + ret <2 x i64> %5 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_z_s64(<2 x i64> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_base_z_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [q0, #888] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 888, <4 x i1> %1) + ret <2 x i64> %2 +} + +declare <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_z_u64(<2 x i64> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_base_z_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [q0, #1000] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 1000, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_s64(i64* %base, <2 x i64> %offset) { +; CHECK-LABEL: test_vldrdq_gather_offset_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 0) + ret <2 x i64> %0 +} + +declare <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64*, <2 x i64>, i32, i32, i32) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_u64(i64* %base, <2 x i64> %offset) { +; CHECK-LABEL: test_vldrdq_gather_offset_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 1) + ret 
<2 x i64> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_z_s64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_offset_z_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 0, <4 x i1> %1) + ret <2 x i64> %2 +} + +declare <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64*, <2 x i64>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_z_u64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_offset_z_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 1, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_s64(i64* %base, <2 x i64> %offset) { +; CHECK-LABEL: test_vldrdq_gather_shifted_offset_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [r0, q0, uxtw #3] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 0) + ret <2 x i64> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_u64(i64* %base, <2 x i64> %offset) { +; CHECK-LABEL: test_vldrdq_gather_shifted_offset_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [r0, q0, uxtw #3] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 1) + ret <2 x i64> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_z_s64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_shifted_offset_z_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [r0, q0, uxtw #3] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 0, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_z_u64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_shifted_offset_z_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [r0, q0, uxtw #3] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 1, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_offset_f16(half* %base, <8 x 
i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0f16.v8i16(half* %base, <8 x i16> %offset, i32 16, i32 0, i32 0) + ret <8 x half> %0 +} + +declare <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0f16.v8i16(half*, <8 x i16>, i32, i32, i32) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_s16(i16* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* %base, <8 x i16> %offset, i32 16, i32 0, i32 0) + ret <8 x i16> %0 +} + +declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16*, <8 x i16>, i32, i32, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_s32(i16* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* %base, <4 x i32> %offset, i32 16, i32 0, i32 0) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16*, <4 x i32>, i32, i32, i32) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_u16(i16* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* %base, <8 x i16> %offset, i32 16, i32 0, i32 1) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_u32(i16* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* %base, <4 x i32> %offset, i32 16, i32 0, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_offset_z_f16(half* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0f16.v8i16.v8i1(half* %base, <8 x i16> %offset, i32 16, i32 0, i32 0, <8 x i1> %1) + ret <8 x half> %2 +} + +declare <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0f16.v8i16.v8i1(half*, <8 x i16>, i32, i32, i32, <8 x i1>) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_z_s16(i16* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> 
@llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, i32 16, i32 0, i32 0, <8 x i1> %1) + ret <8 x i16> %2 +} + +declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16*, <8 x i16>, i32, i32, i32, <8 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_z_s32(i16* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.s32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* %base, <4 x i32> %offset, i32 16, i32 0, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16*, <4 x i32>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_z_u16(i16* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, i32 16, i32 0, i32 1, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_z_u32(i16* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* %base, <4 x i32> %offset, i32 16, i32 0, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_shifted_offset_f16(half* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0f16.v8i16(half* %base, <8 x i16> %offset, i32 16, i32 1, i32 0) + ret <8 x half> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_s16(i16* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* %base, <8 x i16> %offset, i32 16, i32 1, i32 0) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_s32(i16* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* %base, <4 x i32> %offset, i32 16, i32 1, i32 0) + ret <4 x i32> %0 +} + +define 
arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_u16(i16* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* %base, <8 x i16> %offset, i32 16, i32 1, i32 1) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_u32(i16* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* %base, <4 x i32> %offset, i32 16, i32 1, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_shifted_offset_z_f16(half* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0f16.v8i16.v8i1(half* %base, <8 x i16> %offset, i32 16, i32 1, i32 0, <8 x i1> %1) + ret <8 x half> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_z_s16(i16* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, i32 16, i32 1, i32 0, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_z_s32(i16* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.s32 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* %base, <4 x i32> %offset, i32 16, i32 1, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_z_u16(i16* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, i32 16, i32 1, i32 1, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_z_u32(i16* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_u32: +; 
CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u32 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* %base, <4 x i32> %offset, i32 16, i32 1, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_f32(<4 x i32> %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [q0, #12] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x float> @llvm.arm.mve.vldr.gather.base.v4f32.v4i32(<4 x i32> %addr, i32 12) + ret <4 x float> %0 +} + +declare <4 x float> @llvm.arm.mve.vldr.gather.base.v4f32.v4i32(<4 x i32>, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_s32(<4 x i32> %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [q0, #400] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32> %addr, i32 400) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32>, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_u32(<4 x i32> %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [q0, #284] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32> %addr, i32 284) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_wb_f32(<4 x i32>* %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [q0, #64]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4f32.v4i32(<4 x i32> %0, i32 64) + %2 = extractvalue { <4 x float>, <4 x i32> } %1, 1 + store <4 x i32> %2, <4 x i32>* %addr, align 8 + %3 = extractvalue { <4 x float>, <4 x i32> } %1, 0 + ret <4 x float> %3 +} + +declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4f32.v4i32(<4 x i32>, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_s32(<4 x i32>* %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [q0, #80]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> %0, i32 80) + %2 = extractvalue { <4 x i32>, <4 x i32> } %1, 1 + store <4 x i32> %2, <4 x i32>* %addr, align 8 + %3 = extractvalue { <4 x i32>, <4 x i32> } %1, 0 + ret <4 x i32> %3 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32>, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_u32(<4 x i32>* %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [q0, #480]! 
+; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> %0, i32 480) + %2 = extractvalue { <4 x i32>, <4 x i32> } %1, 1 + store <4 x i32> %2, <4 x i32>* %addr, align 8 + %3 = extractvalue { <4 x i32>, <4 x i32> } %1, 0 + ret <4 x i32> %3 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_wb_z_f32(<4 x i32>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #352]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %0, i32 352, <4 x i1> %2) + %4 = extractvalue { <4 x float>, <4 x i32> } %3, 1 + store <4 x i32> %4, <4 x i32>* %addr, align 8 + %5 = extractvalue { <4 x float>, <4 x i32> } %3, 0 + ret <4 x float> %5 +} + +declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_z_s32(<4 x i32>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #276]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 276, <4 x i1> %2) + %4 = extractvalue { <4 x i32>, <4 x i32> } %3, 1 + store <4 x i32> %4, <4 x i32>* %addr, align 8 + %5 = extractvalue { <4 x i32>, <4 x i32> } %3, 0 + ret <4 x i32> %5 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_z_u32(<4 x i32>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #88]! 
+; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 88, <4 x i1> %2) + %4 = extractvalue { <4 x i32>, <4 x i32> } %3, 1 + store <4 x i32> %4, <4 x i32>* %addr, align 8 + %5 = extractvalue { <4 x i32>, <4 x i32> } %3, 0 + ret <4 x i32> %5 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_z_f32(<4 x i32> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #300] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.vldr.gather.base.predicated.v4f32.v4i32.v4i1(<4 x i32> %addr, i32 300, <4 x i1> %1) + ret <4 x float> %2 +} + +declare <4 x float> @llvm.arm.mve.vldr.gather.base.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_z_s32(<4 x i32> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #440] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 440, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_z_u32(<4 x i32> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #300] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 300, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_offset_f32(float* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_offset_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0f32.v4i32(float* %base, <4 x i32> %offset, i32 32, i32 0, i32 0) + ret <4 x float> %0 +} + +declare <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0f32.v4i32(float*, <4 x i32>, i32, i32, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_s32(i32* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %base, <4 x i32> %offset, i32 32, i32 0, i32 0) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32*, <4 x i32>, i32, i32, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_u32(i32* %base, <4 x i32> %offset) { +; CHECK-LABEL: 
test_vldrwq_gather_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %base, <4 x i32> %offset, i32 32, i32 0, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_offset_z_f32(float* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_offset_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0f32.v4i32.v4i1(float* %base, <4 x i32> %offset, i32 32, i32 0, i32 0, <4 x i1> %1) + ret <4 x float> %2 +} + +declare <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0f32.v4i32.v4i1(float*, <4 x i32>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_z_s32(i32* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, i32 32, i32 0, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32*, <4 x i32>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_z_u32(i32* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_offset_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, i32 32, i32 0, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_shifted_offset_f32(float* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0f32.v4i32(float* %base, <4 x i32> %offset, i32 32, i32 2, i32 0) + ret <4 x float> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_s32(i32* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %base, <4 x i32> %offset, i32 32, i32 2, i32 0) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_u32(i32* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: 
+ %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %base, <4 x i32> %offset, i32 32, i32 2, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_shifted_offset_z_f32(float* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0f32.v4i32.v4i1(float* %base, <4 x i32> %offset, i32 32, i32 2, i32 0, <4 x i1> %1) + ret <4 x float> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_z_s32(i32* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, i32 32, i32 2, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_z_u32(i32* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, i32 32, i32 2, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_s16(i8* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.16 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v8i16.v8i16.v8i1(i8* %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0, <8 x i1> %1) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v8i16.v8i16.v8i1(i8*, <8 x i16>, <8 x i16>, i32, i32, <8 x i1>) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_s32(i8* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.32 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v4i32.v4i32.v4i1(i8* %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0, <4 x i1> %1) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v4i32.v4i32.v4i1(i8*, <4 x i32>, <4 x i32>, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_s8(i8* %base, <16 x i8> %offset, <16 x i8> %value, i16 
zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.8 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8* %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0, <16 x i1> %1) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8*, <16 x i8>, <16 x i8>, i32, i32, <16 x i1>) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_u16(i8* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.16 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v8i16.v8i16.v8i1(i8* %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0, <8 x i1> %1) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_u32(i8* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.32 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v4i32.v4i32.v4i1(i8* %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0, <4 x i1> %1) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_u8(i8* %base, <16 x i8> %offset, <16 x i8> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.8 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8* %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0, <16 x i1> %1) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_s16(i8* %base, <8 x i16> %offset, <8 x i16> %value) { +; CHECK-LABEL: test_vstrbq_scatter_offset_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.16 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8* %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_s32(i8* %base, <4 x i32> %offset, <4 x i32> %value) { +; CHECK-LABEL: test_vstrbq_scatter_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.32 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8* %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_s8(i8* %base, <16 x i8> %offset, <16 x i8> %value) { +; CHECK-LABEL: test_vstrbq_scatter_offset_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.8 q1, [r0, q0] +; CHECK-NEXT: bx lr 
+entry: + call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v16i8.v16i8(i8* %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.p0i8.v16i8.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_u16(i8* %base, <8 x i16> %offset, <8 x i16> %value) { +; CHECK-LABEL: test_vstrbq_scatter_offset_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.16 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8* %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_u32(i8* %base, <4 x i32> %offset, <4 x i32> %value) { +; CHECK-LABEL: test_vstrbq_scatter_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.32 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8* %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_u8(i8* %base, <16 x i8> %offset, <16 x i8> %value) { +; CHECK-LABEL: test_vstrbq_scatter_offset_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.8 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v16i8.v16i8(i8* %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_p_s64(<2 x i64> %addr, <2 x i64> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrdq_scatter_base_p_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrdt.64 q1, [q0, #888] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 888, <2 x i64> %value, <4 x i1> %1) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>) + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_p_u64(<2 x i64> %addr, <2 x i64> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrdq_scatter_base_p_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrdt.64 q1, [q0, #264] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 264, <2 x i64> %value, <4 x i1> %1) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_s64(<2 x i64> %addr, <2 x i64> %value) { +; CHECK-LABEL: test_vstrdq_scatter_base_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrd.64 q1, [q0, #408] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64> %addr, i32 408, <2 x i64> %value) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64>, i32, <2 x i64>) + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_u64(<2 x i64> %addr, <2 x i64> %value) { +; CHECK-LABEL: test_vstrdq_scatter_base_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrd.64 q1, [q0, #472] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64> %addr, i32 472, <2 x i64> %value) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_p_s64(<2 x i64>* %addr, <2 x i64> %value, i16 zeroext %p) { +; CHECK-LABEL: 
test_vstrdq_scatter_base_wb_p_s64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrdt.64 q0, [q1, #248]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <2 x i64>, <2 x i64>* %addr, align 8
+ %1 = zext i16 %p to i32
+ %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+ %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 248, <2 x i64> %value, <4 x i1> %2)
+ store <2 x i64> %3, <2 x i64>* %addr, align 8
+ ret void
+}
+
+declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_p_u64(<2 x i64>* %addr, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_base_wb_p_u64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrdt.64 q0, [q1, #136]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <2 x i64>, <2 x i64>* %addr, align 8
+ %1 = zext i16 %p to i32
+ %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+ %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 136, <2 x i64> %value, <4 x i1> %2)
+ store <2 x i64> %3, <2 x i64>* %addr, align 8
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_s64(<2 x i64>* %addr, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_base_wb_s64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vstrd.64 q0, [q1, #208]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <2 x i64>, <2 x i64>* %addr, align 8
+ %1 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64> %0, i32 208, <2 x i64> %value)
+ store <2 x i64> %1, <2 x i64>* %addr, align 8
+ ret void
+}
+
+declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64>, i32, <2 x i64>)
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_u64(<2 x i64>* %addr, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_base_wb_u64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vstrd.64 q0, [q1, #168]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <2 x i64>, <2 x i64>* %addr, align 8
+ %1 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64> %0, i32 168, <2 x i64> %value)
+ store <2 x i64> %1, <2 x i64>* %addr, align 8
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_p_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_offset_p_s64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrdt.64 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <4 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64*, <2 x i64>, <2 x i64>, i32, i32, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_p_u64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_offset_p_u64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrdt.64 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_offset_s64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrd.64 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64*, <2 x i64>, <2 x i64>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_u64(i64* %base, <2 x i64> %offset, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_offset_u64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrd.64 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_p_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_p_s64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrdt.64 q1, [r0, q0, uxtw #3]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_p_u64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_p_u64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrdt.64 q1, [r0, q0, uxtw #3]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_s64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrd.64 q1, [r0, q0, uxtw #3]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_u64(i64* %base, <2 x i64> %offset, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_u64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrd.64 q1, [r0, q0, uxtw #3]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_f16(half* %base, <8 x i16> %offset, <8 x half> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.16 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0f16.v8i16.v8f16(half* %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 0)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0f16.v8i16.v8f16(half*, <8 x i16>, <8 x half>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_f16(half* %base, <8 x i16> %offset, <8 x half> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f16.v8i16.v8f16.v8i1(half* %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 0, <8 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f16.v8i16.v8f16.v8i1(half*, <8 x i16>, <8 x half>, i32, i32, <8 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_s16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0, <8 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16*, <8 x i16>, <8 x i16>, i32, i32, <8 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_s32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0, <4 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16*, <4 x i32>, <4 x i32>, i32, i32, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_u16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0, <8 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_u32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_s16(i16* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.16 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16*, <8 x i16>, <8 x i16>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_s32(i16* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16*, <4 x i32>, <4 x i32>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_u16(i16* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.16 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_u32(i16* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_f16(half* %base, <8 x i16> %offset, <8 x half> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0f16.v8i16.v8f16(half* %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_f16(half* %base, <8 x i16> %offset, <8 x half> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f16.v8i16.v8f16.v8i1(half* %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 1, <8 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_s16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1, <8 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_s32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.32 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_u16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1, <8 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_u32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.32 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_s16(i16* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_s32(i16* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.32 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_u16(i16* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_u32(i16* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.32 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_f32(<4 x i32> %addr, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [q0, #380]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4f32(<4 x i32> %addr, i32 380, <4 x float> %value)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.v4i32.v4f32(<4 x i32>, i32, <4 x float>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_p_f32(<4 x i32> %addr, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_p_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [q0, #400]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4f32.v4i1(<4 x i32> %addr, i32 400, <4 x float> %value, <4 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4f32.v4i1(<4 x i32>, i32, <4 x float>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_p_s32(<4 x i32> %addr, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [q0, #48]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 48, <4 x i32> %value, <4 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i32>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_p_u32(<4 x i32> %addr, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [q0, #376]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 376, <4 x i32> %value, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_s32(<4 x i32> %addr, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [q0, #156]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32> %addr, i32 156, <4 x i32> %value)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32>, i32, <4 x i32>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_u32(<4 x i32> %addr, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [q0, #212]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32> %addr, i32 212, <4 x i32> %value)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_f32(<4 x i32>* %addr, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [q1, #412]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+ %1 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4f32(<4 x i32> %0, i32 412, <4 x float> %value)
+ store <4 x i32> %1, <4 x i32>* %addr, align 8
+ ret void
+}
+
+declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4f32(<4 x i32>, i32, <4 x float>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_p_f32(<4 x i32>* %addr, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_p_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q0, [q1, #236]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+ %1 = zext i16 %p to i32
+ %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+ %3 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4f32.v4i1(<4 x i32> %0, i32 236, <4 x float> %value, <4 x i1> %2)
+ store <4 x i32> %3, <4 x i32>* %addr, align 8
+ ret void
+}
+
+declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4f32.v4i1(<4 x i32>, i32, <4 x float>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_p_s32(<4 x i32>* %addr, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q0, [q1, #328]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+ %1 = zext i16 %p to i32
+ %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+ %3 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 328, <4 x i32> %value, <4 x i1> %2)
+ store <4 x i32> %3, <4 x i32>* %addr, align 8
+ ret void
+}
+
+declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i32>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_p_u32(<4 x i32>* %addr, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q0, [q1, #412]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+ %1 = zext i16 %p to i32
+ %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+ %3 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 412, <4 x i32> %value, <4 x i1> %2)
+ store <4 x i32> %3, <4 x i32>* %addr, align 8
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_s32(<4 x i32>* %addr, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [q1, #152]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+ %1 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32> %0, i32 152, <4 x i32> %value)
+ store <4 x i32> %1, <4 x i32>* %addr, align 8
+ ret void
+}
+
+declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32>, i32, <4 x i32>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_u32(<4 x i32>* %addr, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [q1, #64]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+ %1 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32> %0, i32 64, <4 x i32> %value)
+ store <4 x i32> %1, <4 x i32>* %addr, align 8
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_f32(float* %base, <4 x i32> %offset, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0f32.v4i32.v4f32(float* %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 0)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0f32.v4i32.v4f32(float*, <4 x i32>, <4 x float>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_p_f32(float* %base, <4 x i32> %offset, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_p_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f32.v4i32.v4f32.v4i1(float* %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 0, <4 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f32.v4i32.v4f32.v4i1(float*, <4 x i32>, <4 x float>, i32, i32, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_p_s32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0, <4 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32*, <4 x i32>, <4 x i32>, i32, i32, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_p_u32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_s32(i32* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32*, <4 x i32>, <4 x i32>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_u32(i32* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_f32(float* %base, <4 x i32> %offset, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0f32.v4i32.v4f32(float* %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 2)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_p_f32(float* %base, <4 x i32> %offset, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_p_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f32.v4i32.v4f32.v4i1(float* %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 2, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_p_s32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_p_u32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_s32(i32* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_u32(i32* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2)
+ ret void
+}

