author    | Simon Tatham <simon.tatham@arm.com> | 2019-10-31 17:02:07 +0000
committer | Simon Tatham <simon.tatham@arm.com> | 2019-11-06 09:01:42 +0000
commit    | 6c3fee47a6492b472be2d48cee0a85773f160df0 (patch)
tree      | 94682a37e91a0fb924bad53458d32e866d1e95c9
parent    | f0c6890f32c0d5ee7f3973181eb83fcb0a50dc1a (diff)
[ARM,MVE] Add intrinsics for gather/scatter load/stores.
This patch adds two new families of intrinsics, both of which are
memory accesses taking a vector of locations to load from / store to.
The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of
base addresses, and an immediate offset to be added consistently to
each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar
base address, and a vector of offsets to add to it. The
'shifted_offset' variants also multiply each offset by the element
size, so that the offset vector is effectively a vector of array indices.
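For orientation, here is a minimal usage sketch in the style of the new
scatter-gather.c tests added by this patch. The wrapper function names and
the particular immediate offset are illustrative only; the intrinsic names
and signatures are the ones exercised by the tests, and the byte offset
passed to a gather_base intrinsic must be a multiple of the element size
and in range (the Sema test arm-mve-immediates.c covers the exact bounds).

    #include <arm_mve.h>

    /* gather_base: the vector holds the addresses themselves; the
       immediate is a byte offset added to every lane. */
    int64x2_t load_from_addresses(uint64x2_t addresses)
    {
        return vldrdq_gather_base_s64(addresses, 0x100);
    }

    /* gather_offset: scalar base pointer plus a vector of byte offsets. */
    int32x4_t load_bytes_widened(const int8_t *base, uint32x4_t byte_offsets)
    {
        return vldrbq_gather_offset_s32(base, byte_offsets);
    }

    /* shifted_offset: each offset is scaled by the element size, so the
       offset vector behaves like a vector of array indices. */
    float32x4_t load_indexed(const float32_t *base, uint32x4_t indices)
    {
        return vldrwq_gather_shifted_offset_f32(base, indices);
    }

    /* scatter_offset store: write each lane of 'values' to
       base[byte_offsets[i]]. */
    void store_bytes(int8_t *base, uint16x8_t byte_offsets, int16x8_t values)
    {
        vstrbq_scatter_offset_s16(base, byte_offsets, values);
    }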
At the IR level, these operations are represented by a single set of
four IR intrinsics: {gather,scatter} × {base,offset}. The other
details (signed/unsigned, shift, and memory element size as opposed to
vector element size) are all specified by IR intrinsic polymorphism
and immediate operands, because that made the selection job easier
than making a huge family of similarly named intrinsics.
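Concretely, the signed and unsigned 16-bit gathers from byte memory both
lower to the same polymorphic IR intrinsic; only the trailing immediate
operands (memory element size in bits, shift amount, signedness flag)
change. A sketch matching the autogenerated checks in the new
scatter-gather.c test (the wrapper names here are illustrative):

    #include <arm_mve.h>

    /* Generated IR, per the test's CHECK lines, for the two calls below:
       call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(
           i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 0)  ; size 8, shift 0, signed
       call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(
           i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 1)  ; size 8, shift 0, unsigned */

    int16x8_t gather_i8_to_s16(const int8_t *base, uint16x8_t offset)
    {
        return vldrbq_gather_offset_s16(base, offset);
    }

    uint16x8_t gather_u8_to_u16(const uint8_t *base, uint16x8_t offset)
    {
        return vldrbq_gather_offset_u16(base, offset);
    }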
I considered using the standard IR representations such as
llvm.masked.gather, but they're not a good fit. In order to use
llvm.masked.gather to represent a gather_offset load with element size
smaller than a pointer, you'd have to expand the <8 x i16> vector of
offsets into an <8 x i16*> vector of pointers, which would be split up
during legalization, so you'd spend most of your time undoing the mess
it had made. Also, ISel support for llvm.masked.gather would be easy
enough in a trivial way (you can expand it into a gather-base load
with a zero immediate offset), but instruction-selecting lots of
fiddly idioms back into all the _other_ MVE load instructions would be
much more work. So I think dedicated IR intrinsics are the more
sensible approach, at least for the moment.
On the clang tablegen side, I've added two new features to the
Tablegen source accepted by MveEmitter: a 'CopyKind' type node for
defining a type that varies with the parameter type (it lets you ask
for an unsigned integer type of the same width as the parameter), and
an 'unsignedflag' value node for passing an immediate IR operand which
is 0 for a signed integer type or 1 for an unsigned one. That lets me
write each kind of intrinsic just once and get all its subtypes and
immediate arguments generated automatically.
Also I've tweaked the handling of pointer-typed values in the code
generation part of MveEmitter: they're generated as Address rather
than Value (i.e. including an alignment) so that they can be given to
the ordinary IR load and store operations, but I'd omitted the code to
convert them back to Value when they're going to be used as an
argument to an IR intrinsic.
On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you
not only the full assembly-language suffix for a given vector type
(like 's32' or 'u16') but also the numeric-only one used by store
instructions (just '32' or '16').
Reviewers: dmgreen
Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits
Tags: #clang, #llvm
Differential Revision: https://reviews.llvm.org/D69791
-rw-r--r-- | clang/include/clang/Basic/arm_mve.td                      |  162
-rw-r--r-- | clang/include/clang/Basic/arm_mve_defs.td                 |   25
-rw-r--r-- | clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c    | 2146
-rw-r--r-- | clang/test/Sema/arm-mve-immediates.c                      |   56
-rw-r--r-- | clang/utils/TableGen/MveEmitter.cpp                       |   40
-rw-r--r-- | llvm/include/llvm/IR/IntrinsicsARM.td                     |   44
-rw-r--r-- | llvm/lib/Target/ARM/ARMInstrMVE.td                        |  182
-rw-r--r-- | llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll | 2018
8 files changed, 4599 insertions(+), 74 deletions(-)
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td index aca0d9fa925..6e0e8ce4e5e 100644 --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -72,22 +72,158 @@ def vcvt#half#q_m_f16: Intrinsic< } // loop over half = "b", "t" -let params = T.All32, pnt = PNT_None in -def vldrwq_gather_base_wb: Intrinsic< - Vector, (args Ptr<VecOf<Unsigned<Scalar>>>:$addr, imm_mem7bit<4>:$offset), - (seq (IRInt<"vldr_gather_base_wb", [Vector, VecOf<Unsigned<Scalar>>]> +multiclass gather_base<list<Type> types, int size> { + let params = types, pnt = PNT_None in { + def _gather_base: Intrinsic< + Vector, (args UVector:$addr, imm_mem7bit<size>:$offset), + (IRInt<"vldr_gather_base", [Vector, UVector]> $addr, $offset)>; + + def _gather_base_z: Intrinsic< + Vector, (args UVector:$addr, imm_mem7bit<size>:$offset, Predicate:$pred), + (IRInt<"vldr_gather_base_predicated", [Vector, UVector, Predicate]> + $addr, $offset, $pred)>; + + def _gather_base_wb: Intrinsic< + Vector, (args Ptr<UVector>:$addr, imm_mem7bit<size>:$offset), + (seq (IRInt<"vldr_gather_base_wb", [Vector, UVector]> (load $addr), $offset):$pair, - (store (xval $pair, 1), $addr), - (xval $pair, 0))>; + (store (xval $pair, 1), $addr), + (xval $pair, 0))>; -let params = T.All64, pnt = PNT_None in -def vldrdq_gather_base_wb_z: Intrinsic< - Vector, (args Ptr<VecOf<Unsigned<Scalar>>>:$addr, imm_mem7bit<8>:$offset, - Predicate:$pred), - (seq (IRInt<"vldr_gather_base_wb_predicated", [Vector, VecOf<Unsigned<Scalar>>, Predicate]> + def _gather_base_wb_z: Intrinsic< + Vector, (args Ptr<UVector>:$addr, imm_mem7bit<size>:$offset, + Predicate:$pred), + (seq (IRInt<"vldr_gather_base_wb_predicated", + [Vector, UVector, Predicate]> (load $addr), $offset, $pred):$pair, - (store (xval $pair, 1), $addr), - (xval $pair, 0))>; + (store (xval $pair, 1), $addr), + (xval $pair, 0))>; + } +} + +defm vldrwq: gather_base<T.All32, 4>; +defm vldrdq: gather_base<T.All64, 8>; + +multiclass scatter_base<list<Type> types, int size> { + let params = types in { + def _scatter_base: Intrinsic< + Void, (args UVector:$addr, imm_mem7bit<size>:$offset, Vector:$data), + (IRInt<"vstr_scatter_base", [UVector, Vector]> $addr, $offset, $data)>; + + def _scatter_base_p: Intrinsic< + Void, (args UVector:$addr, imm_mem7bit<size>:$offset, Vector:$data, + Predicate:$pred), + (IRInt<"vstr_scatter_base_predicated", [UVector, Vector, Predicate]> + $addr, $offset, $data, $pred)>; + + def _scatter_base_wb: Intrinsic< + Void, (args Ptr<UVector>:$addr, imm_mem7bit<size>:$offset, Vector:$data), + (seq (IRInt<"vstr_scatter_base_wb", [UVector, Vector]> + (load $addr), $offset, $data):$wbaddr, + (store $wbaddr, $addr))>; + + def _scatter_base_wb_p: Intrinsic< + Void, (args Ptr<UVector>:$addr, imm_mem7bit<size>:$offset, + Vector:$data, Predicate:$pred), + (seq (IRInt<"vstr_scatter_base_wb_predicated", + [UVector, Vector, Predicate]> + (load $addr), $offset, $data, $pred):$wbaddr, + (store $wbaddr, $addr))>; + } +} + +defm vstrwq: scatter_base<T.All32, 4>; +defm vstrdq: scatter_base<T.All64, 8>; + +multiclass gather_offset_unshifted<list<Type> types, PrimitiveType memtype> { + let params = types in { + def _gather_offset: Intrinsic< + Vector, (args CPtr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets), + (IRInt<"vldr_gather_offset", + [Vector, CPtr<CopyKind<memtype, Scalar>>, UVector]> + $base, $offsets, memtype.size, 0, (unsignedflag Scalar))>; + def _gather_offset_z: Intrinsic< + Vector, (args CPtr<CopyKind<memtype, Scalar>>:$base, 
UVector:$offsets, + Predicate:$pred), + (IRInt<"vldr_gather_offset_predicated", + [Vector, CPtr<CopyKind<memtype, Scalar>>, UVector, Predicate]> + $base, $offsets, memtype.size, 0, (unsignedflag Scalar), $pred)>; + } +} + +multiclass gather_offset_shifted<list<Type> types, PrimitiveType memtype, + int shift> { + let params = types in { + def _gather_shifted_offset: Intrinsic< + Vector, (args CPtr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets), + (IRInt<"vldr_gather_offset", + [Vector, CPtr<CopyKind<memtype, Scalar>>, UVector]> + $base, $offsets, memtype.size, shift, (unsignedflag Scalar))>; + def _gather_shifted_offset_z: Intrinsic< + Vector, (args CPtr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets, + Predicate:$pred), + (IRInt<"vldr_gather_offset_predicated", + [Vector, CPtr<CopyKind<memtype, Scalar>>, UVector, Predicate]> + $base, $offsets, memtype.size, shift, (unsignedflag Scalar), $pred)>; + } +} + +multiclass gather_offset_both<list<Type> types, PrimitiveType memtype, + int shift> { + defm "": gather_offset_unshifted<types, memtype>; + defm "": gather_offset_shifted<types, memtype, shift>; +} + +defm vldrbq: gather_offset_unshifted<!listconcat(T.All8, T.Int16, T.Int32), u8>; +defm vldrhq: gather_offset_both<!listconcat(T.All16, T.Int32), u16, 1>; +defm vldrwq: gather_offset_both<T.All32, u32, 2>; +defm vldrdq: gather_offset_both<T.Int64, u64, 3>; + +multiclass scatter_offset_unshifted<list<Type> types, PrimitiveType memtype> { + let params = types in { + def _scatter_offset: Intrinsic< + Void, (args Ptr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets, + Vector:$data), + (IRInt<"vstr_scatter_offset", + [Ptr<CopyKind<memtype, Scalar>>, UVector, Vector]> + $base, $offsets, $data, memtype.size, 0)>; + def _scatter_offset_p: Intrinsic< + Void, (args Ptr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets, + Vector:$data, Predicate:$pred), + (IRInt<"vstr_scatter_offset_predicated", + [Ptr<CopyKind<memtype, Scalar>>, UVector, Vector, Predicate]> + $base, $offsets, $data, memtype.size, 0, $pred)>; + } +} + +multiclass scatter_offset_shifted<list<Type> types, PrimitiveType memtype, + int shift> { + let params = types in { + def _scatter_shifted_offset: Intrinsic< + Void, (args Ptr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets, + Vector:$data), + (IRInt<"vstr_scatter_offset", + [Ptr<CopyKind<memtype, Scalar>>, UVector, Vector]> + $base, $offsets, $data, memtype.size, shift)>; + def _scatter_shifted_offset_p: Intrinsic< + Void, (args Ptr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets, + Vector:$data, Predicate:$pred), + (IRInt<"vstr_scatter_offset_predicated", + [Ptr<CopyKind<memtype, Scalar>>, UVector, Vector, Predicate]> + $base, $offsets, $data, memtype.size, shift, $pred)>; + } +} + +multiclass scatter_offset_both<list<Type> types, PrimitiveType memtype, + int shift> { + defm "": scatter_offset_unshifted<types, memtype>; + defm "": scatter_offset_shifted<types, memtype, shift>; +} + +defm vstrbq: scatter_offset_unshifted<!listconcat(T.All8,T.Int16,T.Int32), u8>; +defm vstrhq: scatter_offset_both<!listconcat(T.All16, T.Int32), u16, 1>; +defm vstrwq: scatter_offset_both<T.All32, u32, 2>; +defm vstrdq: scatter_offset_both<T.Int64, u64, 3>; let params = [Void], pnt = PNT_None in def urshrl: Intrinsic<u64, (args u64:$value, imm_1to32:$shift), diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td index 14afc04a825..3d9333f3d44 100644 --- a/clang/include/clang/Basic/arm_mve_defs.td +++ b/clang/include/clang/Basic/arm_mve_defs.td @@ 
-82,6 +82,11 @@ class IRInt<string name_, list<Type> params_ = [], bit appendKind_ = 0> { // the return value of the seq construction as a whole. def seq; +// Another magic operation is 'unsignedflag', which you give a scalar +// _type_ as an argument, and it expands into 1 for an unsigned type +// and 0 for a signed (or floating) one. +def unsignedflag; + // If you put CustomCodegen<"foo"> in an intrinsic's codegen field, it // indicates that the IR generation for that intrinsic is done by handwritten // C++ and not autogenerated at all. The effect in the MVE builtin codegen @@ -109,7 +114,7 @@ def CTO_Vec: ComplexTypeOp; def CTO_Pred: ComplexTypeOp; class CTO_Tuple<int n_>: ComplexTypeOp { int n = n_; } class CTO_Pointer<bit const_>: ComplexTypeOp { bit const = const_; } -class CTO_Sign<bit signed_>: ComplexTypeOp { bit signed = signed_; } +def CTO_CopyKind: ComplexTypeOp; // ----------------------------------------------------------------------------- // Instances of Type intended to be used directly in the specification of an @@ -167,10 +172,20 @@ class MultiVector<int n>: ComplexType<(CTO_Tuple<n> Vector)>; class Ptr<Type t>: ComplexType<(CTO_Pointer<0> t)>; class CPtr<Type t>: ComplexType<(CTO_Pointer<1> t)>; -// Unsigned<t> expects t to be a scalar, and expands to the unsigned integer -// scalar of the same size. So it returns u16 if you give it s16 or f16 (or -// u16 itself). -class Unsigned<Type t>: ComplexType<(CTO_Sign<0> t)>; +// CopyKind<s,k> expects s and k to be scalar types. It returns a scalar type +// whose kind (signed, unsigned or float) matches that of k, and whose size +// matches that of s. +class CopyKind<Type s, Type k>: ComplexType<(CTO_CopyKind s, k)>; + +// Unsigned<t> expects t to be a scalar type, and expands to the unsigned +// integer scalar of the same size. So it returns u16 if you give it s16 or +// f16 (or u16 itself). +class Unsigned<Type t>: ComplexType<(CTO_CopyKind t, u32)>; + +// UScalar and UVector expand to the unsigned-integer versions of +// Scalar and Vector. +def UScalar: Unsigned<Scalar>; +def UVector: VecOf<UScalar>; // ----------------------------------------------------------------------------- // Internal definitions for specifying immediate arguments for an intrinsic. 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c b/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c new file mode 100644 index 00000000000..830f62442c3 --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c @@ -0,0 +1,2146 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include <arm_mve.h> + +// CHECK-LABEL: @test_vldrbq_gather_offset_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 8, i32 0, i32 0) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vldrbq_gather_offset_s16(const int8_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_s16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 8, i32 0, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vldrbq_gather_offset_s32(const int8_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_s32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0i8.v16i8(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], i32 8, i32 0, i32 0) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_vldrbq_gather_offset_s8(const int8_t *base, uint8x16_t offset) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_s8(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 8, i32 0, i32 1) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +uint16x8_t test_vldrbq_gather_offset_u16(const uint8_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_u16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 8, i32 0, i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vldrbq_gather_offset_u32(const uint8_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_u32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> 
@llvm.arm.mve.vldr.gather.offset.v16i8.p0i8.v16i8(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], i32 8, i32 0, i32 1) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vldrbq_gather_offset_u8(const uint8_t *base, uint8x16_t offset) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_u8(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_z_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i8.v8i16.v8i1(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 8, i32 0, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vldrbq_gather_offset_z_s16(const int8_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_z_s16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 8, i32 0, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrbq_gather_offset_z_s32(const int8_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_z_s32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_z_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0i8.v16i8.v16i1(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], i32 8, i32 0, i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vldrbq_gather_offset_z_s8(const int8_t *base, uint8x16_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_z_s8(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_z_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i8.v8i16.v8i1(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 8, i32 0, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vldrbq_gather_offset_z_u16(const uint8_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_z_u16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> 
@llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 8, i32 0, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrbq_gather_offset_z_u32(const uint8_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_z_u32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_z_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0i8.v16i8.v16i1(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], i32 8, i32 0, i32 1, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vldrbq_gather_offset_z_u8(const uint8_t *base, uint8x16_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_z_u8(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_base_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64> [[ADDR:%.*]], i32 616) +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +int64x2_t test_vldrdq_gather_base_s64(uint64x2_t addr) +{ + return vldrdq_gather_base_s64(addr, 0x268); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64> [[ADDR:%.*]], i32 336) +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +uint64x2_t test_vldrdq_gather_base_u64(uint64x2_t addr) +{ + return vldrdq_gather_base_u64(addr, 0x150); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_wb_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64> [[TMP0]], i32 576) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP1]], 1 +// CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP1]], 0 +// CHECK-NEXT: ret <2 x i64> [[TMP3]] +// +int64x2_t test_vldrdq_gather_base_wb_s64(uint64x2_t *addr) +{ + return vldrdq_gather_base_wb_s64(addr, 0x240); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_wb_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64> [[TMP0]], i32 328) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP1]], 1 +// CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP1]], 0 +// CHECK-NEXT: ret <2 x i64> [[TMP3]] +// +uint64x2_t test_vldrdq_gather_base_wb_u64(uint64x2_t *addr) +{ + return vldrdq_gather_base_wb_u64(addr, 0x148); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_wb_z_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext 
i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 664, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 1 +// CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 0 +// CHECK-NEXT: ret <2 x i64> [[TMP5]] +// +int64x2_t test_vldrdq_gather_base_wb_z_s64(uint64x2_t *addr, mve_pred16_t p) +{ + return vldrdq_gather_base_wb_z_s64(addr, 0x298, p); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_wb_z_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 656, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 1 +// CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 0 +// CHECK-NEXT: ret <2 x i64> [[TMP5]] +// +uint64x2_t test_vldrdq_gather_base_wb_z_u64(uint64x2_t *addr, mve_pred16_t p) +{ + return vldrdq_gather_base_wb_z_u64(addr, 0x290, p); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_z_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> [[ADDR:%.*]], i32 888, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// +int64x2_t test_vldrdq_gather_base_z_s64(uint64x2_t addr, mve_pred16_t p) +{ + return vldrdq_gather_base_z_s64(addr, 0x378, p); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_z_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> [[ADDR:%.*]], i32 1000, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// +uint64x2_t test_vldrdq_gather_base_z_u64(uint64x2_t addr, mve_pred16_t p) +{ + return vldrdq_gather_base_z_u64(addr, 0x3e8, p); +} + +// CHECK-LABEL: @test_vldrdq_gather_offset_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 0) +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +int64x2_t test_vldrdq_gather_offset_s64(const int64_t *base, uint64x2_t offset) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrdq_gather_offset_s64(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_offset_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 1) +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +uint64x2_t test_vldrdq_gather_offset_u64(const uint64_t 
*base, uint64x2_t offset) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrdq_gather_offset_u64(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_offset_z_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// +int64x2_t test_vldrdq_gather_offset_z_s64(const int64_t *base, uint64x2_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrdq_gather_offset_z_s64(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_offset_z_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// +uint64x2_t test_vldrdq_gather_offset_z_u64(const uint64_t *base, uint64x2_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrdq_gather_offset_z_u64(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_shifted_offset_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 3, i32 0) +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +int64x2_t test_vldrdq_gather_shifted_offset_s64(const int64_t *base, uint64x2_t offset) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrdq_gather_shifted_offset_s64(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_shifted_offset_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 3, i32 1) +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +uint64x2_t test_vldrdq_gather_shifted_offset_u64(const uint64_t *base, uint64x2_t offset) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrdq_gather_shifted_offset_u64(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_shifted_offset_z_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 3, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// +int64x2_t test_vldrdq_gather_shifted_offset_z_s64(const int64_t *base, uint64x2_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrdq_gather_shifted_offset_z_s64(base, 
offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_shifted_offset_z_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 3, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// +uint64x2_t test_vldrdq_gather_shifted_offset_z_u64(const uint64_t *base, uint64x2_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrdq_gather_shifted_offset_z_u64(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0f16.v8i16(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 0, i32 0) +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// +float16x8_t test_vldrhq_gather_offset_f16(const float16_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_f16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 0, i32 0) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vldrhq_gather_offset_s16(const int16_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_s16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 0, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vldrhq_gather_offset_s32(const int16_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_s32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 0, i32 1) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +uint16x8_t test_vldrhq_gather_offset_u16(const uint16_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_u16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 0, i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vldrhq_gather_offset_u32(const uint16_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_u32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_z_f16( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0f16.v8i16.v8i1(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 0, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x half> [[TMP2]] +// +float16x8_t test_vldrhq_gather_offset_z_f16(const float16_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_z_f16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_z_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 0, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vldrhq_gather_offset_z_s16(const int16_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_z_s16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 0, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrhq_gather_offset_z_s32(const int16_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_z_s32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_z_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 0, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vldrhq_gather_offset_z_u16(const uint16_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_z_u16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 0, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrhq_gather_offset_z_u32(const uint16_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset_z(base, offset, 
p); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_z_u32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0f16.v8i16(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 1, i32 0) +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// +float16x8_t test_vldrhq_gather_shifted_offset_f16(const float16_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_f16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 1, i32 0) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vldrhq_gather_shifted_offset_s16(const int16_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_s16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 1, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vldrhq_gather_shifted_offset_s32(const int16_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_s32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 1, i32 1) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +uint16x8_t test_vldrhq_gather_shifted_offset_u16(const uint16_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_u16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 1, i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vldrhq_gather_shifted_offset_u32(const uint16_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_u32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_z_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0f16.v8i16.v8i1(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 1, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x half> [[TMP2]] +// +float16x8_t test_vldrhq_gather_shifted_offset_z_f16(const float16_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC 
+ return vldrhq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_z_f16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_z_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 1, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vldrhq_gather_shifted_offset_z_s16(const int16_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_z_s16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 1, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrhq_gather_shifted_offset_z_s32(const int16_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_z_s32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_z_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 1, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vldrhq_gather_shifted_offset_z_u16(const uint16_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_z_u16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 1, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrhq_gather_shifted_offset_z_u32(const uint16_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_z_u32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_base_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vldr.gather.base.v4f32.v4i32(<4 x i32> [[ADDR:%.*]], i32 12) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t 
test_vldrwq_gather_base_f32(uint32x4_t addr) +{ + return vldrwq_gather_base_f32(addr, 0xc); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32> [[ADDR:%.*]], i32 400) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vldrwq_gather_base_s32(uint32x4_t addr) +{ + return vldrwq_gather_base_s32(addr, 0x190); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32> [[ADDR:%.*]], i32 284) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vldrwq_gather_base_u32(uint32x4_t addr) +{ + return vldrwq_gather_base_u32(addr, 0x11c); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_wb_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4f32.v4i32(<4 x i32> [[TMP0]], i32 64) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP1]], 1 +// CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP1]], 0 +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// +float32x4_t test_vldrwq_gather_base_wb_f32(uint32x4_t *addr) +{ + return vldrwq_gather_base_wb_f32(addr, 0x40); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_wb_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> [[TMP0]], i32 80) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP1]], 1 +// CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP1]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +int32x4_t test_vldrwq_gather_base_wb_s32(uint32x4_t *addr) +{ + return vldrwq_gather_base_wb_s32(addr, 0x50); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> [[TMP0]], i32 480) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP1]], 1 +// CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP1]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_vldrwq_gather_base_wb_u32(uint32x4_t *addr) +{ + return vldrwq_gather_base_wb_u32(addr, 0x1e0); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_wb_z_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP0]], i32 352, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP3]], 1 +// CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP3]], 0 +// 
CHECK-NEXT: ret <4 x float> [[TMP5]] +// +float32x4_t test_vldrwq_gather_base_wb_z_f32(uint32x4_t *addr, mve_pred16_t p) +{ + return vldrwq_gather_base_wb_z_f32(addr, 0x160, p); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_wb_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> [[TMP0]], i32 276, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP3]], 1 +// CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP3]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP5]] +// +int32x4_t test_vldrwq_gather_base_wb_z_s32(uint32x4_t *addr, mve_pred16_t p) +{ + return vldrwq_gather_base_wb_z_s32(addr, 0x114, p); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_wb_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> [[TMP0]], i32 88, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP3]], 1 +// CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP3]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP5]] +// +uint32x4_t test_vldrwq_gather_base_wb_z_u32(uint32x4_t *addr, mve_pred16_t p) +{ + return vldrwq_gather_base_wb_z_u32(addr, 0x58, p); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_z_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vldr.gather.base.predicated.v4f32.v4i32.v4i1(<4 x i32> [[ADDR:%.*]], i32 300, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vldrwq_gather_base_z_f32(uint32x4_t addr, mve_pred16_t p) +{ + return vldrwq_gather_base_z_f32(addr, 0x12c, p); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32> [[ADDR:%.*]], i32 440, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrwq_gather_base_z_s32(uint32x4_t addr, mve_pred16_t p) +{ + return vldrwq_gather_base_z_s32(addr, 0x1b8, p); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32> [[ADDR:%.*]], i32 300, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrwq_gather_base_z_u32(uint32x4_t addr, mve_pred16_t p) +{ + 
return vldrwq_gather_base_z_u32(addr, 0x12c, p); +} + +// CHECK-LABEL: @test_vldrwq_gather_offset_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0f32.v4i32(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 0, i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t test_vldrwq_gather_offset_f32(const float32_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrwq_gather_offset_f32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 0, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vldrwq_gather_offset_s32(const int32_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrwq_gather_offset_s32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 0, i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vldrwq_gather_offset_u32(const uint32_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrwq_gather_offset_u32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_offset_z_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0f32.v4i32.v4i1(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 0, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vldrwq_gather_offset_z_f32(const float32_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrwq_gather_offset_z_f32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_offset_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 0, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrwq_gather_offset_z_s32(const int32_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrwq_gather_offset_z_s32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_offset_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 0, i32 
1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrwq_gather_offset_z_u32(const uint32_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrwq_gather_offset_z_u32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_shifted_offset_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0f32.v4i32(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 2, i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t test_vldrwq_gather_shifted_offset_f32(const float32_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrwq_gather_shifted_offset_f32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_shifted_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 2, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vldrwq_gather_shifted_offset_s32(const int32_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrwq_gather_shifted_offset_s32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_shifted_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 2, i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vldrwq_gather_shifted_offset_u32(const uint32_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrwq_gather_shifted_offset_u32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_shifted_offset_z_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0f32.v4i32.v4i1(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 2, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vldrwq_gather_shifted_offset_z_f32(const float32_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrwq_gather_shifted_offset_z_f32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_shifted_offset_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 2, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrwq_gather_shifted_offset_z_s32(const int32_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return 
vldrwq_gather_shifted_offset_z_s32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_shifted_offset_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 2, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrwq_gather_shifted_offset_z_u32(const uint32_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrwq_gather_shifted_offset_z_u32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v8i16.v8i16.v8i1(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 8, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_p_s16(int8_t *base, uint16x8_t offset, int16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_p_s16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v4i32.v4i32.v4i1(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 8, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_p_s32(int8_t *base, uint32x4_t offset, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_p_s32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], <16 x i8> [[VALUE:%.*]], i32 8, i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_p_s8(int8_t *base, uint8x16_t offset, int8x16_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_p_s8(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v8i16.v8i16.v8i1(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 8, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_p_u16(uint8_t 
*base, uint16x8_t offset, uint16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_p_u16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v4i32.v4i32.v4i1(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 8, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_p_u32(uint8_t *base, uint32x4_t offset, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_p_u32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_p_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], <16 x i8> [[VALUE:%.*]], i32 8, i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_p_u8(uint8_t *base, uint8x16_t offset, uint8x16_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_p_u8(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 8, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_s16(int8_t *base, uint16x8_t offset, int16x8_t value) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_s16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 8, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_s32(int8_t *base, uint32x4_t offset, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_s32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v16i8.v16i8(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], <16 x i8> [[VALUE:%.*]], i32 8, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_s8(int8_t *base, uint8x16_t offset, int8x16_t value) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_s8(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 8, i32 0) +// CHECK-NEXT: ret void +// +void 
test_vstrbq_scatter_offset_u16(uint8_t *base, uint16x8_t offset, uint16x8_t value) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_u16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 8, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_u32(uint8_t *base, uint32x4_t offset, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_u32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v16i8.v16i8(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], <16 x i8> [[VALUE:%.*]], i32 8, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_u8(uint8_t *base, uint8x16_t offset, uint8x16_t value) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_u8(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_p_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> [[ADDR:%.*]], i32 888, <2 x i64> [[VALUE:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_p_s64(uint64x2_t addr, int64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base_p(addr, 0x378, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_p_s64(addr, 0x378, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_p_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> [[ADDR:%.*]], i32 264, <2 x i64> [[VALUE:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_p_u64(uint64x2_t addr, uint64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base_p(addr, 0x108, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_p_u64(addr, 0x108, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64> [[ADDR:%.*]], i32 408, <2 x i64> [[VALUE:%.*]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_s64(uint64x2_t addr, int64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base(addr, 0x198, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_s64(addr, 0x198, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64> [[ADDR:%.*]], i32 472, <2 x i64> [[VALUE:%.*]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_u64(uint64x2_t addr, uint64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base(addr, 0x1d8, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_u64(addr, 0x1d8, 
value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_wb_p_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 248, <2 x i64> [[VALUE:%.*]], <4 x i1> [[TMP2]]) +// CHECK-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_wb_p_s64(uint64x2_t *addr, int64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base_wb_p(addr, 0xf8, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_wb_p_s64(addr, 0xf8, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_wb_p_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 136, <2 x i64> [[VALUE:%.*]], <4 x i1> [[TMP2]]) +// CHECK-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_wb_p_u64(uint64x2_t *addr, uint64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base_wb_p(addr, 0x88, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_wb_p_u64(addr, 0x88, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_wb_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64> [[TMP0]], i32 208, <2 x i64> [[VALUE:%.*]]) +// CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_wb_s64(uint64x2_t *addr, int64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base_wb(addr, 0xd0, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_wb_s64(addr, 0xd0, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_wb_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64> [[TMP0]], i32 168, <2 x i64> [[VALUE:%.*]]) +// CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_wb_u64(uint64x2_t *addr, uint64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base_wb(addr, 0xa8, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_wb_u64(addr, 0xa8, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_offset_p_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_offset_p_s64(int64_t *base, 
uint64x2_t offset, int64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_offset_p_s64(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_offset_p_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_offset_p_u64(uint64_t *base, uint64x2_t offset, uint64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_offset_p_u64(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_offset_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_offset_s64(int64_t *base, uint64x2_t offset, int64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_offset_s64(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_offset_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_offset_u64(uint64_t *base, uint64x2_t offset, uint64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_offset_u64(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_shifted_offset_p_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 3, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_shifted_offset_p_s64(int64_t *base, uint64x2_t offset, int64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_shifted_offset_p_s64(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_shifted_offset_p_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 3, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_shifted_offset_p_u64(uint64_t *base, uint64x2_t offset, uint64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_shifted_offset_p_u64(base, offset, value, p); +#endif /* 
POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_shifted_offset_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 3) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_shifted_offset_s64(int64_t *base, uint64x2_t offset, int64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_shifted_offset_s64(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_shifted_offset_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 3) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_shifted_offset_u64(uint64_t *base, uint64x2_t offset, uint64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_shifted_offset_u64(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0f16.v8i16.v8f16(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x half> [[VALUE:%.*]], i32 16, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_f16(float16_t *base, uint16x8_t offset, float16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_f16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_p_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f16.v8i16.v8f16.v8i1(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x half> [[VALUE:%.*]], i32 16, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_p_f16(float16_t *base, uint16x8_t offset, float16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_p_f16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_p_s16(int16_t *base, uint16x8_t offset, int16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_p_s16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 0, 
<4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_p_s32(int16_t *base, uint32x4_t offset, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_p_s32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_p_u16(uint16_t *base, uint16x8_t offset, uint16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_p_u16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_p_u32(uint16_t *base, uint32x4_t offset, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_p_u32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_s16(int16_t *base, uint16x8_t offset, int16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_s16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_s32(int16_t *base, uint32x4_t offset, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_s32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_u16(uint16_t *base, uint16x8_t offset, uint16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_u16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void 
@llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_u32(uint16_t *base, uint32x4_t offset, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_u32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0f16.v8i16.v8f16(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x half> [[VALUE:%.*]], i32 16, i32 1) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_f16(float16_t *base, uint16x8_t offset, float16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_f16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_p_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f16.v8i16.v8f16.v8i1(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x half> [[VALUE:%.*]], i32 16, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_p_f16(float16_t *base, uint16x8_t offset, float16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_p_f16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_p_s16(int16_t *base, uint16x8_t offset, int16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_p_s16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_p_s32(int16_t *base, uint32x4_t offset, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_p_s32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: 
call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_p_u16(uint16_t *base, uint16x8_t offset, uint16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_p_u16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_p_u32(uint16_t *base, uint32x4_t offset, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_p_u32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 1) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_s16(int16_t *base, uint16x8_t offset, int16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_s16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 1) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_s32(int16_t *base, uint32x4_t offset, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_s32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 1) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_u16(uint16_t *base, uint16x8_t offset, uint16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_u16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 1) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_u32(uint16_t *base, uint32x4_t offset, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_u32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// 
CHECK-LABEL: @test_vstrwq_scatter_base_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4f32(<4 x i32> [[ADDR:%.*]], i32 380, <4 x float> [[VALUE:%.*]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_f32(uint32x4_t addr, float32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base(addr, 0x17c, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_f32(addr, 0x17c, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4f32.v4i1(<4 x i32> [[ADDR:%.*]], i32 400, <4 x float> [[VALUE:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_p_f32(uint32x4_t addr, float32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_p(addr, 0x190, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_p_f32(addr, 0x190, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32> [[ADDR:%.*]], i32 48, <4 x i32> [[VALUE:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_p_s32(uint32x4_t addr, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_p(addr, 0x30, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_p_s32(addr, 0x30, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32> [[ADDR:%.*]], i32 376, <4 x i32> [[VALUE:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_p_u32(uint32x4_t addr, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_p(addr, 0x178, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_p_u32(addr, 0x178, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32> [[ADDR:%.*]], i32 156, <4 x i32> [[VALUE:%.*]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_s32(uint32x4_t addr, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base(addr, 0x9c, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_s32(addr, 0x9c, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32> [[ADDR:%.*]], i32 212, <4 x i32> [[VALUE:%.*]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_u32(uint32x4_t addr, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base(addr, 0xd4, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_u32(addr, 0xd4, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_wb_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* 
[[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4f32(<4 x i32> [[TMP0]], i32 412, <4 x float> [[VALUE:%.*]]) +// CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_wb_f32(uint32x4_t *addr, float32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_wb(addr, 0x19c, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_wb_f32(addr, 0x19c, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_wb_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4f32.v4i1(<4 x i32> [[TMP0]], i32 236, <4 x float> [[VALUE:%.*]], <4 x i1> [[TMP2]]) +// CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_wb_p_f32(uint32x4_t *addr, float32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_wb_p(addr, 0xec, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_wb_p_f32(addr, 0xec, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_wb_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> [[TMP0]], i32 328, <4 x i32> [[VALUE:%.*]], <4 x i1> [[TMP2]]) +// CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_wb_p_s32(uint32x4_t *addr, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_wb_p(addr, 0x148, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_wb_p_s32(addr, 0x148, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_wb_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> [[TMP0]], i32 412, <4 x i32> [[VALUE:%.*]], <4 x i1> [[TMP2]]) +// CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_wb_p_u32(uint32x4_t *addr, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_wb_p(addr, 0x19c, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_wb_p_u32(addr, 0x19c, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_wb_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32> [[TMP0]], i32 152, <4 x i32> [[VALUE:%.*]]) +// CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_wb_s32(uint32x4_t *addr, 
int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_wb(addr, 0x98, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_wb_s32(addr, 0x98, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32> [[TMP0]], i32 64, <4 x i32> [[VALUE:%.*]]) +// CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_wb_u32(uint32x4_t *addr, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_wb(addr, 0x40, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_wb_u32(addr, 0x40, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_offset_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0f32.v4i32.v4f32(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x float> [[VALUE:%.*]], i32 32, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_offset_f32(float32_t *base, uint32x4_t offset, float32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_offset_f32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_offset_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f32.v4i32.v4f32.v4i1(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x float> [[VALUE:%.*]], i32 32, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_offset_p_f32(float32_t *base, uint32x4_t offset, float32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_offset_p_f32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_offset_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_offset_p_s32(int32_t *base, uint32x4_t offset, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_offset_p_s32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_offset_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_offset_p_u32(uint32_t *base, uint32x4_t offset, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_offset_p_u32(base, offset, 
value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_offset_s32(int32_t *base, uint32x4_t offset, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_offset_s32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_offset_u32(uint32_t *base, uint32x4_t offset, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_offset_u32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_shifted_offset_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0f32.v4i32.v4f32(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x float> [[VALUE:%.*]], i32 32, i32 2) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_shifted_offset_f32(float32_t *base, uint32x4_t offset, float32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_shifted_offset_f32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_shifted_offset_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f32.v4i32.v4f32.v4i1(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x float> [[VALUE:%.*]], i32 32, i32 2, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_shifted_offset_p_f32(float32_t *base, uint32x4_t offset, float32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_shifted_offset_p_f32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_shifted_offset_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 2, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_shifted_offset_p_s32(int32_t *base, uint32x4_t offset, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_shifted_offset_p_s32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_shifted_offset_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* [[BASE:%.*]], 
<4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 2, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_shifted_offset_p_u32(uint32_t *base, uint32x4_t offset, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_shifted_offset_p_u32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_shifted_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 2) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_shifted_offset_s32(int32_t *base, uint32x4_t offset, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_shifted_offset_s32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_shifted_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 2) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_shifted_offset_u32(uint32_t *base, uint32x4_t offset, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_shifted_offset_u32(base, offset, value); +#endif /* POLYMORPHIC */ +} + diff --git a/clang/test/Sema/arm-mve-immediates.c b/clang/test/Sema/arm-mve-immediates.c new file mode 100644 index 00000000000..cdf68b8a949 --- /dev/null +++ b/clang/test/Sema/arm-mve-immediates.c @@ -0,0 +1,56 @@ +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -verify -fsyntax-only %s + +#include <arm_mve.h> + +void test_load_offsets(uint32x4_t addr32, uint64x2_t addr64) +{ + // Offsets that should be a multiple of 8 times 0,1,...,127 + vldrdq_gather_base_s64(addr64, 0); + vldrdq_gather_base_s64(addr64, 8); + vldrdq_gather_base_s64(addr64, 2*8); + vldrdq_gather_base_s64(addr64, 125*8); + vldrdq_gather_base_s64(addr64, 126*8); + vldrdq_gather_base_s64(addr64, 127*8); + vldrdq_gather_base_s64(addr64, -8); // expected-error {{argument value -8 is outside the valid range [0, 1016]}} + vldrdq_gather_base_s64(addr64, 128*8); // expected-error {{argument value 1024 is outside the valid range [0, 1016]}} + vldrdq_gather_base_s64(addr64, 4); // expected-error {{argument should be a multiple of 8}} + vldrdq_gather_base_s64(addr64, 1); // expected-error {{argument should be a multiple of 8}} + + // Offsets that should be a multiple of 4 times 0,1,...,127 + vldrwq_gather_base_s32(addr32, 0); + vldrwq_gather_base_s32(addr32, 4); + vldrwq_gather_base_s32(addr32, 2*4); + vldrwq_gather_base_s32(addr32, 125*4); + vldrwq_gather_base_s32(addr32, 126*4); + vldrwq_gather_base_s32(addr32, 127*4); + vldrwq_gather_base_s32(addr32, -4); // expected-error {{argument value -4 is outside the valid range [0, 508]}} + vldrwq_gather_base_s32(addr32, 128*4); // expected-error {{argument value 512 is outside the valid range [0, 508]}} + vldrwq_gather_base_s32(addr32, 2); // expected-error {{argument should be a multiple of 4}} + vldrwq_gather_base_s32(addr32, 1); // expected-error {{argument should be a multiple of 4}} + + // Show that the polymorphic store intrinsics get the right set of + // error checks after overload resolution. 
These ones expand to the + // 8-byte granular versions... + vstrdq_scatter_base(addr64, 0, addr64); + vstrdq_scatter_base(addr64, 8, addr64); + vstrdq_scatter_base(addr64, 2*8, addr64); + vstrdq_scatter_base(addr64, 125*8, addr64); + vstrdq_scatter_base(addr64, 126*8, addr64); + vstrdq_scatter_base(addr64, 127*8, addr64); + vstrdq_scatter_base(addr64, -8, addr64); // expected-error {{argument value -8 is outside the valid range [0, 1016]}} + vstrdq_scatter_base(addr64, 128*8, addr64); // expected-error {{argument value 1024 is outside the valid range [0, 1016]}} + vstrdq_scatter_base(addr64, 4, addr64); // expected-error {{argument should be a multiple of 8}} + vstrdq_scatter_base(addr64, 1, addr64); // expected-error {{argument should be a multiple of 8}} + + /// ... and these ones to the 4-byte. + vstrwq_scatter_base(addr32, 0, addr32); + vstrwq_scatter_base(addr32, 4, addr32); + vstrwq_scatter_base(addr32, 2*4, addr32); + vstrwq_scatter_base(addr32, 125*4, addr32); + vstrwq_scatter_base(addr32, 126*4, addr32); + vstrwq_scatter_base(addr32, 127*4, addr32); + vstrwq_scatter_base(addr32, -4, addr32); // expected-error {{argument value -4 is outside the valid range [0, 508]}} + vstrwq_scatter_base(addr32, 128*4, addr32); // expected-error {{argument value 512 is outside the valid range [0, 508]}} + vstrwq_scatter_base(addr32, 2, addr32); // expected-error {{argument should be a multiple of 4}} + vstrwq_scatter_base(addr32, 1, addr32); // expected-error {{argument should be a multiple of 4}} +} diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index ddec171d671..aa3b475ea7b 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -204,6 +204,9 @@ public: Name = "const " + Name; return Name + " *"; } + std::string llvmName() const override { + return "llvm::PointerType::getUnqual(" + Pointee->llvmName() + ")"; + } static bool classof(const Type *T) { return T->typeKind() == TypeKind::Pointer; @@ -512,6 +515,11 @@ public: void setVarname(const StringRef s) { VarName = s; } bool varnameUsed() const { return VarNameUsed; } + // Emit code to generate this result as a Value *. + virtual std::string asValue() { + return varname(); + } + // Code generation happens in multiple passes. This method tracks whether a // Result has yet been visited in a given pass, without the need for a // tedious loop in between passes that goes through and resets a 'visited' @@ -547,6 +555,12 @@ public: std::string typeName() const override { return AddressType ? "Address" : Result::typeName(); } + // Emit code to generate this result as a Value *. + std::string asValue() override { + if (AddressType) + return "(" + varname() + ".getPointer())"; + return Result::asValue(); + } }; // Result subclass for an integer literal appearing in Tablegen. This may need @@ -665,7 +679,7 @@ public: OS << "), llvm::SmallVector<Value *, " << Args.size() << "> {"; const char *Sep = ""; for (auto Arg : Args) { - OS << Sep << Arg->varname(); + OS << Sep << Arg->asValue(); Sep = ", "; } OS << "})"; @@ -974,17 +988,15 @@ const Type *MveEmitter::getType(DagInit *D, const Type *Param) { return getPointerType(Pointee, Op->getValueAsBit("const")); } - if (Op->isSubClassOf("CTO_Sign")) { - const ScalarType *ST = cast<ScalarType>(getType(D->getArg(0), Param)); - ScalarTypeKind NewKind = Op->getValueAsBit("signed") - ? 
ScalarTypeKind::SignedInt - : ScalarTypeKind::UnsignedInt; + if (Op->getName() == "CTO_CopyKind") { + const ScalarType *STSize = cast<ScalarType>(getType(D->getArg(0), Param)); + const ScalarType *STKind = cast<ScalarType>(getType(D->getArg(1), Param)); for (const auto &kv : ScalarTypes) { const ScalarType *RT = kv.second.get(); - if (RT->kind() == NewKind && RT->sizeInBits() == ST->sizeInBits()) + if (RT->kind() == STKind->kind() && RT->sizeInBits() == STSize->sizeInBits()) return RT; } - PrintFatalError("Cannot change sign of this type"); + PrintFatalError("Cannot find a type to satisfy CopyKind"); } PrintFatalError("Bad operator in type dag expression"); @@ -1025,6 +1037,18 @@ Result::Ptr MveEmitter::getCodeForDag(DagInit *D, const Result::Scope &Scope, } } PrintFatalError("Unsupported type cast"); + } else if (Op->getName() == "unsignedflag") { + if (D->getNumArgs() != 1) + PrintFatalError("unsignedflag should have exactly one argument"); + Record *TypeRec = cast<DefInit>(D->getArg(0))->getDef(); + if (!TypeRec->isSubClassOf("Type")) + PrintFatalError("unsignedflag's argument should be a type"); + if (const auto *ST = dyn_cast<ScalarType>(getType(TypeRec, Param))) { + return std::make_shared<IntLiteralResult>( + getScalarType("u32"), ST->kind() == ScalarTypeKind::UnsignedInt); + } else { + PrintFatalError("unsignedflag's argument should be a scalar type"); + } } else { std::vector<Result::Ptr> Args; for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i) diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td index a188652ba1d..0ee56a2b1c1 100644 --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -812,17 +812,43 @@ defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty], defm int_arm_mve_maxv: IntrinsicSignSuffix<[llvm_i32_ty], [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>; -def int_arm_mve_vcvt_narrow: Intrinsic<[llvm_v8f16_ty], - [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_arm_mve_vcvt_narrow_predicated: Intrinsic<[llvm_v8f16_ty], - [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v4i1_ty], [IntrNoMem]>; +multiclass MVEPredicated<list<LLVMType> rets, list<LLVMType> params, + LLVMType pred, list<IntrinsicProperty> props = []> { + def "": Intrinsic<rets, params, props>; + def _predicated: Intrinsic<rets, params # [pred], props>; +} -def int_arm_mve_vldr_gather_base_wb: Intrinsic< - [llvm_anyvector_ty, llvm_anyvector_ty], - [LLVMMatchType<1>, llvm_i32_ty], [IntrReadMem]>; -def int_arm_mve_vldr_gather_base_wb_predicated: Intrinsic< +defm int_arm_mve_vcvt_narrow: MVEPredicated<[llvm_v8f16_ty], + [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], llvm_v4i1_ty, [IntrNoMem]>; + +defm int_arm_mve_vldr_gather_base: MVEPredicated< + [llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty], + llvm_anyvector_ty, [IntrReadMem]>; +defm int_arm_mve_vldr_gather_base_wb: MVEPredicated< [llvm_anyvector_ty, llvm_anyvector_ty], - [LLVMMatchType<1>, llvm_i32_ty, llvm_anyvector_ty], [IntrReadMem]>; + [LLVMMatchType<1>, llvm_i32_ty], llvm_anyvector_ty, [IntrReadMem]>; +defm int_arm_mve_vstr_scatter_base: MVEPredicated< + [], [llvm_anyvector_ty, llvm_i32_ty, llvm_anyvector_ty], + llvm_anyvector_ty, [IntrWriteMem]>; +defm int_arm_mve_vstr_scatter_base_wb: MVEPredicated< + [llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i32_ty, llvm_anyvector_ty], + llvm_anyvector_ty, [IntrWriteMem]>; + +// gather_offset takes three i32 parameters. The first is the size of +// memory element loaded, in bits. 
The second is a left bit shift to
+// apply to each offset in the vector parameter (must be either 0, or
+// correspond to the element size of the destination vector type). The
+// last is 1 to indicate zero extension (if the load is widening), or
+// 0 for sign extension.
+//
+// scatter_offset has the first two of those parameters, but since it
+// narrows rather than widens, it doesn't have the last one.
+defm int_arm_mve_vldr_gather_offset: MVEPredicated<
+  [llvm_anyvector_ty], [llvm_anyptr_ty, llvm_anyvector_ty,
+   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], llvm_anyvector_ty, [IntrReadMem]>;
+defm int_arm_mve_vstr_scatter_offset: MVEPredicated<
+  [], [llvm_anyptr_ty, llvm_anyvector_ty, llvm_anyvector_ty,
+   llvm_i32_ty, llvm_i32_ty], llvm_anyvector_ty, [IntrWriteMem]>;
 
 def int_arm_mve_urshrl: Intrinsic<
   [llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 040b6f64832..f96520e37dc 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -278,7 +278,7 @@ class mve_addr_q_shift<int shift> : MemOperand {
 // A family of classes wrapping up information about the vector types
 // used by MVE.
 class MVEVectorVTInfo<ValueType vec, ValueType pred, bits<2> size,
-                      string suffix, bit unsigned> {
+                      string suffixletter, bit unsigned> {
   // The LLVM ValueType representing the vector, so we can use it in
   // ISel patterns.
   ValueType Vec = vec;
@@ -304,32 +304,39 @@ class MVEVectorVTInfo<ValueType vec, ValueType pred, bits<2> size,
   // signed and 1 for unsigned. For anything else, undefined.
   bit Unsigned = unsigned;
 
-  // The suffix used on the instruction in assembly language.
-  string Suffix = suffix;
+  // The number of bits in a vector element, in integer form.
+  int LaneBits = !shl(8, Size);
+
+  // The suffix used in assembly language on an instruction operating
+  // on this lane if it only cares about number of bits.
+  string BitsSuffix = !cast<string>(LaneBits);
+
+  // The suffix used on an instruction that mentions the whole type.
+  string Suffix = suffixletter ## BitsSuffix;
 }
 
 // Integer vector types that don't treat signed and unsigned differently.
-def MVE_v16i8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "i8", ?>;
-def MVE_v8i16 : MVEVectorVTInfo<v8i16, v8i1, 0b01, "i16", ?>;
-def MVE_v4i32 : MVEVectorVTInfo<v4i32, v4i1, 0b10, "i32", ?>;
-def MVE_v2i64 : MVEVectorVTInfo<v2i64, v4i1, 0b11, "i64", ?>;
+def MVE_v16i8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "i", ?>;
+def MVE_v8i16 : MVEVectorVTInfo<v8i16, v8i1, 0b01, "i", ?>;
+def MVE_v4i32 : MVEVectorVTInfo<v4i32, v4i1, 0b10, "i", ?>;
+def MVE_v2i64 : MVEVectorVTInfo<v2i64, v4i1, 0b11, "i", ?>;
 
 // Explicitly signed and unsigned integer vectors. They map to the
 // same set of LLVM ValueTypes as above, but are represented
 // differently in assembly and instruction encodings.
-def MVE_v16s8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "s8", 0b0>; -def MVE_v8s16 : MVEVectorVTInfo<v8i16, v8i1, 0b01, "s16", 0b0>; -def MVE_v4s32 : MVEVectorVTInfo<v4i32, v4i1, 0b10, "s32", 0b0>; -def MVE_v2s64 : MVEVectorVTInfo<v2i64, v4i1, 0b11, "s64", 0b0>; -def MVE_v16u8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "u8", 0b1>; -def MVE_v8u16 : MVEVectorVTInfo<v8i16, v8i1, 0b01, "u16", 0b1>; -def MVE_v4u32 : MVEVectorVTInfo<v4i32, v4i1, 0b10, "u32", 0b1>; -def MVE_v2u64 : MVEVectorVTInfo<v2i64, v4i1, 0b11, "u64", 0b1>; +def MVE_v16s8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "s", 0b0>; +def MVE_v8s16 : MVEVectorVTInfo<v8i16, v8i1, 0b01, "s", 0b0>; +def MVE_v4s32 : MVEVectorVTInfo<v4i32, v4i1, 0b10, "s", 0b0>; +def MVE_v2s64 : MVEVectorVTInfo<v2i64, v4i1, 0b11, "s", 0b0>; +def MVE_v16u8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "u", 0b1>; +def MVE_v8u16 : MVEVectorVTInfo<v8i16, v8i1, 0b01, "u", 0b1>; +def MVE_v4u32 : MVEVectorVTInfo<v4i32, v4i1, 0b10, "u", 0b1>; +def MVE_v2u64 : MVEVectorVTInfo<v2i64, v4i1, 0b11, "u", 0b1>; // FP vector types. -def MVE_v8f16 : MVEVectorVTInfo<v8f16, v8i1, 0b01, "f16", ?>; -def MVE_v4f32 : MVEVectorVTInfo<v4f32, v4i1, 0b10, "f32", ?>; -def MVE_v2f64 : MVEVectorVTInfo<v2f64, v4i1, 0b11, "f64", ?>; +def MVE_v8f16 : MVEVectorVTInfo<v8f16, v8i1, 0b01, "f", ?>; +def MVE_v4f32 : MVEVectorVTInfo<v4f32, v4i1, 0b10, "f", ?>; +def MVE_v2f64 : MVEVectorVTInfo<v2f64, v4i1, 0b11, "f", ?>; // --------- Start of base classes for the instructions themselves @@ -4614,28 +4621,80 @@ class MVE_VLDRSTR_rq_b<MVE_ldst_direction dir, MVE_memsz memsz, string asm, string suffix, bit U, bits<2> size> : MVE_VLDRSTR_rq<dir, memsz, U, size, 0, asm, suffix, 0>; +// Multiclasses wrapping that to add ISel patterns for intrinsics. +multiclass MVE_VLDR_rq_w<MVE_memsz memsz, list<MVEVectorVTInfo> VTIs> { + defm "": MVE_VLDRSTR_rq_w<MVE_ld, memsz, "vldr" # memsz.MnemonicLetter, + VTIs[0].Suffix, VTIs[0].Unsigned, VTIs[0].Size>; + foreach VTI = VTIs in + foreach UnsignedFlag = !if(!eq(VTI.Size, memsz.encoding), + [0,1], [VTI.Unsigned]) in { + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, 0, UnsignedFlag)), + (VTI.Vec (!cast<Instruction>(NAME#"_u") GPR:$base, MQPR:$offsets))>; + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag)), + (VTI.Vec (!cast<Instruction>(NAME) GPR:$base, MQPR:$offsets))>; + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, 0, UnsignedFlag, (VTI.Pred VCCR:$pred))), + (VTI.Vec (!cast<Instruction>(NAME#"_u") GPR:$base, MQPR:$offsets, 1, VCCR:$pred))>; + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag, (VTI.Pred VCCR:$pred))), + (VTI.Vec (!cast<Instruction>(NAME) GPR:$base, MQPR:$offsets, 1, VCCR:$pred))>; + } +} +multiclass MVE_VLDR_rq_b<list<MVEVectorVTInfo> VTIs> { + def "": MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb", + VTIs[0].Suffix, VTIs[0].Unsigned, VTIs[0].Size>; + foreach VTI = VTIs in { + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned)), + (VTI.Vec (!cast<Instruction>(NAME) GPR:$base, MQPR:$offsets))>; + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned, (VTI.Pred VCCR:$pred))), + (VTI.Vec (!cast<Instruction>(NAME) GPR:$base, MQPR:$offsets, 1, 
VCCR:$pred))>; + } +} +multiclass MVE_VSTR_rq_w<MVE_memsz memsz, list<MVEVectorVTInfo> VTIs> { + defm "": MVE_VLDRSTR_rq_w<MVE_st, memsz, "vstr" # memsz.MnemonicLetter, + VTIs[0].BitsSuffix, 0, VTIs[0].Size>; + foreach VTI = VTIs in { + def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, 0), + (!cast<Instruction>(NAME#"_u") MQPR:$data, GPR:$base, MQPR:$offsets)>; + def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift), + (!cast<Instruction>(NAME) MQPR:$data, GPR:$base, MQPR:$offsets)>; + def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, 0, (VTI.Pred VCCR:$pred)), + (!cast<Instruction>(NAME#"_u") MQPR:$data, GPR:$base, MQPR:$offsets, 1, VCCR:$pred)>; + def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift, (VTI.Pred VCCR:$pred)), + (!cast<Instruction>(NAME) MQPR:$data, GPR:$base, MQPR:$offsets, 1, VCCR:$pred)>; + } +} +multiclass MVE_VSTR_rq_b<list<MVEVectorVTInfo> VTIs> { + def "": MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb", + VTIs[0].BitsSuffix, 0, VTIs[0].Size>; + foreach VTI = VTIs in { + def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0), + (!cast<Instruction>(NAME) MQPR:$data, GPR:$base, MQPR:$offsets)>; + def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0, (VTI.Pred VCCR:$pred)), + (!cast<Instruction>(NAME) MQPR:$data, GPR:$base, MQPR:$offsets, 1, VCCR:$pred)>; + } +} + // Actually define all the loads and stores in this family. 
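// (Editor's sketch, not part of the patch: with the patterns above, an IR
// call such as
//   call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(
//            i16* %base, <4 x i32> %offset, i32 16, i32 1, i32 0)
// (16-bit memory elements, offsets pre-shifted left by 1, sign-extending)
// is expected to select to "vldrh.s32 q1, [r0, q0, uxtw #1]" via the
// MVE_VLDRHS32_rq patterns below, while the same call with a shift of 0
// selects the unshifted form "vldrh.s32 q1, [r0, q0]"; see the
// scatter-gather.ll tests added later in this patch.)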
-def MVE_VLDRBU8_rq : MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u8", 1,0b00>; -def MVE_VLDRBU16_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u16", 1,0b01>; -def MVE_VLDRBS16_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","s16", 0,0b01>; -def MVE_VLDRBU32_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u32", 1,0b10>; -def MVE_VLDRBS32_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","s32", 0,0b10>; +defm MVE_VLDRBU8_rq : MVE_VLDR_rq_b<[MVE_v16u8,MVE_v16s8]>; +defm MVE_VLDRBU16_rq: MVE_VLDR_rq_b<[MVE_v8u16]>; +defm MVE_VLDRBS16_rq: MVE_VLDR_rq_b<[MVE_v8s16]>; +defm MVE_VLDRBU32_rq: MVE_VLDR_rq_b<[MVE_v4u32]>; +defm MVE_VLDRBS32_rq: MVE_VLDR_rq_b<[MVE_v4s32]>; -defm MVE_VLDRHU16_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","u16", 1,0b01>; -defm MVE_VLDRHU32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","u32", 1,0b10>; -defm MVE_VLDRHS32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","s32", 0,0b10>; -defm MVE_VLDRWU32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memW, "vldrw","u32", 1,0b10>; -defm MVE_VLDRDU64_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memD, "vldrd","u64", 1,0b11>; +defm MVE_VLDRHU16_rq: MVE_VLDR_rq_w<MVE_memH, [MVE_v8u16,MVE_v8s16,MVE_v8f16]>; +defm MVE_VLDRHU32_rq: MVE_VLDR_rq_w<MVE_memH, [MVE_v4u32]>; +defm MVE_VLDRHS32_rq: MVE_VLDR_rq_w<MVE_memH, [MVE_v4s32]>; +defm MVE_VLDRWU32_rq: MVE_VLDR_rq_w<MVE_memW, [MVE_v4u32,MVE_v4s32,MVE_v4f32]>; +defm MVE_VLDRDU64_rq: MVE_VLDR_rq_w<MVE_memD, [MVE_v2u64,MVE_v2s64]>; -def MVE_VSTRB8_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","8", 0,0b00>; -def MVE_VSTRB16_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","16", 0,0b01>; -def MVE_VSTRB32_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","32", 0,0b10>; +defm MVE_VSTRB8_rq : MVE_VSTR_rq_b<[MVE_v16i8]>; +defm MVE_VSTRB16_rq : MVE_VSTR_rq_b<[MVE_v8i16]>; +defm MVE_VSTRB32_rq : MVE_VSTR_rq_b<[MVE_v4i32]>; -defm MVE_VSTRH16_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memH, "vstrh","16", 0,0b01>; -defm MVE_VSTRH32_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memH, "vstrh","32", 0,0b10>; -defm MVE_VSTRW32_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memW, "vstrw","32", 0,0b10>; -defm MVE_VSTRD64_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memD, "vstrd","64", 0,0b11>; +defm MVE_VSTRH16_rq : MVE_VSTR_rq_w<MVE_memH, [MVE_v8i16,MVE_v8f16]>; +defm MVE_VSTRH32_rq : MVE_VSTR_rq_w<MVE_memH, [MVE_v4i32]>; +defm MVE_VSTRW32_rq : MVE_VSTR_rq_w<MVE_memW, [MVE_v4i32,MVE_v4f32]>; +defm MVE_VSTRD64_rq : MVE_VSTR_rq_w<MVE_memD, [MVE_v2i64]>; // Gather loads / scatter stores whose address operand is of the form // [Qm,#imm], i.e. a vector containing a full base address for each @@ -4674,11 +4733,56 @@ multiclass MVE_VLDRSTR_qi_m<MVE_ldst_direction dir, MVE_memsz memsz, } } +// Multiclasses wrapping that one, adding selection patterns for the +// non-writeback loads and all the stores. (The writeback loads must +// deliver multiple output values, so they have to be selected by C++ +// code.) 
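// (Editor's sketch, not part of the patch: the qi forms take a whole vector
// of base addresses plus an immediate offset. For example,
//   call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32> %addr, i32 400)
// is expected to select to "vldrw.u32 q1, [q0, #400]", as in the
// scatter-gather.ll tests added later in this patch; the store patterns map
// int_arm_mve_vstr_scatter_base and its _wb variant onto the corresponding
// vstrw/vstrd instructions.)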
+multiclass MVE_VLDR_qi<MVE_memsz memsz, MVEVectorVTInfo AVTI, + list<MVEVectorVTInfo> DVTIs> { + defm "" : MVE_VLDRSTR_qi_m<MVE_ld, memsz, "vldr" # memsz.MnemonicLetter, + "u" # memsz.TypeBits>; + + foreach DVTI = DVTIs in { + def : Pat<(DVTI.Vec (int_arm_mve_vldr_gather_base + (AVTI.Vec MQPR:$addr), (i32 imm:$offset))), + (DVTI.Vec (!cast<Instruction>(NAME) + (AVTI.Vec MQPR:$addr), (i32 imm:$offset)))>; + def : Pat<(DVTI.Vec (int_arm_mve_vldr_gather_base_predicated + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (AVTI.Pred VCCR:$pred))), + (DVTI.Vec (!cast<Instruction>(NAME) + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), 1, VCCR:$pred))>; + } +} +multiclass MVE_VSTR_qi<MVE_memsz memsz, MVEVectorVTInfo AVTI, + list<MVEVectorVTInfo> DVTIs> { + defm "" : MVE_VLDRSTR_qi_m<MVE_st, memsz, "vstr" # memsz.MnemonicLetter, + !cast<string>(memsz.TypeBits)>; + + foreach DVTI = DVTIs in { + def : Pat<(int_arm_mve_vstr_scatter_base + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data)), + (!cast<Instruction>(NAME) + (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset))>; + def : Pat<(int_arm_mve_vstr_scatter_base_predicated + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data), (AVTI.Pred VCCR:$pred)), + (!cast<Instruction>(NAME) + (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset), 1, VCCR:$pred)>; + def : Pat<(AVTI.Vec (int_arm_mve_vstr_scatter_base_wb + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data))), + (AVTI.Vec (!cast<Instruction>(NAME # "_pre") + (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset)))>; + def : Pat<(AVTI.Vec (int_arm_mve_vstr_scatter_base_wb_predicated + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data), (AVTI.Pred VCCR:$pred))), + (AVTI.Vec (!cast<Instruction>(NAME # "_pre") + (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset), 1, VCCR:$pred))>; + } +} + // Actual instruction definitions. -defm MVE_VLDRWU32_qi: MVE_VLDRSTR_qi_m<MVE_ld, MVE_memW, "vldrw", "u32">; -defm MVE_VLDRDU64_qi: MVE_VLDRSTR_qi_m<MVE_ld, MVE_memD, "vldrd", "u64">; -defm MVE_VSTRW32_qi: MVE_VLDRSTR_qi_m<MVE_st, MVE_memW, "vstrw", "32">; -defm MVE_VSTRD64_qi: MVE_VLDRSTR_qi_m<MVE_st, MVE_memD, "vstrd", "64">; +defm MVE_VLDRWU32_qi: MVE_VLDR_qi<MVE_memW, MVE_v4i32, [MVE_v4i32,MVE_v4f32]>; +defm MVE_VLDRDU64_qi: MVE_VLDR_qi<MVE_memD, MVE_v2i64, [MVE_v2i64,MVE_v2f64]>; +defm MVE_VSTRW32_qi: MVE_VSTR_qi<MVE_memW, MVE_v4i32, [MVE_v4i32,MVE_v4f32]>; +defm MVE_VSTRD64_qi: MVE_VSTR_qi<MVE_memD, MVE_v2i64, [MVE_v2i64,MVE_v2f64]>; // Define aliases for all the instructions where memory size and // vector lane size are the same. 
These are mnemonic aliases, so they diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll new file mode 100644 index 00000000000..5e19f81cbf5 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll @@ -0,0 +1,2018 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_s16(i8* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrbq_gather_offset_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 0) + ret <8 x i16> %0 +} + +declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8*, <8 x i16>, i32, i32, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_s32(i8* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrbq_gather_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8* %base, <4 x i32> %offset, i32 8, i32 0, i32 0) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8*, <4 x i32>, i32, i32, i32) + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_s8(i8* %base, <16 x i8> %offset) { +; CHECK-LABEL: test_vldrbq_gather_offset_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0i8.v16i8(i8* %base, <16 x i8> %offset, i32 8, i32 0, i32 0) + ret <16 x i8> %0 +} + +declare <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0i8.v16i8(i8*, <16 x i8>, i32, i32, i32) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_u16(i8* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrbq_gather_offset_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 1) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_u32(i8* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrbq_gather_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8* %base, <4 x i32> %offset, i32 8, i32 0, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_u8(i8* %base, <16 x i8> %offset) { +; CHECK-LABEL: test_vldrbq_gather_offset_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0i8.v16i8(i8* %base, <16 x i8> %offset, i32 8, i32 0, i32 1) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_z_s16(i8* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.s16 q1, [r0, q0] +; 
CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i8.v8i16.v8i1(i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 0, <8 x i1> %1) + ret <8 x i16> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) + +declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i8.v8i16.v8i1(i8*, <8 x i16>, i32, i32, i32, <8 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_z_s32(i8* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.s32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8* %base, <4 x i32> %offset, i32 8, i32 0, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8*, <4 x i32>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_z_s8(i8* %base, <16 x i8> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u8 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0i8.v16i8.v16i1(i8* %base, <16 x i8> %offset, i32 8, i32 0, i32 0, <16 x i1> %1) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) + +declare <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0i8.v16i8.v16i1(i8*, <16 x i8>, i32, i32, i32, <16 x i1>) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_z_u16(i8* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i8.v8i16.v8i1(i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 1, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_z_u32(i8* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8* %base, <4 x i32> %offset, i32 8, i32 0, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_z_u8(i8* %base, <16 x i8> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u8 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr 
+entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0i8.v16i8.v16i1(i8* %base, <16 x i8> %offset, i32 8, i32 0, i32 1, <16 x i1> %1) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_s64(<2 x i64> %addr) { +; CHECK-LABEL: test_vldrdq_gather_base_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [q0, #616] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64> %addr, i32 616) + ret <2 x i64> %0 +} + +declare <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64>, i32) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_u64(<2 x i64> %addr) { +; CHECK-LABEL: test_vldrdq_gather_base_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [q0, #336] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64> %addr, i32 336) + ret <2 x i64> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_s64(<2 x i64>* %addr) { +; CHECK-LABEL: test_vldrdq_gather_base_wb_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrd.u64 q1, [q0, #576]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64> %0, i32 576) + %2 = extractvalue { <2 x i64>, <2 x i64> } %1, 1 + store <2 x i64> %2, <2 x i64>* %addr, align 8 + %3 = extractvalue { <2 x i64>, <2 x i64> } %1, 0 + ret <2 x i64> %3 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64>, i32) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_u64(<2 x i64>* %addr) { +; CHECK-LABEL: test_vldrdq_gather_base_wb_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrd.u64 q1, [q0, #328]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64> %0, i32 328) + %2 = extractvalue { <2 x i64>, <2 x i64> } %1, 1 + store <2 x i64> %2, <2 x i64>* %addr, align 8 + %3 = extractvalue { <2 x i64>, <2 x i64> } %1, 0 + ret <2 x i64> %3 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_s64(<2 x i64>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_base_wb_z_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [q0, #664]! 
+; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 664, <4 x i1> %2) + %4 = extractvalue { <2 x i64>, <2 x i64> } %3, 1 + store <2 x i64> %4, <2 x i64>* %addr, align 8 + %5 = extractvalue { <2 x i64>, <2 x i64> } %3, 0 + ret <2 x i64> %5 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_u64(<2 x i64>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_base_wb_z_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [q0, #656]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 656, <4 x i1> %2) + %4 = extractvalue { <2 x i64>, <2 x i64> } %3, 1 + store <2 x i64> %4, <2 x i64>* %addr, align 8 + %5 = extractvalue { <2 x i64>, <2 x i64> } %3, 0 + ret <2 x i64> %5 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_z_s64(<2 x i64> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_base_z_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [q0, #888] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 888, <4 x i1> %1) + ret <2 x i64> %2 +} + +declare <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_z_u64(<2 x i64> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_base_z_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [q0, #1000] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 1000, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_s64(i64* %base, <2 x i64> %offset) { +; CHECK-LABEL: test_vldrdq_gather_offset_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 0) + ret <2 x i64> %0 +} + +declare <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64*, <2 x i64>, i32, i32, i32) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_u64(i64* %base, <2 x i64> %offset) { +; CHECK-LABEL: test_vldrdq_gather_offset_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 1) + ret 
<2 x i64> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_z_s64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_offset_z_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 0, <4 x i1> %1) + ret <2 x i64> %2 +} + +declare <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64*, <2 x i64>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_z_u64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_offset_z_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 1, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_s64(i64* %base, <2 x i64> %offset) { +; CHECK-LABEL: test_vldrdq_gather_shifted_offset_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [r0, q0, uxtw #3] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 0) + ret <2 x i64> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_u64(i64* %base, <2 x i64> %offset) { +; CHECK-LABEL: test_vldrdq_gather_shifted_offset_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [r0, q0, uxtw #3] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 1) + ret <2 x i64> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_z_s64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_shifted_offset_z_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [r0, q0, uxtw #3] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 0, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_z_u64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_shifted_offset_z_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [r0, q0, uxtw #3] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 1, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_offset_f16(half* %base, <8 x 
i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0f16.v8i16(half* %base, <8 x i16> %offset, i32 16, i32 0, i32 0) + ret <8 x half> %0 +} + +declare <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0f16.v8i16(half*, <8 x i16>, i32, i32, i32) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_s16(i16* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* %base, <8 x i16> %offset, i32 16, i32 0, i32 0) + ret <8 x i16> %0 +} + +declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16*, <8 x i16>, i32, i32, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_s32(i16* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* %base, <4 x i32> %offset, i32 16, i32 0, i32 0) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16*, <4 x i32>, i32, i32, i32) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_u16(i16* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* %base, <8 x i16> %offset, i32 16, i32 0, i32 1) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_u32(i16* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* %base, <4 x i32> %offset, i32 16, i32 0, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_offset_z_f16(half* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0f16.v8i16.v8i1(half* %base, <8 x i16> %offset, i32 16, i32 0, i32 0, <8 x i1> %1) + ret <8 x half> %2 +} + +declare <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0f16.v8i16.v8i1(half*, <8 x i16>, i32, i32, i32, <8 x i1>) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_z_s16(i16* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> 
@llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, i32 16, i32 0, i32 0, <8 x i1> %1) + ret <8 x i16> %2 +} + +declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16*, <8 x i16>, i32, i32, i32, <8 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_z_s32(i16* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.s32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* %base, <4 x i32> %offset, i32 16, i32 0, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16*, <4 x i32>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_z_u16(i16* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, i32 16, i32 0, i32 1, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_z_u32(i16* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* %base, <4 x i32> %offset, i32 16, i32 0, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_shifted_offset_f16(half* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0f16.v8i16(half* %base, <8 x i16> %offset, i32 16, i32 1, i32 0) + ret <8 x half> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_s16(i16* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* %base, <8 x i16> %offset, i32 16, i32 1, i32 0) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_s32(i16* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* %base, <4 x i32> %offset, i32 16, i32 1, i32 0) + ret <4 x i32> %0 +} + +define 
arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_u16(i16* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* %base, <8 x i16> %offset, i32 16, i32 1, i32 1) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_u32(i16* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* %base, <4 x i32> %offset, i32 16, i32 1, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_shifted_offset_z_f16(half* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0f16.v8i16.v8i1(half* %base, <8 x i16> %offset, i32 16, i32 1, i32 0, <8 x i1> %1) + ret <8 x half> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_z_s16(i16* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, i32 16, i32 1, i32 0, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_z_s32(i16* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.s32 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* %base, <4 x i32> %offset, i32 16, i32 1, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_z_u16(i16* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, i32 16, i32 1, i32 1, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_z_u32(i16* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_u32: +; 
CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u32 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* %base, <4 x i32> %offset, i32 16, i32 1, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_f32(<4 x i32> %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [q0, #12] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x float> @llvm.arm.mve.vldr.gather.base.v4f32.v4i32(<4 x i32> %addr, i32 12) + ret <4 x float> %0 +} + +declare <4 x float> @llvm.arm.mve.vldr.gather.base.v4f32.v4i32(<4 x i32>, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_s32(<4 x i32> %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [q0, #400] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32> %addr, i32 400) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32>, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_u32(<4 x i32> %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [q0, #284] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32> %addr, i32 284) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_wb_f32(<4 x i32>* %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [q0, #64]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4f32.v4i32(<4 x i32> %0, i32 64) + %2 = extractvalue { <4 x float>, <4 x i32> } %1, 1 + store <4 x i32> %2, <4 x i32>* %addr, align 8 + %3 = extractvalue { <4 x float>, <4 x i32> } %1, 0 + ret <4 x float> %3 +} + +declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4f32.v4i32(<4 x i32>, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_s32(<4 x i32>* %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [q0, #80]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> %0, i32 80) + %2 = extractvalue { <4 x i32>, <4 x i32> } %1, 1 + store <4 x i32> %2, <4 x i32>* %addr, align 8 + %3 = extractvalue { <4 x i32>, <4 x i32> } %1, 0 + ret <4 x i32> %3 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32>, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_u32(<4 x i32>* %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [q0, #480]! 
+; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> %0, i32 480) + %2 = extractvalue { <4 x i32>, <4 x i32> } %1, 1 + store <4 x i32> %2, <4 x i32>* %addr, align 8 + %3 = extractvalue { <4 x i32>, <4 x i32> } %1, 0 + ret <4 x i32> %3 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_wb_z_f32(<4 x i32>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #352]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %0, i32 352, <4 x i1> %2) + %4 = extractvalue { <4 x float>, <4 x i32> } %3, 1 + store <4 x i32> %4, <4 x i32>* %addr, align 8 + %5 = extractvalue { <4 x float>, <4 x i32> } %3, 0 + ret <4 x float> %5 +} + +declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_z_s32(<4 x i32>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #276]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 276, <4 x i1> %2) + %4 = extractvalue { <4 x i32>, <4 x i32> } %3, 1 + store <4 x i32> %4, <4 x i32>* %addr, align 8 + %5 = extractvalue { <4 x i32>, <4 x i32> } %3, 0 + ret <4 x i32> %5 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_z_u32(<4 x i32>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #88]! 
+; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 88, <4 x i1> %2) + %4 = extractvalue { <4 x i32>, <4 x i32> } %3, 1 + store <4 x i32> %4, <4 x i32>* %addr, align 8 + %5 = extractvalue { <4 x i32>, <4 x i32> } %3, 0 + ret <4 x i32> %5 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_z_f32(<4 x i32> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #300] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.vldr.gather.base.predicated.v4f32.v4i32.v4i1(<4 x i32> %addr, i32 300, <4 x i1> %1) + ret <4 x float> %2 +} + +declare <4 x float> @llvm.arm.mve.vldr.gather.base.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_z_s32(<4 x i32> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #440] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 440, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_z_u32(<4 x i32> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #300] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 300, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_offset_f32(float* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_offset_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0f32.v4i32(float* %base, <4 x i32> %offset, i32 32, i32 0, i32 0) + ret <4 x float> %0 +} + +declare <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0f32.v4i32(float*, <4 x i32>, i32, i32, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_s32(i32* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %base, <4 x i32> %offset, i32 32, i32 0, i32 0) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32*, <4 x i32>, i32, i32, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_u32(i32* %base, <4 x i32> %offset) { +; CHECK-LABEL: 
test_vldrwq_gather_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %base, <4 x i32> %offset, i32 32, i32 0, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_offset_z_f32(float* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_offset_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0f32.v4i32.v4i1(float* %base, <4 x i32> %offset, i32 32, i32 0, i32 0, <4 x i1> %1) + ret <4 x float> %2 +} + +declare <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0f32.v4i32.v4i1(float*, <4 x i32>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_z_s32(i32* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, i32 32, i32 0, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32*, <4 x i32>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_z_u32(i32* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_offset_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, i32 32, i32 0, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_shifted_offset_f32(float* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0f32.v4i32(float* %base, <4 x i32> %offset, i32 32, i32 2, i32 0) + ret <4 x float> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_s32(i32* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %base, <4 x i32> %offset, i32 32, i32 2, i32 0) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_u32(i32* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: 
+ %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %base, <4 x i32> %offset, i32 32, i32 2, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_shifted_offset_z_f32(float* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0f32.v4i32.v4i1(float* %base, <4 x i32> %offset, i32 32, i32 2, i32 0, <4 x i1> %1) + ret <4 x float> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_z_s32(i32* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, i32 32, i32 2, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_z_u32(i32* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, i32 32, i32 2, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_s16(i8* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.16 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v8i16.v8i16.v8i1(i8* %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0, <8 x i1> %1) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v8i16.v8i16.v8i1(i8*, <8 x i16>, <8 x i16>, i32, i32, <8 x i1>) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_s32(i8* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.32 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v4i32.v4i32.v4i1(i8* %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0, <4 x i1> %1) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v4i32.v4i32.v4i1(i8*, <4 x i32>, <4 x i32>, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_s8(i8* %base, <16 x i8> %offset, <16 x i8> %value, i16 
zeroext %p) {
+; CHECK-LABEL: test_vstrbq_scatter_offset_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.8 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8* %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0, <16 x i1> %1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8*, <16 x i8>, <16 x i8>, i32, i32, <16 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_u16(i8* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_scatter_offset_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.16 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v8i16.v8i16.v8i1(i8* %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0, <8 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_u32(i8* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_scatter_offset_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.32 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v4i32.v4i32.v4i1(i8* %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0, <4 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_u8(i8* %base, <16 x i8> %offset, <16 x i8> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrbq_scatter_offset_p_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.8 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8* %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0, <16 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_s16(i8* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrbq_scatter_offset_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.16 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8* %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_s32(i8* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrbq_scatter_offset_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.32 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8* %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_s8(i8* %base, <16 x i8> %offset, <16 x i8> %value) {
+; CHECK-LABEL: test_vstrbq_scatter_offset_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.8 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v16i8.v16i8(i8* %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i8.v16i8.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_u16(i8* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrbq_scatter_offset_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.16 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8* %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_u32(i8* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrbq_scatter_offset_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.32 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8* %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_u8(i8* %base, <16 x i8> %offset, <16 x i8> %value) {
+; CHECK-LABEL: test_vstrbq_scatter_offset_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrb.8 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v16i8.v16i8(i8* %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_p_s64(<2 x i64> %addr, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_base_p_s64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrdt.64 q1, [q0, #888]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 888, <2 x i64> %value, <4 x i1> %1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_p_u64(<2 x i64> %addr, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_base_p_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrdt.64 q1, [q0, #264]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 264, <2 x i64> %value, <4 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_s64(<2 x i64> %addr, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_base_s64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrd.64 q1, [q0, #408]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64> %addr, i32 408, <2 x i64> %value)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64>, i32, <2 x i64>)
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_u64(<2 x i64> %addr, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_base_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrd.64 q1, [q0, #472]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64> %addr, i32 472, <2 x i64> %value)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_p_s64(<2 x i64>* %addr, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_base_wb_p_s64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrdt.64 q0, [q1, #248]!
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load <2 x i64>, <2 x i64>* %addr, align 8
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 248, <2 x i64> %value, <4 x i1> %2)
+  store <2 x i64> %3, <2 x i64>* %addr, align 8
+  ret void
+}
+
+declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_p_u64(<2 x i64>* %addr, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_base_wb_p_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrdt.64 q0, [q1, #136]!
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load <2 x i64>, <2 x i64>* %addr, align 8
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 136, <2 x i64> %value, <4 x i1> %2)
+  store <2 x i64> %3, <2 x i64>* %addr, align 8
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_s64(<2 x i64>* %addr, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_base_wb_s64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vstrd.64 q0, [q1, #208]!
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load <2 x i64>, <2 x i64>* %addr, align 8
+  %1 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64> %0, i32 208, <2 x i64> %value)
+  store <2 x i64> %1, <2 x i64>* %addr, align 8
+  ret void
+}
+
+declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64>, i32, <2 x i64>)
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_u64(<2 x i64>* %addr, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_base_wb_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vstrd.64 q0, [q1, #168]!
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load <2 x i64>, <2 x i64>* %addr, align 8
+  %1 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64> %0, i32 168, <2 x i64> %value)
+  store <2 x i64> %1, <2 x i64>* %addr, align 8
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_p_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_offset_p_s64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrdt.64 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <4 x i1> %1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64*, <2 x i64>, <2 x i64>, i32, i32, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_p_u64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_offset_p_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrdt.64 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <4 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_offset_s64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrd.64 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64*, <2 x i64>, <2 x i64>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_u64(i64* %base, <2 x i64> %offset, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_offset_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrd.64 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_p_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_p_s64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrdt.64 q1, [r0, q0, uxtw #3]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3, <4 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_p_u64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_p_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrdt.64 q1, [r0, q0, uxtw #3]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3, <4 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_s64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrd.64 q1, [r0, q0, uxtw #3]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_u64(i64* %base, <2 x i64> %offset, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_u64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrd.64 q1, [r0, q0, uxtw #3]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_f16(half* %base, <8 x i16> %offset, <8 x half> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0f16.v8i16.v8f16(half* %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 0)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0f16.v8i16.v8f16(half*, <8 x i16>, <8 x half>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_f16(half* %base, <8 x i16> %offset, <8 x half> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f16.v8i16.v8f16.v8i1(half* %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 0, <8 x i1> %1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f16.v8i16.v8f16.v8i1(half*, <8 x i16>, <8 x half>, i32, i32, <8 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_s16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0, <8 x i1> %1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16*, <8 x i16>, <8 x i16>, i32, i32, <8 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_s32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.32 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0, <4 x i1> %1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16*, <4 x i32>, <4 x i32>, i32, i32, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_u16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0, <8 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_u32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.32 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0, <4 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_s16(i16* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16*, <8 x i16>, <8 x i16>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_s32(i16* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.32 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16*, <4 x i32>, <4 x i32>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_u16(i16* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_u32(i16* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.32 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_f16(half* %base, <8 x i16> %offset, <8 x half> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0f16.v8i16.v8f16(half* %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_f16(half* %base, <8 x i16> %offset, <8 x half> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f16.v8i16.v8f16.v8i1(half* %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 1, <8 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_s16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1, <8 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_s32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.32 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1, <4 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_u16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1, <8 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_u32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.32 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1, <4 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_s16(i16* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_s32(i16* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.32 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_u16(i16* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_u32(i16* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrh.32 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_f32(<4 x i32> %addr, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q1, [q0, #380]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4f32(<4 x i32> %addr, i32 380, <4 x float> %value)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.v4i32.v4f32(<4 x i32>, i32, <4 x float>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_p_f32(<4 x i32> %addr, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_p_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q1, [q0, #400]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4f32.v4i1(<4 x i32> %addr, i32 400, <4 x float> %value, <4 x i1> %1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4f32.v4i1(<4 x i32>, i32, <4 x float>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_p_s32(<4 x i32> %addr, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q1, [q0, #48]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 48, <4 x i32> %value, <4 x i1> %1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i32>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_p_u32(<4 x i32> %addr, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q1, [q0, #376]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 376, <4 x i32> %value, <4 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_s32(<4 x i32> %addr, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q1, [q0, #156]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32> %addr, i32 156, <4 x i32> %value)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32>, i32, <4 x i32>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_u32(<4 x i32> %addr, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q1, [q0, #212]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32> %addr, i32 212, <4 x i32> %value)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_f32(<4 x i32>* %addr, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [q1, #412]!
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+  %1 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4f32(<4 x i32> %0, i32 412, <4 x float> %value)
+  store <4 x i32> %1, <4 x i32>* %addr, align 8
+  ret void
+}
+
+declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4f32(<4 x i32>, i32, <4 x float>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_p_f32(<4 x i32>* %addr, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_p_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [q1, #236]!
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4f32.v4i1(<4 x i32> %0, i32 236, <4 x float> %value, <4 x i1> %2)
+  store <4 x i32> %3, <4 x i32>* %addr, align 8
+  ret void
+}
+
+declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4f32.v4i1(<4 x i32>, i32, <4 x float>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_p_s32(<4 x i32>* %addr, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [q1, #328]!
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 328, <4 x i32> %value, <4 x i1> %2)
+  store <4 x i32> %3, <4 x i32>* %addr, align 8
+  ret void
+}
+
+declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i32>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_p_u32(<4 x i32>* %addr, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [q1, #412]!
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 412, <4 x i32> %value, <4 x i1> %2)
+  store <4 x i32> %3, <4 x i32>* %addr, align 8
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_s32(<4 x i32>* %addr, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [q1, #152]!
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+  %1 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32> %0, i32 152, <4 x i32> %value)
+  store <4 x i32> %1, <4 x i32>* %addr, align 8
+  ret void
+}
+
+declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32>, i32, <4 x i32>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_u32(<4 x i32>* %addr, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [q1, #64]!
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+  %1 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32> %0, i32 64, <4 x i32> %value)
+  store <4 x i32> %1, <4 x i32>* %addr, align 8
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_f32(float* %base, <4 x i32> %offset, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0f32.v4i32.v4f32(float* %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 0)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0f32.v4i32.v4f32(float*, <4 x i32>, <4 x float>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_p_f32(float* %base, <4 x i32> %offset, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_p_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f32.v4i32.v4f32.v4i1(float* %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 0, <4 x i1> %1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f32.v4i32.v4f32.v4i1(float*, <4 x i32>, <4 x float>, i32, i32, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_p_s32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0, <4 x i1> %1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32*, <4 x i32>, <4 x i32>, i32, i32, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_p_u32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0, <4 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_s32(i32* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0)
+  ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32*, <4 x i32>, <4 x i32>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_u32(i32* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q1, [r0, q0]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_f32(float* %base, <4 x i32> %offset, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0f32.v4i32.v4f32(float* %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_p_f32(float* %base, <4 x i32> %offset, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_p_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f32.v4i32.v4f32.v4i1(float* %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 2, <4 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_p_s32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2, <4 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_p_u32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2, <4 x i1> %1)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_s32(i32* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_u32(i32* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vstrw.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT:    bx lr
+entry:
+  call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2)
+  ret void
+}