[ARM] Add and update a lot of VLDn tests. NFC

author: David Green <david.green@arm.com> 2019-11-19 18:17:46 +0000
committer: David Green <david.green@arm.com> 2019-11-19 18:37:30 +0000
commit: 411bfe476b758c09a0c9d4b3176e46f0a70de3bb (patch)
tree: 0f17f58eb094d88ecb96b468ddb104a44fd26b41 /llvm/test/Transforms
parent: b91f798fde42668a3a7b361c015deb787a46720d (diff)
download: bcm5719-llvm-411bfe476b758c09a0c9d4b3176e46f0a70de3bb.tar.gz
bcm5719-llvm-411bfe476b758c09a0c9d4b3176e46f0a70de3bb.zip
2 files changed, 1693 insertions, 509 deletions
diff --git a/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
index fdf6e1feda2..1328c22a25c 100644
--- a/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
@@ -1,19 +1,30 @@
-; RUN: opt < %s -mattr=+neon -interleaved-access -S | FileCheck %s -check-prefixes=NEON,ALL
-; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefixes=NO_NEON,ALL
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mattr=+neon -interleaved-access -S | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt < %s -mattr=+mve.fp -interleaved-access -S | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt < %s -interleaved-access -S | FileCheck %s --check-prefix=CHECK-NONE
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
 target triple = "arm---eabi"
 
 define void @load_factor2(<16 x i8>* %ptr) {
-; NEON-LABEL:    @load_factor2(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i8>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i8>, <16 x i8>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <16 x i8> [[INTERLEAVED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <16 x i8> [[INTERLEAVED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i8>, <16 x i8>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i8> [[INTERLEAVED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i8> [[INTERLEAVED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
   %v0 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -22,16 +33,27 @@ define void @load_factor2(<16 x i8>* %ptr) {
 }
 
 define void @load_factor3(<6 x i32>* %ptr) {
-; NEON-LABEL:    @load_factor3(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <6 x i32>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <6 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <6 x i32>, <6 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <6 x i32>, <6 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <6 x i32> [[INTERLEAVED_VEC]], <6 x i32> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <6 x i32>, <6 x i32>* %ptr, align 4
   %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
@@ -41,17 +63,30 @@ define void @load_factor3(<6 x i32>* %ptr) {
 }
 
 define void @load_factor4(<16 x i32>* %ptr) {
-; NEON-LABEL:    @load_factor4(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor4(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor4(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-MVE-NEXT:    [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor4(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NONE-NEXT:    [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -62,15 +97,22 @@ define void @load_factor4(<16 x i32>* %ptr) {
 }
 
 define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
-; NEON-LABEL:    @store_factor2(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i8>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[V0]], <8 x i8> [[V1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-MVE-NEXT:    store <16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NONE-NEXT:    store <16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
@@ -78,16 +120,29 @@ define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
 }
 
 define void @store_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
-; NEON-LABEL:    @store_factor3(
-; NEON:            [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_factor3(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor3(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor3(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s1 = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -97,17 +152,30 @@ define void @store_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x
 }
 
 define void @store_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-; NEON-LABEL:    @store_factor4(
-; NEON:            [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_factor4(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor4(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-MVE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor4(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NONE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -117,15 +185,22 @@ define void @store_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x
 }
 
 define void @load_ptrvec_factor2(<4 x i32*>* %ptr) {
-; NEON-LABEL:    @load_ptrvec_factor2(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <4 x i32*>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_ptrvec_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_ptrvec_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_ptrvec_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32*>, <4 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <4 x i32*> [[INTERLEAVED_VEC]], <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_ptrvec_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32*>, <4 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <4 x i32*> [[INTERLEAVED_VEC]], <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <4 x i32*>, <4 x i32*>* %ptr, align 4
   %v0 = shufflevector <4 x i32*> %interleaved.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
@@ -133,19 +208,30 @@ define void @load_ptrvec_factor2(<4 x i32*>* %ptr) {
 }
 
 define void @load_ptrvec_factor3(<6 x i32*>* %ptr) {
-; NEON-LABEL:    @load_ptrvec_factor3(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <6 x i32*>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
-; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_ptrvec_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_ptrvec_factor3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <6 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_ptrvec_factor3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <6 x i32*>, <6 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_ptrvec_factor3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <6 x i32*>, <6 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <6 x i32*>, <6 x i32*>* %ptr, align 4
   %v0 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 0, i32 3>
@@ -155,21 +241,34 @@ define void @load_ptrvec_factor3(<6 x i32*>* %ptr) {
 }
 
 define void @load_ptrvec_factor4(<8 x i32*>* %ptr) {
-; NEON-LABEL:    @load_ptrvec_factor4(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32*>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 3
-; NEON-NEXT:       [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
-; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
-; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP9:%.*]] = inttoptr <2 x i32> [[TMP8]] to <2 x i32*>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_ptrvec_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_ptrvec_factor4(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 3
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = inttoptr <2 x i32> [[TMP8]] to <2 x i32*>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_ptrvec_factor4(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32*>, <8 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-MVE-NEXT:    [[V3:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_ptrvec_factor4(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32*>, <8 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-NONE-NEXT:    [[V3:%.*]] = shufflevector <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <8 x i32*>, <8 x i32*>* %ptr, align 4
   %v0 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 0, i32 4>
@@ -180,17 +279,24 @@ define void @load_ptrvec_factor4(<8 x i32*>* %ptr) {
 }
 
 define void @store_ptrvec_factor2(<4 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
-; NEON-LABEL:    @store_ptrvec_factor2(
-; NEON-NEXT:       [[TMP1:%.*]] = ptrtoint <2 x i32*> %v0 to <2 x i32>
-; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <2 x i32*> %v1 to <2 x i32>
-; NEON-NEXT:       [[TMP3:%.*]] = bitcast <4 x i32*>* %ptr to i8*
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
-; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_ptrvec_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_ptrvec_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = ptrtoint <2 x i32*> [[V0:%.*]] to <2 x i32>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = ptrtoint <2 x i32*> [[V1:%.*]] to <2 x i32>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_ptrvec_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-MVE-NEXT:    store <4 x i32*> [[INTERLEAVED_VEC]], <4 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_ptrvec_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NONE-NEXT:    store <4 x i32*> [[INTERLEAVED_VEC]], <4 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
   store <4 x i32*> %interleaved.vec, <4 x i32*>* %ptr, align 4
@@ -198,18 +304,31 @@ define void @store_ptrvec_factor2(<4 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %
 }
 
 define void @store_ptrvec_factor3(<6 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
-; NEON-LABEL:    @store_ptrvec_factor3(
-; NEON:            [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i32>
-; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i32>
-; NEON-NEXT:       [[TMP3:%.*]] = bitcast <6 x i32*>* %ptr to i8*
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
-; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_ptrvec_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_ptrvec_factor3(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = ptrtoint <4 x i32*> [[S0]] to <4 x i32>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = ptrtoint <4 x i32*> [[S1]] to <4 x i32>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = bitcast <6 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_ptrvec_factor3(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32*> [[S0]], <4 x i32*> [[S1]], <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-MVE-NEXT:    store <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_ptrvec_factor3(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32*> [[S0]], <4 x i32*> [[S1]], <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-NONE-NEXT:    store <6 x i32*> [[INTERLEAVED_VEC]], <6 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -219,19 +338,32 @@ define void @store_ptrvec_factor3(<6 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %
 }
 
 define void @store_ptrvec_factor4(<8 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
-; NEON-LABEL:    @store_ptrvec_factor4(
-; NEON:            [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i32>
-; NEON-NEXT:       [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i32>
-; NEON-NEXT:       [[TMP3:%.*]] = bitcast <8 x i32*>* %ptr to i8*
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
-; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 6, i32 7>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_ptrvec_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_ptrvec_factor4(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> [[V3:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = ptrtoint <4 x i32*> [[S0]] to <4 x i32>
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = ptrtoint <4 x i32*> [[S1]] to <4 x i32>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32*>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 6, i32 7>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_ptrvec_factor4(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> [[V3:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32*> [[S0]], <4 x i32*> [[S1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-MVE-NEXT:    store <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_ptrvec_factor4(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <2 x i32*> [[V0:%.*]], <2 x i32*> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <2 x i32*> [[V2:%.*]], <2 x i32*> [[V3:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32*> [[S0]], <4 x i32*> [[S1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NONE-NEXT:    store <8 x i32*> [[INTERLEAVED_VEC]], <8 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -241,15 +373,24 @@ define void @store_ptrvec_factor4(<8 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %
 }
 
 define void @load_undef_mask_factor2(<8 x i32>* %ptr) {
-; NEON-LABEL:    @load_undef_mask_factor2(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_undef_mask_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_undef_mask_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_undef_mask_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_undef_mask_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
   %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
@@ -258,16 +399,27 @@ define void @load_undef_mask_factor2(<8 x i32>* %ptr) {
 }
 
 define void @load_undef_mask_factor3(<12 x i32>* %ptr) {
-; NEON-LABEL:    @load_undef_mask_factor3(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_undef_mask_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_undef_mask_factor3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_undef_mask_factor3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_undef_mask_factor3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <12 x i32> [[INTERLEAVED_VEC]], <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <12 x i32>, <12 x i32>* %ptr, align 4
   %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -277,17 +429,30 @@ define void @load_undef_mask_factor3(<12 x i32>* %ptr) {
 }
 
 define void @load_undef_mask_factor4(<16 x i32>* %ptr) {
-; NEON-LABEL:    @load_undef_mask_factor4(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_undef_mask_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_undef_mask_factor4(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_undef_mask_factor4(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_undef_mask_factor4(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
@@ -298,15 +463,22 @@ define void @load_undef_mask_factor4(<16 x i32>* %ptr) {
 }
 
 define void @store_undef_mask_factor2(<8 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
-; NEON-LABEL:    @store_undef_mask_factor2(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_undef_mask_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_undef_mask_factor2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_undef_mask_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_undef_mask_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
@@ -314,16 +486,29 @@ define void @store_undef_mask_factor2(<8 x i32>* %ptr, <4 x i32> %v0, <4 x i32>
 }
 
 define void @store_undef_mask_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
-; NEON-LABEL:    @store_undef_mask_factor3(
-; NEON:            [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_undef_mask_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_undef_mask_factor3(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_undef_mask_factor3(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_undef_mask_factor3(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s1 = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -333,17 +518,30 @@ define void @store_undef_mask_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32>
 }
 
 define void @store_undef_mask_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-; NEON-LABEL:    @store_undef_mask_factor4(
-; NEON:            [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_undef_mask_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_undef_mask_factor4(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_undef_mask_factor4(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-MVE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_undef_mask_factor4(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NONE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -353,16 +551,27 @@ define void @store_undef_mask_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32>
 }
 
 define void @load_address_space(<8 x i32> addrspace(1)* %ptr) {
-; NEON-LABEL:    @load_address_space(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32> addrspace(1)* %ptr to i8 addrspace(1)*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p1i8(i8 addrspace(1)* [[TMP1]], i32 0)
-; NEON-NEXT:       [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_address_space(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_address_space(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32> addrspace(1)* [[PTR:%.*]] to i8 addrspace(1)*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p1i8(i8 addrspace(1)* [[TMP1]], i32 0)
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_address_space(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[PTR:%.*]]
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_address_space(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[PTR:%.*]]
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <8 x i32> [[INTERLEAVED_VEC]], <8 x i32> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
   %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <2 x i32> <i32 0, i32 3>
@@ -372,15 +581,22 @@ define void @load_address_space(<8 x i32> addrspace(1)* %ptr) {
 }
 
 define void @store_address_space(<4 x i32> addrspace(1)* %ptr, <2 x i32> %v0, <2 x i32> %v1) {
-; NEON-LABEL:    @store_address_space(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <4 x i32> addrspace(1)* %ptr to i8 addrspace(1)*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 1>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 2, i32 3>
-; NEON-NEXT:       call void @llvm.arm.neon.vst2.p1i8.v2i32(i8 addrspace(1)* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 0)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_address_space(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_address_space(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> addrspace(1)* [[PTR:%.*]] to i8 addrspace(1)*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> [[V1]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p1i8.v2i32(i8 addrspace(1)* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 0)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_address_space(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-MVE-NEXT:    store <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> addrspace(1)* [[PTR:%.*]]
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_address_space(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NONE-NEXT:    store <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> addrspace(1)* [[PTR:%.*]]
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <2 x i32> %v0, <2 x i32> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
   store <4 x i32> %interleaved.vec, <4 x i32> addrspace(1)* %ptr
@@ -388,9 +604,23 @@ define void @store_address_space(<4 x i32> addrspace(1)* %ptr, <2 x i32> %v0, <2
 }
 
 define void @load_f16_factor2(<8 x half>* %ptr) {
-; ALL-LABEL: @load_f16_factor2(
-; ALL-NOT:     @llvm.arm.neon
-; ALL:         ret void
+; CHECK-NEON-LABEL: @load_f16_factor2(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x half>, <8 x half>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEON-NEXT:    [[V1:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_f16_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x half>, <8 x half>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_f16_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <8 x half>, <8 x half>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <8 x half> [[INTERLEAVED_VEC]], <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <8 x half>, <8 x half>* %ptr, align 4
   %v0 = shufflevector <8 x half> %interleaved.vec, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -399,9 +629,20 @@ define void @load_f16_factor2(<8 x half>* %ptr) {
 }
 
 define void @store_f16_factor2(<8 x half>* %ptr, <4 x half> %v0, <4 x half> %v1) {
-; ALL-LABEL: @store_f16_factor2(
-; ALL-NOT:     @llvm.arm.neon
-; ALL:         ret void
+; CHECK-NEON-LABEL: @store_f16_factor2(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x half> [[V0:%.*]], <4 x half> [[V1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEON-NEXT:    store <8 x half> [[INTERLEAVED_VEC]], <8 x half>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_f16_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x half> [[V0:%.*]], <4 x half> [[V1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-MVE-NEXT:    store <8 x half> [[INTERLEAVED_VEC]], <8 x half>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_f16_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x half> [[V0:%.*]], <4 x half> [[V1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NONE-NEXT:    store <8 x half> [[INTERLEAVED_VEC]], <8 x half>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <4 x half> %v0, <4 x half> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   store <8 x half> %interleaved.vec, <8 x half>* %ptr, align 4
@@ -409,9 +650,20 @@ define void @store_f16_factor2(<8 x half>* %ptr, <4 x half> %v0, <4 x half> %v1)
 }
 
 define void @load_illegal_factor2(<3 x float>* %ptr) nounwind {
-; ALL-LABEL:    @load_illegal_factor2(
-; ALL-NOT:        @llvm.arm.neon
-; ALL:            ret void
+; CHECK-NEON-LABEL: @load_illegal_factor2(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <3 x float>, <3 x float>* [[PTR:%.*]], align 16
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <3 x float> [[INTERLEAVED_VEC]], <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_illegal_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <3 x float>, <3 x float>* [[PTR:%.*]], align 16
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <3 x float> [[INTERLEAVED_VEC]], <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_illegal_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <3 x float>, <3 x float>* [[PTR:%.*]], align 16
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <3 x float> [[INTERLEAVED_VEC]], <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <3 x float>, <3 x float>* %ptr, align 16
   %v0 = shufflevector <3 x float> %interleaved.vec, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
@@ -419,9 +671,20 @@ define void @load_illegal_factor2(<3 x float>* %ptr) nounwind {
 }
 
 define void @store_illegal_factor2(<3 x float>* %ptr, <3 x float> %v0) nounwind {
-; ALL-LABEL: @store_illegal_factor2(
-; ALL-NOT:     @llvm.arm.neon
-; ALL:         ret void
+; CHECK-NEON-LABEL: @store_illegal_factor2(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <3 x float> [[V0:%.*]], <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-NEON-NEXT:    store <3 x float> [[INTERLEAVED_VEC]], <3 x float>* [[PTR:%.*]], align 16
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_illegal_factor2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <3 x float> [[V0:%.*]], <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-MVE-NEXT:    store <3 x float> [[INTERLEAVED_VEC]], <3 x float>* [[PTR:%.*]], align 16
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_illegal_factor2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <3 x float> [[V0:%.*]], <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+; CHECK-NONE-NEXT:    store <3 x float> [[INTERLEAVED_VEC]], <3 x float>* [[PTR:%.*]], align 16
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <3 x float> %v0, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
   store <3 x float> %interleaved.vec, <3 x float>* %ptr, align 16
@@ -429,17 +692,24 @@ define void @store_illegal_factor2(<3 x float>* %ptr, <3 x float> %v0) nounwind
 }
 
 define void @store_general_mask_factor4(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor4(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor4(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor4(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 16, i32 17>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 32, i32 33>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
@@ -447,17 +717,24 @@ define void @store_general_mask_factor4(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i
 }
 
 define void @store_general_mask_factor4_undefbeg(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor4_undefbeg(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor4_undefbeg(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor4_undefbeg(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 16, i32 17>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 32, i32 33>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4_undefbeg(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4_undefbeg(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
@@ -465,17 +742,24 @@ define void @store_general_mask_factor4_undefbeg(<8 x i32>* %ptr, <32 x i32> %v0
 }
 
 define void @store_general_mask_factor4_undefend(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor4_undefend(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor4_undefend(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor4_undefend(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 16, i32 17>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 32, i32 33>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4_undefend(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4_undefend(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
@@ -483,17 +767,24 @@ define void @store_general_mask_factor4_undefend(<8 x i32>* %ptr, <32 x i32> %v0
 }
 
 define void @store_general_mask_factor4_undefmid(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor4_undefmid(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor4_undefmid(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor4_undefmid(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 16, i32 17>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 32, i32 33>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4_undefmid(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4_undefmid(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
@@ -501,17 +792,24 @@ define void @store_general_mask_factor4_undefmid(<8 x i32>* %ptr, <32 x i32> %v0
 }
 
 define void @store_general_mask_factor4_undefmulti(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor4_undefmulti(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor4_undefmulti(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor4_undefmulti(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <2 x i32> <i32 8, i32 9>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor4_undefmulti(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
+; CHECK-MVE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor4_undefmulti(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
+; CHECK-NONE-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
@@ -519,16 +817,23 @@ define void @store_general_mask_factor4_undefmulti(<8 x i32>* %ptr, <32 x i32> %
 }
 
 define void @store_general_mask_factor3(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor3(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -536,16 +841,23 @@ define void @store_general_mask_factor3(<12 x i32>* %ptr, <32 x i32> %v0, <32 x
 }
 
 define void @store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor3_undefmultimid(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor3_undefmultimid(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_undefmultimid(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_undefmultimid(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_undefmultimid(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -553,9 +865,20 @@ define void @store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i3
 }
 
 define void @store_general_mask_factor3_undef_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; ALL-LABEL: @store_general_mask_factor3_undef_fail(
-; ALL-NOT:     @llvm.arm.neon
-; ALL:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_undef_fail(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
+; CHECK-NEON-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_undef_fail(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_undef_fail(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -563,16 +886,23 @@ define void @store_general_mask_factor3_undef_fail(<12 x i32>* %ptr, <32 x i32>
 }
 
 define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor3_undeflane(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor3_undeflane(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_undeflane(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_undeflane(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_undeflane(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -580,9 +910,20 @@ define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> %
 }
 
 define void @store_general_mask_factor3_endstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; ALL-LABEL:    @store_general_mask_factor3_endstart_fail(
-; ALL-NOT:        @llvm.arm.neon
-; ALL:            ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_endstart_fail(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
+; CHECK-NEON-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_endstart_fail(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_endstart_fail(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -590,16 +931,23 @@ define void @store_general_mask_factor3_endstart_fail(<12 x i32>* %ptr, <32 x i3
 }
 
 define void @store_general_mask_factor3_endstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor3_endstart_pass(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor3_endstart_pass(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_endstart_pass(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_endstart_pass(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_endstart_pass(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -607,9 +955,20 @@ define void @store_general_mask_factor3_endstart_pass(<12 x i32>* %ptr, <32 x i3
 }
 
 define void @store_general_mask_factor3_midstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; ALL-LABEL:    @store_general_mask_factor3_midstart_fail(
-; ALL-NOT:        @llvm.arm.neon
-; ALL:            ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_midstart_fail(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-NEON-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_midstart_fail(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_midstart_fail(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -617,16 +976,23 @@ define void @store_general_mask_factor3_midstart_fail(<12 x i32>* %ptr, <32 x i3
 }
 
 define void @store_general_mask_factor3_midstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor3_midstart_pass(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
-; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_general_mask_factor3_midstart_pass(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_general_mask_factor3_midstart_pass(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <12 x i32>* [[PTR:%.*]] to i8*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 32, i32 33, i32 34, i32 35>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[V0]], <32 x i32> [[V1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_general_mask_factor3_midstart_pass(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 1, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-MVE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_general_mask_factor3_midstart_pass(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[V0:%.*]], <32 x i32> [[V1:%.*]], <12 x i32> <i32 undef, i32 32, i32 16, i32 1, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
+; CHECK-NONE-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 1, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -636,35 +1002,54 @@ define void @store_general_mask_factor3_midstart_pass(<12 x i32>* %ptr, <32 x i3
 @g = external global <4 x float>
 
 ; The following does not give a valid interleaved store
-; ALL-LABEL: define void @no_interleave
-; ALL-NOT: call void @llvm.arm.neon.vst2
-; ALL: shufflevector
-; ALL: store
-; ALL: ret void
 define void @no_interleave(<4 x float> %a0) {
+; CHECK-NEON-LABEL: @no_interleave(
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A0]], <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
+; CHECK-NEON-NEXT:    store <4 x float> [[V0]], <4 x float>* @g, align 16
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @no_interleave(
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A0]], <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
+; CHECK-MVE-NEXT:    store <4 x float> [[V0]], <4 x float>* @g, align 16
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @no_interleave(
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A0]], <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
+; CHECK-NONE-NEXT:    store <4 x float> [[V0]], <4 x float>* @g, align 16
+; CHECK-NONE-NEXT:    ret void
+;
   %v0 = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
   store <4 x float> %v0, <4 x float>* @g, align 16
   ret void
 }
 
 define void @load_factor2_wide2(<16 x i32>* %ptr) {
-; NEON-LABEL:    @load_factor2_wide2(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
-; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
-; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
-; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
-; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
-; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor2_wide2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor2_wide2(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2_wide2(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2_wide2(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -673,32 +1058,41 @@ define void @load_factor2_wide2(<16 x i32>* %ptr) {
 }
 
 define void @load_factor2_wide3(<24 x i32>* %ptr) {
-; NEON-LABEL:    @load_factor2_wide3(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
-; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
-; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
-; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
-; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
-; NEON-NEXT:       [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
-; NEON-NEXT:       [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8*
-; NEON-NEXT:       [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP10]], i32 4)
-; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1
-; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0
-; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; NEON-NEXT:       [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor2_wide3(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor2_wide3(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP10]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0
+; CHECK-NEON-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2_wide3(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <24 x i32>, <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2_wide3(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <24 x i32>, <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
   %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
@@ -707,26 +1101,37 @@ define void @load_factor2_wide3(<24 x i32>* %ptr) {
 }
 
 define void @load_factor3_wide(<24 x i32>* %ptr) {
-; NEON-LABEL:    @load_factor3_wide(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP2]], i32 4)
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
-; NEON-NEXT:       [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
-; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP7]], i32 4)
-; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
-; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
-; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
-; NEON-NEXT:       [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor3_wide(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor3_wide(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP7]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor3_wide(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <24 x i32>, <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor3_wide(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <24 x i32>, <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <24 x i32> [[INTERLEAVED_VEC]], <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
   %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
@@ -736,29 +1141,42 @@ define void @load_factor3_wide(<24 x i32>* %ptr) {
 }
 
 define void @load_factor4_wide(<32 x i32>* %ptr) {
-; NEON-LABEL:    @load_factor4_wide(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP2]], i32 4)
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
-; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
-; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
-; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
-; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP8]], i32 4)
-; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3
-; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
-; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
-; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
-; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor4_wide(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor4_wide(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP8]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor4_wide(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <32 x i32>, <32 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-MVE-NEXT:    [[V2:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-MVE-NEXT:    [[V3:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor4_wide(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <32 x i32>, <32 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NONE-NEXT:    [[V2:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-NONE-NEXT:    [[V3:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
   %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -769,21 +1187,28 @@ define void @load_factor4_wide(<32 x i32>* %ptr) {
 }
 
 define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
-; NEON-LABEL:    @store_factor2_wide(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
-; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
-; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
-; NEON-NEXT:       [[TMP7:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_factor2_wide(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_factor2_wide(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor2_wide(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-MVE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor2_wide(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NONE-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
@@ -791,23 +1216,36 @@ define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1)
 }
 
 define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
-; NEON-LABEL:    @store_factor3_wide(
-; NEON:            [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
-; NEON-NEXT:       [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
-; NEON-NEXT:       [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
-; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_factor3_wide(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_factor3_wide(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor3_wide(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-MVE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor3_wide(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NONE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], <24 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -817,25 +1255,38 @@ define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1,
 }
 
 define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
-; NEON-LABEL:    @store_factor4_wide(
-; NEON:            [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 4)
-; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
-; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
-; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; NEON-NEXT:       [[TMP11:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; NEON-NEXT:       [[TMP12:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @store_factor4_wide(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @store_factor4_wide(
+; CHECK-NEON-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEON-NEXT:    call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @store_factor4_wide(
+; CHECK-MVE-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-MVE-NEXT:    store <32 x i32> [[INTERLEAVED_VEC]], <32 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @store_factor4_wide(
+; CHECK-NONE-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NONE-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NONE-NEXT:    store <32 x i32> [[INTERLEAVED_VEC]], <32 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    ret void
 ;
   %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -845,9 +1296,23 @@ define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1,
 }
 
 define void @load_factor2_fp128(<4 x fp128>* %ptr) {
-; ALL-LABEL: @load_factor2_fp128(
-; ALL-NOT:     @llvm.arm.neon
-; ALL:         ret void
+; CHECK-NEON-LABEL: @load_factor2_fp128(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, <4 x fp128>* [[PTR:%.*]], align 16
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEON-NEXT:    [[V1:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2_fp128(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, <4 x fp128>* [[PTR:%.*]], align 16
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2_fp128(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, <4 x fp128>* [[PTR:%.*]], align 16
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16
   %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
@@ -856,27 +1321,36 @@ define void @load_factor2_fp128(<4 x fp128>* %ptr) {
 }
 
 define void @load_factor2_wide_pointer(<16 x i32*>* %ptr) {
-; NEON-LABEL:    @load_factor2_wide_pointer(
-; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32*>* %ptr to i32*
-; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
-; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
-; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
-; NEON-NEXT:       [[TMP4:%.*]] = inttoptr <4 x i32> [[TMP3]] to <4 x i32*>
-; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
-; NEON-NEXT:       [[TMP6:%.*]] = inttoptr <4 x i32> [[TMP5]] to <4 x i32*>
-; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
-; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
-; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP8]], i32 4)
-; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
-; NEON-NEXT:       [[TMP10:%.*]] = inttoptr <4 x i32> [[TMP9]] to <4 x i32*>
-; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
-; NEON-NEXT:       [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x i32*>
-; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NEON-NEXT:       ret void
-; NO_NEON-LABEL: @load_factor2_wide_pointer(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; CHECK-NEON-LABEL: @load_factor2_wide_pointer(
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = bitcast <16 x i32*>* [[PTR:%.*]] to i32*
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; CHECK-NEON-NEXT:    [[TMP4:%.*]] = inttoptr <4 x i32> [[TMP3]] to <4 x i32*>
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = inttoptr <4 x i32> [[TMP5]] to <4 x i32*>
+; CHECK-NEON-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; CHECK-NEON-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; CHECK-NEON-NEXT:    [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP8]], i32 4)
+; CHECK-NEON-NEXT:    [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; CHECK-NEON-NEXT:    [[TMP10:%.*]] = inttoptr <4 x i32> [[TMP9]] to <4 x i32*>
+; CHECK-NEON-NEXT:    [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; CHECK-NEON-NEXT:    [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x i32*>
+; CHECK-NEON-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_factor2_wide_pointer(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32*>, <16 x i32*>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_factor2_wide_pointer(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <16 x i32*>, <16 x i32*>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <16 x i32*> [[INTERLEAVED_VEC]], <16 x i32*> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NONE-NEXT:    ret void
 ;
   %interleaved.vec = load <16 x i32*>, <16 x i32*>* %ptr, align 4
   %v0 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -888,9 +1362,24 @@ define void @load_factor2_wide_pointer(<16 x i32*>* %ptr) {
 ; actually load enough elements to satisfy the shuffle masks. (It would be
 ; possible to produce a vld2.v2i32, but that currently isn't implemented.)
 define void @load_out_of_range(<4 x i32>* %ptr) {
-; ALL-LABEL: @load_out_of_range(
-; ALL-NOT:     @llvm.arm.neon
-; ALL:         ret void
+; CHECK-NEON-LABEL: @load_out_of_range(
+; CHECK-NEON-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32>, <4 x i32>* [[PTR:%.*]], align 4
+; CHECK-NEON-NEXT:    [[V0:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    [[V1:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+; CHECK-NEON-NEXT:    ret void
+;
+; CHECK-MVE-LABEL: @load_out_of_range(
+; CHECK-MVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32>, <4 x i32>* [[PTR:%.*]], align 4
+; CHECK-MVE-NEXT:    [[V0:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    [[V1:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+; CHECK-MVE-NEXT:    ret void
+;
+; CHECK-NONE-LABEL: @load_out_of_range(
+; CHECK-NONE-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <4 x i32>, <4 x i32>* [[PTR:%.*]], align 4
+; CHECK-NONE-NEXT:    [[V0:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    [[V1:%.*]] = shufflevector <4 x i32> [[INTERLEAVED_VEC]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+; CHECK-NONE-NEXT:    ret void
+;
   %interleaved.vec = load <4 x i32>, <4 x i32>* %ptr, align 4
   %v0 = shufflevector <4 x i32> %interleaved.vec, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %v1 = shufflevector <4 x i32> %interleaved.vec, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
index 7c73323880f..c25071156a2 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
@@ -7,11 +7,23 @@
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1-m.main-none-eabi"
 
+; Factor 2
+
 %i8.2 = type {i8, i8}
 define void @i8_factor_2(%i8.2* %data, i64 %n) #0 {
 entry:
   br label %for.body
 
+; VF_2-LABEL:  Checking a loop in "i8_factor_2"
+; VF_2:          Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_4-LABEL:  Checking a loop in "i8_factor_2"
+; VF_4:          Found an estimated cost of 72 for VF 4 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_4-NEXT:     Found an estimated cost of 40 for VF 4 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_8-LABEL:  Checking a loop in "i8_factor_2"
 ; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
@@ -43,6 +55,11 @@ define void @i16_factor_2(%i16.2* %data, i64 %n) #0 {
 entry:
   br label %for.body
 
+; VF_2-LABEL:  Checking a loop in "i16_factor_2"
+; VF_2:          Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_4-LABEL:  Checking a loop in "i16_factor_2"
 ; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
@@ -115,11 +132,193 @@ for.end:
   ret void
 }
 
+%i64.2 = type {i64, i64}
+define void @i64_factor_2(%i64.2* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "i64_factor_2"
+; VF_2:          Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 16 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-LABEL:  Checking a loop in "i64_factor_2"
+; VF_4:          Found an estimated cost of 80 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 48 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-LABEL:  Checking a loop in "i64_factor_2"
+; VF_8:          Found an estimated cost of 288 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 160 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-LABEL: Checking a loop in "i64_factor_2"
+; VF_16:         Found an estimated cost of 1088 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 576 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 1
+  %tmp2 = load i64, i64* %tmp0, align 8
+  %tmp3 = load i64, i64* %tmp1, align 8
+  store i64 0, i64* %tmp0, align 8
+  store i64 0, i64* %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f16.2 = type {half, half}
+define void @f16_factor_2(%f16.2* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f16_factor_2"
+; VF_2:          Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_4-LABEL:  Checking a loop in "f16_factor_2"
+; VF_4:          Found an estimated cost of 72 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 40 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-LABEL:  Checking a loop in "f16_factor_2"
+; VF_8:          Found an estimated cost of 272 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 144 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "f16_factor_2"
+; VF_16:         Found an estimated cost of 1056 for VF 16 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 544 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f16.2, %f16.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f16.2, %f16.2* %data, i64 %i, i32 1
+  %tmp2 = load half, half* %tmp0, align 2
+  %tmp3 = load half, half* %tmp1, align 2
+  store half 0.0, half* %tmp0, align 2
+  store half 0.0, half* %tmp1, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f32.2 = type {float, float}
+define void @f32_factor_2(%f32.2* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f32_factor_2"
+; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load float, float* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load float, float* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_4-LABEL:  Checking a loop in "f32_factor_2"
+; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load float, float* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load float, float* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_8-LABEL:  Checking a loop in "f32_factor_2"
+; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load float, float* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load float, float* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "f32_factor_2"
+; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load float, float* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load float, float* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f32.2, %f32.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f32.2, %f32.2* %data, i64 %i, i32 1
+  %tmp2 = load float, float* %tmp0, align 4
+  %tmp3 = load float, float* %tmp1, align 4
+  store float 0.0, float* %tmp0, align 4
+  store float 0.0, float* %tmp1, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f64.2 = type {double, double}
+define void @f64_factor_2(%f64.2* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f64_factor_2"
+; VF_2:          Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load double, double* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load double, double* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_4-LABEL:  Checking a loop in "f64_factor_2"
+; VF_4:          Found an estimated cost of 72 for VF 4 For instruction: %tmp2 = load double, double* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load double, double* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 40 for VF 4 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_8-LABEL:  Checking a loop in "f64_factor_2"
+; VF_8:          Found an estimated cost of 272 for VF 8 For instruction: %tmp2 = load double, double* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load double, double* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 144 for VF 8 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_16-LABEL: Checking a loop in "f64_factor_2"
+; VF_16:         Found an estimated cost of 1056 for VF 16 For instruction: %tmp2 = load double, double* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load double, double* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 544 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f64.2, %f64.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f64.2, %f64.2* %data, i64 %i, i32 1
+  %tmp2 = load double, double* %tmp0, align 8
+  %tmp3 = load double, double* %tmp1, align 8
+  store double 0.0, double* %tmp0, align 8
+  store double 0.0, double* %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+
+
+; Factor 3
+
 %i8.3 = type {i8, i8, i8}
 define void @i8_factor_3(%i8.3* %data, i64 %n) #0 {
 entry:
   br label %for.body
 
+; VF_2-LABEL:  Checking a loop in "i8_factor_3"
+; VF_2:          Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, i8* %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store i8 0, i8* %tmp2, align 1
+; VF_4-LABEL:  Checking a loop in "i8_factor_3"
+; VF_4:          Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, i8* %tmp2, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_4-NEXT:     Found an estimated cost of 60 for VF 4 For instruction: store i8 0, i8* %tmp2, align 1
 ; VF_8-LABEL:  Checking a loop in "i8_factor_3"
 ; VF_8:          Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
@@ -158,6 +357,13 @@ define void @i16_factor_3(%i16.3* %data, i64 %n) #0 {
 entry:
   br label %for.body
 
+; VF_2-LABEL:  Checking a loop in "i16_factor_3"
+; VF_2:          Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i16, i16* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, i16* %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store i16 0, i16* %tmp2, align 2
 ; VF_4-LABEL:  Checking a loop in "i16_factor_3"
 ; VF_4:          Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i16, i16* %tmp1, align 2
@@ -250,12 +456,240 @@ for.end:
   ret void
 }
 
+%i64.3 = type {i64, i64, i64}
+define void @i64_factor_3(%i64.3* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "i64_factor_3"
+; VF_2:          Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_4-LABEL:  Checking a loop in "i64_factor_3"
+; VF_4:          Found an estimated cost of 120 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 72 for VF 4 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_8-LABEL:  Checking a loop in "i64_factor_3"
+; VF_8:          Found an estimated cost of 432 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 240 for VF 8 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_16-LABEL: Checking a loop in "i64_factor_3"
+; VF_16:         Found an estimated cost of 1632 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 864 for VF 16 For instruction: store i64 0, i64* %tmp2, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i64.3, %i64.3* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i64.3, %i64.3* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %i64.3, %i64.3* %data, i64 %i, i32 2
+  %tmp3 = load i64, i64* %tmp0, align 8
+  %tmp4 = load i64, i64* %tmp1, align 8
+  %tmp5 = load i64, i64* %tmp2, align 8
+  store i64 0, i64* %tmp0, align 8
+  store i64 0, i64* %tmp1, align 8
+  store i64 0, i64* %tmp2, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f16.3 = type {half, half, half}
+define void @f16_factor_3(%f16.3* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f16_factor_3"
+; VF_2:          Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load half, half* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load half, half* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, half* %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_4-LABEL:  Checking a loop in "f16_factor_3"
+; VF_4:          Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load half, half* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, half* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load half, half* %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 60 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_8-LABEL:  Checking a loop in "f16_factor_3"
+; VF_8:          Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load half, half* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load half, half* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, half* %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 216 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_16-LABEL: Checking a loop in "f16_factor_3"
+; VF_16:         Found an estimated cost of 1584 for VF 16 For instruction: %tmp3 = load half, half* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load half, half* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, half* %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 816 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f16.3, %f16.3* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f16.3, %f16.3* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %f16.3, %f16.3* %data, i64 %i, i32 2
+  %tmp3 = load half, half* %tmp0, align 2
+  %tmp4 = load half, half* %tmp1, align 2
+  %tmp5 = load half, half* %tmp2, align 2
+  store half 0.0, half* %tmp0, align 2
+  store half 0.0, half* %tmp1, align 2
+  store half 0.0, half* %tmp2, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f32.3 = type {float, float, float}
+define void @f32_factor_3(%f32.3* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f32_factor_3"
+; VF_2:          Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load float, float* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load float, float* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load float, float* %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_4-LABEL:  Checking a loop in "f32_factor_3"
+; VF_4:          Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load float, float* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load float, float* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, float* %tmp2, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 60 for VF 4 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_8-LABEL:  Checking a loop in "f32_factor_3"
+; VF_8:          Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load float, float* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load float, float* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, float* %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 216 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_16-LABEL: Checking a loop in "f32_factor_3"
+; VF_16:         Found an estimated cost of 1584 for VF 16 For instruction: %tmp3 = load float, float* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load float, float* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, float* %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 816 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f32.3, %f32.3* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f32.3, %f32.3* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %f32.3, %f32.3* %data, i64 %i, i32 2
+  %tmp3 = load float, float* %tmp0, align 4
+  %tmp4 = load float, float* %tmp1, align 4
+  %tmp5 = load float, float* %tmp2, align 4
+  store float 0.0, float* %tmp0, align 4
+  store float 0.0, float* %tmp1, align 4
+  store float 0.0, float* %tmp2, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f64.3 = type {double, double, double}
+define void @f64_factor_3(%f64.3* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f64_factor_3"
+; VF_2:          Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load double, double* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load double, double* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, double* %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 18 for VF 2 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_4-LABEL:  Checking a loop in "f64_factor_3"
+; VF_4:          Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load double, double* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load double, double* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, double* %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 60 for VF 4 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_8-LABEL:  Checking a loop in "f64_factor_3"
+; VF_8:          Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load double, double* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load double, double* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, double* %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 216 for VF 8 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_16-LABEL: Checking a loop in "f64_factor_3"
+; VF_16:         Found an estimated cost of 1584 for VF 16 For instruction: %tmp3 = load double, double* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load double, double* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, double* %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 816 for VF 16 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f64.3, %f64.3* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f64.3, %f64.3* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %f64.3, %f64.3* %data, i64 %i, i32 2
+  %tmp3 = load double, double* %tmp0, align 8
+  %tmp4 = load double, double* %tmp1, align 8
+  %tmp5 = load double, double* %tmp2, align 8
+  store double 0.0, double* %tmp0, align 8
+  store double 0.0, double* %tmp1, align 8
+  store double 0.0, double* %tmp2, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+
+; Factor 4
 
 %i8.4 = type {i8, i8, i8, i8}
 define void @i8_factor_4(%i8.4* %data, i64 %n) #0 {
 entry:
   br label %for.body
 
+; VF_2-LABEL:  Checking a loop in "i8_factor_4"
+; VF_2:          Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i8, i8* %tmp3, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store i8 0, i8* %tmp3, align 1
+; VF_4-LABEL: Checking a loop in "i8_factor_4"
+; VF_4:         Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i8, i8* %tmp3, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp2, align 1
+; VF_4-NEXT:    Found an estimated cost of 80 for VF 4 For instruction: store i8 0, i8* %tmp3, align 1
 ; VF_8-LABEL:  Checking a loop in "i8_factor_4"
 ; VF_8:          Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
@@ -301,6 +735,15 @@ define void @i16_factor_4(%i16.4* %data, i64 %n) #0 {
 entry:
   br label %for.body
 
+; VF_2-LABEL:  Checking a loop in "i16_factor_4"
+; VF_2:          Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i16, i16* %tmp3, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store i16 0, i16* %tmp3, align 2
 ; VF_4-LABEL:  Checking a loop in "i16_factor_4"
 ; VF_4:          Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
@@ -413,4 +856,256 @@ for.end:
   ret void
 }
 
+%i64.4 = type {i64, i64, i64, i64}
+define void @i64_factor_4(%i64.4* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "i64_factor_4"
+; VF_2:          Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i64, i64* %tmp3, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 32 for VF 2 For instruction: store i64 0, i64* %tmp3, align 8
+; VF_4-LABEL:  Checking a loop in "i64_factor_4"
+; VF_4:          Found an estimated cost of 160 for VF 4 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i64, i64* %tmp3, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 96 for VF 4 For instruction: store i64 0, i64* %tmp3, align 8
+; VF_8-LABEL:  Checking a loop in "i64_factor_4"
+; VF_8:          Found an estimated cost of 576 for VF 8 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i64, i64* %tmp3, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 320 for VF 8 For instruction: store i64 0, i64* %tmp3, align 8
+; VF_16-LABEL: Checking a loop in "i64_factor_4"
+; VF_16:         Found an estimated cost of 2176 for VF 16 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i64, i64* %tmp3, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 1152 for VF 16 For instruction: store i64 0, i64* %tmp3, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i64.4, %i64.4* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i64.4, %i64.4* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %i64.4, %i64.4* %data, i64 %i, i32 2
+  %tmp3 = getelementptr inbounds %i64.4, %i64.4* %data, i64 %i, i32 3
+  %tmp4 = load i64, i64* %tmp0, align 8
+  %tmp5 = load i64, i64* %tmp1, align 8
+  %tmp6 = load i64, i64* %tmp2, align 8
+  %tmp7 = load i64, i64* %tmp3, align 8
+  store i64 0, i64* %tmp0, align 8
+  store i64 0, i64* %tmp1, align 8
+  store i64 0, i64* %tmp2, align 8
+  store i64 0, i64* %tmp3, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f16.4 = type {half, half, half, half}
+define void @f16_factor_4(%f16.4* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f16_factor_4"
+; VF_2:          Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, half* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load half, half* %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load half, half* %tmp3, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store half 0xH0000, half* %tmp3, align 2
+; VF_4-LABEL:  Checking a loop in "f16_factor_4"
+; VF_4:          Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load half, half* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load half, half* %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load half, half* %tmp3, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 80 for VF 4 For instruction: store half 0xH0000, half* %tmp3, align 2
+; VF_8-LABEL:  Checking a loop in "f16_factor_4"
+; VF_8:          Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, half* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load half, half* %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load half, half* %tmp3, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 288 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2
+; VF_16-LABEL: Checking a loop in "f16_factor_4"
+; VF_16:         Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, half* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load half, half* %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load half, half* %tmp3, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 2
+  %tmp3 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 3
+  %tmp4 = load half, half* %tmp0, align 2
+  %tmp5 = load half, half* %tmp1, align 2
+  %tmp6 = load half, half* %tmp2, align 2
+  %tmp7 = load half, half* %tmp3, align 2
+  store half 0.0, half* %tmp0, align 2
+  store half 0.0, half* %tmp1, align 2
+  store half 0.0, half* %tmp2, align 2
+  store half 0.0, half* %tmp3, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f32.4 = type {float, float, float, float}
+define void @f32_factor_4(%f32.4* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f32_factor_4"
+; VF_2:          Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load float, float* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load float, float* %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load float, float* %tmp3, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_4-LABEL:  Checking a loop in "f32_factor_4"
+; VF_4:          Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, float* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load float, float* %tmp2, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load float, float* %tmp3, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_4-NEXT:     Found an estimated cost of 80 for VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_8-LABEL:  Checking a loop in "f32_factor_4"
+; VF_8:          Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, float* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load float, float* %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load float, float* %tmp3, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 288 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_16-LABEL: Checking a loop in "f32_factor_4"
+; VF_16:         Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, float* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load float, float* %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load float, float* %tmp3, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 2
+  %tmp3 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 3
+  %tmp4 = load float, float* %tmp0, align 4
+  %tmp5 = load float, float* %tmp1, align 4
+  %tmp6 = load float, float* %tmp2, align 4
+  %tmp7 = load float, float* %tmp3, align 4
+  store float 0.0, float* %tmp0, align 4
+  store float 0.0, float* %tmp1, align 4
+  store float 0.0, float* %tmp2, align 4
+  store float 0.0, float* %tmp3, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%f64.4 = type {double, double, double, double}
+define void @f64_factor_4(%f64.4* %data, i64 %n) #0 {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "f64_factor_4"
+; VF_2:          Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load double, double* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, double* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load double, double* %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load double, double* %tmp3, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store double 0.000000e+00, double* %tmp3, align 8
+; VF_4-LABEL:  Checking a loop in "f64_factor_4"
+; VF_4:          Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load double, double* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, double* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load double, double* %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load double, double* %tmp3, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 80 for VF 4 For instruction: store double 0.000000e+00, double* %tmp3, align 8
+; VF_8-LABEL:  Checking a loop in "f64_factor_4"
+; VF_8:          Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load double, double* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, double* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load double, double* %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load double, double* %tmp3, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 288 for VF 8 For instruction: store double 0.000000e+00, double* %tmp3, align 8
+; VF_16-LABEL: Checking a loop in "f64_factor_4"
+; VF_16:         Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load double, double* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, double* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load double, double* %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load double, double* %tmp3, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store double 0.000000e+00, double* %tmp3, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %f64.4, %f64.4* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %f64.4, %f64.4* %data, i64 %i, i32 1
+  %tmp2 = getelementptr inbounds %f64.4, %f64.4* %data, i64 %i, i32 2
+  %tmp3 = getelementptr inbounds %f64.4, %f64.4* %data, i64 %i, i32 3
+  %tmp4 = load double, double* %tmp0, align 8
+  %tmp5 = load double, double* %tmp1, align 8
+  %tmp6 = load double, double* %tmp2, align 8
+  %tmp7 = load double, double* %tmp3, align 8
+  store double 0.0, double* %tmp0, align 8
+  store double 0.0, double* %tmp1, align 8
+  store double 0.0, double* %tmp2, align 8
+  store double 0.0, double* %tmp3, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
 attributes #0 = { "target-features"="+mve.fp" }
author	David Green <david.green@arm.com>	2019-11-19 18:17:46 +0000
committer	David Green <david.green@arm.com>	2019-11-19 18:37:30 +0000
commit	411bfe476b758c09a0c9d4b3176e46f0a70de3bb (patch)
tree	0f17f58eb094d88ecb96b468ddb104a44fd26b41 /llvm/test/Transforms
parent	b91f798fde42668a3a7b361c015deb787a46720d (diff)
download	bcm5719-llvm-411bfe476b758c09a0c9d4b3176e46f0a70de3bb.tar.gz bcm5719-llvm-411bfe476b758c09a0c9d4b3176e46f0a70de3bb.zip