10 files changed, 299 insertions, 805 deletions
diff --git a/clang/test/CodeGen/64bit-swiftcall.c b/clang/test/CodeGen/64bit-swiftcall.c
index f7d27d830e2..7486e44406e 100644
--- a/clang/test/CodeGen/64bit-swiftcall.c
+++ b/clang/test/CodeGen/64bit-swiftcall.c
@@ -108,9 +108,7 @@ typedef struct {
 TEST(struct_1);
 // CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_1() {{.*}}{
 // CHECK:   [[RET:%.*]] = alloca [[STRUCT1:%.*]], align 4
-// CHECK:   [[VAR:%.*]] = alloca [[STRUCT1]], align 4
 // CHECK:   call void @llvm.memset
-// CHECK:   call void @llvm.memcpy
 // CHECK:   [[CAST:%.*]] = bitcast [[STRUCT1]]* %retval to { i64, i64 }*
 // CHECK:   [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0
 // CHECK:   [[T0:%.*]] = load i64, i64* [[GEP0]], align 4
@@ -158,12 +156,8 @@ typedef struct {
 TEST(struct_2);
 // CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_2() {{.*}}{
 // CHECK:   [[RET:%.*]] = alloca [[STRUCT2_TYPE]], align 4
-// CHECK:   [[VAR:%.*]] = alloca [[STRUCT2_TYPE]], align 4
-// CHECK:   [[CASTVAR:%.*]] = bitcast {{.*}} [[VAR]]
+// CHECK:   [[CASTVAR:%.*]] = bitcast {{.*}} [[RET]]
 // CHECK:   call void @llvm.memcpy{{.*}}({{.*}}[[CASTVAR]], {{.*}}[[STRUCT2_RESULT]]
-// CHECK:   [[CASTRET:%.*]] = bitcast {{.*}} [[RET]]
-// CHECK:   [[CASTVAR:%.*]] = bitcast {{.*}} [[VAR]]
-// CHECK:   call void @llvm.memcpy{{.*}}({{.*}}[[CASTRET]], {{.*}}[[CASTVAR]]
 // CHECK:   [[CAST:%.*]] = bitcast [[STRUCT2_TYPE]]* [[RET]] to { i64, i64 }*
 // CHECK:   [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0
 // CHECK:   [[T0:%.*]] = load i64, i64* [[GEP0]], align 4
@@ -214,12 +208,8 @@ typedef struct {
 TEST(struct_misaligned_1)
 // CHECK-LABEL: define swiftcc i64 @return_struct_misaligned_1()
 // CHECK:  [[RET:%.*]] = alloca [[STRUCT:%.*]], align 1
-// CHECK:  [[RES:%.*]] = alloca [[STRUCT]], align 1
-// CHECK:  [[CAST:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
+// CHECK:  [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8*
 // CHECK:  call void @llvm.memset{{.*}}(i8* align 1 [[CAST]], i8 0, i64 5
-// CHECK:  [[CASTRET:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8*
-// CHECK:  [[CASTRES:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK:  call void @llvm.memcpy{{.*}}(i8* align 1 [[CASTRET]], i8* align 1 [[CASTRES]], i64 5
 // CHECK:  [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to { i64 }*
 // CHECK:  [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
 // CHECK:  [[R0:%.*]] = load i64, i64* [[GEP]], align 1
@@ -267,12 +257,8 @@ typedef union {
 TEST(union_het_fp)
 // CHECK-LABEL: define swiftcc i64 @return_union_het_fp()
 // CHECK:  [[RET:%.*]] = alloca [[UNION:%.*]], align 8
-// CHECK:  [[RES:%.*]] = alloca [[UNION]], align 8
-// CHECK:  [[CAST:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
+// CHECK:  [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to i8*
 // CHECK:  call void @llvm.memcpy{{.*}}(i8* align 8 [[CAST]]
-// CHECK:  [[CASTRET:%.*]] = bitcast [[UNION]]* [[RET]] to i8*
-// CHECK:  [[CASTRES:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK:  call void @llvm.memcpy{{.*}}(i8* align 8 [[CASTRET]], i8* align 8 [[CASTRES]]
 // CHECK:  [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to { i64 }*
 // CHECK:  [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
 // CHECK:  [[R0:%.*]] = load i64, i64* [[GEP]], align 8
diff --git a/clang/test/CodeGen/aarch64-neon-perm.c b/clang/test/CodeGen/aarch64-neon-perm.c
index c24447bf514..c5d5ab18070 100644
--- a/clang/test/CodeGen/aarch64-neon-perm.c
+++ b/clang/test/CodeGen/aarch64-neon-perm.c
@@ -888,18 +888,14 @@ poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) {
 
 // CHECK-LABEL: @test_vuzp_s8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 // CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 // CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
@@ -912,9 +908,8 @@ int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
 
 // CHECK-LABEL: @test_vuzp_s16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -923,9 +918,6 @@ int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 // CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
@@ -938,9 +930,8 @@ int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
 
 // CHECK-LABEL: @test_vuzp_s32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -949,9 +940,6 @@ int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
 // CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
@@ -964,18 +952,14 @@ int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
 
 // CHECK-LABEL: @test_vuzp_u8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 // CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 // CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
@@ -988,9 +972,8 @@ uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
 
 // CHECK-LABEL: @test_vuzp_u16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -999,9 +982,6 @@ uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 // CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
@@ -1014,9 +994,8 @@ uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
 
 // CHECK-LABEL: @test_vuzp_u32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -1025,9 +1004,6 @@ uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
 // CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
@@ -1040,9 +1016,8 @@ uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
 
 // CHECK-LABEL: @test_vuzp_f32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
@@ -1051,9 +1026,6 @@ uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
 // CHECK:   store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
@@ -1066,18 +1038,14 @@ float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
 
 // CHECK-LABEL: @test_vuzp_p8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 // CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 // CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
@@ -1090,9 +1058,8 @@ poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
 
 // CHECK-LABEL: @test_vuzp_p16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -1101,9 +1068,6 @@ poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 // CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
@@ -1116,18 +1080,14 @@ poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
 
 // CHECK-LABEL: @test_vuzpq_s8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
 // CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
@@ -1140,9 +1100,8 @@ int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
 
 // CHECK-LABEL: @test_vuzpq_s16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -1151,9 +1110,6 @@ int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
@@ -1166,9 +1122,8 @@ int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
 
 // CHECK-LABEL: @test_vuzpq_s32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -1177,9 +1132,6 @@ int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 // CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
@@ -1192,18 +1144,14 @@ int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
 
 // CHECK-LABEL: @test_vuzpq_u8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
 // CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
@@ -1216,9 +1164,8 @@ uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
 
 // CHECK-LABEL: @test_vuzpq_u16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -1227,9 +1174,6 @@ uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
@@ -1242,9 +1186,8 @@ uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
 
 // CHECK-LABEL: @test_vuzpq_u32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -1253,9 +1196,6 @@ uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 // CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
@@ -1268,9 +1208,8 @@ uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
 
 // CHECK-LABEL: @test_vuzpq_f32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
@@ -1279,9 +1218,6 @@ uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 // CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
@@ -1294,18 +1230,14 @@ float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
 
 // CHECK-LABEL: @test_vuzpq_p8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
 // CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
@@ -1318,9 +1250,8 @@ poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
 
 // CHECK-LABEL: @test_vuzpq_p16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -1329,9 +1260,6 @@ poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
@@ -1344,18 +1272,14 @@ poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
 
 // CHECK-LABEL: @test_vzip_s8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
 // CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
@@ -1368,9 +1292,8 @@ int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
 
 // CHECK-LABEL: @test_vzip_s16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -1379,9 +1302,6 @@ int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
@@ -1394,9 +1314,8 @@ int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
 
 // CHECK-LABEL: @test_vzip_s32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -1405,9 +1324,6 @@ int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
 // CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
@@ -1420,18 +1336,14 @@ int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
 
 // CHECK-LABEL: @test_vzip_u8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
 // CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
@@ -1444,9 +1356,8 @@ uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
 
 // CHECK-LABEL: @test_vzip_u16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -1455,9 +1366,6 @@ uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
@@ -1470,9 +1378,8 @@ uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
 
 // CHECK-LABEL: @test_vzip_u32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -1481,9 +1388,6 @@ uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
 // CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
@@ -1496,9 +1400,8 @@ uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
 
 // CHECK-LABEL: @test_vzip_f32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
@@ -1507,9 +1410,6 @@ uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
 // CHECK:   store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
@@ -1522,18 +1422,14 @@ float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
 
 // CHECK-LABEL: @test_vzip_p8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
 // CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
@@ -1546,9 +1442,8 @@ poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
 
 // CHECK-LABEL: @test_vzip_p16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -1557,9 +1452,6 @@ poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
@@ -1572,18 +1464,14 @@ poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
 
 // CHECK-LABEL: @test_vzipq_s8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
 // CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
@@ -1596,9 +1484,8 @@ int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
 
 // CHECK-LABEL: @test_vzipq_s16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -1607,9 +1494,6 @@ int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
@@ -1622,9 +1506,8 @@ int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
 
 // CHECK-LABEL: @test_vzipq_s32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -1633,9 +1516,6 @@ int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
 // CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
@@ -1648,18 +1528,14 @@ int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
 
 // CHECK-LABEL: @test_vzipq_u8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
 // CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
@@ -1672,9 +1548,8 @@ uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
 
 // CHECK-LABEL: @test_vzipq_u16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -1683,9 +1558,6 @@ uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
@@ -1698,9 +1570,8 @@ uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
 
 // CHECK-LABEL: @test_vzipq_u32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -1709,9 +1580,6 @@ uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
 // CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
@@ -1724,9 +1592,8 @@ uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
 
 // CHECK-LABEL: @test_vzipq_f32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
@@ -1735,9 +1602,6 @@ uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
 // CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
@@ -1750,18 +1614,14 @@ float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
 
 // CHECK-LABEL: @test_vzipq_p8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
 // CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
@@ -1774,9 +1634,8 @@ poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
 
 // CHECK-LABEL: @test_vzipq_p16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -1785,9 +1644,6 @@ poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
@@ -1800,18 +1656,14 @@ poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
 
 // CHECK-LABEL: @test_vtrn_s8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
 // CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
 // CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
@@ -1824,9 +1676,8 @@ int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
 
 // CHECK-LABEL: @test_vtrn_s16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -1835,9 +1686,6 @@ int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
 // CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
@@ -1850,9 +1698,8 @@ int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
 
 // CHECK-LABEL: @test_vtrn_s32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -1861,9 +1708,6 @@ int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
 // CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
@@ -1876,18 +1720,14 @@ int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
 
 // CHECK-LABEL: @test_vtrn_u8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
 // CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
 // CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
@@ -1900,9 +1740,8 @@ uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
 
 // CHECK-LABEL: @test_vtrn_u16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -1911,9 +1750,6 @@ uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
 // CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
@@ -1926,9 +1762,8 @@ uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
 
 // CHECK-LABEL: @test_vtrn_u32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -1937,9 +1772,6 @@ uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
 // CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
@@ -1952,9 +1784,8 @@ uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
 
 // CHECK-LABEL: @test_vtrn_f32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
@@ -1963,9 +1794,6 @@ uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
 // CHECK:   store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
@@ -1978,18 +1806,14 @@ float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
 
 // CHECK-LABEL: @test_vtrn_p8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
 // CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
 // CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
@@ -2002,9 +1826,8 @@ poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
 
 // CHECK-LABEL: @test_vtrn_p16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -2013,9 +1836,6 @@ poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
 // CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
@@ -2028,18 +1848,14 @@ poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
 
 // CHECK-LABEL: @test_vtrnq_s8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
 // CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
@@ -2052,9 +1868,8 @@ int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
 
 // CHECK-LABEL: @test_vtrnq_s16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -2063,9 +1878,6 @@ int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
@@ -2078,9 +1890,8 @@ int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
 
 // CHECK-LABEL: @test_vtrnq_s32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -2089,9 +1900,6 @@ int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
 // CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
@@ -2104,18 +1912,14 @@ int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
 
 // CHECK-LABEL: @test_vtrnq_u8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
 // CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
@@ -2128,9 +1932,8 @@ uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
 
 // CHECK-LABEL: @test_vtrnq_u16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -2139,9 +1942,6 @@ uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
@@ -2154,9 +1954,8 @@ uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
 
 // CHECK-LABEL: @test_vtrnq_u32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -2165,9 +1964,6 @@ uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
 // CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
@@ -2180,9 +1976,8 @@ uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
 
 // CHECK-LABEL: @test_vtrnq_f32(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
@@ -2191,9 +1986,6 @@ uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
 // CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
@@ -2206,18 +1998,14 @@ float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
 
 // CHECK-LABEL: @test_vtrnq_p8(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
 // CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
@@ -2230,9 +2018,8 @@ poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
 
 // CHECK-LABEL: @test_vtrnq_p16(
 // CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -2241,9 +2028,6 @@ poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
index c475816ad9f..814a959152a 100644
--- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -1348,16 +1348,13 @@ float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
 // CHECK-LABEL: test_vzip_f16
 // CHECK:   [[RETVAL:%.*]]  = alloca %struct.float16x4x2_t, align 8
 // CHECK:   [[__RET_I:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP1:%.*]]  = bitcast i8* [[TMP0]] to <4 x half>*
 // CHECK:   [[VZIP0_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
 // CHECK:   store <4 x half> [[VZIP0_I]], <4 x half>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
 // CHECK:   store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false)
 float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
   return vzip_f16(a, b);
 }
@@ -1365,16 +1362,13 @@ float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
 // CHECK-LABEL: test_vzipq_f16
 // CHECK:   [[RETVAL:%.*]]  = alloca %struct.float16x8x2_t, align 16
 // CHECK:   [[__RET_I:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP1:%.*]]  = bitcast i8* [[TMP0]] to <8 x half>*
 // CHECK:   [[VZIP0_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
 // CHECK:   store <8 x half> [[VZIP0_I]], <8 x half>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 // CHECK:   store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false)
 float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
   return vzipq_f16(a, b);
 }
@@ -1382,16 +1376,13 @@ float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
 // CHECK-LABEL: test_vuzp_f16
 // CHECK:   [[RETVAL:%.*]]  = alloca %struct.float16x4x2_t, align 8
 // CHECK:   [[__RET_I:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP1:%.*]]  = bitcast i8* [[TMP0]] to <4 x half>*
 // CHECK:   [[VZIP0_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 // CHECK:   store <4 x half> [[VZIP0_I]], <4 x half>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 // CHECK:   store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false)
 float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
   return vuzp_f16(a, b);
 }
@@ -1399,16 +1390,13 @@ float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
 // CHECK-LABEL: test_vuzpq_f16
 // CHECK:   [[RETVAL:%.*]]  = alloca %struct.float16x8x2_t, align 16
 // CHECK:   [[__RET_I:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP1:%.*]]  = bitcast i8* [[TMP0]] to <8 x half>*
 // CHECK:   [[VZIP0_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 // CHECK:   store <8 x half> [[VZIP0_I]], <8 x half>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 // CHECK:   store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false)
 float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
   return vuzpq_f16(a, b);
 }
@@ -1416,16 +1404,13 @@ float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
 // CHECK-LABEL: test_vtrn_f16
 // CHECK:   [[RETVAL:%.*]]  = alloca %struct.float16x4x2_t, align 8
 // CHECK:   [[__RET_I:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP1:%.*]]  = bitcast i8* [[TMP0]] to <4 x half>*
 // CHECK:   [[VZIP0_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
 // CHECK:   store <4 x half> [[VZIP0_I]], <4 x half>* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
 // CHECK:   store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false)
 float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
   return vtrn_f16(a, b);
 }
@@ -1433,16 +1418,13 @@ float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
 // CHECK-LABEL: test_vtrnq_f16
 // CHECK:   [[RETVAL:%.*]]  = alloca %struct.float16x8x2_t, align 16
 // CHECK:   [[__RET_I:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP1:%.*]]  = bitcast i8* [[TMP0]] to <8 x half>*
 // CHECK:   [[VZIP0_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
 // CHECK:   store <8 x half> [[VZIP0_I]], <8 x half>* [[TMP1]] 
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32>  <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
 // CHECK:   store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false)
 float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
   return vtrnq_f16(a, b);
 }
diff --git a/clang/test/CodeGen/aggregate-assign-call.c b/clang/test/CodeGen/aggregate-assign-call.c
index 3539223f214..83983722db9 100644
--- a/clang/test/CodeGen/aggregate-assign-call.c
+++ b/clang/test/CodeGen/aggregate-assign-call.c
@@ -18,8 +18,6 @@ struct S bar() {
   // O0-NOT: @llvm.lifetime.end
 
   struct S r;
-  // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* nonnull %[[R_TMP:[^)]+]])
-
   // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* nonnull %[[TMP1:[^)]+]])
   // O1: call void @foo
   r = foo();
@@ -35,7 +33,6 @@ struct S bar() {
   r = foo();
   // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* nonnull %[[TMP3]])
 
-  // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* nonnull %[[R_TMP]])
   return r;
 }
 
@@ -51,11 +48,8 @@ struct S baz(int i, volatile int *j) {
   // O0-NOT: @llvm.lifetime.end
 
   struct S r;
-  // O1: %[[RESULT_ALLOCA:[^ ]+]] = alloca %struct.S
   // O1: %[[TMP1_ALLOCA:[^ ]+]] = alloca %struct.S
   // O1: %[[TMP2_ALLOCA:[^ ]+]] = alloca %struct.S
-  // O1: %[[P:[^ ]+]] = bitcast %struct.S* %[[RESULT_ALLOCA]] to i8*
-  // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* %[[P]])
   // O1: br label %[[DO_BODY:.+]]
 
   do {
@@ -94,8 +88,6 @@ struct S baz(int i, volatile int *j) {
    } while (1);
 
   // O1: [[DO_END]]:
-  // O1: call void @llvm.memcpy
-  // O1: %[[P:[^ ]+]] = bitcast %struct.S* %[[RESULT_ALLOCA]] to i8*
-  // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* %[[P]])
+  // O1-NEXT: ret void
   return r;
 }
diff --git a/clang/test/CodeGen/arm-swiftcall.c b/clang/test/CodeGen/arm-swiftcall.c
index f5c33845e2f..53109a3f681 100644
--- a/clang/test/CodeGen/arm-swiftcall.c
+++ b/clang/test/CodeGen/arm-swiftcall.c
@@ -103,9 +103,7 @@ typedef struct {
 TEST(struct_1);
 // CHECK-LABEL: define {{.*}} @return_struct_1()
 // CHECK:   [[RET:%.*]] = alloca [[REC:%.*]], align 4
-// CHECK:   [[VAR:%.*]] = alloca [[REC]], align 4
 // CHECK:   @llvm.memset
-// CHECK:   @llvm.memcpy
 // CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, i16, \[2 x i8\], float, float }]]*
 // CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
 // CHECK:   [[FIRST:%.*]] = load i32, i32* [[T0]], align 4
@@ -170,8 +168,6 @@ typedef struct {
 TEST(struct_2);
 // CHECK-LABEL: define {{.*}} @return_struct_2()
 // CHECK:   [[RET:%.*]] = alloca [[REC:%.*]], align 4
-// CHECK:   [[VAR:%.*]] = alloca [[REC]], align 4
-// CHECK:   @llvm.memcpy
 // CHECK:   @llvm.memcpy
 // CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, i32, float, float }]]*
 // CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
@@ -240,9 +236,7 @@ typedef struct {
 TEST(struct_misaligned_1)
 // CHECK-LABEL: define {{.*}} @return_struct_misaligned_1()
 // CHECK:   [[RET:%.*]] = alloca [[REC:%.*]], align
-// CHECK:   [[VAR:%.*]] = alloca [[REC]], align
 // CHECK:   @llvm.memset
-// CHECK:   @llvm.memcpy
 // CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, i8 }]]*
 // CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
 // CHECK:   [[FIRST:%.*]] = load i32, i32* [[T0]], align
@@ -282,8 +276,6 @@ typedef union {
 TEST(union_het_fp)
 // CHECK-LABEL: define {{.*}} @return_union_het_fp()
 // CHECK:   [[RET:%.*]] = alloca [[REC:%.*]], align {{(4|8)}}
-// CHECK:   [[VAR:%.*]] = alloca [[REC]], align {{(4|8)}}
-// CHECK:   @llvm.memcpy
 // CHECK:   @llvm.memcpy
 // CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, i32 }]]*
 // CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
@@ -414,7 +406,6 @@ TEST(int4)
 TEST(int8)
 // CHECK-LABEL: define {{.*}} @return_int8()
 // CHECK:   [[RET:%.*]] = alloca [[REC:<8 x i32>]], align 32
-// CHECK:   [[VAR:%.*]] = alloca [[REC]], align
 // CHECK:   store
 // CHECK:   load
 // CHECK:   store
@@ -458,7 +449,6 @@ TEST(int8)
 TEST(int5)
 // CHECK-LABEL: define {{.*}} @return_int5()
 // CHECK:   [[RET:%.*]] = alloca [[REC:<5 x i32>]], align 32
-// CHECK:   [[VAR:%.*]] = alloca [[REC]], align
 // CHECK:   store
 // CHECK:   load
 // CHECK:   store
diff --git a/clang/test/CodeGen/arm_neon_intrinsics.c b/clang/test/CodeGen/arm_neon_intrinsics.c
index 28b4f116c26..95ac3dca7a8 100644
--- a/clang/test/CodeGen/arm_neon_intrinsics.c
+++ b/clang/test/CodeGen/arm_neon_intrinsics.c
@@ -20463,331 +20463,259 @@ poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
   return vtbx4_p8(a, b, c);
 }
 
-// CHECK-LABEL: @test_vtrn_s8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !noalias !3
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !alias.scope !3
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !3
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !alias.scope !3
 // CHECK:   ret void
 int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
   return vtrn_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn_s16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !noalias !6
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !alias.scope !6
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !6
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !alias.scope !6
 // CHECK:   ret void
 int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
   return vtrn_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn_s32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !noalias !9
+// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !alias.scope !9
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !noalias !9
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !alias.scope !9
 // CHECK:   ret void
 int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
   return vtrn_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn_u8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !noalias !12
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !alias.scope !12
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !12
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !alias.scope !12
 // CHECK:   ret void
 uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
   return vtrn_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn_u16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !noalias !15
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !alias.scope !15
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !15
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !alias.scope !15
 // CHECK:   ret void
 uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
   return vtrn_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn_u32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !noalias !18
+// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !alias.scope !18
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !noalias !18
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !alias.scope !18
 // CHECK:   ret void
 uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
   return vtrn_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn_f32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK:   store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]], !noalias !21
+// CHECK:   store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]], !alias.scope !21
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK:   store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]], !noalias !21
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]], !alias.scope !21
 // CHECK:   ret void
 float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
   return vtrn_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn_p8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !noalias !24
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !alias.scope !24
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !24
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !alias.scope !24
 // CHECK:   ret void
 poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
   return vtrn_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn_p16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !noalias !27
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !alias.scope !27
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !27
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !alias.scope !27
 // CHECK:   ret void
 poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
   return vtrn_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_s8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !noalias !30
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !alias.scope !30
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !30
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !alias.scope !30
 // CHECK:   ret void
 int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
   return vtrnq_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_s16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !noalias !33
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !alias.scope !33
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !33
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !alias.scope !33
 // CHECK:   ret void
 int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
   return vtrnq_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_s32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !noalias !36
+// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !alias.scope !36
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !36
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !alias.scope !36
 // CHECK:   ret void
 int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
   return vtrnq_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_u8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !noalias !39
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !alias.scope !39
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !39
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !alias.scope !39
 // CHECK:   ret void
 uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
   return vtrnq_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_u16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !noalias !42
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !alias.scope !42
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !42
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !alias.scope !42
 // CHECK:   ret void
 uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
   return vtrnq_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_u32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !noalias !45
+// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !alias.scope !45
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !45
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !alias.scope !45
 // CHECK:   ret void
 uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
   return vtrnq_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_f32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK:   store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]], !noalias !48
+// CHECK:   store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]], !alias.scope !48
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]], !noalias !48
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]], !alias.scope !48
 // CHECK:   ret void
 float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
   return vtrnq_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_p8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !noalias !51
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !alias.scope !51
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !51
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !alias.scope !51
 // CHECK:   ret void
 poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
   return vtrnq_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_p16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !noalias !54
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !alias.scope !54
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !54
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !alias.scope !54
 // CHECK:   ret void
 poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
   return vtrnq_p16(a, b);
@@ -20957,661 +20885,517 @@ uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
   return vtstq_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp_s8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !noalias !57
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !alias.scope !57
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !57
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !alias.scope !57
 // CHECK:   ret void
 int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
   return vuzp_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp_s16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !noalias !60
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !alias.scope !60
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !60
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !alias.scope !60
 // CHECK:   ret void
 int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
   return vuzp_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp_s32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !noalias !63
+// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !alias.scope !63
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !noalias !63
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !alias.scope !63
 // CHECK:   ret void
 int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
   return vuzp_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp_u8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !noalias !66
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !alias.scope !66
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !66
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !alias.scope !66
 // CHECK:   ret void
 uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
   return vuzp_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp_u16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !noalias !69
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !alias.scope !69
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !69
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !alias.scope !69
 // CHECK:   ret void
 uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
   return vuzp_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp_u32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !noalias !72
+// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !alias.scope !72
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !noalias !72
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !alias.scope !72
 // CHECK:   ret void
 uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
   return vuzp_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp_f32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK:   store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]], !noalias !75
+// CHECK:   store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]], !alias.scope !75
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK:   store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]], !noalias !75
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]], !alias.scope !75
 // CHECK:   ret void
 float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
   return vuzp_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp_p8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !noalias !78
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !alias.scope !78
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !78
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !alias.scope !78
 // CHECK:   ret void
 poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
   return vuzp_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp_p16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !noalias !81
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !alias.scope !81
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !81
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !alias.scope !81
 // CHECK:   ret void
 poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
   return vuzp_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzpq_s8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !noalias !84
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !alias.scope !84
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !84
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !alias.scope !84
 // CHECK:   ret void
 int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
   return vuzpq_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzpq_s16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !noalias !87
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !alias.scope !87
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !87
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !alias.scope !87
 // CHECK:   ret void
 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
   return vuzpq_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzpq_s32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !noalias !90
+// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !alias.scope !90
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !90
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !alias.scope !90
 // CHECK:   ret void
 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
   return vuzpq_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzpq_u8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !noalias !93
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !alias.scope !93
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !93
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !alias.scope !93
 // CHECK:   ret void
 uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
   return vuzpq_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzpq_u16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !noalias !96
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !alias.scope !96
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !96
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !alias.scope !96
 // CHECK:   ret void
 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
   return vuzpq_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzpq_u32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !noalias !99
+// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !alias.scope !99
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !99
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !alias.scope !99
 // CHECK:   ret void
 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
   return vuzpq_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzpq_f32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK:   store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]], !noalias !102
+// CHECK:   store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]], !alias.scope !102
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]], !noalias !102
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]], !alias.scope !102
 // CHECK:   ret void
 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
   return vuzpq_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzpq_p8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !noalias !105
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !alias.scope !105
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !105
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !alias.scope !105
 // CHECK:   ret void
 poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
   return vuzpq_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzpq_p16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !noalias !108
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !alias.scope !108
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !108
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !alias.scope !108
 // CHECK:   ret void
 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
   return vuzpq_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip_s8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !noalias !111
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !alias.scope !111
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !111
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !alias.scope !111
 // CHECK:   ret void
 int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
   return vzip_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip_s16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !noalias !114
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !alias.scope !114
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !114
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !alias.scope !114
 // CHECK:   ret void
 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
   return vzip_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip_s32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !noalias !117
+// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !alias.scope !117
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !noalias !117
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !alias.scope !117
 // CHECK:   ret void
 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
   return vzip_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip_u8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !noalias !120
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !alias.scope !120
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !120
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !alias.scope !120
 // CHECK:   ret void
 uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
   return vzip_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip_u16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !noalias !123
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !alias.scope !123
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !123
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !alias.scope !123
 // CHECK:   ret void
 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
   return vzip_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip_u32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !noalias !126
+// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !alias.scope !126
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !noalias !126
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !alias.scope !126
 // CHECK:   ret void
 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
   return vzip_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip_f32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK:   store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]], !noalias !129
+// CHECK:   store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]], !alias.scope !129
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK:   store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]], !noalias !129
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]], !alias.scope !129
 // CHECK:   ret void
 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
   return vzip_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip_p8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !noalias !132
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !alias.scope !132
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !132
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !alias.scope !132
 // CHECK:   ret void
 poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
   return vzip_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip_p16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !noalias !135
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !alias.scope !135
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !135
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !alias.scope !135
 // CHECK:   ret void
 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
   return vzip_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vzipq_s8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !noalias !138
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !alias.scope !138
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !138
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !alias.scope !138
 // CHECK:   ret void
 int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
   return vzipq_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vzipq_s16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !noalias !141
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !alias.scope !141
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !141
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !alias.scope !141
 // CHECK:   ret void
 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
   return vzipq_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vzipq_s32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !noalias !144
+// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !alias.scope !144
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !144
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !alias.scope !144
 // CHECK:   ret void
 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
   return vzipq_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vzipq_u8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !noalias !147
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !alias.scope !147
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !147
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !alias.scope !147
 // CHECK:   ret void
 uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
   return vzipq_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vzipq_u16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !noalias !150
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !alias.scope !150
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !150
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !alias.scope !150
 // CHECK:   ret void
 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
   return vzipq_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vzipq_u32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !noalias !153
+// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !alias.scope !153
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !153
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !alias.scope !153
 // CHECK:   ret void
 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
   return vzipq_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vzipq_f32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK:   store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]], !noalias !156
+// CHECK:   store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]], !alias.scope !156
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]], !noalias !156
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]], !alias.scope !156
 // CHECK:   ret void
 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
   return vzipq_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vzipq_p8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !noalias !159
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !alias.scope !159
 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !159
-// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !alias.scope !159
 // CHECK:   ret void
 poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
   return vzipq_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vzipq_p16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[AGG_RESULT]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !noalias !162
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !alias.scope !162
 // CHECK:   [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !162
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !alias.scope !162
 // CHECK:   ret void
 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
   return vzipq_p16(a, b);
diff --git a/clang/test/CodeGen/atomic-ops.c b/clang/test/CodeGen/atomic-ops.c
index 1ebb2baa1ac..db97706c5d4 100644
--- a/clang/test/CodeGen/atomic-ops.c
+++ b/clang/test/CodeGen/atomic-ops.c
@@ -186,8 +186,8 @@ struct S {
 struct S fd1(struct S *a) {
   // CHECK-LABEL: @fd1
   // CHECK: [[RETVAL:%.*]] = alloca %struct.S, align 4
-  // CHECK: [[RET:%.*]]    = alloca %struct.S, align 4
-  // CHECK: [[CAST:%.*]]   = bitcast %struct.S* [[RET]] to i64*
+  // CHECK: bitcast %struct.S* {{.*}} to i64*
+  // CHECK: [[CAST:%.*]]  = bitcast %struct.S* [[RETVAL]] to i64*
   // CHECK: [[CALL:%.*]]   = call i64 @__atomic_load_8(
   // CHECK: store i64 [[CALL]], i64* [[CAST]], align 4
   struct S ret;
diff --git a/clang/test/CodeGen/ppc64-align-struct.c b/clang/test/CodeGen/ppc64-align-struct.c
index 3cf3e0475b3..5e2dc8b2eb8 100644
--- a/clang/test/CodeGen/ppc64-align-struct.c
+++ b/clang/test/CodeGen/ppc64-align-struct.c
@@ -48,13 +48,12 @@ void test7 (int x, struct test7 y)
 {
 }
 
-// CHECK-LABEL: define void @test1va(%struct.test1* noalias sret %agg.result, i32 signext %x, ...)
-// CHECK: %y = alloca %struct.test1, align 4
+// CHECK: define void @test1va(%struct.test1* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...)
 // CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap
 // CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[CUR]], i64 8
 // CHECK: store i8* %[[NEXT]], i8** %ap
 // CHECK: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test1*
-// CHECK: [[DEST:%.*]] = bitcast %struct.test1* %y to i8*
+// CHECK: [[DEST:%.*]] = bitcast %struct.test1* %[[AGG_RESULT]] to i8*
 // CHECK: [[SRC:%.*]] = bitcast %struct.test1* [[T0]] to i8*
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[DEST]], i8* align 8 [[SRC]], i64 8, i1 false)
 struct test1 test1va (int x, ...)
@@ -67,8 +66,7 @@ struct test1 test1va (int x, ...)
   return y;
 }
 
-// CHECK-LABEL: define void @test2va(%struct.test2* noalias sret %agg.result, i32 signext %x, ...)
-// CHECK: %y = alloca %struct.test2, align 16
+// CHECK: define void @test2va(%struct.test2* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...)
 // CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap
 // CHECK: %[[TMP0:[^ ]+]] = ptrtoint i8* %[[CUR]] to i64
 // CHECK: %[[TMP1:[^ ]+]] = add i64 %[[TMP0]], 15
@@ -77,7 +75,7 @@ struct test1 test1va (int x, ...)
 // CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[ALIGN]], i64 16
 // CHECK: store i8* %[[NEXT]], i8** %ap
 // CHECK: [[T0:%.*]] = bitcast i8* %[[ALIGN]] to %struct.test2*
-// CHECK: [[DEST:%.*]] = bitcast %struct.test2* %y to i8*
+// CHECK: [[DEST:%.*]] = bitcast %struct.test2* %[[AGG_RESULT]] to i8*
 // CHECK: [[SRC:%.*]] = bitcast %struct.test2* [[T0]] to i8*
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[DEST]], i8* align 16 [[SRC]], i64 16, i1 false)
 struct test2 test2va (int x, ...)
@@ -90,8 +88,7 @@ struct test2 test2va (int x, ...)
   return y;
 }
 
-// CHECK-LABEL: define void @test3va(%struct.test3* noalias sret %agg.result, i32 signext %x, ...)
-// CHECK: %y = alloca %struct.test3, align 32
+// CHECK: define void @test3va(%struct.test3* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...)
 // CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap
 // CHECK: %[[TMP0:[^ ]+]] = ptrtoint i8* %[[CUR]] to i64
 // CHECK: %[[TMP1:[^ ]+]] = add i64 %[[TMP0]], 15
@@ -100,7 +97,7 @@ struct test2 test2va (int x, ...)
 // CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[ALIGN]], i64 32
 // CHECK: store i8* %[[NEXT]], i8** %ap
 // CHECK: [[T0:%.*]] = bitcast i8* %[[ALIGN]] to %struct.test3*
-// CHECK: [[DEST:%.*]] = bitcast %struct.test3* %y to i8*
+// CHECK: [[DEST:%.*]] = bitcast %struct.test3* %[[AGG_RESULT]] to i8*
 // CHECK: [[SRC:%.*]] = bitcast %struct.test3* [[T0]] to i8*
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 32 [[DEST]], i8* align 16 [[SRC]], i64 32, i1 false)
 struct test3 test3va (int x, ...)
@@ -113,13 +110,12 @@ struct test3 test3va (int x, ...)
   return y;
 }
 
-// CHECK-LABEL: define void @test4va(%struct.test4* noalias sret %agg.result, i32 signext %x, ...)
-// CHECK: %y = alloca %struct.test4, align 4
+// CHECK: define void @test4va(%struct.test4* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...)
 // CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap
 // CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[CUR]], i64 16
 // CHECK: store i8* %[[NEXT]], i8** %ap
 // CHECK: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test4*
-// CHECK: [[DEST:%.*]] = bitcast %struct.test4* %y to i8*
+// CHECK: [[DEST:%.*]] = bitcast %struct.test4* %[[AGG_RESULT]] to i8*
 // CHECK: [[SRC:%.*]] = bitcast %struct.test4* [[T0]] to i8*
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[DEST]], i8* align 8 [[SRC]], i64 12, i1 false)
 struct test4 test4va (int x, ...)
@@ -132,13 +128,12 @@ struct test4 test4va (int x, ...)
   return y;
 }
 
-// CHECK-LABEL: define void @testva_longdouble(%struct.test_longdouble* noalias sret %agg.result, i32 signext %x, ...)
-// CHECK: %y = alloca %struct.test_longdouble, align 16
+// CHECK: define void @testva_longdouble(%struct.test_longdouble* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...)
 // CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap
 // CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[CUR]], i64 16
 // CHECK: store i8* %[[NEXT]], i8** %ap
 // CHECK: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test_longdouble*
-// CHECK: [[DEST:%.*]] = bitcast %struct.test_longdouble* %y to i8*
+// CHECK: [[DEST:%.*]] = bitcast %struct.test_longdouble* %[[AGG_RESULT]] to i8*
 // CHECK: [[SRC:%.*]] = bitcast %struct.test_longdouble* [[T0]] to i8*
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[DEST]], i8* align 8 [[SRC]], i64 16, i1 false)
 struct test_longdouble { long double x; };
@@ -152,8 +147,7 @@ struct test_longdouble testva_longdouble (int x, ...)
   return y;
 }
 
-// CHECK-LABEL: define void @testva_vector(%struct.test_vector* noalias sret %agg.result, i32 signext %x, ...)
-// CHECK: %y = alloca %struct.test_vector, align 16
+// CHECK: define void @testva_vector(%struct.test_vector* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...)
 // CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap
 // CHECK: %[[TMP0:[^ ]+]] = ptrtoint i8* %[[CUR]] to i64
 // CHECK: %[[TMP1:[^ ]+]] = add i64 %[[TMP0]], 15
@@ -162,7 +156,7 @@ struct test_longdouble testva_longdouble (int x, ...)
 // CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[ALIGN]], i64 16
 // CHECK: store i8* %[[NEXT]], i8** %ap
 // CHECK: [[T0:%.*]] = bitcast i8* %[[ALIGN]] to %struct.test_vector*
-// CHECK: [[DEST:%.*]] = bitcast %struct.test_vector* %y to i8*
+// CHECK: [[DEST:%.*]] = bitcast %struct.test_vector* %[[AGG_RESULT]] to i8*
 // CHECK: [[SRC:%.*]] = bitcast %struct.test_vector* [[T0]] to i8*
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[DEST]], i8* align 16 [[SRC]], i64 16, i1 false)
 struct test_vector { vector int x; };
diff --git a/clang/test/CodeGen/wasm-varargs.c b/clang/test/CodeGen/wasm-varargs.c
index 3883549a0b6..acbd590cc25 100644
--- a/clang/test/CodeGen/wasm-varargs.c
+++ b/clang/test/CodeGen/wasm-varargs.c
@@ -80,10 +80,9 @@ struct S test_struct(char *fmt, ...) {
   return v;
 }
 
-// CHECK: define void @test_struct([[STRUCT_S:%[^,=]+]]*{{.*}} noalias sret %agg.result, i8*{{.*}} %fmt, ...) {{.*}} {
+// CHECK: define void @test_struct([[STRUCT_S:%[^,=]+]]*{{.*}} noalias sret [[AGG_RESULT:%.*]], i8*{{.*}} %fmt, ...) {{.*}} {
 // CHECK:   [[FMT_ADDR:%[^,=]+]] = alloca i8*, align 4
 // CHECK:   [[VA:%[^,=]+]] = alloca i8*, align 4
-// CHECK:   [[V:%[^,=]+]] = alloca [[STRUCT_S]], align 4
 // CHECK:   store i8* %fmt, i8** [[FMT_ADDR]], align 4
 // CHECK:   [[VA1:%[^,=]+]] = bitcast i8** [[VA]] to i8*
 // CHECK:   call void @llvm.va_start(i8* [[VA1]])
@@ -91,13 +90,10 @@ struct S test_struct(char *fmt, ...) {
 // CHECK:   [[ARGP_NEXT:%[^,=]+]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 12
 // CHECK:   store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
 // CHECK:   [[R3:%[^,=]+]] = bitcast i8* [[ARGP_CUR]] to [[STRUCT_S]]*
-// CHECK:   [[R4:%[^,=]+]] = bitcast [[STRUCT_S]]* [[V]] to i8*
+// CHECK:   [[R4:%[^,=]+]] = bitcast [[STRUCT_S]]* [[AGG_RESULT]] to i8*
 // CHECK:   [[R5:%[^,=]+]] = bitcast [[STRUCT_S]]* [[R3]] to i8*
 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[R4]], i8* align 4 [[R5]], i32 12, i1 false)
 // CHECK:   [[VA2:%[^,=]+]] = bitcast i8** [[VA]] to i8*
 // CHECK:   call void @llvm.va_end(i8* [[VA2]])
-// CHECK:   [[R6:%[^,=]+]] = bitcast [[STRUCT_S]]* %agg.result to i8*
-// CHECK:   [[R7:%[^,=]+]] = bitcast [[STRUCT_S]]* [[V]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[R6]], i8* align 4 [[R7]], i32 12, i1 false)
 // CHECK:   ret void
 // CHECK: }
diff --git a/clang/test/CodeGen/windows-swiftcall.c b/clang/test/CodeGen/windows-swiftcall.c
index a7cfb4dd75a..d8fdec47aa2 100644
--- a/clang/test/CodeGen/windows-swiftcall.c
+++ b/clang/test/CodeGen/windows-swiftcall.c
@@ -99,9 +99,7 @@ typedef struct {
 TEST(struct_1);
 // CHECK-LABEL: define dso_local swiftcc { i64, i64 } @return_struct_1() {{.*}}{
 // CHECK:   [[RET:%.*]] = alloca [[STRUCT1:%.*]], align 4
-// CHECK:   [[VAR:%.*]] = alloca [[STRUCT1]], align 4
 // CHECK:   call void @llvm.memset
-// CHECK:   call void @llvm.memcpy
 // CHECK:   [[CAST:%.*]] = bitcast [[STRUCT1]]* %retval to { i64, i64 }*
 // CHECK:   [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0
 // CHECK:   [[T0:%.*]] = load i64, i64* [[GEP0]], align 4
@@ -149,12 +147,8 @@ typedef struct {
 TEST(struct_2);
 // CHECK-LABEL: define dso_local swiftcc { i64, i64 } @return_struct_2() {{.*}}{
 // CHECK:   [[RET:%.*]] = alloca [[STRUCT2_TYPE]], align 4
-// CHECK:   [[VAR:%.*]] = alloca [[STRUCT2_TYPE]], align 4
-// CHECK:   [[CASTVAR:%.*]] = bitcast {{.*}} [[VAR]]
+// CHECK:   [[CASTVAR:%.*]] = bitcast {{.*}} [[RET]]
 // CHECK:   call void @llvm.memcpy{{.*}}({{.*}}[[CASTVAR]], {{.*}}[[STRUCT2_RESULT]]
-// CHECK:   [[CASTRET:%.*]] = bitcast {{.*}} [[RET]]
-// CHECK:   [[CASTVAR:%.*]] = bitcast {{.*}} [[VAR]]
-// CHECK:   call void @llvm.memcpy{{.*}}({{.*}}[[CASTRET]], {{.*}}[[CASTVAR]]
 // CHECK:   [[CAST:%.*]] = bitcast [[STRUCT2_TYPE]]* [[RET]] to { i64, i64 }*
 // CHECK:   [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0
 // CHECK:   [[T0:%.*]] = load i64, i64* [[GEP0]], align 4
@@ -205,12 +199,8 @@ typedef struct {
 TEST(struct_misaligned_1)
 // CHECK-LABEL: define dso_local swiftcc i64 @return_struct_misaligned_1()
 // CHECK:  [[RET:%.*]] = alloca [[STRUCT:%.*]], align 1
-// CHECK:  [[RES:%.*]] = alloca [[STRUCT]], align 1
-// CHECK:  [[CAST:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
+// CHECK:  [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8*
 // CHECK:  call void @llvm.memset{{.*}}(i8* align 1 [[CAST]], i8 0, i64 5
-// CHECK:  [[CASTRET:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8*
-// CHECK:  [[CASTRES:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK:  call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} [[CASTRET]], i8* align {{[0-9]+}} [[CASTRES]], i64 5
 // CHECK:  [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to { i64 }*
 // CHECK:  [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
 // CHECK:  [[R0:%.*]] = load i64, i64* [[GEP]], align 1
@@ -258,12 +248,8 @@ typedef union {
 TEST(union_het_fp)
 // CHECK-LABEL: define dso_local swiftcc i64 @return_union_het_fp()
 // CHECK:  [[RET:%.*]] = alloca [[UNION:%.*]], align 8
-// CHECK:  [[RES:%.*]] = alloca [[UNION]], align 8
-// CHECK:  [[CAST:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
+// CHECK:  [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to i8*
 // CHECK:  call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} [[CAST]]
-// CHECK:  [[CASTRET:%.*]] = bitcast [[UNION]]* [[RET]] to i8*
-// CHECK:  [[CASTRES:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK:  call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} [[CASTRET]], i8* align {{[0-9]+}} [[CASTRES]]
 // CHECK:  [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to { i64 }*
 // CHECK:  [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
 // CHECK:  [[R0:%.*]] = load i64, i64* [[GEP]], align 8