Temporarily Revert "Add basic loop fusion pass."

Reverted because it is causing some bot failures (and per request from kbarton). This reverts commit r358543/ab70da07286e618016e78247e4a24fcb84077fda. llvm-svn: 358546
author: Eric Christopher <echristo@gmail.com> 2019-04-17 02:12:23 +0000
committer: Eric Christopher <echristo@gmail.com> 2019-04-17 02:12:23 +0000
commit: a86343512845c9c1fdbac865fea88aa5fce7142a (patch)
tree: 666fc6353de19ad8b00e56b67edd33f24104e4a7 /llvm/test/Transforms/LoadStoreVectorizer
parent: 7f8ca6e3679b3af951cb7a4b1377edfaa3244b93 (diff)
download: bcm5719-llvm-a86343512845c9c1fdbac865fea88aa5fce7142a.tar.gz
bcm5719-llvm-a86343512845c9c1fdbac865fea88aa5fce7142a.zip
36 files changed, 0 insertions, 3355 deletions
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
deleted file mode 100644
index d2834be18b0..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -scoped-noalias -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=SCOPE -check-prefix=ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=NOSCOPE -check-prefix=ALL %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; This fails to vectorize if the !alias.scope is not used
-
-; ALL-LABEL: @vectorize_alias_scope(
-; SCOPE: load float, float addrspace(1)* %c
-; SCOPE: bitcast float addrspace(1)* %a to <2 x float> addrspace(1)*
-; SCOPE: store <2 x float> zeroinitializer
-; SCOPE: store float %ld.c, float addrspace(1)* %b,
-
-; NOSCOPE: store float
-; NOSCOPE: load float
-; NOSCOPE: store float
-; NOSCOPE: store float
-define amdgpu_kernel void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
-entry:
-  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  store float 0.0, float addrspace(1)* %a, align 4, !noalias !0
-  %ld.c = load float, float addrspace(1)* %c, align 4, !alias.scope !0
-  store float 0.0, float addrspace(1)* %a.idx.1, align 4, !noalias !0
-  store float %ld.c, float addrspace(1)* %b, align 4, !noalias !0
-  ret void
-}
-
-attributes #0 = { nounwind }
-
-!0 = !{!1}
-!1 = distinct !{!1, !2, !"some scope"}
-!2 = distinct !{!2, !"some domain"}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
deleted file mode 100644
index b0dd5d185c7..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
+++ /dev/null
@@ -1,210 +0,0 @@
-; RUN: opt -S -load-store-vectorizer -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
-; RUN: opt -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
-; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
-; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
-
-target triple = "amdgcn--"
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; ALL-LABEL: @load_unknown_offset_align1_i8(
-; ALL: alloca [128 x i8], align 1
-; UNALIGNED: load <2 x i8>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: load i8, i8 addrspace(5)* %ptr0, align 1{{$}}
-; ALIGNED: load i8, i8 addrspace(5)* %ptr1, align 1{{$}}
-define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i8], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset
-  %val0 = load i8, i8 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1
-  %val1 = load i8, i8 addrspace(5)* %ptr1, align 1
-  %add = add i8 %val0, %val1
-  store i8 %add, i8 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: @load_unknown_offset_align1_i16(
-; ALL: alloca [128 x i16], align 1, addrspace(5){{$}}
-; UNALIGNED: load <2 x i16>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: load i16, i16 addrspace(5)* %ptr0, align 1{{$}}
-; ALIGNED: load i16, i16 addrspace(5)* %ptr1, align 1{{$}}
-define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i16], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset
-  %val0 = load i16, i16 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1
-  %val1 = load i16, i16 addrspace(5)* %ptr1, align 1
-  %add = add i16 %val0, %val1
-  store i16 %add, i16 addrspace(1)* %out
-  ret void
-}
-
-; FIXME: Although the offset is unknown here, we know it is a multiple
-; of the element size, so should still be align 4
-
-; ALL-LABEL: @load_unknown_offset_align1_i32(
-; ALL: alloca [128 x i32], align 1
-; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: load i32, i32 addrspace(5)* %ptr0, align 1
-; ALIGNED: load i32, i32 addrspace(5)* %ptr1, align 1
-define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i32], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
-  %val0 = load i32, i32 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
-  %val1 = load i32, i32 addrspace(5)* %ptr1, align 1
-  %add = add i32 %val0, %val1
-  store i32 %add, i32 addrspace(1)* %out
-  ret void
-}
-
-; FIXME: Should always increase alignment of the load
-; Make sure alloca alignment isn't decreased
-; ALL-LABEL: @load_alloca16_unknown_offset_align1_i32(
-; ALL: alloca [128 x i32], align 16
-
-; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-; ALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 4{{$}}
-define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i32], align 16, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
-  %val0 = load i32, i32 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
-  %val1 = load i32, i32 addrspace(5)* %ptr1, align 1
-  %add = add i32 %val0, %val1
-  store i32 %add, i32 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: @store_unknown_offset_align1_i8(
-; ALL: alloca [128 x i8], align 1
-; UNALIGNED: store <2 x i8> <i8 9, i8 10>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: store i8 9, i8 addrspace(5)* %ptr0, align 1{{$}}
-; ALIGNED: store i8 10, i8 addrspace(5)* %ptr1, align 1{{$}}
-define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i8], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset
-  store i8 9, i8 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1
-  store i8 10, i8 addrspace(5)* %ptr1, align 1
-  ret void
-}
-
-; ALL-LABEL: @store_unknown_offset_align1_i16(
-; ALL: alloca [128 x i16], align 1
-; UNALIGNED: store <2 x i16> <i16 9, i16 10>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: store i16 9, i16 addrspace(5)* %ptr0, align 1{{$}}
-; ALIGNED: store i16 10, i16 addrspace(5)* %ptr1, align 1{{$}}
-define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i16], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset
-  store i16 9, i16 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1
-  store i16 10, i16 addrspace(5)* %ptr1, align 1
-  ret void
-}
-
-; FIXME: Although the offset is unknown here, we know it is a multiple
-; of the element size, so it still should be align 4.
-
-; ALL-LABEL: @store_unknown_offset_align1_i32(
-; ALL: alloca [128 x i32], align 1
-
-; UNALIGNED: store <2 x i32> <i32 9, i32 10>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: store i32 9, i32 addrspace(5)* %ptr0, align 1
-; ALIGNED: store i32 10, i32 addrspace(5)* %ptr1, align 1
-define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i32], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
-  store i32 9, i32 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
-  store i32 10, i32 addrspace(5)* %ptr1, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32(
-; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
-; ALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 4
-
-; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
-; UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32() {
-  %alloca = alloca [8 x i32], align 1, addrspace(5)
-  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
-
-  store i32 9, i32 addrspace(5)* %out, align 1
-  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
-  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
-  store i32 19, i32 addrspace(5)* %out.gep.3, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
-; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
-; ALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 4
-
-; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
-; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8() {
-  %alloca = alloca [8 x i8], align 1, addrspace(5)
-  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
-
-  store i8 9, i8 addrspace(5)* %out, align 1
-  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
-  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
-  store i8 19, i8 addrspace(5)* %out.gep.3, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i32(
-; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
-; ALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 4
-
-; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
-; UNALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() {
-  %alloca = alloca [8 x i32], align 1, addrspace(5)
-  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
-
-  %load0 = load i32, i32 addrspace(5)* %out, align 1
-  %load1 = load i32, i32 addrspace(5)* %out.gep.1, align 1
-  %load2 = load i32, i32 addrspace(5)* %out.gep.2, align 1
-  %load3 = load i32, i32 addrspace(5)* %out.gep.3, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i8(
-; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
-; ALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 4
-
-; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
-; UNALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() {
-  %alloca = alloca [8 x i8], align 1, addrspace(5)
-  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
-
-  %load0 = load i8, i8 addrspace(5)* %out, align 1
-  %load1 = load i8, i8 addrspace(5)* %out.gep.1, align 1
-  %load2 = load i8, i8 addrspace(5)* %out.gep.2, align 1
-  %load3 = load i8, i8 addrspace(5)* %out.gep.3, align 1
-  ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
deleted file mode 100644
index cd1c7fdc521..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
+++ /dev/null
@@ -1,52 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-declare i64 @_Z12get_local_idj(i32)
-
-declare i64 @_Z12get_group_idj(i32)
-
-declare double @llvm.fmuladd.f64(double, double, double)
-
-; CHECK-LABEL: @factorizedVsNonfactorizedAccess(
-; CHECK: load <2 x float>
-; CHECK: store <2 x float>
-define amdgpu_kernel void @factorizedVsNonfactorizedAccess(float addrspace(1)* nocapture %c) {
-entry:
-  %call = tail call i64 @_Z12get_local_idj(i32 0)
-  %call1 = tail call i64 @_Z12get_group_idj(i32 0)
-  %div = lshr i64 %call, 4
-  %div2 = lshr i64 %call1, 3
-  %mul = shl i64 %div2, 7
-  %rem = shl i64 %call, 3
-  %mul3 = and i64 %rem, 120
-  %add = or i64 %mul, %mul3
-  %rem4 = shl i64 %call1, 7
-  %mul5 = and i64 %rem4, 896
-  %mul6 = shl nuw nsw i64 %div, 3
-  %add7 = add nuw i64 %mul5, %mul6
-  %mul9 = shl i64 %add7, 10
-  %add10 = add i64 %mul9, %add
-  %arrayidx = getelementptr inbounds float, float addrspace(1)* %c, i64 %add10
-  %load1 = load float, float addrspace(1)* %arrayidx, align 4
-  %conv = fpext float %load1 to double
-  %mul11 = fmul double %conv, 0x3FEAB481D8F35506
-  %conv12 = fptrunc double %mul11 to float
-  %conv18 = fpext float %conv12 to double
-  %storeval1 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv18)
-  %cstoreval1 = fptrunc double %storeval1 to float
-  store float %cstoreval1, float addrspace(1)* %arrayidx, align 4
-
-  %add23 = or i64 %add10, 1
-  %arrayidx24 = getelementptr inbounds float, float addrspace(1)* %c, i64 %add23
-  %load2 = load float, float addrspace(1)* %arrayidx24, align 4
-  %conv25 = fpext float %load2 to double
-  %mul26 = fmul double %conv25, 0x3FEAB481D8F35506
-  %conv27 = fptrunc double %mul26 to float
-  %conv34 = fpext float %conv27 to double
-  %storeval2 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv34)
-  %cstoreval2 = fptrunc double %storeval2 to float
-  store float %cstoreval2, float addrspace(1)* %arrayidx24, align 4
-  ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
deleted file mode 100644
index b8e95a6793e..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
+++ /dev/null
@@ -1,151 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-declare i32 @llvm.amdgcn.workitem.id.x() #1
-
-; CHECK-LABEL: @basic_merge_sext_index(
-; CHECK: sext i32 %id.x to i64
-; CHECK: load <2 x float>
-; CHECK: store <2 x float> zeroinitializer
-define amdgpu_kernel void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
-entry:
-  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %sext.id.x = sext i32 %id.x to i64
-  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %sext.id.x
-  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %sext.id.x
-  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
-  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1
-
-  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
-  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
-
-  store float 0.0, float addrspace(1)* %a.idx.x, align 4
-  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4
-
-  %add = fadd float %ld.c, %ld.c.idx.1
-  store float %add, float addrspace(1)* %b, align 4
-  ret void
-}
-
-; CHECK-LABEL: @basic_merge_zext_index(
-; CHECK: zext i32 %id.x to i64
-; CHECK: load <2 x float>
-; CHECK: store <2 x float>
-define amdgpu_kernel void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
-entry:
-  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %zext.id.x = zext i32 %id.x to i64
-  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x
-  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x
-  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
-  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1
-
-  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
-  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
-  store float 0.0, float addrspace(1)* %a.idx.x, align 4
-  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4
-
-  %add = fadd float %ld.c, %ld.c.idx.1
-  store float %add, float addrspace(1)* %b, align 4
-  ret void
-}
-
-; CHECK-LABEL: @merge_op_zext_index(
-; CHECK: load <2 x float>
-; CHECK: store <2 x float>
-define amdgpu_kernel void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
-entry:
-  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %shl = shl i32 %id.x, 2
-  %zext.id.x = zext i32 %shl to i64
-  %a.0 = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x
-  %c.0 = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x
-
-  %id.x.1 = or i32 %shl, 1
-  %id.x.1.ext = zext i32 %id.x.1 to i64
-
-  %a.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 %id.x.1.ext
-  %c.1 = getelementptr inbounds float, float addrspace(1)* %c, i64 %id.x.1.ext
-
-  %ld.c.0 = load float, float addrspace(1)* %c.0, align 4
-  store float 0.0, float addrspace(1)* %a.0, align 4
-  %ld.c.1 = load float, float addrspace(1)* %c.1, align 4
-  store float 0.0, float addrspace(1)* %a.1, align 4
-
-  %add = fadd float %ld.c.0, %ld.c.1
-  store float %add, float addrspace(1)* %b, align 4
-  ret void
-}
-
-; CHECK-LABEL: @merge_op_sext_index(
-; CHECK: load <2 x float>
-; CHECK: store <2 x float>
-define amdgpu_kernel void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
-entry:
-  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %shl = shl i32 %id.x, 2
-  %zext.id.x = sext i32 %shl to i64
-  %a.0 = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x
-  %c.0 = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x
-
-  %id.x.1 = or i32 %shl, 1
-  %id.x.1.ext = sext i32 %id.x.1 to i64
-
-  %a.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 %id.x.1.ext
-  %c.1 = getelementptr inbounds float, float addrspace(1)* %c, i64 %id.x.1.ext
-
-  %ld.c.0 = load float, float addrspace(1)* %c.0, align 4
-  store float 0.0, float addrspace(1)* %a.0, align 4
-  %ld.c.1 = load float, float addrspace(1)* %c.1, align 4
-  store float 0.0, float addrspace(1)* %a.1, align 4
-
-  %add = fadd float %ld.c.0, %ld.c.1
-  store float %add, float addrspace(1)* %b, align 4
-  ret void
-}
-
-; This case fails to vectorize if not using the extra extension
-; handling in isConsecutiveAccess.
-
-; CHECK-LABEL: @zext_trunc_phi_1(
-; CHECK: loop:
-; CHECK: load <2 x i32>
-; CHECK: store <2 x i32>
-define amdgpu_kernel void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 {
-entry:
-  %cmp0 = icmp eq i32 %n, 0
-  br i1 %cmp0, label %exit, label %loop
-
-loop:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ]
-  %trunc.iv = trunc i64 %indvars.iv to i32
-  %idx = shl i32 %trunc.iv, 4
-
-  %idx.ext = zext i32 %idx to i64
-  %c.0 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %idx.ext
-  %a.0 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.ext
-
-  %idx.1 = or i32 %idx, 1
-  %idx.1.ext = zext i32 %idx.1 to i64
-  %c.1 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %idx.1.ext
-  %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.1.ext
-
-  %ld.c.0 = load i32, i32 addrspace(1)* %c.0, align 4
-  store i32 %ld.c.0, i32 addrspace(1)* %a.0, align 4
-  %ld.c.1 = load i32, i32 addrspace(1)* %c.1, align 4
-  store i32 %ld.c.1, i32 addrspace(1)* %a.1, align 4
-
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %exit, label %loop
-
-exit:
-  ret void
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
deleted file mode 100644
index 5bb6289ff19..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
+++ /dev/null
@@ -1,135 +0,0 @@
-; RUN: opt -S -mtriple=amdgcn--amdhsa -load-store-vectorizer < %s | FileCheck %s
-; RUN: opt -S -mtriple=amdgcn--amdhsa -passes='function(load-store-vectorizer)' < %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; Check that vectorizer can find a GEP through bitcast
-; CHECK-LABEL: @vect_zext_bitcast_f32_to_i32_idx
-; CHECK: load <4 x i32>
-define void @vect_zext_bitcast_f32_to_i32_idx(float addrspace(1)* %arg1, i32 %base) {
-  %add1 = add nuw i32 %base, 0
-  %zext1 = zext i32 %add1 to i64
-  %gep1 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext1
-  %f2i1 = bitcast float addrspace(1)* %gep1 to i32 addrspace(1)*
-  %load1 = load i32, i32 addrspace(1)* %f2i1, align 4
-  %add2 = add nuw i32 %base, 1
-  %zext2 = zext i32 %add2 to i64
-  %gep2 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext2
-  %f2i2 = bitcast float addrspace(1)* %gep2 to i32 addrspace(1)*
-  %load2 = load i32, i32 addrspace(1)* %f2i2, align 4
-  %add3 = add nuw i32 %base, 2
-  %zext3 = zext i32 %add3 to i64
-  %gep3 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext3
-  %f2i3 = bitcast float addrspace(1)* %gep3 to i32 addrspace(1)*
-  %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
-  %add4 = add nuw i32 %base, 3
-  %zext4 = zext i32 %add4 to i64
-  %gep4 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext4
-  %f2i4 = bitcast float addrspace(1)* %gep4 to i32 addrspace(1)*
-  %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
-  ret void
-}
-
-; CHECK-LABEL: @vect_zext_bitcast_i8_st1_to_i32_idx
-; CHECK: load i32
-; CHECK: load i32
-; CHECK: load i32
-; CHECK: load i32
-define void @vect_zext_bitcast_i8_st1_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) {
-  %add1 = add nuw i32 %base, 0
-  %zext1 = zext i32 %add1 to i64
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext1
-  %f2i1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %load1 = load i32, i32 addrspace(1)* %f2i1, align 4
-  %add2 = add nuw i32 %base, 1
-  %zext2 = zext i32 %add2 to i64
-  %gep2 = getelementptr inbounds i8,i8 addrspace(1)* %arg1, i64 %zext2
-  %f2i2 = bitcast i8 addrspace(1)* %gep2 to i32 addrspace(1)*
-  %load2 = load i32, i32 addrspace(1)* %f2i2, align 4
-  %add3 = add nuw i32 %base, 2
-  %zext3 = zext i32 %add3 to i64
-  %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3
-  %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)*
-  %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
-  %add4 = add nuw i32 %base, 3
-  %zext4 = zext i32 %add4 to i64
-  %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4
-  %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)*
-  %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
-  ret void
-}
-
-; CHECK-LABEL: @vect_zext_bitcast_i8_st4_to_i32_idx
-; CHECK: load <4 x i32>
-define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) {
-  %add1 = add nuw i32 %base, 0
-  %zext1 = zext i32 %add1 to i64
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext1
-  %f2i1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %load1 = load i32, i32 addrspace(1)* %f2i1, align 4
-  %add2 = add nuw i32 %base, 4
-  %zext2 = zext i32 %add2 to i64
-  %gep2 = getelementptr inbounds i8,i8 addrspace(1)* %arg1, i64 %zext2
-  %f2i2 = bitcast i8 addrspace(1)* %gep2 to i32 addrspace(1)*
-  %load2 = load i32, i32 addrspace(1)* %f2i2, align 4
-  %add3 = add nuw i32 %base, 8
-  %zext3 = zext i32 %add3 to i64
-  %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3
-  %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)*
-  %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
-  %add4 = add nuw i32 %base, 12
-  %zext4 = zext i32 %add4 to i64
-  %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4
-  %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)*
-  %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
-  ret void
-}
-
-; CHECK-LABEL: @vect_zext_bitcast_negative_ptr_delta
-; CHECK: load <2 x i32>
-define void @vect_zext_bitcast_negative_ptr_delta(i32 addrspace(1)* %p, i32 %base) {
-  %p.bitcasted = bitcast i32 addrspace(1)* %p to i16 addrspace(1)*
-  %a.offset = add nuw i32 %base, 4
-  %t.offset.zexted = zext i32 %base to i64
-  %a.offset.zexted = zext i32 %a.offset to i64
-  %t.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %t.offset.zexted
-  %a.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %a.offset.zexted
-  %b.ptr = getelementptr inbounds i16, i16 addrspace(1)* %t.ptr, i64 6
-  %a.ptr.bitcasted = bitcast i16 addrspace(1)* %a.ptr to i32 addrspace(1)*
-  %b.ptr.bitcasted = bitcast i16 addrspace(1)* %b.ptr to i32 addrspace(1)*
-  %a.val = load i32, i32 addrspace(1)* %a.ptr.bitcasted
-  %b.val = load i32, i32 addrspace(1)* %b.ptr.bitcasted
-  ret void
-}
-
-; Check i1 corner case
-; CHECK-LABEL: @zexted_i1_gep_index
-; CHECK: load i32
-; CHECK: load i32
-define void @zexted_i1_gep_index(i32 addrspace(1)* %p, i32 %val) {
-  %selector = icmp eq i32 %val, 0
-  %flipped = xor i1 %selector, 1
-  %index.0 = zext i1 %selector to i64
-  %index.1 = zext i1 %flipped to i64
-  %gep.0 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.0
-  %gep.1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.1
-  %val0 = load i32, i32 addrspace(1)* %gep.0
-  %val1 = load i32, i32 addrspace(1)* %gep.1
-  ret void
-}
-
-; Check i1 corner case
-; CHECK-LABEL: @sexted_i1_gep_index
-; CHECK: load i32
-; CHECK: load i32
-define void @sexted_i1_gep_index(i32 addrspace(1)* %p, i32 %val) {
-  %selector = icmp eq i32 %val, 0
-  %flipped = xor i1 %selector, 1
-  %index.0 = sext i1 %selector to i64
-  %index.1 = sext i1 %flipped to i64
-  %gep.0 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.0
-  %gep.1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.1
-  %val0 = load i32, i32 addrspace(1)* %gep.0
-  %val1 = load i32, i32 addrspace(1)* %gep.1
-  ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
deleted file mode 100644
index 35836f80456..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
+++ /dev/null
@@ -1,118 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; Check position of the inserted vector load/store.  Vectorized loads should be
-; inserted at the position of the first load in the chain, and stores should be
-; inserted at the position of the last store.
-
-; CHECK-LABEL: @insert_load_point(
-; CHECK: %z = add i32 %x, 4
-; CHECK: load <2 x float>
-; CHECK: %w = add i32 %y, 9
-; CHECK: %foo = add i32 %z, %w
-define amdgpu_kernel void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
-entry:
-  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
-  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
-  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
-  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1
-
-  %z = add i32 %x, 4
-  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
-  %w = add i32 %y, 9
-  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
-  %foo = add i32 %z, %w
-
-  store float 0.0, float addrspace(1)* %a.idx.x, align 4
-  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4
-
-  %add = fadd float %ld.c, %ld.c.idx.1
-  store float %add, float addrspace(1)* %b, align 4
-  store i32 %foo, i32 addrspace(3)* null, align 4
-  ret void
-}
-
-; CHECK-LABEL: @insert_store_point(
-; CHECK: %z = add i32 %x, 4
-; CHECK: %w = add i32 %y, 9
-; CHECK: store <2 x float>
-; CHECK: %foo = add i32 %z, %w
-define amdgpu_kernel void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
-entry:
-  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
-  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
-  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
-  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1
-
-  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
-  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
-
-  %z = add i32 %x, 4
-  store float 0.0, float addrspace(1)* %a.idx.x, align 4
-  %w = add i32 %y, 9
-  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4
-  %foo = add i32 %z, %w
-
-  %add = fadd float %ld.c, %ld.c.idx.1
-  store float %add, float addrspace(1)* %b, align 4
-  store i32 %foo, i32 addrspace(3)* null, align 4
-  ret void
-}
-
-; Here we have four stores, with an aliasing load before the last one.  We can
-; vectorize the first three stores as <3 x float>, but this vectorized store must
-; be inserted at the location of the third scalar store, not the fourth one.
-;
-; CHECK-LABEL: @insert_store_point_alias
-; CHECK: store <3 x float>
-; CHECK: load float, float addrspace(1)* %a.idx.2
-; CHECK: store float
-; CHECK-SAME: %a.idx.3
-define float @insert_store_point_alias(float addrspace(1)* nocapture %a, i64 %idx) {
-  %a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
-  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1
-  %a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1
-  %a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1
-
-  store float 0.0, float addrspace(1)* %a.idx, align 4
-  store float 0.0, float addrspace(1)* %a.idx.1, align 4
-  store float 0.0, float addrspace(1)* %a.idx.2, align 4
-  %x = load float, float addrspace(1)* %a.idx.2, align 4
-  store float 0.0, float addrspace(1)* %a.idx.3, align 4
-
-  ret float %x
-}
-
-; Here we have four stores, with an aliasing load before the last one.  We
-; could vectorize two of the stores before the load (although we currently
-; don't), but the important thing is that we *don't* sink the store to
-; a[idx + 1] below the load.
-;
-; CHECK-LABEL: @insert_store_point_alias_ooo
-; CHECK: store float
-; CHECK-SAME: %a.idx.3
-; CHECK: store float
-; CHECK-SAME: %a.idx.1
-; CHECK: store float
-; CHECK-SAME: %a.idx.2
-; CHECK: load float, float addrspace(1)* %a.idx.2
-; CHECK: store float
-; CHECK-SAME: %a.idx
-define float @insert_store_point_alias_ooo(float addrspace(1)* nocapture %a, i64 %idx) {
-  %a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
-  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1
-  %a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1
-  %a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1
-
-  store float 0.0, float addrspace(1)* %a.idx.3, align 4
-  store float 0.0, float addrspace(1)* %a.idx.1, align 4
-  store float 0.0, float addrspace(1)* %a.idx.2, align 4
-  %x = load float, float addrspace(1)* %a.idx.2, align 4
-  store float 0.0, float addrspace(1)* %a.idx, align 4
-
-  ret float %x
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
deleted file mode 100644
index 81ebb712e33..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; This is NOT OK to vectorize, as either load may alias either store.
-
-; CHECK: load double
-; CHECK: store double 0.000000e+00, double addrspace(1)* %a,
-; CHECK: load double
-; CHECK: store double 0.000000e+00, double addrspace(1)* %a.idx.1
-define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 {
-entry:
-  %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
-  %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1
-
-  %ld.c = load double, double addrspace(1)* %c, align 8 ; may alias store to %a
-  store double 0.0, double addrspace(1)* %a, align 8
-
-  %ld.c.idx.1 = load double, double addrspace(1)* %c.idx.1, align 8 ; may alias store to %a
-  store double 0.0, double addrspace(1)* %a.idx.1, align 8
-
-  %add = fadd double %ld.c, %ld.c.idx.1
-  store double %add, double addrspace(1)* %b
-
-  ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll
deleted file mode 100644
index 15c47716aaf..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; CHECK-LABEL: @interleave
-; CHECK: load <2 x double>, <2 x double> addrspace(1)* %{{.}}, align 8{{$}}
-; CHECK: store <2 x double> zeroinitializer
-; CHECK: store double %add
-define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 {
-entry:
-  %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
-  %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1
-
-  %ld.c = load double, double addrspace(1)* %c, align 8
-  store double 0.0, double addrspace(1)* %a, align 8 ; Cannot alias invariant load
-
-  %ld.c.idx.1 = load double, double addrspace(1)* %c.idx.1, align 8, !invariant.load !0
-  store double 0.0, double addrspace(1)* %a.idx.1, align 8
-
-  %add = fadd double %ld.c, %ld.c.idx.1
-  store double %add, double addrspace(1)* %b
-
-  ret void
-}
-
-attributes #0 = { nounwind }
-
-!0 = !{}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg
deleted file mode 100644
index 6baccf05fff..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-if not 'AMDGPU' in config.root.targets:
-    config.unsupported = True
-
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
deleted file mode 100644
index 4292cbcec85..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
+++ /dev/null
@@ -1,223 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,-unaligned-scratch-access  -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-ALIGNED,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,-unaligned-scratch-access  -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-ALIGNED,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-ALIGNED,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,+unaligned-scratch-access  -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-UNALIGNED,UNALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access  -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32
-; ELT4-ALIGNED: store i32
-; ELT4-ALIGNED: store i32
-; ELT4-ALIGNED: store i32
-; ELT4-ALIGNED: store i32
-
-; ELT8: store <2 x i32>
-; ELT8: store <2 x i32>
-
-; ELT16-UNALIGNED: store <4 x i32>
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
-
-  store i32 9, i32 addrspace(5)* %out
-  store i32 1, i32 addrspace(5)* %out.gep.1
-  store i32 23, i32 addrspace(5)* %out.gep.2
-  store i32 19, i32 addrspace(5)* %out.gep.3
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align1(
-; ALIGNED: store i32 9, i32 addrspace(5)* %out, align 1
-; ALIGNED: store i32 1, i32 addrspace(5)* %out.gep.1, align 1
-; ALIGNED: store i32 23, i32 addrspace(5)* %out.gep.2, align 1
-; ALIGNED: store i32 19, i32 addrspace(5)* %out.gep.3, align 1
-
-; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1
-
-; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32> addrspace(5)* %1, align 1
-; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32> addrspace(5)* %2, align 1
-
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
-
-  store i32 9, i32 addrspace(5)* %out, align 1
-  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
-  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
-  store i32 19, i32 addrspace(5)* %out.gep.3, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2(
-; ALIGNED: store i32 9, i32 addrspace(5)* %out, align 2
-; ALIGNED: store i32 1, i32 addrspace(5)* %out.gep.1, align 2
-; ALIGNED: store i32 23, i32 addrspace(5)* %out.gep.2, align 2
-; ALIGNED: store i32 19, i32 addrspace(5)* %out.gep.3, align 2
-
-; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 2
-
-; ELT8-UNALIGNED: store <2 x i32>
-; ELT8-UNALIGNED: store <2 x i32>
-
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
-
-  store i32 9, i32 addrspace(5)* %out, align 2
-  store i32 1, i32 addrspace(5)* %out.gep.1, align 2
-  store i32 23, i32 addrspace(5)* %out.gep.2, align 2
-  store i32 19, i32 addrspace(5)* %out.gep.3, align 2
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
-; ALL: store <4 x i8>
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3
-
-  store i8 9, i8 addrspace(5)* %out, align 4
-  store i8 1, i8 addrspace(5)* %out.gep.1
-  store i8 23, i8 addrspace(5)* %out.gep.2
-  store i8 19, i8 addrspace(5)* %out.gep.3
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8_align1(
-; ALIGNED: store i8
-; ALIGNED: store i8
-; ALIGNED: store i8
-; ALIGNED: store i8
-
-; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3
-
-  store i8 9, i8 addrspace(5)* %out, align 1
-  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
-  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
-  store i8 19, i8 addrspace(5)* %out.gep.3, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16(
-; ALL: store <2 x i16>
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
-
-  store i16 9, i16 addrspace(5)* %out, align 4
-  store i16 12, i16 addrspace(5)* %out.gep.1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2(
-; ALIGNED: store i16
-; ALIGNED: store i16
-
-; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 2
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
-
-  store i16 9, i16 addrspace(5)* %out, align 2
-  store i16 12, i16 addrspace(5)* %out.gep.1, align 2
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align1(
-; ALIGNED: store i16
-; ALIGNED: store i16
-
-; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
-
-  store i16 9, i16 addrspace(5)* %out, align 1
-  store i16 12, i16 addrspace(5)* %out.gep.1, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8(
-; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 8
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
-
-  store i16 9, i16 addrspace(5)* %out, align 8
-  store i16 12, i16 addrspace(5)* %out.gep.1, align 2
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32
-; ELT4: store i32
-; ELT4: store i32
-; ELT4: store i32
-
-; ELT8: store <2 x i32>
-; ELT8: store i32
-
-; ELT16: store <3 x i32>
-define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-
-  store i32 9, i32 addrspace(5)* %out
-  store i32 1, i32 addrspace(5)* %out.gep.1
-  store i32 23, i32 addrspace(5)* %out.gep.2
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32_align1(
-; ALIGNED: store i32
-; ALIGNED: store i32
-; ALIGNED: store i32
-
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-
-; ELT8-UNALIGNED: store <2 x i32>
-; ELT8-UNALIGNED: store i32
-
-; ELT16-UNALIGNED: store <3 x i32>
-define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-
-  store i32 9, i32 addrspace(5)* %out, align 1
-  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
-  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i8_align1(
-; ALIGNED: store i8
-; ALIGNED: store i8
-; ALIGNED: store i8
-
-; UNALIGNED: store <3 x i8>
-define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
-
-  store i8 9, i8 addrspace(5)* %out, align 1
-  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
-  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
-  ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
deleted file mode 100644
index 0d9a4184e71..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
+++ /dev/null
@@ -1,657 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
-; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; TODO: Vector element tests
-; TODO: Non-zero base offset for load and store combinations
-; TODO: Same base addrspacecasted
-
-
-; CHECK-LABEL: @merge_global_store_2_constants_i8(
-; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* %{{[0-9]+}}, align 2
-define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-
-  store i8 123, i8 addrspace(1)* %out.gep.1
-  store i8 456, i8 addrspace(1)* %out, align 2
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align
-; CHECK: store <2 x i8>
-define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-
-  store i8 123, i8 addrspace(1)* %out.gep.1
-  store i8 456, i8 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_i16
-; CHECK: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
-
-  store i16 123, i16 addrspace(1)* %out.gep.1
-  store i16 456, i16 addrspace(1)* %out, align 4
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_0_i16
-; CHECK: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
-
-  store i16 0, i16 addrspace(1)* %out.gep.1
-  store i16 0, i16 addrspace(1)* %out, align 4
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align
-; CHECK: store <2 x i16>
-define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
-
-  store i16 123, i16 addrspace(1)* %out.gep.1
-  store i16 456, i16 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align
-; CHECK: store <2 x half>
-define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
-
-  store half 2.0, half addrspace(1)* %out.gep.1
-  store half 1.0, half addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_i32
-; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_i32_f32
-; CHECK: store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
-  store float 1.0, float addrspace(1)* %out.gep.1.bc
-  store i32 456, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_f32_i32
-; CHECK  store <2 x float> <float 4.000000e+00, float 0x370EC00000000000>, <2 x float> addrspace(1)* %{{[0-9]+$}}
-define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
-  store i32 123, i32 addrspace(1)* %out.gep.1.bc
-  store float 4.0, float addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_constants_i32
-; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out.gep.2
-  store i32 333, i32 addrspace(1)* %out.gep.3
-  store i32 1234, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_constants_f32_order
-; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}
-define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
-
-  store float 8.0, float addrspace(1)* %out
-  store float 1.0, float addrspace(1)* %out.gep.1
-  store float 2.0, float addrspace(1)* %out.gep.2
-  store float 4.0, float addrspace(1)* %out.gep.3
-  ret void
-}
-
-; First store is out of order.
-; CHECK-LABEL: @merge_global_store_4_constants_f32
-; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
-
-  store float 1.0, float addrspace(1)* %out.gep.1
-  store float 2.0, float addrspace(1)* %out.gep.2
-  store float 4.0, float addrspace(1)* %out.gep.3
-  store float 8.0, float addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32
-; CHECK: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
-
-  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
-  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
-
-  store i32 11, i32 addrspace(1)* %out.gep.1.bc
-  store float 2.0, float addrspace(1)* %out.gep.2
-  store i32 17, i32 addrspace(1)* %out.gep.3.bc
-  store float 8.0, float addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_3_constants_i32
-; CHECK: store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out.gep.2
-  store i32 1234, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_i64
-; CHECK: store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
-define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
-
-  store i64 123, i64 addrspace(1)* %out.gep.1
-  store i64 456, i64 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_constants_i64
-; CHECK: store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
-; CHECK: store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
-define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
-  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
-  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
-
-  store i64 123, i64 addrspace(1)* %out.gep.1
-  store i64 456, i64 addrspace(1)* %out.gep.2
-  store i64 333, i64 addrspace(1)* %out.gep.3
-  store i64 1234, i64 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32
-; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32>
-; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0
-; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1
-; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT0]], i32 0
-; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT1]], i32 1
-; CHECK: store <2 x i32> [[INSERT1]]
-define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-
-  %lo = load i32, i32 addrspace(1)* %in
-  %hi = load i32, i32 addrspace(1)* %in.gep.1
-
-  store i32 %lo, i32 addrspace(1)* %out
-  store i32 %hi, i32 addrspace(1)* %out.gep.1
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32_nonzero_base
-; CHECK: extractelement
-; CHECK: extractelement
-; CHECK: insertelement
-; CHECK: insertelement
-; CHECK: store <2 x i32>
-define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
-
-  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %lo = load i32, i32 addrspace(1)* %in.gep.0
-  %hi = load i32, i32 addrspace(1)* %in.gep.1
-
-  store i32 %lo, i32 addrspace(1)* %out.gep.0
-  store i32 %hi, i32 addrspace(1)* %out.gep.1
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_adjacent_loads_shuffle_i32
-; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32>
-; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0
-; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1
-; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT1]], i32 0
-; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT0]], i32 1
-; CHECK: store <2 x i32> [[INSERT1]]
-define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-
-  %lo = load i32, i32 addrspace(1)* %in
-  %hi = load i32, i32 addrspace(1)* %in.gep.1
-
-  store i32 %hi, i32 addrspace(1)* %out
-  store i32 %lo, i32 addrspace(1)* %out.gep.1
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32
-; CHECK: load <4 x i32>
-; CHECK: store <4 x i32>
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
-  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
-
-  %x = load i32, i32 addrspace(1)* %in
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
-  %w = load i32, i32 addrspace(1)* %in.gep.3
-
-  store i32 %x, i32 addrspace(1)* %out
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %z, i32 addrspace(1)* %out.gep.2
-  store i32 %w, i32 addrspace(1)* %out.gep.3
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32
-; CHECK: load <3 x i32>
-; CHECK: store <3 x i32>
-define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
-
-  %x = load i32, i32 addrspace(1)* %in
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
-
-  store i32 %x, i32 addrspace(1)* %out
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %z, i32 addrspace(1)* %out.gep.2
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32
-; CHECK: load <4 x float>
-; CHECK: store <4 x float>
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
-  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
-  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
-
-  %x = load float, float addrspace(1)* %in
-  %y = load float, float addrspace(1)* %in.gep.1
-  %z = load float, float addrspace(1)* %in.gep.2
-  %w = load float, float addrspace(1)* %in.gep.3
-
-  store float %x, float addrspace(1)* %out
-  store float %y, float addrspace(1)* %out.gep.1
-  store float %z, float addrspace(1)* %out.gep.2
-  store float %w, float addrspace(1)* %out.gep.3
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base
-; CHECK: load <4 x i32>
-; CHECK: store <4 x i32>
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
-  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
-  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
-
-  %x = load i32, i32 addrspace(1)* %in.gep.0
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
-  %w = load i32, i32 addrspace(1)* %in.gep.3
-
-  store i32 %x, i32 addrspace(1)* %out.gep.0
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %z, i32 addrspace(1)* %out.gep.2
-  store i32 %w, i32 addrspace(1)* %out.gep.3
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32
-; CHECK: load <4 x i32>
-; CHECK: store <4 x i32>
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
-  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
-
-  %x = load i32, i32 addrspace(1)* %in
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
-  %w = load i32, i32 addrspace(1)* %in.gep.3
-
-  ; Make sure the barrier doesn't stop this
-  tail call void @llvm.amdgcn.s.barrier() #1
-
-  store i32 %w, i32 addrspace(1)* %out.gep.3
-  store i32 %z, i32 addrspace(1)* %out.gep.2
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %x, i32 addrspace(1)* %out
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32
-; CHECK: load <4 x i32>
-; CHECK: store <4 x i32>
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
-  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
-
-  %x = load i32, i32 addrspace(1)* %in
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
-  %w = load i32, i32 addrspace(1)* %in.gep.3
-
-  ; Make sure the barrier doesn't stop this
-  tail call void @llvm.amdgcn.s.barrier() #1
-
-  store i32 %w, i32 addrspace(1)* %out
-  store i32 %z, i32 addrspace(1)* %out.gep.1
-  store i32 %y, i32 addrspace(1)* %out.gep.2
-  store i32 %x, i32 addrspace(1)* %out.gep.3
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8
-; CHECK: load <4 x i8>
-; CHECK: extractelement <4 x i8>
-; CHECK: extractelement <4 x i8>
-; CHECK: extractelement <4 x i8>
-; CHECK: extractelement <4 x i8>
-; CHECK: insertelement <4 x i8>
-; CHECK: insertelement <4 x i8>
-; CHECK: insertelement <4 x i8>
-; CHECK: insertelement <4 x i8>
-; CHECK: store <4 x i8>
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
-  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
-  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
-
-  %x = load i8, i8 addrspace(1)* %in, align 4
-  %y = load i8, i8 addrspace(1)* %in.gep.1
-  %z = load i8, i8 addrspace(1)* %in.gep.2
-  %w = load i8, i8 addrspace(1)* %in.gep.3
-
-  store i8 %x, i8 addrspace(1)* %out, align 4
-  store i8 %y, i8 addrspace(1)* %out.gep.1
-  store i8 %z, i8 addrspace(1)* %out.gep.2
-  store i8 %w, i8 addrspace(1)* %out.gep.3
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align
-; CHECK: load <4 x i8>
-; CHECK: store <4 x i8>
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
-  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
-  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
-
-  %x = load i8, i8 addrspace(1)* %in
-  %y = load i8, i8 addrspace(1)* %in.gep.1
-  %z = load i8, i8 addrspace(1)* %in.gep.2
-  %w = load i8, i8 addrspace(1)* %in.gep.3
-
-  store i8 %x, i8 addrspace(1)* %out
-  store i8 %y, i8 addrspace(1)* %out.gep.1
-  store i8 %z, i8 addrspace(1)* %out.gep.2
-  store i8 %w, i8 addrspace(1)* %out.gep.3
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32
-; CHECK: load <4 x i32>
-; CHECK: store <4 x i32>
-define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
-
-  %x = extractelement <4 x i32> %vec, i32 0
-  %y = extractelement <4 x i32> %vec, i32 1
-  %z = extractelement <4 x i32> %vec, i32 2
-  %w = extractelement <4 x i32> %vec, i32 3
-
-  store i32 %x, i32 addrspace(1)* %out
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %z, i32 addrspace(1)* %out.gep.2
-  store i32 %w, i32 addrspace(1)* %out.gep.3
-  ret void
-}
-
-; CHECK-LABEL: @merge_local_store_2_constants_i8
-; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* %{{[0-9]+}}, align 2
-define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
-
-  store i8 123, i8 addrspace(3)* %out.gep.1
-  store i8 456, i8 addrspace(3)* %out, align 2
-  ret void
-}
-
-; CHECK-LABEL: @merge_local_store_2_constants_i32
-; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
-
-  store i32 123, i32 addrspace(3)* %out.gep.1
-  store i32 456, i32 addrspace(3)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2
-; CHECK: store i32
-; CHECK: store i32
-define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
-
-  store i32 123, i32 addrspace(3)* %out.gep.1, align 2
-  store i32 456, i32 addrspace(3)* %out, align 2
-  ret void
-}
-
-; CHECK-LABEL: @merge_local_store_4_constants_i32
-; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(3)*
-define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
-
-  store i32 123, i32 addrspace(3)* %out.gep.1
-  store i32 456, i32 addrspace(3)* %out.gep.2
-  store i32 333, i32 addrspace(3)* %out.gep.3
-  store i32 1234, i32 addrspace(3)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_5_constants_i32
-; CHECK: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-; CHECK: store i32
-define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
-  store i32 9, i32 addrspace(1)* %out, align 4
-  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 12, i32 addrspace(1)* %idx1, align 4
-  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
-  store i32 16, i32 addrspace(1)* %idx2, align 4
-  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
-  store i32 -12, i32 addrspace(1)* %idx3, align 4
-  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
-  store i32 11, i32 addrspace(1)* %idx4, align 4
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_6_constants_i32
-; CHECK: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-; CHECK: store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
-  store i32 13, i32 addrspace(1)* %out, align 4
-  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 15, i32 addrspace(1)* %idx1, align 4
-  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
-  store i32 62, i32 addrspace(1)* %idx2, align 4
-  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
-  store i32 63, i32 addrspace(1)* %idx3, align 4
-  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
-  store i32 11, i32 addrspace(1)* %idx4, align 4
-  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
-  store i32 123, i32 addrspace(1)* %idx5, align 4
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_7_constants_i32
-; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-; CHECK: store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
-  store i32 34, i32 addrspace(1)* %out, align 4
-  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 999, i32 addrspace(1)* %idx1, align 4
-  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
-  store i32 65, i32 addrspace(1)* %idx2, align 4
-  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
-  store i32 33, i32 addrspace(1)* %idx3, align 4
-  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
-  store i32 98, i32 addrspace(1)* %idx4, align 4
-  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
-  store i32 91, i32 addrspace(1)* %idx5, align 4
-  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
-  store i32 212, i32 addrspace(1)* %idx6, align 4
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_8_constants_i32
-; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-; CHECK: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
-  store i32 34, i32 addrspace(1)* %out, align 4
-  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 999, i32 addrspace(1)* %idx1, align 4
-  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
-  store i32 65, i32 addrspace(1)* %idx2, align 4
-  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
-  store i32 33, i32 addrspace(1)* %idx3, align 4
-  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
-  store i32 98, i32 addrspace(1)* %idx4, align 4
-  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
-  store i32 91, i32 addrspace(1)* %idx5, align 4
-  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
-  store i32 212, i32 addrspace(1)* %idx6, align 4
-  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
-  store i32 999, i32 addrspace(1)* %idx7, align 4
-  ret void
-}
-
-; CHECK-LABEL: @copy_v3i32_align4
-; CHECK: %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
-; CHECK: store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
-define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
-  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
-  store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @copy_v3i64_align4
-; CHECK: %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
-; CHECK: store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
-define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
-  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
-  store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @copy_v3f32_align4
-; CHECK: %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
-; CHECK: store <3 x float>
-define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
-  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
-  store <3 x float> %fadd, <3 x float> addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @copy_v3f64_align4
-; CHECK: %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
-; CHECK: store <3 x double> %fadd, <3 x double> addrspace(1)* %out
-define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
-  %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
-  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
-  store <3 x double> %fadd, <3 x double> addrspace(1)* %out
-  ret void
-}
-
-; Verify that we no longer hit asserts for this test case. No change expected.
-; CHECK-LABEL: @copy_vec_of_ptrs
-; CHECK: %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1
-; CHECK: %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1
-; CHECK: %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4
-; CHECK: %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1
-; CHECK: store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1
-; CHECK: store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4
-define amdgpu_kernel void @copy_vec_of_ptrs(<2 x i16*> addrspace(1)* %out,
-                                            <2 x i16*> addrspace(1)* %in ) #0 {
-  %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1
-  %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1
-  %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4
-
-  %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1
-  store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1
-  store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4
-  ret void
-}
-
-declare void @llvm.amdgcn.s.barrier() #1
-
-attributes #0 = { nounwind }
-attributes #1 = { convergent nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
deleted file mode 100644
index bcf2265f310..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ /dev/null
@@ -1,91 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; CHECK-LABEL: @merge_v2i32_v2i32(
-; CHECK: load <4 x i32>
-; CHECK: store <4 x i32> zeroinitializer
-define amdgpu_kernel void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 {
-entry:
-  %a.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 1
-  %b.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 1
-
-  %ld.c = load <2 x i32>, <2 x i32> addrspace(1)* %b, align 4
-  %ld.c.idx.1 = load <2 x i32>, <2 x i32> addrspace(1)* %b.1, align 4
-
-  store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %a, align 4
-  store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %a.1, align 4
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_v1i32_v1i32(
-; CHECK: load <2 x i32>
-; CHECK: store <2 x i32> zeroinitializer
-define amdgpu_kernel void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 {
-entry:
-  %a.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %a, i64 1
-  %b.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %b, i64 1
-
-  %ld.c = load <1 x i32>, <1 x i32> addrspace(1)* %b, align 4
-  %ld.c.idx.1 = load <1 x i32>, <1 x i32> addrspace(1)* %b.1, align 4
-
-  store <1 x i32> zeroinitializer, <1 x i32> addrspace(1)* %a, align 4
-  store <1 x i32> zeroinitializer, <1 x i32> addrspace(1)* %a.1, align 4
-
-  ret void
-}
-
-; CHECK-LABEL: @no_merge_v3i32_v3i32(
-; CHECK: load <3 x i32>
-; CHECK: load <3 x i32>
-; CHECK: store <3 x i32> zeroinitializer
-; CHECK: store <3 x i32> zeroinitializer
-define amdgpu_kernel void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 {
-entry:
-  %a.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a, i64 1
-  %b.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b, i64 1
-
-  %ld.c = load <3 x i32>, <3 x i32> addrspace(1)* %b, align 4
-  %ld.c.idx.1 = load <3 x i32>, <3 x i32> addrspace(1)* %b.1, align 4
-
-  store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %a, align 4
-  store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %a.1, align 4
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_v2i16_v2i16(
-; CHECK: load <4 x i16>
-; CHECK: store <4 x i16> zeroinitializer
-define amdgpu_kernel void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 {
-entry:
-  %a.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a, i64 1
-  %b.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b, i64 1
-
-  %ld.c = load <2 x i16>, <2 x i16> addrspace(1)* %b, align 4
-  %ld.c.idx.1 = load <2 x i16>, <2 x i16> addrspace(1)* %b.1, align 4
-
-  store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %a, align 4
-  store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %a.1, align 4
-
-  ret void
-}
-
-; Ideally this would be merged
-; CHECK-LABEL: @merge_load_i32_v2i16(
-; CHECK: load i32,
-; CHECK: load <2 x i16>
-define amdgpu_kernel void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 {
-entry:
-  %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 1
-  %a.1.cast = bitcast i32 addrspace(1)* %a.1 to <2 x i16> addrspace(1)*
-
-  %ld.0 = load i32, i32 addrspace(1)* %a
-  %ld.1 = load <2 x i16>, <2 x i16> addrspace(1)* %a.1.cast
-
-  ret void
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
deleted file mode 100644
index ff718c1b101..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: opt -mtriple=amdgcn-- -load-store-vectorizer -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-@lds = internal addrspace(3) global [512 x float] undef, align 4
-
-; The original load has an implicit alignment of 4, and should not
-; increase to an align 8 load.
-
-; CHECK-LABEL: @load_keep_base_alignment_missing_align(
-; CHECK: load <2 x float>, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) {
-  %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11
-  %val0 = load float, float addrspace(3)* %ptr0
-
-  %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12
-  %val1 = load float, float addrspace(3)* %ptr1
-  %add = fadd float %val0, %val1
-  store float %add, float addrspace(1)* %out
-  ret void
-}
-
-
-; CHECK-LABEL: @store_keep_base_alignment_missing_align(
-; CHECK: store <2 x float> zeroinitializer, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @store_keep_base_alignment_missing_align() {
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2
-  store float 0.0, float addrspace(3)* %arrayidx0
-  store float 0.0, float addrspace(3)* %arrayidx1
-  ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
deleted file mode 100644
index ffd651b2c65..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; Checks that there is no crash when there are multiple tails
-; for the same head starting a chain.
-@0 = internal addrspace(3) global [16384 x i32] undef
-
-; CHECK-LABEL: @no_crash(
-; CHECK: store <2 x i32> zeroinitializer
-; CHECK: store i32 0
-; CHECK: store i32 0
-
-define amdgpu_kernel void @no_crash(i32 %arg) {
-  %tmp2 = add i32 %arg, 14
-  %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2
-  %tmp4 = add i32 %arg, 15
-  %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp4
-
-  store i32 0, i32 addrspace(3)* %tmp3, align 4
-  store i32 0, i32 addrspace(3)* %tmp5, align 4
-  store i32 0, i32 addrspace(3)* %tmp5, align 4
-  store i32 0, i32 addrspace(3)* %tmp5, align 4
-
-  ret void
-}
-
-; Check that adjacent memory locations are properly matched and the
-; longest chain is vectorized
-
-; CHECK-LABEL: @interleave_get_longest
-; CHECK: load <4 x i32>
-; CHECK: load i32
-; CHECK: store <2 x i32> zeroinitializer
-; CHECK: load i32
-; CHECK: load i32
-; CHECK: load i32
-
-define amdgpu_kernel void @interleave_get_longest(i32 %arg) {
-  %a1 = add i32 %arg, 1
-  %a2 = add i32 %arg, 2
-  %a3 = add i32 %arg, 3
-  %a4 = add i32 %arg, 4
-  %tmp1 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %arg
-  %tmp2 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a1
-  %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a2
-  %tmp4 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a3
-  %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a4
-
-  %l1 = load i32, i32 addrspace(3)* %tmp2, align 4
-  %l2 = load i32, i32 addrspace(3)* %tmp1, align 4
-  store i32 0, i32 addrspace(3)* %tmp2, align 4
-  store i32 0, i32 addrspace(3)* %tmp1, align 4
-  %l3 = load i32, i32 addrspace(3)* %tmp2, align 4
-  %l4 = load i32, i32 addrspace(3)* %tmp3, align 4
-  %l5 = load i32, i32 addrspace(3)* %tmp4, align 4
-  %l6 = load i32, i32 addrspace(3)* %tmp5, align 4
-  %l7 = load i32, i32 addrspace(3)* %tmp5, align 4
-  %l8 = load i32, i32 addrspace(3)* %tmp5, align 4
-
-  ret void
-}
-
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
deleted file mode 100644
index 86f6b6d55ec..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; CHECK-LABEL: @no_implicit_float(
-; CHECK: store i32
-; CHECK: store i32
-; CHECK: store i32
-; CHECK: store i32
-define amdgpu_kernel void @no_implicit_float(i32 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out.gep.2
-  store i32 333, i32 addrspace(1)* %out.gep.3
-  store i32 1234, i32 addrspace(1)* %out
-  ret void
-}
-
-attributes #0 = { nounwind noimplicitfloat }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
deleted file mode 100644
index 8a2abe50a5a..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; CHECK-LABEL: @optnone(
-; CHECK: store i32
-; CHECK: store i32
-define amdgpu_kernel void @optnone(i32 addrspace(1)* %out) noinline optnone {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @do_opt(
-; CHECK: store <2 x i32>
-define amdgpu_kernel void @do_opt(i32 addrspace(1)* %out) {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out
-  ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
deleted file mode 100644
index 9290749bb89..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
+++ /dev/null
@@ -1,311 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-declare i32 @llvm.amdgcn.workitem.id.x() #1
-
-; CHECK-LABEL: @merge_v2p1i8(
-; CHECK: load <2 x i64>
-; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*
-; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*
-; CHECK: store <2 x i64> zeroinitializer
-define amdgpu_kernel void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 {
-entry:
-  %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
-  %b.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, i64 1
-
-  %ld.c = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, align 4
-  %ld.c.idx.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b.1, align 4
-
-  store i8 addrspace(1)* null, i8 addrspace(1)* addrspace(1)* %a, align 4
-  store i8 addrspace(1)* null, i8 addrspace(1)* addrspace(1)* %a.1, align 4
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_v2p3i8(
-; CHECK: load <2 x i32>
-; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)*
-; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)*
-; CHECK: store <2 x i32> zeroinitializer
-define amdgpu_kernel void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 {
-entry:
-  %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i64 1
-  %b.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, i64 1
-
-  %ld.c = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, align 4
-  %ld.c.idx.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b.1, align 4
-
-  store i8 addrspace(3)* null, i8 addrspace(3)* addrspace(3)* %a, align 4
-  store i8 addrspace(3)* null, i8 addrspace(3)* addrspace(3)* %a.1, align 4
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_load_i64_ptr64(
-; CHECK: load <2 x i64>
-; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
-; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)*
-define amdgpu_kernel void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 {
-entry:
-  %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
-  %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)*
-
-  %ld.0 = load i64, i64 addrspace(1)* %a
-  %ld.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.1.cast
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_load_ptr64_i64(
-; CHECK: load <2 x i64>
-; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
-; CHECK: inttoptr i64 [[ELT0]] to i8 addrspace(1)*
-define amdgpu_kernel void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 {
-entry:
-  %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
-  %a.1 =  getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
-
-  %ld.0 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.cast
-  %ld.1 = load i64, i64 addrspace(1)* %a.1
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_store_ptr64_i64(
-; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64
-; CHECK: insertelement <2 x i64> undef, i64 [[ELT0]], i32 0
-; CHECK: store <2 x i64>
-define amdgpu_kernel void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 {
-entry:
-  %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
-  %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
-
-
-  store i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* %a.cast
-  store i64 %val1, i64 addrspace(1)* %a.1
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_store_i64_ptr64(
-; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64
-; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1]], i32 1
-; CHECK: store <2 x i64>
-define amdgpu_kernel void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 {
-entry:
-  %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
-  %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to i64 addrspace(1)*
-
-  store i64 %val0, i64 addrspace(1)* %a.cast
-  store i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* %a.1
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_load_i32_ptr32(
-; CHECK: load <2 x i32>
-; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1
-; CHECK: inttoptr i32 [[ELT1]] to i8 addrspace(3)*
-define amdgpu_kernel void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 {
-entry:
-  %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
-  %a.1.cast = bitcast i32 addrspace(3)* %a.1 to i8 addrspace(3)* addrspace(3)*
-
-  %ld.0 = load i32, i32 addrspace(3)* %a
-  %ld.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a.1.cast
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_load_ptr32_i32(
-; CHECK: load <2 x i32>
-; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0
-; CHECK: inttoptr i32 [[ELT0]] to i8 addrspace(3)*
-define amdgpu_kernel void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 {
-entry:
-  %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)*
-  %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
-
-  %ld.0 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a.cast
-  %ld.1 = load i32, i32 addrspace(3)* %a.1
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_store_ptr32_i32(
-; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr0 to i32
-; CHECK: insertelement <2 x i32> undef, i32 [[ELT0]], i32 0
-; CHECK: store <2 x i32>
-define amdgpu_kernel void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 {
-entry:
-  %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)*
-  %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
-
-  store i8 addrspace(3)* %ptr0, i8 addrspace(3)* addrspace(3)* %a.cast
-  store i32 %val1, i32 addrspace(3)* %a.1
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_store_i32_ptr32(
-; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr1 to i32
-; CHECK: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1
-; CHECK: store <2 x i32>
-define amdgpu_kernel void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 {
-entry:
-  %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i32 1
-  %a.cast = bitcast i8 addrspace(3)* addrspace(3)* %a to i32 addrspace(3)*
-
-  store i32 %val0, i32 addrspace(3)* %a.cast
-  store i8 addrspace(3)* %ptr1, i8 addrspace(3)* addrspace(3)* %a.1
-
-  ret void
-}
-
-; CHECK-LABEL: @no_merge_store_ptr32_i64(
-; CHECK: store i8 addrspace(3)*
-; CHECK: store i64
-define amdgpu_kernel void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 {
-entry:
-  %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)*
-  %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
-
-
-  store i8 addrspace(3)* %ptr0, i8 addrspace(3)* addrspace(1)* %a.cast
-  store i64 %val1, i64 addrspace(1)* %a.1
-
-  ret void
-}
-
-; CHECK-LABEL: @no_merge_store_i64_ptr32(
-; CHECK: store i64
-; CHECK: store i8 addrspace(3)*
-define amdgpu_kernel void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 {
-entry:
-  %a.1 =  getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a, i64 1
-  %a.cast = bitcast i8 addrspace(3)* addrspace(1)* %a to i64 addrspace(1)*
-
-  store i64 %val0, i64 addrspace(1)* %a.cast
-  store i8 addrspace(3)* %ptr1, i8 addrspace(3)* addrspace(1)* %a.1
-
-  ret void
-}
-
-; CHECK-LABEL: @no_merge_load_i64_ptr32(
-; CHECK: load i64,
-; CHECK: load i8 addrspace(3)*,
-define amdgpu_kernel void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 {
-entry:
-  %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
-  %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(3)* addrspace(1)*
-
-  %ld.0 = load i64, i64 addrspace(1)* %a
-  %ld.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a.1.cast
-
-  ret void
-}
-
-; CHECK-LABEL: @no_merge_load_ptr32_i64(
-; CHECK: load i8 addrspace(3)*,
-; CHECK: load i64,
-define amdgpu_kernel void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 {
-entry:
-  %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)*
-  %a.1 =  getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
-
-  %ld.0 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a.cast
-  %ld.1 = load i64, i64 addrspace(1)* %a.1
-
-  ret void
-}
-
-; XXX - This isn't merged for some reason
-; CHECK-LABEL: @merge_v2p1i8_v2p1i8(
-; CHECK: load <2 x i8 addrspace(1)*>
-; CHECK: load <2 x i8 addrspace(1)*>
-; CHECK: store <2 x i8 addrspace(1)*>
-; CHECK: store <2 x i8 addrspace(1)*>
-define amdgpu_kernel void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 {
-entry:
-  %a.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %a, i64 1
-  %b.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, i64 1
-
-  %ld.c = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, align 4
-  %ld.c.idx.1 = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b.1, align 4
-
-  store <2 x i8 addrspace(1)*> zeroinitializer, <2 x i8 addrspace(1)*> addrspace(1)* %a, align 4
-  store <2 x i8 addrspace(1)*> zeroinitializer, <2 x i8 addrspace(1)*> addrspace(1)* %a.1, align 4
-  ret void
-}
-
-; CHECK-LABEL: @merge_load_ptr64_f64(
-; CHECK: load <2 x i64>
-; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
-; CHECK: [[ELT0_INT:%[^ ]+]] = inttoptr i64 [[ELT0]] to i8 addrspace(1)*
-; CHECK: [[ELT1_INT:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
-; CHECK: bitcast i64 [[ELT1_INT]] to double
-define amdgpu_kernel void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 {
-entry:
-  %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
-  %a.1 =  getelementptr inbounds double, double addrspace(1)* %a, i64 1
-
-  %ld.0 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.cast
-  %ld.1 = load double, double addrspace(1)* %a.1
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_load_f64_ptr64(
-; CHECK: load <2 x i64>
-; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
-; CHECK: bitcast i64 [[ELT0]] to double
-; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
-; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)*
-define amdgpu_kernel void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 {
-entry:
-  %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
-  %a.1.cast = bitcast double addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)*
-
-  %ld.0 = load double, double addrspace(1)* %a
-  %ld.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.1.cast
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_store_ptr64_f64(
-; CHECK: [[ELT0_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64
-; CHECK: insertelement <2 x i64> undef, i64 [[ELT0_INT]], i32 0
-; CHECK: [[ELT1_INT:%[^ ]+]] = bitcast double %val1 to i64
-; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
-; CHECK: store <2 x i64>
-define amdgpu_kernel void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 {
-entry:
-  %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
-  %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
-
-  store i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* %a.cast
-  store double %val1, double addrspace(1)* %a.1
-
-  ret void
-}
-
-; CHECK-LABEL: @merge_store_f64_ptr64(
-; CHECK: [[ELT0_INT:%[^ ]+]] = bitcast double %val0 to i64
-; CHECK: insertelement <2 x i64> undef, i64 [[ELT0_INT]], i32 0
-; CHECK: [[ELT1_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64
-; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
-; CHECK: store <2 x i64>
-define amdgpu_kernel void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 {
-entry:
-  %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
-  %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to double addrspace(1)*
-
-  store double %val0, double addrspace(1)* %a.cast
-  store i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* %a.1
-
-  ret void
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll
deleted file mode 100644
index c020cc71b4a..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -dce -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-define void @base_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, <3 x i32> addrspace(1)* %out) {
-; CHECK-LABEL: @base_case
-; CHECK: load <3 x i32>
-entry:
-  %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 1
-  %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 2
-  %gep4 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 1
-  %gep5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 2
-  %selected = select i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b
-  %selected14 = select i1 %cnd, i32 addrspace(1)* %gep1, i32 addrspace(1)* %gep4
-  %selected25 = select i1 %cnd, i32 addrspace(1)* %gep2, i32 addrspace(1)* %gep5
-  %val0 = load i32, i32 addrspace(1)* %selected, align 4
-  %val1 = load i32, i32 addrspace(1)* %selected14, align 4
-  %val2 = load i32, i32 addrspace(1)* %selected25, align 4
-  %t0 = insertelement <3 x i32> undef, i32 %val0, i32 0
-  %t1 = insertelement <3 x i32> %t0, i32 %val1, i32 1
-  %t2 = insertelement <3 x i32> %t1, i32 %val2, i32 2
-  store <3 x i32> %t2, <3 x i32> addrspace(1)* %out
-  ret void
-}
-
-define void @scev_targeting_complex_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %base, <2 x i32> addrspace(1)* %out) {
-; CHECK-LABEL: @scev_targeting_complex_case
-; CHECK: load <2 x i32>
-entry:
-  %base.x4 = shl i32 %base, 2
-  %base.x4.p1 = add i32 %base.x4, 1
-  %base.x4.p2 = add i32 %base.x4, 2
-  %base.x4.p3 = add i32 %base.x4, 3
-  %zext.x4 = zext i32 %base.x4 to i64
-  %zext.x4.p1 = zext i32 %base.x4.p1 to i64
-  %zext.x4.p2 = zext i32 %base.x4.p2 to i64
-  %zext.x4.p3 = zext i32 %base.x4.p3 to i64
-  %base.x16 = mul i64 %zext.x4, 4
-  %base.x16.p4 = shl i64 %zext.x4.p1, 2
-  %base.x16.p8 = shl i64 %zext.x4.p2, 2
-  %base.x16.p12 = mul i64 %zext.x4.p3, 4
-  %a.pi8 = bitcast i32 addrspace(1)* %a to i8 addrspace(1)*
-  %b.pi8 = bitcast i32 addrspace(1)* %b to i8 addrspace(1)*
-  %gep.a.base.x16 = getelementptr inbounds i8, i8 addrspace(1)* %a.pi8, i64 %base.x16
-  %gep.b.base.x16.p4 = getelementptr inbounds i8, i8 addrspace(1)* %b.pi8, i64 %base.x16.p4
-  %gep.a.base.x16.p8 = getelementptr inbounds i8, i8 addrspace(1)* %a.pi8, i64 %base.x16.p8
-  %gep.b.base.x16.p12 = getelementptr inbounds i8, i8 addrspace(1)* %b.pi8, i64 %base.x16.p12
-  %a.base.x16 = bitcast i8 addrspace(1)* %gep.a.base.x16 to i32 addrspace(1)*
-  %b.base.x16.p4 = bitcast i8 addrspace(1)* %gep.b.base.x16.p4 to i32 addrspace(1)*
-  %selected.base.x16.p0.or.4 = select i1 %cnd, i32 addrspace(1)* %a.base.x16, i32 addrspace(1)* %b.base.x16.p4
-  %gep.selected.base.x16.p8.or.12 = select i1 %cnd, i8 addrspace(1)* %gep.a.base.x16.p8, i8 addrspace(1)* %gep.b.base.x16.p12
-  %selected.base.x16.p8.or.12 = bitcast i8 addrspace(1)* %gep.selected.base.x16.p8.or.12 to i32 addrspace(1)*
-  %selected.base.x16.p40.or.44 = getelementptr inbounds i32, i32 addrspace(1)* %selected.base.x16.p0.or.4, i64 10
-  %selected.base.x16.p44.or.48 = getelementptr inbounds i32, i32 addrspace(1)* %selected.base.x16.p8.or.12, i64 9
-  %val0 = load i32, i32 addrspace(1)* %selected.base.x16.p40.or.44, align 4
-  %val1 = load i32, i32 addrspace(1)* %selected.base.x16.p44.or.48, align 4
-  %t0 = insertelement <2 x i32> undef, i32 %val0, i32 0
-  %t1 = insertelement <2 x i32> %t0, i32 %val1, i32 1
-  store <2 x i32> %t1, <2 x i32> addrspace(1)* %out
-  ret void
-}
-
-define void @nested_selects(i1 %cnd0, i1 %cnd1, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %base, <2 x i32> addrspace(1)* %out) {
-; CHECK-LABEL: @nested_selects
-; CHECK: load <2 x i32>
-entry:
-  %base.p1 = add nsw i32 %base, 1
-  %base.p2 = add i32 %base, 2
-  %base.p3 = add nsw i32 %base, 3
-  %base.x4 = mul i32 %base, 4
-  %base.x4.p5 = add i32 %base.x4, 5
-  %base.x4.p6 = add i32 %base.x4, 6
-  %sext = sext i32 %base to i64
-  %sext.p1 = sext i32 %base.p1 to i64
-  %sext.p2 = sext i32 %base.p2 to i64
-  %sext.p3 = sext i32 %base.p3 to i64
-  %sext.x4.p5 = sext i32 %base.x4.p5 to i64
-  %sext.x4.p6 = sext i32 %base.x4.p6 to i64
-  %gep.a.base = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext
-  %gep.a.base.p1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p1
-  %gep.a.base.p2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p2
-  %gep.a.base.p3 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p3
-  %gep.b.base.x4.p5 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.x4.p5
-  %gep.b.base.x4.p6 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.x4.p6
-  %selected.1.L = select i1 %cnd1, i32 addrspace(1)* %gep.a.base.p2, i32 addrspace(1)* %gep.b.base.x4.p5
-  %selected.1.R = select i1 %cnd1, i32 addrspace(1)* %gep.a.base.p3, i32 addrspace(1)* %gep.b.base.x4.p6
-  %selected.0.L = select i1 %cnd0, i32 addrspace(1)* %gep.a.base, i32 addrspace(1)* %selected.1.L
-  %selected.0.R = select i1 %cnd0, i32 addrspace(1)* %gep.a.base.p1, i32 addrspace(1)* %selected.1.R
-  %val0 = load i32, i32 addrspace(1)* %selected.0.L, align 4
-  %val1 = load i32, i32 addrspace(1)* %selected.0.R, align 4
-  %t0 = insertelement <2 x i32> undef, i32 %val0, i32 0
-  %t1 = insertelement <2 x i32> %t0, i32 %val1, i32 1
-  store <2 x i32> %t1, <2 x i32> addrspace(1)* %out
-  ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
deleted file mode 100644
index 5ed7ee80ea0..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; Check that, in the presence of an aliasing load, the stores preceding the
-; aliasing load are safe to vectorize.
-
-; CHECK-LABEL: store_vectorize_with_alias
-; CHECK: store <4 x float>
-; CHECK: load <4 x float>
-; CHECK: store <4 x float>
-
-; Function Attrs: nounwind
-define amdgpu_kernel void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 {
-bb:
-  %tmp = bitcast i8 addrspace(1)* %b to float addrspace(1)*
-  %tmp1 = load float, float addrspace(1)* %tmp, align 4
-
-  %tmp2 = bitcast i8 addrspace(1)* %a to float addrspace(1)*
-  store float %tmp1, float addrspace(1)* %tmp2, align 4
-  %tmp3 = getelementptr i8, i8 addrspace(1)* %a, i64 4
-  %tmp4 = bitcast i8 addrspace(1)* %tmp3 to float addrspace(1)*
-  store float %tmp1, float addrspace(1)* %tmp4, align 4
-  %tmp5 = getelementptr i8, i8 addrspace(1)* %a, i64 8
-  %tmp6 = bitcast i8 addrspace(1)* %tmp5 to float addrspace(1)*
-  store float %tmp1, float addrspace(1)* %tmp6, align 4
-  %tmp7 = getelementptr i8, i8 addrspace(1)* %a, i64 12
-  %tmp8 = bitcast i8 addrspace(1)* %tmp7 to float addrspace(1)*
-  store float %tmp1, float addrspace(1)* %tmp8, align 4
-
-  %tmp9 = getelementptr i8, i8 addrspace(1)* %b, i64 16
-  %tmp10 = bitcast i8 addrspace(1)* %tmp9 to float addrspace(1)*
-  %tmp11 = load float, float addrspace(1)* %tmp10, align 4
-  %tmp12 = getelementptr i8, i8 addrspace(1)* %b, i64 20
-  %tmp13 = bitcast i8 addrspace(1)* %tmp12 to float addrspace(1)*
-  %tmp14 = load float, float addrspace(1)* %tmp13, align 4
-  %tmp15 = getelementptr i8, i8 addrspace(1)* %b, i64 24
-  %tmp16 = bitcast i8 addrspace(1)* %tmp15 to float addrspace(1)*
-  %tmp17 = load float, float addrspace(1)* %tmp16, align 4
-  %tmp18 = getelementptr i8, i8 addrspace(1)* %b, i64 28
-  %tmp19 = bitcast i8 addrspace(1)* %tmp18 to float addrspace(1)*
-  %tmp20 = load float, float addrspace(1)* %tmp19, align 4
-
-  %tmp21 = getelementptr i8, i8 addrspace(1)* %a, i64 16
-  %tmp22 = bitcast i8 addrspace(1)* %tmp21 to float addrspace(1)*
-  store float %tmp11, float addrspace(1)* %tmp22, align 4
-  %tmp23 = getelementptr i8, i8 addrspace(1)* %a, i64 20
-  %tmp24 = bitcast i8 addrspace(1)* %tmp23 to float addrspace(1)*
-  store float %tmp14, float addrspace(1)* %tmp24, align 4
-  %tmp25 = getelementptr i8, i8 addrspace(1)* %a, i64 24
-  %tmp26 = bitcast i8 addrspace(1)* %tmp25 to float addrspace(1)*
-  store float %tmp17, float addrspace(1)* %tmp26, align 4
-  %tmp27 = getelementptr i8, i8 addrspace(1)* %a, i64 28
-  %tmp28 = bitcast i8 addrspace(1)* %tmp27 to float addrspace(1)*
-  store float %tmp20, float addrspace(1)* %tmp28, align 4
-
-  ret void
-}
-
-attributes #0 = { argmemonly nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
deleted file mode 100644
index 65d114478b4..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
+++ /dev/null
@@ -1,201 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; Checks that we don't merge loads/stores of types smaller than one
-; byte, or vectors with elements smaller than one byte.
-
-%struct.foo = type { i32, i8 }
-
-declare void @use_i1(i1)
-declare void @use_i2(i2)
-declare void @use_i8(i8)
-declare void @use_foo(%struct.foo)
-declare void @use_v2i2(<2 x i2>)
-declare void @use_v4i2(<4 x i2>)
-declare void @use_v2i9(<2 x i9>)
-
-; CHECK-LABEL: @merge_store_2_constants_i1(
-; CHECK: store i1
-; CHECK: store i1
-define amdgpu_kernel void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1
-  store i1 true, i1 addrspace(1)* %out.gep.1
-  store i1 false, i1 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_store_2_constants_i2(
-; CHECK: store i2 1
-; CHECK: store i2 -1
-define amdgpu_kernel void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1
-  store i2 1, i2 addrspace(1)* %out.gep.1
-  store i2 -1, i2 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_different_store_sizes_i1_i8(
-; CHECK: store i1 true
-; CHECK: store i8 123
-define amdgpu_kernel void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
-  %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)*
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store i1 true, i1 addrspace(1)* %out.i1
-  store i8 123, i8 addrspace(1)* %out.gep.1
-  ret void
-}
-
-; CHECK-LABEL: @merge_different_store_sizes_i8_i1(
-; CHECK: store i8 123
-; CHECK: store i1 true
-define amdgpu_kernel void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
-  %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)*
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1
-  store i8 123, i8 addrspace(1)* %out.gep.1
-  store i1 true, i1 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_store_2_constant_structs(
-; CHECK: store %struct.foo
-; CHECK: store %struct.foo
-define amdgpu_kernel void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1
-  store %struct.foo { i32 12, i8 3 }, %struct.foo addrspace(1)* %out.gep.1
-  store %struct.foo { i32 92, i8 9 }, %struct.foo addrspace(1)* %out
-  ret void
-}
-
-; sub-byte element size
-; CHECK-LABEL: @merge_store_2_constants_v2i2(
-; CHECK: store <2 x i2>
-; CHECK: store <2 x i2>
-define amdgpu_kernel void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1
-  store <2 x i2> <i2 1, i2 -1>, <2 x i2> addrspace(1)* %out.gep.1
-  store <2 x i2> <i2 -1, i2 1>, <2 x i2> addrspace(1)* %out
-  ret void
-}
-
-; sub-byte element size but byte size
-
-; CHECK-LABEL: @merge_store_2_constants_v4i2(
-; CHECK: store <4 x i2>
-; CHECK: store <4 x i2>
-define amdgpu_kernel void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1
-  store <4 x i2> <i2 1, i2 -1, i2 1, i2 -1>, <4 x i2> addrspace(1)* %out.gep.1
-  store <4 x i2> <i2 -1, i2 1, i2 -1, i2 1>, <4 x i2> addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_load_2_constants_i1(
-; CHECK: load i1
-; CHECK: load i1
-define amdgpu_kernel void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1
-  %x = load i1, i1 addrspace(1)* %out.gep.1
-  %y = load i1, i1 addrspace(1)* %out
-  call void @use_i1(i1 %x)
-  call void @use_i1(i1 %y)
-  ret void
-}
-
-; CHECK-LABEL: @merge_load_2_constants_i2(
-; CHECK: load i2
-; CHECK: load i2
-define amdgpu_kernel void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1
-  %x = load i2, i2 addrspace(1)* %out.gep.1
-  %y = load i2, i2 addrspace(1)* %out
-  call void @use_i2(i2 %x)
-  call void @use_i2(i2 %y)
-  ret void
-}
-
-; CHECK-LABEL: @merge_different_load_sizes_i1_i8(
-; CHECK: load i1
-; CHECK: load i8
-define amdgpu_kernel void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
-  %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)*
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  %x = load i1, i1 addrspace(1)* %out.i1
-  %y = load i8, i8 addrspace(1)* %out.gep.1
-  call void @use_i1(i1 %x)
-  call void @use_i8(i8 %y)
-  ret void
-}
-
-; CHECK-LABEL: @merge_different_load_sizes_i8_i1(
-; CHECK: load i8
-; CHECK: load i1
-define amdgpu_kernel void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
-  %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)*
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1
-  %x = load i8, i8 addrspace(1)* %out.gep.1
-  %y = load i1, i1 addrspace(1)* %out
-  call void @use_i8(i8 %x)
-  call void @use_i1(i1 %y)
-  ret void
-}
-
-; CHECK-LABEL: @merge_load_2_constant_structs(
-; CHECK: load %struct.foo
-; CHECK: load %struct.foo
-define amdgpu_kernel void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1
-  %x = load %struct.foo, %struct.foo addrspace(1)* %out.gep.1
-  %y = load %struct.foo, %struct.foo addrspace(1)* %out
-  call void @use_foo(%struct.foo %x)
-  call void @use_foo(%struct.foo %y)
-  ret void
-}
-
-; CHECK-LABEL: @merge_load_2_constants_v2i2(
-; CHECK: load <2 x i2>
-; CHECK: load <2 x i2>
-define amdgpu_kernel void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1
-  %x = load <2 x i2>, <2 x i2> addrspace(1)* %out.gep.1
-  %y = load <2 x i2>, <2 x i2> addrspace(1)* %out
-  call void @use_v2i2(<2 x i2> %x)
-  call void @use_v2i2(<2 x i2> %y)
-  ret void
-}
-
-; CHECK-LABEL: @merge_load_2_constants_v4i2(
-; CHECK: load <4 x i2>
-; CHECK: load <4 x i2>
-define amdgpu_kernel void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1
-  %x = load <4 x i2>, <4 x i2> addrspace(1)* %out.gep.1
-  %y = load <4 x i2>, <4 x i2> addrspace(1)* %out
-  call void @use_v4i2(<4 x i2> %x)
-  call void @use_v4i2(<4 x i2> %y)
-  ret void
-}
-
-; CHECK-LABEL: @merge_store_2_constants_i9(
-; CHECK: store i9 3
-; CHECK: store i9 -5
-define amdgpu_kernel void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i9, i9 addrspace(1)* %out, i32 1
-  store i9 3, i9 addrspace(1)* %out.gep.1
-  store i9 -5, i9 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_load_2_constants_v2i9(
-; CHECK: load <2 x i9>
-; CHECK: load <2 x i9>
-define amdgpu_kernel void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr <2 x i9>, <2 x i9> addrspace(1)* %out, i32 1
-  %x = load <2 x i9>, <2 x i9> addrspace(1)* %out.gep.1
-  %y = load <2 x i9>, <2 x i9> addrspace(1)* %out
-  call void @use_v2i9(<2 x i9> %x)
-  call void @use_v2i9(<2 x i9> %y)
-  ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/lit.local.cfg b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/lit.local.cfg
deleted file mode 100644
index a5e90f8e3c1..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/lit.local.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-if not 'NVPTX' in config.root.targets:
-    config.unsupported = True
-
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll
deleted file mode 100644
index 72c13b4d12e..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll
+++ /dev/null
@@ -1,209 +0,0 @@
-; RUN: opt -mtriple=nvptx64-nvidia-cuda -load-store-vectorizer -S -o - %s | FileCheck %s
-
-; Check that the load/store vectorizer is willing to move loads/stores across
-; intervening instructions only if it's safe.
-;
-;  - Loads can be moved across instructions that don't write or throw.
-;  - Stores can only be moved across instructions which don't read, write, or
-;    throw.
-
-declare void @fn()
-declare void @fn_nounwind() #0
-declare void @fn_nounwind_writeonly() #1
-declare void @fn_nounwind_readonly() #2
-declare void @fn_writeonly() #3
-declare void @fn_readonly() #4
-declare void @fn_readnone() #5
-
-; CHECK-LABEL: @load_fn
-; CHECK: load
-; CHECK: call void @fn()
-; CHECK: load
-define void @load_fn(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  %v0 = load i32, i32* %p, align 8
-  call void @fn()
-  %v1 = load i32, i32* %p.1, align 4
-  ret void
-}
-
-; CHECK-LABEL: @load_fn_nounwind
-; CHECK: load
-; CHECK: call void @fn_nounwind()
-; CHECK: load
-define void @load_fn_nounwind(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  %v0 = load i32, i32* %p, align 8
-  call void @fn_nounwind() #0
-  %v1 = load i32, i32* %p.1, align 4
-  ret void
-}
-
-; CHECK-LABEL: @load_fn_nounwind_writeonly
-; CHECK: load
-; CHECK: call void @fn_nounwind_writeonly()
-; CHECK: load
-define void @load_fn_nounwind_writeonly(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  %v0 = load i32, i32* %p, align 8
-  call void @fn_nounwind_writeonly() #1
-  %v1 = load i32, i32* %p.1, align 4
-  ret void
-}
-
-; CHECK-LABEL: @load_fn_nounwind_readonly
-; CHECK-DAG: load <2 x i32>
-; CHECK-DAG: call void @fn_nounwind_readonly()
-define void @load_fn_nounwind_readonly(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  %v0 = load i32, i32* %p, align 8
-  call void @fn_nounwind_readonly() #2
-  %v1 = load i32, i32* %p.1, align 4
-  ret void
-}
-
-; CHECK-LABEL: @load_fn_readonly
-; CHECK: load
-; CHECK: call void @fn_readonly
-; CHECK: load
-define void @load_fn_readonly(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  %v0 = load i32, i32* %p, align 8
-  call void @fn_readonly() #4
-  %v1 = load i32, i32* %p.1, align 4
-  ret void
-}
-
-; CHECK-LABEL: @load_fn_writeonly
-; CHECK: load
-; CHECK: call void @fn_writeonly()
-; CHECK: load
-define void @load_fn_writeonly(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  %v0 = load i32, i32* %p, align 8
-  call void @fn_writeonly() #3
-  %v1 = load i32, i32* %p.1, align 4
-  ret void
-}
-
-; CHECK-LABEL: @load_fn_readnone
-; CHECK-DAG: load <2 x i32>
-; CHECK-DAG: call void @fn_readnone()
-define void @load_fn_readnone(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  %v0 = load i32, i32* %p, align 8
-  call void @fn_readnone() #5
-  %v1 = load i32, i32* %p.1, align 4
-  ret void
-}
-
-; ------------------------------------------------
-; Same tests, but now for stores instead of loads.
-; ------------------------------------------------
-
-; CHECK-LABEL: @store_fn
-; CHECK: store
-; CHECK: call void @fn()
-; CHECK: store
-define void @store_fn(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  store i32 0, i32* %p
-  call void @fn()
-  store i32 0, i32* %p.1
-  ret void
-}
-
-; CHECK-LABEL: @store_fn_nounwind
-; CHECK: store
-; CHECK: call void @fn_nounwind()
-; CHECK: store
-define void @store_fn_nounwind(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  store i32 0, i32* %p
-  call void @fn_nounwind() #0
-  store i32 0, i32* %p.1
-  ret void
-}
-
-; CHECK-LABEL: @store_fn_nounwind_writeonly
-; CHECK: store
-; CHECK: call void @fn_nounwind_writeonly()
-; CHECK: store
-define void @store_fn_nounwind_writeonly(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  store i32 0, i32* %p
-  call void @fn_nounwind_writeonly() #1
-  store i32 0, i32* %p.1
-  ret void
-}
-
-; CHECK-LABEL: @store_fn_nounwind_readonly
-; CHECK: store
-; CHECK: call void @fn_nounwind_readonly()
-; CHECK: store
-define void @store_fn_nounwind_readonly(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  store i32 0, i32* %p
-  call void @fn_nounwind_readonly() #2
-  store i32 0, i32* %p.1
-  ret void
-}
-
-; CHECK-LABEL: @store_fn_readonly
-; CHECK: store
-; CHECK: call void @fn_readonly
-; CHECK: store
-define void @store_fn_readonly(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  store i32 0, i32* %p
-  call void @fn_readonly() #4
-  store i32 0, i32* %p.1
-  ret void
-}
-
-; CHECK-LABEL: @store_fn_writeonly
-; CHECK: store
-; CHECK: call void @fn_writeonly()
-; CHECK: store
-define void @store_fn_writeonly(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  store i32 0, i32* %p
-  call void @fn_writeonly() #3
-  store i32 0, i32* %p.1
-  ret void
-}
-
-; This is the only store idiom we can vectorize.
-; CHECK-LABEL: @store_fn_readnone
-; CHECK-DAG: store <2 x i32>
-; CHECK-DAG: call void @fn_readnone()
-define void @store_fn_readnone(i32* %p) #0 {
-  %p.1 = getelementptr i32, i32* %p, i32 1
-
-  store i32 0, i32* %p, align 8
-  call void @fn_readnone() #5
-  store i32 0, i32* %p.1, align 8
-  ret void
-}
-
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind writeonly }
-attributes #2 = { nounwind readonly }
-attributes #3 = { writeonly }
-attributes #4 = { readonly }
-; readnone implies nounwind, so no need to test separately
-attributes #5 = { nounwind readnone }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll
deleted file mode 100644
index ff5e54f03ae..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: opt -mtriple=nvptx64-nvidia-cuda -load-store-vectorizer -S -o - %s | FileCheck %s
-
-; Load from a constant.  This can be vectorized, but shouldn't crash us.
-
-@global = internal addrspace(1) constant [4 x float] [float 0xBF71111120000000, float 0x3F70410420000000, float 0xBF81111120000000, float 0x3FB5555560000000], align 4
-
-define void @foo() {
-  ; CHECK: load <4 x float>
-  %a = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 0), align 16
-  %b = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 1), align 4
-  %c = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 2), align 4
-  %d = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 3), align 4
-  ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll
deleted file mode 100644
index ac0660e7833..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: opt -load-store-vectorizer -march=nvptx64 -mcpu=sm_35 -S < %s | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-; CHECK-LABEL: @foo
-define i32 @foo(i32* %ptr) {
-  %ptr1 = getelementptr i32, i32* %ptr, i32 1
-  %p1 = addrspacecast i32* %ptr1 to i32 addrspace(1)*
-  ; CHECK: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 8, !invariant.load !0
-  %v0 = load i32, i32* %ptr, align 8, !invariant.load !0
-  %v1 = load i32, i32* %ptr1, align 4, !invariant.load !0
-  %sum = add i32 %v0, %v1
-  ret i32 %sum
-}
-
-!0 = !{}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
deleted file mode 100644
index e29f3dfa537..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
+++ /dev/null
@@ -1,80 +0,0 @@
-; RUN: opt -codegenprepare -load-store-vectorizer %s -S -o - | FileCheck %s
-; RUN: opt                 -load-store-vectorizer %s -S -o - | FileCheck %s
-; RUN: opt -codegenprepare -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s
-; RUN: opt                 -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s
-
-target triple = "x86_64--"
-
-%union = type { { [4 x [4 x [4 x [16 x float]]]], [4 x [4 x [4 x [16 x float]]]], [10 x [10 x [4 x float]]] } }
-
-@global_pointer = external unnamed_addr global { %union, [2000 x i8] }, align 4
-
-; Function Attrs: convergent nounwind
-define void @test(i32 %base) #0 {
-; CHECK-LABEL: @test(
-; CHECK-NOT: load i32
-; CHECK: load <2 x i32>
-; CHECK-NOT: load i32
-entry:
-  %mul331 = and i32 %base, -4
-  %add350.4 = add i32 4, %mul331
-  %idx351.4 = zext i32 %add350.4 to i64
-  %arrayidx352.4 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.4
-  %tmp296.4 = bitcast float* %arrayidx352.4 to i32*
-  %add350.5 = add i32 5, %mul331
-  %idx351.5 = zext i32 %add350.5 to i64
-  %arrayidx352.5 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.5
-  %tmp296.5 = bitcast float* %arrayidx352.5 to i32*
-  %cnd = icmp ult i32 %base, 1000
-  br i1 %cnd, label %loads, label %exit
-
-loads:
-  ; If and only if the loads are in a different BB from the GEPs codegenprepare
-  ; would try to turn the GEPs into math, which makes LoadStoreVectorizer's job
-  ; harder
-  %tmp297.4 = load i32, i32* %tmp296.4, align 4, !tbaa !0
-  %tmp297.5 = load i32, i32* %tmp296.5, align 4, !tbaa !0
-  br label %exit
-
-exit:
-  ret void
-}
-
-; Function Attrs: convergent nounwind
-define void @test.codegenprepared(i32 %base) #0 {
-; CHECK-LABEL: @test.codegenprepared(
-; CHECK-NOT: load i32
-; CHECK: load <2 x i32>
-; CHECK-NOT: load i32
-entry:
-  %mul331 = and i32 %base, -4
-  %add350.4 = add i32 4, %mul331
-  %idx351.4 = zext i32 %add350.4 to i64
-  %add350.5 = add i32 5, %mul331
-  %idx351.5 = zext i32 %add350.5 to i64
-  %cnd = icmp ult i32 %base, 1000
-  br i1 %cnd, label %loads, label %exit
-
-loads:                                            ; preds = %entry
-  %sunkaddr = mul i64 %idx351.4, 4
-  %sunkaddr1 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr
-  %sunkaddr2 = getelementptr inbounds i8, i8* %sunkaddr1, i64 4096
-  %0 = bitcast i8* %sunkaddr2 to i32*
-  %tmp297.4 = load i32, i32* %0, align 4, !tbaa !0
-  %sunkaddr3 = mul i64 %idx351.5, 4
-  %sunkaddr4 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr3
-  %sunkaddr5 = getelementptr inbounds i8, i8* %sunkaddr4, i64 4096
-  %1 = bitcast i8* %sunkaddr5 to i32*
-  %tmp297.5 = load i32, i32* %1, align 4, !tbaa !0
-  br label %exit
-
-exit:                                             ; preds = %loads, %entry
-  ret void
-}
-
-attributes #0 = { convergent nounwind }
-
-!0 = !{!1, !1, i64 0}
-!1 = !{!"float", !2, i64 0}
-!2 = !{!"omnipotent char", !3, i64 0}
-!3 = !{!"Simple C++ TBAA"}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll
deleted file mode 100644
index e2181f6086c..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll
+++ /dev/null
@@ -1,77 +0,0 @@
-; RUN: opt -load-store-vectorizer %s -S | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S | FileCheck %s
-
-; Check that setting wrapping flags after a SCEV node is created
-; does not invalidate "sorted by complexity" invariant for
-; operands of commutative and associative SCEV operators.
-
-target triple = "x86_64--"
-
-@global_value0 = external constant i32
-@global_value1 = external constant i32
-@other_value = external global float
-@a = external global float
-@b = external global float
-@c = external global float
-@d = external global float
-@plus1 = external global i32
-@cnd = external global i8
-
-; Function Attrs: nounwind
-define void @main() local_unnamed_addr #0 {
-; CHECK-LABEL: @main()
-; CHECK: [[PTR:%[0-9]+]] = bitcast float* %preheader.load0.address to <2 x float>*
-; CHECK:  = load <2 x float>, <2 x float>* [[PTR]]
-; CHECK-LABEL: for.body23:
-entry:
-  %tmp = load i32, i32* @global_value0, !range !0
-  %tmp2 = load i32, i32* @global_value1
-  %and.i.i = and i32 %tmp2, 2
-  %add.nuw.nsw.i.i = add nuw nsw i32 %and.i.i, 0
-  %mul.i.i = shl nuw nsw i32 %add.nuw.nsw.i.i, 1
-  %and6.i.i = and i32 %tmp2, 3
-  %and9.i.i = and i32 %tmp2, 4
-  %add.nuw.nsw10.i.i = add nuw nsw i32 %and6.i.i, %and9.i.i
-  %conv3.i42.i = add nuw nsw i32 %mul.i.i, 1
-  %reass.add346.7 = add nuw nsw i32 %add.nuw.nsw10.i.i, 56
-  %reass.mul347.7 = mul nuw nsw i32 %tmp, %reass.add346.7
-  %add7.i.7 = add nuw nsw i32 %reass.mul347.7, 0
-  %preheader.address0.idx = add nuw nsw i32 %add7.i.7, %mul.i.i
-  %preheader.address0.idx.zext = zext i32 %preheader.address0.idx to i64
-  %preheader.load0.address = getelementptr inbounds float, float* @other_value, i64 %preheader.address0.idx.zext
-  %preheader.load0. = load float, float* %preheader.load0.address, align 4, !tbaa !1
-  %common.address.idx = add nuw nsw i32 %add7.i.7, %conv3.i42.i
-  %preheader.header.common.address.idx.zext = zext i32 %common.address.idx to i64
-  %preheader.load1.address = getelementptr inbounds float, float* @other_value, i64 %preheader.header.common.address.idx.zext
-  %preheader.load1. = load float, float* %preheader.load1.address, align 4, !tbaa !1
-  br label %for.body23
-
-for.body23:                                       ; preds = %for.body23, %entry
-  %loop.header.load0.address = getelementptr inbounds float, float* @other_value, i64 %preheader.header.common.address.idx.zext
-  %loop.header.load0. = load float, float* %loop.header.load0.address, align 4, !tbaa !1
-  %reass.mul343.7 = mul nuw nsw i32 %reass.add346.7, 72
-  %add7.i286.7.7 = add nuw nsw i32 %reass.mul343.7, 56
-  %add9.i288.7.7 = add nuw nsw i32 %add7.i286.7.7, %mul.i.i
-  %loop.header.address1.idx = add nuw nsw i32 %add9.i288.7.7, 1
-  %loop.header.address1.idx.zext = zext i32 %loop.header.address1.idx to i64
-  %loop.header.load1.address = getelementptr inbounds float, float* @other_value, i64 %loop.header.address1.idx.zext
-  %loop.header.load1. = load float, float* %loop.header.load1.address, align 4, !tbaa !1
-  store float %preheader.load0., float* @a, align 4, !tbaa !1
-  store float %preheader.load1., float* @b, align 4, !tbaa !1
-  store float %loop.header.load0., float* @c, align 4, !tbaa !1
-  store float %loop.header.load1., float* @d, align 4, !tbaa !1
-  %loaded.cnd = load i8, i8* @cnd
-  %condition = trunc i8 %loaded.cnd to i1
-  br i1 %condition, label %for.body23, label %exit
-
-exit:
-  ret void
-}
-
-attributes #0 = { nounwind }
-
-!0 = !{i32 0, i32 65536}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"float", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C++ TBAA"}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
deleted file mode 100644
index 043d6ea7e92..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-
-; CHECK-LABEL: @correct_order(
-; CHECK: [[LOAD_PTR:%[0-9]+]] = bitcast i32* %next.gep1
-; CHECK: load <2 x i32>, <2 x i32>* [[LOAD_PTR]]
-; CHECK: load i32, i32* %next.gep
-; CHECK: [[STORE_PTR:%[0-9]+]] = bitcast i32* %next.gep
-; CHECK: store <2 x i32>
-; CHECK-SAME: <2 x i32>* [[STORE_PTR]]
-; CHECK: load i32, i32* %next.gep1
-define void @correct_order(i32* noalias %ptr) {
-  %next.gep = getelementptr i32, i32* %ptr, i64 0
-  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
-  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
-
-  %l1 = load i32, i32* %next.gep1, align 4
-  %l2 = load i32, i32* %next.gep, align 4
-  store i32 0, i32* %next.gep1, align 4
-  store i32 0, i32* %next.gep, align 4
-  %l3 = load i32, i32* %next.gep1, align 4
-  %l4 = load i32, i32* %next.gep2, align 4
-
-  ret void
-}
-
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg b/llvm/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg
deleted file mode 100644
index e71f3cc4c41..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-if not 'X86' in config.root.targets:
-    config.unsupported = True
-
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/load-width.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
deleted file mode 100644
index ac5f3ea9f0f..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
-
-define <8 x double> @loadwidth_insert_extract(double* %ptr) {
-    %a = bitcast double* %ptr to <2 x double> *
-    %b = getelementptr <2 x double>, <2 x double>* %a, i32 1
-    %c = getelementptr <2 x double>, <2 x double>* %a, i32 2
-    %d = getelementptr <2 x double>, <2 x double>* %a, i32 3
-; CHECK-HSW: load <4 x double>
-; CHECK-HSW: load <4 x double>
-; CHECK-HSW-NOT: load
-; CHECK-KNL: load <8 x double>
-; CHECK-KNL-NOT: load
-    %la = load <2 x double>, <2 x double> *%a
-    %lb = load <2 x double>, <2 x double> *%b
-    %lc = load <2 x double>, <2 x double> *%c
-    %ld = load <2 x double>, <2 x double> *%d
-    ; Scalarize everything - Explicitly not a shufflevector to test this code
-    ; path in the LSV
-    %v1 = extractelement <2 x double> %la, i32 0
-    %v2 = extractelement <2 x double> %la, i32 1
-    %v3 = extractelement <2 x double> %lb, i32 0
-    %v4 = extractelement <2 x double> %lb, i32 1
-    %v5 = extractelement <2 x double> %lc, i32 0
-    %v6 = extractelement <2 x double> %lc, i32 1
-    %v7 = extractelement <2 x double> %ld, i32 0
-    %v8 = extractelement <2 x double> %ld, i32 1
-    ; Make a vector again
-    %i1 = insertelement <8 x double> undef, double %v1, i32 0
-    %i2 = insertelement <8 x double> %i1, double %v2, i32 1
-    %i3 = insertelement <8 x double> %i2, double %v3, i32 2
-    %i4 = insertelement <8 x double> %i3, double %v4, i32 3
-    %i5 = insertelement <8 x double> %i4, double %v5, i32 4
-    %i6 = insertelement <8 x double> %i5, double %v6, i32 5
-    %i7 = insertelement <8 x double> %i6, double %v7, i32 6
-    %i8 = insertelement <8 x double> %i7, double %v8, i32 7
-    ret <8 x double> %i8
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
deleted file mode 100644
index a93e9aceb73..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S < %s | \
-; RUN:     FileCheck %s
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S < %s | \
-; RUN:     FileCheck %s
-;
-; The GPU Load & Store Vectorizer may merge differently-typed accesses into a
-; single instruction. This test checks that we merge TBAA tags for such
-; accesses correctly.
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; struct S {
-;   float f;
-;   int i;
-; };
-%struct.S = type { float, i32 }
-
-; float foo(S *p) {
-;   p->f -= 1;
-;   p->i -= 1;
-;   return p->f;
-; }
-define float @foo(%struct.S* %p) {
-entry:
-; CHECK-LABEL: foo
-; CHECK: load <2 x i32>, {{.*}}, !tbaa [[TAG_char:!.*]]
-; CHECK: store <2 x i32> {{.*}}, !tbaa [[TAG_char]]
-  %f = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 0
-  %0 = load float, float* %f, align 4, !tbaa !2
-  %sub = fadd float %0, -1.000000e+00
-  store float %sub, float* %f, align 4, !tbaa !2
-  %i = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 1
-  %1 = load i32, i32* %i, align 4, !tbaa !8
-  %sub1 = add nsw i32 %1, -1
-  store i32 %sub1, i32* %i, align 4, !tbaa !8
-  ret float %sub
-}
-
-!2 = !{!3, !4, i64 0}
-!3 = !{!"_ZTS1S", !4, i64 0, !7, i64 4}
-!4 = !{!"float", !5, i64 0}
-!5 = !{!"omnipotent char", !6, i64 0}
-!6 = !{!"Simple C++ TBAA"}
-!7 = !{!"int", !5, i64 0}
-!8 = !{!3, !7, i64 4}
-
-; CHECK-DAG: [[TYPE_char:!.*]] = !{!"omnipotent char", {{.*}}, i64 0}
-; CHECK-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
deleted file mode 100644
index 7a0073808a0..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -load-store-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
-; RUN: opt < %s -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
-
-%rec = type { i32, i28 }
-
-; We currently do not optimize this scenario.
-; But we verify that we no longer crash when compiling this.
-define void @test1(%rec* %out, %rec* %in) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[IN1:%.*]] = getelementptr [[REC:%.*]], %rec* [[IN:%.*]], i16 0, i32 0
-; CHECK-NEXT:    [[IN2:%.*]] = getelementptr [[REC]], %rec* [[IN]], i16 0, i32 1
-; CHECK-NEXT:    [[VAL1:%.*]] = load i32, i32* [[IN1]], align 8
-; CHECK-NEXT:    [[VAL2:%.*]] = load i28, i28* [[IN2]]
-; CHECK-NEXT:    [[OUT1:%.*]] = getelementptr [[REC]], %rec* [[OUT:%.*]], i16 0, i32 0
-; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr [[REC]], %rec* [[OUT]], i16 0, i32 1
-; CHECK-NEXT:    store i32 [[VAL1]], i32* [[OUT1]], align 8
-; CHECK-NEXT:    store i28 [[VAL2]], i28* [[OUT2]]
-; CHECK-NEXT:    ret void
-;
-  %in1 = getelementptr %rec, %rec* %in, i16 0, i32 0
-  %in2 = getelementptr %rec, %rec* %in, i16 0, i32 1
-  %val1 = load i32, i32* %in1, align 8
-  %val2 = load i28, i28* %in2
-  %out1 = getelementptr %rec, %rec* %out, i16 0, i32 0
-  %out2 = getelementptr %rec, %rec* %out, i16 0, i32 1
-  store i32 %val1, i32* %out1, align 8
-  store i28 %val2, i28* %out2
-  ret void
-}
-
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
deleted file mode 100644
index 3cfe7454baf..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: opt -mtriple=x86_64-unknown-linux -load-store-vectorizer -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=x86_64-unknown-linux -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
-
-%struct.buffer_t = type { i32, i8* }
-
-; Check an i32 and i8* get vectorized, and that the two accesses
-; (load into buff.val and store to buff.p) preserve their order.
-; Vectorized loads should be inserted at the position of the first load,
-; and instructions which were between the first and last load should be
-; reordered preserving their relative order inasmuch as possible.
-
-; CHECK-LABEL: @preserve_order_32(
-; CHECK: load <2 x i32>
-; CHECK: %buff.val = load i8
-; CHECK: store i8 0
-define void @preserve_order_32(%struct.buffer_t* noalias %buff) #0 {
-entry:
-  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i32 0, i32 1
-  %buff.p = load i8*, i8** %tmp1
-  %buff.val = load i8, i8* %buff.p
-  store i8 0, i8* %buff.p, align 8
-  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i32 0, i32 0
-  %buff.int = load i32, i32* %tmp0, align 8
-  ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
deleted file mode 100644
index 3ae0d891dc5..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
+++ /dev/null
@@ -1,78 +0,0 @@
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-
-%struct.buffer_t = type { i64, i8* }
-%struct.nested.buffer = type { %struct.buffer_t, %struct.buffer_t }
-
-; Check an i64 and i8* get vectorized, and that the two accesses
-; (load into buff.val and store to buff.p) preserve their order.
-; Vectorized loads should be inserted at the position of the first load,
-; and instructions which were between the first and last load should be
-; reordered preserving their relative order inasmuch as possible.
-
-; CHECK-LABEL: @preserve_order_64(
-; CHECK: load <2 x i64>
-; CHECK: %buff.val = load i8
-; CHECK: store i8 0
-define void @preserve_order_64(%struct.buffer_t* noalias %buff) #0 {
-entry:
-  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1
-  %buff.p = load i8*, i8** %tmp1
-  %buff.val = load i8, i8* %buff.p
-  store i8 0, i8* %buff.p, align 8
-  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0
-  %buff.int = load i64, i64* %tmp0, align 16
-  ret void
-}
-
-; Check reordering recurses correctly.
-
-; CHECK-LABEL: @transitive_reorder(
-; CHECK: load <2 x i64>
-; CHECK: %buff.val = load i8
-; CHECK: store i8 0
-define void @transitive_reorder(%struct.buffer_t* noalias %buff, %struct.nested.buffer* noalias %nest) #0 {
-entry:
-  %nest0_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0
-  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest0_0, i64 0, i32 1
-  %buff.p = load i8*, i8** %tmp1
-  %buff.val = load i8, i8* %buff.p
-  store i8 0, i8* %buff.p, align 8
-  %nest1_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0
-  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest1_0, i64 0, i32 0
-  %buff.int = load i64, i64* %tmp0, align 16
-  ret void
-}
-
-; Check for no vectorization over phi node
-
-; CHECK-LABEL: @no_vect_phi(
-; CHECK: load i8*
-; CHECK: load i8
-; CHECK: store i8 0
-; CHECK: load i64
-define void @no_vect_phi(i32* noalias %ptr, %struct.buffer_t* noalias %buff) {
-entry:
-  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1
-  %buff.p = load i8*, i8** %tmp1
-  %buff.val = load i8, i8* %buff.p
-  store i8 0, i8* %buff.p, align 8
-  br label %"for something"
-
-"for something":
-  %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ]
-
-  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0
-  %buff.int = load i64, i64* %tmp0, align 16
-
-  %index.next = add i64 %index, 8
-  %cmp_res = icmp eq i64 %index.next, 8
-  br i1 %cmp_res, label %ending, label %"for something"
-
-ending:
-  ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
deleted file mode 100644
index 72b29912d81..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
+++ /dev/null
@@ -1,118 +0,0 @@
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-
-; Vectorized subsets of the load/store chains in the presence of
-; interleaved loads/stores
-
-; CHECK-LABEL: @interleave_2L_2S(
-; CHECK: load <2 x i32>
-; CHECK: load i32
-; CHECK: store <2 x i32>
-; CHECK: load i32
-define void @interleave_2L_2S(i32* noalias %ptr) {
-  %next.gep = getelementptr i32, i32* %ptr, i64 0
-  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
-  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
-
-  %l1 = load i32, i32* %next.gep1, align 4
-  %l2 = load i32, i32* %next.gep, align 4
-  store i32 0, i32* %next.gep1, align 4
-  store i32 0, i32* %next.gep, align 4
-  %l3 = load i32, i32* %next.gep1, align 4
-  %l4 = load i32, i32* %next.gep2, align 4
-
-  ret void
-}
-
-; CHECK-LABEL: @interleave_3L_2S_1L(
-; CHECK: load <3 x i32>
-; CHECK: store <2 x i32>
-; CHECK: load i32
-
-define void @interleave_3L_2S_1L(i32* noalias %ptr) {
-  %next.gep = getelementptr i32, i32* %ptr, i64 0
-  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
-  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
-
-  %l2 = load i32, i32* %next.gep, align 4
-  %l1 = load i32, i32* %next.gep1, align 4
-  store i32 0, i32* %next.gep1, align 4
-  store i32 0, i32* %next.gep, align 4
-  %l3 = load i32, i32* %next.gep1, align 4
-  %l4 = load i32, i32* %next.gep2, align 4
-
-  ret void
-}
-
-; CHECK-LABEL: @chain_suffix(
-; CHECK: load i32
-; CHECK: store <2 x i32>
-; CHECK: load <2 x i32>
-define void @chain_suffix(i32* noalias %ptr) {
-  %next.gep = getelementptr i32, i32* %ptr, i64 0
-  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
-  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
-
-  %l2 = load i32, i32* %next.gep, align 4
-  store i32 0, i32* %next.gep1, align 4
-  store i32 0, i32* %next.gep, align 4
-  %l3 = load i32, i32* %next.gep1, align 4
-  %l4 = load i32, i32* %next.gep2, align 4
-
-  ret void
-}
-
-
-; CHECK-LABEL: @chain_prefix_suffix(
-; CHECK: load <2 x i32>
-; CHECK: store <2 x i32>
-; CHECK: load <3 x i32>
-define void  @chain_prefix_suffix(i32* noalias %ptr) {
-  %next.gep = getelementptr i32, i32* %ptr, i64 0
-  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
-  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
-  %next.gep3 = getelementptr i32, i32* %ptr, i64 3
-
-  %l1 = load i32, i32* %next.gep, align 4
-  %l2 = load i32, i32* %next.gep1, align 4
-  store i32 0, i32* %next.gep1, align 4
-  store i32 0, i32* %next.gep2, align 4
-  %l3 = load i32, i32* %next.gep1, align 4
-  %l4 = load i32, i32* %next.gep2, align 4
-  %l5 = load i32, i32* %next.gep3, align 4
-
-  ret void
-}
-
-; FIXME: If the chain is too long and TLI says misaligned is not fast,
-; then LSV fails to vectorize anything in that chain.
-; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7.
-
-; CHECK-LABEL: @interleave_get_longest
-; CHECK: load <3 x i32>
-; CHECK: load i32
-; CHECK: store <2 x i32> zeroinitializer
-; CHECK: load i32
-; CHECK: load i32
-; CHECK: load i32
-
-define void @interleave_get_longest(i32* noalias %ptr) {
-  %tmp1 = getelementptr i32, i32* %ptr, i64 0
-  %tmp2 = getelementptr i32, i32* %ptr, i64 1
-  %tmp3 = getelementptr i32, i32* %ptr, i64 2
-  %tmp4 = getelementptr i32, i32* %ptr, i64 3
-
-  %l1 = load i32, i32* %tmp2, align 4
-  %l2 = load i32, i32* %tmp1, align 4
-  store i32 0, i32* %tmp2, align 4
-  store i32 0, i32* %tmp1, align 4
-  %l3 = load i32, i32* %tmp2, align 4
-  %l4 = load i32, i32* %tmp3, align 4
-  %l5 = load i32, i32* %tmp4, align 4
-  %l6 = load i32, i32* %tmp4, align 4
-  %l7 = load i32, i32* %tmp4, align 4
-
-  ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
deleted file mode 100644
index 00971f35038..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck %s
-
-; Check that the LoadStoreVectorizer does not crash due to not differentiating <1 x T> and T.
-
-; CHECK-LABEL: @vector_scalar(
-; CHECK: store double
-; CHECK: store <1 x double>
-define void @vector_scalar(double* %ptr, double %a, <1 x double> %b) {
-  %1 = bitcast double* %ptr to <1 x double>*
-  %2 = getelementptr <1 x double>, <1 x double>* %1, i32 1
-  store double %a, double* %ptr, align 8
-  store <1 x double> %b, <1 x double>* %2, align 8
-  ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll b/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll
deleted file mode 100644
index 07487b57803..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; RUN: opt -S < %s -load-store-vectorizer | FileCheck %s
-; RUN: opt -S < %s -passes='function(load-store-vectorizer)' | FileCheck %s
-
-declare void @llvm.sideeffect()
-
-; load-store vectorization across a @llvm.sideeffect.
-
-; CHECK-LABEL: test
-; CHECK: load <4 x float>
-; CHECK: store <4 x float>
-define void @test(float* %p) {
-    %p0 = getelementptr float, float* %p, i64 0
-    %p1 = getelementptr float, float* %p, i64 1
-    %p2 = getelementptr float, float* %p, i64 2
-    %p3 = getelementptr float, float* %p, i64 3
-    %l0 = load float, float* %p0, align 16
-    %l1 = load float, float* %p1
-    %l2 = load float, float* %p2
-    call void @llvm.sideeffect()
-    %l3 = load float, float* %p3
-    store float %l0, float* %p0, align 16
-    call void @llvm.sideeffect()
-    store float %l1, float* %p1
-    store float %l2, float* %p2
-    store float %l3, float* %p3
-    ret void
-}
author	Eric Christopher <echristo@gmail.com>	2019-04-17 02:12:23 +0000
committer	Eric Christopher <echristo@gmail.com>	2019-04-17 02:12:23 +0000
commit	a86343512845c9c1fdbac865fea88aa5fce7142a (patch)
tree	666fc6353de19ad8b00e56b67edd33f24104e4a7 /llvm/test/Transforms/LoadStoreVectorizer
parent	7f8ca6e3679b3af951cb7a4b1377edfaa3244b93 (diff)
download	bcm5719-llvm-a86343512845c9c1fdbac865fea88aa5fce7142a.tar.gz bcm5719-llvm-a86343512845c9c1fdbac865fea88aa5fce7142a.zip