| author | Eric Christopher <echristo@gmail.com> | 2019-04-17 02:12:23 +0000 |
|---|---|---|
| committer | Eric Christopher <echristo@gmail.com> | 2019-04-17 02:12:23 +0000 |
| commit | a86343512845c9c1fdbac865fea88aa5fce7142a | |
| tree | 666fc6353de19ad8b00e56b67edd33f24104e4a7 /llvm/test/Transforms/LoadStoreVectorizer/AMDGPU | |
| parent | 7f8ca6e3679b3af951cb7a4b1377edfaa3244b93 | |
Temporarily Revert "Add basic loop fusion pass."
As it's causing some bot failures (and per request from kbarton).
This reverts commit r358543/ab70da07286e618016e78247e4a24fcb84077fda.
llvm-svn: 358546
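For reference, a temporary revert like this one amounts to a single git command (illustrative; the upstream repository at the time was still managed through git-svn, as the llvm-svn trailer above shows):

git revert ab70da07286e618016e78247e4a24fcb84077fda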
Diffstat (limited to 'llvm/test/Transforms/LoadStoreVectorizer/AMDGPU')
20 files changed, 0 insertions, 2538 deletions
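Every file deleted below is a regression test for LLVM's LoadStoreVectorizer on AMDGPU targets. As a rough sketch of the transformation these tests pin down (an illustrative before/after distilled from the test patterns below, not part of this commit; the function name is hypothetical), the pass merges adjacent scalar accesses into a single vector access when aliasing and alignment allow it:

; Before -load-store-vectorizer: two consecutive float stores through %a.
define amdgpu_kernel void @two_adjacent_stores(float addrspace(1)* %a) {
  %a.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  store float 0.0, float addrspace(1)* %a, align 4
  store float 0.0, float addrspace(1)* %a.1, align 4
  ret void
}
; After the pass, the pair becomes one vector store:
;   %0 = bitcast float addrspace(1)* %a to <2 x float> addrspace(1)*
;   store <2 x float> zeroinitializer, <2 x float> addrspace(1)* %0, align 4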
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
deleted file mode 100644
index d2834be18b0..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -scoped-noalias -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=SCOPE -check-prefix=ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=NOSCOPE -check-prefix=ALL %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; This fails to vectorize if the !alias.scope is not used
-
-; ALL-LABEL: @vectorize_alias_scope(
-; SCOPE: load float, float addrspace(1)* %c
-; SCOPE: bitcast float addrspace(1)* %a to <2 x float> addrspace(1)*
-; SCOPE: store <2 x float> zeroinitializer
-; SCOPE: store float %ld.c, float addrspace(1)* %b,
-
-; NOSCOPE: store float
-; NOSCOPE: load float
-; NOSCOPE: store float
-; NOSCOPE: store float
-define amdgpu_kernel void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
-entry:
-  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  store float 0.0, float addrspace(1)* %a, align 4, !noalias !0
-  %ld.c = load float, float addrspace(1)* %c, align 4, !alias.scope !0
-  store float 0.0, float addrspace(1)* %a.idx.1, align 4, !noalias !0
-  store float %ld.c, float addrspace(1)* %b, align 4, !noalias !0
-  ret void
-}
-
-attributes #0 = { nounwind }
-
-!0 = !{!1}
-!1 = distinct !{!1, !2, !"some scope"}
-!2 = distinct !{!2, !"some domain"}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
deleted file mode 100644
index b0dd5d185c7..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
+++ /dev/null
@@ -1,210 +0,0 @@
-; RUN: opt -S -load-store-vectorizer -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
-; RUN: opt -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
-; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
-; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
-
-target triple = "amdgcn--"
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; ALL-LABEL: @load_unknown_offset_align1_i8(
-; ALL: alloca [128 x i8], align 1
-; UNALIGNED: load <2 x i8>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: load i8, i8 addrspace(5)* %ptr0, align 1{{$}}
-; ALIGNED: load i8, i8 addrspace(5)* %ptr1, align 1{{$}}
-define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i8], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset
-  %val0 = load i8, i8 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1
-  %val1 = load i8, i8 addrspace(5)* %ptr1, align 1
-  %add = add i8 %val0, %val1
-  store i8 %add, i8 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: @load_unknown_offset_align1_i16(
-; ALL: alloca [128 x i16], align 1, addrspace(5){{$}}
-; UNALIGNED: load <2 x i16>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: load i16, i16 addrspace(5)* %ptr0, align 1{{$}}
-; ALIGNED: load i16, i16 addrspace(5)* %ptr1, align 1{{$}}
-define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i16], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset
-  %val0 = load i16, i16 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1
-  %val1 = load i16, i16 addrspace(5)* %ptr1, align 1
-  %add = add i16 %val0, %val1
-  store i16 %add, i16 addrspace(1)* %out
-  ret void
-}
-
-; FIXME: Although the offset is unknown here, we know it is a multiple
-; of the element size, so should still be align 4
-
-; ALL-LABEL: @load_unknown_offset_align1_i32(
-; ALL: alloca [128 x i32], align 1
-; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: load i32, i32 addrspace(5)* %ptr0, align 1
-; ALIGNED: load i32, i32 addrspace(5)* %ptr1, align 1
-define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i32], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
-  %val0 = load i32, i32 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
-  %val1 = load i32, i32 addrspace(5)* %ptr1, align 1
-  %add = add i32 %val0, %val1
-  store i32 %add, i32 addrspace(1)* %out
-  ret void
-}
-
-; FIXME: Should always increase alignment of the load
-; Make sure alloca alignment isn't decreased
-; ALL-LABEL: @load_alloca16_unknown_offset_align1_i32(
-; ALL: alloca [128 x i32], align 16
-
-; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-; ALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 4{{$}}
-define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i32], align 16, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
-  %val0 = load i32, i32 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
-  %val1 = load i32, i32 addrspace(5)* %ptr1, align 1
-  %add = add i32 %val0, %val1
-  store i32 %add, i32 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: @store_unknown_offset_align1_i8(
-; ALL: alloca [128 x i8], align 1
-; UNALIGNED: store <2 x i8> <i8 9, i8 10>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: store i8 9, i8 addrspace(5)* %ptr0, align 1{{$}}
-; ALIGNED: store i8 10, i8 addrspace(5)* %ptr1, align 1{{$}}
-define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i8], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset
-  store i8 9, i8 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1
-  store i8 10, i8 addrspace(5)* %ptr1, align 1
-  ret void
-}
-
-; ALL-LABEL: @store_unknown_offset_align1_i16(
-; ALL: alloca [128 x i16], align 1
-; UNALIGNED: store <2 x i16> <i16 9, i16 10>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: store i16 9, i16 addrspace(5)* %ptr0, align 1{{$}}
-; ALIGNED: store i16 10, i16 addrspace(5)* %ptr1, align 1{{$}}
-define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i16], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset
-  store i16 9, i16 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1
-  store i16 10, i16 addrspace(5)* %ptr1, align 1
-  ret void
-}
-
-; FIXME: Although the offset is unknown here, we know it is a multiple
-; of the element size, so it still should be align 4.
-
-; ALL-LABEL: @store_unknown_offset_align1_i32(
-; ALL: alloca [128 x i32], align 1
-
-; UNALIGNED: store <2 x i32> <i32 9, i32 10>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: store i32 9, i32 addrspace(5)* %ptr0, align 1
-; ALIGNED: store i32 10, i32 addrspace(5)* %ptr1, align 1
-define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
-  %alloca = alloca [128 x i32], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
-  store i32 9, i32 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
-  store i32 10, i32 addrspace(5)* %ptr1, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32(
-; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
-; ALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 4
-
-; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
-; UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32() {
-  %alloca = alloca [8 x i32], align 1, addrspace(5)
-  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
-
-  store i32 9, i32 addrspace(5)* %out, align 1
-  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
-  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
-  store i32 19, i32 addrspace(5)* %out.gep.3, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
-; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
-; ALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 4
-
-; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
-; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8() {
-  %alloca = alloca [8 x i8], align 1, addrspace(5)
-  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
-
-  store i8 9, i8 addrspace(5)* %out, align 1
-  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
-  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
-  store i8 19, i8 addrspace(5)* %out.gep.3, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i32(
-; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
-; ALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 4
-
-; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
-; UNALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() {
-  %alloca = alloca [8 x i32], align 1, addrspace(5)
-  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
-
-  %load0 = load i32, i32 addrspace(5)* %out, align 1
-  %load1 = load i32, i32 addrspace(5)* %out.gep.1, align 1
-  %load2 = load i32, i32 addrspace(5)* %out.gep.2, align 1
-  %load3 = load i32, i32 addrspace(5)* %out.gep.3, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i8(
-; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
-; ALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 4
-
-; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
-; UNALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() {
-  %alloca = alloca [8 x i8], align 1, addrspace(5)
-  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
-
-  %load0 = load i8, i8 addrspace(5)* %out, align 1
-  %load1 = load i8, i8 addrspace(5)* %out.gep.1, align 1
-  %load2 = load i8, i8 addrspace(5)* %out.gep.2, align 1
-  %load3 = load i8, i8 addrspace(5)* %out.gep.3, align 1
-  ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
deleted file mode 100644
index cd1c7fdc521..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
+++ /dev/null
@@ -1,52 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-declare i64 @_Z12get_local_idj(i32)
-
-declare i64 @_Z12get_group_idj(i32)
-
-declare double @llvm.fmuladd.f64(double, double, double)
-
-; CHECK-LABEL: @factorizedVsNonfactorizedAccess(
-; CHECK: load <2 x float>
-; CHECK: store <2 x float>
-define amdgpu_kernel void @factorizedVsNonfactorizedAccess(float addrspace(1)* nocapture %c) {
-entry:
-  %call = tail call i64 @_Z12get_local_idj(i32 0)
-  %call1 = tail call i64 @_Z12get_group_idj(i32 0)
-  %div = lshr i64 %call, 4
-  %div2 = lshr i64 %call1, 3
-  %mul = shl i64 %div2, 7
-  %rem = shl i64 %call, 3
-  %mul3 = and i64 %rem, 120
-  %add = or i64 %mul, %mul3
-  %rem4 = shl i64 %call1, 7
-  %mul5 = and i64 %rem4, 896
-  %mul6 = shl nuw nsw i64 %div, 3
-  %add7 = add nuw i64 %mul5, %mul6
-  %mul9 = shl i64 %add7, 10
-  %add10 = add i64 %mul9, %add
-  %arrayidx = getelementptr inbounds float, float addrspace(1)* %c, i64 %add10
-  %load1 = load float, float addrspace(1)* %arrayidx, align 4
-  %conv = fpext float %load1 to double
-  %mul11 = fmul double %conv, 0x3FEAB481D8F35506
-  %conv12 = fptrunc double %mul11 to float
-  %conv18 = fpext float %conv12 to double
-  %storeval1 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv18)
-  %cstoreval1 = fptrunc double %storeval1 to float
-  store float %cstoreval1, float addrspace(1)* %arrayidx, align 4
-
-  %add23 = or i64 %add10, 1
-  %arrayidx24 = getelementptr inbounds float, float addrspace(1)* %c, i64 %add23
-  %load2 = load float, float addrspace(1)* %arrayidx24, align 4
-  %conv25 = fpext float %load2 to double
-  %mul26 = fmul double %conv25, 0x3FEAB481D8F35506
-  %conv27 = fptrunc double %mul26 to float
-  %conv34 = fpext float %conv27 to double
-  %storeval2 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv34)
-  %cstoreval2 = fptrunc double %storeval2 to float
-  store float %cstoreval2, float addrspace(1)* %arrayidx24, align 4
-  ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
deleted file mode 100644
index b8e95a6793e..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
+++ /dev/null
@@ -1,151 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-declare i32 @llvm.amdgcn.workitem.id.x() #1
-
-; CHECK-LABEL: @basic_merge_sext_index(
-; CHECK: sext i32 %id.x to i64
-; CHECK: load <2 x float>
-; CHECK: store <2 x float> zeroinitializer
-define amdgpu_kernel void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
-entry:
-  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %sext.id.x = sext i32 %id.x to i64
-  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %sext.id.x
-  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %sext.id.x
-  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
-  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1
-
-  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
-  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
-
-  store float 0.0, float addrspace(1)* %a.idx.x, align 4
-  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4
-
-  %add = fadd float %ld.c, %ld.c.idx.1
-  store float %add, float addrspace(1)* %b, align 4
-  ret void
-}
-
-; CHECK-LABEL: @basic_merge_zext_index(
-; CHECK: zext i32 %id.x to i64
-; CHECK: load <2 x float>
-; CHECK: store <2 x float>
-define amdgpu_kernel void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
-entry:
-  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %zext.id.x = zext i32 %id.x to i64
-  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x
-  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x
-  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
-  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1
-
-  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
-  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
-  store float 0.0, float addrspace(1)* %a.idx.x, align 4
-  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4
-
-  %add = fadd float %ld.c, %ld.c.idx.1
-  store float %add, float addrspace(1)* %b, align 4
-  ret void
-}
-
-; CHECK-LABEL: @merge_op_zext_index(
-; CHECK: load <2 x float>
-; CHECK: store <2 x float>
-define amdgpu_kernel void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
-entry:
-  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %shl = shl i32 %id.x, 2
-  %zext.id.x = zext i32 %shl to i64
-  %a.0 = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x
-  %c.0 = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x
-
-  %id.x.1 = or i32 %shl, 1
-  %id.x.1.ext = zext i32 %id.x.1 to i64
-
-  %a.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 %id.x.1.ext
-  %c.1 = getelementptr inbounds float, float addrspace(1)* %c, i64 %id.x.1.ext
-
-  %ld.c.0 = load float, float addrspace(1)* %c.0, align 4
-  store float 0.0, float addrspace(1)* %a.0, align 4
-  %ld.c.1 = load float, float addrspace(1)* %c.1, align 4
-  store float 0.0, float addrspace(1)* %a.1, align 4
-
-  %add = fadd float %ld.c.0, %ld.c.1
-  store float %add, float addrspace(1)* %b, align 4
-  ret void
-}
-
-; CHECK-LABEL: @merge_op_sext_index(
-; CHECK: load <2 x float>
-; CHECK: store <2 x float>
-define amdgpu_kernel void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
-entry:
-  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %shl = shl i32 %id.x, 2
-  %zext.id.x = sext i32 %shl to i64
-  %a.0 = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x
-  %c.0 = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x
-
-  %id.x.1 = or i32 %shl, 1
-  %id.x.1.ext = sext i32 %id.x.1 to i64
-
-  %a.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 %id.x.1.ext
-  %c.1 = getelementptr inbounds float, float addrspace(1)* %c, i64 %id.x.1.ext
-
-  %ld.c.0 = load float, float addrspace(1)* %c.0, align 4
-  store float 0.0, float addrspace(1)* %a.0, align 4
-  %ld.c.1 = load float, float addrspace(1)* %c.1, align 4
-  store float 0.0, float addrspace(1)* %a.1, align 4
-
-  %add = fadd float %ld.c.0, %ld.c.1
-  store float %add, float addrspace(1)* %b, align 4
-  ret void
-}
-
-; This case fails to vectorize if not using the extra extension
-; handling in isConsecutiveAccess.
-
-; CHECK-LABEL: @zext_trunc_phi_1(
-; CHECK: loop:
-; CHECK: load <2 x i32>
-; CHECK: store <2 x i32>
-define amdgpu_kernel void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 {
-entry:
-  %cmp0 = icmp eq i32 %n, 0
-  br i1 %cmp0, label %exit, label %loop
-
-loop:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ]
-  %trunc.iv = trunc i64 %indvars.iv to i32
-  %idx = shl i32 %trunc.iv, 4
-
-  %idx.ext = zext i32 %idx to i64
-  %c.0 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %idx.ext
-  %a.0 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.ext
-
-  %idx.1 = or i32 %idx, 1
-  %idx.1.ext = zext i32 %idx.1 to i64
-  %c.1 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %idx.1.ext
-  %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.1.ext
-
-  %ld.c.0 = load i32, i32 addrspace(1)* %c.0, align 4
-  store i32 %ld.c.0, i32 addrspace(1)* %a.0, align 4
-  %ld.c.1 = load i32, i32 addrspace(1)* %c.1, align 4
-  store i32 %ld.c.1, i32 addrspace(1)* %a.1, align 4
-
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %exit, label %loop
-
-exit:
-  ret void
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
deleted file mode 100644
index 5bb6289ff19..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
+++ /dev/null
@@ -1,135 +0,0 @@
-; RUN: opt -S -mtriple=amdgcn--amdhsa -load-store-vectorizer < %s | FileCheck %s
-; RUN: opt -S -mtriple=amdgcn--amdhsa -passes='function(load-store-vectorizer)' < %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; Check that vectorizer can find a GEP through bitcast
-; CHECK-LABEL: @vect_zext_bitcast_f32_to_i32_idx
-; CHECK: load <4 x i32>
-define void @vect_zext_bitcast_f32_to_i32_idx(float addrspace(1)* %arg1, i32 %base) {
-  %add1 = add nuw i32 %base, 0
-  %zext1 = zext i32 %add1 to i64
-  %gep1 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext1
-  %f2i1 = bitcast float addrspace(1)* %gep1 to i32 addrspace(1)*
-  %load1 = load i32, i32 addrspace(1)* %f2i1, align 4
-  %add2 = add nuw i32 %base, 1
-  %zext2 = zext i32 %add2 to i64
-  %gep2 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext2
-  %f2i2 = bitcast float addrspace(1)* %gep2 to i32 addrspace(1)*
-  %load2 = load i32, i32 addrspace(1)* %f2i2, align 4
-  %add3 = add nuw i32 %base, 2
-  %zext3 = zext i32 %add3 to i64
-  %gep3 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext3
-  %f2i3 = bitcast float addrspace(1)* %gep3 to i32 addrspace(1)*
-  %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
-  %add4 = add nuw i32 %base, 3
-  %zext4 = zext i32 %add4 to i64
-  %gep4 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext4
-  %f2i4 = bitcast float addrspace(1)* %gep4 to i32 addrspace(1)*
-  %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
-  ret void
-}
-
-; CHECK-LABEL: @vect_zext_bitcast_i8_st1_to_i32_idx
-; CHECK: load i32
-; CHECK: load i32
-; CHECK: load i32
-; CHECK: load i32
-define void @vect_zext_bitcast_i8_st1_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) {
-  %add1 = add nuw i32 %base, 0
-  %zext1 = zext i32 %add1 to i64
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext1
-  %f2i1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %load1 = load i32, i32 addrspace(1)* %f2i1, align 4
-  %add2 = add nuw i32 %base, 1
-  %zext2 = zext i32 %add2 to i64
-  %gep2 = getelementptr inbounds i8,i8 addrspace(1)* %arg1, i64 %zext2
-  %f2i2 = bitcast i8 addrspace(1)* %gep2 to i32 addrspace(1)*
-  %load2 = load i32, i32 addrspace(1)* %f2i2, align 4
-  %add3 = add nuw i32 %base, 2
-  %zext3 = zext i32 %add3 to i64
-  %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3
-  %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)*
-  %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
-  %add4 = add nuw i32 %base, 3
-  %zext4 = zext i32 %add4 to i64
-  %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4
-  %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)*
-  %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
-  ret void
-}
-
-; CHECK-LABEL: @vect_zext_bitcast_i8_st4_to_i32_idx
-; CHECK: load <4 x i32>
-define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) {
-  %add1 = add nuw i32 %base, 0
-  %zext1 = zext i32 %add1 to i64
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext1
-  %f2i1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %load1 = load i32, i32 addrspace(1)* %f2i1, align 4
-  %add2 = add nuw i32 %base, 4
-  %zext2 = zext i32 %add2 to i64
-  %gep2 = getelementptr inbounds i8,i8 addrspace(1)* %arg1, i64 %zext2
-  %f2i2 = bitcast i8 addrspace(1)* %gep2 to i32 addrspace(1)*
-  %load2 = load i32, i32 addrspace(1)* %f2i2, align 4
-  %add3 = add nuw i32 %base, 8
-  %zext3 = zext i32 %add3 to i64
-  %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3
-  %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)*
-  %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
-  %add4 = add nuw i32 %base, 12
-  %zext4 = zext i32 %add4 to i64
-  %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4
-  %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)*
-  %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
-  ret void
-}
-
-; CHECK-LABEL: @vect_zext_bitcast_negative_ptr_delta
-; CHECK: load <2 x i32>
-define void @vect_zext_bitcast_negative_ptr_delta(i32 addrspace(1)* %p, i32 %base) {
-  %p.bitcasted = bitcast i32 addrspace(1)* %p to i16 addrspace(1)*
-  %a.offset = add nuw i32 %base, 4
-  %t.offset.zexted = zext i32 %base to i64
-  %a.offset.zexted = zext i32 %a.offset to i64
-  %t.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %t.offset.zexted
-  %a.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %a.offset.zexted
-  %b.ptr = getelementptr inbounds i16, i16 addrspace(1)* %t.ptr, i64 6
-  %a.ptr.bitcasted = bitcast i16 addrspace(1)* %a.ptr to i32 addrspace(1)*
-  %b.ptr.bitcasted = bitcast i16 addrspace(1)* %b.ptr to i32 addrspace(1)*
-  %a.val = load i32, i32 addrspace(1)* %a.ptr.bitcasted
-  %b.val = load i32, i32 addrspace(1)* %b.ptr.bitcasted
-  ret void
-}
-
-; Check i1 corner case
-; CHECK-LABEL: @zexted_i1_gep_index
-; CHECK: load i32
-; CHECK: load i32
-define void @zexted_i1_gep_index(i32 addrspace(1)* %p, i32 %val) {
-  %selector = icmp eq i32 %val, 0
-  %flipped = xor i1 %selector, 1
-  %index.0 = zext i1 %selector to i64
-  %index.1 = zext i1 %flipped to i64
-  %gep.0 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.0
-  %gep.1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.1
-  %val0 = load i32, i32 addrspace(1)* %gep.0
-  %val1 = load i32, i32 addrspace(1)* %gep.1
-  ret void
-}
-
-; Check i1 corner case
-; CHECK-LABEL: @sexted_i1_gep_index
-; CHECK: load i32
-; CHECK: load i32
-define void @sexted_i1_gep_index(i32 addrspace(1)* %p, i32 %val) {
-  %selector = icmp eq i32 %val, 0
-  %flipped = xor i1 %selector, 1
-  %index.0 = sext i1 %selector to i64
-  %index.1 = sext i1 %flipped to i64
-  %gep.0 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.0
-  %gep.1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.1
-  %val0 = load i32, i32 addrspace(1)* %gep.0
-  %val1 = load i32, i32 addrspace(1)* %gep.1
-  ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
deleted file mode 100644
index 35836f80456..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
+++ /dev/null
@@ -1,118 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; Check position of the inserted vector load/store. Vectorized loads should be
-; inserted at the position of the first load in the chain, and stores should be
-; inserted at the position of the last store.
-
-; CHECK-LABEL: @insert_load_point(
-; CHECK: %z = add i32 %x, 4
-; CHECK: load <2 x float>
-; CHECK: %w = add i32 %y, 9
-; CHECK: %foo = add i32 %z, %w
-define amdgpu_kernel void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
-entry:
-  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
-  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
-  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
-  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1
-
-  %z = add i32 %x, 4
-  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
-  %w = add i32 %y, 9
-  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
-  %foo = add i32 %z, %w
-
-  store float 0.0, float addrspace(1)* %a.idx.x, align 4
-  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4
-
-  %add = fadd float %ld.c, %ld.c.idx.1
-  store float %add, float addrspace(1)* %b, align 4
-  store i32 %foo, i32 addrspace(3)* null, align 4
-  ret void
-}
-
-; CHECK-LABEL: @insert_store_point(
-; CHECK: %z = add i32 %x, 4
-; CHECK: %w = add i32 %y, 9
-; CHECK: store <2 x float>
-; CHECK: %foo = add i32 %z, %w
-define amdgpu_kernel void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
-entry:
-  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
-  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
-  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
-  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1
-
-  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
-  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
-
-  %z = add i32 %x, 4
-  store float 0.0, float addrspace(1)* %a.idx.x, align 4
-  %w = add i32 %y, 9
-  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4
-  %foo = add i32 %z, %w
-
-  %add = fadd float %ld.c, %ld.c.idx.1
-  store float %add, float addrspace(1)* %b, align 4
-  store i32 %foo, i32 addrspace(3)* null, align 4
-  ret void
-}
-
-; Here we have four stores, with an aliasing load before the last one. We can
-; vectorize the first three stores as <3 x float>, but this vectorized store must
-; be inserted at the location of the third scalar store, not the fourth one.
-;
-; CHECK-LABEL: @insert_store_point_alias
-; CHECK: store <3 x float>
-; CHECK: load float, float addrspace(1)* %a.idx.2
-; CHECK: store float
-; CHECK-SAME: %a.idx.3
-define float @insert_store_point_alias(float addrspace(1)* nocapture %a, i64 %idx) {
-  %a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
-  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1
-  %a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1
-  %a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1
-
-  store float 0.0, float addrspace(1)* %a.idx, align 4
-  store float 0.0, float addrspace(1)* %a.idx.1, align 4
-  store float 0.0, float addrspace(1)* %a.idx.2, align 4
-  %x = load float, float addrspace(1)* %a.idx.2, align 4
-  store float 0.0, float addrspace(1)* %a.idx.3, align 4
-
-  ret float %x
-}
-
-; Here we have four stores, with an aliasing load before the last one. We
-; could vectorize two of the stores before the load (although we currently
-; don't), but the important thing is that we *don't* sink the store to
-; a[idx + 1] below the load.
-;
-; CHECK-LABEL: @insert_store_point_alias_ooo
-; CHECK: store float
-; CHECK-SAME: %a.idx.3
-; CHECK: store float
-; CHECK-SAME: %a.idx.1
-; CHECK: store float
-; CHECK-SAME: %a.idx.2
-; CHECK: load float, float addrspace(1)* %a.idx.2
-; CHECK: store float
-; CHECK-SAME: %a.idx
-define float @insert_store_point_alias_ooo(float addrspace(1)* nocapture %a, i64 %idx) {
-  %a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
-  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1
-  %a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1
-  %a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1
-
-  store float 0.0, float addrspace(1)* %a.idx.3, align 4
-  store float 0.0, float addrspace(1)* %a.idx.1, align 4
-  store float 0.0, float addrspace(1)* %a.idx.2, align 4
-  %x = load float, float addrspace(1)* %a.idx.2, align 4
-  store float 0.0, float addrspace(1)* %a.idx, align 4
-
-  ret float %x
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
deleted file mode 100644
index 81ebb712e33..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
-
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" - -; This is NOT OK to vectorize, as either load may alias either store. - -; CHECK: load double -; CHECK: store double 0.000000e+00, double addrspace(1)* %a, -; CHECK: load double -; CHECK: store double 0.000000e+00, double addrspace(1)* %a.idx.1 -define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 { -entry: - %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 - %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1 - - %ld.c = load double, double addrspace(1)* %c, align 8 ; may alias store to %a - store double 0.0, double addrspace(1)* %a, align 8 - - %ld.c.idx.1 = load double, double addrspace(1)* %c.idx.1, align 8 ; may alias store to %a - store double 0.0, double addrspace(1)* %a.idx.1, align 8 - - %add = fadd double %ld.c, %ld.c.idx.1 - store double %add, double addrspace(1)* %b - - ret void -} - -attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll deleted file mode 100644 index 15c47716aaf..00000000000 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s - -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" - -; CHECK-LABEL: @interleave -; CHECK: load <2 x double>, <2 x double> addrspace(1)* %{{.}}, align 8{{$}} -; CHECK: store <2 x double> zeroinitializer -; CHECK: store double %add -define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 { -entry: - %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 - %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1 - - %ld.c = load double, double addrspace(1)* %c, align 8 - store double 0.0, double addrspace(1)* %a, align 8 ; Cannot alias invariant load - - %ld.c.idx.1 = load double, double addrspace(1)* %c.idx.1, align 8, !invariant.load !0 - store double 0.0, double addrspace(1)* %a.idx.1, align 8 - - %add = fadd double %ld.c, %ld.c.idx.1 - store double %add, double addrspace(1)* %b - - ret void -} - -attributes #0 = { nounwind } - -!0 = !{} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg deleted file mode 100644 index 6baccf05fff..00000000000 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -if not 'AMDGPU' in config.root.targets: - config.unsupported = True - diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll deleted file mode 100644 index 4292cbcec85..00000000000 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll +++ /dev/null @@ -1,223 +0,0 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa 
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-ALIGNED,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-ALIGNED,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-ALIGNED,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-UNALIGNED,UNALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32
-; ELT4-ALIGNED: store i32
-; ELT4-ALIGNED: store i32
-; ELT4-ALIGNED: store i32
-; ELT4-ALIGNED: store i32
-
-; ELT8: store <2 x i32>
-; ELT8: store <2 x i32>
-
-; ELT16-UNALIGNED: store <4 x i32>
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
-
-  store i32 9, i32 addrspace(5)* %out
-  store i32 1, i32 addrspace(5)* %out.gep.1
-  store i32 23, i32 addrspace(5)* %out.gep.2
-  store i32 19, i32 addrspace(5)* %out.gep.3
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align1(
-; ALIGNED: store i32 9, i32 addrspace(5)* %out, align 1
-; ALIGNED: store i32 1, i32 addrspace(5)* %out.gep.1, align 1
-; ALIGNED: store i32 23, i32 addrspace(5)* %out.gep.2, align 1
-; ALIGNED: store i32 19, i32 addrspace(5)* %out.gep.3, align 1
-
-; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1
-
-; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32> addrspace(5)* %1, align 1
-; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32> addrspace(5)* %2, align 1
-
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
-
-  store i32 9, i32 addrspace(5)* %out, align 1
-  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
-  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
-  store i32 19, i32 addrspace(5)* %out.gep.3, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2(
-; ALIGNED: store i32 9, i32 addrspace(5)* %out, align 2
-; ALIGNED: store i32 1, i32 addrspace(5)* %out.gep.1, align 2
-; ALIGNED: store i32 23, i32 addrspace(5)* %out.gep.2, align 2
-; ALIGNED: store i32 19, i32 addrspace(5)* %out.gep.3, align 2
-
-; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 2
-
-; ELT8-UNALIGNED: store <2 x i32>
-; ELT8-UNALIGNED: store <2 x i32>
-
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
-
-  store i32 9, i32 addrspace(5)* %out, align 2
-  store i32 1, i32 addrspace(5)* %out.gep.1, align 2
-  store i32 23, i32 addrspace(5)* %out.gep.2, align 2
-  store i32 19, i32 addrspace(5)* %out.gep.3, align 2
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
-; ALL: store <4 x i8>
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3
-
-  store i8 9, i8 addrspace(5)* %out, align 4
-  store i8 1, i8 addrspace(5)* %out.gep.1
-  store i8 23, i8 addrspace(5)* %out.gep.2
-  store i8 19, i8 addrspace(5)* %out.gep.3
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8_align1(
-; ALIGNED: store i8
-; ALIGNED: store i8
-; ALIGNED: store i8
-; ALIGNED: store i8
-
-; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3
-
-  store i8 9, i8 addrspace(5)* %out, align 1
-  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
-  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
-  store i8 19, i8 addrspace(5)* %out.gep.3, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16(
-; ALL: store <2 x i16>
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
-
-  store i16 9, i16 addrspace(5)* %out, align 4
-  store i16 12, i16 addrspace(5)* %out.gep.1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2(
-; ALIGNED: store i16
-; ALIGNED: store i16
-
-; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 2
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
-
-  store i16 9, i16 addrspace(5)* %out, align 2
-  store i16 12, i16 addrspace(5)* %out.gep.1, align 2
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align1(
-; ALIGNED: store i16
-; ALIGNED: store i16
-
-; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
-
-  store i16 9, i16 addrspace(5)* %out, align 1
-  store i16 12, i16 addrspace(5)* %out.gep.1, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8(
-; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 8
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
-
-  store i16 9, i16 addrspace(5)* %out, align 8
-  store i16 12, i16 addrspace(5)* %out.gep.1, align 2
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32
-; ELT4: store i32
-; ELT4: store i32
-; ELT4: store i32
-
-; ELT8: store <2 x i32>
-; ELT8: store i32
-
-; ELT16: store <3 x i32>
-define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-
-  store i32 9, i32 addrspace(5)* %out
-  store i32 1, i32 addrspace(5)* %out.gep.1
-  store i32 23, i32 addrspace(5)* %out.gep.2
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32_align1(
-; ALIGNED: store i32
-; ALIGNED: store i32
-; ALIGNED: store i32
-
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-; ELT4-UNALIGNED: store i32
-
-; ELT8-UNALIGNED: store <2 x i32>
-; ELT8-UNALIGNED: store i32
-
-; ELT16-UNALIGNED: store <3 x i32>
-define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-
-  store i32 9, i32 addrspace(5)* %out, align 1
-  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
-  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
-  ret void
-}
-
-; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i8_align1(
-; ALIGNED: store i8
-; ALIGNED: store i8
-; ALIGNED: store i8
-
-; UNALIGNED: store <3 x i8>
-define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
-
-  store i8 9, i8 addrspace(5)* %out, align 1
-  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
-  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
-  ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
deleted file mode 100644
index 0d9a4184e71..00000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
+++ /dev/null
@@ -1,657 +0,0 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
-; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-
-; TODO: Vector element tests
-; TODO: Non-zero base offset for load and store combinations
-; TODO: Same base addrspacecasted
-
-
-; CHECK-LABEL: @merge_global_store_2_constants_i8(
-; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* %{{[0-9]+}}, align 2
-define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-
-  store i8 123, i8 addrspace(1)* %out.gep.1
-  store i8 456, i8 addrspace(1)* %out, align 2
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align
-; CHECK: store <2 x i8>
-define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-
-  store i8 123, i8 addrspace(1)* %out.gep.1
-  store i8 456, i8 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_i16
-; CHECK: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
-
-  store i16 123, i16 addrspace(1)* %out.gep.1
-  store i16 456, i16 addrspace(1)* %out, align 4
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_0_i16
-; CHECK: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
-
-  store i16 0, i16 addrspace(1)* %out.gep.1
-  store i16 0, i16 addrspace(1)* %out, align 4
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align
-; CHECK: store <2 x i16>
-define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
-
-  store i16 123, i16 addrspace(1)* %out.gep.1
-  store i16 456, i16 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align
-; CHECK: store <2 x half>
-define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
-
-  store half 2.0, half addrspace(1)* %out.gep.1
-  store half 1.0, half addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_i32
-; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_i32_f32
-; CHECK: store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
-  store float 1.0, float addrspace(1)* %out.gep.1.bc
-  store i32 456, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_f32_i32
-; CHECK store <2 x float> <float 4.000000e+00, float 0x370EC00000000000>, <2 x float> addrspace(1)* %{{[0-9]+$}}
-define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
-  store i32 123, i32 addrspace(1)* %out.gep.1.bc
-  store float 4.0, float addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_constants_i32
-; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out.gep.2
-  store i32 333, i32 addrspace(1)* %out.gep.3
-  store i32 1234, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_constants_f32_order
-; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}
-define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
-
-  store float 8.0, float addrspace(1)* %out
-  store float 1.0, float addrspace(1)* %out.gep.1
-  store float 2.0, float addrspace(1)* %out.gep.2
-  store float 4.0, float addrspace(1)* %out.gep.3
-  ret void
-}
-
-; First store is out of order.
-; CHECK-LABEL: @merge_global_store_4_constants_f32
-; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
-
-  store float 1.0, float addrspace(1)* %out.gep.1
-  store float 2.0, float addrspace(1)* %out.gep.2
-  store float 4.0, float addrspace(1)* %out.gep.3
-  store float 8.0, float addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32
-; CHECK: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
-
-  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
-  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
-
-  store i32 11, i32 addrspace(1)* %out.gep.1.bc
-  store float 2.0, float addrspace(1)* %out.gep.2
-  store i32 17, i32 addrspace(1)* %out.gep.3.bc
-  store float 8.0, float addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_3_constants_i32
-; CHECK: store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out.gep.2
-  store i32 1234, i32 addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: @merge_global_store_2_constants_i64
-; CHECK: store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
-define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 - - store i64 123, i64 addrspace(1)* %out.gep.1 - store i64 456, i64 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: @merge_global_store_4_constants_i64 -; CHECK: store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 -; CHECK: store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 -define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 - %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 - %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 - - store i64 123, i64 addrspace(1)* %out.gep.1 - store i64 456, i64 addrspace(1)* %out.gep.2 - store i64 333, i64 addrspace(1)* %out.gep.3 - store i64 1234, i64 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32 -; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32> -; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0 -; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1 -; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT0]], i32 0 -; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT1]], i32 1 -; CHECK: store <2 x i32> [[INSERT1]] -define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - - %lo = load i32, i32 addrspace(1)* %in - %hi = load i32, i32 addrspace(1)* %in.gep.1 - - store i32 %lo, i32 addrspace(1)* %out - store i32 %hi, i32 addrspace(1)* %out.gep.1 - ret void -} - -; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32_nonzero_base -; CHECK: extractelement -; CHECK: extractelement -; CHECK: insertelement -; CHECK: insertelement -; CHECK: store <2 x i32> -define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %lo = load i32, i32 addrspace(1)* %in.gep.0 - %hi = load i32, i32 addrspace(1)* %in.gep.1 - - store i32 %lo, i32 addrspace(1)* %out.gep.0 - store i32 %hi, i32 addrspace(1)* %out.gep.1 - ret void -} - -; CHECK-LABEL: @merge_global_store_2_adjacent_loads_shuffle_i32 -; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32> -; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0 -; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1 -; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT1]], i32 0 -; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT0]], i32 1 -; CHECK: store <2 x i32> [[INSERT1]] -define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - - %lo = load i32, i32 addrspace(1)* %in - %hi = load i32, i32 addrspace(1)* %in.gep.1 - - store i32 %hi, i32 addrspace(1)* %out - store i32 %lo, i32 addrspace(1)* %out.gep.1 - ret void -} - -; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32 -; CHECK: load <4 x i32> -; CHECK: store <4 x 
i32> -define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - store i32 %x, i32 addrspace(1)* %out - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %w, i32 addrspace(1)* %out.gep.3 - ret void -} - -; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32 -; CHECK: load <3 x i32> -; CHECK: store <3 x i32> -define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - - store i32 %x, i32 addrspace(1)* %out - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - ret void -} - -; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32 -; CHECK: load <4 x float> -; CHECK: store <4 x float> -define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3 - - %x = load float, float addrspace(1)* %in - %y = load float, float addrspace(1)* %in.gep.1 - %z = load float, float addrspace(1)* %in.gep.2 - %w = load float, float addrspace(1)* %in.gep.3 - - store float %x, float addrspace(1)* %out - store float %y, float addrspace(1)* %out.gep.1 - store float %z, float addrspace(1)* %out.gep.2 - store float %w, float addrspace(1)* %out.gep.3 - ret void -} - -; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base -; CHECK: load <4 x i32> -; CHECK: store <4 x i32> -define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14 - %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10 - - %x = load i32, i32 addrspace(1)* %in.gep.0 - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 
addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - store i32 %x, i32 addrspace(1)* %out.gep.0 - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %w, i32 addrspace(1)* %out.gep.3 - ret void -} - -; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32 -; CHECK: load <4 x i32> -; CHECK: store <4 x i32> -define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - ; Make sure the barrier doesn't stop this - tail call void @llvm.amdgcn.s.barrier() #1 - - store i32 %w, i32 addrspace(1)* %out.gep.3 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %x, i32 addrspace(1)* %out - - ret void -} - -; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32 -; CHECK: load <4 x i32> -; CHECK: store <4 x i32> -define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - ; Make sure the barrier doesn't stop this - tail call void @llvm.amdgcn.s.barrier() #1 - - store i32 %w, i32 addrspace(1)* %out - store i32 %z, i32 addrspace(1)* %out.gep.1 - store i32 %y, i32 addrspace(1)* %out.gep.2 - store i32 %x, i32 addrspace(1)* %out.gep.3 - - ret void -} - -; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8 -; CHECK: load <4 x i8> -; CHECK: extractelement <4 x i8> -; CHECK: extractelement <4 x i8> -; CHECK: extractelement <4 x i8> -; CHECK: extractelement <4 x i8> -; CHECK: insertelement <4 x i8> -; CHECK: insertelement <4 x i8> -; CHECK: insertelement <4 x i8> -; CHECK: insertelement <4 x i8> -; CHECK: store <4 x i8> -define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 - %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 - %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 - %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 - %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 - - %x = load i8, i8 addrspace(1)* %in, align 4 - %y = load i8, i8 addrspace(1)* %in.gep.1 - %z = load i8, i8 addrspace(1)* %in.gep.2 - %w = load i8, i8 addrspace(1)* %in.gep.3 - - store i8 %x, i8 addrspace(1)* %out, align 4 - store i8 %y, i8 addrspace(1)* %out.gep.1 - 
store i8 %z, i8 addrspace(1)* %out.gep.2 - store i8 %w, i8 addrspace(1)* %out.gep.3 - ret void -} - -; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align -; CHECK: load <4 x i8> -; CHECK: store <4 x i8> -define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 - %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 - %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 - %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 - %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 - - %x = load i8, i8 addrspace(1)* %in - %y = load i8, i8 addrspace(1)* %in.gep.1 - %z = load i8, i8 addrspace(1)* %in.gep.2 - %w = load i8, i8 addrspace(1)* %in.gep.3 - - store i8 %x, i8 addrspace(1)* %out - store i8 %y, i8 addrspace(1)* %out.gep.1 - store i8 %z, i8 addrspace(1)* %out.gep.2 - store i8 %w, i8 addrspace(1)* %out.gep.3 - ret void -} - -; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32 -; CHECK: load <4 x i32> -; CHECK: store <4 x i32> -define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in - - %x = extractelement <4 x i32> %vec, i32 0 - %y = extractelement <4 x i32> %vec, i32 1 - %z = extractelement <4 x i32> %vec, i32 2 - %w = extractelement <4 x i32> %vec, i32 3 - - store i32 %x, i32 addrspace(1)* %out - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %w, i32 addrspace(1)* %out.gep.3 - ret void -} - -; CHECK-LABEL: @merge_local_store_2_constants_i8 -; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* %{{[0-9]+}}, align 2 -define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 - - store i8 123, i8 addrspace(3)* %out.gep.1 - store i8 456, i8 addrspace(3)* %out, align 2 - ret void -} - -; CHECK-LABEL: @merge_local_store_2_constants_i32 -; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4 -define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 - - store i32 123, i32 addrspace(3)* %out.gep.1 - store i32 456, i32 addrspace(3)* %out - ret void -} - -; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2 -; CHECK: store i32 -; CHECK: store i32 -define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 - - store i32 123, i32 addrspace(3)* %out.gep.1, align 2 - store i32 456, i32 addrspace(3)* %out, align 2 - ret void -} - -; CHECK-LABEL: @merge_local_store_4_constants_i32 -; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(3)* -define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 - - store i32 123, i32 
addrspace(3)* %out.gep.1 - store i32 456, i32 addrspace(3)* %out.gep.2 - store i32 333, i32 addrspace(3)* %out.gep.3 - store i32 1234, i32 addrspace(3)* %out - ret void -} - -; CHECK-LABEL: @merge_global_store_5_constants_i32 -; CHECK: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -; CHECK: store i32 -define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { - store i32 9, i32 addrspace(1)* %out, align 4 - %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 - store i32 12, i32 addrspace(1)* %idx1, align 4 - %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 - store i32 16, i32 addrspace(1)* %idx2, align 4 - %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 - store i32 -12, i32 addrspace(1)* %idx3, align 4 - %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4 - store i32 11, i32 addrspace(1)* %idx4, align 4 - ret void -} - -; CHECK-LABEL: @merge_global_store_6_constants_i32 -; CHECK: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -; CHECK: store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { - store i32 13, i32 addrspace(1)* %out, align 4 - %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 - store i32 15, i32 addrspace(1)* %idx1, align 4 - %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 - store i32 62, i32 addrspace(1)* %idx2, align 4 - %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 - store i32 63, i32 addrspace(1)* %idx3, align 4 - %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4 - store i32 11, i32 addrspace(1)* %idx4, align 4 - %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5 - store i32 123, i32 addrspace(1)* %idx5, align 4 - ret void -} - -; CHECK-LABEL: @merge_global_store_7_constants_i32 -; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -; CHECK: store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { - store i32 34, i32 addrspace(1)* %out, align 4 - %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 - store i32 999, i32 addrspace(1)* %idx1, align 4 - %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 - store i32 65, i32 addrspace(1)* %idx2, align 4 - %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 - store i32 33, i32 addrspace(1)* %idx3, align 4 - %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4 - store i32 98, i32 addrspace(1)* %idx4, align 4 - %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5 - store i32 91, i32 addrspace(1)* %idx5, align 4 - %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6 - store i32 212, i32 addrspace(1)* %idx6, align 4 - ret void -} - -; CHECK-LABEL: @merge_global_store_8_constants_i32 -; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -; CHECK: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { - store i32 34, i32 addrspace(1)* %out, align 4 - %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 - store i32 
999, i32 addrspace(1)* %idx1, align 4 - %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 - store i32 65, i32 addrspace(1)* %idx2, align 4 - %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 - store i32 33, i32 addrspace(1)* %idx3, align 4 - %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4 - store i32 98, i32 addrspace(1)* %idx4, align 4 - %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5 - store i32 91, i32 addrspace(1)* %idx5, align 4 - %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6 - store i32 212, i32 addrspace(1)* %idx6, align 4 - %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7 - store i32 999, i32 addrspace(1)* %idx7, align 4 - ret void -} - -; CHECK-LABEL: @copy_v3i32_align4 -; CHECK: %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 -; CHECK: store <3 x i32> %vec, <3 x i32> addrspace(1)* %out -define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { - %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 - store <3 x i32> %vec, <3 x i32> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: @copy_v3i64_align4 -; CHECK: %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 -; CHECK: store <3 x i64> %vec, <3 x i64> addrspace(1)* %out -define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { - %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 - store <3 x i64> %vec, <3 x i64> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: @copy_v3f32_align4 -; CHECK: %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 -; CHECK: store <3 x float> -define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { - %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 - %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0> - store <3 x float> %fadd, <3 x float> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: @copy_v3f64_align4 -; CHECK: %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4 -; CHECK: store <3 x double> %fadd, <3 x double> addrspace(1)* %out -define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { - %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4 - %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0> - store <3 x double> %fadd, <3 x double> addrspace(1)* %out - ret void -} - -; Verify that we no longer hit asserts for this test case. No change expected. 
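-; (Both <2 x i16*> accesses below are already 128 bits wide, likely as wide as a single global access gets here, so no merging is expected; the test only guards against asserts on vector-of-pointer element types.)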
-; CHECK-LABEL: @copy_vec_of_ptrs -; CHECK: %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1 -; CHECK: %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1 -; CHECK: %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4 -; CHECK: %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1 -; CHECK: store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1 -; CHECK: store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4 -define amdgpu_kernel void @copy_vec_of_ptrs(<2 x i16*> addrspace(1)* %out, - <2 x i16*> addrspace(1)* %in ) #0 { - %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1 - %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1 - %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4 - - %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1 - store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1 - store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4 - ret void -} - -declare void @llvm.amdgcn.s.barrier() #1 - -attributes #0 = { nounwind } -attributes #1 = { convergent nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll deleted file mode 100644 index bcf2265f310..00000000000 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll +++ /dev/null @@ -1,91 +0,0 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s - -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" - -; CHECK-LABEL: @merge_v2i32_v2i32( -; CHECK: load <4 x i32> -; CHECK: store <4 x i32> zeroinitializer -define amdgpu_kernel void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 { -entry: - %a.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 1 - %b.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 1 - - %ld.c = load <2 x i32>, <2 x i32> addrspace(1)* %b, align 4 - %ld.c.idx.1 = load <2 x i32>, <2 x i32> addrspace(1)* %b.1, align 4 - - store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %a, align 4 - store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %a.1, align 4 - - ret void -} - -; CHECK-LABEL: @merge_v1i32_v1i32( -; CHECK: load <2 x i32> -; CHECK: store <2 x i32> zeroinitializer -define amdgpu_kernel void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 { -entry: - %a.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %a, i64 1 - %b.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %b, i64 1 - - %ld.c = load <1 x i32>, <1 x i32> addrspace(1)* %b, align 4 - %ld.c.idx.1 = load <1 x i32>, <1 x i32> addrspace(1)* %b.1, align 4 - - store <1 x i32> zeroinitializer, <1 x i32> addrspace(1)* %a, align 4 - store <1 x i32> zeroinitializer, <1 x i32> addrspace(1)* %a.1, align 4 - - ret void -} - -; CHECK-LABEL: @no_merge_v3i32_v3i32( -; CHECK: load <3 x i32> -; CHECK: load <3 x i32> -; CHECK: store <3 x i32> zeroinitializer -; CHECK: store <3 x i32> zeroinitializer -define amdgpu_kernel void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 { -entry: - %a.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a, i64 1 - %b.1 = 
getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b, i64 1 - - %ld.c = load <3 x i32>, <3 x i32> addrspace(1)* %b, align 4 - %ld.c.idx.1 = load <3 x i32>, <3 x i32> addrspace(1)* %b.1, align 4 - - store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %a, align 4 - store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %a.1, align 4 - - ret void -} - -; CHECK-LABEL: @merge_v2i16_v2i16( -; CHECK: load <4 x i16> -; CHECK: store <4 x i16> zeroinitializer -define amdgpu_kernel void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 { -entry: - %a.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a, i64 1 - %b.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b, i64 1 - - %ld.c = load <2 x i16>, <2 x i16> addrspace(1)* %b, align 4 - %ld.c.idx.1 = load <2 x i16>, <2 x i16> addrspace(1)* %b.1, align 4 - - store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %a, align 4 - store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %a.1, align 4 - - ret void -} - -; Ideally this would be merged -; CHECK-LABEL: @merge_load_i32_v2i16( -; CHECK: load i32, -; CHECK: load <2 x i16> -define amdgpu_kernel void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 { -entry: - %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 1 - %a.1.cast = bitcast i32 addrspace(1)* %a.1 to <2 x i16> addrspace(1)* - - %ld.0 = load i32, i32 addrspace(1)* %a - %ld.1 = load <2 x i16>, <2 x i16> addrspace(1)* %a.1.cast - - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll deleted file mode 100644 index ff718c1b101..00000000000 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: opt -mtriple=amdgcn-- -load-store-vectorizer -S -o - %s | FileCheck %s - -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" - -@lds = internal addrspace(3) global [512 x float] undef, align 4 - -; The original load has an implicit alignment of 4, and should not -; increase to an align 8 load. 
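-; (Element 11 of @lds sits at byte offset 44; since @lds itself is only align 4, the best provable alignment for the merged <2 x float> access is 4, not 8.)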
- -; CHECK-LABEL: @load_keep_base_alignment_missing_align( -; CHECK: load <2 x float>, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4 -define amdgpu_kernel void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) { - %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11 - %val0 = load float, float addrspace(3)* %ptr0 - - %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12 - %val1 = load float, float addrspace(3)* %ptr1 - %add = fadd float %val0, %val1 - store float %add, float addrspace(1)* %out - ret void -} - - -; CHECK-LABEL: @store_keep_base_alignment_missing_align( -; CHECK: store <2 x float> zeroinitializer, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4 -define amdgpu_kernel void @store_keep_base_alignment_missing_align() { - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2 - store float 0.0, float addrspace(3)* %arrayidx0 - store float 0.0, float addrspace(3)* %arrayidx1 - ret void -} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll deleted file mode 100644 index ffd651b2c65..00000000000 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s - -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" - -; Checks that there is no crash when there are multiple tails -; for the same head starting a chain.
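-; (A "head" is the instruction that starts a candidate chain; the repeated stores to the same address below give one head several possible tails, which the chain search must handle without crashing.)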
-@0 = internal addrspace(3) global [16384 x i32] undef - -; CHECK-LABEL: @no_crash( -; CHECK: store <2 x i32> zeroinitializer -; CHECK: store i32 0 -; CHECK: store i32 0 - -define amdgpu_kernel void @no_crash(i32 %arg) { - %tmp2 = add i32 %arg, 14 - %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2 - %tmp4 = add i32 %arg, 15 - %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp4 - - store i32 0, i32 addrspace(3)* %tmp3, align 4 - store i32 0, i32 addrspace(3)* %tmp5, align 4 - store i32 0, i32 addrspace(3)* %tmp5, align 4 - store i32 0, i32 addrspace(3)* %tmp5, align 4 - - ret void -} - -; Check that adjacent memory locations are properly matched and the -; longest chain is vectorized - -; CHECK-LABEL: @interleave_get_longest -; CHECK: load <4 x i32> -; CHECK: load i32 -; CHECK: store <2 x i32> zeroinitializer -; CHECK: load i32 -; CHECK: load i32 -; CHECK: load i32 - -define amdgpu_kernel void @interleave_get_longest(i32 %arg) { - %a1 = add i32 %arg, 1 - %a2 = add i32 %arg, 2 - %a3 = add i32 %arg, 3 - %a4 = add i32 %arg, 4 - %tmp1 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %arg - %tmp2 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a1 - %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a2 - %tmp4 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a3 - %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a4 - - %l1 = load i32, i32 addrspace(3)* %tmp2, align 4 - %l2 = load i32, i32 addrspace(3)* %tmp1, align 4 - store i32 0, i32 addrspace(3)* %tmp2, align 4 - store i32 0, i32 addrspace(3)* %tmp1, align 4 - %l3 = load i32, i32 addrspace(3)* %tmp2, align 4 - %l4 = load i32, i32 addrspace(3)* %tmp3, align 4 - %l5 = load i32, i32 addrspace(3)* %tmp4, align 4 - %l6 = load i32, i32 addrspace(3)* %tmp5, align 4 - %l7 = load i32, i32 addrspace(3)* %tmp5, align 4 - %l8 = load i32, i32 addrspace(3)* %tmp5, align 4 - - ret void -} - diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll deleted file mode 100644 index 86f6b6d55ec..00000000000 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s - -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" - -; CHECK-LABEL: @no_implicit_float( -; CHECK: store i32 -; CHECK: store i32 -; CHECK: store i32 -; CHECK: store i32 -define amdgpu_kernel void @no_implicit_float(i32 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - - store i32 123, i32 addrspace(1)* %out.gep.1 - store i32 456, i32 addrspace(1)* %out.gep.2 - store i32 333, i32 addrspace(1)* %out.gep.3 - store i32 1234, i32 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind noimplicitfloat } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll deleted file mode 100644 index 8a2abe50a5a..00000000000 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll +++ /dev/null @@
-1,24 +0,0 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s - -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" - -; CHECK-LABEL: @optnone( -; CHECK: store i32 -; CHECK: store i32 -define amdgpu_kernel void @optnone(i32 addrspace(1)* %out) noinline optnone { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - - store i32 123, i32 addrspace(1)* %out.gep.1 - store i32 456, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: @do_opt( -; CHECK: store <2 x i32> -define amdgpu_kernel void @do_opt(i32 addrspace(1)* %out) { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - - store i32 123, i32 addrspace(1)* %out.gep.1 - store i32 456, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll deleted file mode 100644 index 9290749bb89..00000000000 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll +++ /dev/null @@ -1,311 +0,0 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s - -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" - -declare i32 @llvm.amdgcn.workitem.id.x() #1 - -; CHECK-LABEL: @merge_v2p1i8( -; CHECK: load <2 x i64> -; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)* -; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)* -; CHECK: store <2 x i64> zeroinitializer -define amdgpu_kernel void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 { -entry: - %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 - %b.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, i64 1 - - %ld.c = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, align 4 - %ld.c.idx.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b.1, align 4 - - store i8 addrspace(1)* null, i8 addrspace(1)* addrspace(1)* %a, align 4 - store i8 addrspace(1)* null, i8 addrspace(1)* addrspace(1)* %a.1, align 4 - - ret void -} - -; CHECK-LABEL: @merge_v2p3i8( -; CHECK: load <2 x i32> -; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* -; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* -; CHECK: store <2 x i32> zeroinitializer -define amdgpu_kernel void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 { -entry: - %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i64 1 - %b.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, i64 1 - - %ld.c = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, align 4 - %ld.c.idx.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b.1, align 4 - - store i8 addrspace(3)* null, i8 addrspace(3)* addrspace(3)* %a, align 4 - store i8 addrspace(3)* null, i8 addrspace(3)* addrspace(3)* %a.1, align 4 - - ret void -} - -; CHECK-LABEL: @merge_load_i64_ptr64( -; CHECK: load <2 x i64> -; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 -; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)* -define amdgpu_kernel void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture 
%a) #0 { -entry: - %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 - %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)* - - %ld.0 = load i64, i64 addrspace(1)* %a - %ld.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.1.cast - - ret void -} - -; CHECK-LABEL: @merge_load_ptr64_i64( -; CHECK: load <2 x i64> -; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0 -; CHECK: inttoptr i64 [[ELT0]] to i8 addrspace(1)* -define amdgpu_kernel void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 { -entry: - %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* - %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 - - %ld.0 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.cast - %ld.1 = load i64, i64 addrspace(1)* %a.1 - - ret void -} - -; CHECK-LABEL: @merge_store_ptr64_i64( -; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64 -; CHECK: insertelement <2 x i64> undef, i64 [[ELT0]], i32 0 -; CHECK: store <2 x i64> -define amdgpu_kernel void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 { -entry: - %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* - %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 - - - store i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* %a.cast - store i64 %val1, i64 addrspace(1)* %a.1 - - ret void -} - -; CHECK-LABEL: @merge_store_i64_ptr64( -; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64 -; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1]], i32 1 -; CHECK: store <2 x i64> -define amdgpu_kernel void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 { -entry: - %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 - %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to i64 addrspace(1)* - - store i64 %val0, i64 addrspace(1)* %a.cast - store i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* %a.1 - - ret void -} - -; CHECK-LABEL: @merge_load_i32_ptr32( -; CHECK: load <2 x i32> -; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1 -; CHECK: inttoptr i32 [[ELT1]] to i8 addrspace(3)* -define amdgpu_kernel void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 { -entry: - %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 - %a.1.cast = bitcast i32 addrspace(3)* %a.1 to i8 addrspace(3)* addrspace(3)* - - %ld.0 = load i32, i32 addrspace(3)* %a - %ld.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a.1.cast - - ret void -} - -; CHECK-LABEL: @merge_load_ptr32_i32( -; CHECK: load <2 x i32> -; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0 -; CHECK: inttoptr i32 [[ELT0]] to i8 addrspace(3)* -define amdgpu_kernel void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 { -entry: - %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)* - %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 - - %ld.0 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a.cast - %ld.1 = load i32, i32 addrspace(3)* %a.1 - - ret void -} - -; CHECK-LABEL: @merge_store_ptr32_i32( -; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr0 to i32 -; CHECK: insertelement <2 x i32> undef, i32 [[ELT0]], i32 0 -; CHECK: store <2 x i32> -define amdgpu_kernel void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 { -entry: - %a.cast = bitcast 
i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)* - %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 - - store i8 addrspace(3)* %ptr0, i8 addrspace(3)* addrspace(3)* %a.cast - store i32 %val1, i32 addrspace(3)* %a.1 - - ret void -} - -; CHECK-LABEL: @merge_store_i32_ptr32( -; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr1 to i32 -; CHECK: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1 -; CHECK: store <2 x i32> -define amdgpu_kernel void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 { -entry: - %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i32 1 - %a.cast = bitcast i8 addrspace(3)* addrspace(3)* %a to i32 addrspace(3)* - - store i32 %val0, i32 addrspace(3)* %a.cast - store i8 addrspace(3)* %ptr1, i8 addrspace(3)* addrspace(3)* %a.1 - - ret void -} - -; CHECK-LABEL: @no_merge_store_ptr32_i64( -; CHECK: store i8 addrspace(3)* -; CHECK: store i64 -define amdgpu_kernel void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 { -entry: - %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)* - %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 - - - store i8 addrspace(3)* %ptr0, i8 addrspace(3)* addrspace(1)* %a.cast - store i64 %val1, i64 addrspace(1)* %a.1 - - ret void -} - -; CHECK-LABEL: @no_merge_store_i64_ptr32( -; CHECK: store i64 -; CHECK: store i8 addrspace(3)* -define amdgpu_kernel void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 { -entry: - %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a, i64 1 - %a.cast = bitcast i8 addrspace(3)* addrspace(1)* %a to i64 addrspace(1)* - - store i64 %val0, i64 addrspace(1)* %a.cast - store i8 addrspace(3)* %ptr1, i8 addrspace(3)* addrspace(1)* %a.1 - - ret void -} - -; CHECK-LABEL: @no_merge_load_i64_ptr32( -; CHECK: load i64, -; CHECK: load i8 addrspace(3)*, -define amdgpu_kernel void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 { -entry: - %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 - %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(3)* addrspace(1)* - - %ld.0 = load i64, i64 addrspace(1)* %a - %ld.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a.1.cast - - ret void -} - -; CHECK-LABEL: @no_merge_load_ptr32_i64( -; CHECK: load i8 addrspace(3)*, -; CHECK: load i64, -define amdgpu_kernel void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 { -entry: - %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)* - %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 - - %ld.0 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a.cast - %ld.1 = load i64, i64 addrspace(1)* %a.1 - - ret void -} - -; XXX - This isn't merged for some reason -; CHECK-LABEL: @merge_v2p1i8_v2p1i8( -; CHECK: load <2 x i8 addrspace(1)*> -; CHECK: load <2 x i8 addrspace(1)*> -; CHECK: store <2 x i8 addrspace(1)*> -; CHECK: store <2 x i8 addrspace(1)*> -define amdgpu_kernel void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 { -entry: - %a.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %a, i64 1 - %b.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, i64 1 - - %ld.c = load <2 x i8 addrspace(1)*>, <2 x i8 
addrspace(1)*> addrspace(1)* %b, align 4 - %ld.c.idx.1 = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b.1, align 4 - - store <2 x i8 addrspace(1)*> zeroinitializer, <2 x i8 addrspace(1)*> addrspace(1)* %a, align 4 - store <2 x i8 addrspace(1)*> zeroinitializer, <2 x i8 addrspace(1)*> addrspace(1)* %a.1, align 4 - ret void -} - -; CHECK-LABEL: @merge_load_ptr64_f64( -; CHECK: load <2 x i64> -; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0 -; CHECK: [[ELT0_INT:%[^ ]+]] = inttoptr i64 [[ELT0]] to i8 addrspace(1)* -; CHECK: [[ELT1_INT:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 -; CHECK: bitcast i64 [[ELT1_INT]] to double -define amdgpu_kernel void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 { -entry: - %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* - %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 - - %ld.0 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.cast - %ld.1 = load double, double addrspace(1)* %a.1 - - ret void -} - -; CHECK-LABEL: @merge_load_f64_ptr64( -; CHECK: load <2 x i64> -; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0 -; CHECK: bitcast i64 [[ELT0]] to double -; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 -; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)* -define amdgpu_kernel void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 { -entry: - %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 - %a.1.cast = bitcast double addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)* - - %ld.0 = load double, double addrspace(1)* %a - %ld.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.1.cast - - ret void -} - -; CHECK-LABEL: @merge_store_ptr64_f64( -; CHECK: [[ELT0_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64 -; CHECK: insertelement <2 x i64> undef, i64 [[ELT0_INT]], i32 0 -; CHECK: [[ELT1_INT:%[^ ]+]] = bitcast double %val1 to i64 -; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1 -; CHECK: store <2 x i64> -define amdgpu_kernel void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 { -entry: - %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* - %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 - - store i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* %a.cast - store double %val1, double addrspace(1)* %a.1 - - ret void -} - -; CHECK-LABEL: @merge_store_f64_ptr64( -; CHECK: [[ELT0_INT:%[^ ]+]] = bitcast double %val0 to i64 -; CHECK: insertelement <2 x i64> undef, i64 [[ELT0_INT]], i32 0 -; CHECK: [[ELT1_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64 -; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1 -; CHECK: store <2 x i64> -define amdgpu_kernel void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 { -entry: - %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 - %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to double addrspace(1)* - - store double %val0, double addrspace(1)* %a.cast - store i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* %a.1 - - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll deleted file mode 100644 index 
c020cc71b4a..00000000000 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll +++ /dev/null @@ -1,95 +0,0 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -dce -S -o - %s | FileCheck %s - -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" - -define void @base_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, <3 x i32> addrspace(1)* %out) { -; CHECK-LABEL: @base_case -; CHECK: load <3 x i32> -entry: - %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 1 - %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 2 - %gep4 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 1 - %gep5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 2 - %selected = select i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b - %selected14 = select i1 %cnd, i32 addrspace(1)* %gep1, i32 addrspace(1)* %gep4 - %selected25 = select i1 %cnd, i32 addrspace(1)* %gep2, i32 addrspace(1)* %gep5 - %val0 = load i32, i32 addrspace(1)* %selected, align 4 - %val1 = load i32, i32 addrspace(1)* %selected14, align 4 - %val2 = load i32, i32 addrspace(1)* %selected25, align 4 - %t0 = insertelement <3 x i32> undef, i32 %val0, i32 0 - %t1 = insertelement <3 x i32> %t0, i32 %val1, i32 1 - %t2 = insertelement <3 x i32> %t1, i32 %val2, i32 2 - store <3 x i32> %t2, <3 x i32> addrspace(1)* %out - ret void -} - -define void @scev_targeting_complex_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %base, <2 x i32> addrspace(1)* %out) { -; CHECK-LABEL: @scev_targeting_complex_case -; CHECK: load <2 x i32> -entry: - %base.x4 = shl i32 %base, 2 - %base.x4.p1 = add i32 %base.x4, 1 - %base.x4.p2 = add i32 %base.x4, 2 - %base.x4.p3 = add i32 %base.x4, 3 - %zext.x4 = zext i32 %base.x4 to i64 - %zext.x4.p1 = zext i32 %base.x4.p1 to i64 - %zext.x4.p2 = zext i32 %base.x4.p2 to i64 - %zext.x4.p3 = zext i32 %base.x4.p3 to i64 - %base.x16 = mul i64 %zext.x4, 4 - %base.x16.p4 = shl i64 %zext.x4.p1, 2 - %base.x16.p8 = shl i64 %zext.x4.p2, 2 - %base.x16.p12 = mul i64 %zext.x4.p3, 4 - %a.pi8 = bitcast i32 addrspace(1)* %a to i8 addrspace(1)* - %b.pi8 = bitcast i32 addrspace(1)* %b to i8 addrspace(1)* - %gep.a.base.x16 = getelementptr inbounds i8, i8 addrspace(1)* %a.pi8, i64 %base.x16 - %gep.b.base.x16.p4 = getelementptr inbounds i8, i8 addrspace(1)* %b.pi8, i64 %base.x16.p4 - %gep.a.base.x16.p8 = getelementptr inbounds i8, i8 addrspace(1)* %a.pi8, i64 %base.x16.p8 - %gep.b.base.x16.p12 = getelementptr inbounds i8, i8 addrspace(1)* %b.pi8, i64 %base.x16.p12 - %a.base.x16 = bitcast i8 addrspace(1)* %gep.a.base.x16 to i32 addrspace(1)* - %b.base.x16.p4 = bitcast i8 addrspace(1)* %gep.b.base.x16.p4 to i32 addrspace(1)* - %selected.base.x16.p0.or.4 = select i1 %cnd, i32 addrspace(1)* %a.base.x16, i32 addrspace(1)* %b.base.x16.p4 - %gep.selected.base.x16.p8.or.12 = select i1 %cnd, i8 addrspace(1)* %gep.a.base.x16.p8, i8 addrspace(1)* %gep.b.base.x16.p12 - %selected.base.x16.p8.or.12 = bitcast i8 addrspace(1)* %gep.selected.base.x16.p8.or.12 to i32 addrspace(1)* - %selected.base.x16.p40.or.44 = getelementptr inbounds i32, i32 addrspace(1)* %selected.base.x16.p0.or.4, i64 10 - %selected.base.x16.p44.or.48 = getelementptr inbounds i32, i32 addrspace(1)* %selected.base.x16.p8.or.12, i64 9 - %val0 = load i32, i32 addrspace(1)* %selected.base.x16.p40.or.44, align 4 - %val1 = load i32, i32 addrspace(1)* %selected.base.x16.p44.or.48, align 4 - %t0 = 
insertelement <2 x i32> undef, i32 %val0, i32 0 - %t1 = insertelement <2 x i32> %t0, i32 %val1, i32 1 - store <2 x i32> %t1, <2 x i32> addrspace(1)* %out - ret void -} - -define void @nested_selects(i1 %cnd0, i1 %cnd1, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %base, <2 x i32> addrspace(1)* %out) { -; CHECK-LABEL: @nested_selects -; CHECK: load <2 x i32> -entry: - %base.p1 = add nsw i32 %base, 1 - %base.p2 = add i32 %base, 2 - %base.p3 = add nsw i32 %base, 3 - %base.x4 = mul i32 %base, 4 - %base.x4.p5 = add i32 %base.x4, 5 - %base.x4.p6 = add i32 %base.x4, 6 - %sext = sext i32 %base to i64 - %sext.p1 = sext i32 %base.p1 to i64 - %sext.p2 = sext i32 %base.p2 to i64 - %sext.p3 = sext i32 %base.p3 to i64 - %sext.x4.p5 = sext i32 %base.x4.p5 to i64 - %sext.x4.p6 = sext i32 %base.x4.p6 to i64 - %gep.a.base = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext - %gep.a.base.p1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p1 - %gep.a.base.p2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p2 - %gep.a.base.p3 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p3 - %gep.b.base.x4.p5 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.x4.p5 - %gep.b.base.x4.p6 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.x4.p6 - %selected.1.L = select i1 %cnd1, i32 addrspace(1)* %gep.a.base.p2, i32 addrspace(1)* %gep.b.base.x4.p5 - %selected.1.R = select i1 %cnd1, i32 addrspace(1)* %gep.a.base.p3, i32 addrspace(1)* %gep.b.base.x4.p6 - %selected.0.L = select i1 %cnd0, i32 addrspace(1)* %gep.a.base, i32 addrspace(1)* %selected.1.L - %selected.0.R = select i1 %cnd0, i32 addrspace(1)* %gep.a.base.p1, i32 addrspace(1)* %selected.1.R - %val0 = load i32, i32 addrspace(1)* %selected.0.L, align 4 - %val1 = load i32, i32 addrspace(1)* %selected.0.R, align 4 - %t0 = insertelement <2 x i32> undef, i32 %val0, i32 0 - %t1 = insertelement <2 x i32> %t0, i32 %val1, i32 1 - store <2 x i32> %t1, <2 x i32> addrspace(1)* %out - ret void -} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll deleted file mode 100644 index 5ed7ee80ea0..00000000000 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s - -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" - -; Check that, in the presence of an aliasing load, the stores preceding the -; aliasing load are safe to vectorize. 
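-; (%a and %b carry no noalias, so the loads from %b may alias the stores to %a; merging the first four stores only reorders them among themselves and keeps them ahead of those loads, which is why vectorizing them is safe.)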
- -; CHECK-LABEL: store_vectorize_with_alias -; CHECK: store <4 x float> -; CHECK: load <4 x float> -; CHECK: store <4 x float> - -; Function Attrs: nounwind -define amdgpu_kernel void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 { -bb: - %tmp = bitcast i8 addrspace(1)* %b to float addrspace(1)* - %tmp1 = load float, float addrspace(1)* %tmp, align 4 - - %tmp2 = bitcast i8 addrspace(1)* %a to float addrspace(1)* - store float %tmp1, float addrspace(1)* %tmp2, align 4 - %tmp3 = getelementptr i8, i8 addrspace(1)* %a, i64 4 - %tmp4 = bitcast i8 addrspace(1)* %tmp3 to float addrspace(1)* - store float %tmp1, float addrspace(1)* %tmp4, align 4 - %tmp5 = getelementptr i8, i8 addrspace(1)* %a, i64 8 - %tmp6 = bitcast i8 addrspace(1)* %tmp5 to float addrspace(1)* - store float %tmp1, float addrspace(1)* %tmp6, align 4 - %tmp7 = getelementptr i8, i8 addrspace(1)* %a, i64 12 - %tmp8 = bitcast i8 addrspace(1)* %tmp7 to float addrspace(1)* - store float %tmp1, float addrspace(1)* %tmp8, align 4 - - %tmp9 = getelementptr i8, i8 addrspace(1)* %b, i64 16 - %tmp10 = bitcast i8 addrspace(1)* %tmp9 to float addrspace(1)* - %tmp11 = load float, float addrspace(1)* %tmp10, align 4 - %tmp12 = getelementptr i8, i8 addrspace(1)* %b, i64 20 - %tmp13 = bitcast i8 addrspace(1)* %tmp12 to float addrspace(1)* - %tmp14 = load float, float addrspace(1)* %tmp13, align 4 - %tmp15 = getelementptr i8, i8 addrspace(1)* %b, i64 24 - %tmp16 = bitcast i8 addrspace(1)* %tmp15 to float addrspace(1)* - %tmp17 = load float, float addrspace(1)* %tmp16, align 4 - %tmp18 = getelementptr i8, i8 addrspace(1)* %b, i64 28 - %tmp19 = bitcast i8 addrspace(1)* %tmp18 to float addrspace(1)* - %tmp20 = load float, float addrspace(1)* %tmp19, align 4 - - %tmp21 = getelementptr i8, i8 addrspace(1)* %a, i64 16 - %tmp22 = bitcast i8 addrspace(1)* %tmp21 to float addrspace(1)* - store float %tmp11, float addrspace(1)* %tmp22, align 4 - %tmp23 = getelementptr i8, i8 addrspace(1)* %a, i64 20 - %tmp24 = bitcast i8 addrspace(1)* %tmp23 to float addrspace(1)* - store float %tmp14, float addrspace(1)* %tmp24, align 4 - %tmp25 = getelementptr i8, i8 addrspace(1)* %a, i64 24 - %tmp26 = bitcast i8 addrspace(1)* %tmp25 to float addrspace(1)* - store float %tmp17, float addrspace(1)* %tmp26, align 4 - %tmp27 = getelementptr i8, i8 addrspace(1)* %a, i64 28 - %tmp28 = bitcast i8 addrspace(1)* %tmp27 to float addrspace(1)* - store float %tmp20, float addrspace(1)* %tmp28, align 4 - - ret void -} - -attributes #0 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll deleted file mode 100644 index 65d114478b4..00000000000 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll +++ /dev/null @@ -1,201 +0,0 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s - -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" - -; Checks that we don't merge loads/stores of types smaller than one -; byte, or vectors with elements smaller than one byte. 
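-; (In memory an i2, i9, or <2 x i2> is padded out to whole bytes, so two such values at adjacent element offsets still occupy separate padded bytes, while a merged vector such as <4 x i2> would be a single byte-sized store that drops the padding and changes the bytes written.)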
- -%struct.foo = type { i32, i8 } - -declare void @use_i1(i1) -declare void @use_i2(i2) -declare void @use_i8(i8) -declare void @use_foo(%struct.foo) -declare void @use_v2i2(<2 x i2>) -declare void @use_v4i2(<4 x i2>) -declare void @use_v2i9(<2 x i9>) - -; CHECK-LABEL: @merge_store_2_constants_i1( -; CHECK: store i1 -; CHECK: store i1 -define amdgpu_kernel void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1 - store i1 true, i1 addrspace(1)* %out.gep.1 - store i1 false, i1 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: @merge_store_2_constants_i2( -; CHECK: store i2 1 -; CHECK: store i2 -1 -define amdgpu_kernel void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1 - store i2 1, i2 addrspace(1)* %out.gep.1 - store i2 -1, i2 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: @merge_different_store_sizes_i1_i8( -; CHECK: store i1 true -; CHECK: store i8 123 -define amdgpu_kernel void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 { - %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)* - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i1 true, i1 addrspace(1)* %out.i1 - store i8 123, i8 addrspace(1)* %out.gep.1 - ret void -} - -; CHECK-LABEL: @merge_different_store_sizes_i8_i1( -; CHECK: store i8 123 -; CHECK: store i1 true -define amdgpu_kernel void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 { - %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)* - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1 - store i8 123, i8 addrspace(1)* %out.gep.1 - store i1 true, i1 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: @merge_store_2_constant_structs( -; CHECK: store %struct.foo -; CHECK: store %struct.foo -define amdgpu_kernel void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1 - store %struct.foo { i32 12, i8 3 }, %struct.foo addrspace(1)* %out.gep.1 - store %struct.foo { i32 92, i8 9 }, %struct.foo addrspace(1)* %out - ret void -} - -; sub-byte element size -; CHECK-LABEL: @merge_store_2_constants_v2i2( -; CHECK: store <2 x i2> -; CHECK: store <2 x i2> -define amdgpu_kernel void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1 - store <2 x i2> <i2 1, i2 -1>, <2 x i2> addrspace(1)* %out.gep.1 - store <2 x i2> <i2 -1, i2 1>, <2 x i2> addrspace(1)* %out - ret void -} - -; sub-byte element size but byte size - -; CHECK-LABEL: @merge_store_2_constants_v4i2( -; CHECK: store <4 x i2> -; CHECK: store <4 x i2> -define amdgpu_kernel void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1 - store <4 x i2> <i2 1, i2 -1, i2 1, i2 -1>, <4 x i2> addrspace(1)* %out.gep.1 - store <4 x i2> <i2 -1, i2 1, i2 -1, i2 1>, <4 x i2> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: @merge_load_2_constants_i1( -; CHECK: load i1 -; CHECK: load i1 -define amdgpu_kernel void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1 - %x = load i1, i1 addrspace(1)* %out.gep.1 - %y = load i1, i1 addrspace(1)* %out - call void @use_i1(i1 %x) - call void @use_i1(i1 %y) - ret void -} - -; CHECK-LABEL: @merge_load_2_constants_i2( -; CHECK: load i2 -; CHECK: load i2 -define 
amdgpu_kernel void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1 - %x = load i2, i2 addrspace(1)* %out.gep.1 - %y = load i2, i2 addrspace(1)* %out - call void @use_i2(i2 %x) - call void @use_i2(i2 %y) - ret void -} - -; CHECK-LABEL: @merge_different_load_sizes_i1_i8( -; CHECK: load i1 -; CHECK: load i8 -define amdgpu_kernel void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 { - %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)* - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - %x = load i1, i1 addrspace(1)* %out.i1 - %y = load i8, i8 addrspace(1)* %out.gep.1 - call void @use_i1(i1 %x) - call void @use_i8(i8 %y) - ret void -} - -; CHECK-LABEL: @merge_different_load_sizes_i8_i1( -; CHECK: load i8 -; CHECK: load i1 -define amdgpu_kernel void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 { - %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)* - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1 - %x = load i8, i8 addrspace(1)* %out.gep.1 - %y = load i1, i1 addrspace(1)* %out - call void @use_i8(i8 %x) - call void @use_i1(i1 %y) - ret void -} - -; CHECK-LABEL: @merge_load_2_constant_structs( -; CHECK: load %struct.foo -; CHECK: load %struct.foo -define amdgpu_kernel void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1 - %x = load %struct.foo, %struct.foo addrspace(1)* %out.gep.1 - %y = load %struct.foo, %struct.foo addrspace(1)* %out - call void @use_foo(%struct.foo %x) - call void @use_foo(%struct.foo %y) - ret void -} - -; CHECK-LABEL: @merge_load_2_constants_v2i2( -; CHECK: load <2 x i2> -; CHECK: load <2 x i2> -define amdgpu_kernel void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1 - %x = load <2 x i2>, <2 x i2> addrspace(1)* %out.gep.1 - %y = load <2 x i2>, <2 x i2> addrspace(1)* %out - call void @use_v2i2(<2 x i2> %x) - call void @use_v2i2(<2 x i2> %y) - ret void -} - -; CHECK-LABEL: @merge_load_2_constants_v4i2( -; CHECK: load <4 x i2> -; CHECK: load <4 x i2> -define amdgpu_kernel void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1 - %x = load <4 x i2>, <4 x i2> addrspace(1)* %out.gep.1 - %y = load <4 x i2>, <4 x i2> addrspace(1)* %out - call void @use_v4i2(<4 x i2> %x) - call void @use_v4i2(<4 x i2> %y) - ret void -} - -; CHECK-LABEL: @merge_store_2_constants_i9( -; CHECK: store i9 3 -; CHECK: store i9 -5 -define amdgpu_kernel void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i9, i9 addrspace(1)* %out, i32 1 - store i9 3, i9 addrspace(1)* %out.gep.1 - store i9 -5, i9 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: @merge_load_2_constants_v2i9( -; CHECK: load <2 x i9> -; CHECK: load <2 x i9> -define amdgpu_kernel void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr <2 x i9>, <2 x i9> addrspace(1)* %out, i32 1 - %x = load <2 x i9>, <2 x i9> addrspace(1)* %out.gep.1 - %y = load <2 x i9>, <2 x i9> addrspace(1)* %out - call void @use_v2i9(<2 x i9> %x) - call void @use_v2i9(<2 x i9> %y) - ret void -} - -attributes #0 = { nounwind } |