Diffstat (limited to 'llvm/test/Transforms/LoadStoreVectorizer')
36 files changed, 3355 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll new file mode 100644 index 00000000000..d2834be18b0 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll @@ -0,0 +1,32 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -scoped-noalias -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=SCOPE -check-prefix=ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=NOSCOPE -check-prefix=ALL %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +; This fails to vectorize if the !alias.scope is not used + +; ALL-LABEL: @vectorize_alias_scope( +; SCOPE: load float, float addrspace(1)* %c +; SCOPE: bitcast float addrspace(1)* %a to <2 x float> addrspace(1)* +; SCOPE: store <2 x float> zeroinitializer +; SCOPE: store float %ld.c, float addrspace(1)* %b, + +; NOSCOPE: store float +; NOSCOPE: load float +; NOSCOPE: store float +; NOSCOPE: store float +define amdgpu_kernel void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { +entry: + %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1 + store float 0.0, float addrspace(1)* %a, align 4, !noalias !0 + %ld.c = load float, float addrspace(1)* %c, align 4, !alias.scope !0 + store float 0.0, float addrspace(1)* %a.idx.1, align 4, !noalias !0 + store float %ld.c, float addrspace(1)* %b, align 4, !noalias !0 + ret void +} + +attributes #0 = { nounwind } + +!0 = !{!1} +!1 = distinct !{!1, !2, !"some scope"} +!2 = distinct !{!2, !"some domain"} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll new file mode 100644 index 00000000000..b0dd5d185c7 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll @@ -0,0 +1,210 @@ +; RUN: opt -S -load-store-vectorizer -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s +; RUN: opt -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s +; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s +; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s + +target triple = "amdgcn--" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +; ALL-LABEL: @load_unknown_offset_align1_i8( +; ALL: alloca [128 x i8], align 1 +; UNALIGNED: load <2 x i8>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}} + +; ALIGNED: load i8, i8 addrspace(5)* %ptr0, align 1{{$}} +; ALIGNED: load i8, i8 addrspace(5)* %ptr1, align 1{{$}} +define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { + %alloca = alloca [128 x i8], align 1, 
addrspace(5) + %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset + %val0 = load i8, i8 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1 + %val1 = load i8, i8 addrspace(5)* %ptr1, align 1 + %add = add i8 %val0, %val1 + store i8 %add, i8 addrspace(1)* %out + ret void +} + +; ALL-LABEL: @load_unknown_offset_align1_i16( +; ALL: alloca [128 x i16], align 1, addrspace(5){{$}} +; UNALIGNED: load <2 x i16>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}} + +; ALIGNED: load i16, i16 addrspace(5)* %ptr0, align 1{{$}} +; ALIGNED: load i16, i16 addrspace(5)* %ptr1, align 1{{$}} +define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { + %alloca = alloca [128 x i16], align 1, addrspace(5) + %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset + %val0 = load i16, i16 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1 + %val1 = load i16, i16 addrspace(5)* %ptr1, align 1 + %add = add i16 %val0, %val1 + store i16 %add, i16 addrspace(1)* %out + ret void +} + +; FIXME: Although the offset is unknown here, we know it is a multiple +; of the element size, so should still be align 4 + +; ALL-LABEL: @load_unknown_offset_align1_i32( +; ALL: alloca [128 x i32], align 1 +; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}} + +; ALIGNED: load i32, i32 addrspace(5)* %ptr0, align 1 +; ALIGNED: load i32, i32 addrspace(5)* %ptr1, align 1 +define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { + %alloca = alloca [128 x i32], align 1, addrspace(5) + %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset + %val0 = load i32, i32 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1 + %val1 = load i32, i32 addrspace(5)* %ptr1, align 1 + %add = add i32 %val0, %val1 + store i32 %add, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should always increase alignment of the load +; Make sure alloca alignment isn't decreased +; ALL-LABEL: @load_alloca16_unknown_offset_align1_i32( +; ALL: alloca [128 x i32], align 16 + +; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}} +; ALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 4{{$}} +define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { + %alloca = alloca [128 x i32], align 16, addrspace(5) + %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset + %val0 = load i32, i32 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1 + %val1 = load i32, i32 addrspace(5)* %ptr1, align 1 + %add = add i32 %val0, %val1 + store i32 %add, i32 addrspace(1)* %out + ret void +} + +; ALL-LABEL: @store_unknown_offset_align1_i8( +; ALL: alloca [128 x i8], align 1 +; UNALIGNED: store <2 x i8> <i8 9, i8 10>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}} + +; ALIGNED: store i8 9, i8 addrspace(5)* %ptr0, align 1{{$}} +; ALIGNED: store i8 10, i8 addrspace(5)* %ptr1, align 1{{$}} +define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { + %alloca = alloca [128 x i8], align 1, addrspace(5) + %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, 
i32 %offset + store i8 9, i8 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1 + store i8 10, i8 addrspace(5)* %ptr1, align 1 + ret void +} + +; ALL-LABEL: @store_unknown_offset_align1_i16( +; ALL: alloca [128 x i16], align 1 +; UNALIGNED: store <2 x i16> <i16 9, i16 10>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}} + +; ALIGNED: store i16 9, i16 addrspace(5)* %ptr0, align 1{{$}} +; ALIGNED: store i16 10, i16 addrspace(5)* %ptr1, align 1{{$}} +define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { + %alloca = alloca [128 x i16], align 1, addrspace(5) + %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset + store i16 9, i16 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1 + store i16 10, i16 addrspace(5)* %ptr1, align 1 + ret void +} + +; FIXME: Although the offset is unknown here, we know it is a multiple +; of the element size, so it still should be align 4. + +; ALL-LABEL: @store_unknown_offset_align1_i32( +; ALL: alloca [128 x i32], align 1 + +; UNALIGNED: store <2 x i32> <i32 9, i32 10>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}} + +; ALIGNED: store i32 9, i32 addrspace(5)* %ptr0, align 1 +; ALIGNED: store i32 10, i32 addrspace(5)* %ptr1, align 1 +define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { + %alloca = alloca [128 x i32], align 1, addrspace(5) + %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset + store i32 9, i32 addrspace(5)* %ptr0, align 1 + %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1 + store i32 10, i32 addrspace(5)* %ptr1, align 1 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32( +; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5) +; ALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 4 + +; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5) +; UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32() { + %alloca = alloca [8 x i32], align 1, addrspace(5) + %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)* + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3 + + store i32 9, i32 addrspace(5)* %out, align 1 + store i32 1, i32 addrspace(5)* %out.gep.1, align 1 + store i32 23, i32 addrspace(5)* %out.gep.2, align 1 + store i32 19, i32 addrspace(5)* %out.gep.3, align 1 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8( +; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5) +; ALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 4 + +; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5) +; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8() { + %alloca = alloca [8 x i8], align 1, addrspace(5) + %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)* + %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2 + %out.gep.3 = getelementptr i8, i8 
addrspace(5)* %out, i8 3 + + store i8 9, i8 addrspace(5)* %out, align 1 + store i8 1, i8 addrspace(5)* %out.gep.1, align 1 + store i8 23, i8 addrspace(5)* %out.gep.2, align 1 + store i8 19, i8 addrspace(5)* %out.gep.3, align 1 + ret void +} + +; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i32( +; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5) +; ALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 4 + +; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5) +; UNALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 1 +define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() { + %alloca = alloca [8 x i32], align 1, addrspace(5) + %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)* + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3 + + %load0 = load i32, i32 addrspace(5)* %out, align 1 + %load1 = load i32, i32 addrspace(5)* %out.gep.1, align 1 + %load2 = load i32, i32 addrspace(5)* %out.gep.2, align 1 + %load3 = load i32, i32 addrspace(5)* %out.gep.3, align 1 + ret void +} + +; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i8( +; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5) +; ALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 4 + +; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5) +; UNALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 1 +define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() { + %alloca = alloca [8 x i8], align 1, addrspace(5) + %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)* + %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2 + %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3 + + %load0 = load i8, i8 addrspace(5)* %out, align 1 + %load1 = load i8, i8 addrspace(5)* %out.gep.1, align 1 + %load2 = load i8, i8 addrspace(5)* %out.gep.2, align 1 + %load3 = load i8, i8 addrspace(5)* %out.gep.3, align 1 + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll new file mode 100644 index 00000000000..cd1c7fdc521 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll @@ -0,0 +1,52 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +declare i64 @_Z12get_local_idj(i32) + +declare i64 @_Z12get_group_idj(i32) + +declare double @llvm.fmuladd.f64(double, double, double) + +; CHECK-LABEL: @factorizedVsNonfactorizedAccess( +; CHECK: load <2 x float> +; CHECK: store <2 x float> +define amdgpu_kernel void @factorizedVsNonfactorizedAccess(float addrspace(1)* nocapture %c) { +entry: + %call = tail call i64 @_Z12get_local_idj(i32 0) + %call1 = tail call i64 @_Z12get_group_idj(i32 0) + %div = lshr i64 %call, 4 + %div2 = lshr i64 %call1, 3 + %mul = shl i64 %div2, 7 + %rem = shl i64 %call, 3 + %mul3 = and i64 %rem, 120 + %add = or i64 %mul, %mul3 + %rem4 = shl i64 %call1, 7 + %mul5 = and i64 %rem4, 896 + 
%mul6 = shl nuw nsw i64 %div, 3 + %add7 = add nuw i64 %mul5, %mul6 + %mul9 = shl i64 %add7, 10 + %add10 = add i64 %mul9, %add + %arrayidx = getelementptr inbounds float, float addrspace(1)* %c, i64 %add10 + %load1 = load float, float addrspace(1)* %arrayidx, align 4 + %conv = fpext float %load1 to double + %mul11 = fmul double %conv, 0x3FEAB481D8F35506 + %conv12 = fptrunc double %mul11 to float + %conv18 = fpext float %conv12 to double + %storeval1 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv18) + %cstoreval1 = fptrunc double %storeval1 to float + store float %cstoreval1, float addrspace(1)* %arrayidx, align 4 + + %add23 = or i64 %add10, 1 + %arrayidx24 = getelementptr inbounds float, float addrspace(1)* %c, i64 %add23 + %load2 = load float, float addrspace(1)* %arrayidx24, align 4 + %conv25 = fpext float %load2 to double + %mul26 = fmul double %conv25, 0x3FEAB481D8F35506 + %conv27 = fptrunc double %mul26 to float + %conv34 = fpext float %conv27 to double + %storeval2 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv34) + %cstoreval2 = fptrunc double %storeval2 to float + store float %cstoreval2, float addrspace(1)* %arrayidx24, align 4 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll new file mode 100644 index 00000000000..b8e95a6793e --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll @@ -0,0 +1,151 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +; CHECK-LABEL: @basic_merge_sext_index( +; CHECK: sext i32 %id.x to i64 +; CHECK: load <2 x float> +; CHECK: store <2 x float> zeroinitializer +define amdgpu_kernel void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %sext.id.x = sext i32 %id.x to i64 + %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %sext.id.x + %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %sext.id.x + %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1 + %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1 + + %ld.c = load float, float addrspace(1)* %c.idx.x, align 4 + %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4 + + store float 0.0, float addrspace(1)* %a.idx.x, align 4 + store float 0.0, float addrspace(1)* %a.idx.x.1, align 4 + + %add = fadd float %ld.c, %ld.c.idx.1 + store float %add, float addrspace(1)* %b, align 4 + ret void +} + +; CHECK-LABEL: @basic_merge_zext_index( +; CHECK: zext i32 %id.x to i64 +; CHECK: load <2 x float> +; CHECK: store <2 x float> +define amdgpu_kernel void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %zext.id.x = zext i32 %id.x to i64 + %a.idx.x = getelementptr inbounds 
float, float addrspace(1)* %a, i64 %zext.id.x + %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x + %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1 + %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1 + + %ld.c = load float, float addrspace(1)* %c.idx.x, align 4 + %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4 + store float 0.0, float addrspace(1)* %a.idx.x, align 4 + store float 0.0, float addrspace(1)* %a.idx.x.1, align 4 + + %add = fadd float %ld.c, %ld.c.idx.1 + store float %add, float addrspace(1)* %b, align 4 + ret void +} + +; CHECK-LABEL: @merge_op_zext_index( +; CHECK: load <2 x float> +; CHECK: store <2 x float> +define amdgpu_kernel void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %shl = shl i32 %id.x, 2 + %zext.id.x = zext i32 %shl to i64 + %a.0 = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x + %c.0 = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x + + %id.x.1 = or i32 %shl, 1 + %id.x.1.ext = zext i32 %id.x.1 to i64 + + %a.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 %id.x.1.ext + %c.1 = getelementptr inbounds float, float addrspace(1)* %c, i64 %id.x.1.ext + + %ld.c.0 = load float, float addrspace(1)* %c.0, align 4 + store float 0.0, float addrspace(1)* %a.0, align 4 + %ld.c.1 = load float, float addrspace(1)* %c.1, align 4 + store float 0.0, float addrspace(1)* %a.1, align 4 + + %add = fadd float %ld.c.0, %ld.c.1 + store float %add, float addrspace(1)* %b, align 4 + ret void +} + +; CHECK-LABEL: @merge_op_sext_index( +; CHECK: load <2 x float> +; CHECK: store <2 x float> +define amdgpu_kernel void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %shl = shl i32 %id.x, 2 + %zext.id.x = sext i32 %shl to i64 + %a.0 = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x + %c.0 = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x + + %id.x.1 = or i32 %shl, 1 + %id.x.1.ext = sext i32 %id.x.1 to i64 + + %a.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 %id.x.1.ext + %c.1 = getelementptr inbounds float, float addrspace(1)* %c, i64 %id.x.1.ext + + %ld.c.0 = load float, float addrspace(1)* %c.0, align 4 + store float 0.0, float addrspace(1)* %a.0, align 4 + %ld.c.1 = load float, float addrspace(1)* %c.1, align 4 + store float 0.0, float addrspace(1)* %a.1, align 4 + + %add = fadd float %ld.c.0, %ld.c.1 + store float %add, float addrspace(1)* %b, align 4 + ret void +} + +; This case fails to vectorize if not using the extra extension +; handling in isConsecutiveAccess. 
+ +; CHECK-LABEL: @zext_trunc_phi_1( +; CHECK: loop: +; CHECK: load <2 x i32> +; CHECK: store <2 x i32> +define amdgpu_kernel void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 { +entry: + %cmp0 = icmp eq i32 %n, 0 + br i1 %cmp0, label %exit, label %loop + +loop: + %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ] + %trunc.iv = trunc i64 %indvars.iv to i32 + %idx = shl i32 %trunc.iv, 4 + + %idx.ext = zext i32 %idx to i64 + %c.0 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %idx.ext + %a.0 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.ext + + %idx.1 = or i32 %idx, 1 + %idx.1.ext = zext i32 %idx.1 to i64 + %c.1 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %idx.1.ext + %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.1.ext + + %ld.c.0 = load i32, i32 addrspace(1)* %c.0, align 4 + store i32 %ld.c.0, i32 addrspace(1)* %a.0, align 4 + %ld.c.1 = load i32, i32 addrspace(1)* %c.1, align 4 + store i32 %ld.c.1, i32 addrspace(1)* %a.1, align 4 + + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll new file mode 100644 index 00000000000..5bb6289ff19 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll @@ -0,0 +1,135 @@ +; RUN: opt -S -mtriple=amdgcn--amdhsa -load-store-vectorizer < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn--amdhsa -passes='function(load-store-vectorizer)' < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +; Check that vectorizer can find a GEP through bitcast +; CHECK-LABEL: @vect_zext_bitcast_f32_to_i32_idx +; CHECK: load <4 x i32> +define void @vect_zext_bitcast_f32_to_i32_idx(float addrspace(1)* %arg1, i32 %base) { + %add1 = add nuw i32 %base, 0 + %zext1 = zext i32 %add1 to i64 + %gep1 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext1 + %f2i1 = bitcast float addrspace(1)* %gep1 to i32 addrspace(1)* + %load1 = load i32, i32 addrspace(1)* %f2i1, align 4 + %add2 = add nuw i32 %base, 1 + %zext2 = zext i32 %add2 to i64 + %gep2 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext2 + %f2i2 = bitcast float addrspace(1)* %gep2 to i32 addrspace(1)* + %load2 = load i32, i32 addrspace(1)* %f2i2, align 4 + %add3 = add nuw i32 %base, 2 + %zext3 = zext i32 %add3 to i64 + %gep3 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext3 + %f2i3 = bitcast float addrspace(1)* %gep3 to i32 addrspace(1)* + %load3 = load i32, i32 addrspace(1)* %f2i3, align 4 + %add4 = add nuw i32 %base, 3 + %zext4 = zext i32 %add4 to i64 + %gep4 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext4 + %f2i4 = bitcast float addrspace(1)* %gep4 to i32 addrspace(1)* + %load4 = load i32, i32 addrspace(1)* %f2i4, align 4 + ret void +} + +; CHECK-LABEL: @vect_zext_bitcast_i8_st1_to_i32_idx +; CHECK: load i32 +; CHECK: load i32 +; CHECK: load i32 +; CHECK: load i32 +define void 
@vect_zext_bitcast_i8_st1_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) { + %add1 = add nuw i32 %base, 0 + %zext1 = zext i32 %add1 to i64 + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext1 + %f2i1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* + %load1 = load i32, i32 addrspace(1)* %f2i1, align 4 + %add2 = add nuw i32 %base, 1 + %zext2 = zext i32 %add2 to i64 + %gep2 = getelementptr inbounds i8,i8 addrspace(1)* %arg1, i64 %zext2 + %f2i2 = bitcast i8 addrspace(1)* %gep2 to i32 addrspace(1)* + %load2 = load i32, i32 addrspace(1)* %f2i2, align 4 + %add3 = add nuw i32 %base, 2 + %zext3 = zext i32 %add3 to i64 + %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3 + %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)* + %load3 = load i32, i32 addrspace(1)* %f2i3, align 4 + %add4 = add nuw i32 %base, 3 + %zext4 = zext i32 %add4 to i64 + %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4 + %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)* + %load4 = load i32, i32 addrspace(1)* %f2i4, align 4 + ret void +} + +; CHECK-LABEL: @vect_zext_bitcast_i8_st4_to_i32_idx +; CHECK: load <4 x i32> +define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) { + %add1 = add nuw i32 %base, 0 + %zext1 = zext i32 %add1 to i64 + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext1 + %f2i1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* + %load1 = load i32, i32 addrspace(1)* %f2i1, align 4 + %add2 = add nuw i32 %base, 4 + %zext2 = zext i32 %add2 to i64 + %gep2 = getelementptr inbounds i8,i8 addrspace(1)* %arg1, i64 %zext2 + %f2i2 = bitcast i8 addrspace(1)* %gep2 to i32 addrspace(1)* + %load2 = load i32, i32 addrspace(1)* %f2i2, align 4 + %add3 = add nuw i32 %base, 8 + %zext3 = zext i32 %add3 to i64 + %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3 + %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)* + %load3 = load i32, i32 addrspace(1)* %f2i3, align 4 + %add4 = add nuw i32 %base, 12 + %zext4 = zext i32 %add4 to i64 + %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4 + %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)* + %load4 = load i32, i32 addrspace(1)* %f2i4, align 4 + ret void +} + +; CHECK-LABEL: @vect_zext_bitcast_negative_ptr_delta +; CHECK: load <2 x i32> +define void @vect_zext_bitcast_negative_ptr_delta(i32 addrspace(1)* %p, i32 %base) { + %p.bitcasted = bitcast i32 addrspace(1)* %p to i16 addrspace(1)* + %a.offset = add nuw i32 %base, 4 + %t.offset.zexted = zext i32 %base to i64 + %a.offset.zexted = zext i32 %a.offset to i64 + %t.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %t.offset.zexted + %a.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %a.offset.zexted + %b.ptr = getelementptr inbounds i16, i16 addrspace(1)* %t.ptr, i64 6 + %a.ptr.bitcasted = bitcast i16 addrspace(1)* %a.ptr to i32 addrspace(1)* + %b.ptr.bitcasted = bitcast i16 addrspace(1)* %b.ptr to i32 addrspace(1)* + %a.val = load i32, i32 addrspace(1)* %a.ptr.bitcasted + %b.val = load i32, i32 addrspace(1)* %b.ptr.bitcasted + ret void +} + +; Check i1 corner case +; CHECK-LABEL: @zexted_i1_gep_index +; CHECK: load i32 +; CHECK: load i32 +define void @zexted_i1_gep_index(i32 addrspace(1)* %p, i32 %val) { + %selector = icmp eq i32 %val, 0 + %flipped = xor i1 %selector, 1 + %index.0 = zext i1 %selector to i64 + %index.1 = zext i1 %flipped to i64 + %gep.0 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 
%index.0 + %gep.1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.1 + %val0 = load i32, i32 addrspace(1)* %gep.0 + %val1 = load i32, i32 addrspace(1)* %gep.1 + ret void +} + +; Check i1 corner case +; CHECK-LABEL: @sexted_i1_gep_index +; CHECK: load i32 +; CHECK: load i32 +define void @sexted_i1_gep_index(i32 addrspace(1)* %p, i32 %val) { + %selector = icmp eq i32 %val, 0 + %flipped = xor i1 %selector, 1 + %index.0 = sext i1 %selector to i64 + %index.1 = sext i1 %flipped to i64 + %gep.0 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.0 + %gep.1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.1 + %val0 = load i32, i32 addrspace(1)* %gep.0 + %val1 = load i32, i32 addrspace(1)* %gep.1 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll new file mode 100644 index 00000000000..35836f80456 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll @@ -0,0 +1,118 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +; Check position of the inserted vector load/store. Vectorized loads should be +; inserted at the position of the first load in the chain, and stores should be +; inserted at the position of the last store. + +; CHECK-LABEL: @insert_load_point( +; CHECK: %z = add i32 %x, 4 +; CHECK: load <2 x float> +; CHECK: %w = add i32 %y, 9 +; CHECK: %foo = add i32 %z, %w +define amdgpu_kernel void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { +entry: + %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx + %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx + %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1 + %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1 + + %z = add i32 %x, 4 + %ld.c = load float, float addrspace(1)* %c.idx.x, align 4 + %w = add i32 %y, 9 + %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4 + %foo = add i32 %z, %w + + store float 0.0, float addrspace(1)* %a.idx.x, align 4 + store float 0.0, float addrspace(1)* %a.idx.x.1, align 4 + + %add = fadd float %ld.c, %ld.c.idx.1 + store float %add, float addrspace(1)* %b, align 4 + store i32 %foo, i32 addrspace(3)* null, align 4 + ret void +} + +; CHECK-LABEL: @insert_store_point( +; CHECK: %z = add i32 %x, 4 +; CHECK: %w = add i32 %y, 9 +; CHECK: store <2 x float> +; CHECK: %foo = add i32 %z, %w +define amdgpu_kernel void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { +entry: + %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx + %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx + %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1 + %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1 + + %ld.c = load float, float addrspace(1)* %c.idx.x, align 4 + 
%ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4 + + %z = add i32 %x, 4 + store float 0.0, float addrspace(1)* %a.idx.x, align 4 + %w = add i32 %y, 9 + store float 0.0, float addrspace(1)* %a.idx.x.1, align 4 + %foo = add i32 %z, %w + + %add = fadd float %ld.c, %ld.c.idx.1 + store float %add, float addrspace(1)* %b, align 4 + store i32 %foo, i32 addrspace(3)* null, align 4 + ret void +} + +; Here we have four stores, with an aliasing load before the last one. We can +; vectorize the first three stores as <3 x float>, but this vectorized store must +; be inserted at the location of the third scalar store, not the fourth one. +; +; CHECK-LABEL: @insert_store_point_alias +; CHECK: store <3 x float> +; CHECK: load float, float addrspace(1)* %a.idx.2 +; CHECK: store float +; CHECK-SAME: %a.idx.3 +define float @insert_store_point_alias(float addrspace(1)* nocapture %a, i64 %idx) { + %a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx + %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1 + %a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1 + %a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1 + + store float 0.0, float addrspace(1)* %a.idx, align 4 + store float 0.0, float addrspace(1)* %a.idx.1, align 4 + store float 0.0, float addrspace(1)* %a.idx.2, align 4 + %x = load float, float addrspace(1)* %a.idx.2, align 4 + store float 0.0, float addrspace(1)* %a.idx.3, align 4 + + ret float %x +} + +; Here we have four stores, with an aliasing load before the last one. We +; could vectorize two of the stores before the load (although we currently +; don't), but the important thing is that we *don't* sink the store to +; a[idx + 1] below the load. +; +; CHECK-LABEL: @insert_store_point_alias_ooo +; CHECK: store float +; CHECK-SAME: %a.idx.3 +; CHECK: store float +; CHECK-SAME: %a.idx.1 +; CHECK: store float +; CHECK-SAME: %a.idx.2 +; CHECK: load float, float addrspace(1)* %a.idx.2 +; CHECK: store float +; CHECK-SAME: %a.idx +define float @insert_store_point_alias_ooo(float addrspace(1)* nocapture %a, i64 %idx) { + %a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx + %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1 + %a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1 + %a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1 + + store float 0.0, float addrspace(1)* %a.idx.3, align 4 + store float 0.0, float addrspace(1)* %a.idx.1, align 4 + store float 0.0, float addrspace(1)* %a.idx.2, align 4 + %x = load float, float addrspace(1)* %a.idx.2, align 4 + store float 0.0, float addrspace(1)* %a.idx, align 4 + + ret float %x +} + +attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll new file mode 100644 index 00000000000..81ebb712e33 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll @@ -0,0 +1,29 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + 
+; This is NOT OK to vectorize, as either load may alias either store. + +; CHECK: load double +; CHECK: store double 0.000000e+00, double addrspace(1)* %a, +; CHECK: load double +; CHECK: store double 0.000000e+00, double addrspace(1)* %a.idx.1 +define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 { +entry: + %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 + %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1 + + %ld.c = load double, double addrspace(1)* %c, align 8 ; may alias store to %a + store double 0.0, double addrspace(1)* %a, align 8 + + %ld.c.idx.1 = load double, double addrspace(1)* %c.idx.1, align 8 ; may alias store to %a + store double 0.0, double addrspace(1)* %a.idx.1, align 8 + + %add = fadd double %ld.c, %ld.c.idx.1 + store double %add, double addrspace(1)* %b + + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll new file mode 100644 index 00000000000..15c47716aaf --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll @@ -0,0 +1,29 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +; CHECK-LABEL: @interleave +; CHECK: load <2 x double>, <2 x double> addrspace(1)* %{{.}}, align 8{{$}} +; CHECK: store <2 x double> zeroinitializer +; CHECK: store double %add +define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 { +entry: + %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 + %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1 + + %ld.c = load double, double addrspace(1)* %c, align 8 + store double 0.0, double addrspace(1)* %a, align 8 ; Cannot alias invariant load + + %ld.c.idx.1 = load double, double addrspace(1)* %c.idx.1, align 8, !invariant.load !0 + store double 0.0, double addrspace(1)* %a.idx.1, align 8 + + %add = fadd double %ld.c, %ld.c.idx.1 + store double %add, double addrspace(1)* %b + + ret void +} + +attributes #0 = { nounwind } + +!0 = !{} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg new file mode 100644 index 00000000000..6baccf05fff --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True + diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll new file mode 100644 index 00000000000..4292cbcec85 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll @@ -0,0 +1,223 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-ALIGNED,ALIGNED,ALL %s +; RUN: opt 
-mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-ALIGNED,ALIGNED,ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-ALIGNED,ALIGNED,ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-UNALIGNED,UNALIGNED,ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32 +; ELT4-ALIGNED: store i32 +; ELT4-ALIGNED: store i32 +; ELT4-ALIGNED: store i32 +; ELT4-ALIGNED: store i32 + +; ELT8: store <2 x i32> +; ELT8: store <2 x i32> + +; ELT16-UNALIGNED: store <4 x i32> +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3 + + store i32 9, i32 addrspace(5)* %out + store i32 1, i32 addrspace(5)* %out.gep.1 + store i32 23, i32 addrspace(5)* %out.gep.2 + store i32 19, i32 addrspace(5)* %out.gep.3 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align1( +; ALIGNED: store i32 9, i32 addrspace(5)* %out, align 1 +; ALIGNED: store i32 1, i32 addrspace(5)* %out.gep.1, align 1 +; ALIGNED: store i32 23, i32 addrspace(5)* %out.gep.2, align 1 +; ALIGNED: store i32 19, i32 addrspace(5)* %out.gep.3, align 1 + +; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1 + +; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32> addrspace(5)* %1, align 1 +; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32> addrspace(5)* %2, align 1 + +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3 + + store i32 9, i32 addrspace(5)* %out, align 1 + store i32 1, i32 addrspace(5)* %out.gep.1, align 1 + store i32 23, i32 addrspace(5)* %out.gep.2, align 1 + store i32 19, i32 addrspace(5)* %out.gep.3, align 1 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2( +; ALIGNED: store i32 9, i32 addrspace(5)* %out, align 2 +; ALIGNED: store i32 1, i32 addrspace(5)* %out.gep.1, align 2 +; ALIGNED: store i32 23, i32 addrspace(5)* %out.gep.2, align 2 +; ALIGNED: store i32 19, i32 addrspace(5)* %out.gep.3, align 2 + +; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 
19>, <4 x i32> addrspace(5)* %1, align 2 + +; ELT8-UNALIGNED: store <2 x i32> +; ELT8-UNALIGNED: store <2 x i32> + +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3 + + store i32 9, i32 addrspace(5)* %out, align 2 + store i32 1, i32 addrspace(5)* %out.gep.1, align 2 + store i32 23, i32 addrspace(5)* %out.gep.2, align 2 + store i32 19, i32 addrspace(5)* %out.gep.3, align 2 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8( +; ALL: store <4 x i8> +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3 + + store i8 9, i8 addrspace(5)* %out, align 4 + store i8 1, i8 addrspace(5)* %out.gep.1 + store i8 23, i8 addrspace(5)* %out.gep.2 + store i8 19, i8 addrspace(5)* %out.gep.3 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8_align1( +; ALIGNED: store i8 +; ALIGNED: store i8 +; ALIGNED: store i8 +; ALIGNED: store i8 + +; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3 + + store i8 9, i8 addrspace(5)* %out, align 1 + store i8 1, i8 addrspace(5)* %out.gep.1, align 1 + store i8 23, i8 addrspace(5)* %out.gep.2, align 1 + store i8 19, i8 addrspace(5)* %out.gep.3, align 1 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16( +; ALL: store <2 x i16> +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1 + + store i16 9, i16 addrspace(5)* %out, align 4 + store i16 12, i16 addrspace(5)* %out.gep.1 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2( +; ALIGNED: store i16 +; ALIGNED: store i16 + +; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 2 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1 + + store i16 9, i16 addrspace(5)* %out, align 2 + store i16 12, i16 addrspace(5)* %out.gep.1, align 2 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align1( +; ALIGNED: store i16 +; ALIGNED: store i16 + +; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 1 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1 + + store i16 9, i16 addrspace(5)* %out, align 1 + store i16 12, i16 addrspace(5)* %out.gep.1, align 1 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8( +; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16> 
addrspace(5)* %1, align 8 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1 + + store i16 9, i16 addrspace(5)* %out, align 8 + store i16 12, i16 addrspace(5)* %out.gep.1, align 2 + ret void +} + +; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32 +; ELT4: store i32 +; ELT4: store i32 +; ELT4: store i32 + +; ELT8: store <2 x i32> +; ELT8: store i32 + +; ELT16: store <3 x i32> +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 + + store i32 9, i32 addrspace(5)* %out + store i32 1, i32 addrspace(5)* %out.gep.1 + store i32 23, i32 addrspace(5)* %out.gep.2 + ret void +} + +; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32_align1( +; ALIGNED: store i32 +; ALIGNED: store i32 +; ALIGNED: store i32 + +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 +; ELT4-UNALIGNED: store i32 + +; ELT8-UNALIGNED: store <2 x i32> +; ELT8-UNALIGNED: store i32 + +; ELT16-UNALIGNED: store <3 x i32> +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 + + store i32 9, i32 addrspace(5)* %out, align 1 + store i32 1, i32 addrspace(5)* %out.gep.1, align 1 + store i32 23, i32 addrspace(5)* %out.gep.2, align 1 + ret void +} + +; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i8_align1( +; ALIGNED: store i8 +; ALIGNED: store i8 +; ALIGNED: store i8 + +; UNALIGNED: store <3 x i8> +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8 addrspace(5)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2 + + store i8 9, i8 addrspace(5)* %out, align 1 + store i8 1, i8 addrspace(5)* %out.gep.1, align 1 + store i8 23, i8 addrspace(5)* %out.gep.2, align 1 + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll new file mode 100644 index 00000000000..0d9a4184e71 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll @@ -0,0 +1,657 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s +; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +; TODO: Vector element tests +; TODO: Non-zero base offset for load and store combinations +; TODO: Same base addrspacecasted + + +; CHECK-LABEL: @merge_global_store_2_constants_i8( +; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* %{{[0-9]+}}, align 2 +define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 + + store i8 123, i8 addrspace(1)* %out.gep.1 + store i8 456, i8 addrspace(1)* %out, align 2 + ret void +} + +; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align +; CHECK: store <2 x i8> +define amdgpu_kernel void 
@merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 + + store i8 123, i8 addrspace(1)* %out.gep.1 + store i8 456, i8 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_global_store_2_constants_i16 +; CHECK: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + + store i16 123, i16 addrspace(1)* %out.gep.1 + store i16 456, i16 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: @merge_global_store_2_constants_0_i16 +; CHECK: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + + store i16 0, i16 addrspace(1)* %out.gep.1 + store i16 0, i16 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align +; CHECK: store <2 x i16> +define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + + store i16 123, i16 addrspace(1)* %out.gep.1 + store i16 456, i16 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align +; CHECK: store <2 x half> +define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 + + store half 2.0, half addrspace(1)* %out.gep.1 + store half 1.0, half addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_global_store_2_constants_i32 +; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_global_store_2_constants_i32_f32 +; CHECK: store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* + store float 1.0, float addrspace(1)* %out.gep.1.bc + store i32 456, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_global_store_2_constants_f32_i32 +; CHECK store <2 x float> <float 4.000000e+00, float 0x370EC00000000000>, <2 x float> addrspace(1)* %{{[0-9]+$}} +define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* + store i32 123, i32 addrspace(1)* %out.gep.1.bc + store float 4.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_global_store_4_constants_i32 +; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + 
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out.gep.2 + store i32 333, i32 addrspace(1)* %out.gep.3 + store i32 1234, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_global_store_4_constants_f32_order +; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}} +define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + + store float 8.0, float addrspace(1)* %out + store float 1.0, float addrspace(1)* %out.gep.1 + store float 2.0, float addrspace(1)* %out.gep.2 + store float 4.0, float addrspace(1)* %out.gep.3 + ret void +} + +; First store is out of order. +; CHECK-LABEL: @merge_global_store_4_constants_f32 +; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + + store float 1.0, float addrspace(1)* %out.gep.1 + store float 2.0, float addrspace(1)* %out.gep.2 + store float 4.0, float addrspace(1)* %out.gep.3 + store float 8.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32 +; CHECK: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + + %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* + %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)* + + store i32 11, i32 addrspace(1)* %out.gep.1.bc + store float 2.0, float addrspace(1)* %out.gep.2 + store i32 17, i32 addrspace(1)* %out.gep.3.bc + store float 8.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_global_store_3_constants_i32 +; CHECK: store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out.gep.2 + store i32 1234, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_global_store_2_constants_i64 +; CHECK: store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 +define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 + + store i64 123, i64 addrspace(1)* %out.gep.1 + store i64 456, i64 
addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_global_store_4_constants_i64 +; CHECK: store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 +; CHECK: store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 +define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 + %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 + %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 + + store i64 123, i64 addrspace(1)* %out.gep.1 + store i64 456, i64 addrspace(1)* %out.gep.2 + store i64 333, i64 addrspace(1)* %out.gep.3 + store i64 1234, i64 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32 +; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32> +; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0 +; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT0]], i32 0 +; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT1]], i32 1 +; CHECK: store <2 x i32> [[INSERT1]] +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + + %lo = load i32, i32 addrspace(1)* %in + %hi = load i32, i32 addrspace(1)* %in.gep.1 + + store i32 %lo, i32 addrspace(1)* %out + store i32 %hi, i32 addrspace(1)* %out.gep.1 + ret void +} + +; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32_nonzero_base +; CHECK: extractelement +; CHECK: extractelement +; CHECK: insertelement +; CHECK: insertelement +; CHECK: store <2 x i32> +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %lo = load i32, i32 addrspace(1)* %in.gep.0 + %hi = load i32, i32 addrspace(1)* %in.gep.1 + + store i32 %lo, i32 addrspace(1)* %out.gep.0 + store i32 %hi, i32 addrspace(1)* %out.gep.1 + ret void +} + +; CHECK-LABEL: @merge_global_store_2_adjacent_loads_shuffle_i32 +; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32> +; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0 +; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT1]], i32 0 +; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT0]], i32 1 +; CHECK: store <2 x i32> [[INSERT1]] +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + + %lo = load i32, i32 addrspace(1)* %in + %hi = load i32, i32 addrspace(1)* %in.gep.1 + + store i32 %hi, i32 addrspace(1)* %out + store i32 %lo, i32 addrspace(1)* %out.gep.1 + ret void +} + +; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32 +; CHECK: load <4 x i32> +; CHECK: store <4 x i32> +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + 
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + store i32 %x, i32 addrspace(1)* %out + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %w, i32 addrspace(1)* %out.gep.3 + ret void +} + +; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32 +; CHECK: load <3 x i32> +; CHECK: store <3 x i32> +define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + + store i32 %x, i32 addrspace(1)* %out + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + ret void +} + +; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32 +; CHECK: load <4 x float> +; CHECK: store <4 x float> +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3 + + %x = load float, float addrspace(1)* %in + %y = load float, float addrspace(1)* %in.gep.1 + %z = load float, float addrspace(1)* %in.gep.2 + %w = load float, float addrspace(1)* %in.gep.3 + + store float %x, float addrspace(1)* %out + store float %y, float addrspace(1)* %out.gep.1 + store float %z, float addrspace(1)* %out.gep.2 + store float %w, float addrspace(1)* %out.gep.3 + ret void +} + +; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base +; CHECK: load <4 x i32> +; CHECK: store <4 x i32> +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14 + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10 + + %x = load i32, i32 addrspace(1)* %in.gep.0 + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + store i32 %x, i32 addrspace(1)* %out.gep.0 + store i32 %y, i32 
addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %w, i32 addrspace(1)* %out.gep.3 + ret void +} + +; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32 +; CHECK: load <4 x i32> +; CHECK: store <4 x i32> +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + ; Make sure the barrier doesn't stop this + tail call void @llvm.amdgcn.s.barrier() #1 + + store i32 %w, i32 addrspace(1)* %out.gep.3 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %x, i32 addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32 +; CHECK: load <4 x i32> +; CHECK: store <4 x i32> +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + ; Make sure the barrier doesn't stop this + tail call void @llvm.amdgcn.s.barrier() #1 + + store i32 %w, i32 addrspace(1)* %out + store i32 %z, i32 addrspace(1)* %out.gep.1 + store i32 %y, i32 addrspace(1)* %out.gep.2 + store i32 %x, i32 addrspace(1)* %out.gep.3 + + ret void +} + +; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8 +; CHECK: load <4 x i8> +; CHECK: extractelement <4 x i8> +; CHECK: extractelement <4 x i8> +; CHECK: extractelement <4 x i8> +; CHECK: extractelement <4 x i8> +; CHECK: insertelement <4 x i8> +; CHECK: insertelement <4 x i8> +; CHECK: insertelement <4 x i8> +; CHECK: insertelement <4 x i8> +; CHECK: store <4 x i8> +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 + %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 + %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 + %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 + + %x = load i8, i8 addrspace(1)* %in, align 4 + %y = load i8, i8 addrspace(1)* %in.gep.1 + %z = load i8, i8 addrspace(1)* %in.gep.2 + %w = load i8, i8 addrspace(1)* %in.gep.3 + + store i8 %x, i8 addrspace(1)* %out, align 4 + store i8 %y, i8 addrspace(1)* %out.gep.1 + store i8 %z, i8 addrspace(1)* %out.gep.2 + store i8 %w, i8 addrspace(1)* %out.gep.3 + ret void +} + +; CHECK-LABEL: 
@merge_global_store_4_adjacent_loads_i8_natural_align +; CHECK: load <4 x i8> +; CHECK: store <4 x i8> +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 + %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 + %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 + %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 + + %x = load i8, i8 addrspace(1)* %in + %y = load i8, i8 addrspace(1)* %in.gep.1 + %z = load i8, i8 addrspace(1)* %in.gep.2 + %w = load i8, i8 addrspace(1)* %in.gep.3 + + store i8 %x, i8 addrspace(1)* %out + store i8 %y, i8 addrspace(1)* %out.gep.1 + store i8 %z, i8 addrspace(1)* %out.gep.2 + store i8 %w, i8 addrspace(1)* %out.gep.3 + ret void +} + +; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32 +; CHECK: load <4 x i32> +; CHECK: store <4 x i32> +define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in + + %x = extractelement <4 x i32> %vec, i32 0 + %y = extractelement <4 x i32> %vec, i32 1 + %z = extractelement <4 x i32> %vec, i32 2 + %w = extractelement <4 x i32> %vec, i32 3 + + store i32 %x, i32 addrspace(1)* %out + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %w, i32 addrspace(1)* %out.gep.3 + ret void +} + +; CHECK-LABEL: @merge_local_store_2_constants_i8 +; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* %{{[0-9]+}}, align 2 +define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 + + store i8 123, i8 addrspace(3)* %out.gep.1 + store i8 456, i8 addrspace(3)* %out, align 2 + ret void +} + +; CHECK-LABEL: @merge_local_store_2_constants_i32 +; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 + + store i32 123, i32 addrspace(3)* %out.gep.1 + store i32 456, i32 addrspace(3)* %out + ret void +} + +; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2 +; CHECK: store i32 +; CHECK: store i32 +define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 + + store i32 123, i32 addrspace(3)* %out.gep.1, align 2 + store i32 456, i32 addrspace(3)* %out, align 2 + ret void +} + +; CHECK-LABEL: @merge_local_store_4_constants_i32 +; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(3)* +define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 + + store i32 123, i32 addrspace(3)* %out.gep.1 + store i32 456, i32 addrspace(3)* %out.gep.2 + store i32 333, i32 addrspace(3)* %out.gep.3 + 
store i32 1234, i32 addrspace(3)* %out + ret void +} + +; CHECK-LABEL: @merge_global_store_5_constants_i32 +; CHECK: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 +; CHECK: store i32 +define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { + store i32 9, i32 addrspace(1)* %out, align 4 + %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 12, i32 addrspace(1)* %idx1, align 4 + %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 + store i32 16, i32 addrspace(1)* %idx2, align 4 + %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 + store i32 -12, i32 addrspace(1)* %idx3, align 4 + %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4 + store i32 11, i32 addrspace(1)* %idx4, align 4 + ret void +} + +; CHECK-LABEL: @merge_global_store_6_constants_i32 +; CHECK: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 +; CHECK: store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { + store i32 13, i32 addrspace(1)* %out, align 4 + %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 15, i32 addrspace(1)* %idx1, align 4 + %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 + store i32 62, i32 addrspace(1)* %idx2, align 4 + %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 + store i32 63, i32 addrspace(1)* %idx3, align 4 + %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4 + store i32 11, i32 addrspace(1)* %idx4, align 4 + %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5 + store i32 123, i32 addrspace(1)* %idx5, align 4 + ret void +} + +; CHECK-LABEL: @merge_global_store_7_constants_i32 +; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 +; CHECK: store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { + store i32 34, i32 addrspace(1)* %out, align 4 + %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 999, i32 addrspace(1)* %idx1, align 4 + %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 + store i32 65, i32 addrspace(1)* %idx2, align 4 + %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 + store i32 33, i32 addrspace(1)* %idx3, align 4 + %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4 + store i32 98, i32 addrspace(1)* %idx4, align 4 + %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5 + store i32 91, i32 addrspace(1)* %idx5, align 4 + %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6 + store i32 212, i32 addrspace(1)* %idx6, align 4 + ret void +} + +; CHECK-LABEL: @merge_global_store_8_constants_i32 +; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 +; CHECK: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { + store i32 34, i32 addrspace(1)* %out, align 4 + %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 999, i32 addrspace(1)* %idx1, align 4 + %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 + store i32 
65, i32 addrspace(1)* %idx2, align 4 + %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 + store i32 33, i32 addrspace(1)* %idx3, align 4 + %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4 + store i32 98, i32 addrspace(1)* %idx4, align 4 + %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5 + store i32 91, i32 addrspace(1)* %idx5, align 4 + %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6 + store i32 212, i32 addrspace(1)* %idx6, align 4 + %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7 + store i32 999, i32 addrspace(1)* %idx7, align 4 + ret void +} + +; CHECK-LABEL: @copy_v3i32_align4 +; CHECK: %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 +; CHECK: store <3 x i32> %vec, <3 x i32> addrspace(1)* %out +define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { + %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 + store <3 x i32> %vec, <3 x i32> addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @copy_v3i64_align4 +; CHECK: %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 +; CHECK: store <3 x i64> %vec, <3 x i64> addrspace(1)* %out +define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { + %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 + store <3 x i64> %vec, <3 x i64> addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @copy_v3f32_align4 +; CHECK: %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 +; CHECK: store <3 x float> +define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { + %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 + %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0> + store <3 x float> %fadd, <3 x float> addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @copy_v3f64_align4 +; CHECK: %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4 +; CHECK: store <3 x double> %fadd, <3 x double> addrspace(1)* %out +define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { + %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4 + %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0> + store <3 x double> %fadd, <3 x double> addrspace(1)* %out + ret void +} + +; Verify that we no longer hit asserts for this test case. No change expected. 
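+; (Note, not from the original test: @copy_vec_of_ptrs below loads and stores
+; <2 x i16*> values; per the CHECK lines that follow, the IR is expected to
+; pass through unchanged, i.e. no wider merge is formed for the pointer vectors.)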
+; CHECK-LABEL: @copy_vec_of_ptrs +; CHECK: %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1 +; CHECK: %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1 +; CHECK: %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4 +; CHECK: %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1 +; CHECK: store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1 +; CHECK: store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4 +define amdgpu_kernel void @copy_vec_of_ptrs(<2 x i16*> addrspace(1)* %out, + <2 x i16*> addrspace(1)* %in ) #0 { + %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1 + %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1 + %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4 + + %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1 + store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1 + store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4 + ret void +} + +declare void @llvm.amdgcn.s.barrier() #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll new file mode 100644 index 00000000000..bcf2265f310 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll @@ -0,0 +1,91 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +; CHECK-LABEL: @merge_v2i32_v2i32( +; CHECK: load <4 x i32> +; CHECK: store <4 x i32> zeroinitializer +define amdgpu_kernel void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 { +entry: + %a.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 1 + %b.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 1 + + %ld.c = load <2 x i32>, <2 x i32> addrspace(1)* %b, align 4 + %ld.c.idx.1 = load <2 x i32>, <2 x i32> addrspace(1)* %b.1, align 4 + + store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %a, align 4 + store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %a.1, align 4 + + ret void +} + +; CHECK-LABEL: @merge_v1i32_v1i32( +; CHECK: load <2 x i32> +; CHECK: store <2 x i32> zeroinitializer +define amdgpu_kernel void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 { +entry: + %a.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %a, i64 1 + %b.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %b, i64 1 + + %ld.c = load <1 x i32>, <1 x i32> addrspace(1)* %b, align 4 + %ld.c.idx.1 = load <1 x i32>, <1 x i32> addrspace(1)* %b.1, align 4 + + store <1 x i32> zeroinitializer, <1 x i32> addrspace(1)* %a, align 4 + store <1 x i32> zeroinitializer, <1 x i32> addrspace(1)* %a.1, align 4 + + ret void +} + +; CHECK-LABEL: @no_merge_v3i32_v3i32( +; CHECK: load <3 x i32> +; CHECK: load <3 x i32> +; CHECK: store <3 x i32> zeroinitializer +; CHECK: store <3 x i32> zeroinitializer +define amdgpu_kernel void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 { +entry: + %a.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a, i64 1 + %b.1 = 
getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b, i64 1 + + %ld.c = load <3 x i32>, <3 x i32> addrspace(1)* %b, align 4 + %ld.c.idx.1 = load <3 x i32>, <3 x i32> addrspace(1)* %b.1, align 4 + + store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %a, align 4 + store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %a.1, align 4 + + ret void +} + +; CHECK-LABEL: @merge_v2i16_v2i16( +; CHECK: load <4 x i16> +; CHECK: store <4 x i16> zeroinitializer +define amdgpu_kernel void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 { +entry: + %a.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a, i64 1 + %b.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b, i64 1 + + %ld.c = load <2 x i16>, <2 x i16> addrspace(1)* %b, align 4 + %ld.c.idx.1 = load <2 x i16>, <2 x i16> addrspace(1)* %b.1, align 4 + + store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %a, align 4 + store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %a.1, align 4 + + ret void +} + +; Ideally this would be merged +; CHECK-LABEL: @merge_load_i32_v2i16( +; CHECK: load i32, +; CHECK: load <2 x i16> +define amdgpu_kernel void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 { +entry: + %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 1 + %a.1.cast = bitcast i32 addrspace(1)* %a.1 to <2 x i16> addrspace(1)* + + %ld.0 = load i32, i32 addrspace(1)* %a + %ld.1 = load <2 x i16>, <2 x i16> addrspace(1)* %a.1.cast + + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll new file mode 100644 index 00000000000..ff718c1b101 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll @@ -0,0 +1,32 @@ +; RUN: opt -mtriple=amdgcn-- -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +@lds = internal addrspace(3) global [512 x float] undef, align 4 + +; The original load has an implicit alignment of 4, and should not +; increase to an align 8 load. 
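+; (Editorial note: the loads below carry no explicit align, so they keep the
+; ABI alignment of 4 for float implied by the datalayout and the align 4 on
+; @lds; the CHECK lines therefore require the merged <2 x float> accesses to
+; stay at align 4 rather than being promoted to the vector type's natural
+; align 8.)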
+ +; CHECK-LABEL: @load_keep_base_alignment_missing_align( +; CHECK: load <2 x float>, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) { + %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11 + %val0 = load float, float addrspace(3)* %ptr0 + + %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12 + %val1 = load float, float addrspace(3)* %ptr1 + %add = fadd float %val0, %val1 + store float %add, float addrspace(1)* %out + ret void +} + + +; CHECK-LABEL: @store_keep_base_alignment_missing_align( +; CHECK: store <2 x float> zeroinitializer, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4 +define amdgpu_kernel void @store_keep_base_alignment_missing_align() { + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2 + store float 0.0, float addrspace(3)* %arrayidx0 + store float 0.0, float addrspace(3)* %arrayidx1 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll new file mode 100644 index 00000000000..ffd651b2c65 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll @@ -0,0 +1,63 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +; Checks that there is no crash when there are multiple tails +; for the same head starting a chain. 
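+; (Editorial note: in @no_crash below, the store to %tmp3 heads a chain and
+; each of the three stores to the adjacent %tmp5 is a candidate tail for it;
+; per the CHECK lines, only one pair is merged into the <2 x i32> store while
+; the two duplicate stores to %tmp5 remain scalar.)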
+@0 = internal addrspace(3) global [16384 x i32] undef + +; CHECK-LABEL: @no_crash( +; CHECK: store <2 x i32> zeroinitializer +; CHECK: store i32 0 +; CHECK: store i32 0 + +define amdgpu_kernel void @no_crash(i32 %arg) { + %tmp2 = add i32 %arg, 14 + %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2 + %tmp4 = add i32 %arg, 15 + %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp4 + + store i32 0, i32 addrspace(3)* %tmp3, align 4 + store i32 0, i32 addrspace(3)* %tmp5, align 4 + store i32 0, i32 addrspace(3)* %tmp5, align 4 + store i32 0, i32 addrspace(3)* %tmp5, align 4 + + ret void +} + +; Check that adjacent memory locations are properly matched and the +; longest chain is vectorized + +; CHECK-LABEL: @interleave_get_longest +; CHECK: load <4 x i32> +; CHECK: load i32 +; CHECK: store <2 x i32> zeroinitializer +; CHECK: load i32 +; CHECK: load i32 +; CHECK: load i32 + +define amdgpu_kernel void @interleave_get_longest(i32 %arg) { + %a1 = add i32 %arg, 1 + %a2 = add i32 %arg, 2 + %a3 = add i32 %arg, 3 + %a4 = add i32 %arg, 4 + %tmp1 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %arg + %tmp2 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a1 + %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a2 + %tmp4 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a3 + %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a4 + + %l1 = load i32, i32 addrspace(3)* %tmp2, align 4 + %l2 = load i32, i32 addrspace(3)* %tmp1, align 4 + store i32 0, i32 addrspace(3)* %tmp2, align 4 + store i32 0, i32 addrspace(3)* %tmp1, align 4 + %l3 = load i32, i32 addrspace(3)* %tmp2, align 4 + %l4 = load i32, i32 addrspace(3)* %tmp3, align 4 + %l5 = load i32, i32 addrspace(3)* %tmp4, align 4 + %l6 = load i32, i32 addrspace(3)* %tmp5, align 4 + %l7 = load i32, i32 addrspace(3)* %tmp5, align 4 + %l8 = load i32, i32 addrspace(3)* %tmp5, align 4 + + ret void +} + diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll new file mode 100644 index 00000000000..86f6b6d55ec --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll @@ -0,0 +1,22 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +; CHECK-LABEL: @no_implicit_float( +; CHECK: store i32 +; CHECK: store i32 +; CHECK: store i32 +; CHECK: store i32 +define amdgpu_kernel void @no_implicit_float(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out.gep.2 + store i32 333, i32 addrspace(1)* %out.gep.3 + store i32 1234, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind noimplicitfloat } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll new file mode 100644 index 00000000000..8a2abe50a5a --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll @@ -0,0 
+1,24 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +; CHECK-LABEL: @optnone( +; CHECK: store i32 +; CHECK: store i32 +define amdgpu_kernel void @optnone(i32 addrspace(1)* %out) noinline optnone { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @do_opt( +; CHECK: store <2 x i32> +define amdgpu_kernel void @do_opt(i32 addrspace(1)* %out) { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll new file mode 100644 index 00000000000..9290749bb89 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll @@ -0,0 +1,311 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +; CHECK-LABEL: @merge_v2p1i8( +; CHECK: load <2 x i64> +; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)* +; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)* +; CHECK: store <2 x i64> zeroinitializer +define amdgpu_kernel void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 { +entry: + %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 + %b.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, i64 1 + + %ld.c = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, align 4 + %ld.c.idx.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b.1, align 4 + + store i8 addrspace(1)* null, i8 addrspace(1)* addrspace(1)* %a, align 4 + store i8 addrspace(1)* null, i8 addrspace(1)* addrspace(1)* %a.1, align 4 + + ret void +} + +; CHECK-LABEL: @merge_v2p3i8( +; CHECK: load <2 x i32> +; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* +; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* +; CHECK: store <2 x i32> zeroinitializer +define amdgpu_kernel void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 { +entry: + %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i64 1 + %b.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, i64 1 + + %ld.c = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, align 4 + %ld.c.idx.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b.1, align 4 + + store i8 addrspace(3)* null, i8 addrspace(3)* addrspace(3)* %a, align 4 + store i8 addrspace(3)* null, i8 addrspace(3)* addrspace(3)* %a.1, align 4 + + ret void +} + +; CHECK-LABEL: @merge_load_i64_ptr64( +; CHECK: load <2 x i64> +; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 +; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)* +define amdgpu_kernel void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 { 
+entry: + %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 + %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)* + + %ld.0 = load i64, i64 addrspace(1)* %a + %ld.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.1.cast + + ret void +} + +; CHECK-LABEL: @merge_load_ptr64_i64( +; CHECK: load <2 x i64> +; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0 +; CHECK: inttoptr i64 [[ELT0]] to i8 addrspace(1)* +define amdgpu_kernel void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 { +entry: + %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* + %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 + + %ld.0 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.cast + %ld.1 = load i64, i64 addrspace(1)* %a.1 + + ret void +} + +; CHECK-LABEL: @merge_store_ptr64_i64( +; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64 +; CHECK: insertelement <2 x i64> undef, i64 [[ELT0]], i32 0 +; CHECK: store <2 x i64> +define amdgpu_kernel void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 { +entry: + %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* + %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 + + + store i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* %a.cast + store i64 %val1, i64 addrspace(1)* %a.1 + + ret void +} + +; CHECK-LABEL: @merge_store_i64_ptr64( +; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64 +; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1]], i32 1 +; CHECK: store <2 x i64> +define amdgpu_kernel void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 { +entry: + %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 + %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to i64 addrspace(1)* + + store i64 %val0, i64 addrspace(1)* %a.cast + store i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* %a.1 + + ret void +} + +; CHECK-LABEL: @merge_load_i32_ptr32( +; CHECK: load <2 x i32> +; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1 +; CHECK: inttoptr i32 [[ELT1]] to i8 addrspace(3)* +define amdgpu_kernel void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 { +entry: + %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 + %a.1.cast = bitcast i32 addrspace(3)* %a.1 to i8 addrspace(3)* addrspace(3)* + + %ld.0 = load i32, i32 addrspace(3)* %a + %ld.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a.1.cast + + ret void +} + +; CHECK-LABEL: @merge_load_ptr32_i32( +; CHECK: load <2 x i32> +; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0 +; CHECK: inttoptr i32 [[ELT0]] to i8 addrspace(3)* +define amdgpu_kernel void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 { +entry: + %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)* + %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 + + %ld.0 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a.cast + %ld.1 = load i32, i32 addrspace(3)* %a.1 + + ret void +} + +; CHECK-LABEL: @merge_store_ptr32_i32( +; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr0 to i32 +; CHECK: insertelement <2 x i32> undef, i32 [[ELT0]], i32 0 +; CHECK: store <2 x i32> +define amdgpu_kernel void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 { +entry: + %a.cast = bitcast i32 
addrspace(3)* %a to i8 addrspace(3)* addrspace(3)* + %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 + + store i8 addrspace(3)* %ptr0, i8 addrspace(3)* addrspace(3)* %a.cast + store i32 %val1, i32 addrspace(3)* %a.1 + + ret void +} + +; CHECK-LABEL: @merge_store_i32_ptr32( +; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr1 to i32 +; CHECK: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1 +; CHECK: store <2 x i32> +define amdgpu_kernel void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 { +entry: + %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i32 1 + %a.cast = bitcast i8 addrspace(3)* addrspace(3)* %a to i32 addrspace(3)* + + store i32 %val0, i32 addrspace(3)* %a.cast + store i8 addrspace(3)* %ptr1, i8 addrspace(3)* addrspace(3)* %a.1 + + ret void +} + +; CHECK-LABEL: @no_merge_store_ptr32_i64( +; CHECK: store i8 addrspace(3)* +; CHECK: store i64 +define amdgpu_kernel void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 { +entry: + %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)* + %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 + + + store i8 addrspace(3)* %ptr0, i8 addrspace(3)* addrspace(1)* %a.cast + store i64 %val1, i64 addrspace(1)* %a.1 + + ret void +} + +; CHECK-LABEL: @no_merge_store_i64_ptr32( +; CHECK: store i64 +; CHECK: store i8 addrspace(3)* +define amdgpu_kernel void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 { +entry: + %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a, i64 1 + %a.cast = bitcast i8 addrspace(3)* addrspace(1)* %a to i64 addrspace(1)* + + store i64 %val0, i64 addrspace(1)* %a.cast + store i8 addrspace(3)* %ptr1, i8 addrspace(3)* addrspace(1)* %a.1 + + ret void +} + +; CHECK-LABEL: @no_merge_load_i64_ptr32( +; CHECK: load i64, +; CHECK: load i8 addrspace(3)*, +define amdgpu_kernel void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 { +entry: + %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 + %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(3)* addrspace(1)* + + %ld.0 = load i64, i64 addrspace(1)* %a + %ld.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a.1.cast + + ret void +} + +; CHECK-LABEL: @no_merge_load_ptr32_i64( +; CHECK: load i8 addrspace(3)*, +; CHECK: load i64, +define amdgpu_kernel void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 { +entry: + %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)* + %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 + + %ld.0 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a.cast + %ld.1 = load i64, i64 addrspace(1)* %a.1 + + ret void +} + +; XXX - This isn't merged for some reason +; CHECK-LABEL: @merge_v2p1i8_v2p1i8( +; CHECK: load <2 x i8 addrspace(1)*> +; CHECK: load <2 x i8 addrspace(1)*> +; CHECK: store <2 x i8 addrspace(1)*> +; CHECK: store <2 x i8 addrspace(1)*> +define amdgpu_kernel void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 { +entry: + %a.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %a, i64 1 + %b.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, i64 1 + + %ld.c = load <2 x i8 addrspace(1)*>, <2 x i8 
addrspace(1)*> addrspace(1)* %b, align 4 + %ld.c.idx.1 = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b.1, align 4 + + store <2 x i8 addrspace(1)*> zeroinitializer, <2 x i8 addrspace(1)*> addrspace(1)* %a, align 4 + store <2 x i8 addrspace(1)*> zeroinitializer, <2 x i8 addrspace(1)*> addrspace(1)* %a.1, align 4 + ret void +} + +; CHECK-LABEL: @merge_load_ptr64_f64( +; CHECK: load <2 x i64> +; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0 +; CHECK: [[ELT0_INT:%[^ ]+]] = inttoptr i64 [[ELT0]] to i8 addrspace(1)* +; CHECK: [[ELT1_INT:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 +; CHECK: bitcast i64 [[ELT1_INT]] to double +define amdgpu_kernel void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 { +entry: + %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* + %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 + + %ld.0 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.cast + %ld.1 = load double, double addrspace(1)* %a.1 + + ret void +} + +; CHECK-LABEL: @merge_load_f64_ptr64( +; CHECK: load <2 x i64> +; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0 +; CHECK: bitcast i64 [[ELT0]] to double +; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 +; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)* +define amdgpu_kernel void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 { +entry: + %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 + %a.1.cast = bitcast double addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)* + + %ld.0 = load double, double addrspace(1)* %a + %ld.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.1.cast + + ret void +} + +; CHECK-LABEL: @merge_store_ptr64_f64( +; CHECK: [[ELT0_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64 +; CHECK: insertelement <2 x i64> undef, i64 [[ELT0_INT]], i32 0 +; CHECK: [[ELT1_INT:%[^ ]+]] = bitcast double %val1 to i64 +; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1 +; CHECK: store <2 x i64> +define amdgpu_kernel void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 { +entry: + %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* + %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 + + store i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* %a.cast + store double %val1, double addrspace(1)* %a.1 + + ret void +} + +; CHECK-LABEL: @merge_store_f64_ptr64( +; CHECK: [[ELT0_INT:%[^ ]+]] = bitcast double %val0 to i64 +; CHECK: insertelement <2 x i64> undef, i64 [[ELT0_INT]], i32 0 +; CHECK: [[ELT1_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64 +; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1 +; CHECK: store <2 x i64> +define amdgpu_kernel void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 { +entry: + %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 + %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to double addrspace(1)* + + store double %val0, double addrspace(1)* %a.cast + store i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* %a.1 + + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll new file mode 100644 index 00000000000..c020cc71b4a 
--- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll @@ -0,0 +1,95 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -dce -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +define void @base_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, <3 x i32> addrspace(1)* %out) { +; CHECK-LABEL: @base_case +; CHECK: load <3 x i32> +entry: + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 1 + %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 2 + %gep4 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 1 + %gep5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 2 + %selected = select i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b + %selected14 = select i1 %cnd, i32 addrspace(1)* %gep1, i32 addrspace(1)* %gep4 + %selected25 = select i1 %cnd, i32 addrspace(1)* %gep2, i32 addrspace(1)* %gep5 + %val0 = load i32, i32 addrspace(1)* %selected, align 4 + %val1 = load i32, i32 addrspace(1)* %selected14, align 4 + %val2 = load i32, i32 addrspace(1)* %selected25, align 4 + %t0 = insertelement <3 x i32> undef, i32 %val0, i32 0 + %t1 = insertelement <3 x i32> %t0, i32 %val1, i32 1 + %t2 = insertelement <3 x i32> %t1, i32 %val2, i32 2 + store <3 x i32> %t2, <3 x i32> addrspace(1)* %out + ret void +} + +define void @scev_targeting_complex_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %base, <2 x i32> addrspace(1)* %out) { +; CHECK-LABEL: @scev_targeting_complex_case +; CHECK: load <2 x i32> +entry: + %base.x4 = shl i32 %base, 2 + %base.x4.p1 = add i32 %base.x4, 1 + %base.x4.p2 = add i32 %base.x4, 2 + %base.x4.p3 = add i32 %base.x4, 3 + %zext.x4 = zext i32 %base.x4 to i64 + %zext.x4.p1 = zext i32 %base.x4.p1 to i64 + %zext.x4.p2 = zext i32 %base.x4.p2 to i64 + %zext.x4.p3 = zext i32 %base.x4.p3 to i64 + %base.x16 = mul i64 %zext.x4, 4 + %base.x16.p4 = shl i64 %zext.x4.p1, 2 + %base.x16.p8 = shl i64 %zext.x4.p2, 2 + %base.x16.p12 = mul i64 %zext.x4.p3, 4 + %a.pi8 = bitcast i32 addrspace(1)* %a to i8 addrspace(1)* + %b.pi8 = bitcast i32 addrspace(1)* %b to i8 addrspace(1)* + %gep.a.base.x16 = getelementptr inbounds i8, i8 addrspace(1)* %a.pi8, i64 %base.x16 + %gep.b.base.x16.p4 = getelementptr inbounds i8, i8 addrspace(1)* %b.pi8, i64 %base.x16.p4 + %gep.a.base.x16.p8 = getelementptr inbounds i8, i8 addrspace(1)* %a.pi8, i64 %base.x16.p8 + %gep.b.base.x16.p12 = getelementptr inbounds i8, i8 addrspace(1)* %b.pi8, i64 %base.x16.p12 + %a.base.x16 = bitcast i8 addrspace(1)* %gep.a.base.x16 to i32 addrspace(1)* + %b.base.x16.p4 = bitcast i8 addrspace(1)* %gep.b.base.x16.p4 to i32 addrspace(1)* + %selected.base.x16.p0.or.4 = select i1 %cnd, i32 addrspace(1)* %a.base.x16, i32 addrspace(1)* %b.base.x16.p4 + %gep.selected.base.x16.p8.or.12 = select i1 %cnd, i8 addrspace(1)* %gep.a.base.x16.p8, i8 addrspace(1)* %gep.b.base.x16.p12 + %selected.base.x16.p8.or.12 = bitcast i8 addrspace(1)* %gep.selected.base.x16.p8.or.12 to i32 addrspace(1)* + %selected.base.x16.p40.or.44 = getelementptr inbounds i32, i32 addrspace(1)* %selected.base.x16.p0.or.4, i64 10 + %selected.base.x16.p44.or.48 = getelementptr inbounds i32, i32 addrspace(1)* %selected.base.x16.p8.or.12, i64 9 + %val0 = load i32, i32 addrspace(1)* %selected.base.x16.p40.or.44, align 4 + %val1 = load i32, i32 addrspace(1)* %selected.base.x16.p44.or.48, align 4 + %t0 = insertelement <2 x i32> undef, 
i32 %val0, i32 0 + %t1 = insertelement <2 x i32> %t0, i32 %val1, i32 1 + store <2 x i32> %t1, <2 x i32> addrspace(1)* %out + ret void +} + +define void @nested_selects(i1 %cnd0, i1 %cnd1, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %base, <2 x i32> addrspace(1)* %out) { +; CHECK-LABEL: @nested_selects +; CHECK: load <2 x i32> +entry: + %base.p1 = add nsw i32 %base, 1 + %base.p2 = add i32 %base, 2 + %base.p3 = add nsw i32 %base, 3 + %base.x4 = mul i32 %base, 4 + %base.x4.p5 = add i32 %base.x4, 5 + %base.x4.p6 = add i32 %base.x4, 6 + %sext = sext i32 %base to i64 + %sext.p1 = sext i32 %base.p1 to i64 + %sext.p2 = sext i32 %base.p2 to i64 + %sext.p3 = sext i32 %base.p3 to i64 + %sext.x4.p5 = sext i32 %base.x4.p5 to i64 + %sext.x4.p6 = sext i32 %base.x4.p6 to i64 + %gep.a.base = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext + %gep.a.base.p1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p1 + %gep.a.base.p2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p2 + %gep.a.base.p3 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p3 + %gep.b.base.x4.p5 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.x4.p5 + %gep.b.base.x4.p6 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.x4.p6 + %selected.1.L = select i1 %cnd1, i32 addrspace(1)* %gep.a.base.p2, i32 addrspace(1)* %gep.b.base.x4.p5 + %selected.1.R = select i1 %cnd1, i32 addrspace(1)* %gep.a.base.p3, i32 addrspace(1)* %gep.b.base.x4.p6 + %selected.0.L = select i1 %cnd0, i32 addrspace(1)* %gep.a.base, i32 addrspace(1)* %selected.1.L + %selected.0.R = select i1 %cnd0, i32 addrspace(1)* %gep.a.base.p1, i32 addrspace(1)* %selected.1.R + %val0 = load i32, i32 addrspace(1)* %selected.0.L, align 4 + %val1 = load i32, i32 addrspace(1)* %selected.0.R, align 4 + %t0 = insertelement <2 x i32> undef, i32 %val0, i32 0 + %t1 = insertelement <2 x i32> %t0, i32 %val1, i32 1 + store <2 x i32> %t1, <2 x i32> addrspace(1)* %out + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll new file mode 100644 index 00000000000..5ed7ee80ea0 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll @@ -0,0 +1,60 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +; Check that, in the presence of an aliasing load, the stores preceding the +; aliasing load are safe to vectorize. 
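+; (Editorial note: per the CHECK lines below, the four stores to %a issued
+; before the aliasing loads from %b merge into the first <4 x float> store,
+; those loads merge into a <4 x float> load, and the four trailing stores
+; merge into the second <4 x float> store.)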
+ +; CHECK-LABEL: store_vectorize_with_alias +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> + +; Function Attrs: nounwind +define amdgpu_kernel void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 { +bb: + %tmp = bitcast i8 addrspace(1)* %b to float addrspace(1)* + %tmp1 = load float, float addrspace(1)* %tmp, align 4 + + %tmp2 = bitcast i8 addrspace(1)* %a to float addrspace(1)* + store float %tmp1, float addrspace(1)* %tmp2, align 4 + %tmp3 = getelementptr i8, i8 addrspace(1)* %a, i64 4 + %tmp4 = bitcast i8 addrspace(1)* %tmp3 to float addrspace(1)* + store float %tmp1, float addrspace(1)* %tmp4, align 4 + %tmp5 = getelementptr i8, i8 addrspace(1)* %a, i64 8 + %tmp6 = bitcast i8 addrspace(1)* %tmp5 to float addrspace(1)* + store float %tmp1, float addrspace(1)* %tmp6, align 4 + %tmp7 = getelementptr i8, i8 addrspace(1)* %a, i64 12 + %tmp8 = bitcast i8 addrspace(1)* %tmp7 to float addrspace(1)* + store float %tmp1, float addrspace(1)* %tmp8, align 4 + + %tmp9 = getelementptr i8, i8 addrspace(1)* %b, i64 16 + %tmp10 = bitcast i8 addrspace(1)* %tmp9 to float addrspace(1)* + %tmp11 = load float, float addrspace(1)* %tmp10, align 4 + %tmp12 = getelementptr i8, i8 addrspace(1)* %b, i64 20 + %tmp13 = bitcast i8 addrspace(1)* %tmp12 to float addrspace(1)* + %tmp14 = load float, float addrspace(1)* %tmp13, align 4 + %tmp15 = getelementptr i8, i8 addrspace(1)* %b, i64 24 + %tmp16 = bitcast i8 addrspace(1)* %tmp15 to float addrspace(1)* + %tmp17 = load float, float addrspace(1)* %tmp16, align 4 + %tmp18 = getelementptr i8, i8 addrspace(1)* %b, i64 28 + %tmp19 = bitcast i8 addrspace(1)* %tmp18 to float addrspace(1)* + %tmp20 = load float, float addrspace(1)* %tmp19, align 4 + + %tmp21 = getelementptr i8, i8 addrspace(1)* %a, i64 16 + %tmp22 = bitcast i8 addrspace(1)* %tmp21 to float addrspace(1)* + store float %tmp11, float addrspace(1)* %tmp22, align 4 + %tmp23 = getelementptr i8, i8 addrspace(1)* %a, i64 20 + %tmp24 = bitcast i8 addrspace(1)* %tmp23 to float addrspace(1)* + store float %tmp14, float addrspace(1)* %tmp24, align 4 + %tmp25 = getelementptr i8, i8 addrspace(1)* %a, i64 24 + %tmp26 = bitcast i8 addrspace(1)* %tmp25 to float addrspace(1)* + store float %tmp17, float addrspace(1)* %tmp26, align 4 + %tmp27 = getelementptr i8, i8 addrspace(1)* %a, i64 28 + %tmp28 = bitcast i8 addrspace(1)* %tmp27 to float addrspace(1)* + store float %tmp20, float addrspace(1)* %tmp28, align 4 + + ret void +} + +attributes #0 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll new file mode 100644 index 00000000000..65d114478b4 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll @@ -0,0 +1,201 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +; Checks that we don't merge loads/stores of types smaller than one +; byte, or vectors with elements smaller than one byte. 
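+; (Editorial note: the cases below cover i1, i2, and i9 scalars, %struct.foo,
+; and the vectors <2 x i2>, <4 x i2>, and <2 x i9>; because these types are not
+; a whole number of bytes, or have sub-byte elements, the CHECK lines expect
+; every pair of accesses to stay unmerged.)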
+ +%struct.foo = type { i32, i8 } + +declare void @use_i1(i1) +declare void @use_i2(i2) +declare void @use_i8(i8) +declare void @use_foo(%struct.foo) +declare void @use_v2i2(<2 x i2>) +declare void @use_v4i2(<4 x i2>) +declare void @use_v2i9(<2 x i9>) + +; CHECK-LABEL: @merge_store_2_constants_i1( +; CHECK: store i1 +; CHECK: store i1 +define amdgpu_kernel void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1 + store i1 true, i1 addrspace(1)* %out.gep.1 + store i1 false, i1 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_store_2_constants_i2( +; CHECK: store i2 1 +; CHECK: store i2 -1 +define amdgpu_kernel void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1 + store i2 1, i2 addrspace(1)* %out.gep.1 + store i2 -1, i2 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_different_store_sizes_i1_i8( +; CHECK: store i1 true +; CHECK: store i8 123 +define amdgpu_kernel void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 { + %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)* + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 + store i1 true, i1 addrspace(1)* %out.i1 + store i8 123, i8 addrspace(1)* %out.gep.1 + ret void +} + +; CHECK-LABEL: @merge_different_store_sizes_i8_i1( +; CHECK: store i8 123 +; CHECK: store i1 true +define amdgpu_kernel void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 { + %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)* + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1 + store i8 123, i8 addrspace(1)* %out.gep.1 + store i1 true, i1 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_store_2_constant_structs( +; CHECK: store %struct.foo +; CHECK: store %struct.foo +define amdgpu_kernel void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1 + store %struct.foo { i32 12, i8 3 }, %struct.foo addrspace(1)* %out.gep.1 + store %struct.foo { i32 92, i8 9 }, %struct.foo addrspace(1)* %out + ret void +} + +; sub-byte element size +; CHECK-LABEL: @merge_store_2_constants_v2i2( +; CHECK: store <2 x i2> +; CHECK: store <2 x i2> +define amdgpu_kernel void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1 + store <2 x i2> <i2 1, i2 -1>, <2 x i2> addrspace(1)* %out.gep.1 + store <2 x i2> <i2 -1, i2 1>, <2 x i2> addrspace(1)* %out + ret void +} + +; sub-byte element size but byte size + +; CHECK-LABEL: @merge_store_2_constants_v4i2( +; CHECK: store <4 x i2> +; CHECK: store <4 x i2> +define amdgpu_kernel void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1 + store <4 x i2> <i2 1, i2 -1, i2 1, i2 -1>, <4 x i2> addrspace(1)* %out.gep.1 + store <4 x i2> <i2 -1, i2 1, i2 -1, i2 1>, <4 x i2> addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_load_2_constants_i1( +; CHECK: load i1 +; CHECK: load i1 +define amdgpu_kernel void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1 + %x = load i1, i1 addrspace(1)* %out.gep.1 + %y = load i1, i1 addrspace(1)* %out + call void @use_i1(i1 %x) + call void @use_i1(i1 %y) + ret void +} + +; CHECK-LABEL: @merge_load_2_constants_i2( +; CHECK: load i2 +; CHECK: load i2 +define 
amdgpu_kernel void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1 + %x = load i2, i2 addrspace(1)* %out.gep.1 + %y = load i2, i2 addrspace(1)* %out + call void @use_i2(i2 %x) + call void @use_i2(i2 %y) + ret void +} + +; CHECK-LABEL: @merge_different_load_sizes_i1_i8( +; CHECK: load i1 +; CHECK: load i8 +define amdgpu_kernel void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 { + %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)* + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 + %x = load i1, i1 addrspace(1)* %out.i1 + %y = load i8, i8 addrspace(1)* %out.gep.1 + call void @use_i1(i1 %x) + call void @use_i8(i8 %y) + ret void +} + +; CHECK-LABEL: @merge_different_load_sizes_i8_i1( +; CHECK: load i8 +; CHECK: load i1 +define amdgpu_kernel void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 { + %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)* + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1 + %x = load i8, i8 addrspace(1)* %out.gep.1 + %y = load i1, i1 addrspace(1)* %out + call void @use_i8(i8 %x) + call void @use_i1(i1 %y) + ret void +} + +; CHECK-LABEL: @merge_load_2_constant_structs( +; CHECK: load %struct.foo +; CHECK: load %struct.foo +define amdgpu_kernel void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1 + %x = load %struct.foo, %struct.foo addrspace(1)* %out.gep.1 + %y = load %struct.foo, %struct.foo addrspace(1)* %out + call void @use_foo(%struct.foo %x) + call void @use_foo(%struct.foo %y) + ret void +} + +; CHECK-LABEL: @merge_load_2_constants_v2i2( +; CHECK: load <2 x i2> +; CHECK: load <2 x i2> +define amdgpu_kernel void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1 + %x = load <2 x i2>, <2 x i2> addrspace(1)* %out.gep.1 + %y = load <2 x i2>, <2 x i2> addrspace(1)* %out + call void @use_v2i2(<2 x i2> %x) + call void @use_v2i2(<2 x i2> %y) + ret void +} + +; CHECK-LABEL: @merge_load_2_constants_v4i2( +; CHECK: load <4 x i2> +; CHECK: load <4 x i2> +define amdgpu_kernel void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1 + %x = load <4 x i2>, <4 x i2> addrspace(1)* %out.gep.1 + %y = load <4 x i2>, <4 x i2> addrspace(1)* %out + call void @use_v4i2(<4 x i2> %x) + call void @use_v4i2(<4 x i2> %y) + ret void +} + +; CHECK-LABEL: @merge_store_2_constants_i9( +; CHECK: store i9 3 +; CHECK: store i9 -5 +define amdgpu_kernel void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i9, i9 addrspace(1)* %out, i32 1 + store i9 3, i9 addrspace(1)* %out.gep.1 + store i9 -5, i9 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @merge_load_2_constants_v2i9( +; CHECK: load <2 x i9> +; CHECK: load <2 x i9> +define amdgpu_kernel void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr <2 x i9>, <2 x i9> addrspace(1)* %out, i32 1 + %x = load <2 x i9>, <2 x i9> addrspace(1)* %out.gep.1 + %y = load <2 x i9>, <2 x i9> addrspace(1)* %out + call void @use_v2i9(<2 x i9> %x) + call void @use_v2i9(<2 x i9> %y) + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/lit.local.cfg b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/lit.local.cfg new file mode 100644 index 
00000000000..a5e90f8e3c1 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'NVPTX' in config.root.targets: + config.unsupported = True + diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll new file mode 100644 index 00000000000..72c13b4d12e --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll @@ -0,0 +1,209 @@ +; RUN: opt -mtriple=nvptx64-nvidia-cuda -load-store-vectorizer -S -o - %s | FileCheck %s + +; Check that the load/store vectorizer is willing to move loads/stores across +; intervening instructions only if it's safe. +; +; - Loads can be moved across instructions that don't write or throw. +; - Stores can only be moved across instructions which don't read, write, or +; throw. + +declare void @fn() +declare void @fn_nounwind() #0 +declare void @fn_nounwind_writeonly() #1 +declare void @fn_nounwind_readonly() #2 +declare void @fn_writeonly() #3 +declare void @fn_readonly() #4 +declare void @fn_readnone() #5 + +; CHECK-LABEL: @load_fn +; CHECK: load +; CHECK: call void @fn() +; CHECK: load +define void @load_fn(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + %v0 = load i32, i32* %p, align 8 + call void @fn() + %v1 = load i32, i32* %p.1, align 4 + ret void +} + +; CHECK-LABEL: @load_fn_nounwind +; CHECK: load +; CHECK: call void @fn_nounwind() +; CHECK: load +define void @load_fn_nounwind(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + %v0 = load i32, i32* %p, align 8 + call void @fn_nounwind() #0 + %v1 = load i32, i32* %p.1, align 4 + ret void +} + +; CHECK-LABEL: @load_fn_nounwind_writeonly +; CHECK: load +; CHECK: call void @fn_nounwind_writeonly() +; CHECK: load +define void @load_fn_nounwind_writeonly(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + %v0 = load i32, i32* %p, align 8 + call void @fn_nounwind_writeonly() #1 + %v1 = load i32, i32* %p.1, align 4 + ret void +} + +; CHECK-LABEL: @load_fn_nounwind_readonly +; CHECK-DAG: load <2 x i32> +; CHECK-DAG: call void @fn_nounwind_readonly() +define void @load_fn_nounwind_readonly(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + %v0 = load i32, i32* %p, align 8 + call void @fn_nounwind_readonly() #2 + %v1 = load i32, i32* %p.1, align 4 + ret void +} + +; CHECK-LABEL: @load_fn_readonly +; CHECK: load +; CHECK: call void @fn_readonly +; CHECK: load +define void @load_fn_readonly(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + %v0 = load i32, i32* %p, align 8 + call void @fn_readonly() #4 + %v1 = load i32, i32* %p.1, align 4 + ret void +} + +; CHECK-LABEL: @load_fn_writeonly +; CHECK: load +; CHECK: call void @fn_writeonly() +; CHECK: load +define void @load_fn_writeonly(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + %v0 = load i32, i32* %p, align 8 + call void @fn_writeonly() #3 + %v1 = load i32, i32* %p.1, align 4 + ret void +} + +; CHECK-LABEL: @load_fn_readnone +; CHECK-DAG: load <2 x i32> +; CHECK-DAG: call void @fn_readnone() +define void @load_fn_readnone(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + %v0 = load i32, i32* %p, align 8 + call void @fn_readnone() #5 + %v1 = load i32, i32* %p.1, align 4 + ret void +} + +; ------------------------------------------------ +; Same tests, but now for stores instead of loads. 
+; ------------------------------------------------ + +; CHECK-LABEL: @store_fn +; CHECK: store +; CHECK: call void @fn() +; CHECK: store +define void @store_fn(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + store i32 0, i32* %p + call void @fn() + store i32 0, i32* %p.1 + ret void +} + +; CHECK-LABEL: @store_fn_nounwind +; CHECK: store +; CHECK: call void @fn_nounwind() +; CHECK: store +define void @store_fn_nounwind(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + store i32 0, i32* %p + call void @fn_nounwind() #0 + store i32 0, i32* %p.1 + ret void +} + +; CHECK-LABEL: @store_fn_nounwind_writeonly +; CHECK: store +; CHECK: call void @fn_nounwind_writeonly() +; CHECK: store +define void @store_fn_nounwind_writeonly(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + store i32 0, i32* %p + call void @fn_nounwind_writeonly() #1 + store i32 0, i32* %p.1 + ret void +} + +; CHECK-LABEL: @store_fn_nounwind_readonly +; CHECK: store +; CHECK: call void @fn_nounwind_readonly() +; CHECK: store +define void @store_fn_nounwind_readonly(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + store i32 0, i32* %p + call void @fn_nounwind_readonly() #2 + store i32 0, i32* %p.1 + ret void +} + +; CHECK-LABEL: @store_fn_readonly +; CHECK: store +; CHECK: call void @fn_readonly +; CHECK: store +define void @store_fn_readonly(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + store i32 0, i32* %p + call void @fn_readonly() #4 + store i32 0, i32* %p.1 + ret void +} + +; CHECK-LABEL: @store_fn_writeonly +; CHECK: store +; CHECK: call void @fn_writeonly() +; CHECK: store +define void @store_fn_writeonly(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + store i32 0, i32* %p + call void @fn_writeonly() #3 + store i32 0, i32* %p.1 + ret void +} + +; This is the only store idiom we can vectorize. +; CHECK-LABEL: @store_fn_readnone +; CHECK-DAG: store <2 x i32> +; CHECK-DAG: call void @fn_readnone() +define void @store_fn_readnone(i32* %p) #0 { + %p.1 = getelementptr i32, i32* %p, i32 1 + + store i32 0, i32* %p, align 8 + call void @fn_readnone() #5 + store i32 0, i32* %p.1, align 8 + ret void +} + + +attributes #0 = { nounwind } +attributes #1 = { nounwind writeonly } +attributes #2 = { nounwind readonly } +attributes #3 = { writeonly } +attributes #4 = { readonly } +; readnone implies nounwind, so no need to test separately +attributes #5 = { nounwind readnone } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll new file mode 100644 index 00000000000..ff5e54f03ae --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll @@ -0,0 +1,14 @@ +; RUN: opt -mtriple=nvptx64-nvidia-cuda -load-store-vectorizer -S -o - %s | FileCheck %s + +; Load from a constant. This can be vectorized, but shouldn't crash us. 
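; For illustration only (not part of the test): the loads in @foo below use constant
; GEP expressions rather than GEP instructions as pointer operands, which is the case
; this file exercises. Assuming the vectorizer folds them into one constant-expression
; pointer, the checked result could look roughly like the sketch here; %vec and the
; extractelement sequence are hypothetical, and only the `load <4 x float>` is verified.
;
;   %vec = load <4 x float>, <4 x float> addrspace(1)* bitcast ([4 x float] addrspace(1)* @global to <4 x float> addrspace(1)*), align 16
;   %a = extractelement <4 x float> %vec, i32 0
;   %b = extractelement <4 x float> %vec, i32 1
;   %c = extractelement <4 x float> %vec, i32 2
;   %d = extractelement <4 x float> %vec, i32 3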
+ +@global = internal addrspace(1) constant [4 x float] [float 0xBF71111120000000, float 0x3F70410420000000, float 0xBF81111120000000, float 0x3FB5555560000000], align 4 + +define void @foo() { + ; CHECK: load <4 x float> + %a = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 0), align 16 + %b = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 1), align 4 + %c = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 2), align 4 + %d = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 3), align 4 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll new file mode 100644 index 00000000000..ac0660e7833 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll @@ -0,0 +1,17 @@ +; RUN: opt -load-store-vectorizer -march=nvptx64 -mcpu=sm_35 -S < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK-LABEL: @foo +define i32 @foo(i32* %ptr) { + %ptr1 = getelementptr i32, i32* %ptr, i32 1 + %p1 = addrspacecast i32* %ptr1 to i32 addrspace(1)* + ; CHECK: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 8, !invariant.load !0 + %v0 = load i32, i32* %ptr, align 8, !invariant.load !0 + %v1 = load i32, i32* %ptr1, align 4, !invariant.load !0 + %sum = add i32 %v0, %v1 + ret i32 %sum +} + +!0 = !{} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll new file mode 100644 index 00000000000..e29f3dfa537 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll @@ -0,0 +1,80 @@ +; RUN: opt -codegenprepare -load-store-vectorizer %s -S -o - | FileCheck %s +; RUN: opt -load-store-vectorizer %s -S -o - | FileCheck %s +; RUN: opt -codegenprepare -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s + +target triple = "x86_64--" + +%union = type { { [4 x [4 x [4 x [16 x float]]]], [4 x [4 x [4 x [16 x float]]]], [10 x [10 x [4 x float]]] } } + +@global_pointer = external unnamed_addr global { %union, [2000 x i8] }, align 4 + +; Function Attrs: convergent nounwind +define void @test(i32 %base) #0 { +; CHECK-LABEL: @test( +; CHECK-NOT: load i32 +; CHECK: load <2 x i32> +; CHECK-NOT: load i32 +entry: + %mul331 = and i32 %base, -4 + %add350.4 = add i32 4, %mul331 + %idx351.4 = zext i32 %add350.4 to i64 + %arrayidx352.4 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.4 + %tmp296.4 = bitcast float* %arrayidx352.4 to i32* + %add350.5 = add i32 5, %mul331 + %idx351.5 = zext i32 %add350.5 to i64 + %arrayidx352.5 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.5 + %tmp296.5 = bitcast float* %arrayidx352.5 to i32* + %cnd = 
icmp ult i32 %base, 1000 + br i1 %cnd, label %loads, label %exit + +loads: + ; If and only if the loads are in a different BB from the GEPs codegenprepare + ; would try to turn the GEPs into math, which makes LoadStoreVectorizer's job + ; harder + %tmp297.4 = load i32, i32* %tmp296.4, align 4, !tbaa !0 + %tmp297.5 = load i32, i32* %tmp296.5, align 4, !tbaa !0 + br label %exit + +exit: + ret void +} + +; Function Attrs: convergent nounwind +define void @test.codegenprepared(i32 %base) #0 { +; CHECK-LABEL: @test.codegenprepared( +; CHECK-NOT: load i32 +; CHECK: load <2 x i32> +; CHECK-NOT: load i32 +entry: + %mul331 = and i32 %base, -4 + %add350.4 = add i32 4, %mul331 + %idx351.4 = zext i32 %add350.4 to i64 + %add350.5 = add i32 5, %mul331 + %idx351.5 = zext i32 %add350.5 to i64 + %cnd = icmp ult i32 %base, 1000 + br i1 %cnd, label %loads, label %exit + +loads: ; preds = %entry + %sunkaddr = mul i64 %idx351.4, 4 + %sunkaddr1 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr + %sunkaddr2 = getelementptr inbounds i8, i8* %sunkaddr1, i64 4096 + %0 = bitcast i8* %sunkaddr2 to i32* + %tmp297.4 = load i32, i32* %0, align 4, !tbaa !0 + %sunkaddr3 = mul i64 %idx351.5, 4 + %sunkaddr4 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr3 + %sunkaddr5 = getelementptr inbounds i8, i8* %sunkaddr4, i64 4096 + %1 = bitcast i8* %sunkaddr5 to i32* + %tmp297.5 = load i32, i32* %1, align 4, !tbaa !0 + br label %exit + +exit: ; preds = %loads, %entry + ret void +} + +attributes #0 = { convergent nounwind } + +!0 = !{!1, !1, i64 0} +!1 = !{!"float", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C++ TBAA"} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll new file mode 100644 index 00000000000..e2181f6086c --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll @@ -0,0 +1,77 @@ +; RUN: opt -load-store-vectorizer %s -S | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S | FileCheck %s + +; Check that setting wrapping flags after a SCEV node is created +; does not invalidate "sorted by complexity" invariant for +; operands of commutative and associative SCEV operators. 
+ +target triple = "x86_64--" + +@global_value0 = external constant i32 +@global_value1 = external constant i32 +@other_value = external global float +@a = external global float +@b = external global float +@c = external global float +@d = external global float +@plus1 = external global i32 +@cnd = external global i8 + +; Function Attrs: nounwind +define void @main() local_unnamed_addr #0 { +; CHECK-LABEL: @main() +; CHECK: [[PTR:%[0-9]+]] = bitcast float* %preheader.load0.address to <2 x float>* +; CHECK: = load <2 x float>, <2 x float>* [[PTR]] +; CHECK-LABEL: for.body23: +entry: + %tmp = load i32, i32* @global_value0, !range !0 + %tmp2 = load i32, i32* @global_value1 + %and.i.i = and i32 %tmp2, 2 + %add.nuw.nsw.i.i = add nuw nsw i32 %and.i.i, 0 + %mul.i.i = shl nuw nsw i32 %add.nuw.nsw.i.i, 1 + %and6.i.i = and i32 %tmp2, 3 + %and9.i.i = and i32 %tmp2, 4 + %add.nuw.nsw10.i.i = add nuw nsw i32 %and6.i.i, %and9.i.i + %conv3.i42.i = add nuw nsw i32 %mul.i.i, 1 + %reass.add346.7 = add nuw nsw i32 %add.nuw.nsw10.i.i, 56 + %reass.mul347.7 = mul nuw nsw i32 %tmp, %reass.add346.7 + %add7.i.7 = add nuw nsw i32 %reass.mul347.7, 0 + %preheader.address0.idx = add nuw nsw i32 %add7.i.7, %mul.i.i + %preheader.address0.idx.zext = zext i32 %preheader.address0.idx to i64 + %preheader.load0.address = getelementptr inbounds float, float* @other_value, i64 %preheader.address0.idx.zext + %preheader.load0. = load float, float* %preheader.load0.address, align 4, !tbaa !1 + %common.address.idx = add nuw nsw i32 %add7.i.7, %conv3.i42.i + %preheader.header.common.address.idx.zext = zext i32 %common.address.idx to i64 + %preheader.load1.address = getelementptr inbounds float, float* @other_value, i64 %preheader.header.common.address.idx.zext + %preheader.load1. = load float, float* %preheader.load1.address, align 4, !tbaa !1 + br label %for.body23 + +for.body23: ; preds = %for.body23, %entry + %loop.header.load0.address = getelementptr inbounds float, float* @other_value, i64 %preheader.header.common.address.idx.zext + %loop.header.load0. = load float, float* %loop.header.load0.address, align 4, !tbaa !1 + %reass.mul343.7 = mul nuw nsw i32 %reass.add346.7, 72 + %add7.i286.7.7 = add nuw nsw i32 %reass.mul343.7, 56 + %add9.i288.7.7 = add nuw nsw i32 %add7.i286.7.7, %mul.i.i + %loop.header.address1.idx = add nuw nsw i32 %add9.i288.7.7, 1 + %loop.header.address1.idx.zext = zext i32 %loop.header.address1.idx to i64 + %loop.header.load1.address = getelementptr inbounds float, float* @other_value, i64 %loop.header.address1.idx.zext + %loop.header.load1. 
= load float, float* %loop.header.load1.address, align 4, !tbaa !1 + store float %preheader.load0., float* @a, align 4, !tbaa !1 + store float %preheader.load1., float* @b, align 4, !tbaa !1 + store float %loop.header.load0., float* @c, align 4, !tbaa !1 + store float %loop.header.load1., float* @d, align 4, !tbaa !1 + %loaded.cnd = load i8, i8* @cnd + %condition = trunc i8 %loaded.cnd to i1 + br i1 %condition, label %for.body23, label %exit + +exit: + ret void +} + +attributes #0 = { nounwind } + +!0 = !{i32 0, i32 65536} +!1 = !{!2, !2, i64 0} +!2 = !{!"float", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C++ TBAA"} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll new file mode 100644 index 00000000000..043d6ea7e92 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll @@ -0,0 +1,28 @@ +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: @correct_order( +; CHECK: [[LOAD_PTR:%[0-9]+]] = bitcast i32* %next.gep1 +; CHECK: load <2 x i32>, <2 x i32>* [[LOAD_PTR]] +; CHECK: load i32, i32* %next.gep +; CHECK: [[STORE_PTR:%[0-9]+]] = bitcast i32* %next.gep +; CHECK: store <2 x i32> +; CHECK-SAME: <2 x i32>* [[STORE_PTR]] +; CHECK: load i32, i32* %next.gep1 +define void @correct_order(i32* noalias %ptr) { + %next.gep = getelementptr i32, i32* %ptr, i64 0 + %next.gep1 = getelementptr i32, i32* %ptr, i64 1 + %next.gep2 = getelementptr i32, i32* %ptr, i64 2 + + %l1 = load i32, i32* %next.gep1, align 4 + %l2 = load i32, i32* %next.gep, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + + ret void +} + diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg b/llvm/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg new file mode 100644 index 00000000000..e71f3cc4c41 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'X86' in config.root.targets: + config.unsupported = True + diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/load-width.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/load-width.ll new file mode 100644 index 00000000000..ac5f3ea9f0f --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/load-width.ll @@ -0,0 +1,40 @@ +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s + +define <8 x double> @loadwidth_insert_extract(double* %ptr) { + %a = bitcast double* %ptr to <2 x double> * + %b = getelementptr <2 x double>, <2 x double>* %a, i32 1 + %c = getelementptr <2 x double>, <2 x double>* %a, i32 2 + %d = getelementptr <2 x double>, <2 x double>* %a, 
i32 3 +; CHECK-HSW: load <4 x double> +; CHECK-HSW: load <4 x double> +; CHECK-HSW-NOT: load +; CHECK-KNL: load <8 x double> +; CHECK-KNL-NOT: load + %la = load <2 x double>, <2 x double> *%a + %lb = load <2 x double>, <2 x double> *%b + %lc = load <2 x double>, <2 x double> *%c + %ld = load <2 x double>, <2 x double> *%d + ; Scalarize everything - Explicitly not a shufflevector to test this code + ; path in the LSV + %v1 = extractelement <2 x double> %la, i32 0 + %v2 = extractelement <2 x double> %la, i32 1 + %v3 = extractelement <2 x double> %lb, i32 0 + %v4 = extractelement <2 x double> %lb, i32 1 + %v5 = extractelement <2 x double> %lc, i32 0 + %v6 = extractelement <2 x double> %lc, i32 1 + %v7 = extractelement <2 x double> %ld, i32 0 + %v8 = extractelement <2 x double> %ld, i32 1 + ; Make a vector again + %i1 = insertelement <8 x double> undef, double %v1, i32 0 + %i2 = insertelement <8 x double> %i1, double %v2, i32 1 + %i3 = insertelement <8 x double> %i2, double %v3, i32 2 + %i4 = insertelement <8 x double> %i3, double %v4, i32 3 + %i5 = insertelement <8 x double> %i4, double %v5, i32 4 + %i6 = insertelement <8 x double> %i5, double %v6, i32 5 + %i7 = insertelement <8 x double> %i6, double %v7, i32 6 + %i8 = insertelement <8 x double> %i7, double %v8, i32 7 + ret <8 x double> %i8 +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll new file mode 100644 index 00000000000..a93e9aceb73 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll @@ -0,0 +1,48 @@ +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S < %s | \ +; RUN: FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S < %s | \ +; RUN: FileCheck %s +; +; The GPU Load & Store Vectorizer may merge differently-typed accesses into a +; single instruction. This test checks that we merge TBAA tags for such +; accesses correctly. 
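; A rough sketch of the merged form this test is aiming at (illustration only, not part
; of the test; the value names and the metadata number !9 are hypothetical, and the
; CHECK/CHECK-DAG lines below pin down the real pattern). The float tag and the int tag
; share only the "omnipotent char" ancestor, so the vectorized access is expected to
; carry a char-based access tag:
;
;   %0 = bitcast float* %f to <2 x i32>*
;   %1 = load <2 x i32>, <2 x i32>* %0, align 4, !tbaa !9
;   store <2 x i32> %2, <2 x i32>* %0, align 4, !tbaa !9
;
;   !9 = !{!5, !5, i64 0}  ; base and access type are both "omnipotent char"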
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; struct S { +; float f; +; int i; +; }; +%struct.S = type { float, i32 } + +; float foo(S *p) { +; p->f -= 1; +; p->i -= 1; +; return p->f; +; } +define float @foo(%struct.S* %p) { +entry: +; CHECK-LABEL: foo +; CHECK: load <2 x i32>, {{.*}}, !tbaa [[TAG_char:!.*]] +; CHECK: store <2 x i32> {{.*}}, !tbaa [[TAG_char]] + %f = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 0 + %0 = load float, float* %f, align 4, !tbaa !2 + %sub = fadd float %0, -1.000000e+00 + store float %sub, float* %f, align 4, !tbaa !2 + %i = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 1 + %1 = load i32, i32* %i, align 4, !tbaa !8 + %sub1 = add nsw i32 %1, -1 + store i32 %sub1, i32* %i, align 4, !tbaa !8 + ret float %sub +} + +!2 = !{!3, !4, i64 0} +!3 = !{!"_ZTS1S", !4, i64 0, !7, i64 4} +!4 = !{!"float", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C++ TBAA"} +!7 = !{!"int", !5, i64 0} +!8 = !{!3, !7, i64 4} + +; CHECK-DAG: [[TYPE_char:!.*]] = !{!"omnipotent char", {{.*}}, i64 0} +; CHECK-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll new file mode 100644 index 00000000000..7a0073808a0 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -load-store-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s +; RUN: opt < %s -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +%rec = type { i32, i28 } + +; We currently do not optimize this scenario. +; But we verify that we no longer crash when compiling this. 
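; For contrast with @test1 below, here is a hypothetical byte-sized variant (i32,i32
; instead of i32,i28) that the vectorizer would be expected to merge into <2 x i32>
; accesses, as the other tests in this patch demonstrate. %rec2, the function name, and
; the value names are made up for illustration and are not part of the test.

%rec2 = type { i32, i32 }

define void @test1.bytesized.sketch(%rec2* %out, %rec2* %in) {
  %in1 = getelementptr %rec2, %rec2* %in, i16 0, i32 0
  %in2 = getelementptr %rec2, %rec2* %in, i16 0, i32 1
  %val1 = load i32, i32* %in1, align 8
  %val2 = load i32, i32* %in2, align 4
  %out1 = getelementptr %rec2, %rec2* %out, i16 0, i32 0
  %out2 = getelementptr %rec2, %rec2* %out, i16 0, i32 1
  store i32 %val1, i32* %out1, align 8
  store i32 %val2, i32* %out2, align 4
  ret void
}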
+define void @test1(%rec* %out, %rec* %in) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: [[IN1:%.*]] = getelementptr [[REC:%.*]], %rec* [[IN:%.*]], i16 0, i32 0 +; CHECK-NEXT: [[IN2:%.*]] = getelementptr [[REC]], %rec* [[IN]], i16 0, i32 1 +; CHECK-NEXT: [[VAL1:%.*]] = load i32, i32* [[IN1]], align 8 +; CHECK-NEXT: [[VAL2:%.*]] = load i28, i28* [[IN2]] +; CHECK-NEXT: [[OUT1:%.*]] = getelementptr [[REC]], %rec* [[OUT:%.*]], i16 0, i32 0 +; CHECK-NEXT: [[OUT2:%.*]] = getelementptr [[REC]], %rec* [[OUT]], i16 0, i32 1 +; CHECK-NEXT: store i32 [[VAL1]], i32* [[OUT1]], align 8 +; CHECK-NEXT: store i28 [[VAL2]], i28* [[OUT2]] +; CHECK-NEXT: ret void +; + %in1 = getelementptr %rec, %rec* %in, i16 0, i32 0 + %in2 = getelementptr %rec, %rec* %in, i16 0, i32 1 + %val1 = load i32, i32* %in1, align 8 + %val2 = load i28, i28* %in2 + %out1 = getelementptr %rec, %rec* %out, i16 0, i32 0 + %out2 = getelementptr %rec, %rec* %out, i16 0, i32 1 + store i32 %val1, i32* %out1, align 8 + store i28 %val2, i28* %out2 + ret void +} + diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll new file mode 100644 index 00000000000..3cfe7454baf --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll @@ -0,0 +1,29 @@ +; RUN: opt -mtriple=x86_64-unknown-linux -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-linux -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s + +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" + +%struct.buffer_t = type { i32, i8* } + +; Check an i32 and i8* get vectorized, and that the two accesses +; (load into buff.val and store to buff.p) preserve their order. +; Vectorized loads should be inserted at the position of the first load, +; and instructions which were between the first and last load should be +; reordered preserving their relative order inasmuch as possible. + +; CHECK-LABEL: @preserve_order_32( +; CHECK: load <2 x i32> +; CHECK: %buff.val = load i8 +; CHECK: store i8 0 +define void @preserve_order_32(%struct.buffer_t* noalias %buff) #0 { +entry: + %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i32 0, i32 1 + %buff.p = load i8*, i8** %tmp1 + %buff.val = load i8, i8* %buff.p + store i8 0, i8* %buff.p, align 8 + %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i32 0, i32 0 + %buff.int = load i32, i32* %tmp0, align 8 + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll new file mode 100644 index 00000000000..3ae0d891dc5 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll @@ -0,0 +1,78 @@ +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +%struct.buffer_t = type { i64, i8* } +%struct.nested.buffer = type { %struct.buffer_t, %struct.buffer_t } + +; Check an i64 and i8* get vectorized, and that the two accesses +; (load into buff.val and store to buff.p) preserve their order. 
+; Vectorized loads should be inserted at the position of the first load, +; and instructions which were between the first and last load should be +; reordered preserving their relative order inasmuch as possible. + +; CHECK-LABEL: @preserve_order_64( +; CHECK: load <2 x i64> +; CHECK: %buff.val = load i8 +; CHECK: store i8 0 +define void @preserve_order_64(%struct.buffer_t* noalias %buff) #0 { +entry: + %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1 + %buff.p = load i8*, i8** %tmp1 + %buff.val = load i8, i8* %buff.p + store i8 0, i8* %buff.p, align 8 + %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0 + %buff.int = load i64, i64* %tmp0, align 16 + ret void +} + +; Check reordering recurses correctly. + +; CHECK-LABEL: @transitive_reorder( +; CHECK: load <2 x i64> +; CHECK: %buff.val = load i8 +; CHECK: store i8 0 +define void @transitive_reorder(%struct.buffer_t* noalias %buff, %struct.nested.buffer* noalias %nest) #0 { +entry: + %nest0_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0 + %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest0_0, i64 0, i32 1 + %buff.p = load i8*, i8** %tmp1 + %buff.val = load i8, i8* %buff.p + store i8 0, i8* %buff.p, align 8 + %nest1_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0 + %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest1_0, i64 0, i32 0 + %buff.int = load i64, i64* %tmp0, align 16 + ret void +} + +; Check for no vectorization over phi node + +; CHECK-LABEL: @no_vect_phi( +; CHECK: load i8* +; CHECK: load i8 +; CHECK: store i8 0 +; CHECK: load i64 +define void @no_vect_phi(i32* noalias %ptr, %struct.buffer_t* noalias %buff) { +entry: + %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1 + %buff.p = load i8*, i8** %tmp1 + %buff.val = load i8, i8* %buff.p + store i8 0, i8* %buff.p, align 8 + br label %"for something" + +"for something": + %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ] + + %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0 + %buff.int = load i64, i64* %tmp0, align 16 + + %index.next = add i64 %index, 8 + %cmp_res = icmp eq i64 %index.next, 8 + br i1 %cmp_res, label %ending, label %"for something" + +ending: + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll new file mode 100644 index 00000000000..72b29912d81 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll @@ -0,0 +1,118 @@ +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Vectorized subsets of the load/store chains in the presence of +; interleaved loads/stores + +; CHECK-LABEL: @interleave_2L_2S( +; CHECK: load <2 x i32> +; CHECK: load i32 +; CHECK: store <2 x i32> +; CHECK: load i32 +define void @interleave_2L_2S(i32* noalias %ptr) { + %next.gep = getelementptr i32, i32* %ptr, i64 0 + %next.gep1 = getelementptr i32, i32* %ptr, i64 1 + %next.gep2 = getelementptr i32, i32* %ptr, i64 2 + + %l1 = load i32, i32* %next.gep1, align 4 + %l2 = load i32, i32* %next.gep, 
align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + + ret void +} + +; CHECK-LABEL: @interleave_3L_2S_1L( +; CHECK: load <3 x i32> +; CHECK: store <2 x i32> +; CHECK: load i32 + +define void @interleave_3L_2S_1L(i32* noalias %ptr) { + %next.gep = getelementptr i32, i32* %ptr, i64 0 + %next.gep1 = getelementptr i32, i32* %ptr, i64 1 + %next.gep2 = getelementptr i32, i32* %ptr, i64 2 + + %l2 = load i32, i32* %next.gep, align 4 + %l1 = load i32, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + + ret void +} + +; CHECK-LABEL: @chain_suffix( +; CHECK: load i32 +; CHECK: store <2 x i32> +; CHECK: load <2 x i32> +define void @chain_suffix(i32* noalias %ptr) { + %next.gep = getelementptr i32, i32* %ptr, i64 0 + %next.gep1 = getelementptr i32, i32* %ptr, i64 1 + %next.gep2 = getelementptr i32, i32* %ptr, i64 2 + + %l2 = load i32, i32* %next.gep, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + + ret void +} + + +; CHECK-LABEL: @chain_prefix_suffix( +; CHECK: load <2 x i32> +; CHECK: store <2 x i32> +; CHECK: load <3 x i32> +define void @chain_prefix_suffix(i32* noalias %ptr) { + %next.gep = getelementptr i32, i32* %ptr, i64 0 + %next.gep1 = getelementptr i32, i32* %ptr, i64 1 + %next.gep2 = getelementptr i32, i32* %ptr, i64 2 + %next.gep3 = getelementptr i32, i32* %ptr, i64 3 + + %l1 = load i32, i32* %next.gep, align 4 + %l2 = load i32, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep2, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + %l5 = load i32, i32* %next.gep3, align 4 + + ret void +} + +; FIXME: If the chain is too long and TLI says misaligned is not fast, +; then LSV fails to vectorize anything in that chain. +; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7. 
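; A sketch of the modification the FIXME above describes (illustration only, not part of
; the test): add a %tmp5 at ptr+4 to @interleave_get_longest below and load it into %l6
; and %l7 instead of reusing %tmp4, extending the load chain:
;
;   %tmp5 = getelementptr i32, i32* %ptr, i64 4
;   ...
;   %l6 = load i32, i32* %tmp5, align 4
;   %l7 = load i32, i32* %tmp5, align 4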
+ +; CHECK-LABEL: @interleave_get_longest +; CHECK: load <3 x i32> +; CHECK: load i32 +; CHECK: store <2 x i32> zeroinitializer +; CHECK: load i32 +; CHECK: load i32 +; CHECK: load i32 + +define void @interleave_get_longest(i32* noalias %ptr) { + %tmp1 = getelementptr i32, i32* %ptr, i64 0 + %tmp2 = getelementptr i32, i32* %ptr, i64 1 + %tmp3 = getelementptr i32, i32* %ptr, i64 2 + %tmp4 = getelementptr i32, i32* %ptr, i64 3 + + %l1 = load i32, i32* %tmp2, align 4 + %l2 = load i32, i32* %tmp1, align 4 + store i32 0, i32* %tmp2, align 4 + store i32 0, i32* %tmp1, align 4 + %l3 = load i32, i32* %tmp2, align 4 + %l4 = load i32, i32* %tmp3, align 4 + %l5 = load i32, i32* %tmp4, align 4 + %l6 = load i32, i32* %tmp4, align 4 + %l7 = load i32, i32* %tmp4, align 4 + + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll new file mode 100644 index 00000000000..00971f35038 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll @@ -0,0 +1,15 @@ +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck %s + +; Check that the LoadStoreVectorizer does not crash due to not differentiating <1 x T> and T. + +; CHECK-LABEL: @vector_scalar( +; CHECK: store double +; CHECK: store <1 x double> +define void @vector_scalar(double* %ptr, double %a, <1 x double> %b) { + %1 = bitcast double* %ptr to <1 x double>* + %2 = getelementptr <1 x double>, <1 x double>* %1, i32 1 + store double %a, double* %ptr, align 8 + store <1 x double> %b, <1 x double>* %2, align 8 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll b/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll new file mode 100644 index 00000000000..07487b57803 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll @@ -0,0 +1,27 @@ +; RUN: opt -S < %s -load-store-vectorizer | FileCheck %s +; RUN: opt -S < %s -passes='function(load-store-vectorizer)' | FileCheck %s + +declare void @llvm.sideeffect() + +; load-store vectorization across a @llvm.sideeffect. + +; CHECK-LABEL: test +; CHECK: load <4 x float> +; CHECK: store <4 x float> +define void @test(float* %p) { + %p0 = getelementptr float, float* %p, i64 0 + %p1 = getelementptr float, float* %p, i64 1 + %p2 = getelementptr float, float* %p, i64 2 + %p3 = getelementptr float, float* %p, i64 3 + %l0 = load float, float* %p0, align 16 + %l1 = load float, float* %p1 + %l2 = load float, float* %p2 + call void @llvm.sideeffect() + %l3 = load float, float* %p3 + store float %l0, float* %p0, align 16 + call void @llvm.sideeffect() + store float %l1, float* %p1 + store float %l2, float* %p2 + store float %l3, float* %p3 + ret void +} |
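; A rough sketch of the shape of output the CHECK lines above expect from @test
; (illustration only, not part of the test): the four scalar loads collapse into one
; <4 x float> load and the four scalar stores into one <4 x float> store, with the
; @llvm.sideeffect calls left in place. The function and value names are hypothetical,
; the exact placement of the calls may differ, and it reuses the @llvm.sideeffect
; declaration from the test above.

define void @test.vectorized.sketch(float* %p) {
  %pv = bitcast float* %p to <4 x float>*
  %lv = load <4 x float>, <4 x float>* %pv, align 16
  %l0 = extractelement <4 x float> %lv, i32 0
  %l1 = extractelement <4 x float> %lv, i32 1
  %l2 = extractelement <4 x float> %lv, i32 2
  call void @llvm.sideeffect()
  %l3 = extractelement <4 x float> %lv, i32 3
  %s0 = insertelement <4 x float> undef, float %l0, i32 0
  %s1 = insertelement <4 x float> %s0, float %l1, i32 1
  %s2 = insertelement <4 x float> %s1, float %l2, i32 2
  %s3 = insertelement <4 x float> %s2, float %l3, i32 3
  call void @llvm.sideeffect()
  store <4 x float> %s3, <4 x float>* %pv, align 16
  ret void
}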