author     Jan Vesely <jan.vesely@rutgers.edu>    2016-05-16 23:56:32 +0000
committer  Jan Vesely <jan.vesely@rutgers.edu>    2016-05-16 23:56:32 +0000
commit     687ca8df18a3d2bac411cc29fd1abcbd281ddf54
tree       04d11b175748afd9aa4c757b8339a1df57a5da31
parent     cfec6c6a282e0063861ef73bd3d04a28841c39c8
AMDGPU/R600: Use correct number of vector elements when lowering private loads
Reviewers: tstellardAMD, arsenm
Subscribers: arsenm, kzhuravl, llvm-commits
Differential Revision: http://reviews.llvm.org/D20032
llvm-svn: 269725
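
The code change is small: instead of padding the per-channel loads out to four elements with UNDEFs and always building a <4 x ElemVT> vector, LowerLOAD now builds a vector with exactly NumElemVT elements. A minimal sketch of that pattern follows; buildLoweredLoad is a hypothetical standalone helper, not code from the patch, and it assumes the caller supplies the per-channel scalar loads and their count the way the surrounding LowerLOAD code does.

```cpp
// Hypothetical helper illustrating the new code path: build the lowered
// load's result from exactly NumElemVT scalar loads instead of padding
// the Loads array to 4 entries with UNDEF values.
#include <cassert>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

static SDValue buildLoweredLoad(SelectionDAG &DAG, const SDLoc &DL,
                                EVT ElemVT, SDValue (&Loads)[4],
                                unsigned NumElemVT) {
  assert(NumElemVT <= 4 && "Loads array holds at most 4 channels");
  // The result type keeps the original element count rather than a fixed 4.
  EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
  // Only the first NumElemVT entries of Loads are defined.
  return DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
}
```

This mirrors the two added lines in the R600ISelLowering.cpp hunk below; the removed loop that filled the tail of Loads with DAG.getUNDEF(ElemVT) is no longer needed.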
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ISelLowering.cpp        |   8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll  | 105
2 files changed, 108 insertions, 5 deletions
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 66486629bde..c8dcbe50260 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1679,6 +1679,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     EVT ElemVT = VT.getVectorElementType();
     SDValue Loads[4];
 
+    assert(NumElemVT <= 4);
     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                       "vector width in load");
 
@@ -1692,11 +1693,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
                              DAG.getTargetConstant(Channel, DL, MVT::i32),
                              Op.getOperand(2));
     }
-    for (unsigned i = NumElemVT; i < 4; ++i) {
-      Loads[i] = DAG.getUNDEF(ElemVT);
-    }
-    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
-    LoweredLoad = DAG.getBuildVector(TargetVT, DL, Loads);
+    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
+    LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
   } else {
     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, Chain, Ptr,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index ee6e40eec67..3564302c8ee 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -401,4 +401,109 @@ define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 
 ; NOHSAOPT: !0 = !{i32 0, i32 2048}
+
+; FUNC-LABEL: v16i32_stack:
+
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
+  %alloca = alloca [2 x <16 x i32>]
+  %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a
+  %tmp5 = load <16 x i32>, <16 x i32>* %tmp0
+  store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: v16float_stack:
+
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
+  %alloca = alloca [2 x <16 x float>]
+  %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a
+  %tmp5 = load <16 x float>, <16 x float>* %tmp0
+  store <16 x float> %tmp5, <16 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: v2float_stack:
+
+; R600: MOVA_INT
+; R600: MOVA_INT
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
+  %alloca = alloca [16 x <2 x float>]
+  %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a
+  %tmp5 = load <2 x float>, <2 x float>* %tmp0
+  store <2 x float> %tmp5, <2 x float> addrspace(1)* %out
+  ret void
+}
+
 
 attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }