author    Matt Arsenault <Matthew.Arsenault@amd.com>  2018-07-28 14:11:34 +0000
committer Matt Arsenault <Matthew.Arsenault@amd.com>  2018-07-28 14:11:34 +0000
commit    8f9dde94b7f04df8332d459cd9e3b8f6a3c7f816
tree      e767bfcb61e447691f96ac7f2c6ed9c99b3c9853
parent    3878bf83ddf7d83b446ab79dc30158fc50465c54
AMDGPU: Stop wasting argument registers with v3i32/v3f32
SelectionDAGBuilder widens v3i32/v3f32 arguments to
v4i32/v4f32, which consumes an additional register.
In addition to wasting argument space, this produces extra
instructions, since the fourth vector component now appears
to have a meaningful value to most combines.
llvm-svn: 338197
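
For illustration only (not part of the commit): a minimal IR sketch of the argument pattern this change targets, with a hypothetical function name. The register assignments in the comments follow what the new function-args.ll tests below check.

; Before this change, %arg0 was widened to <4 x float> and occupied
; v0-v3, pushing %arg1 into v4; with the scalarized breakdown, %arg0
; takes exactly v0-v2 and %arg1 lands in v3.
define void @v3_arg_then_scalar(<3 x float> %arg0, i32 %arg1) {
  store volatile <3 x float> %arg0, <3 x float> addrspace(3)* undef
  store volatile i32 %arg1, i32 addrspace(3)* undef
  ret void
}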
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp       | 46
 llvm/lib/Target/AMDGPU/SIISelLowering.h         | 13
 llvm/test/CodeGen/AMDGPU/call-argument-types.ll | 64
 llvm/test/CodeGen/AMDGPU/fmaxnum.ll             | 22
 llvm/test/CodeGen/AMDGPU/fminnum.ll             | 11
 llvm/test/CodeGen/AMDGPU/function-args.ll       | 39
 llvm/test/CodeGen/AMDGPU/function-returns.ll    | 39
 7 files changed, 229 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b7fc2656a2..97c38e44e40 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -694,6 +694,52 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
   return false;
 }
 
+MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+                                                    CallingConv::ID CC,
+                                                    EVT VT) const {
+  if (CC != CallingConv::AMDGPU_KERNEL &&
+      VT.isVector() && VT.getVectorNumElements() == 3) {
+    EVT ScalarVT = VT.getScalarType();
+    if (ScalarVT.getSizeInBits() == 32)
+      return ScalarVT.getSimpleVT();
+  }
+
+  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
+unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+                                                         CallingConv::ID CC,
+                                                         EVT VT) const {
+  if (CC != CallingConv::AMDGPU_KERNEL &&
+      VT.isVector() && VT.getVectorNumElements() == 3) {
+    EVT ScalarVT = VT.getScalarType();
+    if (ScalarVT.getSizeInBits() == 32)
+      return 3;
+  }
+
+  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+}
+
+unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
+  LLVMContext &Context, CallingConv::ID CC,
+  EVT VT, EVT &IntermediateVT,
+  unsigned &NumIntermediates, MVT &RegisterVT) const {
+
+  if (CC != CallingConv::AMDGPU_KERNEL && VT.getVectorNumElements() == 3) {
+    EVT ScalarVT = VT.getScalarType();
+    if (ScalarVT.getSizeInBits() == 32 ||
+        ScalarVT.getSizeInBits() == 64) {
+      RegisterVT = ScalarVT.getSimpleVT();
+      IntermediateVT = RegisterVT;
+      NumIntermediates = 3;
+      return NumIntermediates;
+    }
+  }
+
+  return TargetLowering::getVectorTypeBreakdownForCallingConv(
+    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+}
+
 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &CI,
                                           MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index ad049f2a71c..5b3d49b3d8e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -25,6 +25,19 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 private:
   const GCNSubtarget *Subtarget;
 
+public:
+  MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+                                    CallingConv::ID CC,
+                                    EVT VT) const override;
+  unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+                                         CallingConv::ID CC,
+                                         EVT VT) const override;
+
+  unsigned getVectorTypeBreakdownForCallingConv(
+    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+    unsigned &NumIntermediates, MVT &RegisterVT) const override;
+
+private:
   SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
                                    SDValue Chain, uint64_t Offset) const;
   SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index b0998355395..e17c86bde41 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -17,6 +17,9 @@ declare void @external_void_func_i16_zeroext(i16 zeroext) #0
 declare void @external_void_func_i32(i32) #0
 declare void @external_void_func_i64(i64) #0
+declare void @external_void_func_v2i64(<2 x i64>) #0
+declare void @external_void_func_v3i64(<3 x i64>) #0
+declare void @external_void_func_v4i64(<4 x i64>) #0
 
 declare void @external_void_func_f16(half) #0
 declare void @external_void_func_f32(float) #0
@@ -27,6 +30,7 @@ declare void @external_void_func_v2f16(<2 x half>) #0
 
 declare void @external_void_func_v2i32(<2 x i32>) #0
 declare void @external_void_func_v3i32(<3 x i32>) #0
+declare void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
 declare void @external_void_func_v4i32(<4 x i32>) #0
 declare void @external_void_func_v8i32(<8 x i32>) #0
 declare void @external_void_func_v16i32(<16 x i32>) #0
@@ -255,6 +259,47 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v2i64:
+; GCN: buffer_load_dwordx4 v[0:3]
+; GCN: s_waitcnt
+; GCN-NEXT: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
+  %val = load <2 x i64>, <2 x i64> addrspace(1)* null
+  call void @external_void_func_v2i64(<2 x i64> %val)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v3i64:
+; GCN: buffer_load_dwordx4 v[0:3]
+; GCN: v_mov_b32_e32 v4, s
+; GCN: v_mov_b32_e32 v5, s
+; GCN: s_waitcnt
+; GCN-NEXT: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
+  %load = load <2 x i64>, <2 x i64> addrspace(1)* null
+  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>
+
+  call void @external_void_func_v3i64(<3 x i64> %val)
+  ret void
+}
+
+; FIXME: Immediates should fold directly into v_mov_b32s
+; GCN-LABEL: {{^}}test_call_external_void_func_v4i64:
+; GCN: buffer_load_dwordx4 v[0:3]
+; GCN: v_mov_b32_e32 v4, s
+; GCN: v_mov_b32_e32 v5, s
+; GCN: v_mov_b32_e32 v6, s
+; GCN: v_mov_b32_e32 v7, s
+
+; GCN: s_waitcnt
+; GCN-NEXT: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
+  %load = load <2 x i64>, <2 x i64> addrspace(1)* null
+  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @external_void_func_v4i64(<4 x i64> %val)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_f16_imm:
 ; VI: v_mov_b32_e32 v0, 0x4400
 ; CI: v_mov_b32_e32 v0, 4.0
@@ -313,15 +358,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
   ret void
 }
 
-; FIXME: Passing 4th
 ; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm:
 ; HSA-DAG: s_mov_b32 s33, s9
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
-; GCN-DAG: v_mov_b32_e32 v0
-; GCN-DAG: v_mov_b32_e32 v1
-; GCN-DAG: v_mov_b32_e32 v2
-; GCN-DAG: v_mov_b32_e32 v3
+; GCN-DAG: v_mov_b32_e32 v0, 3
+; GCN-DAG: v_mov_b32_e32 v1, 4
+; GCN-DAG: v_mov_b32_e32 v2, 5
+; GCN-NOT: v3
 
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
@@ -329,6 +373,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_i32:
+; GCN-DAG: v_mov_b32_e32 v0, 3
+; GCN-DAG: v_mov_b32_e32 v1, 4
+; GCN-DAG: v_mov_b32_e32 v2, 5
+; GCN-DAG: v_mov_b32_e32 v3, 6
+define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
+  call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:
 ; GCN: buffer_load_dwordx4 v[0:3]
 ; GCN: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/fmaxnum.ll b/llvm/test/CodeGen/AMDGPU/fmaxnum.ll
index 277b8ce04c4..7193d69cde5 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaxnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaxnum.ll
@@ -3,6 +3,7 @@
 
 declare float @llvm.maxnum.f32(float, float) #0
 declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0
+declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #0
 declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0
 declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0
 declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0
@@ -33,6 +34,17 @@ define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_fmax_v3f32:
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI-NOT: v_max_f32
+define amdgpu_kernel void @test_fmax_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b) nounwind {
+  %val = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b) #0
+  store <3 x float> %val, <3 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
 ; FUNC-LABEL: @test_fmax_v4f32
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
@@ -280,4 +292,14 @@ define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_func_fmax_v3f32:
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI-NOT: v_max_f32
+define <3 x float> @test_func_fmax_v3f32(<3 x float> %a, <3 x float> %b) nounwind {
+  %val = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b) #0
+  ret <3 x float> %val
+}
+
 attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/fminnum.ll b/llvm/test/CodeGen/AMDGPU/fminnum.ll
index 9e997c7a104..1a88306ea81 100644
--- a/llvm/test/CodeGen/AMDGPU/fminnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminnum.ll
@@ -4,6 +4,7 @@
 
 declare float @llvm.minnum.f32(float, float) #0
 declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0
+declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #0
 declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0
 declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0
 declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0
@@ -278,4 +279,14 @@ define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_func_fmin_v3f32:
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI-NOT: v_min_f32
+define <3 x float> @test_func_fmin_v3f32(<3 x float> %a, <3 x float> %b) nounwind {
+  %val = call <3 x float> @llvm.minnum.v3f32(<3 x float> %a, <3 x float> %b) #0
+  ret <3 x float> %val
+}
+
 attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 48d94465c13..71541b29553 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -739,6 +739,45 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
   ret void
 }
 
+; Make sure v3 isn't a wasted register because of v3 types being promoted to v4
+; GCN-LABEL: {{^}}void_func_v3f32_wasted_reg:
+; GCN: s_waitcnt
+; GCN: ds_write_b32 v{{[0-9]+}}, v0
+; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1
+; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2
+; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
+  %arg0.0 = extractelement <3 x float> %arg0, i32 0
+  %arg0.1 = extractelement <3 x float> %arg0, i32 1
+  %arg0.2 = extractelement <3 x float> %arg0, i32 2
+  store volatile float %arg0.0, float addrspace(3)* undef
+  store volatile float %arg0.1, float addrspace(3)* undef
+  store volatile float %arg0.2, float addrspace(3)* undef
+  store volatile i32 %arg1, i32 addrspace(3)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v3i32_wasted_reg:
+; GCN: s_waitcnt
+; GCN: ds_write_b32 v{{[0-9]+}}, v0
+; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1
+; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2
+; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
+  %arg0.0 = extractelement <3 x i32> %arg0, i32 0
+  %arg0.1 = extractelement <3 x i32> %arg0, i32 1
+  %arg0.2 = extractelement <3 x i32> %arg0, i32 2
+  store volatile i32 %arg0.0, i32 addrspace(3)* undef
+  store volatile i32 %arg0.1, i32 addrspace(3)* undef
+  store volatile i32 %arg0.2, i32 addrspace(3)* undef
+  store volatile i32 %arg1, i32 addrspace(3)* undef
+  ret void
+}
+
 ; Check there is no crash.
 ; GCN-LABEL: {{^}}void_func_v16i8:
 define void @void_func_v16i8(<16 x i8> %arg0) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 32ecc417fed..20208b188d7 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -531,4 +531,43 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
   ret { i32, <32 x i32> } %val
 }
 
+; Make sure the last struct component is returned in v3, not v4.
+; GCN-LABEL: {{^}}v3i32_struct_func_void_wasted_reg:
+; GCN: ds_read_b32 v0,
+; GCN: ds_read_b32 v1,
+; GCN: ds_read_b32 v2,
+; GCN: ds_read_b32 v3,
+define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 {
+  %load0 = load volatile i32, i32 addrspace(3)* undef
+  %load1 = load volatile i32, i32 addrspace(3)* undef
+  %load2 = load volatile i32, i32 addrspace(3)* undef
+  %load3 = load volatile i32, i32 addrspace(3)* undef
+
+  %insert.0 = insertelement <3 x i32> undef, i32 %load0, i32 0
+  %insert.1 = insertelement <3 x i32> %insert.0, i32 %load1, i32 1
+  %insert.2 = insertelement <3 x i32> %insert.1, i32 %load2, i32 2
+  %insert.3 = insertvalue { <3 x i32>, i32 } undef, <3 x i32> %insert.2, 0
+  %insert.4 = insertvalue { <3 x i32>, i32 } %insert.3, i32 %load3, 1
+  ret { <3 x i32>, i32 } %insert.4
+}
+
+; GCN-LABEL: {{^}}v3f32_struct_func_void_wasted_reg:
+; GCN: ds_read_b32 v0,
+; GCN: ds_read_b32 v1,
+; GCN: ds_read_b32 v2,
+; GCN: ds_read_b32 v3,
+define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 {
+  %load0 = load volatile float, float addrspace(3)* undef
+  %load1 = load volatile float, float addrspace(3)* undef
+  %load2 = load volatile float, float addrspace(3)* undef
+  %load3 = load volatile i32, i32 addrspace(3)* undef
+
+  %insert.0 = insertelement <3 x float> undef, float %load0, i32 0
+  %insert.1 = insertelement <3 x float> %insert.0, float %load1, i32 1
+  %insert.2 = insertelement <3 x float> %insert.1, float %load2, i32 2
+  %insert.3 = insertvalue { <3 x float>, i32 } undef, <3 x float> %insert.2, 0
+  %insert.4 = insertvalue { <3 x float>, i32 } %insert.3, i32 %load3, 1
+  ret { <3 x float>, i32 } %insert.4
+}
+
 attributes #0 = { nounwind }
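
A closing note on scope, with a hedged sketch (both function names below are hypothetical): all three overrides in SIISelLowering.cpp bail out for CallingConv::AMDGPU_KERNEL, so the three-register breakdown applies only to callable functions; kernel arguments are loaded from the kernarg segment rather than passed in VGPRs, and their lowering is deliberately left unchanged.

; Callable function: after this patch, <3 x i32> is passed in exactly
; three VGPRs (v0-v2), per the getNumRegistersForCallingConv override.
define void @callable_v3i32(<3 x i32> %v) {
  store volatile <3 x i32> %v, <3 x i32> addrspace(3)* undef
  ret void
}

; Kernel: the overrides return early for AMDGPU_KERNEL, so kernel
; argument lowering keeps its existing behavior.
define amdgpu_kernel void @kernel_v3i32(<3 x i32> %v) {
  store volatile <3 x i32> %v, <3 x i32> addrspace(3)* undef
  ret void
}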