diff options (diffstat)

 llvm/lib/Target/AMDGPU/SIISelLowering.cpp       | 46
 llvm/test/CodeGen/AMDGPU/bfi_int.ll             |  2
 llvm/test/CodeGen/AMDGPU/call-argument-types.ll | 43
 llvm/test/CodeGen/AMDGPU/mad-mix.ll             | 58

4 files changed, 86 insertions(+), 63 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 64d63e43855..097b9352043 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -697,10 +697,11 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - if (CC != CallingConv::AMDGPU_KERNEL && - VT.isVector() && VT.getVectorNumElements() == 3) { + // TODO: Consider splitting all arguments into 32-bit pieces. + if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { EVT ScalarVT = VT.getScalarType(); - if (ScalarVT.getSizeInBits() == 32) + unsigned Size = ScalarVT.getSizeInBits(); + if (Size == 32 || Size == 64) return ScalarVT.getSimpleVT(); } @@ -710,11 +711,11 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - if (CC != CallingConv::AMDGPU_KERNEL && - VT.isVector() && VT.getVectorNumElements() == 3) { + if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { EVT ScalarVT = VT.getScalarType(); - if (ScalarVT.getSizeInBits() == 32) - return 3; + unsigned Size = ScalarVT.getSizeInBits(); + if (Size == 32 || Size == 64) + return VT.getVectorNumElements(); } return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); @@ -724,14 +725,13 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const { - - if (CC != CallingConv::AMDGPU_KERNEL && VT.getVectorNumElements() == 3) { + if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { EVT ScalarVT = VT.getScalarType(); - if (ScalarVT.getSizeInBits() == 32 || - ScalarVT.getSizeInBits() == 64) { + unsigned Size = ScalarVT.getSizeInBits(); + if (Size == 32 || Size 
== 64) { RegisterVT = ScalarVT.getSimpleVT(); IntermediateVT = RegisterVT; - NumIntermediates = 3; + NumIntermediates = VT.getVectorNumElements(); return NumIntermediates; } } @@ -1314,6 +1314,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { const ISD::InputArg *Arg = &Ins[I]; + assert(!Arg->VT.isVector() && "vector type argument should have been split"); + // First check if it's a PS input addr. if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) { @@ -1347,25 +1349,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, ++PSInputNum; } - // Second split vertices into their elements. - if (Arg->VT.isVector()) { - ISD::InputArg NewArg = *Arg; - NewArg.Flags.setSplit(); - NewArg.VT = Arg->VT.getVectorElementType(); - - // We REALLY want the ORIGINAL number of vertex elements here, e.g. a - // three or five element vertex only needs three or five registers, - // NOT four or eight. 
- Type *ParamType = FType->getParamType(Arg->getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - for (unsigned J = 0; J != NumElements; ++J) { - Splits.push_back(NewArg); - NewArg.PartOffset += NewArg.VT.getStoreSize(); - } - } else { - Splits.push_back(*Arg); - } + Splits.push_back(*Arg); } } diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll index 77c5e53481e..66f8a2b111a 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -54,8 +54,8 @@ entry: ; FUNC-LABEL: {{^}}v_bitselect_v2i32_pat1: ; GCN: s_waitcnt -; GCN-NEXT: v_bfi_b32 v1, v3, v1, v5 ; GCN-NEXT: v_bfi_b32 v0, v2, v0, v4 +; GCN-NEXT: v_bfi_b32 v1, v3, v1, v5 ; GCN-NEXT: s_setpc_b64 define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %mask) { %xor.0 = xor <2 x i32> %a, %mask diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index e17c86bde41..afa62be8bc2 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -286,10 +286,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; FIXME: Immedites should fold directly into v_mov_b32s ; GCN-LABEL: {{^}}test_call_external_void_func_v4i64: ; GCN: buffer_load_dwordx4 v[0:3] -; GCN: v_mov_b32_e32 v4, s -; GCN: v_mov_b32_e32 v5, s -; GCN: v_mov_b32_e32 v6, s -; GCN: v_mov_b32_e32 v7, s +; GCN-DAG: v_mov_b32_e32 v4, s +; GCN-DAG: v_mov_b32_e32 v5, s +; GCN-DAG: v_mov_b32_e32 v6, s +; GCN-DAG: v_mov_b32_e32 v7, s ; GCN: s_waitcnt ; GCN-NEXT: s_swappc_b64 @@ -358,6 +358,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ret void } +; GCN-LABEL: {{^}}test_call_external_void_func_v2i32_imm: +; GCN-DAG: v_mov_b32_e32 v0, 1 +; GCN-DAG: v_mov_b32_e32 v1, 2 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { + call void 
@external_void_func_v2i32(<2 x i32> <i32 1, i32 2>) + ret void +} + ; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: ; HSA-DAG: s_mov_b32 s33, s9 ; MESA-DAG: s_mov_b32 s33, s3{{$}} @@ -393,6 +402,17 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ret void } +; GCN-LABEL: {{^}}test_call_external_void_func_v4i32_imm: +; GCN-DAG: v_mov_b32_e32 v0, 1 +; GCN-DAG: v_mov_b32_e32 v1, 2 +; GCN-DAG: v_mov_b32_e32 v2, 3 +; GCN-DAG: v_mov_b32_e32 v3, 4 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { + call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>) + ret void +} + ; GCN-LABEL: {{^}}test_call_external_void_func_v8i32: ; GCN-DAG: buffer_load_dwordx4 v[0:3], off ; GCN-DAG: buffer_load_dwordx4 v[4:7], off @@ -405,6 +425,21 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ret void } +; GCN-LABEL: {{^}}test_call_external_void_func_v8i32_imm: +; GCN-DAG: v_mov_b32_e32 v0, 1 +; GCN-DAG: v_mov_b32_e32 v1, 2 +; GCN-DAG: v_mov_b32_e32 v2, 3 +; GCN-DAG: v_mov_b32_e32 v3, 4 +; GCN-DAG: v_mov_b32_e32 v4, 5 +; GCN-DAG: v_mov_b32_e32 v5, 6 +; GCN-DAG: v_mov_b32_e32 v6, 7 +; GCN-DAG: v_mov_b32_e32 v7, 8 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { + call void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>) + ret void +} + ; GCN-LABEL: {{^}}test_call_external_void_func_v16i32: ; GCN-DAG: buffer_load_dwordx4 v[0:3], off ; GCN-DAG: buffer_load_dwordx4 v[4:7], off diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index 6f56be1a8a2..b68a43ecb8c 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -54,13 +54,13 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> % } ; GCN-LABEL: {{^}}v_mad_mix_v2f32: -; GFX900: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mad_mix_f32 v1, v0, 
v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel_hi:[1,1,1] +; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX906: v_mov_b32_e32 v3, v1 -; GFX906-NEXT: v_fma_mix_f32 v1, v0, v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v3, v2 op_sel_hi:[1,1,1] +; GFX906: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: v_mov_b32_e32 v1, v3 ; CIVI: v_mac_f32 define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { @@ -73,14 +73,14 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal ; GCN-LABEL: {{^}}v_mad_mix_v2f32_shuffle: ; GCN: s_waitcnt -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mad_mix_f32 v1, v0, v3, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: s_setpc_b64 -; GFX906-NEXT: v_mov_b32_e32 v3, v1 -; GFX906-NEXT: v_fma_mix_f32 v1, v0, v3, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_mov_b32_e32 v0, v3 ; GFX906-NEXT: s_setpc_b64 ; CIVI: v_mac_f32 @@ -274,13 +274,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 { } ; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imm1: -; GFX9: v_mov_b32_e32 v2, v1 ; GFX9: v_mov_b32_e32 v3, 1.0 -; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] 
op_sel_hi:[1,1,0] ; encoding -; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mov_b32_e32 v1, v2 -; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding +; GFX906: v_mov_b32_e32 v1, v2 define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 { %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -289,13 +290,15 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) } ; GCN-LABEL: {{^}}v_mad_mix_v2f32_cvtf16imminv2pi: -; GFX9: v_mov_b32_e32 v2, v1 ; GFX9: v_mov_b32_e32 v3, 0x3e230000 -; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mov_b32_e32 v1, v2 + +; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding +; GFX906: v_mov_b32_e32 v1, v2 define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -305,14 +308,15 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x 
half> } ; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imminv2pi: -; GFX9: v_mov_b32_e32 v2, v1 ; GFX9: v_mov_b32_e32 v3, 0.15915494 -; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mov_b32_e32 v1, v2 -; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding +; GFX906: v_mov_b32_e32 v1, v2 define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> |

