diff options
-rw-r--r-- | llvm/lib/Target/AMDGPU/MIMGInstructions.td | 12 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 15 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll | 24 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll | 6 |
5 files changed, 30 insertions, 33 deletions
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 4071adcf200..690510110b6 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -178,8 +178,8 @@ multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0, defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>; let VDataDwords = 4 in defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>; - let VDataDwords = 8 in - defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>; + let VDataDwords = 5 in + defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0>; } } @@ -412,8 +412,8 @@ multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>; let VDataDwords = 4 in defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>; - let VDataDwords = 8 in - defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>; + let VDataDwords = 5 in + defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>; } } @@ -433,8 +433,8 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */ let VDataDwords = 4 in defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>; - let VDataDwords = 8 in - defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>; + let VDataDwords = 5 in + defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>; } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4155a013ad8..8437e4bb34e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4701,14 +4701,14 @@ static SDValue constructRetValue(SelectionDAG &DAG, EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts) : AdjEltVT; - // Special case for v8f16. Rather than add support for this, use v4i32 to + // Special case for v6f16. Rather than add support for this, use v3i32 to // extract the data elements - bool V8F16Special = false; - if (CastVT == MVT::v8f16) { - CastVT = MVT::v4i32; + bool V6F16Special = false; + if (NumElts == 6) { + CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2); DMaskPop >>= 1; ReqRetNumElts >>= 1; - V8F16Special = true; + V6F16Special = true; AdjVT = MVT::v2i32; } @@ -4738,7 +4738,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, PreTFCRes = BVElts[0]; } - if (V8F16Special) + if (V6F16Special) PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes); if (!IsTexFail) { @@ -4971,9 +4971,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op, return Undef; } - // Have to use a power of 2 number of dwords - NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords); - EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords) : MVT::f32; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll index b297acab36c..6084789e59e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -22,7 +22,7 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}} +; GCN: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}} ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { @@ -45,7 +45,7 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}} +; GCN: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}} ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { @@ -76,7 +76,7 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}} +; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}} ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) { @@ -107,7 +107,7 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}} +; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}} ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) { @@ -138,7 +138,7 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}} +; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}} ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) { @@ -169,7 +169,7 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}} +; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}} ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) { @@ -200,7 +200,7 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}} +; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}} ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) { @@ -231,7 +231,7 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}} +; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}} ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) { @@ -262,7 +262,7 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}} +; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}} ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { @@ -293,7 +293,7 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}} +; GCN: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}} ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %mip) { @@ -324,7 +324,7 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}} +; GCN: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}} ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %mip) { @@ -451,7 +451,7 @@ main_body: ; NOPRT: v_mov_b32_e32 v2, 0 ; NOPRT-NOT: v_mov_b32_e32 v0 ; NOPRT-NOT: v_mov_b32_e32 v1 -; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}} +; GCN: image_load v[0:2], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}} ; SIVI: buffer_store_dword v2, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v2 define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll index be579b84eb4..b05b85eaa4f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll @@ -19,7 +19,7 @@ main_body: } ; GCN-LABEL: {{^}}load.v3f32.1d: -; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16 +; GCN: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm a16 define amdgpu_ps <4 x float> @load.v3f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -57,7 +57,7 @@ main_body: } ; GCN-LABEL: {{^}}load.v3f32.2d: -; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16 +; GCN: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm a16 define amdgpu_ps <4 x float> @load.v3f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -99,7 +99,7 @@ main_body: } ; GCN-LABEL: {{^}}load.v3f32.3d: -; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x7 unorm a16 +; GCN: image_load v[0:2], v[0:1], s[0:7] dmask:0x7 unorm a16 define amdgpu_ps <4 x float> @load.v3f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) { main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll index 2ee69ac6e8b..28e747c32c4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -15,7 +15,7 @@ main_body: ; GCN: v_mov_b32_e32 v2, v0 ; GCN: v_mov_b32_e32 v3, v0 ; GCN: v_mov_b32_e32 v4, v0 -; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}} +; GCN: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}} define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) { main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -155,7 +155,7 @@ main_body: ; GCN: v_mov_b32_e32 v2, v0 ; GCN: v_mov_b32_e32 v3, v0 ; GCN: v_mov_b32_e32 v4, v0 -; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}} +; GCN: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}} define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) { main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0) @@ -537,7 +537,7 @@ main_body: } ; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2_tfe: -; GCN: image_sample_c_d_o v[9:12], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da{{$}} +; GCN: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da{{$}} define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) { main_body: %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) |