author     Matt Arsenault <Matthew.Arsenault@amd.com>   2018-05-09 18:37:39 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>   2018-05-09 18:37:39 +0000
commit     762d49880876eca14aa75a1bc893eba23ed50788 (patch)
tree       2d8e059a73d5b7c2f2e60458e71e2503e58dadad
parent     73634e40378fdba2b39c777a871961af2651b4e0 (diff)
AMDGPU: Add combine for trunc of bitcast from build_vector
If the truncate is only accessing the first element of the vector,
we can use the original source value.
This helps with some combine ordering issues after operations are
lowered to integer operations between bitcasts of build_vector.
In particular it stops unnecessarily materializing the unused
top half of a vector in some cases.
llvm-svn: 331909
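
For illustration only (not part of the commit): the shape this combine targets looks roughly like the IR below, with function and value names invented for the sketch. The truncate reads only the low bits of the bitcast, and those come entirely from element 0 of the build_vector, so the combine lets %x feed the truncate directly instead of materializing the unused high element. The new trunc-combine.ll test added by this commit exercises the same pattern.

; Hypothetical sketch of the targeted pattern (not from the commit).
define i16 @example_trunc_of_bitcast(i32 %x) {
  ; <2 x i32> built from %x and an unused high element
  %v0 = insertelement <2 x i32> undef, i32 %x, i32 0
  %v1 = insertelement <2 x i32> %v0, i32 99, i32 1
  ; the bitcast + truncate only ever observes element 0 (%x)
  %cast = bitcast <2 x i32> %v1 to i64
  %trunc = trunc i64 %cast to i16
  ret i16 %trunc
}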
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp             | 30
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h               |  1
-rw-r--r--   llvm/test/CodeGen/AMDGPU/function-returns.ll              |  5
-rw-r--r--   llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll | 20
-rw-r--r--   llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll  | 22
-rw-r--r--   llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll             | 22
-rw-r--r--   llvm/test/CodeGen/AMDGPU/trunc-combine.ll                 | 55
7 files changed, 126 insertions, 29 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index fffcb2fb566..bfd28b93569 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -574,6 +574,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::MULHU);
   setTargetDAGCombine(ISD::MULHS);
@@ -3119,6 +3120,33 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
 }
 
+SDValue AMDGPUTargetLowering::performTruncateCombine(
+  SDNode *N, DAGCombinerInfo &DCI) const {
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+
+  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
+  if (Src.getOpcode() == ISD::BITCAST) {
+    SDValue Vec = Src.getOperand(0);
+    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
+      SDValue Elt0 = Vec.getOperand(0);
+      EVT EltVT = Elt0.getValueType();
+      if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
+        if (EltVT.isFloatingPoint()) {
+          Elt0 = DAG.getNode(ISD::BITCAST, SL,
+                             EltVT.changeTypeToInteger(), Elt0);
+        }
+
+        return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 // We need to specifically handle i64 mul here to avoid unnecessary conversion
 // instructions. If we only match on the legalized i64 mul expansion,
 // SimplifyDemandedBits will be unable to remove them because there will be
@@ -3758,6 +3786,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     return performSraCombine(N, DCI);
   }
+  case ISD::TRUNCATE:
+    return performTruncateCombine(N, DCI);
   case ISD::MUL:
     return performMulCombine(N, DCI);
   case ISD::MULHS:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 857a69a1951..6db83395dc5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -87,6 +87,7 @@ protected:
   SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 758def5b044..05ab36baa3a 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -282,7 +282,7 @@ define <2 x i16> @v2i16_func_void() #0 {
 }
 
 ; GCN-LABEL: {{^}}v3i16_func_void:
-; GFX9: buffer_load_dwordx2 v[0:1], off
+; GFX9: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9: v_lshrrev_b32
 ; GFX9: s_setpc_b64
@@ -304,9 +304,8 @@ define <4 x i16> @v4i16_func_void() #0 {
 ; GCN-LABEL: {{^}}v5i16_func_void:
 ; GFX9: buffer_load_dwordx2 v[0:1]
 ; GFX9: buffer_load_ushort v4
+; GFX9: v_lshrrev_b32_e32 v5, 16, v0
 ; GFX9: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9: v_mov_b32_e32 v2, v1
-; GFX9: v_lshrrev_b32_e32 v1, 16, v0
 ; GCN: s_setpc_b64
 define <5 x i16> @v5i16_func_void() #0 {
   %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
index bb3279b3ed7..2c52b61b218 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
@@ -8,11 +8,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4 v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -26,11 +26,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -44,11 +44,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -62,11 +62,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -80,11 +80,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
index b5f8da64628..c50b508ed92 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
@@ -41,11 +41,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -59,11 +59,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -77,11 +77,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -95,10 +95,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -112,10 +113,11 @@ main_body:
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index fd82dccb0fa..280ce628c18 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -3,18 +3,28 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
 ; GCN-LABEL: {{^}}local_store_i56:
-; GCN-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
-; GCN-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; GCN-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
+; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6
+; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+
 define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
   store i56 %arg, i56 addrspace(3)* %ptr, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}local_store_i55:
-; GCN-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
-; GCN-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; GCN-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
+; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6
+; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}}
 define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
   store i55 %arg, i55 addrspace(3)* %ptr, align 8
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
new file mode 100644
index 00000000000..2c8a5b42b65
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+
+; Make sure high constant 0 isn't pointlessly materialized
+; GCN-LABEL: {{^}}trunc_bitcast_i64_lshr_32_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_bitcast_i64_lshr_32_i16(i64 %bar) {
+  %srl = lshr i64 %bar, 32
+  %trunc = trunc i64 %srl to i16
+  ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_bitcast_i64_lshr_32_i32:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64
+define i32 @trunc_bitcast_i64_lshr_32_i32(i64 %bar) {
+  %srl = lshr i64 %bar, 32
+  %trunc = trunc i64 %srl to i32
+  ret i32 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_bitcast_v2i32_to_i16:
+; GCN: _load_dword
+; GCN-NOT: _load_dword
+; GCN-NOT: v_mov_b32
+; GCN: v_add_u32_e32 v0, vcc, 4, v0
+define i16 @trunc_bitcast_v2i32_to_i16(<2 x i32> %bar) {
+  %load0 = load i32, i32 addrspace(1)* undef
+  %load1 = load i32, i32 addrspace(1)* null
+  %insert.0 = insertelement <2 x i32> undef, i32 %load0, i32 0
+  %insert.1 = insertelement <2 x i32> %insert.0, i32 99, i32 1
+  %bc = bitcast <2 x i32> %insert.1 to i64
+  %trunc = trunc i64 %bc to i16
+  %add = add i16 %trunc, 4
+  ret i16 %add
+}
+
+; Make sure there's no crash if the source vector type is FP
+; GCN-LABEL: {{^}}trunc_bitcast_v2f32_to_i16:
+; GCN: _load_dword
+; GCN-NOT: _load_dword
+; GCN-NOT: v_mov_b32
+; GCN: v_add_u32_e32 v0, vcc, 4, v0
+define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) {
+  %load0 = load float, float addrspace(1)* undef
+  %load1 = load float, float addrspace(1)* null
+  %insert.0 = insertelement <2 x float> undef, float %load0, i32 0
+  %insert.1 = insertelement <2 x float> %insert.0, float 4.0, i32 1
+  %bc = bitcast <2 x float> %insert.1 to i64
+  %trunc = trunc i64 %bc to i16
+  %add = add i16 %trunc, 4
+  ret i16 %add
+}