path: root/llvm/test/CodeGen/AMDGPU
author    Matt Arsenault <Matthew.Arsenault@amd.com>  2019-03-08 20:46:15 +0000
committer Matt Arsenault <Matthew.Arsenault@amd.com>  2019-03-08 20:46:15 +0000
commit    26e76ef0e2cf358809d2b41e657074fc21133d59 (patch)
tree      69c6fc55e8c3494b30a3a91d7e83f29707339e16 /llvm/test/CodeGen/AMDGPU
parent    43f098e719d022a935dd9244bcf936bd7d3d021f (diff)
DAG: Don't try to cluster loads with tied inputs
This avoids breaking possible value dependencies when sorting loads by offset.

AMDGPU has some load instructions that write into the high or low bits of the destination register, and have a tied input for the other input bits. These can easily have the same base pointer, but be a swizzle so the high address load needs to come first. This was inserting glue forcing the opposite ordering, producing a cycle the InstrEmitter would assert on. It may be potentially expensive to look for the dependency between the other loads, so just skip any where this could happen.

Fixes bug 40936 by reverting r351379, which added a hacky attempt to fix this by adding chains in this case, which I think was just working around broken glue before the InstrEmitter. The core of the patch is re-implementing the fix for that problem.

llvm-svn: 355728
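For readers unfamiliar with load clustering, the rule described above can be modelled in a few lines. The following is a minimal standalone C++ sketch, not the actual ScheduleDAGSDNodes code; LoadNode, HasTiedInput, and clusterLoadsByOffset are made-up names used only to illustrate "don't reorder loads that have tied inputs".

    // Hypothetical standalone model of the clustering decision described in
    // the commit message; not LLVM's real scheduler API.
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct LoadNode {
      int Offset;        // byte offset from the common base pointer
      bool HasTiedInput; // e.g. a d16 hi/lo load writing only half the register
    };

    // Sort candidate loads by offset so they can be scheduled back to back,
    // but refuse to cluster at all if any load has a tied input: reordering
    // such a load could break a value dependency on the register that holds
    // its other half.
    static bool clusterLoadsByOffset(std::vector<LoadNode> &Loads) {
      for (const LoadNode &L : Loads)
        if (L.HasTiedInput)
          return false; // skip clustering; keep the original ordering

      std::sort(Loads.begin(), Loads.end(),
                [](const LoadNode &A, const LoadNode &B) {
                  return A.Offset < B.Offset;
                });
      return true;
    }

    int main() {
      std::vector<LoadNode> Loads = {{8, true}, {4, false}, {6, false}};
      std::printf("clustered: %s\n",
                  clusterLoadsByOffset(Loads) ? "yes" : "no");
    }

In the real scheduler the tied-input property would come from the instruction's operand constraints rather than a flag on a struct; the point of the sketch is only that a single tied-input load disqualifies the whole group, mirroring the "just skip any where this could happen" wording above.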
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--  llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 38
1 file changed, 37 insertions(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 320d008c220..696b33e75fe 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}chain_hi_to_lo_private:
; GCN: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2
@@ -139,3 +139,39 @@ bb:
ret <2 x half> %result
}
+
+; Make sure we don't lose any of the private stores.
+; GCN-LABEL: {{^}}vload2_private:
+; GCN: buffer_store_short v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:4
+; GCN: buffer_store_short_d16_hi v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:6
+; GCN: buffer_store_short v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:8
+
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:4
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:6
+; GCN: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:8
+define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 {
+entry:
+ %loc = alloca [3 x i16], align 2, addrspace(5)
+ %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
+ %tmp = load i16, i16 addrspace(1)* %in, align 2
+ %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0
+ store volatile i16 %tmp, i16 addrspace(5)* %loc.0.sroa_idx
+ %arrayidx.1 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
+ %tmp1 = load i16, i16 addrspace(1)* %arrayidx.1, align 2
+ %loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
+ store volatile i16 %tmp1, i16 addrspace(5)* %loc.2.sroa_idx3
+ %arrayidx.2 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 2
+ %tmp2 = load i16, i16 addrspace(1)* %arrayidx.2, align 2
+ %loc.4.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 2
+ store volatile i16 %tmp2, i16 addrspace(5)* %loc.4.sroa_idx
+ %loc.0.sroa_cast = bitcast [3 x i16] addrspace(5)* %loc to <2 x i16> addrspace(5)*
+ %loc.0. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.0.sroa_cast, align 2
+ store <2 x i16> %loc.0., <2 x i16> addrspace(1)* %out, align 4
+ %loc.2.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
+ %loc.2.sroa_cast = bitcast i16 addrspace(5)* %loc.2.sroa_idx to <2 x i16> addrspace(5)*
+ %loc.2. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.2.sroa_cast, align 2
+ %arrayidx6 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 1
+ store <2 x i16> %loc.2., <2 x i16> addrspace(1)* %arrayidx6, align 4
+ %loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
+ ret void
+}