2 files changed, 179 insertions, 2 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 0c3a46dc80d..cb15c4051fc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1178,7 +1178,8 @@ SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) {
     unsigned InReg = It->second;
 
     RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
-                     DAG.getDataLayout(), InReg, Ty, getABIRegCopyCC(V));
+                     DAG.getDataLayout(), InReg, Ty,
+                     None); // This is not an ABI copy.
     SDValue Chain = DAG.getEntryNode();
     Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr,
                                  V);
@@ -8696,7 +8697,7 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
   // notional registers required by the type.
 
   RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg, V->getType(),
-                   getABIRegCopyCC(V));
+                   None); // This is not an ABI copy.
   SDValue Chain = DAG.getEntryNode();
 
   ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) ==
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
new file mode 100644
index 00000000000..805b2f5ad2b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -0,0 +1,176 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; SelectionDAG builder was using the IR value kind to decide how to
+; split the types for copyToRegs/copyFromRegs in all contexts. This
+; was incorrect if the ABI-like value such as a call was used outside
+; of the block. The value in that case is not used directly, but
+; through another set of copies to potentially different register
+; types in the parent block.
+
+; This would then end up producing inconsistent pairs of copies with
+; the wrong sizes when the vector type result from the call was split
+; into multiple pieces, but expected to be a single register in the
+; cross-block copy.
+;
+; This isn't exactly ideal for AMDGPU, since in reality the
+; intermediate vector register type is undesirable anyway, but it
+; requires more work to be able to split all vector copies in all
+; contexts.
+;
+; This was only an issue if the value was used directly in another
+; block. If there was an intermediate operation or a phi it was fine,
+; since that didn't look like an ABI copy.
+
+
+define float @call_split_type_used_outside_block_v2f32() #0 {
+; GCN-LABEL: call_split_type_used_outside_block_v2f32:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v32, s33, 0
+; GCN-NEXT:    v_writelane_b32 v32, s34, 1
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v32, s35, 2
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_v2f32@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_v2f32@rel32@hi+4
+; GCN-NEXT:    s_mov_b64 s[34:35], s[30:31]
+; GCN-NEXT:    s_mov_b32 s33, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_mov_b32 s5, s33
+; GCN-NEXT:    s_mov_b64 s[30:31], s[34:35]
+; GCN-NEXT:    v_readlane_b32 s35, v32, 2
+; GCN-NEXT:    v_readlane_b32 s34, v32, 1
+; GCN-NEXT:    v_readlane_b32 s33, v32, 0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  %split.ret.type = call <2 x float> @func_v2f32()
+  br label %bb1
+
+bb1:
+  %extract = extractelement <2 x float> %split.ret.type, i32 0
+  ret float %extract
+}
+
+define float @call_split_type_used_outside_block_v3f32() #0 {
+; GCN-LABEL: call_split_type_used_outside_block_v3f32:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v32, s33, 0
+; GCN-NEXT:    v_writelane_b32 v32, s34, 1
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v32, s35, 2
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_v3f32@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_v3f32@rel32@hi+4
+; GCN-NEXT:    s_mov_b64 s[34:35], s[30:31]
+; GCN-NEXT:    s_mov_b32 s33, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_mov_b32 s5, s33
+; GCN-NEXT:    s_mov_b64 s[30:31], s[34:35]
+; GCN-NEXT:    v_readlane_b32 s35, v32, 2
+; GCN-NEXT:    v_readlane_b32 s34, v32, 1
+; GCN-NEXT:    v_readlane_b32 s33, v32, 0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  %split.ret.type = call <3 x float> @func_v3f32()
+  br label %bb1
+
+bb1:
+  %extract = extractelement <3 x float> %split.ret.type, i32 0
+  ret float %extract
+}
+
+define half @call_split_type_used_outside_block_v4f16() #0 {
+; GCN-LABEL: call_split_type_used_outside_block_v4f16:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v32, s33, 0
+; GCN-NEXT:    v_writelane_b32 v32, s34, 1
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v32, s35, 2
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_v4f16@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_v4f16@rel32@hi+4
+; GCN-NEXT:    s_mov_b64 s[34:35], s[30:31]
+; GCN-NEXT:    s_mov_b32 s33, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_mov_b32 s5, s33
+; GCN-NEXT:    s_mov_b64 s[30:31], s[34:35]
+; GCN-NEXT:    v_readlane_b32 s35, v32, 2
+; GCN-NEXT:    v_readlane_b32 s34, v32, 1
+; GCN-NEXT:    v_readlane_b32 s33, v32, 0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  %split.ret.type = call <4 x half> @func_v4f16()
+  br label %bb1
+
+bb1:
+  %extract = extractelement <4 x half> %split.ret.type, i32 0
+  ret half %extract
+}
+
+define { i32, half } @call_split_type_used_outside_block_struct() #0 {
+; GCN-LABEL: call_split_type_used_outside_block_struct:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v32, s33, 0
+; GCN-NEXT:    v_writelane_b32 v32, s34, 1
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v32, s35, 2
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_struct@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_struct@rel32@hi+4
+; GCN-NEXT:    s_mov_b64 s[34:35], s[30:31]
+; GCN-NEXT:    s_mov_b32 s33, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_mov_b32 s5, s33
+; GCN-NEXT:    s_mov_b64 s[30:31], s[34:35]
+; GCN-NEXT:    v_readlane_b32 s35, v32, 2
+; GCN-NEXT:    v_readlane_b32 s34, v32, 1
+; GCN-NEXT:    v_readlane_b32 s33, v32, 0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    v_mov_b32_e32 v1, v4
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  %split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
+  br label %bb1
+
+bb1:
+  %val0 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 0
+  %val1 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 1
+  %extract0 = extractelement <4 x i32> %val0, i32 0
+  %extract1 = extractelement <4 x half> %val1, i32 0
+  %ins0 = insertvalue { i32, half } undef, i32 %extract0, 0
+  %ins1 = insertvalue { i32, half } %ins0, half %extract1, 1
+  ret { i32, half } %ins1
+}
+
+
+declare <2 x float> @func_v2f32() #0
+declare <3 x float> @func_v3f32() #0
+declare <4 x float> @func_v4f32() #0
+declare <4 x half> @func_v4f16() #0
+
+declare { <4 x i32>, <4 x half> } @func_struct() #0
+
+attributes #0 = { nounwind}