| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-08-22 11:09:45 +0000 |
| --- | --- | --- |
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-08-22 11:09:45 +0000 |
| commit | bb8e64e7f5ad448bf04ba84c995b8a7cbf9bb7e4 (patch) | |
| tree | 8f681d00dc1625318f78bf0da76e8b874cf76577 | |
| parent | 4660fd25d1f7524d89ed2374daaa1bceb707b808 (diff) | |
AMDGPU: Fix not respecting byval alignment in call frame setup
This was hackily adding in the 4 bytes reserved for the callee's
emergency stack slot. Treat it like a normal stack allocation
so we get the correct alignment padding behavior. This fixes
an inconsistency between the caller and callee.
llvm-svn: 340396
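The idea behind the change can be illustrated outside of LLVM. The sketch below is not the LLVM implementation; it is a minimal stand-alone C++ model (the `StackAllocator` type and its names are invented for illustration) of how a CCState-style allocator hands out argument offsets. It shows why reserving the callee's 4-byte emergency slot through the allocator, instead of adding a fixed +4 to offsets computed without it, keeps an 8-byte-aligned byval argument correctly aligned for both caller and callee.

```cpp
// Stand-alone sketch (not LLVM code): models a CCState-style stack allocator
// to show the alignment-padding difference fixed by this commit.
#include <cassert>
#include <cstdio>

struct StackAllocator {
  unsigned NextOffset = 0;
  // Align the running offset up, then reserve Size bytes; this is the
  // conceptual behavior of allocating a stack slot through the CC state.
  unsigned allocate(unsigned Size, unsigned Align) {
    NextOffset = (NextOffset + Align - 1) / Align * Align;
    unsigned Offset = NextOffset;
    NextOffset += Size;
    return Offset;
  }
};

int main() {
  // New scheme: the 4-byte emergency slot is just another allocation, so a
  // following byval argument with 8-byte alignment lands at offset 8.
  StackAllocator CC;
  CC.allocate(4, 4);                      // callee's emergency stack slot
  unsigned ByvalOff = CC.allocate(32, 8); // 32-byte byval struct, align 8
  assert(ByvalOff % 8 == 0);

  // Old scheme (roughly): offsets were computed without the reserved slot and
  // a fixed +4 was added afterwards, which can leave an 8-byte-aligned byval
  // at a 4-byte-aligned address -- the caller/callee inconsistency.
  StackAllocator Old;
  unsigned Misaligned = Old.allocate(32, 8) + 4;

  std::printf("allocator-based offset: %u, fixed +4 offset: %u\n",
              ByvalOff, Misaligned);
  return 0;
}
```

Under these assumptions the allocator-based offset comes out as 8 while the fixed "+4" offset comes out as 4, which is the misalignment the patch removes.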
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 27 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 4 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll | 144 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll | 36 |
6 files changed, 174 insertions, 41 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 7067bf15322..8c2f828e65e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4003,13 +4003,12 @@ SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
                                                    const SDLoc &SL,
                                                    SDValue Chain,
-                                                   SDValue StackPtr,
                                                    SDValue ArgVal,
                                                    int64_t Offset) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
 
-  SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset);
+  SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
                                MachineMemOperand::MODereferenceable);
   return Store;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index d56f9cd06cd..52c3838a40b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -287,7 +287,6 @@ public:
   SDValue storeStackInputValue(SelectionDAG &DAG,
                                const SDLoc &SL,
                                SDValue Chain,
-                               SDValue StackPtr,
                                SDValue ArgVal,
                                int64_t Offset) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0764dae962f..0cf23c5f76a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2181,11 +2181,11 @@ SDValue SITargetLowering::LowerCallResult(
 // from the explicit user arguments present in the IR.
 void SITargetLowering::passSpecialInputs(
     CallLoweringInfo &CLI,
+    CCState &CCInfo,
     const SIMachineFunctionInfo &Info,
     SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
     SmallVectorImpl<SDValue> &MemOpChains,
-    SDValue Chain,
-    SDValue StackPtr) const {
+    SDValue Chain) const {
   // If we don't have a call site, this was a call inserted by
   // legalization. These can never use special inputs.
   if (!CLI.CS)
@@ -2253,9 +2253,9 @@ void SITargetLowering::passSpecialInputs(
     if (OutgoingArg->isRegister()) {
       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
     } else {
-      SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
-                                              InputReg,
-                                              OutgoingArg->getStackOffset());
+      unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
+      SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+                                              SpecialArgOffset);
       MemOpChains.push_back(ArgStore);
     }
   }
@@ -2401,8 +2401,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   // The first 4 bytes are reserved for the callee's emergency stack slot.
-  const unsigned CalleeUsableStackOffset = 4;
-
   if (IsTailCall) {
     IsTailCall = isEligibleForTailCallOptimization(
       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@@ -2441,6 +2439,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
+
+  // The first 4 bytes are reserved for the callee's emergency stack slot.
+  CCInfo.AllocateStack(4, 4);
+
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
 
   // Get a count of how many bytes are to be pushed on the stack.
@@ -2488,10 +2490,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     }
   }
 
-  // Stack pointer relative accesses are done by changing the offset SGPR. This
-  // is just the VGPR offset component.
-  SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
-
   SmallVector<SDValue, 8> MemOpChains;
   MVT PtrVT = MVT::i32;
 
@@ -2535,7 +2533,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
       unsigned LocMemOffset = VA.getLocMemOffset();
       int32_t Offset = LocMemOffset;
-      SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);
+      SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
 
       if (IsTailCall) {
         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
 
@@ -2545,8 +2543,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
         Offset = Offset + FPDiff;
         int FI = MFI.CreateFixedObject(OpSize, Offset, true);
 
-        DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
-                                         StackPtr);
+        DstAddr = DAG.getFrameIndex(FI, PtrVT);
         DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
 
         // Make sure any stack arguments overlapping with where we're storing
@@ -2581,7 +2578,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   // Copy special input registers after user input arguments.
-  passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
+  passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
 
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 8fe7a4442cd..b4265376935 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -265,11 +265,11 @@ public:
 
   void passSpecialInputs(
     CallLoweringInfo &CLI,
+    CCState &CCInfo,
    const SIMachineFunctionInfo &Info,
     SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
     SmallVectorImpl<SDValue> &MemOpChains,
-    SDValue Chain,
-    SDValue StackPtr) const;
+    SDValue Chain) const;
 
   SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                           CallingConv::ID CallConv, bool isVarArg,
diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
index 0b34524d156..b0268dc61ee 100644
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -110,7 +110,7 @@ entry:
 ; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @call_void_func_byval_struct_func() #0 {
+define void @call_void_func_byval_struct_func() #1 {
 entry:
   %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
   %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
@@ -163,7 +163,7 @@ entry:
 ; GCN: s_swappc_b64
 ; GCN-NOT: s_sub_u32 s32
 ; GCN: s_endpgm
-define amdgpu_kernel void @call_void_func_byval_struct_kernel() #0 {
+define amdgpu_kernel void @call_void_func_byval_struct_kernel() #1 {
 entry:
   %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
   %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
@@ -181,6 +181,146 @@ entry:
   ret void
 }
 
+; GCN-LABEL: {{^}}void_func_byval_struct_align8:
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s5 offset:8{{$}}
+; GCN-NOT: s32
+
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:24{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:24{{$}}
+; GCN-NOT: s32
+define void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 {
+entry:
+  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+  %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 8
+  %add = add nsw i32 %tmp, 1
+  store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+  %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 8
+  %add3 = add nsw i32 %tmp1, 2
+  store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 8
+  store volatile i32 9, i32 addrspace(1)* null, align 4
+  ret void
+}
+
+; Make sure the byval alignment is respected in the call frame setup
+; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_kernel:
+; GCN: s_mov_b32 s33, s7
+; GCN: s_add_u32 s32, s33, 0xc00{{$}}
+
+; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
+; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8
+; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24
+
+; GCN-NOT: s_add_u32 s32, s32, 0x800
+
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20
+
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20
+
+; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24
+; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28
+; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32
+; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36
+
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36
+
+
+; GCN: s_swappc_b64
+; GCN-NOT: s_sub_u32 s32
+; GCN: s_endpgm
+define amdgpu_kernel void @call_void_func_byval_struct_align8_kernel() #1 {
+entry:
+  %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5)
+  %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5)
+  %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
+  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
+  %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
+  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
+  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+  store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+  store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8
+  call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1)
+  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
+  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
+  ret void
+}
+
+; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_func:
+; GCN: s_mov_b32 s5, s32
+; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
+; GCN-DAG: v_writelane_b32
+
+; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
+; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
+
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
+
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
+
+; GCN-NOT: s_add_u32 s32, s32, 0x800
+
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20
+
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24
+; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28
+; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32
+; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36
+
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36
+
+; GCN: s_swappc_b64
+; GCN-NOT: v_readlane_b32 s32
+; GCN: v_readlane_b32
+; GCN-NOT: v_readlane_b32 s32
+
+; GCN-NOT: s_sub_u32 s32, s32, 0x800
+
+; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @call_void_func_byval_struct_align8_func() #0 {
+entry:
+  %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5)
+  %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5)
+  %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
+  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
+  %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
+  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
+  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+  store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+  store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8
+  call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1)
+  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
+  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim:
 define amdgpu_kernel void @call_void_func_byval_struct_kernel_no_frame_pointer_elim() #2 {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 432a4d3c985..7f14a24d6da 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -290,7 +290,7 @@ define void @too_many_args_use_workitem_id_x(
 ; GCN: s_mov_b32 s33, s7
 ; GCN: s_mov_b32 s32, s33
 
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; GCN: s_mov_b32 s4, s33
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
@@ -308,7 +308,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
 
 ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
 ; GCN: s_mov_b32 s5, s32
-; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:
 ; GCN: s_swappc_b64
 define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
   store volatile i32 %arg0, i32 addrspace(1)* undef
@@ -330,7 +330,7 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GCN: s_add_u32 s32, s32, 0x400{{$}}
 
 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}}
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4{{$}}
 
 ; GCN: s_swappc_b64
 
@@ -428,7 +428,7 @@ define void @too_many_args_use_workitem_id_x_byval(
 ; GCN-NOT: s32
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
 ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4
 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
 
@@ -453,7 +453,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
 
 ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
 ; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
 ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4
 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
@@ -539,11 +539,10 @@ define void @too_many_args_use_workitem_id_xyz(
   ret void
 }
 
-; frame[0] = kernel emergency stack slot
-; frame[1] = callee emergency stack slot
-; frame[2] = ID X
-; frame[3] = ID Y
-; frame[4] = ID Z
+; frame[0] = callee emergency stack slot
+; frame[1] = ID X
+; frame[2] = ID Y
+; frame[3] = ID Z
 
 ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
 ; GCN: enable_vgpr_workitem_id = 2
@@ -551,9 +550,9 @@ define void @too_many_args_use_workitem_id_xyz(
 
 ; GCN: s_mov_b32 s33, s7
 ; GCN: s_mov_b32 s32, s33
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:12
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
   call void @too_many_args_use_workitem_id_xyz(
@@ -635,10 +634,9 @@ define void @too_many_args_use_workitem_id_x_stack_yz(
   ret void
 }
 
-; frame[0] = kernel emergency stack slot
-; frame[1] = callee emergency stack slot
-; frame[2] = ID Y
-; frame[3] = ID Z
+; frame[0] = callee emergency stack slot
+; frame[1] = ID Y
+; frame[2] = ID Z
 
 ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
 ; GCN: enable_vgpr_workitem_id = 2
@@ -647,8 +645,8 @@ define void @too_many_args_use_workitem_id_x_stack_yz(
 ; GCN: s_mov_b32 s32, s33
 
 ; GCN-DAG: v_mov_b32_e32 v31, v0
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 {
   call void @too_many_args_use_workitem_id_x_stack_yz(